author:    maxim-yurchuk <[email protected]>  2024-10-09 12:29:46 +0300
committer: maxim-yurchuk <[email protected]>  2024-10-09 13:14:22 +0300
commit:    9731d8a4bb7ee2cc8554eaf133bb85498a4c7d80 (patch)
tree:      a8fb3181d5947c0d78cf402aa56e686130179049 /contrib/python/numpy/py2
parent:    a44b779cd359f06c3ebbef4ec98c6b38609d9d85 (diff)
publishFullContrib: true for ydb
<HIDDEN_URL> commit_hash:c82a80ac4594723cebf2c7387dec9c60217f603e
Diffstat (limited to 'contrib/python/numpy/py2')
-rw-r--r--contrib/python/numpy/py2/.dist-info/METADATA56
-rw-r--r--contrib/python/numpy/py2/.dist-info/entry_points.txt5
-rw-r--r--contrib/python/numpy/py2/.dist-info/top_level.txt1
-rw-r--r--contrib/python/numpy/py2/numpy/_build_utils/__init__.py1
-rw-r--r--contrib/python/numpy/py2/numpy/_build_utils/apple_accelerate.py28
-rw-r--r--contrib/python/numpy/py2/numpy/_build_utils/src/apple_sgemv_fix.c253
-rw-r--r--contrib/python/numpy/py2/numpy/compat/setup.py12
-rw-r--r--contrib/python/numpy/py2/numpy/compat/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/compat/tests/test_compat.py26
-rw-r--r--contrib/python/numpy/py2/numpy/conftest.py67
-rw-r--r--contrib/python/numpy/py2/numpy/core/code_generators/__init__.py1
-rw-r--r--contrib/python/numpy/py2/numpy/core/code_generators/genapi.py510
-rw-r--r--contrib/python/numpy/py2/numpy/core/code_generators/generate_ufunc_api.py211
-rw-r--r--contrib/python/numpy/py2/numpy/core/code_generators/generate_umath.py1145
-rw-r--r--contrib/python/numpy/py2/numpy/core/code_generators/numpy_api.py423
-rw-r--r--contrib/python/numpy/py2/numpy/core/code_generators/ufunc_docstrings.py3930
-rw-r--r--contrib/python/numpy/py2/numpy/core/include/numpy/oldnumeric.h25
-rw-r--r--contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/mlib.ini12
-rw-r--r--contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/npymath.ini20
-rw-r--r--contrib/python/numpy/py2/numpy/core/mlib.ini.in12
-rw-r--r--contrib/python/numpy/py2/numpy/core/npymath.ini.in20
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/common/npy_binsearch.h.src144
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/common/npy_partition.h.src129
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/common/npy_sort.h.src83
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/common/python_xerbla.c51
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/common/templ_common.h.src46
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/_multiarray_tests.c.src2103
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/arraytypes.c.src4921
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/einsum.c.src2890
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/lowlevel_strided_loops.c.src1785
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/nditer_templ.c.src615
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/refcount.h19
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/multiarray/scalartypes.c.src4496
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npymath/ieee754.c.src841
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_complex.c.src1811
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_internal.h.src718
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npysort/binsearch.c.src250
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npysort/heapsort.c.src402
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npysort/mergesort.c.src511
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npysort/quicksort.c.src634
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/npysort/selection.c.src418
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/_operand_flag_tests.c.src105
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/_rational_tests.c.src1409
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/_struct_ufunc_tests.c.src125
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/_umath_tests.c.src643
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/funcs.inc.src432
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/loops.c.src2988
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/loops.h.src525
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/matmul.c.src503
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/matmul.h.src12
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/scalarmath.c.src1704
-rw-r--r--contrib/python/numpy/py2/numpy/core/src/umath/simd.inc.src1219
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/_locales.py76
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/data/astype_copy.pklbin0 -> 716 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/data/recarray_from_file.fitsbin0 -> 8640 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_abc.py56
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_api.py516
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_arrayprint.py893
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_datetime.py2228
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_defchararray.py692
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_deprecations.py535
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_dtype.py1138
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_einsum.py1014
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_errstate.py41
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_extint128.py221
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_function_base.py370
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_getlimits.py123
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_half.py518
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_indexerrors.py123
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_indexing.py1334
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_item_selection.py87
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_longdouble.py233
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_machar.py32
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_mem_overlap.py950
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_memmap.py206
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_multiarray.py8325
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_nditer.py2861
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_numeric.py2797
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_numerictypes.py500
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_overrides.py392
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_print.py205
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_records.py499
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_regression.py2487
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_scalar_ctors.py65
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_scalarbuffer.py105
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_scalarinherit.py75
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_scalarmath.py666
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_scalarprint.py326
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_shape_base.py706
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_ufunc.py1859
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_umath.py2920
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_umath_complex.py543
-rw-r--r--contrib/python/numpy/py2/numpy/core/tests/test_unicode.py396
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/mingw/gfortran_vs2003_hack.c6
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/setup.py17
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_exec_command.py215
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler.py81
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_gnu.py57
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_intel.py32
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_nagfor.py24
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_from_template.py44
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_misc_util.py84
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_npy_pkg_config.py86
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_shell_utils.py79
-rw-r--r--contrib/python/numpy/py2/numpy/distutils/tests/test_system_info.py237
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/setup.cfg3
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/setup.py73
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/src/test/Makefile96
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/src/test/bar.f11
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/src/test/foo.f11
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/src/test/foo90.f9013
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/src/test/foomodule.c142
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/src/test/wrap.f70
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c224
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap1
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_free.f9034
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_mod.f9041
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_use.f9019
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/precision.f904
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/common/block.f11
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/kind/foo.f9020
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo.f5
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_fixed.f908
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_free.f908
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_both.f9057
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_compound.f9015
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_integer.f9022
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_non_compound.f9023
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_real.f9023
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/regression/inout.f909
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/size/foo.f9044
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/src/string/char.f9029
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_array_from_pyobj.py581
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_assumed_shape.py33
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_block_docstring.py24
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_callback.py165
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_common.py27
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_compile_function.py125
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_kind.py34
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_mixed.py38
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_parameter.py118
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_quoted_character.py35
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_regression.py29
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_return_character.py146
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_return_complex.py169
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_return_integer.py181
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_return_logical.py189
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_return_real.py210
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_semicolon_split.py65
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_size.py51
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/test_string.py24
-rw-r--r--contrib/python/numpy/py2/numpy/f2py/tests/util.py360
-rw-r--r--contrib/python/numpy/py2/numpy/fft/setup.py19
-rw-r--r--contrib/python/numpy/py2/numpy/fft/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/fft/tests/test_fftpack.py185
-rw-r--r--contrib/python/numpy/py2/numpy/fft/tests/test_helper.py248
-rw-r--r--contrib/python/numpy/py2/numpy/lib/setup.py12
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npybin0 -> 258 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npzbin0 -> 366 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npybin0 -> 341 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npzbin0 -> 449 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/data/python3.npybin0 -> 96 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/data/win64python2.npybin0 -> 96 bytes
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test__datasource.py378
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test__iotools.py352
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test__version.py66
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_arraypad.py1286
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_arraysetops.py623
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_arrayterator.py48
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_financial.py340
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_format.py940
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_function_base.py3141
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_histograms.py844
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_index_tricks.py454
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_io.py2518
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_mixins.py224
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_nanfunctions.py927
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_packbits.py268
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_polynomial.py261
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_recfunctions.py980
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_regression.py254
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_shape_base.py708
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_stride_tricks.py445
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_twodim_base.py534
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_type_check.py442
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_ufunclike.py106
-rw-r--r--contrib/python/numpy/py2/numpy/lib/tests/test_utils.py91
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/clapack_scrub.py310
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.c764
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.h388
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_blas.c21615
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.c29861
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.f.patch32
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c2068
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c.patch18
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.c41864
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.f.patch32
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.c1651
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.f.patch48
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.c41691
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.f.patch32
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.c29996
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.f.patch32
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/fortran.py124
-rwxr-xr-xcontrib/python/numpy/py2/numpy/linalg/lapack_lite/make_lite.py343
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/python_xerbla.c48
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/lapack_lite/wrapped_routines51
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/setup.py60
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/tests/test_build.py55
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/tests/test_deprecations.py22
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/tests/test_linalg.py1964
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/tests/test_regression.py150
-rw-r--r--contrib/python/numpy/py2/numpy/linalg/umath_linalg.c.src3688
-rw-r--r--contrib/python/numpy/py2/numpy/ma/setup.py13
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_core.py5205
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_deprecations.py70
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_extras.py1678
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_mrecords.py495
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_old_ma.py860
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_regression.py89
-rw-r--r--contrib/python/numpy/py2/numpy/ma/tests/test_subclassing.py351
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/setup.py13
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_defmatrix.py460
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_interaction.py363
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_masked_matrix.py231
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_matrix_linalg.py95
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_multiarray.py18
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_numeric.py19
-rw-r--r--contrib/python/numpy/py2/numpy/matrixlib/tests/test_regression.py33
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/setup.py11
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_chebyshev.py621
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_classes.py642
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite.py557
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite_e.py558
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_laguerre.py539
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_legendre.py558
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_polynomial.py578
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_polyutils.py108
-rw-r--r--contrib/python/numpy/py2/numpy/polynomial/tests/test_printing.py68
-rw-r--r--contrib/python/numpy/py2/numpy/random/mtrand/generate_mtrand_c.py42
-rw-r--r--contrib/python/numpy/py2/numpy/random/mtrand/randint_helpers.pxi.in77
-rw-r--r--contrib/python/numpy/py2/numpy/random/setup.py63
-rw-r--r--contrib/python/numpy/py2/numpy/random/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/random/tests/test_random.py1663
-rw-r--r--contrib/python/numpy/py2/numpy/random/tests/test_regression.py157
-rw-r--r--contrib/python/numpy/py2/numpy/setup.py28
-rwxr-xr-xcontrib/python/numpy/py2/numpy/testing/setup.py21
-rw-r--r--contrib/python/numpy/py2/numpy/testing/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/testing/tests/test_decorators.py216
-rw-r--r--contrib/python/numpy/py2/numpy/testing/tests/test_doctesting.py59
-rw-r--r--contrib/python/numpy/py2/numpy/testing/tests/test_utils.py1597
-rw-r--r--contrib/python/numpy/py2/numpy/tests/__init__.py0
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_ctypeslib.py367
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_matlib.py68
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_numpy_version.py19
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_public_api.py89
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_reloading.py38
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_scripts.py49
-rw-r--r--contrib/python/numpy/py2/numpy/tests/test_warnings.py78
267 files changed, 293450 insertions, 0 deletions
diff --git a/contrib/python/numpy/py2/.dist-info/METADATA b/contrib/python/numpy/py2/.dist-info/METADATA
new file mode 100644
index 00000000000..40a5345b6f4
--- /dev/null
+++ b/contrib/python/numpy/py2/.dist-info/METADATA
@@ -0,0 +1,56 @@
+Metadata-Version: 2.1
+Name: numpy
+Version: 1.16.6
+Summary: NumPy is the fundamental package for array computing with Python.
+Home-page: https://www.numpy.org
+Author: Travis E. Oliphant et al.
+Maintainer: NumPy Developers
+Maintainer-email: [email protected]
+License: BSD
+Download-URL: https://pypi.python.org/pypi/numpy
+Project-URL: Bug Tracker, https://github.com/numpy/numpy/issues
+Project-URL: Source Code, https://github.com/numpy/numpy
+Platform: Windows
+Platform: Linux
+Platform: Solaris
+Platform: Mac OS-X
+Platform: Unix
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Topic :: Software Development
+Classifier: Topic :: Scientific/Engineering
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: Unix
+Classifier: Operating System :: MacOS
+Requires-Python: >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+
+It provides:
+
+- a powerful N-dimensional array object
+- sophisticated (broadcasting) functions
+- tools for integrating C/C++ and Fortran code
+- useful linear algebra, Fourier transform, and random number capabilities
+- and much more
+
+Besides its obvious scientific uses, NumPy can also be used as an efficient
+multi-dimensional container of generic data. Arbitrary data-types can be
+defined. This allows NumPy to seamlessly and speedily integrate with a wide
+variety of databases.
+
+All NumPy wheels distributed on PyPI are BSD licensed.
+
+
+
diff --git a/contrib/python/numpy/py2/.dist-info/entry_points.txt b/contrib/python/numpy/py2/.dist-info/entry_points.txt
new file mode 100644
index 00000000000..bddf93b180c
--- /dev/null
+++ b/contrib/python/numpy/py2/.dist-info/entry_points.txt
@@ -0,0 +1,5 @@
+[console_scripts]
+f2py = numpy.f2py.f2py2e:main
+f2py2 = numpy.f2py.f2py2e:main
+f2py2.7 = numpy.f2py.f2py2e:main
+
diff --git a/contrib/python/numpy/py2/.dist-info/top_level.txt b/contrib/python/numpy/py2/.dist-info/top_level.txt
new file mode 100644
index 00000000000..24ce15ab7ea
--- /dev/null
+++ b/contrib/python/numpy/py2/.dist-info/top_level.txt
@@ -0,0 +1 @@
+numpy
diff --git a/contrib/python/numpy/py2/numpy/_build_utils/__init__.py b/contrib/python/numpy/py2/numpy/_build_utils/__init__.py
new file mode 100644
index 00000000000..1d0f69b67d8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/_build_utils/__init__.py
@@ -0,0 +1 @@
+from __future__ import division, absolute_import, print_function
diff --git a/contrib/python/numpy/py2/numpy/_build_utils/apple_accelerate.py b/contrib/python/numpy/py2/numpy/_build_utils/apple_accelerate.py
new file mode 100644
index 00000000000..36dd7584a6e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/_build_utils/apple_accelerate.py
@@ -0,0 +1,28 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import re
+
+__all__ = ['uses_accelerate_framework', 'get_sgemv_fix']
+
+def uses_accelerate_framework(info):
+ """ Returns True if Accelerate framework is used for BLAS/LAPACK """
+ # If we're not building on Darwin (macOS), don't use Accelerate
+ if sys.platform != "darwin":
+ return False
+ # If we're building on macOS, but targeting a different platform,
+ # don't use Accelerate.
+ if os.getenv('_PYTHON_HOST_PLATFORM', None):
+ return False
+ r_accelerate = re.compile("Accelerate")
+ extra_link_args = info.get('extra_link_args', '')
+ for arg in extra_link_args:
+ if r_accelerate.search(arg):
+ return True
+ return False
+
+def get_sgemv_fix():
+ """ Returns source file needed to correct SGEMV """
+ path = os.path.abspath(os.path.dirname(__file__))
+ return [os.path.join(path, 'src', 'apple_sgemv_fix.c')]
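
A quick sketch (not part of the patch; the argument lists below are invented) of the
core test in uses_accelerate_framework above: on a darwin build host without a
cross-compile override, the decision reduces to whether any extra link argument
mentions "Accelerate".

import re

def links_against_accelerate(extra_link_args):
    # condensed version of the loop above: any link argument matching
    # "Accelerate" means the framework is being linked in
    return any(re.search("Accelerate", arg) for arg in extra_link_args)

print(links_against_accelerate(['-Wl,-framework', '-Wl,Accelerate']))  # True
print(links_against_accelerate(['-lopenblas']))                        # False
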
diff --git a/contrib/python/numpy/py2/numpy/_build_utils/src/apple_sgemv_fix.c b/contrib/python/numpy/py2/numpy/_build_utils/src/apple_sgemv_fix.c
new file mode 100644
index 00000000000..c33c689929a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/_build_utils/src/apple_sgemv_fix.c
@@ -0,0 +1,253 @@
+/* This is a collection of ugly hacks to circumvent a bug in
+ * Apple Accelerate framework's SGEMV subroutine.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ * SGEMV in Accelerate framework will segfault on MacOS X version 10.9
+ * (aka Mavericks) if arrays are not aligned to 32 byte boundaries
+ * and the CPU supports AVX instructions. This can produce segfaults
+ * in np.dot.
+ *
+ * This patch overshadows the symbols cblas_sgemv, sgemv_ and sgemv
+ * exported by Accelerate to produce the correct behavior. The MacOS X
+ * version and CPU specs are checked on module import. If Mavericks and
+ * AVX are detected the call to SGEMV is emulated with a call to SGEMM
+ * if the arrays are not 32 byte aligned. If the exported symbols cannot
+ * be overshadowed on module import, a fatal error is produced and the
+ * process aborts. All the fixes are in a self-contained C file
+ * and do not alter the multiarray C code. The patch is not applied
+ * unless NumPy is configured to link with Apple's Accelerate
+ * framework.
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <string.h>
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+ const int M, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb,
+ const float beta, float *C, const int incC);
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+ const float* alpha, const float* A, const int* ldA,
+ const float* X, const int* incX,
+ const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it. */
+static inline int
+cpu_supports_avx()
+{
+ int enabled, r;
+ size_t length = sizeof(enabled);
+ r = sysctlbyname("hw.optional.avx1_0", &enabled, &length, NULL, 0);
+ if ( r == 0 && enabled != 0) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+}
+
+/* Check if we are using MacOS X version 10.9 */
+static inline int
+using_mavericks()
+{
+ int r;
+ char str[32] = {0};
+ size_t size = sizeof(str);
+ r = sysctlbyname("kern.osproductversion", str, &size, NULL, 0);
+ if ( r == 0 && strncmp(str, "10.9", strlen("10.9")) == 0) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+}
+
+__attribute__((destructor))
+static void unloadlib(void)
+{
+ if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+ char errormsg[1024];
+ int AVX, MAVERICKS;
+ memset((void*)errormsg, 0, sizeof(errormsg));
+ /* check if the CPU supports AVX */
+ AVX = cpu_supports_avx();
+ /* check if the OS is MacOS X Mavericks */
+ MAVERICKS = using_mavericks();
+ /* we need the workaround when the CPU supports
+ * AVX and the OS version is Mavericks */
+ AVX_and_10_9 = AVX && MAVERICKS;
+ /* load vecLib */
+ veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+ if (!veclib) {
+ veclib = NULL;
+ snprintf(errormsg, sizeof(errormsg),
+ "Failed to open vecLib from location '%s'.", VECLIB_FILE);
+ Py_FatalError(errormsg); /* calls abort() and dumps core */
+ }
+ /* resolve Fortran SGEMV from Accelerate */
+ accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+ if (!accelerate_sgemv) {
+ unloadlib();
+ Py_FatalError("Failed to resolve symbol 'sgemv_'.");
+ }
+ /* resolve cblas_sgemv from Accelerate */
+ accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+ if (!accelerate_cblas_sgemv) {
+ unloadlib();
+ Py_FatalError("Failed to resolve symbol 'cblas_sgemv'.");
+ }
+ /* resolve cblas_sgemm from Accelerate */
+ accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+ if (!accelerate_cblas_sgemm) {
+ unloadlib();
+ Py_FatalError("Failed to resolve symbol 'cblas_sgemm'.");
+ }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+void sgemv_( const char* trans, const int* m, const int* n,
+ const float* alpha, const float* A, const int* ldA,
+ const float* X, const int* incX,
+ const float* beta, float* Y, const int* incY )
+{
+ /* It is safe to use the original SGEMV if we are not using AVX on Mavericks
+ * or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
+ #define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+ const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+ if (!use_sgemm) {
+ accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
+ return;
+ }
+
+ /* Arrays are misaligned, the CPU supports AVX, and we are running
+ * Mavericks.
+ *
+ * Emulation of SGEMV with SGEMM:
+ *
+ * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+ * contiguous along the leading dimension. To emulate striding in SGEMV
+ * with the leading dimension arguments in SGEMM we compute
+ *
+ * Y = alpha * op(A) @ X + beta * Y
+ *
+ * as
+ *
+ * Y.T = alpha * X.T @ op(A).T + beta * Y.T
+ *
+ * Because Fortran uses column major order and X.T and Y.T are row vectors,
+ * the leading dimensions of X.T and Y.T in SGEMM become equal to the
+ * strides of the column vectors X and Y in SGEMV. */
+
+ switch (*trans) {
+ case 'T':
+ case 't':
+ case 'C':
+ case 'c':
+ accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+ 1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+ break;
+ case 'N':
+ case 'n':
+ accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+ 1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+ break;
+ default:
+ cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+ }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+ const int *m, const int *n,
+ const float *alpha,
+ const float *A, const int *lda,
+ const float *B, const int *incB,
+ const float *beta,
+ float *C, const int *incC)
+{
+ sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY)
+{
+ char TA;
+ if (order == CblasColMajor)
+ {
+ if (TransA == CblasNoTrans) TA = 'N';
+ else if (TransA == CblasTrans) TA = 'T';
+ else if (TransA == CblasConjTrans) TA = 'C';
+ else
+ {
+ cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
+ }
+ sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+ }
+ else if (order == CblasRowMajor)
+ {
+ if (TransA == CblasNoTrans) TA = 'T';
+ else if (TransA == CblasTrans) TA = 'N';
+ else if (TransA == CblasConjTrans) TA = 'N';
+ else
+ {
+ cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+ return;
+ }
+ sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+ }
+ else
+ cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
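
The comment at the top of apple_sgemv_fix.c above describes the workaround in prose:
on OS X 10.9 with an AVX-capable CPU, Accelerate's SGEMV can fault when an operand is
not aligned to a 32-byte boundary, so misaligned calls are rerouted through SGEMM via
the transpose identity Y^T = alpha * X^T * op(A)^T + beta * Y^T. A small sketch of the
alignment test only (plain NumPy, not part of the patch; buffer sizes are arbitrary),
mirroring the BADARRAY macro:

import numpy as np

def badarray(a):
    # same test as BADARRAY(x) above: the data pointer is "bad" when it
    # is not a multiple of 32 bytes (the width of one AVX register)
    return a.ctypes.data % 32 != 0

buf = np.zeros(1025, dtype=np.float32)
view = buf[1:]   # starts 4 bytes into buf, so it is typically misaligned
print(badarray(buf), badarray(view))
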
diff --git a/contrib/python/numpy/py2/numpy/compat/setup.py b/contrib/python/numpy/py2/numpy/compat/setup.py
new file mode 100644
index 00000000000..882857428cd
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/compat/setup.py
@@ -0,0 +1,12 @@
+from __future__ import division, print_function
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+
+ config = Configuration('compat', parent_package, top_path)
+ config.add_data_dir('tests')
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/compat/tests/__init__.py b/contrib/python/numpy/py2/numpy/compat/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/compat/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/compat/tests/test_compat.py b/contrib/python/numpy/py2/numpy/compat/tests/test_compat.py
new file mode 100644
index 00000000000..9bb316a4de7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/compat/tests/test_compat.py
@@ -0,0 +1,26 @@
+from __future__ import division, absolute_import, print_function
+
+from os.path import join
+
+from numpy.compat import isfileobj, os_fspath
+from numpy.testing import assert_
+from numpy.testing import tempdir
+
+
+def test_isfileobj():
+ with tempdir(prefix="numpy_test_compat_") as folder:
+ filename = join(folder, 'a.bin')
+
+ with open(filename, 'wb') as f:
+ assert_(isfileobj(f))
+
+ with open(filename, 'ab') as f:
+ assert_(isfileobj(f))
+
+ with open(filename, 'rb') as f:
+ assert_(isfileobj(f))
+
+
+def test_os_fspath_strings():
+ for string_path in (b'/a/b/c.d', u'/a/b/c.d'):
+ assert_(os_fspath(string_path) == string_path)
diff --git a/contrib/python/numpy/py2/numpy/conftest.py b/contrib/python/numpy/py2/numpy/conftest.py
new file mode 100644
index 00000000000..7834dd39dff
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/conftest.py
@@ -0,0 +1,67 @@
+"""
+Pytest configuration and fixtures for the Numpy test suite.
+"""
+from __future__ import division, absolute_import, print_function
+
+import pytest
+import numpy
+
+from numpy.core._multiarray_tests import get_fpu_mode
+
+
+_old_fpu_mode = None
+_collect_results = {}
+
+
+def pytest_configure(config):
+ config.addinivalue_line("markers",
+ "valgrind_error: Tests that are known to error under valgrind.")
+ config.addinivalue_line("markers",
+ "slow: Tests that are very slow.")
+
+
+#FIXME when yield tests are gone.
+def pytest_itemcollected(item):
+ """
+ Check FPU precision mode was not changed during test collection.
+
+ The clumsy way we do it here is mainly necessary because numpy
+ still uses yield tests, which can execute code at test collection
+ time.
+ """
+ global _old_fpu_mode
+
+ mode = get_fpu_mode()
+
+ if _old_fpu_mode is None:
+ _old_fpu_mode = mode
+ elif mode != _old_fpu_mode:
+ _collect_results[item] = (_old_fpu_mode, mode)
+ _old_fpu_mode = mode
+
+
+@pytest.fixture(scope="function", autouse=True)
+def check_fpu_mode(request):
+ """
+ Check FPU precision mode was not changed during the test.
+ """
+ old_mode = get_fpu_mode()
+ yield
+ new_mode = get_fpu_mode()
+
+ if old_mode != new_mode:
+ raise AssertionError("FPU precision mode changed from {0:#x} to {1:#x}"
+ " during the test".format(old_mode, new_mode))
+
+ collect_result = _collect_results.get(request.node)
+ if collect_result is not None:
+ old_mode, new_mode = collect_result
+ raise AssertionError("FPU precision mode changed from {0:#x} to {1:#x}"
+ " when collecting the test".format(old_mode,
+ new_mode))
+
+
+@pytest.fixture(autouse=True)
+def add_np(doctest_namespace):
+ doctest_namespace['np'] = numpy
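
The check_fpu_mode fixture above is an instance of a general pytest pattern: an
autouse, function-scoped fixture snapshots a piece of global state, yields to the
test, and fails the test if the state changed. A minimal stand-alone sketch of the
pattern (illustration only, not part of the patch; get_state is a stand-in for
get_fpu_mode from numpy.core._multiarray_tests):

import pytest

_state = {"mode": 0}

def get_state():
    # stand-in for numpy.core._multiarray_tests.get_fpu_mode()
    return _state["mode"]

@pytest.fixture(autouse=True)
def check_state():
    before = get_state()
    yield                      # the test body runs here
    after = get_state()
    assert before == after, (
        "global state changed from %r to %r during the test" % (before, after))
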
diff --git a/contrib/python/numpy/py2/numpy/core/code_generators/__init__.py b/contrib/python/numpy/py2/numpy/core/code_generators/__init__.py
new file mode 100644
index 00000000000..1d0f69b67d8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/code_generators/__init__.py
@@ -0,0 +1 @@
+from __future__ import division, absolute_import, print_function
diff --git a/contrib/python/numpy/py2/numpy/core/code_generators/genapi.py b/contrib/python/numpy/py2/numpy/core/code_generators/genapi.py
new file mode 100644
index 00000000000..4aca2373c66
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/code_generators/genapi.py
@@ -0,0 +1,510 @@
+"""
+Get API information encoded in C files.
+
+See ``find_function`` for how functions should be formatted, and
+``read_order`` for how the order of the functions should be
+specified.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys, os, re
+import hashlib
+
+import textwrap
+
+from os.path import join
+
+__docformat__ = 'restructuredtext'
+
+# The files under src/ that are scanned for API functions
+API_FILES = [join('multiarray', 'alloc.c'),
+ join('multiarray', 'arrayfunction_override.c'),
+ join('multiarray', 'array_assign_array.c'),
+ join('multiarray', 'array_assign_scalar.c'),
+ join('multiarray', 'arrayobject.c'),
+ join('multiarray', 'arraytypes.c.src'),
+ join('multiarray', 'buffer.c'),
+ join('multiarray', 'calculation.c'),
+ join('multiarray', 'conversion_utils.c'),
+ join('multiarray', 'convert.c'),
+ join('multiarray', 'convert_datatype.c'),
+ join('multiarray', 'ctors.c'),
+ join('multiarray', 'datetime.c'),
+ join('multiarray', 'datetime_busday.c'),
+ join('multiarray', 'datetime_busdaycal.c'),
+ join('multiarray', 'datetime_strings.c'),
+ join('multiarray', 'descriptor.c'),
+ join('multiarray', 'einsum.c.src'),
+ join('multiarray', 'flagsobject.c'),
+ join('multiarray', 'getset.c'),
+ join('multiarray', 'item_selection.c'),
+ join('multiarray', 'iterators.c'),
+ join('multiarray', 'mapping.c'),
+ join('multiarray', 'methods.c'),
+ join('multiarray', 'multiarraymodule.c'),
+ join('multiarray', 'nditer_api.c'),
+ join('multiarray', 'nditer_constr.c'),
+ join('multiarray', 'nditer_pywrap.c'),
+ join('multiarray', 'nditer_templ.c.src'),
+ join('multiarray', 'number.c'),
+ join('multiarray', 'refcount.c'),
+ join('multiarray', 'scalartypes.c.src'),
+ join('multiarray', 'scalarapi.c'),
+ join('multiarray', 'sequence.c'),
+ join('multiarray', 'shape.c'),
+ join('multiarray', 'strfuncs.c'),
+ join('multiarray', 'usertypes.c'),
+ join('umath', 'loops.c.src'),
+ join('umath', 'ufunc_object.c'),
+ join('umath', 'ufunc_type_resolution.c'),
+ join('umath', 'reduction.c'),
+ ]
+THIS_DIR = os.path.dirname(__file__)
+API_FILES = [os.path.join(THIS_DIR, '..', 'src', a) for a in API_FILES]
+
+def file_in_this_dir(filename):
+ return os.path.join(THIS_DIR, filename)
+
+def remove_whitespace(s):
+ return ''.join(s.split())
+
+def _repl(str):
+ return str.replace('Bool', 'npy_bool')
+
+
+class StealRef(object):
+ def __init__(self, arg):
+ self.arg = arg # counting from 1
+
+ def __str__(self):
+ try:
+ return ' '.join('NPY_STEALS_REF_TO_ARG(%d)' % x for x in self.arg)
+ except TypeError:
+ return 'NPY_STEALS_REF_TO_ARG(%d)' % self.arg
+
+
+class NonNull(object):
+ def __init__(self, arg):
+ self.arg = arg # counting from 1
+
+ def __str__(self):
+ try:
+ return ' '.join('NPY_GCC_NONNULL(%d)' % x for x in self.arg)
+ except TypeError:
+ return 'NPY_GCC_NONNULL(%d)' % self.arg
+
+
+class Function(object):
+ def __init__(self, name, return_type, args, doc=''):
+ self.name = name
+ self.return_type = _repl(return_type)
+ self.args = args
+ self.doc = doc
+
+ def _format_arg(self, typename, name):
+ if typename.endswith('*'):
+ return typename + name
+ else:
+ return typename + ' ' + name
+
+ def __str__(self):
+ argstr = ', '.join([self._format_arg(*a) for a in self.args])
+ if self.doc:
+ doccomment = '/* %s */\n' % self.doc
+ else:
+ doccomment = ''
+ return '%s%s %s(%s)' % (doccomment, self.return_type, self.name, argstr)
+
+ def to_ReST(self):
+ lines = ['::', '', ' ' + self.return_type]
+ argstr = ',\000'.join([self._format_arg(*a) for a in self.args])
+ name = ' %s' % (self.name,)
+ s = textwrap.wrap('(%s)' % (argstr,), width=72,
+ initial_indent=name,
+ subsequent_indent=' ' * (len(name)+1),
+ break_long_words=False)
+ for l in s:
+ lines.append(l.replace('\000', ' ').rstrip())
+ lines.append('')
+ if self.doc:
+ lines.append(textwrap.dedent(self.doc))
+ return '\n'.join(lines)
+
+ def api_hash(self):
+ m = hashlib.md5()
+ m.update(remove_whitespace(self.return_type))
+ m.update('\000')
+ m.update(self.name)
+ m.update('\000')
+ for typename, name in self.args:
+ m.update(remove_whitespace(typename))
+ m.update('\000')
+ return m.hexdigest()[:8]
+
+class ParseError(Exception):
+ def __init__(self, filename, lineno, msg):
+ self.filename = filename
+ self.lineno = lineno
+ self.msg = msg
+
+ def __str__(self):
+ return '%s:%s:%s' % (self.filename, self.lineno, self.msg)
+
+def skip_brackets(s, lbrac, rbrac):
+ count = 0
+ for i, c in enumerate(s):
+ if c == lbrac:
+ count += 1
+ elif c == rbrac:
+ count -= 1
+ if count == 0:
+ return i
+ raise ValueError("no match '%s' for '%s' (%r)" % (lbrac, rbrac, s))
+
+def split_arguments(argstr):
+ arguments = []
+ current_argument = []
+ i = 0
+ def finish_arg():
+ if current_argument:
+ argstr = ''.join(current_argument).strip()
+ m = re.match(r'(.*(\s+|[*]))(\w+)$', argstr)
+ if m:
+ typename = m.group(1).strip()
+ name = m.group(3)
+ else:
+ typename = argstr
+ name = ''
+ arguments.append((typename, name))
+ del current_argument[:]
+ while i < len(argstr):
+ c = argstr[i]
+ if c == ',':
+ finish_arg()
+ elif c == '(':
+ p = skip_brackets(argstr[i:], '(', ')')
+ current_argument += argstr[i:i+p]
+ i += p-1
+ else:
+ current_argument += c
+ i += 1
+ finish_arg()
+ return arguments
+
+
+def find_functions(filename, tag='API'):
+ """
+ Scan the file, looking for tagged functions.
+
+ Assuming ``tag=='API'``, a tagged function looks like::
+
+ /*API*/
+ static returntype*
+ function_name(argtype1 arg1, argtype2 arg2)
+ {
+ }
+
+ where the return type must be on a separate line, the function
+ name must start the line, and the opening ``{`` must start the line.
+
+ An optional documentation comment in ReST format may follow the tag,
+ as in::
+
+ /*API
+ This function does foo...
+ */
+ """
+ fo = open(filename, 'r')
+ functions = []
+ return_type = None
+ function_name = None
+ function_args = []
+ doclist = []
+ SCANNING, STATE_DOC, STATE_RETTYPE, STATE_NAME, STATE_ARGS = list(range(5))
+ state = SCANNING
+ tagcomment = '/*' + tag
+ for lineno, line in enumerate(fo):
+ try:
+ line = line.strip()
+ if state == SCANNING:
+ if line.startswith(tagcomment):
+ if line.endswith('*/'):
+ state = STATE_RETTYPE
+ else:
+ state = STATE_DOC
+ elif state == STATE_DOC:
+ if line.startswith('*/'):
+ state = STATE_RETTYPE
+ else:
+ line = line.lstrip(' *')
+ doclist.append(line)
+ elif state == STATE_RETTYPE:
+ # first line of declaration with return type
+ m = re.match(r'NPY_NO_EXPORT\s+(.*)$', line)
+ if m:
+ line = m.group(1)
+ return_type = line
+ state = STATE_NAME
+ elif state == STATE_NAME:
+ # second line, with function name
+ m = re.match(r'(\w+)\s*\(', line)
+ if m:
+ function_name = m.group(1)
+ else:
+ raise ParseError(filename, lineno+1,
+ 'could not find function name')
+ function_args.append(line[m.end():])
+ state = STATE_ARGS
+ elif state == STATE_ARGS:
+ if line.startswith('{'):
+ # finished
+ fargs_str = ' '.join(function_args).rstrip(' )')
+ fargs = split_arguments(fargs_str)
+ f = Function(function_name, return_type, fargs,
+ '\n'.join(doclist))
+ functions.append(f)
+ return_type = None
+ function_name = None
+ function_args = []
+ doclist = []
+ state = SCANNING
+ else:
+ function_args.append(line)
+ except Exception:
+ print(filename, lineno + 1)
+ raise
+ fo.close()
+ return functions
+
+def should_rebuild(targets, source_files):
+ from distutils.dep_util import newer_group
+ for t in targets:
+ if not os.path.exists(t):
+ return True
+ sources = API_FILES + list(source_files) + [__file__]
+ if newer_group(sources, targets[0], missing='newer'):
+ return True
+ return False
+
+def write_file(filename, data):
+ """
+ Write data to filename
+ Only write changed data to avoid updating timestamps unnecessarily
+ """
+ if os.path.exists(filename):
+ with open(filename) as f:
+ if data == f.read():
+ return
+
+ with open(filename, 'w') as fid:
+ fid.write(data)
+
+
+# Those *Api classes instances know how to output strings for the generated code
+class TypeApi(object):
+ def __init__(self, name, index, ptr_cast, api_name):
+ self.index = index
+ self.name = name
+ self.ptr_cast = ptr_cast
+ self.api_name = api_name
+
+ def define_from_array_api_string(self):
+ return "#define %s (*(%s *)%s[%d])" % (self.name,
+ self.ptr_cast,
+ self.api_name,
+ self.index)
+
+ def array_api_define(self):
+ return " (void *) &%s" % self.name
+
+ def internal_define(self):
+ astr = """\
+extern NPY_NO_EXPORT PyTypeObject %(type)s;
+""" % {'type': self.name}
+ return astr
+
+class GlobalVarApi(object):
+ def __init__(self, name, index, type, api_name):
+ self.name = name
+ self.index = index
+ self.type = type
+ self.api_name = api_name
+
+ def define_from_array_api_string(self):
+ return "#define %s (*(%s *)%s[%d])" % (self.name,
+ self.type,
+ self.api_name,
+ self.index)
+
+ def array_api_define(self):
+ return " (%s *) &%s" % (self.type, self.name)
+
+ def internal_define(self):
+ astr = """\
+extern NPY_NO_EXPORT %(type)s %(name)s;
+""" % {'type': self.type, 'name': self.name}
+ return astr
+
+# Dummy to be able to consistently use *Api instances for all items in the
+# array api
+class BoolValuesApi(object):
+ def __init__(self, name, index, api_name):
+ self.name = name
+ self.index = index
+ self.type = 'PyBoolScalarObject'
+ self.api_name = api_name
+
+ def define_from_array_api_string(self):
+ return "#define %s ((%s *)%s[%d])" % (self.name,
+ self.type,
+ self.api_name,
+ self.index)
+
+ def array_api_define(self):
+ return " (void *) &%s" % self.name
+
+ def internal_define(self):
+ astr = """\
+extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2];
+"""
+ return astr
+
+class FunctionApi(object):
+ def __init__(self, name, index, annotations, return_type, args, api_name):
+ self.name = name
+ self.index = index
+ self.annotations = annotations
+ self.return_type = return_type
+ self.args = args
+ self.api_name = api_name
+
+ def _argtypes_string(self):
+ if not self.args:
+ return 'void'
+ argstr = ', '.join([_repl(a[0]) for a in self.args])
+ return argstr
+
+ def define_from_array_api_string(self):
+ define = """\
+#define %s \\\n (*(%s (*)(%s)) \\
+ %s[%d])""" % (self.name,
+ self.return_type,
+ self._argtypes_string(),
+ self.api_name,
+ self.index)
+ return define
+
+ def array_api_define(self):
+ return " (void *) %s" % self.name
+
+ def internal_define(self):
+ annstr = [str(a) for a in self.annotations]
+ annstr = ' '.join(annstr)
+ astr = """\
+NPY_NO_EXPORT %s %s %s \\\n (%s);""" % (annstr, self.return_type,
+ self.name,
+ self._argtypes_string())
+ return astr
+
+def order_dict(d):
+ """Order dict by its values."""
+ o = list(d.items())
+ def _key(x):
+ return x[1] + (x[0],)
+ return sorted(o, key=_key)
+
+def merge_api_dicts(dicts):
+ ret = {}
+ for d in dicts:
+ for k, v in d.items():
+ ret[k] = v
+
+ return ret
+
+def check_api_dict(d):
+ """Check that an api dict is valid (does not use the same index twice)."""
+ # remove the extra value fields that aren't the index
+ index_d = {k: v[0] for k, v in d.items()}
+
+ # We have if a same index is used twice: we 'revert' the dict so that index
+ # become keys. If the length is different, it means one index has been used
+ # at least twice
+ revert_dict = {v: k for k, v in index_d.items()}
+ if not len(revert_dict) == len(index_d):
+ # We compute a dict index -> list of associated items
+ doubled = {}
+ for name, index in index_d.items():
+ try:
+ doubled[index].append(name)
+ except KeyError:
+ doubled[index] = [name]
+ fmt = "Same index has been used twice in api definition: {}"
+ val = ''.join(
+ '\n\tindex {} -> {}'.format(index, names)
+ for index, names in doubled.items() if len(names) != 1
+ )
+ raise ValueError(fmt.format(val))
+
+ # No 'hole' in the indexes may be allowed, and it must starts at 0
+ indexes = set(index_d.values())
+ expected = set(range(len(indexes)))
+ if indexes != expected:
+ diff = expected.symmetric_difference(indexes)
+ msg = "There are some holes in the API indexing: " \
+ "(symmetric diff is %s)" % diff
+ raise ValueError(msg)
+
+def get_api_functions(tagname, api_dict):
+ """Parse source files to get functions tagged by the given tag."""
+ functions = []
+ for f in API_FILES:
+ functions.extend(find_functions(f, tagname))
+ dfunctions = [(api_dict[func.name][0], func) for func in functions]
+ dfunctions.sort()
+ return [a[1] for a in dfunctions]
+
+def fullapi_hash(api_dicts):
+ """Given a list of api dicts defining the numpy C API, compute a checksum
+ of the list of items in the API (as a string)."""
+ a = []
+ for d in api_dicts:
+ for name, data in order_dict(d):
+ a.extend(name)
+ a.extend(','.join(map(str, data)))
+
+ return hashlib.md5(''.join(a).encode('ascii')).hexdigest()
+
+# To parse strings like 'hex = checksum' where hex is e.g. 0x1234567F and
+# checksum a 128 bits md5 checksum (hex format as well)
+VERRE = re.compile(r'(^0x[\da-f]{8})\s*=\s*([\da-f]{32})')
+
+def get_versions_hash():
+ d = []
+
+ file = os.path.join(os.path.dirname(__file__), 'cversions.txt')
+ fid = open(file, 'r')
+ try:
+ for line in fid:
+ m = VERRE.match(line)
+ if m:
+ d.append((int(m.group(1), 16), m.group(2)))
+ finally:
+ fid.close()
+
+ return dict(d)
+
+def main():
+ tagname = sys.argv[1]
+ order_file = sys.argv[2]
+ functions = get_api_functions(tagname, order_file)
+ m = hashlib.md5(tagname)
+ for func in functions:
+ print(func)
+ ah = func.api_hash()
+ m.update(ah)
+ print(hex(int(ah, 16)))
+ print(hex(int(m.hexdigest()[:8], 16)))
+
+if __name__ == '__main__':
+ main()
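
check_api_dict above enforces two invariants on the name -> (index, ...) tables
defined in numpy_api.py: no index may be used twice, and the indices must form a
contiguous range starting at 0. A condensed stand-alone version of that validation
(illustration only, not part of the patch; the example entries are made up):

def check_indices(api_dict):
    indices = [entry[0] for entry in api_dict.values()]
    if len(set(indices)) != len(indices):
        raise ValueError("same index used twice in api definition")
    if set(indices) != set(range(len(indices))):
        raise ValueError("there are holes in the API indexing")

check_indices({'FuncA': (0,), 'FuncB': (1,), 'FuncC': (2,)})    # passes
# check_indices({'FuncA': (0,), 'FuncB': (0,)})                 # duplicate -> ValueError
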
diff --git a/contrib/python/numpy/py2/numpy/core/code_generators/generate_ufunc_api.py b/contrib/python/numpy/py2/numpy/core/code_generators/generate_ufunc_api.py
new file mode 100644
index 00000000000..1b0143e88be
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/code_generators/generate_ufunc_api.py
@@ -0,0 +1,211 @@
+from __future__ import division, print_function
+
+import os
+import genapi
+
+import numpy_api
+
+from genapi import \
+ TypeApi, GlobalVarApi, FunctionApi, BoolValuesApi
+
+h_template = r"""
+#ifdef _UMATHMODULE
+
+extern NPY_NO_EXPORT PyTypeObject PyUFunc_Type;
+
+%s
+
+#else
+
+#if defined(PY_UFUNC_UNIQUE_SYMBOL)
+#define PyUFunc_API PY_UFUNC_UNIQUE_SYMBOL
+#endif
+
+#if defined(NO_IMPORT) || defined(NO_IMPORT_UFUNC)
+extern void **PyUFunc_API;
+#else
+#if defined(PY_UFUNC_UNIQUE_SYMBOL)
+void **PyUFunc_API;
+#else
+static void **PyUFunc_API=NULL;
+#endif
+#endif
+
+%s
+
+static NPY_INLINE int
+_import_umath(void)
+{
+ PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath");
+ PyObject *c_api = NULL;
+
+ if (numpy == NULL) {
+ PyErr_SetString(PyExc_ImportError,
+ "numpy.core._multiarray_umath failed to import");
+ return -1;
+ }
+ c_api = PyObject_GetAttrString(numpy, "_UFUNC_API");
+ Py_DECREF(numpy);
+ if (c_api == NULL) {
+ PyErr_SetString(PyExc_AttributeError, "_UFUNC_API not found");
+ return -1;
+ }
+
+#if PY_VERSION_HEX >= 0x03000000
+ if (!PyCapsule_CheckExact(c_api)) {
+ PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCapsule object");
+ Py_DECREF(c_api);
+ return -1;
+ }
+ PyUFunc_API = (void **)PyCapsule_GetPointer(c_api, NULL);
+#else
+ if (!PyCObject_Check(c_api)) {
+ PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCObject object");
+ Py_DECREF(c_api);
+ return -1;
+ }
+ PyUFunc_API = (void **)PyCObject_AsVoidPtr(c_api);
+#endif
+ Py_DECREF(c_api);
+ if (PyUFunc_API == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is NULL pointer");
+ return -1;
+ }
+ return 0;
+}
+
+#if PY_VERSION_HEX >= 0x03000000
+#define NUMPY_IMPORT_UMATH_RETVAL NULL
+#else
+#define NUMPY_IMPORT_UMATH_RETVAL
+#endif
+
+#define import_umath() \
+ do {\
+ UFUNC_NOFPE\
+ if (_import_umath() < 0) {\
+ PyErr_Print();\
+ PyErr_SetString(PyExc_ImportError,\
+ "numpy.core.umath failed to import");\
+ return NUMPY_IMPORT_UMATH_RETVAL;\
+ }\
+ } while(0)
+
+#define import_umath1(ret) \
+ do {\
+ UFUNC_NOFPE\
+ if (_import_umath() < 0) {\
+ PyErr_Print();\
+ PyErr_SetString(PyExc_ImportError,\
+ "numpy.core.umath failed to import");\
+ return ret;\
+ }\
+ } while(0)
+
+#define import_umath2(ret, msg) \
+ do {\
+ UFUNC_NOFPE\
+ if (_import_umath() < 0) {\
+ PyErr_Print();\
+ PyErr_SetString(PyExc_ImportError, msg);\
+ return ret;\
+ }\
+ } while(0)
+
+#define import_ufunc() \
+ do {\
+ UFUNC_NOFPE\
+ if (_import_umath() < 0) {\
+ PyErr_Print();\
+ PyErr_SetString(PyExc_ImportError,\
+ "numpy.core.umath failed to import");\
+ }\
+ } while(0)
+
+#endif
+"""
+
+c_template = r"""
+/* These pointers will be stored in the C-object for use in other
+ extension modules
+*/
+
+void *PyUFunc_API[] = {
+%s
+};
+"""
+
+def generate_api(output_dir, force=False):
+ basename = 'ufunc_api'
+
+ h_file = os.path.join(output_dir, '__%s.h' % basename)
+ c_file = os.path.join(output_dir, '__%s.c' % basename)
+ d_file = os.path.join(output_dir, '%s.txt' % basename)
+ targets = (h_file, c_file, d_file)
+
+ sources = ['ufunc_api_order.txt']
+
+ if (not force and not genapi.should_rebuild(targets, sources + [__file__])):
+ return targets
+ else:
+ do_generate_api(targets, sources)
+
+ return targets
+
+def do_generate_api(targets, sources):
+ header_file = targets[0]
+ c_file = targets[1]
+ doc_file = targets[2]
+
+ ufunc_api_index = genapi.merge_api_dicts((
+ numpy_api.ufunc_funcs_api,
+ numpy_api.ufunc_types_api))
+ genapi.check_api_dict(ufunc_api_index)
+
+ ufunc_api_list = genapi.get_api_functions('UFUNC_API', numpy_api.ufunc_funcs_api)
+
+ # Create dict name -> *Api instance
+ ufunc_api_dict = {}
+ api_name = 'PyUFunc_API'
+ for f in ufunc_api_list:
+ name = f.name
+ index = ufunc_api_index[name][0]
+ annotations = ufunc_api_index[name][1:]
+ ufunc_api_dict[name] = FunctionApi(f.name, index, annotations,
+ f.return_type, f.args, api_name)
+
+ for name, val in numpy_api.ufunc_types_api.items():
+ index = val[0]
+ ufunc_api_dict[name] = TypeApi(name, index, 'PyTypeObject', api_name)
+
+ # set up object API
+ module_list = []
+ extension_list = []
+ init_list = []
+
+ for name, index in genapi.order_dict(ufunc_api_index):
+ api_item = ufunc_api_dict[name]
+ extension_list.append(api_item.define_from_array_api_string())
+ init_list.append(api_item.array_api_define())
+ module_list.append(api_item.internal_define())
+
+ # Write to header
+ s = h_template % ('\n'.join(module_list), '\n'.join(extension_list))
+ genapi.write_file(header_file, s)
+
+ # Write to c-code
+ s = c_template % ',\n'.join(init_list)
+ genapi.write_file(c_file, s)
+
+ # Write to documentation
+ s = '''
+=================
+NumPy Ufunc C-API
+=================
+'''
+ for func in ufunc_api_list:
+ s += func.to_ReST()
+ s += '\n\n'
+ genapi.write_file(doc_file, s)
+
+ return targets
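
For each tagged function, do_generate_api above emits a macro that casts one slot of
the exported PyUFunc_API pointer table back to the proper function-pointer type, so
extension modules can call the C API through the table filled in by import_umath().
A sketch of the kind of string the FunctionApi.define_from_array_api_string template
produces (illustration only, not part of the patch; the function name, signature and
slot number are invented):

name = 'PyUFunc_ExampleFromOtherModule'
return_type = 'int'
argtypes = 'PyUFuncObject *, void *'
api_name = 'PyUFunc_API'
index = 7

define = "#define %s \\\n        (*(%s (*)(%s)) \\\n         %s[%d])" % (
    name, return_type, argtypes, api_name, index)
print(define)
# -> a three-line #define that dereferences PyUFunc_API[7] with the right cast
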
diff --git a/contrib/python/numpy/py2/numpy/core/code_generators/generate_umath.py b/contrib/python/numpy/py2/numpy/core/code_generators/generate_umath.py
new file mode 100644
index 00000000000..daf5949d062
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/code_generators/generate_umath.py
@@ -0,0 +1,1145 @@
+from __future__ import division, print_function
+
+import os
+import re
+import struct
+import sys
+import textwrap
+
+sys.path.insert(0, os.path.dirname(__file__))
+import ufunc_docstrings as docstrings
+sys.path.pop(0)
+
+Zero = "PyInt_FromLong(0)"
+One = "PyInt_FromLong(1)"
+True_ = "(Py_INCREF(Py_True), Py_True)"
+False_ = "(Py_INCREF(Py_False), Py_False)"
+None_ = object()
+AllOnes = "PyInt_FromLong(-1)"
+MinusInfinity = 'PyFloat_FromDouble(-NPY_INFINITY)'
+ReorderableNone = "(Py_INCREF(Py_None), Py_None)"
+
+# Sentinel value to specify using the full type description in the
+# function name
+class FullTypeDescr(object):
+ pass
+
+class FuncNameSuffix(object):
+ """Stores the suffix to append when generating functions names.
+ """
+ def __init__(self, suffix):
+ self.suffix = suffix
+
+class TypeDescription(object):
+ """Type signature for a ufunc.
+
+ Attributes
+ ----------
+ type : str
+ Character representing the nominal type.
+ func_data : str or None or FullTypeDescr or FuncNameSuffix, optional
+ The string representing the expression to insert into the data
+ array, if any.
+ in_ : str or None, optional
+ The typecode(s) of the inputs.
+ out : str or None, optional
+ The typecode(s) of the outputs.
+ astype : dict or None, optional
+ If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
+ instead of PyUFunc_x_x/PyUFunc_xx_x.
+    simd : list
+        Available SIMD ufunc loops, dispatched at runtime in the specified
+        order. Currently only supported for simple types (see make_arrays).
+ """
+ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
+ self.type = type
+ self.func_data = f
+ if astype is None:
+ astype = {}
+ self.astype_dict = astype
+ if in_ is not None:
+ in_ = in_.replace('P', type)
+ self.in_ = in_
+ if out is not None:
+ out = out.replace('P', type)
+ self.out = out
+ self.simd = simd
+
+ def finish_signature(self, nin, nout):
+ if self.in_ is None:
+ self.in_ = self.type * nin
+ assert len(self.in_) == nin
+ if self.out is None:
+ self.out = self.type * nout
+ assert len(self.out) == nout
+ self.astype = self.astype_dict.get(self.type, None)
+
+_fdata_map = dict(e='npy_%sf', f='npy_%sf', d='npy_%s', g='npy_%sl',
+ F='nc_%sf', D='nc_%s', G='nc_%sl')
+def build_func_data(types, f):
+ func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
+ return func_data
+
+def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
+ if f is not None:
+ if isinstance(f, str):
+ func_data = build_func_data(types, f)
+ elif len(f) != len(types):
+ raise ValueError("Number of types and f do not match")
+ else:
+ func_data = f
+ else:
+ func_data = (None,) * len(types)
+ if isinstance(in_, str):
+ in_ = (in_,) * len(types)
+ elif in_ is None:
+ in_ = (None,) * len(types)
+ elif len(in_) != len(types):
+ raise ValueError("Number of types and inputs do not match")
+ if isinstance(out, str):
+ out = (out,) * len(types)
+ elif out is None:
+ out = (None,) * len(types)
+ elif len(out) != len(types):
+ raise ValueError("Number of types and outputs do not match")
+ tds = []
+ for t, fd, i, o in zip(types, func_data, in_, out):
+ # [(simd-name, list of types)]
+ if simd is not None:
+ simdt = [k for k, v in simd if t in v]
+ else:
+ simdt = []
+ tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
+ return tds
+
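+# Annotation (illustrative, not upstream code): TD expands a string of type
+# characters into one TypeDescription per character, filling in the C function
+# name from _fdata_map, e.g.
+#
+#     >>> [(t.type, t.func_data) for t in TD('fd', f='sqrt')]
+#     [('f', 'npy_sqrtf'), ('d', 'npy_sqrt')]
+#
+# The optional astype, in_, out and simd arguments are distributed over the
+# per-type entries in the same way.
+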
+class Ufunc(object):
+ """Description of a ufunc.
+
+ Attributes
+ ----------
+ nin : number of input arguments
+ nout : number of output arguments
+ identity : identity element for a two-argument function
+ docstring : docstring for the ufunc
+ type_descriptions : list of TypeDescription objects
+ """
+ def __init__(self, nin, nout, identity, docstring, typereso,
+ *type_descriptions, **kwargs):
+ self.nin = nin
+ self.nout = nout
+ if identity is None:
+ identity = None_
+ self.identity = identity
+ self.docstring = docstring
+ self.typereso = typereso
+ self.type_descriptions = []
+ self.signature = kwargs.pop('signature', None)
+ for td in type_descriptions:
+ self.type_descriptions.extend(td)
+ for td in self.type_descriptions:
+ td.finish_signature(self.nin, self.nout)
+ if kwargs:
+ raise ValueError('unknown kwargs %r' % str(kwargs))
+
+# String-handling utilities to avoid locale-dependence.
+
+import string
+if sys.version_info[0] < 3:
+ UPPER_TABLE = string.maketrans(string.ascii_lowercase,
+ string.ascii_uppercase)
+else:
+ UPPER_TABLE = bytes.maketrans(bytes(string.ascii_lowercase, "ascii"),
+ bytes(string.ascii_uppercase, "ascii"))
+
+def english_upper(s):
+ """ Apply English case rules to convert ASCII strings to all upper case.
+
+ This is an internal utility function to replace calls to str.upper() such
+ that we can avoid changing behavior with changing locales. In particular,
+ Turkish has distinct dotted and dotless variants of the Latin letter "I" in
+ both lowercase and uppercase. Thus, "i".upper() != "I" in a "tr" locale.
+
+ Parameters
+ ----------
+ s : str
+
+ Returns
+ -------
+ uppered : str
+
+ Examples
+ --------
+ >>> from numpy.lib.utils import english_upper
+ >>> s = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_'
+ >>> english_upper(s)
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
+ >>> english_upper('')
+ ''
+ """
+ uppered = s.translate(UPPER_TABLE)
+ return uppered
+
+
+# Each entry in defdict is a Ufunc object.
+
+#name: [string of chars for which it is defined,
+# string of characters using func interface,
+# tuple of strings giving funcs for data,
+# (in, out), or (instr, outstr) giving the signature as character codes,
+# identity,
+# docstring,
+# output specification (optional)
+# ]
+
+chartoname = {'?': 'bool',
+ 'b': 'byte',
+ 'B': 'ubyte',
+ 'h': 'short',
+ 'H': 'ushort',
+ 'i': 'int',
+ 'I': 'uint',
+ 'l': 'long',
+ 'L': 'ulong',
+ 'q': 'longlong',
+ 'Q': 'ulonglong',
+ 'e': 'half',
+ 'f': 'float',
+ 'd': 'double',
+ 'g': 'longdouble',
+ 'F': 'cfloat',
+ 'D': 'cdouble',
+ 'G': 'clongdouble',
+ 'M': 'datetime',
+ 'm': 'timedelta',
+ 'O': 'OBJECT',
+              # 'P' is like 'O', but calls a method of the object instead
+              # of a function
+ 'P': 'OBJECT',
+ }
+
+all = '?bBhHiIlLqQefdgFDGOMm'
+O = 'O'
+P = 'P'
+ints = 'bBhHiIlLqQ'
+times = 'Mm'
+timedeltaonly = 'm'
+intsO = ints + O
+bints = '?' + ints
+bintsO = bints + O
+flts = 'efdg'
+fltsO = flts + O
+fltsP = flts + P
+cmplx = 'FDG'
+cmplxO = cmplx + O
+cmplxP = cmplx + P
+inexact = flts + cmplx
+inexactvec = 'fd'
+noint = inexact+O
+nointP = inexact+P
+allP = bints+times+flts+cmplxP
+nobool = all[1:]
+noobj = all[:-3]+all[-2:]
+nobool_or_obj = all[1:-3]+all[-2:]
+nobool_or_datetime = all[1:-2]+all[-1:]
+intflt = ints+flts
+intfltcmplx = ints+flts+cmplx
+nocmplx = bints+times+flts
+nocmplxO = nocmplx+O
+nocmplxP = nocmplx+P
+notimes_or_obj = bints + inexact
+nodatetime_or_obj = bints + inexact
+
+# Find which code corresponds to int64.
+int64 = ''
+uint64 = ''
+for code in 'bhilq':
+ if struct.calcsize(code) == 8:
+ int64 = code
+ uint64 = english_upper(code)
+ break
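+# For example, on a typical 64-bit Linux (LP64) build struct.calcsize('l') is
+# 8, so int64 = 'l' and uint64 = 'L'; on Windows (LLP64) long is 4 bytes and
+# the loop only matches at 'q'/'Q'.  (Annotation, not upstream code.)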
+
+# This dictionary describes all the ufunc implementations, generating
+# all the function names and their corresponding ufunc signatures. TD is
+# an object which expands a list of character codes into an array of
+# TypeDescriptions.
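+#
+# Annotation (illustrative, not upstream code): for the 'add' entry below,
+# make_arrays() further down emits roughly
+#
+#     static PyUFuncGenericFunction add_functions[] = {BOOL_add, BYTE_add,
+#         ..., DATETIME_Mm_M_add, ..., NULL};
+#     static void *add_data[] = {(void *)NULL, ..., (void *)NULL};
+#     static char add_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, ...};
+#
+# and make_ufuncs() emits the PyUFunc_FromFuncAndDataAndSignatureAndIdentity
+# call that registers the ufunc in the module dictionary.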
+defdict = {
+'add':
+ Ufunc(2, 1, Zero,
+ docstrings.get('numpy.core.umath.add'),
+ 'PyUFunc_AdditionTypeResolver',
+ TD(notimes_or_obj, simd=[('avx2', ints)]),
+ [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
+ TypeDescription('m', FullTypeDescr, 'mm', 'm'),
+ TypeDescription('M', FullTypeDescr, 'mM', 'M'),
+ ],
+ TD(O, f='PyNumber_Add'),
+ ),
+'subtract':
+ Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
+ docstrings.get('numpy.core.umath.subtract'),
+ 'PyUFunc_SubtractionTypeResolver',
+ TD(notimes_or_obj, simd=[('avx2', ints)]),
+ [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
+ TypeDescription('m', FullTypeDescr, 'mm', 'm'),
+ TypeDescription('M', FullTypeDescr, 'MM', 'm'),
+ ],
+ TD(O, f='PyNumber_Subtract'),
+ ),
+'multiply':
+ Ufunc(2, 1, One,
+ docstrings.get('numpy.core.umath.multiply'),
+ 'PyUFunc_MultiplicationTypeResolver',
+ TD(notimes_or_obj, simd=[('avx2', ints)]),
+ [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
+ TypeDescription('m', FullTypeDescr, 'qm', 'm'),
+ TypeDescription('m', FullTypeDescr, 'md', 'm'),
+ TypeDescription('m', FullTypeDescr, 'dm', 'm'),
+ ],
+ TD(O, f='PyNumber_Multiply'),
+ ),
+'divide':
+ Ufunc(2, 1, None, # One is only a unit to the right, not the left
+ docstrings.get('numpy.core.umath.divide'),
+ 'PyUFunc_MixedDivisionTypeResolver',
+ TD(intfltcmplx),
+ [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
+ TypeDescription('m', FullTypeDescr, 'md', 'm'),
+ TypeDescription('m', FullTypeDescr, 'mm', 'd'),
+ ],
+ TD(O, f='PyNumber_Divide'),
+ ),
+'floor_divide':
+ Ufunc(2, 1, None, # One is only a unit to the right, not the left
+ docstrings.get('numpy.core.umath.floor_divide'),
+ 'PyUFunc_DivisionTypeResolver',
+ TD(intfltcmplx),
+ [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
+ TypeDescription('m', FullTypeDescr, 'md', 'm'),
+ TypeDescription('m', FullTypeDescr, 'mm', 'q'),
+ ],
+ TD(O, f='PyNumber_FloorDivide'),
+ ),
+'true_divide':
+ Ufunc(2, 1, None, # One is only a unit to the right, not the left
+ docstrings.get('numpy.core.umath.true_divide'),
+ 'PyUFunc_TrueDivisionTypeResolver',
+ TD(flts+cmplx),
+ [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
+ TypeDescription('m', FullTypeDescr, 'md', 'm'),
+ TypeDescription('m', FullTypeDescr, 'mm', 'd'),
+ ],
+ TD(O, f='PyNumber_TrueDivide'),
+ ),
+'conjugate':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.conjugate'),
+ None,
+ TD(ints+flts+cmplx, simd=[('avx2', ints)]),
+ TD(P, f='conjugate'),
+ ),
+'fmod':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.fmod'),
+ None,
+ TD(ints),
+ TD(flts, f='fmod', astype={'e':'f'}),
+ TD(P, f='fmod'),
+ ),
+'square':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.square'),
+ None,
+ TD(ints+inexact, simd=[('avx2', ints)]),
+ TD(O, f='Py_square'),
+ ),
+'reciprocal':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.reciprocal'),
+ None,
+ TD(ints+inexact, simd=[('avx2', ints)]),
+ TD(O, f='Py_reciprocal'),
+ ),
+# This is no longer used as numpy.ones_like; however, it is
+# still used by some internal calls.
+'_ones_like':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath._ones_like'),
+ 'PyUFunc_OnesLikeTypeResolver',
+ TD(noobj),
+ TD(O, f='Py_get_one'),
+ ),
+'power':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.power'),
+ None,
+ TD(ints),
+ TD(inexact, f='pow', astype={'e':'f'}),
+ TD(O, f='npy_ObjectPower'),
+ ),
+'float_power':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.float_power'),
+ None,
+ TD('dgDG', f='pow'),
+ ),
+'absolute':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.absolute'),
+ 'PyUFunc_AbsoluteTypeResolver',
+ TD(bints+flts+timedeltaonly),
+ TD(cmplx, out=('f', 'd', 'g')),
+ TD(O, f='PyNumber_Absolute'),
+ ),
+'_arg':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath._arg'),
+ None,
+ TD(cmplx, out=('f', 'd', 'g')),
+ ),
+'negative':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.negative'),
+ 'PyUFunc_NegativeTypeResolver',
+ TD(bints+flts+timedeltaonly, simd=[('avx2', ints)]),
+ TD(cmplx, f='neg'),
+ TD(O, f='PyNumber_Negative'),
+ ),
+'positive':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.positive'),
+ 'PyUFunc_SimpleUnaryOperationTypeResolver',
+ TD(ints+flts+timedeltaonly),
+ TD(cmplx, f='pos'),
+ TD(O, f='PyNumber_Positive'),
+ ),
+'sign':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.sign'),
+ 'PyUFunc_SimpleUnaryOperationTypeResolver',
+ TD(nobool_or_datetime),
+ ),
+'greater':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.greater'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(all, out='?', simd=[('avx2', ints)]),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ ),
+'greater_equal':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.greater_equal'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(all, out='?', simd=[('avx2', ints)]),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ ),
+'less':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.less'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(all, out='?', simd=[('avx2', ints)]),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ ),
+'less_equal':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.less_equal'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(all, out='?', simd=[('avx2', ints)]),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ ),
+'equal':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.equal'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(all, out='?', simd=[('avx2', ints)]),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ ),
+'not_equal':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.not_equal'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(all, out='?', simd=[('avx2', ints)]),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ ),
+'logical_and':
+ Ufunc(2, 1, True_,
+ docstrings.get('numpy.core.umath.logical_and'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(O, f='npy_ObjectLogicalAnd'),
+ ),
+'logical_not':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.logical_not'),
+ None,
+ TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(O, f='npy_ObjectLogicalNot'),
+ ),
+'logical_or':
+ Ufunc(2, 1, False_,
+ docstrings.get('numpy.core.umath.logical_or'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(O, f='npy_ObjectLogicalOr'),
+ ),
+'logical_xor':
+ Ufunc(2, 1, False_,
+ docstrings.get('numpy.core.umath.logical_xor'),
+ 'PyUFunc_SimpleBinaryComparisonTypeResolver',
+ TD(nodatetime_or_obj, out='?'),
+ TD(P, f='logical_xor'),
+ ),
+'maximum':
+ Ufunc(2, 1, ReorderableNone,
+ docstrings.get('numpy.core.umath.maximum'),
+ 'PyUFunc_SimpleBinaryOperationTypeResolver',
+ TD(noobj),
+ TD(O, f='npy_ObjectMax')
+ ),
+'minimum':
+ Ufunc(2, 1, ReorderableNone,
+ docstrings.get('numpy.core.umath.minimum'),
+ 'PyUFunc_SimpleBinaryOperationTypeResolver',
+ TD(noobj),
+ TD(O, f='npy_ObjectMin')
+ ),
+'fmax':
+ Ufunc(2, 1, ReorderableNone,
+ docstrings.get('numpy.core.umath.fmax'),
+ 'PyUFunc_SimpleBinaryOperationTypeResolver',
+ TD(noobj),
+ TD(O, f='npy_ObjectMax')
+ ),
+'fmin':
+ Ufunc(2, 1, ReorderableNone,
+ docstrings.get('numpy.core.umath.fmin'),
+ 'PyUFunc_SimpleBinaryOperationTypeResolver',
+ TD(noobj),
+ TD(O, f='npy_ObjectMin')
+ ),
+'logaddexp':
+ Ufunc(2, 1, MinusInfinity,
+ docstrings.get('numpy.core.umath.logaddexp'),
+ None,
+ TD(flts, f="logaddexp", astype={'e':'f'})
+ ),
+'logaddexp2':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.logaddexp2'),
+ None,
+ TD(flts, f="logaddexp2", astype={'e':'f'})
+ ),
+'bitwise_and':
+ Ufunc(2, 1, AllOnes,
+ docstrings.get('numpy.core.umath.bitwise_and'),
+ None,
+ TD(bints, simd=[('avx2', ints)]),
+ TD(O, f='PyNumber_And'),
+ ),
+'bitwise_or':
+ Ufunc(2, 1, Zero,
+ docstrings.get('numpy.core.umath.bitwise_or'),
+ None,
+ TD(bints, simd=[('avx2', ints)]),
+ TD(O, f='PyNumber_Or'),
+ ),
+'bitwise_xor':
+ Ufunc(2, 1, Zero,
+ docstrings.get('numpy.core.umath.bitwise_xor'),
+ None,
+ TD(bints, simd=[('avx2', ints)]),
+ TD(O, f='PyNumber_Xor'),
+ ),
+'invert':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.invert'),
+ None,
+ TD(bints, simd=[('avx2', ints)]),
+ TD(O, f='PyNumber_Invert'),
+ ),
+'left_shift':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.left_shift'),
+ None,
+ TD(ints, simd=[('avx2', ints)]),
+ TD(O, f='PyNumber_Lshift'),
+ ),
+'right_shift':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.right_shift'),
+ None,
+ TD(ints, simd=[('avx2', ints)]),
+ TD(O, f='PyNumber_Rshift'),
+ ),
+'heaviside':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.heaviside'),
+ None,
+ TD(flts, f='heaviside', astype={'e':'f'}),
+ ),
+'degrees':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.degrees'),
+ None,
+ TD(fltsP, f='degrees', astype={'e':'f'}),
+ ),
+'rad2deg':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.rad2deg'),
+ None,
+ TD(fltsP, f='rad2deg', astype={'e':'f'}),
+ ),
+'radians':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.radians'),
+ None,
+ TD(fltsP, f='radians', astype={'e':'f'}),
+ ),
+'deg2rad':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.deg2rad'),
+ None,
+ TD(fltsP, f='deg2rad', astype={'e':'f'}),
+ ),
+'arccos':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.arccos'),
+ None,
+ TD(inexact, f='acos', astype={'e':'f'}),
+ TD(P, f='arccos'),
+ ),
+'arccosh':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.arccosh'),
+ None,
+ TD(inexact, f='acosh', astype={'e':'f'}),
+ TD(P, f='arccosh'),
+ ),
+'arcsin':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.arcsin'),
+ None,
+ TD(inexact, f='asin', astype={'e':'f'}),
+ TD(P, f='arcsin'),
+ ),
+'arcsinh':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.arcsinh'),
+ None,
+ TD(inexact, f='asinh', astype={'e':'f'}),
+ TD(P, f='arcsinh'),
+ ),
+'arctan':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.arctan'),
+ None,
+ TD(inexact, f='atan', astype={'e':'f'}),
+ TD(P, f='arctan'),
+ ),
+'arctanh':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.arctanh'),
+ None,
+ TD(inexact, f='atanh', astype={'e':'f'}),
+ TD(P, f='arctanh'),
+ ),
+'cos':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.cos'),
+ None,
+ TD(inexact, f='cos', astype={'e':'f'}),
+ TD(P, f='cos'),
+ ),
+'sin':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.sin'),
+ None,
+ TD(inexact, f='sin', astype={'e':'f'}),
+ TD(P, f='sin'),
+ ),
+'tan':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.tan'),
+ None,
+ TD(inexact, f='tan', astype={'e':'f'}),
+ TD(P, f='tan'),
+ ),
+'cosh':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.cosh'),
+ None,
+ TD(inexact, f='cosh', astype={'e':'f'}),
+ TD(P, f='cosh'),
+ ),
+'sinh':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.sinh'),
+ None,
+ TD(inexact, f='sinh', astype={'e':'f'}),
+ TD(P, f='sinh'),
+ ),
+'tanh':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.tanh'),
+ None,
+ TD(inexact, f='tanh', astype={'e':'f'}),
+ TD(P, f='tanh'),
+ ),
+'exp':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.exp'),
+ None,
+ TD(inexact, f='exp', astype={'e':'f'}),
+ TD(P, f='exp'),
+ ),
+'exp2':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.exp2'),
+ None,
+ TD(inexact, f='exp2', astype={'e':'f'}),
+ TD(P, f='exp2'),
+ ),
+'expm1':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.expm1'),
+ None,
+ TD(inexact, f='expm1', astype={'e':'f'}),
+ TD(P, f='expm1'),
+ ),
+'log':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.log'),
+ None,
+ TD(inexact, f='log', astype={'e':'f'}),
+ TD(P, f='log'),
+ ),
+'log2':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.log2'),
+ None,
+ TD(inexact, f='log2', astype={'e':'f'}),
+ TD(P, f='log2'),
+ ),
+'log10':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.log10'),
+ None,
+ TD(inexact, f='log10', astype={'e':'f'}),
+ TD(P, f='log10'),
+ ),
+'log1p':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.log1p'),
+ None,
+ TD(inexact, f='log1p', astype={'e':'f'}),
+ TD(P, f='log1p'),
+ ),
+'sqrt':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.sqrt'),
+ None,
+ TD('e', f='sqrt', astype={'e':'f'}),
+ TD(inexactvec),
+ TD(inexact, f='sqrt', astype={'e':'f'}),
+ TD(P, f='sqrt'),
+ ),
+'cbrt':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.cbrt'),
+ None,
+ TD(flts, f='cbrt', astype={'e':'f'}),
+ TD(P, f='cbrt'),
+ ),
+'ceil':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.ceil'),
+ None,
+ TD(flts, f='ceil', astype={'e':'f'}),
+ TD(P, f='ceil'),
+ ),
+'trunc':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.trunc'),
+ None,
+ TD(flts, f='trunc', astype={'e':'f'}),
+ TD(P, f='trunc'),
+ ),
+'fabs':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.fabs'),
+ None,
+ TD(flts, f='fabs', astype={'e':'f'}),
+ TD(P, f='fabs'),
+ ),
+'floor':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.floor'),
+ None,
+ TD(flts, f='floor', astype={'e':'f'}),
+ TD(P, f='floor'),
+ ),
+'rint':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.rint'),
+ None,
+ TD(inexact, f='rint', astype={'e':'f'}),
+ TD(P, f='rint'),
+ ),
+'arctan2':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.arctan2'),
+ None,
+ TD(flts, f='atan2', astype={'e':'f'}),
+ TD(P, f='arctan2'),
+ ),
+'remainder':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.remainder'),
+ 'PyUFunc_RemainderTypeResolver',
+ TD(intflt),
+ [TypeDescription('m', FullTypeDescr, 'mm', 'm')],
+ TD(O, f='PyNumber_Remainder'),
+ ),
+'divmod':
+ Ufunc(2, 2, None,
+ docstrings.get('numpy.core.umath.divmod'),
+ 'PyUFunc_DivmodTypeResolver',
+ TD(intflt),
+ [TypeDescription('m', FullTypeDescr, 'mm', 'qm')],
+ # TD(O, f='PyNumber_Divmod'), # gh-9730
+ ),
+'hypot':
+ Ufunc(2, 1, Zero,
+ docstrings.get('numpy.core.umath.hypot'),
+ None,
+ TD(flts, f='hypot', astype={'e':'f'}),
+ TD(P, f='hypot'),
+ ),
+'isnan':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.isnan'),
+ None,
+ TD(inexact, out='?'),
+ ),
+'isnat':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.isnat'),
+ 'PyUFunc_IsNaTTypeResolver',
+ TD(times, out='?'),
+ ),
+'isinf':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.isinf'),
+ None,
+ TD(inexact, out='?'),
+ ),
+'isfinite':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.isfinite'),
+ None,
+ TD(inexact, out='?'),
+ ),
+'signbit':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.signbit'),
+ None,
+ TD(flts, out='?'),
+ ),
+'copysign':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.copysign'),
+ None,
+ TD(flts),
+ ),
+'nextafter':
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.nextafter'),
+ None,
+ TD(flts),
+ ),
+'spacing':
+ Ufunc(1, 1, None,
+ docstrings.get('numpy.core.umath.spacing'),
+ None,
+ TD(flts),
+ ),
+'modf':
+ Ufunc(1, 2, None,
+ docstrings.get('numpy.core.umath.modf'),
+ None,
+ TD(flts),
+ ),
+'ldexp' :
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.ldexp'),
+ None,
+ [TypeDescription('e', None, 'ei', 'e'),
+ TypeDescription('f', None, 'fi', 'f'),
+ TypeDescription('e', FuncNameSuffix('long'), 'el', 'e'),
+ TypeDescription('f', FuncNameSuffix('long'), 'fl', 'f'),
+ TypeDescription('d', None, 'di', 'd'),
+ TypeDescription('d', FuncNameSuffix('long'), 'dl', 'd'),
+ TypeDescription('g', None, 'gi', 'g'),
+ TypeDescription('g', FuncNameSuffix('long'), 'gl', 'g'),
+ ],
+ ),
+'frexp' :
+ Ufunc(1, 2, None,
+ docstrings.get('numpy.core.umath.frexp'),
+ None,
+ [TypeDescription('e', None, 'e', 'ei'),
+ TypeDescription('f', None, 'f', 'fi'),
+ TypeDescription('d', None, 'd', 'di'),
+ TypeDescription('g', None, 'g', 'gi'),
+ ],
+ ),
+'gcd' :
+ Ufunc(2, 1, Zero,
+ docstrings.get('numpy.core.umath.gcd'),
+ "PyUFunc_SimpleBinaryOperationTypeResolver",
+ TD(ints),
+ TD('O', f='npy_ObjectGCD'),
+ ),
+'lcm' :
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.lcm'),
+ "PyUFunc_SimpleBinaryOperationTypeResolver",
+ TD(ints),
+ TD('O', f='npy_ObjectLCM'),
+ ),
+'matmul' :
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.matmul'),
+ "PyUFunc_SimpleBinaryOperationTypeResolver",
+ TD(notimes_or_obj),
+ TD(O),
+ signature='(n?,k),(k,m?)->(n?,m?)',
+ ),
+}
+
+if sys.version_info[0] >= 3:
+ # Will be aliased to true_divide in umathmodule.c.src:InitOtherOperators
+ del defdict['divide']
+
+def indent(st, spaces):
+ indentation = ' '*spaces
+ indented = indentation + st.replace('\n', '\n'+indentation)
+ # trim off any trailing spaces
+ indented = re.sub(r' +$', r'', indented)
+ return indented
+
+chartotype1 = {'e': 'e_e',
+ 'f': 'f_f',
+ 'd': 'd_d',
+ 'g': 'g_g',
+ 'F': 'F_F',
+ 'D': 'D_D',
+ 'G': 'G_G',
+ 'O': 'O_O',
+ 'P': 'O_O_method'}
+
+chartotype2 = {'e': 'ee_e',
+ 'f': 'ff_f',
+ 'd': 'dd_d',
+ 'g': 'gg_g',
+ 'F': 'FF_F',
+ 'D': 'DD_D',
+ 'G': 'GG_G',
+ 'O': 'OO_O',
+ 'P': 'OO_O_method'}
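+
+# Annotation (illustrative, not upstream code): these suffixes name the
+# generic loops provided by the ufunc machinery; e.g. chartotype1['d'] is
+# 'd_d', selecting PyUFunc_d_d, a loop that calls a double (*)(double)
+# function stored in the corresponding *_data slot, while 'P' maps to the
+# O_O_method loop, which looks the named method up on each object element.
+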
+# For each name:
+# 1) create functions, data, and signature
+# 2) fill in functions and data in InitOperators
+# 3) add the function
+
+def make_arrays(funcdict):
+    # The functions array contains an entry for every type implemented. NULL
+    # should be placed where a PyUFunc_-style generic function will be filled
+    # in later.
+ code1list = []
+ code2list = []
+ names = sorted(funcdict.keys())
+ for name in names:
+ uf = funcdict[name]
+ funclist = []
+ datalist = []
+ siglist = []
+ k = 0
+ sub = 0
+
+ for t in uf.type_descriptions:
+ if t.func_data is FullTypeDescr:
+ tname = english_upper(chartoname[t.type])
+ datalist.append('(void *)NULL')
+ funclist.append(
+ '%s_%s_%s_%s' % (tname, t.in_, t.out, name))
+ elif isinstance(t.func_data, FuncNameSuffix):
+ datalist.append('(void *)NULL')
+ tname = english_upper(chartoname[t.type])
+ funclist.append(
+ '%s_%s_%s' % (tname, name, t.func_data.suffix))
+ elif t.func_data is None:
+ datalist.append('(void *)NULL')
+ tname = english_upper(chartoname[t.type])
+ funclist.append('%s_%s' % (tname, name))
+ if t.simd is not None:
+ for vt in t.simd:
+ code2list.append(textwrap.dedent("""\
+ #ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
+ if (npy_cpu_supports("{isa}")) {{
+ {fname}_functions[{idx}] = {type}_{fname}_{isa};
+ }}
+ #endif
+ """).format(
+ ISA=vt.upper(), isa=vt,
+ fname=name, type=tname, idx=k
+ ))
+ else:
+ funclist.append('NULL')
+ if (uf.nin, uf.nout) == (2, 1):
+ thedict = chartotype2
+ elif (uf.nin, uf.nout) == (1, 1):
+ thedict = chartotype1
+ else:
+ raise ValueError("Could not handle {}[{}]".format(name, t.type))
+
+ astype = ''
+        if t.astype is not None:
+ astype = '_As_%s' % thedict[t.astype]
+ astr = ('%s_functions[%d] = PyUFunc_%s%s;' %
+ (name, k, thedict[t.type], astype))
+ code2list.append(astr)
+ if t.type == 'O':
+ astr = ('%s_data[%d] = (void *) %s;' %
+ (name, k, t.func_data))
+ code2list.append(astr)
+ datalist.append('(void *)NULL')
+ elif t.type == 'P':
+ datalist.append('(void *)"%s"' % t.func_data)
+ else:
+ astr = ('%s_data[%d] = (void *) %s;' %
+ (name, k, t.func_data))
+ code2list.append(astr)
+ datalist.append('(void *)NULL')
+ #datalist.append('(void *)%s' % t.func_data)
+ sub += 1
+
+ for x in t.in_ + t.out:
+ siglist.append('NPY_%s' % (english_upper(chartoname[x]),))
+
+ k += 1
+
+ funcnames = ', '.join(funclist)
+ signames = ', '.join(siglist)
+ datanames = ', '.join(datalist)
+ code1list.append("static PyUFuncGenericFunction %s_functions[] = {%s};"
+ % (name, funcnames))
+ code1list.append("static void * %s_data[] = {%s};"
+ % (name, datanames))
+ code1list.append("static char %s_signatures[] = {%s};"
+ % (name, signames))
+ return "\n".join(code1list), "\n".join(code2list)
+
+def make_ufuncs(funcdict):
+ code3list = []
+ names = sorted(funcdict.keys())
+ for name in names:
+ uf = funcdict[name]
+ mlist = []
+ docstring = textwrap.dedent(uf.docstring).strip()
+ if sys.version_info[0] < 3:
+ docstring = docstring.encode('string-escape')
+ docstring = docstring.replace(r'"', r'\"')
+ else:
+ docstring = docstring.encode('unicode-escape').decode('ascii')
+ docstring = docstring.replace(r'"', r'\"')
+ # XXX: I don't understand why the following replace is not
+ # necessary in the python 2 case.
+ docstring = docstring.replace(r"'", r"\'")
+ # Split the docstring because some compilers (like MS) do not like big
+        # string literals in C code. We split at endlines because textwrap.wrap
+        # does not play well with \n
+ docstring = '\\n\"\"'.join(docstring.split(r"\n"))
+ if uf.signature is None:
+ sig = "NULL"
+ else:
+ sig = '"{}"'.format(uf.signature)
+ fmt = textwrap.dedent("""\
+ identity = {identity_expr};
+ if ({has_identity} && identity == NULL) {{
+ return -1;
+ }}
+ f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+ {name}_functions, {name}_data, {name}_signatures, {nloops},
+ {nin}, {nout}, {identity}, "{name}",
+ "{doc}", 0, {sig}, identity
+ );
+ if ({has_identity}) {{
+ Py_DECREF(identity);
+ }}
+ if (f == NULL) {{
+ return -1;
+ }}
+ """)
+ args = dict(
+ name=name, nloops=len(uf.type_descriptions),
+ nin=uf.nin, nout=uf.nout,
+ has_identity='0' if uf.identity is None_ else '1',
+ identity='PyUFunc_IdentityValue',
+ identity_expr=uf.identity,
+ doc=docstring,
+ sig=sig,
+ )
+
+ # Only PyUFunc_None means don't reorder - we pass this using the old
+ # argument
+ if uf.identity is None_:
+ args['identity'] = 'PyUFunc_None'
+ args['identity_expr'] = 'NULL'
+
+ mlist.append(fmt.format(**args))
+ if uf.typereso is not None:
+ mlist.append(
+ r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso)
+ mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
+ mlist.append(r"""Py_DECREF(f);""")
+ code3list.append('\n'.join(mlist))
+ return '\n'.join(code3list)
+
+
+def make_code(funcdict, filename):
+ code1, code2 = make_arrays(funcdict)
+ code3 = make_ufuncs(funcdict)
+ code2 = indent(code2, 4)
+ code3 = indent(code3, 4)
+ code = textwrap.dedent(r"""
+
+ /** Warning this file is autogenerated!!!
+
+ Please make changes to the code generator program (%s)
+ **/
+ #include "cpuid.h"
+ #include "ufunc_object.h"
+ #include "ufunc_type_resolution.h"
+ #include "loops.h"
+ #include "matmul.h"
+ %s
+
+ static int
+ InitOperators(PyObject *dictionary) {
+ PyObject *f, *identity;
+
+ %s
+ %s
+
+ return 0;
+ }
+ """) % (filename, code1, code2, code3)
+ return code
+
+
+if __name__ == "__main__":
+ filename = __file__
+ fid = open('__umath_generated.c', 'w')
+ code = make_code(defdict, filename)
+ fid.write(code)
+ fid.close()
diff --git a/contrib/python/numpy/py2/numpy/core/code_generators/numpy_api.py b/contrib/python/numpy/py2/numpy/core/code_generators/numpy_api.py
new file mode 100644
index 00000000000..a71c236fdde
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/code_generators/numpy_api.py
@@ -0,0 +1,423 @@
+"""Here we define the exported functions, types, etc... which need to be
+exported through a global C pointer.
+
+Each dictionary contains name -> index pair.
+
+Whenever you change one index, you break the ABI (and the ABI version number
+should be incremented). Whenever you add an item to one of the dict, the API
+needs to be updated in both setup_common.py and by adding an appropriate
+entry to cversion.txt (generate the hash via "python cversions.py").
+
+When adding a function, make sure to use the next integer not used as an index
+(in case you use an existing index or jump, the build will stop and raise an
+exception, so it should hopefully not get unnoticed).
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from code_generators.genapi import StealRef, NonNull
+
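+# Annotation (illustrative, not upstream code): an entry such as
+# 'PyArray_Transpose': (123,) means slot 123 of the exported PyArray_API
+# table; the generated public header then contains roughly
+#
+#     #define PyArray_Transpose \
+#             (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *)) \
+#             PyArray_API[123])
+#
+# which is why changing an existing index breaks the ABI.
+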
+# index, type
+multiarray_global_vars = {
+ 'NPY_NUMUSERTYPES': (7, 'int'),
+ 'NPY_DEFAULT_ASSIGN_CASTING': (292, 'NPY_CASTING'),
+}
+
+multiarray_scalar_bool_values = {
+ '_PyArrayScalar_BoolValues': (9,)
+}
+
+# index, annotations
+# please mark functions that have been checked to not need any annotations
+multiarray_types_api = {
+ 'PyBigArray_Type': (1,),
+ 'PyArray_Type': (2,),
+ 'PyArrayDescr_Type': (3,),
+ 'PyArrayFlags_Type': (4,),
+ 'PyArrayIter_Type': (5,),
+ 'PyArrayMultiIter_Type': (6,),
+ 'PyBoolArrType_Type': (8,),
+ 'PyGenericArrType_Type': (10,),
+ 'PyNumberArrType_Type': (11,),
+ 'PyIntegerArrType_Type': (12,),
+ 'PySignedIntegerArrType_Type': (13,),
+ 'PyUnsignedIntegerArrType_Type': (14,),
+ 'PyInexactArrType_Type': (15,),
+ 'PyFloatingArrType_Type': (16,),
+ 'PyComplexFloatingArrType_Type': (17,),
+ 'PyFlexibleArrType_Type': (18,),
+ 'PyCharacterArrType_Type': (19,),
+ 'PyByteArrType_Type': (20,),
+ 'PyShortArrType_Type': (21,),
+ 'PyIntArrType_Type': (22,),
+ 'PyLongArrType_Type': (23,),
+ 'PyLongLongArrType_Type': (24,),
+ 'PyUByteArrType_Type': (25,),
+ 'PyUShortArrType_Type': (26,),
+ 'PyUIntArrType_Type': (27,),
+ 'PyULongArrType_Type': (28,),
+ 'PyULongLongArrType_Type': (29,),
+ 'PyFloatArrType_Type': (30,),
+ 'PyDoubleArrType_Type': (31,),
+ 'PyLongDoubleArrType_Type': (32,),
+ 'PyCFloatArrType_Type': (33,),
+ 'PyCDoubleArrType_Type': (34,),
+ 'PyCLongDoubleArrType_Type': (35,),
+ 'PyObjectArrType_Type': (36,),
+ 'PyStringArrType_Type': (37,),
+ 'PyUnicodeArrType_Type': (38,),
+ 'PyVoidArrType_Type': (39,),
+ # End 1.5 API
+ 'PyTimeIntegerArrType_Type': (214,),
+ 'PyDatetimeArrType_Type': (215,),
+ 'PyTimedeltaArrType_Type': (216,),
+ 'PyHalfArrType_Type': (217,),
+ 'NpyIter_Type': (218,),
+ # End 1.6 API
+}
+
+#define NPY_NUMUSERTYPES (*(int *)PyArray_API[6])
+#define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[7])
+#define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[8])
+
+multiarray_funcs_api = {
+ 'PyArray_GetNDArrayCVersion': (0,),
+ 'PyArray_SetNumericOps': (40,),
+ 'PyArray_GetNumericOps': (41,),
+ 'PyArray_INCREF': (42,),
+ 'PyArray_XDECREF': (43,),
+ 'PyArray_SetStringFunction': (44,),
+ 'PyArray_DescrFromType': (45,),
+ 'PyArray_TypeObjectFromType': (46,),
+ 'PyArray_Zero': (47,),
+ 'PyArray_One': (48,),
+ 'PyArray_CastToType': (49, StealRef(2), NonNull(2)),
+ 'PyArray_CastTo': (50,),
+ 'PyArray_CastAnyTo': (51,),
+ 'PyArray_CanCastSafely': (52,),
+ 'PyArray_CanCastTo': (53,),
+ 'PyArray_ObjectType': (54,),
+ 'PyArray_DescrFromObject': (55,),
+ 'PyArray_ConvertToCommonType': (56,),
+ 'PyArray_DescrFromScalar': (57,),
+ 'PyArray_DescrFromTypeObject': (58,),
+ 'PyArray_Size': (59,),
+ 'PyArray_Scalar': (60,),
+ 'PyArray_FromScalar': (61, StealRef(2)),
+ 'PyArray_ScalarAsCtype': (62,),
+ 'PyArray_CastScalarToCtype': (63,),
+ 'PyArray_CastScalarDirect': (64,),
+ 'PyArray_ScalarFromObject': (65,),
+ 'PyArray_GetCastFunc': (66,),
+ 'PyArray_FromDims': (67,),
+ 'PyArray_FromDimsAndDataAndDescr': (68, StealRef(3)),
+ 'PyArray_FromAny': (69, StealRef(2)),
+ 'PyArray_EnsureArray': (70, StealRef(1)),
+ 'PyArray_EnsureAnyArray': (71, StealRef(1)),
+ 'PyArray_FromFile': (72,),
+ 'PyArray_FromString': (73,),
+ 'PyArray_FromBuffer': (74,),
+ 'PyArray_FromIter': (75, StealRef(2)),
+ 'PyArray_Return': (76, StealRef(1)),
+ 'PyArray_GetField': (77, StealRef(2), NonNull(2)),
+ 'PyArray_SetField': (78, StealRef(2), NonNull(2)),
+ 'PyArray_Byteswap': (79,),
+ 'PyArray_Resize': (80,),
+ 'PyArray_MoveInto': (81,),
+ 'PyArray_CopyInto': (82,),
+ 'PyArray_CopyAnyInto': (83,),
+ 'PyArray_CopyObject': (84,),
+ 'PyArray_NewCopy': (85, NonNull(1)),
+ 'PyArray_ToList': (86,),
+ 'PyArray_ToString': (87,),
+ 'PyArray_ToFile': (88,),
+ 'PyArray_Dump': (89,),
+ 'PyArray_Dumps': (90,),
+ 'PyArray_ValidType': (91,),
+ 'PyArray_UpdateFlags': (92,),
+ 'PyArray_New': (93, NonNull(1)),
+ 'PyArray_NewFromDescr': (94, StealRef(2), NonNull([1, 2])),
+ 'PyArray_DescrNew': (95,),
+ 'PyArray_DescrNewFromType': (96,),
+ 'PyArray_GetPriority': (97,),
+ 'PyArray_IterNew': (98,),
+ 'PyArray_MultiIterNew': (99,),
+ 'PyArray_PyIntAsInt': (100,),
+ 'PyArray_PyIntAsIntp': (101,),
+ 'PyArray_Broadcast': (102,),
+ 'PyArray_FillObjectArray': (103,),
+ 'PyArray_FillWithScalar': (104,),
+ 'PyArray_CheckStrides': (105,),
+ 'PyArray_DescrNewByteorder': (106,),
+ 'PyArray_IterAllButAxis': (107,),
+ 'PyArray_CheckFromAny': (108, StealRef(2)),
+ 'PyArray_FromArray': (109, StealRef(2)),
+ 'PyArray_FromInterface': (110,),
+ 'PyArray_FromStructInterface': (111,),
+ 'PyArray_FromArrayAttr': (112,),
+ 'PyArray_ScalarKind': (113,),
+ 'PyArray_CanCoerceScalar': (114,),
+ 'PyArray_NewFlagsObject': (115,),
+ 'PyArray_CanCastScalar': (116,),
+ 'PyArray_CompareUCS4': (117,),
+ 'PyArray_RemoveSmallest': (118,),
+ 'PyArray_ElementStrides': (119,),
+ 'PyArray_Item_INCREF': (120,),
+ 'PyArray_Item_XDECREF': (121,),
+ 'PyArray_FieldNames': (122,),
+ 'PyArray_Transpose': (123,),
+ 'PyArray_TakeFrom': (124,),
+ 'PyArray_PutTo': (125,),
+ 'PyArray_PutMask': (126,),
+ 'PyArray_Repeat': (127,),
+ 'PyArray_Choose': (128,),
+ 'PyArray_Sort': (129,),
+ 'PyArray_ArgSort': (130,),
+ 'PyArray_SearchSorted': (131,),
+ 'PyArray_ArgMax': (132,),
+ 'PyArray_ArgMin': (133,),
+ 'PyArray_Reshape': (134,),
+ 'PyArray_Newshape': (135,),
+ 'PyArray_Squeeze': (136,),
+ 'PyArray_View': (137, StealRef(2)),
+ 'PyArray_SwapAxes': (138,),
+ 'PyArray_Max': (139,),
+ 'PyArray_Min': (140,),
+ 'PyArray_Ptp': (141,),
+ 'PyArray_Mean': (142,),
+ 'PyArray_Trace': (143,),
+ 'PyArray_Diagonal': (144,),
+ 'PyArray_Clip': (145,),
+ 'PyArray_Conjugate': (146,),
+ 'PyArray_Nonzero': (147,),
+ 'PyArray_Std': (148,),
+ 'PyArray_Sum': (149,),
+ 'PyArray_CumSum': (150,),
+ 'PyArray_Prod': (151,),
+ 'PyArray_CumProd': (152,),
+ 'PyArray_All': (153,),
+ 'PyArray_Any': (154,),
+ 'PyArray_Compress': (155,),
+ 'PyArray_Flatten': (156,),
+ 'PyArray_Ravel': (157,),
+ 'PyArray_MultiplyList': (158,),
+ 'PyArray_MultiplyIntList': (159,),
+ 'PyArray_GetPtr': (160,),
+ 'PyArray_CompareLists': (161,),
+ 'PyArray_AsCArray': (162, StealRef(5)),
+ 'PyArray_As1D': (163,),
+ 'PyArray_As2D': (164,),
+ 'PyArray_Free': (165,),
+ 'PyArray_Converter': (166,),
+ 'PyArray_IntpFromSequence': (167,),
+ 'PyArray_Concatenate': (168,),
+ 'PyArray_InnerProduct': (169,),
+ 'PyArray_MatrixProduct': (170,),
+ 'PyArray_CopyAndTranspose': (171,),
+ 'PyArray_Correlate': (172,),
+ 'PyArray_TypestrConvert': (173,),
+ 'PyArray_DescrConverter': (174,),
+ 'PyArray_DescrConverter2': (175,),
+ 'PyArray_IntpConverter': (176,),
+ 'PyArray_BufferConverter': (177,),
+ 'PyArray_AxisConverter': (178,),
+ 'PyArray_BoolConverter': (179,),
+ 'PyArray_ByteorderConverter': (180,),
+ 'PyArray_OrderConverter': (181,),
+ 'PyArray_EquivTypes': (182,),
+ 'PyArray_Zeros': (183, StealRef(3)),
+ 'PyArray_Empty': (184, StealRef(3)),
+ 'PyArray_Where': (185,),
+ 'PyArray_Arange': (186,),
+ 'PyArray_ArangeObj': (187,),
+ 'PyArray_SortkindConverter': (188,),
+ 'PyArray_LexSort': (189,),
+ 'PyArray_Round': (190,),
+ 'PyArray_EquivTypenums': (191,),
+ 'PyArray_RegisterDataType': (192,),
+ 'PyArray_RegisterCastFunc': (193,),
+ 'PyArray_RegisterCanCast': (194,),
+ 'PyArray_InitArrFuncs': (195,),
+ 'PyArray_IntTupleFromIntp': (196,),
+ 'PyArray_TypeNumFromName': (197,),
+ 'PyArray_ClipmodeConverter': (198,),
+ 'PyArray_OutputConverter': (199,),
+ 'PyArray_BroadcastToShape': (200,),
+ '_PyArray_SigintHandler': (201,),
+ '_PyArray_GetSigintBuf': (202,),
+ 'PyArray_DescrAlignConverter': (203,),
+ 'PyArray_DescrAlignConverter2': (204,),
+ 'PyArray_SearchsideConverter': (205,),
+ 'PyArray_CheckAxis': (206,),
+ 'PyArray_OverflowMultiplyList': (207,),
+ 'PyArray_CompareString': (208,),
+ 'PyArray_MultiIterFromObjects': (209,),
+ 'PyArray_GetEndianness': (210,),
+ 'PyArray_GetNDArrayCFeatureVersion': (211,),
+ 'PyArray_Correlate2': (212,),
+ 'PyArray_NeighborhoodIterNew': (213,),
+ # End 1.5 API
+ 'PyArray_SetDatetimeParseFunction': (219,),
+ 'PyArray_DatetimeToDatetimeStruct': (220,),
+ 'PyArray_TimedeltaToTimedeltaStruct': (221,),
+ 'PyArray_DatetimeStructToDatetime': (222,),
+ 'PyArray_TimedeltaStructToTimedelta': (223,),
+ # NDIter API
+ 'NpyIter_New': (224,),
+ 'NpyIter_MultiNew': (225,),
+ 'NpyIter_AdvancedNew': (226,),
+ 'NpyIter_Copy': (227,),
+ 'NpyIter_Deallocate': (228,),
+ 'NpyIter_HasDelayedBufAlloc': (229,),
+ 'NpyIter_HasExternalLoop': (230,),
+ 'NpyIter_EnableExternalLoop': (231,),
+ 'NpyIter_GetInnerStrideArray': (232,),
+ 'NpyIter_GetInnerLoopSizePtr': (233,),
+ 'NpyIter_Reset': (234,),
+ 'NpyIter_ResetBasePointers': (235,),
+ 'NpyIter_ResetToIterIndexRange': (236,),
+ 'NpyIter_GetNDim': (237,),
+ 'NpyIter_GetNOp': (238,),
+ 'NpyIter_GetIterNext': (239,),
+ 'NpyIter_GetIterSize': (240,),
+ 'NpyIter_GetIterIndexRange': (241,),
+ 'NpyIter_GetIterIndex': (242,),
+ 'NpyIter_GotoIterIndex': (243,),
+ 'NpyIter_HasMultiIndex': (244,),
+ 'NpyIter_GetShape': (245,),
+ 'NpyIter_GetGetMultiIndex': (246,),
+ 'NpyIter_GotoMultiIndex': (247,),
+ 'NpyIter_RemoveMultiIndex': (248,),
+ 'NpyIter_HasIndex': (249,),
+ 'NpyIter_IsBuffered': (250,),
+ 'NpyIter_IsGrowInner': (251,),
+ 'NpyIter_GetBufferSize': (252,),
+ 'NpyIter_GetIndexPtr': (253,),
+ 'NpyIter_GotoIndex': (254,),
+ 'NpyIter_GetDataPtrArray': (255,),
+ 'NpyIter_GetDescrArray': (256,),
+ 'NpyIter_GetOperandArray': (257,),
+ 'NpyIter_GetIterView': (258,),
+ 'NpyIter_GetReadFlags': (259,),
+ 'NpyIter_GetWriteFlags': (260,),
+ 'NpyIter_DebugPrint': (261,),
+ 'NpyIter_IterationNeedsAPI': (262,),
+ 'NpyIter_GetInnerFixedStrideArray': (263,),
+ 'NpyIter_RemoveAxis': (264,),
+ 'NpyIter_GetAxisStrideArray': (265,),
+ 'NpyIter_RequiresBuffering': (266,),
+ 'NpyIter_GetInitialDataPtrArray': (267,),
+ 'NpyIter_CreateCompatibleStrides': (268,),
+ #
+ 'PyArray_CastingConverter': (269,),
+ 'PyArray_CountNonzero': (270,),
+ 'PyArray_PromoteTypes': (271,),
+ 'PyArray_MinScalarType': (272,),
+ 'PyArray_ResultType': (273,),
+ 'PyArray_CanCastArrayTo': (274,),
+ 'PyArray_CanCastTypeTo': (275,),
+ 'PyArray_EinsteinSum': (276,),
+ 'PyArray_NewLikeArray': (277, StealRef(3), NonNull(1)),
+ 'PyArray_GetArrayParamsFromObject': (278,),
+ 'PyArray_ConvertClipmodeSequence': (279,),
+ 'PyArray_MatrixProduct2': (280,),
+ # End 1.6 API
+ 'NpyIter_IsFirstVisit': (281,),
+ 'PyArray_SetBaseObject': (282, StealRef(2)),
+ 'PyArray_CreateSortedStridePerm': (283,),
+ 'PyArray_RemoveAxesInPlace': (284,),
+ 'PyArray_DebugPrint': (285,),
+ 'PyArray_FailUnlessWriteable': (286,),
+ 'PyArray_SetUpdateIfCopyBase': (287, StealRef(2)),
+ 'PyDataMem_NEW': (288,),
+ 'PyDataMem_FREE': (289,),
+ 'PyDataMem_RENEW': (290,),
+ 'PyDataMem_SetEventHook': (291,),
+ 'PyArray_MapIterSwapAxes': (293,),
+ 'PyArray_MapIterArray': (294,),
+ 'PyArray_MapIterNext': (295,),
+ # End 1.7 API
+ 'PyArray_Partition': (296,),
+ 'PyArray_ArgPartition': (297,),
+ 'PyArray_SelectkindConverter': (298,),
+ 'PyDataMem_NEW_ZEROED': (299,),
+ # End 1.8 API
+ # End 1.9 API
+ 'PyArray_CheckAnyScalarExact': (300, NonNull(1)),
+ # End 1.10 API
+ 'PyArray_MapIterArrayCopyIfOverlap': (301,),
+ # End 1.13 API
+ 'PyArray_ResolveWritebackIfCopy': (302,),
+ 'PyArray_SetWritebackIfCopyBase': (303,),
+ # End 1.14 API
+}
+
+ufunc_types_api = {
+ 'PyUFunc_Type': (0,)
+}
+
+ufunc_funcs_api = {
+ 'PyUFunc_FromFuncAndData': (1,),
+ 'PyUFunc_RegisterLoopForType': (2,),
+ 'PyUFunc_GenericFunction': (3,),
+ 'PyUFunc_f_f_As_d_d': (4,),
+ 'PyUFunc_d_d': (5,),
+ 'PyUFunc_f_f': (6,),
+ 'PyUFunc_g_g': (7,),
+ 'PyUFunc_F_F_As_D_D': (8,),
+ 'PyUFunc_F_F': (9,),
+ 'PyUFunc_D_D': (10,),
+ 'PyUFunc_G_G': (11,),
+ 'PyUFunc_O_O': (12,),
+ 'PyUFunc_ff_f_As_dd_d': (13,),
+ 'PyUFunc_ff_f': (14,),
+ 'PyUFunc_dd_d': (15,),
+ 'PyUFunc_gg_g': (16,),
+ 'PyUFunc_FF_F_As_DD_D': (17,),
+ 'PyUFunc_DD_D': (18,),
+ 'PyUFunc_FF_F': (19,),
+ 'PyUFunc_GG_G': (20,),
+ 'PyUFunc_OO_O': (21,),
+ 'PyUFunc_O_O_method': (22,),
+ 'PyUFunc_OO_O_method': (23,),
+ 'PyUFunc_On_Om': (24,),
+ 'PyUFunc_GetPyValues': (25,),
+ 'PyUFunc_checkfperr': (26,),
+ 'PyUFunc_clearfperr': (27,),
+ 'PyUFunc_getfperr': (28,),
+ 'PyUFunc_handlefperr': (29,),
+ 'PyUFunc_ReplaceLoopBySignature': (30,),
+ 'PyUFunc_FromFuncAndDataAndSignature': (31,),
+ 'PyUFunc_SetUsesArraysAsData': (32,),
+ # End 1.5 API
+ 'PyUFunc_e_e': (33,),
+ 'PyUFunc_e_e_As_f_f': (34,),
+ 'PyUFunc_e_e_As_d_d': (35,),
+ 'PyUFunc_ee_e': (36,),
+ 'PyUFunc_ee_e_As_ff_f': (37,),
+ 'PyUFunc_ee_e_As_dd_d': (38,),
+ # End 1.6 API
+ 'PyUFunc_DefaultTypeResolver': (39,),
+ 'PyUFunc_ValidateCasting': (40,),
+ # End 1.7 API
+ 'PyUFunc_RegisterLoopForDescr': (41,),
+ # End 1.8 API
+ 'PyUFunc_FromFuncAndDataAndSignatureAndIdentity': (42,),
+ # End 1.16 API
+}
+
+# List of all the dicts which define the C API
+# XXX: DO NOT CHANGE THE ORDER OF TUPLES BELOW !
+multiarray_api = (
+ multiarray_global_vars,
+ multiarray_scalar_bool_values,
+ multiarray_types_api,
+ multiarray_funcs_api,
+)
+
+ufunc_api = (
+ ufunc_funcs_api,
+ ufunc_types_api
+)
+
+full_api = multiarray_api + ufunc_api
diff --git a/contrib/python/numpy/py2/numpy/core/code_generators/ufunc_docstrings.py b/contrib/python/numpy/py2/numpy/core/code_generators/ufunc_docstrings.py
new file mode 100644
index 00000000000..6c0555e2311
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/code_generators/ufunc_docstrings.py
@@ -0,0 +1,3930 @@
+"""
+Docstrings for generated ufuncs
+
+The syntax is designed to look like the function add_newdoc is being
+called from numpy.lib, but in this file add_newdoc puts the docstrings
+in a dictionary. This dictionary is used in
+numpy/core/code_generators/generate_umath.py to generate the docstrings
+for the ufuncs in numpy.core at the C level when the ufuncs are created
+at compile time.
+
+"""
+from __future__ import division, absolute_import, print_function
+import textwrap
+
+docdict = {}
+
+def get(name):
+ return docdict.get(name)
+
+# parameter text common to all ufuncs
+subst = {
+ 'PARAMS': textwrap.dedent("""
+ out : ndarray, None, or tuple of ndarray and None, optional
+ A location into which the result is stored. If provided, it must have
+ a shape that the inputs broadcast to. If not provided or `None`,
+ a freshly-allocated array is returned. A tuple (possible only as a
+ keyword argument) must have length equal to the number of outputs.
+ where : array_like, optional
+ Values of True indicate to calculate the ufunc at that position, values
+ of False indicate to leave the value in the output alone.
+ **kwargs
+ For other keyword-only arguments, see the
+ :ref:`ufunc docs <ufuncs.kwargs>`.
+ """).strip(),
+ 'OUT_SCALAR_1': "This is a scalar if `x` is a scalar.",
+ 'OUT_SCALAR_2': "This is a scalar if both `x1` and `x2` are scalars.",
+}
+
+def add_newdoc(place, name, doc):
+ doc = textwrap.dedent(doc).strip()
+
+ if name[0] != '_' and name != 'matmul':
+ # matmul is special, it does not use the OUT_SCALAR replacement strings
+ if '\nx :' in doc:
+ assert '$OUT_SCALAR_1' in doc, "in {}".format(name)
+ elif '\nx2 :' in doc or '\nx1, x2 :' in doc:
+ assert '$OUT_SCALAR_2' in doc, "in {}".format(name)
+ else:
+ assert False, "Could not detect number of inputs in {}".format(name)
+ for k, v in subst.items():
+ doc = doc.replace('$' + k, v)
+
+ docdict['.'.join((place, name))] = doc
+
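+# Annotation (illustrative, not upstream code): the docstrings below contain
+# placeholders that add_newdoc expands from `subst`; for example the literal
+# text "$PARAMS" becomes the shared out/where/**kwargs parameter section and
+# "$OUT_SCALAR_1" becomes "This is a scalar if `x` is a scalar."
+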
+
+add_newdoc('numpy.core.umath', 'absolute',
+ """
+ Calculate the absolute value element-wise.
+
+ ``np.abs`` is a shorthand for this function.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ absolute : ndarray
+ An ndarray containing the absolute value of
+ each element in `x`. For complex input, ``a + ib``, the
+ absolute value is :math:`\\sqrt{ a^2 + b^2 }`.
+ $OUT_SCALAR_1
+
+ Examples
+ --------
+ >>> x = np.array([-1.2, 1.2])
+ >>> np.absolute(x)
+ array([ 1.2, 1.2])
+ >>> np.absolute(1.2 + 1j)
+ 1.5620499351813308
+
+ Plot the function over ``[-10, 10]``:
+
+ >>> import matplotlib.pyplot as plt
+
+ >>> x = np.linspace(start=-10, stop=10, num=101)
+ >>> plt.plot(x, np.absolute(x))
+ >>> plt.show()
+
+ Plot the function over the complex plane:
+
+ >>> xx = x + 1j * x[:, np.newaxis]
+ >>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap='gray')
+ >>> plt.show()
+
+ """)
+
+add_newdoc('numpy.core.umath', 'add',
+ """
+ Add arguments element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ The arrays to be added. If ``x1.shape != x2.shape``, they must be
+ broadcastable to a common shape (which may be the shape of one or
+ the other).
+ $PARAMS
+
+ Returns
+ -------
+ add : ndarray or scalar
+ The sum of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ Notes
+ -----
+ Equivalent to `x1` + `x2` in terms of array broadcasting.
+
+ Examples
+ --------
+ >>> np.add(1.0, 4.0)
+ 5.0
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> np.add(x1, x2)
+ array([[ 0., 2., 4.],
+ [ 3., 5., 7.],
+ [ 6., 8., 10.]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'arccos',
+ """
+ Trigonometric inverse cosine, element-wise.
+
+ The inverse of `cos` so that, if ``y = cos(x)``, then ``x = arccos(y)``.
+
+ Parameters
+ ----------
+ x : array_like
+ `x`-coordinate on the unit circle.
+ For real arguments, the domain is [-1, 1].
+ $PARAMS
+
+ Returns
+ -------
+ angle : ndarray
+ The angle of the ray intersecting the unit circle at the given
+ `x`-coordinate in radians [0, pi].
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ cos, arctan, arcsin, emath.arccos
+
+ Notes
+ -----
+ `arccos` is a multivalued function: for each `x` there are infinitely
+ many numbers `z` such that `cos(z) = x`. The convention is to return
+ the angle `z` whose real part lies in `[0, pi]`.
+
+ For real-valued input data types, `arccos` always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `arccos` is a complex analytic function that
+ has branch cuts `[-inf, -1]` and `[1, inf]` and is continuous from
+ above on the former and from below on the latter.
+
+ The inverse `cos` is also known as `acos` or cos^-1.
+
+ References
+ ----------
+ M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 79. http://www.math.sfu.ca/~cbm/aands/
+
+ Examples
+ --------
+ We expect the arccos of 1 to be 0, and of -1 to be pi:
+
+ >>> np.arccos([1, -1])
+ array([ 0. , 3.14159265])
+
+ Plot arccos:
+
+ >>> import matplotlib.pyplot as plt
+ >>> x = np.linspace(-1, 1, num=100)
+ >>> plt.plot(x, np.arccos(x))
+ >>> plt.axis('tight')
+ >>> plt.show()
+
+ """)
+
+add_newdoc('numpy.core.umath', 'arccosh',
+ """
+ Inverse hyperbolic cosine, element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ arccosh : ndarray
+ Array of the same shape as `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+    cosh, arcsinh, sinh, arctanh, tanh
+
+ Notes
+ -----
+ `arccosh` is a multivalued function: for each `x` there are infinitely
+ many numbers `z` such that `cosh(z) = x`. The convention is to return the
+ `z` whose imaginary part lies in `[-pi, pi]` and the real part in
+ ``[0, inf]``.
+
+ For real-valued input data types, `arccosh` always returns real output.
+ For each value that cannot be expressed as a real number or infinity, it
+ yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `arccosh` is a complex analytical function that
+ has a branch cut `[-inf, 1]` and is continuous from above on it.
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/
+ .. [2] Wikipedia, "Inverse hyperbolic function",
+ https://en.wikipedia.org/wiki/Arccosh
+
+ Examples
+ --------
+ >>> np.arccosh([np.e, 10.0])
+ array([ 1.65745445, 2.99322285])
+ >>> np.arccosh(1)
+ 0.0
+
+ """)
+
+add_newdoc('numpy.core.umath', 'arcsin',
+ """
+ Inverse sine, element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ `y`-coordinate on the unit circle.
+ $PARAMS
+
+ Returns
+ -------
+ angle : ndarray
+ The inverse sine of each element in `x`, in radians and in the
+ closed interval ``[-pi/2, pi/2]``.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ sin, cos, arccos, tan, arctan, arctan2, emath.arcsin
+
+ Notes
+ -----
+ `arcsin` is a multivalued function: for each `x` there are infinitely
+ many numbers `z` such that :math:`sin(z) = x`. The convention is to
+ return the angle `z` whose real part lies in [-pi/2, pi/2].
+
+ For real-valued input data types, *arcsin* always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `arcsin` is a complex analytic function that
+ has, by convention, the branch cuts [-inf, -1] and [1, inf] and is
+ continuous from above on the former and from below on the latter.
+
+ The inverse sine is also known as `asin` or sin^{-1}.
+
+ References
+ ----------
+ Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*,
+ 10th printing, New York: Dover, 1964, pp. 79ff.
+ http://www.math.sfu.ca/~cbm/aands/
+
+ Examples
+ --------
+ >>> np.arcsin(1) # pi/2
+ 1.5707963267948966
+ >>> np.arcsin(-1) # -pi/2
+ -1.5707963267948966
+ >>> np.arcsin(0)
+ 0.0
+
+ """)
+
+add_newdoc('numpy.core.umath', 'arcsinh',
+ """
+ Inverse hyperbolic sine element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Array of the same shape as `x`.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ `arcsinh` is a multivalued function: for each `x` there are infinitely
+ many numbers `z` such that `sinh(z) = x`. The convention is to return the
+ `z` whose imaginary part lies in `[-pi/2, pi/2]`.
+
+ For real-valued input data types, `arcsinh` always returns real output.
+ For each value that cannot be expressed as a real number or infinity, it
+ returns ``nan`` and sets the `invalid` floating point error flag.
+
+    For complex-valued input, `arcsinh` is a complex analytical function that
+ has branch cuts `[1j, infj]` and `[-1j, -infj]` and is continuous from
+ the right on the former and from the left on the latter.
+
+ The inverse hyperbolic sine is also known as `asinh` or ``sinh^-1``.
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/
+ .. [2] Wikipedia, "Inverse hyperbolic function",
+ https://en.wikipedia.org/wiki/Arcsinh
+
+ Examples
+ --------
+ >>> np.arcsinh(np.array([np.e, 10.0]))
+ array([ 1.72538256, 2.99822295])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'arctan',
+ """
+ Trigonometric inverse tangent, element-wise.
+
+ The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``.
+
+ Parameters
+ ----------
+ x : array_like
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Out has the same shape as `x`. Its real part is in
+ ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``).
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ arctan2 : The "four quadrant" arctan of the angle formed by (`x`, `y`)
+ and the positive `x`-axis.
+ angle : Argument of complex values.
+
+ Notes
+ -----
+ `arctan` is a multi-valued function: for each `x` there are infinitely
+ many numbers `z` such that tan(`z`) = `x`. The convention is to return
+ the angle `z` whose real part lies in [-pi/2, pi/2].
+
+ For real-valued input data types, `arctan` always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `arctan` is a complex analytic function that
+ has [`1j, infj`] and [`-1j, -infj`] as branch cuts, and is continuous
+ from the left on the former and from the right on the latter.
+
+ The inverse tangent is also known as `atan` or tan^{-1}.
+
+ References
+ ----------
+ Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*,
+ 10th printing, New York: Dover, 1964, pp. 79.
+ http://www.math.sfu.ca/~cbm/aands/
+
+ Examples
+ --------
+ We expect the arctan of 0 to be 0, and of 1 to be pi/4:
+
+ >>> np.arctan([0, 1])
+ array([ 0. , 0.78539816])
+
+ >>> np.pi/4
+ 0.78539816339744828
+
+ Plot arctan:
+
+ >>> import matplotlib.pyplot as plt
+ >>> x = np.linspace(-10, 10)
+ >>> plt.plot(x, np.arctan(x))
+ >>> plt.axis('tight')
+ >>> plt.show()
+
+ """)
+
+add_newdoc('numpy.core.umath', 'arctan2',
+ """
+ Element-wise arc tangent of ``x1/x2`` choosing the quadrant correctly.
+
+ The quadrant (i.e., branch) is chosen so that ``arctan2(x1, x2)`` is
+ the signed angle in radians between the ray ending at the origin and
+ passing through the point (1,0), and the ray ending at the origin and
+ passing through the point (`x2`, `x1`). (Note the role reversal: the
+ "`y`-coordinate" is the first function parameter, the "`x`-coordinate"
+ is the second.) By IEEE convention, this function is defined for
+ `x2` = +/-0 and for either or both of `x1` and `x2` = +/-inf (see
+ Notes for specific values).
+
+ This function is not defined for complex-valued arguments; for the
+ so-called argument of complex values, use `angle`.
+
+ Parameters
+ ----------
+ x1 : array_like, real-valued
+ `y`-coordinates.
+ x2 : array_like, real-valued
+ `x`-coordinates. `x2` must be broadcastable to match the shape of
+ `x1` or vice versa.
+ $PARAMS
+
+ Returns
+ -------
+ angle : ndarray
+ Array of angles in radians, in the range ``[-pi, pi]``.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ arctan, tan, angle
+
+ Notes
+ -----
+ *arctan2* is identical to the `atan2` function of the underlying
+ C library. The following special values are defined in the C
+ standard: [1]_
+
+ ====== ====== ================
+ `x1` `x2` `arctan2(x1,x2)`
+ ====== ====== ================
+ +/- 0 +0 +/- 0
+ +/- 0 -0 +/- pi
+ > 0 +/-inf +0 / +pi
+ < 0 +/-inf -0 / -pi
+ +/-inf +inf +/- (pi/4)
+ +/-inf -inf +/- (3*pi/4)
+ ====== ====== ================
+
+ Note that +0 and -0 are distinct floating point numbers, as are +inf
+ and -inf.
+
+ References
+ ----------
+ .. [1] ISO/IEC standard 9899:1999, "Programming language C."
+
+ Examples
+ --------
+ Consider four points in different quadrants:
+
+ >>> x = np.array([-1, +1, +1, -1])
+ >>> y = np.array([-1, -1, +1, +1])
+ >>> np.arctan2(y, x) * 180 / np.pi
+ array([-135., -45., 45., 135.])
+
+ Note the order of the parameters. `arctan2` is defined also when `x2` = 0
+ and at several other special points, obtaining values in
+ the range ``[-pi, pi]``:
+
+ >>> np.arctan2([1., -1.], [0., 0.])
+ array([ 1.57079633, -1.57079633])
+ >>> np.arctan2([0., 0., np.inf], [+0., -0., np.inf])
+ array([ 0. , 3.14159265, 0.78539816])
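+
+    The infinite-argument rows of the table in the Notes can be checked in
+    the same way (illustrative sketch; printed precision follows the
+    current print options):
+
+    >>> np.arctan2([np.inf, -np.inf], [-np.inf, -np.inf])
+    array([ 2.35619449, -2.35619449])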
+
+ """)
+
+add_newdoc('numpy.core.umath', '_arg',
+ """
+ DO NOT USE, ONLY FOR TESTING
+ """)
+
+add_newdoc('numpy.core.umath', 'arctanh',
+ """
+ Inverse hyperbolic tangent element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Array of the same shape as `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ emath.arctanh
+
+ Notes
+ -----
+ `arctanh` is a multivalued function: for each `x` there are infinitely
+ many numbers `z` such that `tanh(z) = x`. The convention is to return
+ the `z` whose imaginary part lies in `[-pi/2, pi/2]`.
+
+ For real-valued input data types, `arctanh` always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `arctanh` is a complex analytical function
+ that has branch cuts `[-1, -inf]` and `[1, inf]` and is continuous from
+ above on the former and from below on the latter.
+
+ The inverse hyperbolic tangent is also known as `atanh` or ``tanh^-1``.
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/
+ .. [2] Wikipedia, "Inverse hyperbolic function",
+ https://en.wikipedia.org/wiki/Arctanh
+
+ Examples
+ --------
+ >>> np.arctanh([0, -0.5])
+ array([ 0. , -0.54930614])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'bitwise_and',
+ """
+ Compute the bit-wise AND of two arrays element-wise.
+
+ Computes the bit-wise AND of the underlying binary representation of
+ the integers in the input arrays. This ufunc implements the C/Python
+ operator ``&``.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Only integer and boolean types are handled.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Result.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logical_and
+ bitwise_or
+ bitwise_xor
+ binary_repr :
+ Return the binary representation of the input number as a string.
+
+ Examples
+ --------
+ The number 13 is represented by ``00001101``. Likewise, 17 is
+ represented by ``00010001``. The bit-wise AND of 13 and 17 is
+    therefore ``00000001``, or 1:
+
+ >>> np.bitwise_and(13, 17)
+ 1
+
+ >>> np.bitwise_and(14, 13)
+ 12
+ >>> np.binary_repr(12)
+ '1100'
+ >>> np.bitwise_and([14,3], 13)
+ array([12, 1])
+
+ >>> np.bitwise_and([11,7], [4,25])
+ array([0, 1])
+ >>> np.bitwise_and(np.array([2,5,255]), np.array([3,14,16]))
+ array([ 2, 4, 16])
+ >>> np.bitwise_and([True, True], [False, True])
+ array([False, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'bitwise_or',
+ """
+ Compute the bit-wise OR of two arrays element-wise.
+
+ Computes the bit-wise OR of the underlying binary representation of
+ the integers in the input arrays. This ufunc implements the C/Python
+ operator ``|``.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Only integer and boolean types are handled.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Result.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logical_or
+ bitwise_and
+ bitwise_xor
+ binary_repr :
+ Return the binary representation of the input number as a string.
+
+ Examples
+ --------
+    The number 13 has the binary representation ``00001101``.  Likewise,
+    16 is represented by ``00010000``.  The bit-wise OR of 13 and 16 is
+    then ``00011101``, or 29:
+
+ >>> np.bitwise_or(13, 16)
+ 29
+ >>> np.binary_repr(29)
+ '11101'
+
+ >>> np.bitwise_or(32, 2)
+ 34
+ >>> np.bitwise_or([33, 4], 1)
+ array([33, 5])
+ >>> np.bitwise_or([33, 4], [1, 2])
+ array([33, 6])
+
+ >>> np.bitwise_or(np.array([2, 5, 255]), np.array([4, 4, 4]))
+ array([ 6, 5, 255])
+ >>> np.array([2, 5, 255]) | np.array([4, 4, 4])
+ array([ 6, 5, 255])
+ >>> np.bitwise_or(np.array([2, 5, 255, 2147483647L], dtype=np.int32),
+ ... np.array([4, 4, 4, 2147483647L], dtype=np.int32))
+ array([ 6, 5, 255, 2147483647])
+ >>> np.bitwise_or([True, True], [False, True])
+ array([ True, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'bitwise_xor',
+ """
+ Compute the bit-wise XOR of two arrays element-wise.
+
+ Computes the bit-wise XOR of the underlying binary representation of
+ the integers in the input arrays. This ufunc implements the C/Python
+ operator ``^``.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Only integer and boolean types are handled.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Result.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logical_xor
+ bitwise_and
+ bitwise_or
+ binary_repr :
+ Return the binary representation of the input number as a string.
+
+ Examples
+ --------
+ The number 13 is represented by ``00001101``. Likewise, 17 is
+ represented by ``00010001``. The bit-wise XOR of 13 and 17 is
+ therefore ``00011100``, or 28:
+
+ >>> np.bitwise_xor(13, 17)
+ 28
+ >>> np.binary_repr(28)
+ '11100'
+
+ >>> np.bitwise_xor(31, 5)
+ 26
+ >>> np.bitwise_xor([31,3], 5)
+ array([26, 6])
+
+ >>> np.bitwise_xor([31,3], [5,6])
+ array([26, 5])
+ >>> np.bitwise_xor([True, True], [False, True])
+ array([ True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'ceil',
+ """
+ Return the ceiling of the input, element-wise.
+
+ The ceil of the scalar `x` is the smallest integer `i`, such that
+ `i >= x`. It is often denoted as :math:`\\lceil x \\rceil`.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The ceiling of each element in `x`, with `float` dtype.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ floor, trunc, rint
+
+ Examples
+ --------
+ >>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
+ >>> np.ceil(a)
+ array([-1., -1., -0., 1., 2., 2., 2.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'trunc',
+ """
+ Return the truncated value of the input, element-wise.
+
+ The truncated value of the scalar `x` is the nearest integer `i` which
+ is closer to zero than `x` is. In short, the fractional part of the
+ signed number `x` is discarded.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The truncated value of each element in `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ ceil, floor, rint
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ Examples
+ --------
+ >>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
+ >>> np.trunc(a)
+ array([-1., -1., -0., 0., 1., 1., 2.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'conjugate',
+ """
+ Return the complex conjugate, element-wise.
+
+ The complex conjugate of a complex number is obtained by changing the
+ sign of its imaginary part.
+
+ Parameters
+ ----------
+ x : array_like
+ Input value.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The complex conjugate of `x`, with same dtype as `y`.
+ $OUT_SCALAR_1
+
+ Examples
+ --------
+ >>> np.conjugate(1+2j)
+ (1-2j)
+
+ >>> x = np.eye(2) + 1j * np.eye(2)
+ >>> np.conjugate(x)
+ array([[ 1.-1.j, 0.-0.j],
+ [ 0.-0.j, 1.-1.j]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'cos',
+ """
+ Cosine element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array in radians.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding cosine values.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ If `out` is provided, the function writes the result into it,
+ and returns a reference to `out`. (See Examples)
+
+ References
+ ----------
+ M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.
+ New York, NY: Dover, 1972.
+
+ Examples
+ --------
+ >>> np.cos(np.array([0, np.pi/2, np.pi]))
+    array([  1.00000000e+00,   6.12323400e-17,  -1.00000000e+00])
+ >>>
+ >>> # Example of providing the optional output parameter
+    >>> out1 = np.array([0], dtype='d')
+    >>> out2 = np.cos([0.1], out1)
+ >>> out2 is out1
+ True
+ >>>
+ >>> # Example of ValueError due to provision of shape mis-matched `out`
+ >>> np.cos(np.zeros((3,3)),np.zeros((2,2)))
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
+
+ """)
+
+add_newdoc('numpy.core.umath', 'cosh',
+ """
+ Hyperbolic cosine, element-wise.
+
+ Equivalent to ``1/2 * (np.exp(x) + np.exp(-x))`` and ``np.cos(1j*x)``.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array of same shape as `x`.
+ $OUT_SCALAR_1
+
+ Examples
+ --------
+ >>> np.cosh(0)
+ 1.0
+
+ The hyperbolic cosine describes the shape of a hanging cable:
+
+ >>> import matplotlib.pyplot as plt
+ >>> x = np.linspace(-4, 4, 1000)
+ >>> plt.plot(x, np.cosh(x))
+ >>> plt.show()
+
+ """)
+
+add_newdoc('numpy.core.umath', 'degrees',
+ """
+ Convert angles from radians to degrees.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array in radians.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray of floats
+ The corresponding degree values; if `out` was supplied this is a
+ reference to it.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ rad2deg : equivalent function
+
+ Examples
+ --------
+ Convert a radian array to degrees
+
+ >>> rad = np.arange(12.)*np.pi/6
+ >>> np.degrees(rad)
+ array([ 0., 30., 60., 90., 120., 150., 180., 210., 240.,
+ 270., 300., 330.])
+
+ >>> out = np.zeros((rad.shape))
+    >>> r = np.degrees(rad, out)
+ >>> np.all(r == out)
+ True
+
+ """)
+
+add_newdoc('numpy.core.umath', 'rad2deg',
+ """
+ Convert angles from radians to degrees.
+
+ Parameters
+ ----------
+ x : array_like
+ Angle in radians.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding angle in degrees.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ deg2rad : Convert angles from degrees to radians.
+ unwrap : Remove large jumps in angle by wrapping.
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ rad2deg(x) is ``180 * x / pi``.
+
+ Examples
+ --------
+ >>> np.rad2deg(np.pi/2)
+ 90.0
+
+ """)
+
+add_newdoc('numpy.core.umath', 'heaviside',
+ """
+ Compute the Heaviside step function.
+
+ The Heaviside step function is defined as::
+
+ 0 if x1 < 0
+ heaviside(x1, x2) = x2 if x1 == 0
+ 1 if x1 > 0
+
+ where `x2` is often taken to be 0.5, but 0 and 1 are also sometimes used.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Input values.
+ x2 : array_like
+ The value of the function when x1 is 0.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ The output array, element-wise Heaviside step function of `x1`.
+ $OUT_SCALAR_2
+
+ Notes
+ -----
+ .. versionadded:: 1.13.0
+
+ References
+ ----------
+ .. Wikipedia, "Heaviside step function",
+ https://en.wikipedia.org/wiki/Heaviside_step_function
+
+ Examples
+ --------
+ >>> np.heaviside([-1.5, 0, 2.0], 0.5)
+ array([ 0. , 0.5, 1. ])
+ >>> np.heaviside([-1.5, 0, 2.0], 1)
+ array([ 0., 1., 1.])
+ """)
+
+add_newdoc('numpy.core.umath', 'divide',
+ """
+ Divide arguments element-wise.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Dividend array.
+ x2 : array_like
+ Divisor array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The quotient ``x1/x2``, element-wise.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ seterr : Set whether to raise or warn on overflow, underflow and
+ division by zero.
+
+ Notes
+ -----
+ Equivalent to ``x1`` / ``x2`` in terms of array-broadcasting.
+
+ Behavior on division by zero can be changed using ``seterr``.
+
+ In Python 2, when both ``x1`` and ``x2`` are of an integer type,
+ ``divide`` will behave like ``floor_divide``. In Python 3, it behaves
+ like ``true_divide``.
+
+ Examples
+ --------
+ >>> np.divide(2.0, 4.0)
+ 0.5
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> np.divide(x1, x2)
+ array([[ NaN, 1. , 1. ],
+ [ Inf, 4. , 2.5],
+ [ Inf, 7. , 4. ]])
+
+ Note the behavior with integer types (Python 2 only):
+
+ >>> np.divide(2, 4)
+ 0
+ >>> np.divide(2, 4.)
+ 0.5
+
+ Division by zero always yields zero in integer arithmetic (again,
+ Python 2 only), and does not raise an exception or a warning:
+
+ >>> np.divide(np.array([0, 1], dtype=int), np.array([0, 0], dtype=int))
+ array([0, 0])
+
+ Division by zero can, however, be caught using ``seterr``:
+
+ >>> old_err_state = np.seterr(divide='raise')
+ >>> np.divide(1, 0)
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ FloatingPointError: divide by zero encountered in divide
+
+ >>> ignored_states = np.seterr(**old_err_state)
+ >>> np.divide(1, 0)
+ 0
+
+ """)
+
+add_newdoc('numpy.core.umath', 'equal',
+ """
+ Return (x1 == x2) element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays of the same shape.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array, element-wise comparison of `x1` and `x2`.
+ Typically of type bool, unless ``dtype=object`` is passed.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ not_equal, greater_equal, less_equal, greater, less
+
+ Examples
+ --------
+ >>> np.equal([0, 1, 3], np.arange(3))
+ array([ True, True, False])
+
+ What is compared are values, not types. So an int (1) and an array of
+ length one can evaluate as True:
+
+ >>> np.equal(1, np.ones(1))
+ array([ True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'exp',
+ """
+ Calculate the exponential of all elements in the input array.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array, element-wise exponential of `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ expm1 : Calculate ``exp(x) - 1`` for all elements in the array.
+ exp2 : Calculate ``2**x`` for all elements in the array.
+
+ Notes
+ -----
+ The irrational number ``e`` is also known as Euler's number. It is
+ approximately 2.718281, and is the base of the natural logarithm,
+ ``ln`` (this means that, if :math:`x = \\ln y = \\log_e y`,
+    then :math:`e^x = y`). For real input, ``exp(x)`` is always positive.
+
+ For complex arguments, ``x = a + ib``, we can write
+ :math:`e^x = e^a e^{ib}`. The first term, :math:`e^a`, is already
+ known (it is the real argument, described above). The second term,
+ :math:`e^{ib}`, is :math:`\\cos b + i \\sin b`, a function with
+ magnitude 1 and a periodic phase.
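+
+    A small numerical sketch of this decomposition (``np.allclose`` is used
+    only to sidestep display precision):
+
+    >>> np.allclose(np.abs(np.exp(1 + 1j)), np.exp(1))
+    True
+    >>> np.allclose(np.angle(np.exp(1 + 1j)), 1.0)
+    True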
+
+ References
+ ----------
+ .. [1] Wikipedia, "Exponential function",
+ https://en.wikipedia.org/wiki/Exponential_function
+ .. [2] M. Abramovitz and I. A. Stegun, "Handbook of Mathematical Functions
+ with Formulas, Graphs, and Mathematical Tables," Dover, 1964, p. 69,
+ http://www.math.sfu.ca/~cbm/aands/page_69.htm
+
+ Examples
+ --------
+ Plot the magnitude and phase of ``exp(x)`` in the complex plane:
+
+ >>> import matplotlib.pyplot as plt
+
+ >>> x = np.linspace(-2*np.pi, 2*np.pi, 100)
+ >>> xx = x + 1j * x[:, np.newaxis] # a + ib over complex plane
+ >>> out = np.exp(xx)
+
+ >>> plt.subplot(121)
+ >>> plt.imshow(np.abs(out),
+ ... extent=[-2*np.pi, 2*np.pi, -2*np.pi, 2*np.pi], cmap='gray')
+ >>> plt.title('Magnitude of exp(x)')
+
+ >>> plt.subplot(122)
+ >>> plt.imshow(np.angle(out),
+ ... extent=[-2*np.pi, 2*np.pi, -2*np.pi, 2*np.pi], cmap='hsv')
+ >>> plt.title('Phase (angle) of exp(x)')
+ >>> plt.show()
+
+ """)
+
+add_newdoc('numpy.core.umath', 'exp2',
+ """
+ Calculate `2**p` for all `p` in the input array.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Element-wise 2 to the power `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ power
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+
+
+ Examples
+ --------
+ >>> np.exp2([2, 3])
+ array([ 4., 8.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'expm1',
+ """
+ Calculate ``exp(x) - 1`` for all elements in the array.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Element-wise exponential minus one: ``out = exp(x) - 1``.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ log1p : ``log(1 + x)``, the inverse of expm1.
+
+
+ Notes
+ -----
+ This function provides greater precision than ``exp(x) - 1``
+ for small values of ``x``.
+
+ Examples
+ --------
+ The true value of ``exp(1e-10) - 1`` is ``1.00000000005e-10`` to
+ about 32 significant digits. This example shows the superiority of
+ expm1 in this case.
+
+ >>> np.expm1(1e-10)
+ 1.00000000005e-10
+ >>> np.exp(1e-10) - 1
+ 1.000000082740371e-10
+
+ """)
+
+add_newdoc('numpy.core.umath', 'fabs',
+ """
+ Compute the absolute values element-wise.
+
+ This function returns the absolute values (positive magnitude) of the
+ data in `x`. Complex values are not handled, use `absolute` to find the
+ absolute values of complex data.
+
+ Parameters
+ ----------
+ x : array_like
+ The array of numbers for which the absolute values are required. If
+ `x` is a scalar, the result `y` will also be a scalar.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The absolute values of `x`, the returned values are always floats.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ absolute : Absolute values including `complex` types.
+
+ Examples
+ --------
+ >>> np.fabs(-1)
+ 1.0
+ >>> np.fabs([-1.2, 1.2])
+ array([ 1.2, 1.2])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'floor',
+ """
+ Return the floor of the input, element-wise.
+
+ The floor of the scalar `x` is the largest integer `i`, such that
+ `i <= x`. It is often denoted as :math:`\\lfloor x \\rfloor`.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The floor of each element in `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ ceil, trunc, rint
+
+ Notes
+ -----
+ Some spreadsheet programs calculate the "floor-towards-zero", in other
+ words ``floor(-2.5) == -2``. NumPy instead uses the definition of
+ `floor` where `floor(-2.5) == -3`.
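+
+    A one-line check of this convention (illustrative only):
+
+    >>> np.floor(-2.5)
+    -3.0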
+
+ Examples
+ --------
+ >>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
+ >>> np.floor(a)
+ array([-2., -2., -1., 0., 1., 1., 2.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'floor_divide',
+ """
+ Return the largest integer smaller or equal to the division of the inputs.
+ It is equivalent to the Python ``//`` operator and pairs with the
+    Python ``%`` (`remainder`) function so that ``a = a % b + b * (a // b)``
+ up to roundoff.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Numerator.
+ x2 : array_like
+ Denominator.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ y = floor(`x1`/`x2`)
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ remainder : Remainder complementary to floor_divide.
+ divmod : Simultaneous floor division and remainder.
+ divide : Standard division.
+ floor : Round a number to the nearest integer toward minus infinity.
+ ceil : Round a number to the nearest integer toward infinity.
+
+ Examples
+ --------
+ >>> np.floor_divide(7,3)
+ 2
+ >>> np.floor_divide([1., 2., 3., 4.], 2.5)
+ array([ 0., 0., 1., 1.])
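+
+    A quick sketch of the identity noted above, ``a = a % b + b * (a // b)``
+    (plain Python integers, for illustration):
+
+    >>> a, b = 7, 3
+    >>> a == a % b + b * (a // b)
+    True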
+
+ """)
+
+add_newdoc('numpy.core.umath', 'fmod',
+ """
+ Return the element-wise remainder of division.
+
+ This is the NumPy implementation of the C library function fmod, the
+ remainder has the same sign as the dividend `x1`. It is equivalent to
+ the Matlab(TM) ``rem`` function and should not be confused with the
+ Python modulus operator ``x1 % x2``.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Dividend.
+ x2 : array_like
+ Divisor.
+ $PARAMS
+
+ Returns
+ -------
+ y : array_like
+ The remainder of the division of `x1` by `x2`.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ remainder : Equivalent to the Python ``%`` operator.
+ divide
+
+ Notes
+ -----
+ The result of the modulo operation for negative dividend and divisors
+ is bound by conventions. For `fmod`, the sign of result is the sign of
+ the dividend, while for `remainder` the sign of the result is the sign
+ of the divisor. The `fmod` function is equivalent to the Matlab(TM)
+ ``rem`` function.
+
+ Examples
+ --------
+ >>> np.fmod([-3, -2, -1, 1, 2, 3], 2)
+ array([-1, 0, -1, 1, 0, 1])
+ >>> np.remainder([-3, -2, -1, 1, 2, 3], 2)
+ array([1, 0, 1, 1, 0, 1])
+
+ >>> np.fmod([5, 3], [2, 2.])
+ array([ 1., 1.])
+ >>> a = np.arange(-3, 3).reshape(3, 2)
+ >>> a
+ array([[-3, -2],
+ [-1, 0],
+ [ 1, 2]])
+ >>> np.fmod(a, [2,2])
+ array([[-1, 0],
+ [-1, 0],
+ [ 1, 0]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'greater',
+ """
+ Return the truth value of (x1 > x2) element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays. If ``x1.shape != x2.shape``, they must be
+ broadcastable to a common shape (which may be the shape of one or
+ the other).
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array, element-wise comparison of `x1` and `x2`.
+ Typically of type bool, unless ``dtype=object`` is passed.
+ $OUT_SCALAR_2
+
+
+ See Also
+ --------
+ greater_equal, less, less_equal, equal, not_equal
+
+ Examples
+ --------
+ >>> np.greater([4,2],[2,2])
+ array([ True, False])
+
+ If the inputs are ndarrays, then np.greater is equivalent to '>'.
+
+ >>> a = np.array([4,2])
+ >>> b = np.array([2,2])
+ >>> a > b
+ array([ True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'greater_equal',
+ """
+ Return the truth value of (x1 >= x2) element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays. If ``x1.shape != x2.shape``, they must be
+ broadcastable to a common shape (which may be the shape of one or
+ the other).
+ $PARAMS
+
+ Returns
+ -------
+ out : bool or ndarray of bool
+ Output array, element-wise comparison of `x1` and `x2`.
+ Typically of type bool, unless ``dtype=object`` is passed.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ greater, less, less_equal, equal, not_equal
+
+ Examples
+ --------
+ >>> np.greater_equal([4, 2, 1], [2, 2, 2])
+ array([ True, True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'hypot',
+ """
+ Given the "legs" of a right triangle, return its hypotenuse.
+
+ Equivalent to ``sqrt(x1**2 + x2**2)``, element-wise. If `x1` or
+ `x2` is scalar_like (i.e., unambiguously cast-able to a scalar type),
+ it is broadcast for use with each element of the other argument.
+ (See Examples)
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Leg of the triangle(s).
+ $PARAMS
+
+ Returns
+ -------
+ z : ndarray
+ The hypotenuse of the triangle(s).
+ $OUT_SCALAR_2
+
+ Examples
+ --------
+ >>> np.hypot(3*np.ones((3, 3)), 4*np.ones((3, 3)))
+ array([[ 5., 5., 5.],
+ [ 5., 5., 5.],
+ [ 5., 5., 5.]])
+
+ Example showing broadcast of scalar_like argument:
+
+ >>> np.hypot(3*np.ones((3, 3)), [4])
+ array([[ 5., 5., 5.],
+ [ 5., 5., 5.],
+ [ 5., 5., 5.]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'invert',
+ """
+ Compute bit-wise inversion, or bit-wise NOT, element-wise.
+
+ Computes the bit-wise NOT of the underlying binary representation of
+ the integers in the input arrays. This ufunc implements the C/Python
+ operator ``~``.
+
+ For signed integer inputs, the two's complement is returned. In a
+ two's-complement system negative numbers are represented by the two's
+ complement of the absolute value. This is the most common method of
+    representing signed integers on computers [1]_. An N-bit
+ two's-complement system can represent every integer in the range
+ :math:`-2^{N-1}` to :math:`+2^{N-1}-1`.
+
+ Parameters
+ ----------
+ x : array_like
+ Only integer and boolean types are handled.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Result.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ bitwise_and, bitwise_or, bitwise_xor
+ logical_not
+ binary_repr :
+ Return the binary representation of the input number as a string.
+
+ Notes
+ -----
+ `bitwise_not` is an alias for `invert`:
+
+ >>> np.bitwise_not is np.invert
+ True
+
+ References
+ ----------
+ .. [1] Wikipedia, "Two's complement",
+ https://en.wikipedia.org/wiki/Two's_complement
+
+ Examples
+ --------
+ We've seen that 13 is represented by ``00001101``.
+ The invert or bit-wise NOT of 13 is then:
+
+    >>> np.invert(np.array([13], dtype=np.uint8))
+    array([242], dtype=uint8)
+    >>> np.binary_repr(13, width=8)
+    '00001101'
+ >>> np.binary_repr(242, width=8)
+ '11110010'
+
+ The result depends on the bit-width:
+
+    >>> np.invert(np.array([13], dtype=np.uint16))
+    array([65522], dtype=uint16)
+    >>> np.binary_repr(13, width=16)
+    '0000000000001101'
+ >>> np.binary_repr(65522, width=16)
+ '1111111111110010'
+
+ When using signed integer types the result is the two's complement of
+ the result for the unsigned type:
+
+    >>> np.invert(np.array([13], dtype=np.int8))
+ array([-14], dtype=int8)
+ >>> np.binary_repr(-14, width=8)
+ '11110010'
+
+ Booleans are accepted as well:
+
+    >>> np.invert(np.array([True, False]))
+ array([False, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'isfinite',
+ """
+ Test element-wise for finiteness (not infinity or not Not a Number).
+
+ The result is returned as a boolean array.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray, bool
+ True where ``x`` is not positive infinity, negative infinity,
+ or NaN; false otherwise.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ isinf, isneginf, isposinf, isnan
+
+ Notes
+ -----
+ Not a Number, positive infinity and negative infinity are considered
+ to be non-finite.
+
+ NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic
+    (IEEE 754). This means that Not a Number is not equivalent to infinity,
+    and that positive infinity is not equivalent to negative infinity; but
+    infinity is equivalent to positive infinity. Errors result if the
+    second argument is also supplied when `x` is a scalar input, or if the
+    first and second arguments have different shapes.
+
+ Examples
+ --------
+ >>> np.isfinite(1)
+ True
+ >>> np.isfinite(0)
+ True
+ >>> np.isfinite(np.nan)
+ False
+ >>> np.isfinite(np.inf)
+ False
+ >>> np.isfinite(np.NINF)
+ False
+ >>> np.isfinite([np.log(-1.),1.,np.log(0)])
+ array([False, True, False])
+
+ >>> x = np.array([-np.inf, 0., np.inf])
+ >>> y = np.array([2, 2, 2])
+ >>> np.isfinite(x, y)
+ array([0, 1, 0])
+ >>> y
+ array([0, 1, 0])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'isinf',
+ """
+ Test element-wise for positive or negative infinity.
+
+ Returns a boolean array of the same shape as `x`, True where ``x ==
+ +/-inf``, otherwise False.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values
+ $PARAMS
+
+ Returns
+ -------
+ y : bool (scalar) or boolean ndarray
+ True where ``x`` is positive or negative infinity, false otherwise.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ isneginf, isposinf, isnan, isfinite
+
+ Notes
+ -----
+ NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic
+ (IEEE 754).
+
+ Errors result if the second argument is supplied when the first
+ argument is a scalar, or if the first and second arguments have
+ different shapes.
+
+ Examples
+ --------
+ >>> np.isinf(np.inf)
+ True
+ >>> np.isinf(np.nan)
+ False
+ >>> np.isinf(np.NINF)
+ True
+ >>> np.isinf([np.inf, -np.inf, 1.0, np.nan])
+ array([ True, True, False, False])
+
+ >>> x = np.array([-np.inf, 0., np.inf])
+ >>> y = np.array([2, 2, 2])
+ >>> np.isinf(x, y)
+ array([1, 0, 1])
+ >>> y
+ array([1, 0, 1])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'isnan',
+ """
+ Test element-wise for NaN and return result as a boolean array.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or bool
+ True where ``x`` is NaN, false otherwise.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ isinf, isneginf, isposinf, isfinite, isnat
+
+ Notes
+ -----
+ NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic
+ (IEEE 754). This means that Not a Number is not equivalent to infinity.
+
+ Examples
+ --------
+ >>> np.isnan(np.nan)
+ True
+ >>> np.isnan(np.inf)
+ False
+ >>> np.isnan([np.log(-1.),1.,np.log(0)])
+ array([ True, False, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'isnat',
+ """
+ Test element-wise for NaT (not a time) and return result as a boolean array.
+
+ .. versionadded:: 1.13.0
+
+ Parameters
+ ----------
+ x : array_like
+ Input array with datetime or timedelta data type.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or bool
+ True where ``x`` is NaT, false otherwise.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ isnan, isinf, isneginf, isposinf, isfinite
+
+ Examples
+ --------
+ >>> np.isnat(np.datetime64("NaT"))
+ True
+ >>> np.isnat(np.datetime64("2016-01-01"))
+ False
+ >>> np.isnat(np.array(["NaT", "2016-01-01"], dtype="datetime64[ns]"))
+ array([ True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'left_shift',
+ """
+ Shift the bits of an integer to the left.
+
+ Bits are shifted to the left by appending `x2` 0s at the right of `x1`.
+ Since the internal representation of numbers is in binary format, this
+ operation is equivalent to multiplying `x1` by ``2**x2``.
+
+ Parameters
+ ----------
+ x1 : array_like of integer type
+ Input values.
+ x2 : array_like of integer type
+ Number of zeros to append to `x1`. Has to be non-negative.
+ $PARAMS
+
+ Returns
+ -------
+ out : array of integer type
+ Return `x1` with bits shifted `x2` times to the left.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ right_shift : Shift the bits of an integer to the right.
+ binary_repr : Return the binary representation of the input number
+ as a string.
+
+ Examples
+ --------
+ >>> np.binary_repr(5)
+ '101'
+ >>> np.left_shift(5, 2)
+ 20
+ >>> np.binary_repr(20)
+ '10100'
+
+ >>> np.left_shift(5, [1,2,3])
+ array([10, 20, 40])
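+
+    A minimal sketch of the multiplication equivalence noted above
+    (illustrative only):
+
+    >>> np.left_shift(5, 2) == 5 * 2**2
+    True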
+
+ """)
+
+add_newdoc('numpy.core.umath', 'less',
+ """
+ Return the truth value of (x1 < x2) element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays. If ``x1.shape != x2.shape``, they must be
+ broadcastable to a common shape (which may be the shape of one or
+ the other).
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array, element-wise comparison of `x1` and `x2`.
+ Typically of type bool, unless ``dtype=object`` is passed.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ greater, less_equal, greater_equal, equal, not_equal
+
+ Examples
+ --------
+ >>> np.less([1, 2], [2, 2])
+ array([ True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'less_equal',
+ """
+    Return the truth value of (x1 <= x2) element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays. If ``x1.shape != x2.shape``, they must be
+ broadcastable to a common shape (which may be the shape of one or
+ the other).
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array, element-wise comparison of `x1` and `x2`.
+ Typically of type bool, unless ``dtype=object`` is passed.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ greater, less, greater_equal, equal, not_equal
+
+ Examples
+ --------
+ >>> np.less_equal([4, 2, 1], [2, 2, 2])
+ array([False, True, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'log',
+ """
+ Natural logarithm, element-wise.
+
+ The natural logarithm `log` is the inverse of the exponential function,
+ so that `log(exp(x)) = x`. The natural logarithm is logarithm in base
+ `e`.
+
+ Parameters
+ ----------
+ x : array_like
+ Input value.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The natural logarithm of `x`, element-wise.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ log10, log2, log1p, emath.log
+
+ Notes
+ -----
+ Logarithm is a multivalued function: for each `x` there is an infinite
+ number of `z` such that `exp(z) = x`. The convention is to return the
+ `z` whose imaginary part lies in `[-pi, pi]`.
+
+ For real-valued input data types, `log` always returns real output. For
+ each value that cannot be expressed as a real number or infinity, it
+ yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `log` is a complex analytical function that
+ has a branch cut `[-inf, 0]` and is continuous from above on it. `log`
+ handles the floating-point negative zero as an infinitesimal negative
+ number, conforming to the C99 standard.
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/
+ .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm
+
+ Examples
+ --------
+ >>> np.log([1, np.e, np.e**2, 0])
+ array([ 0., 1., 2., -Inf])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'log10',
+ """
+ Return the base 10 logarithm of the input array, element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The logarithm to the base 10 of `x`, element-wise. NaNs are
+ returned where x is negative.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ emath.log10
+
+ Notes
+ -----
+ Logarithm is a multivalued function: for each `x` there is an infinite
+ number of `z` such that `10**z = x`. The convention is to return the
+ `z` whose imaginary part lies in `[-pi, pi]`.
+
+ For real-valued input data types, `log10` always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `log10` is a complex analytical function that
+ has a branch cut `[-inf, 0]` and is continuous from above on it.
+ `log10` handles the floating-point negative zero as an infinitesimal
+ negative number, conforming to the C99 standard.
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/
+ .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm
+
+ Examples
+ --------
+ >>> np.log10([1e-15, -3.])
+ array([-15., NaN])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'log2',
+ """
+ Base-2 logarithm of `x`.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ Base-2 logarithm of `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ log, log10, log1p, emath.log2
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ Logarithm is a multivalued function: for each `x` there is an infinite
+ number of `z` such that `2**z = x`. The convention is to return the `z`
+ whose imaginary part lies in `[-pi, pi]`.
+
+ For real-valued input data types, `log2` always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `log2` is a complex analytical function that
+ has a branch cut `[-inf, 0]` and is continuous from above on it. `log2`
+ handles the floating-point negative zero as an infinitesimal negative
+ number, conforming to the C99 standard.
+
+ Examples
+ --------
+ >>> x = np.array([0, 1, 2, 2**4])
+ >>> np.log2(x)
+ array([-Inf, 0., 1., 4.])
+
+ >>> xi = np.array([0+1.j, 1, 2+0.j, 4.j])
+ >>> np.log2(xi)
+ array([ 0.+2.26618007j, 0.+0.j , 1.+0.j , 2.+2.26618007j])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'logaddexp',
+ """
+ Logarithm of the sum of exponentiations of the inputs.
+
+ Calculates ``log(exp(x1) + exp(x2))``. This function is useful in
+ statistics where the calculated probabilities of events may be so small
+ as to exceed the range of normal floating point numbers. In such cases
+ the logarithm of the calculated probability is stored. This function
+ allows adding probabilities stored in such a fashion.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ result : ndarray
+ Logarithm of ``exp(x1) + exp(x2)``.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logaddexp2: Logarithm of the sum of exponentiations of inputs in base 2.
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ Examples
+ --------
+ >>> prob1 = np.log(1e-50)
+ >>> prob2 = np.log(2.5e-50)
+ >>> prob12 = np.logaddexp(prob1, prob2)
+ >>> prob12
+ -113.87649168120691
+ >>> np.exp(prob12)
+ 3.5000000000000057e-50
+
+ """)
+
+add_newdoc('numpy.core.umath', 'logaddexp2',
+ """
+ Logarithm of the sum of exponentiations of the inputs in base-2.
+
+ Calculates ``log2(2**x1 + 2**x2)``. This function is useful in machine
+ learning when the calculated probabilities of events may be so small as
+ to exceed the range of normal floating point numbers. In such cases
+ the base-2 logarithm of the calculated probability can be used instead.
+ This function allows adding probabilities stored in such a fashion.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ result : ndarray
+ Base-2 logarithm of ``2**x1 + 2**x2``.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logaddexp: Logarithm of the sum of exponentiations of the inputs.
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ Examples
+ --------
+ >>> prob1 = np.log2(1e-50)
+ >>> prob2 = np.log2(2.5e-50)
+ >>> prob12 = np.logaddexp2(prob1, prob2)
+ >>> prob1, prob2, prob12
+ (-166.09640474436813, -164.77447664948076, -164.28904982231052)
+ >>> 2**prob12
+ 3.4999999999999914e-50
+
+ """)
+
+add_newdoc('numpy.core.umath', 'log1p',
+ """
+ Return the natural logarithm of one plus the input array, element-wise.
+
+ Calculates ``log(1 + x)``.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ Natural logarithm of `1 + x`, element-wise.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ expm1 : ``exp(x) - 1``, the inverse of `log1p`.
+
+ Notes
+ -----
+ For real-valued input, `log1p` is accurate also for `x` so small
+ that `1 + x == 1` in floating-point accuracy.
+
+ Logarithm is a multivalued function: for each `x` there is an infinite
+ number of `z` such that `exp(z) = 1 + x`. The convention is to return
+ the `z` whose imaginary part lies in `[-pi, pi]`.
+
+ For real-valued input data types, `log1p` always returns real output.
+ For each value that cannot be expressed as a real number or infinity,
+ it yields ``nan`` and sets the `invalid` floating point error flag.
+
+ For complex-valued input, `log1p` is a complex analytical function that
+ has a branch cut `[-inf, -1]` and is continuous from above on it.
+ `log1p` handles the floating-point negative zero as an infinitesimal
+ negative number, conforming to the C99 standard.
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
+ 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/
+ .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm
+
+ Examples
+ --------
+ >>> np.log1p(1e-99)
+ 1e-99
+ >>> np.log(1 + 1e-99)
+ 0.0
+
+ """)
+
+add_newdoc('numpy.core.umath', 'logical_and',
+ """
+ Compute the truth value of x1 AND x2 element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays. `x1` and `x2` must be of the same shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or bool
+ Boolean result with the same shape as `x1` and `x2` of the logical
+ AND operation on corresponding elements of `x1` and `x2`.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logical_or, logical_not, logical_xor
+ bitwise_and
+
+ Examples
+ --------
+ >>> np.logical_and(True, False)
+ False
+ >>> np.logical_and([True, False], [False, False])
+ array([False, False])
+
+ >>> x = np.arange(5)
+ >>> np.logical_and(x>1, x<4)
+ array([False, False, True, True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'logical_not',
+ """
+ Compute the truth value of NOT x element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Logical NOT is applied to the elements of `x`.
+ $PARAMS
+
+ Returns
+ -------
+ y : bool or ndarray of bool
+ Boolean result with the same shape as `x` of the NOT operation
+ on elements of `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ logical_and, logical_or, logical_xor
+
+ Examples
+ --------
+ >>> np.logical_not(3)
+ False
+ >>> np.logical_not([True, False, 0, 1])
+ array([False, True, True, False])
+
+ >>> x = np.arange(5)
+ >>> np.logical_not(x<3)
+ array([False, False, False, True, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'logical_or',
+ """
+ Compute the truth value of x1 OR x2 element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Logical OR is applied to the elements of `x1` and `x2`.
+ They have to be of the same shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or bool
+ Boolean result with the same shape as `x1` and `x2` of the logical
+ OR operation on elements of `x1` and `x2`.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logical_and, logical_not, logical_xor
+ bitwise_or
+
+ Examples
+ --------
+ >>> np.logical_or(True, False)
+ True
+ >>> np.logical_or([True, False], [False, False])
+ array([ True, False])
+
+ >>> x = np.arange(5)
+ >>> np.logical_or(x < 1, x > 3)
+ array([ True, False, False, False, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'logical_xor',
+ """
+ Compute the truth value of x1 XOR x2, element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Logical XOR is applied to the elements of `x1` and `x2`. They must
+ be broadcastable to the same shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : bool or ndarray of bool
+ Boolean result of the logical XOR operation applied to the elements
+ of `x1` and `x2`; the shape is determined by whether or not
+ broadcasting of one or both arrays was required.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ logical_and, logical_or, logical_not, bitwise_xor
+
+ Examples
+ --------
+ >>> np.logical_xor(True, False)
+ True
+ >>> np.logical_xor([True, True, False, False], [True, False, True, False])
+ array([False, True, True, False])
+
+ >>> x = np.arange(5)
+ >>> np.logical_xor(x < 1, x > 3)
+ array([ True, False, False, False, True])
+
+ Simple example showing support of broadcasting
+
+ >>> np.logical_xor(0, np.eye(2))
+ array([[ True, False],
+ [False, True]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'maximum',
+ """
+ Element-wise maximum of array elements.
+
+    Compare two arrays and return a new array containing the element-wise
+ maxima. If one of the elements being compared is a NaN, then that
+ element is returned. If both elements are NaNs then the first is
+ returned. The latter distinction is important for complex NaNs, which
+ are defined as at least one of the real or imaginary parts being a NaN.
+ The net effect is that NaNs are propagated.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ The arrays holding the elements to be compared. They must have
+ the same shape, or shapes that can be broadcast to a single shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The maximum of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ minimum :
+ Element-wise minimum of two arrays, propagates NaNs.
+ fmax :
+ Element-wise maximum of two arrays, ignores NaNs.
+ amax :
+ The maximum value of an array along a given axis, propagates NaNs.
+ nanmax :
+ The maximum value of an array along a given axis, ignores NaNs.
+
+ fmin, amin, nanmin
+
+ Notes
+ -----
+ The maximum is equivalent to ``np.where(x1 >= x2, x1, x2)`` when
+ neither x1 nor x2 are nans, but it is faster and does proper
+ broadcasting.
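+
+    A small sketch of this equivalence (assuming NaN-free inputs):
+
+    >>> x1 = np.array([1, 5, 2])
+    >>> x2 = np.array([3, 4, 4])
+    >>> np.array_equal(np.maximum(x1, x2), np.where(x1 >= x2, x1, x2))
+    True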
+
+ Examples
+ --------
+ >>> np.maximum([2, 3, 4], [1, 5, 2])
+ array([2, 5, 4])
+
+ >>> np.maximum(np.eye(2), [0.5, 2]) # broadcasting
+ array([[ 1. , 2. ],
+ [ 0.5, 2. ]])
+
+ >>> np.maximum([np.nan, 0, np.nan], [0, np.nan, np.nan])
+ array([ NaN, NaN, NaN])
+ >>> np.maximum(np.Inf, 1)
+ inf
+
+ """)
+
+add_newdoc('numpy.core.umath', 'minimum',
+ """
+ Element-wise minimum of array elements.
+
+    Compare two arrays and return a new array containing the element-wise
+ minima. If one of the elements being compared is a NaN, then that
+ element is returned. If both elements are NaNs then the first is
+ returned. The latter distinction is important for complex NaNs, which
+ are defined as at least one of the real or imaginary parts being a NaN.
+ The net effect is that NaNs are propagated.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ The arrays holding the elements to be compared. They must have
+ the same shape, or shapes that can be broadcast to a single shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The minimum of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ maximum :
+ Element-wise maximum of two arrays, propagates NaNs.
+ fmin :
+ Element-wise minimum of two arrays, ignores NaNs.
+ amin :
+ The minimum value of an array along a given axis, propagates NaNs.
+ nanmin :
+ The minimum value of an array along a given axis, ignores NaNs.
+
+ fmax, amax, nanmax
+
+ Notes
+ -----
+ The minimum is equivalent to ``np.where(x1 <= x2, x1, x2)`` when
+ neither x1 nor x2 are NaNs, but it is faster and does proper
+ broadcasting.
+
+ Examples
+ --------
+ >>> np.minimum([2, 3, 4], [1, 5, 2])
+ array([1, 3, 2])
+
+ >>> np.minimum(np.eye(2), [0.5, 2]) # broadcasting
+ array([[ 0.5, 0. ],
+ [ 0. , 1. ]])
+
+ >>> np.minimum([np.nan, 0, np.nan],[0, np.nan, np.nan])
+ array([ NaN, NaN, NaN])
+ >>> np.minimum(-np.Inf, 1)
+ -inf
+
+ """)
+
+add_newdoc('numpy.core.umath', 'fmax',
+ """
+ Element-wise maximum of array elements.
+
+    Compare two arrays and return a new array containing the element-wise
+ maxima. If one of the elements being compared is a NaN, then the
+ non-nan element is returned. If both elements are NaNs then the first
+ is returned. The latter distinction is important for complex NaNs,
+ which are defined as at least one of the real or imaginary parts being
+ a NaN. The net effect is that NaNs are ignored when possible.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ The arrays holding the elements to be compared. They must have
+ the same shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The maximum of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ fmin :
+ Element-wise minimum of two arrays, ignores NaNs.
+ maximum :
+ Element-wise maximum of two arrays, propagates NaNs.
+ amax :
+ The maximum value of an array along a given axis, propagates NaNs.
+ nanmax :
+ The maximum value of an array along a given axis, ignores NaNs.
+
+ minimum, amin, nanmin
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ The fmax is equivalent to ``np.where(x1 >= x2, x1, x2)`` when neither
+ x1 nor x2 are NaNs, but it is faster and does proper broadcasting.
+
+ Examples
+ --------
+ >>> np.fmax([2, 3, 4], [1, 5, 2])
+    array([2, 5, 4])
+
+ >>> np.fmax(np.eye(2), [0.5, 2])
+ array([[ 1. , 2. ],
+ [ 0.5, 2. ]])
+
+ >>> np.fmax([np.nan, 0, np.nan],[0, np.nan, np.nan])
+ array([ 0., 0., NaN])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'fmin',
+ """
+ Element-wise minimum of array elements.
+
+    Compare two arrays and return a new array containing the element-wise
+ minima. If one of the elements being compared is a NaN, then the
+ non-nan element is returned. If both elements are NaNs then the first
+ is returned. The latter distinction is important for complex NaNs,
+ which are defined as at least one of the real or imaginary parts being
+ a NaN. The net effect is that NaNs are ignored when possible.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ The arrays holding the elements to be compared. They must have
+ the same shape.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The minimum of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ fmax :
+ Element-wise maximum of two arrays, ignores NaNs.
+ minimum :
+ Element-wise minimum of two arrays, propagates NaNs.
+ amin :
+ The minimum value of an array along a given axis, propagates NaNs.
+ nanmin :
+ The minimum value of an array along a given axis, ignores NaNs.
+
+ maximum, amax, nanmax
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ The fmin is equivalent to ``np.where(x1 <= x2, x1, x2)`` when neither
+ x1 nor x2 are NaNs, but it is faster and does proper broadcasting.
+
+ Examples
+ --------
+ >>> np.fmin([2, 3, 4], [1, 5, 2])
+ array([1, 3, 2])
+
+ >>> np.fmin(np.eye(2), [0.5, 2])
+ array([[ 0.5, 0. ],
+ [ 0. , 1. ]])
+
+ >>> np.fmin([np.nan, 0, np.nan],[0, np.nan, np.nan])
+ array([ 0., 0., NaN])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'matmul',
+ """
+ Matrix product of two arrays.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays, scalars not allowed.
+ out : ndarray, optional
+ A location into which the result is stored. If provided, it must have
+ a shape that matches the signature `(n,k),(k,m)->(n,m)`. If not
+ provided or `None`, a freshly-allocated array is returned.
+ **kwargs
+ For other keyword-only arguments, see the
+ :ref:`ufunc docs <ufuncs.kwargs>`.
+
+        .. versionadded:: 1.16
+ Now handles ufunc kwargs
+
+ Returns
+ -------
+ y : ndarray
+ The matrix product of the inputs.
+ This is a scalar only when both x1, x2 are 1-d vectors.
+
+ Raises
+ ------
+ ValueError
+ If the last dimension of `a` is not the same size as
+ the second-to-last dimension of `b`.
+
+ If a scalar value is passed in.
+
+ See Also
+ --------
+ vdot : Complex-conjugating dot product.
+ tensordot : Sum products over arbitrary axes.
+ einsum : Einstein summation convention.
+ dot : alternative matrix product with different broadcasting rules.
+
+ Notes
+ -----
+
+ The behavior depends on the arguments in the following way.
+
+ - If both arguments are 2-D they are multiplied like conventional
+ matrices.
+ - If either argument is N-D, N > 2, it is treated as a stack of
+ matrices residing in the last two indexes and broadcast accordingly.
+ - If the first argument is 1-D, it is promoted to a matrix by
+ prepending a 1 to its dimensions. After matrix multiplication
+ the prepended 1 is removed.
+ - If the second argument is 1-D, it is promoted to a matrix by
+ appending a 1 to its dimensions. After matrix multiplication
+ the appended 1 is removed.
+
+ ``matmul`` differs from ``dot`` in two important ways:
+
+ - Multiplication by scalars is not allowed, use ``*`` instead.
+ - Stacks of matrices are broadcast together as if the matrices
+ were elements, respecting the signature ``(n,k),(k,m)->(n,m)``:
+
+ >>> a = np.ones([9, 5, 7, 4])
+ >>> c = np.ones([9, 5, 4, 3])
+ >>> np.dot(a, c).shape
+ (9, 5, 7, 9, 5, 3)
+ >>> np.matmul(a, c).shape
+ (9, 5, 7, 3)
+ >>> # n is 7, k is 4, m is 3
+
+ The matmul function implements the semantics of the `@` operator introduced
+ in Python 3.5 following PEP465.
+
+ Examples
+ --------
+ For 2-D arrays it is the matrix product:
+
+ >>> a = np.array([[1, 0],
+ ... [0, 1]])
+ >>> b = np.array([[4, 1],
+    ...               [2, 2]])
+ >>> np.matmul(a, b)
+ array([[4, 1],
+ [2, 2]])
+
+    For 2-D arrays mixed with 1-D, the result is the usual matrix-vector product.
+
+ >>> a = np.array([[1, 0],
+    ...               [0, 1]])
+ >>> b = np.array([1, 2])
+ >>> np.matmul(a, b)
+ array([1, 2])
+ >>> np.matmul(b, a)
+ array([1, 2])
+
+
+ Broadcasting is conventional for stacks of arrays
+
+ >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))
+ >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))
+ >>> np.matmul(a,b).shape
+ (2, 2, 2)
+ >>> np.matmul(a, b)[0, 1, 1]
+ 98
+ >>> sum(a[0, 1, :] * b[0 , :, 1])
+ 98
+
+ Vector, vector returns the scalar inner product, but neither argument
+ is complex-conjugated:
+
+ >>> np.matmul([2j, 3j], [2j, 3j])
+ (-13+0j)
+
+ Scalar multiplication raises an error.
+
+ >>> np.matmul([1,2], 3)
+ Traceback (most recent call last):
+ ...
+ ValueError: matmul: Input operand 1 does not have enough dimensions ...
+
+ .. versionadded:: 1.10.0
+ """)
+
+add_newdoc('numpy.core.umath', 'modf',
+ """
+ Return the fractional and integral parts of an array, element-wise.
+
+ The fractional and integral parts are negative if the given number is
+ negative.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y1 : ndarray
+ Fractional part of `x`.
+ $OUT_SCALAR_1
+ y2 : ndarray
+ Integral part of `x`.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ For integer input the return values are floats.
+
+ See Also
+ --------
+ divmod : ``divmod(x, 1)`` is equivalent to ``modf`` with the return values
+ switched, except it always has a positive remainder.
+
+ Examples
+ --------
+ >>> np.modf([0, 3.5])
+ (array([ 0. , 0.5]), array([ 0., 3.]))
+ >>> np.modf(-0.5)
+    (-0.5, -0.0)
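+
+    A small sketch of the note that integer input yields float parts
+    (illustrative only):
+
+    >>> np.modf(3)
+    (0.0, 3.0)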
+
+ """)
+
+add_newdoc('numpy.core.umath', 'multiply',
+ """
+ Multiply arguments element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays to be multiplied.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The product of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ Notes
+ -----
+ Equivalent to `x1` * `x2` in terms of array broadcasting.
+
+ Examples
+ --------
+ >>> np.multiply(2.0, 4.0)
+ 8.0
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> np.multiply(x1, x2)
+ array([[ 0., 1., 4.],
+ [ 0., 4., 10.],
+ [ 0., 7., 16.]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'negative',
+ """
+ Numerical negative, element-wise.
+
+ Parameters
+ ----------
+ x : array_like or scalar
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ Returned array or scalar: `y = -x`.
+ $OUT_SCALAR_1
+
+ Examples
+ --------
+ >>> np.negative([1.,-1.])
+ array([-1., 1.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'positive',
+ """
+ Numerical positive, element-wise.
+
+ .. versionadded:: 1.13.0
+
+ Parameters
+ ----------
+ x : array_like or scalar
+ Input array.
+
+ Returns
+ -------
+ y : ndarray or scalar
+ Returned array or scalar: `y = +x`.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ Equivalent to `x.copy()`, but only defined for types that support
+ arithmetic.
+
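+    Examples
+    --------
+    A minimal illustrative sketch (exact output formatting follows the
+    current print options):
+
+    >>> np.positive([1., -1.])
+    array([ 1., -1.])
+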
+ """)
+
+add_newdoc('numpy.core.umath', 'not_equal',
+ """
+ Return (x1 != x2) element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array, element-wise comparison of `x1` and `x2`.
+ Typically of type bool, unless ``dtype=object`` is passed.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ equal, greater, greater_equal, less, less_equal
+
+ Examples
+ --------
+ >>> np.not_equal([1.,2.], [1., 3.])
+ array([False, True])
+ >>> np.not_equal([1, 2], [[1, 3],[1, 4]])
+ array([[False, True],
+ [False, True]])
+
+ """)
+
+add_newdoc('numpy.core.umath', '_ones_like',
+ """
+    This function used to be numpy.ones_like, but a dedicated function has
+    since been written for consistency with the other *_like functions.
+    It is now only used internally, in a limited fashion.
+
+ See Also
+ --------
+ ones_like
+
+ """)
+
+add_newdoc('numpy.core.umath', 'power',
+ """
+ First array elements raised to powers from second array, element-wise.
+
+ Raise each base in `x1` to the positionally-corresponding power in
+ `x2`. `x1` and `x2` must be broadcastable to the same shape. Note that an
+ integer type raised to a negative integer power will raise a ValueError.
+
+ Parameters
+ ----------
+ x1 : array_like
+ The bases.
+ x2 : array_like
+ The exponents.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The bases in `x1` raised to the exponents in `x2`.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ float_power : power function that promotes integers to float
+
+ Examples
+ --------
+ Cube each element in a list.
+
+ >>> x1 = range(6)
+ >>> x1
+ [0, 1, 2, 3, 4, 5]
+ >>> np.power(x1, 3)
+ array([ 0, 1, 8, 27, 64, 125])
+
+ Raise the bases to different exponents.
+
+ >>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0]
+ >>> np.power(x1, x2)
+ array([ 0., 1., 8., 27., 16., 5.])
+
+ The effect of broadcasting.
+
+ >>> x2 = np.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]])
+ >>> x2
+ array([[1, 2, 3, 3, 2, 1],
+ [1, 2, 3, 3, 2, 1]])
+ >>> np.power(x1, x2)
+ array([[ 0, 1, 8, 27, 16, 5],
+ [ 0, 1, 8, 27, 16, 5]])
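+
+    As noted above, raising an integer type to a negative integer power
+    raises an error (sketch; the exact message may vary between versions):
+
+    >>> np.power(2, -1)
+    Traceback (most recent call last):
+        ...
+    ValueError: Integers to negative integer powers are not allowed.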
+
+ """)
+
+add_newdoc('numpy.core.umath', 'float_power',
+ """
+ First array elements raised to powers from second array, element-wise.
+
+ Raise each base in `x1` to the positionally-corresponding power in `x2`.
+ `x1` and `x2` must be broadcastable to the same shape. This differs from
+ the power function in that integers, float16, and float32 are promoted to
+ floats with a minimum precision of float64 so that the result is always
+ inexact. The intent is that the function will return a usable result for
+ negative powers and seldom overflow for positive powers.
+
+ .. versionadded:: 1.12.0
+
+ Parameters
+ ----------
+ x1 : array_like
+ The bases.
+ x2 : array_like
+ The exponents.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The bases in `x1` raised to the exponents in `x2`.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ power : power function that preserves type
+
+ Examples
+ --------
+ Cube each element in a list.
+
+ >>> x1 = range(6)
+ >>> x1
+ [0, 1, 2, 3, 4, 5]
+ >>> np.float_power(x1, 3)
+ array([ 0., 1., 8., 27., 64., 125.])
+
+ Raise the bases to different exponents.
+
+ >>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0]
+ >>> np.float_power(x1, x2)
+ array([ 0., 1., 8., 27., 16., 5.])
+
+ The effect of broadcasting.
+
+ >>> x2 = np.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]])
+ >>> x2
+ array([[1, 2, 3, 3, 2, 1],
+ [1, 2, 3, 3, 2, 1]])
+ >>> np.float_power(x1, x2)
+ array([[ 0., 1., 8., 27., 16., 5.],
+ [ 0., 1., 8., 27., 16., 5.]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'radians',
+ """
+ Convert angles from degrees to radians.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array in degrees.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding radian values.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ deg2rad : equivalent function
+
+ Examples
+ --------
+ Convert a degree array to radians
+
+ >>> deg = np.arange(12.) * 30.
+ >>> np.radians(deg)
+ array([ 0. , 0.52359878, 1.04719755, 1.57079633, 2.0943951 ,
+ 2.61799388, 3.14159265, 3.66519143, 4.1887902 , 4.71238898,
+ 5.23598776, 5.75958653])
+
+ >>> out = np.zeros((deg.shape))
+ >>> ret = np.radians(deg, out)
+ >>> ret is out
+ True
+
+ """)
+
+add_newdoc('numpy.core.umath', 'deg2rad',
+ """
+ Convert angles from degrees to radians.
+
+ Parameters
+ ----------
+ x : array_like
+ Angles in degrees.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding angle in radians.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ rad2deg : Convert angles from radians to degrees.
+ unwrap : Remove large jumps in angle by wrapping.
+
+ Notes
+ -----
+ .. versionadded:: 1.3.0
+
+ ``deg2rad(x)`` is ``x * pi / 180``.
+
+ Examples
+ --------
+ >>> np.deg2rad(180)
+ 3.1415926535897931
+
+ """)
+
+add_newdoc('numpy.core.umath', 'reciprocal',
+ """
+ Return the reciprocal of the argument, element-wise.
+
+ Calculates ``1/x``.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ Return array.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ .. note::
+ This function is not designed to work with integers.
+
+ For integer arguments with absolute value larger than 1 the result is
+ always zero because of the way Python handles integer division. For
+ integer zero the result is an overflow.
+
+ Examples
+ --------
+ >>> np.reciprocal(2.)
+ 0.5
+ >>> np.reciprocal([1, 2., 3.33])
+ array([ 1. , 0.5 , 0.3003003])
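+
+    As noted above, integer input is truncated (a small illustration):
+
+    >>> np.reciprocal(2)
+    0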
+
+ """)
+
+add_newdoc('numpy.core.umath', 'remainder',
+ """
+ Return element-wise remainder of division.
+
+ Computes the remainder complementary to the `floor_divide` function. It is
+    equivalent to the Python modulus operator ``x1 % x2`` and has the same sign
+ as the divisor `x2`. The MATLAB function equivalent to ``np.remainder``
+ is ``mod``.
+
+ .. warning::
+
+ This should not be confused with:
+
+        * Python 3.7's `math.remainder` and C's ``remainder``, which
+          compute the IEEE remainder, the complement to
+          ``round(x1 / x2)``.
+        * The MATLAB ``rem`` function and the C ``%`` operator, which are the
+          complement to ``int(x1 / x2)``.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Dividend array.
+ x2 : array_like
+ Divisor array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The element-wise remainder of the quotient ``floor_divide(x1, x2)``.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ floor_divide : Equivalent of Python ``//`` operator.
+ divmod : Simultaneous floor division and remainder.
+ fmod : Equivalent of the MATLAB ``rem`` function.
+ divide, floor
+
+ Notes
+ -----
+ Returns 0 when `x2` is 0 and both `x1` and `x2` are (arrays of)
+ integers.
+
+ Examples
+ --------
+ >>> np.remainder([4, 7], [2, 3])
+ array([0, 1])
+ >>> np.remainder(np.arange(7), 5)
+ array([0, 1, 2, 3, 4, 0, 1])
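+
+    The result takes the sign of the divisor, unlike `fmod` (an
+    illustrative comparison):
+
+    >>> np.remainder(-5, 3)
+    1
+    >>> np.fmod(-5, 3)
+    -2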
+
+ """)
+
+add_newdoc('numpy.core.umath', 'divmod',
+ """
+ Return element-wise quotient and remainder simultaneously.
+
+ .. versionadded:: 1.13.0
+
+ ``np.divmod(x, y)`` is equivalent to ``(x // y, x % y)``, but faster
+ because it avoids redundant work. It is used to implement the Python
+ built-in function ``divmod`` on NumPy arrays.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Dividend array.
+ x2 : array_like
+ Divisor array.
+ $PARAMS
+
+ Returns
+ -------
+ out1 : ndarray
+ Element-wise quotient resulting from floor division.
+ $OUT_SCALAR_2
+ out2 : ndarray
+ Element-wise remainder from floor division.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ floor_divide : Equivalent to Python's ``//`` operator.
+ remainder : Equivalent to Python's ``%`` operator.
+ modf : Equivalent to ``divmod(x, 1)`` for positive ``x`` with the return
+ values switched.
+
+ Examples
+ --------
+ >>> np.divmod(np.arange(5), 3)
+ (array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))
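+
+    As noted above, the Python built-in ``divmod`` dispatches to this
+    function for arrays (shown here for illustration):
+
+    >>> divmod(np.arange(5), 3)
+    (array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))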
+
+ """)
+
+add_newdoc('numpy.core.umath', 'right_shift',
+ """
+ Shift the bits of an integer to the right.
+
+    Bits are shifted to the right by `x2` places. Because the internal
+ representation of numbers is in binary format, this operation is
+ equivalent to dividing `x1` by ``2**x2``.
+
+ Parameters
+ ----------
+ x1 : array_like, int
+ Input values.
+ x2 : array_like, int
+ Number of bits to remove at the right of `x1`.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray, int
+ Return `x1` with bits shifted `x2` times to the right.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ left_shift : Shift the bits of an integer to the left.
+ binary_repr : Return the binary representation of the input number
+ as a string.
+
+ Examples
+ --------
+ >>> np.binary_repr(10)
+ '1010'
+ >>> np.right_shift(10, 1)
+ 5
+ >>> np.binary_repr(5)
+ '101'
+
+ >>> np.right_shift(10, [1,2,3])
+ array([5, 2, 1])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'rint',
+ """
+ Round elements of the array to the nearest integer.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Output array is same shape and type as `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ ceil, floor, trunc
+
+ Examples
+ --------
+ >>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
+ >>> np.rint(a)
+ array([-2., -2., -0., 0., 2., 2., 2.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'sign',
+ """
+ Returns an element-wise indication of the sign of a number.
+
+ The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``. nan
+ is returned for nan inputs.
+
+ For complex inputs, the `sign` function returns
+ ``sign(x.real) + 0j if x.real != 0 else sign(x.imag) + 0j``.
+
+ complex(nan, 0) is returned for complex nan inputs.
+
+ Parameters
+ ----------
+ x : array_like
+ Input values.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The sign of `x`.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ There is more than one definition of sign in common use for complex
+ numbers. The definition used here is equivalent to :math:`x/\\sqrt{x*x}`
+ which is different from a common alternative, :math:`x/|x|`.
+
+ Examples
+ --------
+ >>> np.sign([-5., 4.5])
+ array([-1., 1.])
+ >>> np.sign(0)
+ 0
+ >>> np.sign(5-2j)
+ (1+0j)
+
+ """)
+
+add_newdoc('numpy.core.umath', 'signbit',
+ """
+ Returns element-wise True where signbit is set (less than zero).
+
+ Parameters
+ ----------
+ x : array_like
+ The input value(s).
+ $PARAMS
+
+ Returns
+ -------
+ result : ndarray of bool
+ Output array, or reference to `out` if that was supplied.
+ $OUT_SCALAR_1
+
+ Examples
+ --------
+ >>> np.signbit(-1.2)
+ True
+ >>> np.signbit(np.array([1, -2.3, 2.1]))
+ array([False, True, False])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'copysign',
+ """
+ Change the sign of x1 to that of x2, element-wise.
+
+ If both arguments are arrays or sequences, they have to be of the same
+ length. If `x2` is a scalar, its sign will be copied to all elements of
+ `x1`.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Values to change the sign of.
+ x2 : array_like
+ The sign of `x2` is copied to `x1`.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ The values of `x1` with the sign of `x2`.
+ $OUT_SCALAR_2
+
+ Examples
+ --------
+ >>> np.copysign(1.3, -1)
+ -1.3
+ >>> 1/np.copysign(0, 1)
+ inf
+ >>> 1/np.copysign(0, -1)
+ -inf
+
+ >>> np.copysign([-1, 0, 1], -1.1)
+ array([-1., -0., -1.])
+ >>> np.copysign([-1, 0, 1], np.arange(3)-1)
+ array([-1., 0., 1.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'nextafter',
+ """
+ Return the next floating-point value after x1 towards x2, element-wise.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Values to find the next representable value of.
+ x2 : array_like
+ The direction where to look for the next representable value of `x1`.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ The next representable values of `x1` in the direction of `x2`.
+ $OUT_SCALAR_2
+
+ Examples
+ --------
+ >>> eps = np.finfo(np.float64).eps
+ >>> np.nextafter(1, 2) == eps + 1
+ True
+ >>> np.nextafter([1, 2], [2, 1]) == [eps + 1, 2 - eps]
+ array([ True, True])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'spacing',
+ """
+ Return the distance between x and the nearest adjacent number.
+
+ Parameters
+ ----------
+ x : array_like
+ Values to find the spacing of.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ The spacing of values of `x`.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ It can be considered as a generalization of EPS:
+ ``spacing(np.float64(1)) == np.finfo(np.float64).eps``, and there
+ should not be any representable number between ``x + spacing(x)`` and
+ x for any finite x.
+
+ Spacing of +- inf and NaN is NaN.
+
+ Examples
+ --------
+ >>> np.spacing(1) == np.finfo(np.float64).eps
+ True
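+
+    As stated in the Notes, the spacing of infinities and NaN is NaN
+    (illustrative):
+
+    >>> np.spacing(np.inf)
+    nan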
+
+ """)
+
+add_newdoc('numpy.core.umath', 'sin',
+ """
+ Trigonometric sine, element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Angle, in radians (:math:`2 \\pi` rad equals 360 degrees).
+ $PARAMS
+
+ Returns
+ -------
+ y : array_like
+ The sine of each element of x.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ arcsin, sinh, cos
+
+ Notes
+ -----
+ The sine is one of the fundamental functions of trigonometry (the
+ mathematical study of triangles). Consider a circle of radius 1
+ centered on the origin. A ray comes in from the :math:`+x` axis, makes
+ an angle at the origin (measured counter-clockwise from that axis), and
+ departs from the origin. The :math:`y` coordinate of the outgoing
+ ray's intersection with the unit circle is the sine of that angle. It
+ ranges from -1 for :math:`x=3\\pi / 2` to +1 for :math:`\\pi / 2.` The
+ function has zeroes where the angle is a multiple of :math:`\\pi`.
+ Sines of angles between :math:`\\pi` and :math:`2\\pi` are negative.
+ The numerous properties of the sine and related functions are included
+ in any standard trigonometry text.
+
+ Examples
+ --------
+ Print sine of one angle:
+
+ >>> np.sin(np.pi/2.)
+ 1.0
+
+ Print sines of an array of angles given in degrees:
+
+ >>> np.sin(np.array((0., 30., 45., 60., 90.)) * np.pi / 180. )
+ array([ 0. , 0.5 , 0.70710678, 0.8660254 , 1. ])
+
+ Plot the sine function:
+
+ >>> import matplotlib.pylab as plt
+ >>> x = np.linspace(-np.pi, np.pi, 201)
+ >>> plt.plot(x, np.sin(x))
+ >>> plt.xlabel('Angle [rad]')
+ >>> plt.ylabel('sin(x)')
+ >>> plt.axis('tight')
+ >>> plt.show()
+
+ """)
+
+add_newdoc('numpy.core.umath', 'sinh',
+ """
+ Hyperbolic sine, element-wise.
+
+ Equivalent to ``1/2 * (np.exp(x) - np.exp(-x))`` or
+ ``-1j * np.sin(1j*x)``.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding hyperbolic sine values.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ If `out` is provided, the function writes the result into it,
+ and returns a reference to `out`. (See Examples)
+
+ References
+ ----------
+ M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.
+ New York, NY: Dover, 1972, pg. 83.
+
+ Examples
+ --------
+ >>> np.sinh(0)
+ 0.0
+ >>> np.sinh(np.pi*1j/2)
+ 1j
+ >>> np.sinh(np.pi*1j) # (exact value is 0)
+ 1.2246063538223773e-016j
+ >>> # Discrepancy due to vagaries of floating point arithmetic.
+
+    >>> # Example of providing the optional output parameter
+    >>> out1 = np.array([0], dtype='d')
+    >>> out2 = np.sinh([0.1], out1)
+    >>> out2 is out1
+    True
+
+ >>> # Example of ValueError due to provision of shape mis-matched `out`
+ >>> np.sinh(np.zeros((3,3)),np.zeros((2,2)))
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
+
+ """)
+
+add_newdoc('numpy.core.umath', 'sqrt',
+ """
+ Return the non-negative square-root of an array, element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ The values whose square-roots are required.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ An array of the same shape as `x`, containing the positive
+ square-root of each element in `x`. If any element in `x` is
+ complex, a complex array is returned (and the square-roots of
+ negative reals are calculated). If all of the elements in `x`
+ are real, so is `y`, with negative elements returning ``nan``.
+ If `out` was provided, `y` is a reference to it.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ lib.scimath.sqrt
+ A version which returns complex numbers when given negative reals.
+
+ Notes
+ -----
+ *sqrt* has--consistent with common convention--as its branch cut the
+ real "interval" [`-inf`, 0), and is continuous from above on it.
+ A branch cut is a curve in the complex plane across which a given
+ complex function fails to be continuous.
+
+ Examples
+ --------
+ >>> np.sqrt([1,4,9])
+ array([ 1., 2., 3.])
+
+ >>> np.sqrt([4, -1, -3+4J])
+ array([ 2.+0.j, 0.+1.j, 1.+2.j])
+
+    >>> np.sqrt([4, -1, np.inf])
+ array([ 2., NaN, Inf])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'cbrt',
+ """
+ Return the cube-root of an array, element-wise.
+
+ .. versionadded:: 1.10.0
+
+ Parameters
+ ----------
+ x : array_like
+ The values whose cube-roots are required.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+        An array of the same shape as `x`, containing the cube-root of
+        each element in `x`.
+ If `out` was provided, `y` is a reference to it.
+ $OUT_SCALAR_1
+
+
+ Examples
+ --------
+ >>> np.cbrt([1,8,27])
+ array([ 1., 2., 3.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'square',
+ """
+ Return the element-wise square of the input.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ Element-wise `x*x`, of the same shape and dtype as `x`.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ numpy.linalg.matrix_power
+ sqrt
+ power
+
+ Examples
+ --------
+ >>> np.square([-1j, 1])
+ array([-1.-0.j, 1.+0.j])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'subtract',
+ """
+ Subtract arguments, element-wise.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ The arrays to be subtracted from each other.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The difference of `x1` and `x2`, element-wise.
+ $OUT_SCALAR_2
+
+ Notes
+ -----
+ Equivalent to ``x1 - x2`` in terms of array broadcasting.
+
+ Examples
+ --------
+ >>> np.subtract(1.0, 4.0)
+ -3.0
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> np.subtract(x1, x2)
+ array([[ 0., 0., 0.],
+ [ 3., 3., 3.],
+ [ 6., 6., 6.]])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'tan',
+ """
+ Compute tangent element-wise.
+
+ Equivalent to ``np.sin(x)/np.cos(x)`` element-wise.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding tangent values.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ If `out` is provided, the function writes the result into it,
+ and returns a reference to `out`. (See Examples)
+
+ References
+ ----------
+ M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.
+ New York, NY: Dover, 1972.
+
+ Examples
+ --------
+ >>> from math import pi
+ >>> np.tan(np.array([-pi,pi/2,pi]))
+ array([ 1.22460635e-16, 1.63317787e+16, -1.22460635e-16])
+ >>>
+    >>> # Example of providing the optional output parameter illustrating
+    >>> # that what is returned is a reference to said parameter
+    >>> out1 = np.array([0], dtype='d')
+    >>> out2 = np.tan([0.1], out1)
+    >>> out2 is out1
+    True
+    >>>
+    >>> # Example of ValueError due to provision of shape mis-matched `out`
+    >>> np.tan(np.zeros((3,3)),np.zeros((2,2)))
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
+
+ """)
+
+add_newdoc('numpy.core.umath', 'tanh',
+ """
+ Compute hyperbolic tangent element-wise.
+
+ Equivalent to ``np.sinh(x)/np.cosh(x)`` or ``-1j * np.tan(1j*x)``.
+
+ Parameters
+ ----------
+ x : array_like
+ Input array.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray
+ The corresponding hyperbolic tangent values.
+ $OUT_SCALAR_1
+
+ Notes
+ -----
+ If `out` is provided, the function writes the result into it,
+ and returns a reference to `out`. (See Examples)
+
+ References
+ ----------
+ .. [1] M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.
+ New York, NY: Dover, 1972, pg. 83.
+ http://www.math.sfu.ca/~cbm/aands/
+
+ .. [2] Wikipedia, "Hyperbolic function",
+ https://en.wikipedia.org/wiki/Hyperbolic_function
+
+ Examples
+ --------
+ >>> np.tanh((0, np.pi*1j, np.pi*1j/2))
+ array([ 0. +0.00000000e+00j, 0. -1.22460635e-16j, 0. +1.63317787e+16j])
+
+    >>> # Example of providing the optional output parameter illustrating
+    >>> # that what is returned is a reference to said parameter
+    >>> out1 = np.array([0], dtype='d')
+    >>> out2 = np.tanh([0.1], out1)
+    >>> out2 is out1
+ True
+
+ >>> # Example of ValueError due to provision of shape mis-matched `out`
+ >>> np.tanh(np.zeros((3,3)),np.zeros((2,2)))
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
+
+ """)
+
+add_newdoc('numpy.core.umath', 'true_divide',
+ """
+ Returns a true division of the inputs, element-wise.
+
+    Unlike Python's traditional 'floor division', this returns a true
+    division. True division adjusts the output type to give the best
+    answer, regardless of input types.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Dividend array.
+ x2 : array_like
+ Divisor array.
+ $PARAMS
+
+ Returns
+ -------
+ out : ndarray or scalar
+ $OUT_SCALAR_2
+
+ Notes
+ -----
+    The floor division operator ``//`` was added in Python 2.2, making
+ ``//`` and ``/`` equivalent operators. The default floor division
+ operation of ``/`` can be replaced by true division with ``from
+ __future__ import division``.
+
+ In Python 3.0, ``//`` is the floor division operator and ``/`` the
+ true division operator. The ``true_divide(x1, x2)`` function is
+ equivalent to true division in Python.
+
+ Examples
+ --------
+ >>> x = np.arange(5)
+ >>> np.true_divide(x, 4)
+ array([ 0. , 0.25, 0.5 , 0.75, 1. ])
+
+ >>> x/4
+ array([0, 0, 0, 0, 1])
+ >>> x//4
+ array([0, 0, 0, 0, 1])
+
+ >>> from __future__ import division
+ >>> x/4
+ array([ 0. , 0.25, 0.5 , 0.75, 1. ])
+ >>> x//4
+ array([0, 0, 0, 0, 1])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'frexp',
+ """
+ Decompose the elements of x into mantissa and twos exponent.
+
+    Returns (`mantissa`, `exponent`), where ``x = mantissa * 2**exponent``.
+    The mantissa lies in the open interval (-1, 1), while the twos
+    exponent is a signed integer.
+
+ Parameters
+ ----------
+ x : array_like
+ Array of numbers to be decomposed.
+ out1 : ndarray, optional
+ Output array for the mantissa. Must have the same shape as `x`.
+ out2 : ndarray, optional
+ Output array for the exponent. Must have the same shape as `x`.
+ $PARAMS
+
+ Returns
+ -------
+ mantissa : ndarray
+ Floating values between -1 and 1.
+ $OUT_SCALAR_1
+ exponent : ndarray
+ Integer exponents of 2.
+ $OUT_SCALAR_1
+
+ See Also
+ --------
+ ldexp : Compute ``y = x1 * 2**x2``, the inverse of `frexp`.
+
+ Notes
+ -----
+ Complex dtypes are not supported, they will raise a TypeError.
+
+ Examples
+ --------
+ >>> x = np.arange(9)
+ >>> y1, y2 = np.frexp(x)
+ >>> y1
+ array([ 0. , 0.5 , 0.5 , 0.75 , 0.5 , 0.625, 0.75 , 0.875,
+ 0.5 ])
+ >>> y2
+ array([0, 1, 2, 2, 3, 3, 3, 3, 4])
+ >>> y1 * 2**y2
+ array([ 0., 1., 2., 3., 4., 5., 6., 7., 8.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'ldexp',
+ """
+ Returns x1 * 2**x2, element-wise.
+
+ The mantissas `x1` and twos exponents `x2` are used to construct
+ floating point numbers ``x1 * 2**x2``.
+
+ Parameters
+ ----------
+ x1 : array_like
+ Array of multipliers.
+ x2 : array_like, int
+ Array of twos exponents.
+ $PARAMS
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The result of ``x1 * 2**x2``.
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ frexp : Return (y1, y2) from ``x = y1 * 2**y2``, inverse to `ldexp`.
+
+ Notes
+ -----
+ Complex dtypes are not supported, they will raise a TypeError.
+
+    `ldexp` is useful as the inverse of `frexp`; if used by itself, it is
+    clearer to simply use the expression ``x1 * 2**x2``.
+
+ Examples
+ --------
+ >>> np.ldexp(5, np.arange(4))
+ array([ 5., 10., 20., 40.], dtype=float32)
+
+ >>> x = np.arange(6)
+ >>> np.ldexp(*np.frexp(x))
+ array([ 0., 1., 2., 3., 4., 5.])
+
+ """)
+
+add_newdoc('numpy.core.umath', 'gcd',
+ """
+ Returns the greatest common divisor of ``|x1|`` and ``|x2|``
+
+ Parameters
+ ----------
+ x1, x2 : array_like, int
+ Arrays of values
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The greatest common divisor of the absolute value of the inputs
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ lcm : The lowest common multiple
+
+ Examples
+ --------
+ >>> np.gcd(12, 20)
+ 4
+ >>> np.gcd.reduce([15, 25, 35])
+ 5
+ >>> np.gcd(np.arange(6), 20)
+ array([20, 1, 2, 1, 4, 5])
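+
+    As described above, the absolute values of the inputs are used, so
+    negative inputs are allowed (illustrative):
+
+    >>> np.gcd(-12, 20)
+    4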
+
+ """)
+
+add_newdoc('numpy.core.umath', 'lcm',
+ """
+ Returns the lowest common multiple of ``|x1|`` and ``|x2|``
+
+ Parameters
+ ----------
+ x1, x2 : array_like, int
+ Arrays of values
+
+ Returns
+ -------
+ y : ndarray or scalar
+ The lowest common multiple of the absolute value of the inputs
+ $OUT_SCALAR_2
+
+ See Also
+ --------
+ gcd : The greatest common divisor
+
+ Examples
+ --------
+ >>> np.lcm(12, 20)
+ 60
+ >>> np.lcm.reduce([3, 12, 20])
+ 60
+ >>> np.lcm.reduce([40, 12, 20])
+ 120
+ >>> np.lcm(np.arange(6), 20)
+ array([ 0, 20, 20, 60, 20, 20])
+
+ """)
diff --git a/contrib/python/numpy/py2/numpy/core/include/numpy/oldnumeric.h b/contrib/python/numpy/py2/numpy/core/include/numpy/oldnumeric.h
new file mode 100644
index 00000000000..38530faf045
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/include/numpy/oldnumeric.h
@@ -0,0 +1,25 @@
+#include "arrayobject.h"
+
+#ifndef PYPY_VERSION
+#ifndef REFCOUNT
+# define REFCOUNT NPY_REFCOUNT
+# define MAX_ELSIZE 16
+#endif
+#endif
+
+#define PyArray_UNSIGNED_TYPES
+#define PyArray_SBYTE NPY_BYTE
+#define PyArray_CopyArray PyArray_CopyInto
+#define _PyArray_multiply_list PyArray_MultiplyIntList
+#define PyArray_ISSPACESAVER(m) NPY_FALSE
+#define PyScalarArray_Check PyArray_CheckScalar
+
+#define CONTIGUOUS NPY_CONTIGUOUS
+#define OWN_DIMENSIONS 0
+#define OWN_STRIDES 0
+#define OWN_DATA NPY_OWNDATA
+#define SAVESPACE 0
+#define SAVESPACEBIT 0
+
+#undef import_array
+#define import_array() { if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); } }
diff --git a/contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/mlib.ini b/contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/mlib.ini
new file mode 100644
index 00000000000..5840f5e1bc1
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/mlib.ini
@@ -0,0 +1,12 @@
+[meta]
+Name = mlib
+Description = Math library used with this version of numpy
+Version = 1.0
+
+[default]
+Libs=-lm
+Cflags=
+
+[msvc]
+Libs=m.lib
+Cflags=
diff --git a/contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/npymath.ini b/contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/npymath.ini
new file mode 100644
index 00000000000..3e465ad2ace
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/lib/npy-pkg-config/npymath.ini
@@ -0,0 +1,20 @@
+[meta]
+Name=npymath
+Description=Portable, core math library implementing C99 standard
+Version=0.1
+
+[variables]
+pkgname=numpy.core
+prefix=${pkgdir}
+libdir=${prefix}/lib
+includedir=${prefix}/include
+
+[default]
+Libs=-L${libdir} -lnpymath
+Cflags=-I${includedir}
+Requires=mlib
+
+[msvc]
+Libs=/LIBPATH:${libdir} npymath.lib
+Cflags=/INCLUDE:${includedir}
+Requires=mlib
diff --git a/contrib/python/numpy/py2/numpy/core/mlib.ini.in b/contrib/python/numpy/py2/numpy/core/mlib.ini.in
new file mode 100644
index 00000000000..badaa2ae9de
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/mlib.ini.in
@@ -0,0 +1,12 @@
+[meta]
+Name = mlib
+Description = Math library used with this version of numpy
+Version = 1.0
+
+[default]
+Libs=@posix_mathlib@
+Cflags=
+
+[msvc]
+Libs=@msvc_mathlib@
+Cflags=
diff --git a/contrib/python/numpy/py2/numpy/core/npymath.ini.in b/contrib/python/numpy/py2/numpy/core/npymath.ini.in
new file mode 100644
index 00000000000..a233b8f3bfa
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/npymath.ini.in
@@ -0,0 +1,20 @@
+[meta]
+Name=npymath
+Description=Portable, core math library implementing C99 standard
+Version=0.1
+
+[variables]
+pkgname=@pkgname@
+prefix=${pkgdir}
+libdir=${prefix}@sep@lib
+includedir=${prefix}@sep@include
+
+[default]
+Libs=-L${libdir} -lnpymath
+Cflags=-I${includedir}
+Requires=mlib
+
+[msvc]
+Libs=/LIBPATH:${libdir} npymath.lib
+Cflags=/INCLUDE:${includedir}
+Requires=mlib
diff --git a/contrib/python/numpy/py2/numpy/core/src/common/npy_binsearch.h.src b/contrib/python/numpy/py2/numpy/core/src/common/npy_binsearch.h.src
new file mode 100644
index 00000000000..ce3b34b0ef4
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/common/npy_binsearch.h.src
@@ -0,0 +1,144 @@
+#ifndef __NPY_BINSEARCH_H__
+#define __NPY_BINSEARCH_H__
+
+#include "npy_sort.h"
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h>
+
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+typedef void (PyArray_BinSearchFunc)(const char*, const char*, char*,
+ npy_intp, npy_intp,
+ npy_intp, npy_intp, npy_intp,
+ PyArrayObject*);
+
+typedef int (PyArray_ArgBinSearchFunc)(const char*, const char*,
+ const char*, char*,
+ npy_intp, npy_intp, npy_intp,
+ npy_intp, npy_intp, npy_intp,
+ PyArrayObject*);
+
+typedef struct {
+ int typenum;
+ PyArray_BinSearchFunc *binsearch[NPY_NSEARCHSIDES];
+} binsearch_map;
+
+typedef struct {
+ int typenum;
+ PyArray_ArgBinSearchFunc *argbinsearch[NPY_NSEARCHSIDES];
+} argbinsearch_map;
+
+/**begin repeat
+ *
+ * #side = left, right#
+ */
+
+/**begin repeat1
+ *
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta#
+ */
+
+NPY_VISIBILITY_HIDDEN void
+binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
+ PyArrayObject *unused);
+NPY_VISIBILITY_HIDDEN int
+argbinsearch_@side@_@suff@(const char *arr, const char *key,
+ const char *sort, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str,
+ npy_intp sort_str, npy_intp ret_str,
+ PyArrayObject *unused);
+/**end repeat1**/
+
+NPY_VISIBILITY_HIDDEN void
+npy_binsearch_@side@(const char *arr, const char *key, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str,
+ npy_intp ret_str, PyArrayObject *cmp);
+NPY_VISIBILITY_HIDDEN int
+npy_argbinsearch_@side@(const char *arr, const char *key,
+ const char *sort, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str,
+ npy_intp sort_str, npy_intp ret_str,
+ PyArrayObject *cmp);
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #arg = , arg#
+ * #Arg = , Arg#
+ */
+
+static @arg@binsearch_map _@arg@binsearch_map[] = {
+ /* If adding new types, make sure to keep them ordered by type num */
+ /**begin repeat1
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE, DATETIME, TIMEDELTA, HALF#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta, half#
+ */
+ {NPY_@TYPE@,
+ {
+ &@arg@binsearch_left_@suff@,
+ &@arg@binsearch_right_@suff@,
+ },
+ },
+ /**end repeat1**/
+};
+
+static PyArray_@Arg@BinSearchFunc *gen@arg@binsearch_map[] = {
+ &npy_@arg@binsearch_left,
+ &npy_@arg@binsearch_right,
+};
+
+static NPY_INLINE PyArray_@Arg@BinSearchFunc*
+get_@arg@binsearch_func(PyArray_Descr *dtype, NPY_SEARCHSIDE side)
+{
+ npy_intp nfuncs = ARRAY_SIZE(_@arg@binsearch_map);
+ npy_intp min_idx = 0;
+ npy_intp max_idx = nfuncs;
+ int type = dtype->type_num;
+
+ if (side >= NPY_NSEARCHSIDES) {
+ return NULL;
+ }
+
+ /*
+ * It seems only fair that a binary search function be searched for
+ * using a binary search...
+ */
+ while (min_idx < max_idx) {
+ npy_intp mid_idx = min_idx + ((max_idx - min_idx) >> 1);
+
+ if (_@arg@binsearch_map[mid_idx].typenum < type) {
+ min_idx = mid_idx + 1;
+ }
+ else {
+ max_idx = mid_idx;
+ }
+ }
+
+ if (min_idx < nfuncs &&
+ _@arg@binsearch_map[min_idx].typenum == type) {
+ return _@arg@binsearch_map[min_idx].@arg@binsearch[side];
+ }
+
+ if (dtype->f->compare) {
+ return gen@arg@binsearch_map[side];
+ }
+
+ return NULL;
+}
+/**end repeat**/
+
+#undef ARRAY_SIZE
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/common/npy_partition.h.src b/contrib/python/numpy/py2/numpy/core/src/common/npy_partition.h.src
new file mode 100644
index 00000000000..a22cf911c78
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/common/npy_partition.h.src
@@ -0,0 +1,129 @@
+/*
+ *****************************************************************************
+ ** IMPORTANT NOTE for npy_partition.h.src -> npy_partition.h **
+ *****************************************************************************
+ * The template file npy_partition.h.src is not automatically converted into
+ * npy_partition.h by the build system. If you edit this file, you must manually
+ * do the conversion using numpy/distutils/conv_template.py from the
+ * command line as follows:
+ *
+ * $ cd <NumPy source root directory>
+ * $ python numpy/distutils/conv_template.py numpy/core/src/private/npy_partition.h.src
+ * $
+ */
+
+
+#ifndef __NPY_PARTITION_H__
+#define __NPY_PARTITION_H__
+
+
+#include "npy_sort.h"
+
+/* Python include is for future object sorts */
+#include <Python.h>
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h>
+
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+#define NPY_MAX_PIVOT_STACK 50
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble#
+ */
+
+NPY_VISIBILITY_HIDDEN int introselect_@suff@(@type@ *v, npy_intp num,
+ npy_intp kth,
+ npy_intp * pivots,
+ npy_intp * npiv,
+ void *NOT_USED);
+NPY_VISIBILITY_HIDDEN int aintroselect_@suff@(@type@ *v, npy_intp* tosort, npy_intp num,
+ npy_intp kth,
+ npy_intp * pivots,
+ npy_intp * npiv,
+ void *NOT_USED);
+
+
+/**end repeat**/
+
+typedef struct {
+ int typenum;
+ PyArray_PartitionFunc * part[NPY_NSELECTS];
+ PyArray_ArgPartitionFunc * argpart[NPY_NSELECTS];
+} part_map;
+
+static part_map _part_map[] = {
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble#
+ */
+ {
+ NPY_@TYPE@,
+ {
+ (PyArray_PartitionFunc *)&introselect_@suff@,
+ },
+ {
+ (PyArray_ArgPartitionFunc *)&aintroselect_@suff@,
+ }
+ },
+/**end repeat**/
+};
+
+
+static NPY_INLINE PyArray_PartitionFunc *
+get_partition_func(int type, NPY_SELECTKIND which)
+{
+ npy_intp i;
+ npy_intp ntypes = ARRAY_SIZE(_part_map);
+
+ if (which >= NPY_NSELECTS) {
+ return NULL;
+ }
+ for (i = 0; i < ntypes; i++) {
+ if (type == _part_map[i].typenum) {
+ return _part_map[i].part[which];
+ }
+ }
+ return NULL;
+}
+
+
+static NPY_INLINE PyArray_ArgPartitionFunc *
+get_argpartition_func(int type, NPY_SELECTKIND which)
+{
+ npy_intp i;
+ npy_intp ntypes = ARRAY_SIZE(_part_map);
+
+ if (which >= NPY_NSELECTS) {
+ return NULL;
+ }
+ for (i = 0; i < ntypes; i++) {
+ if (type == _part_map[i].typenum) {
+ return _part_map[i].argpart[which];
+ }
+ }
+ return NULL;
+}
+
+#undef ARRAY_SIZE
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/common/npy_sort.h.src b/contrib/python/numpy/py2/numpy/core/src/common/npy_sort.h.src
new file mode 100644
index 00000000000..c31a827645f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/common/npy_sort.h.src
@@ -0,0 +1,83 @@
+#ifndef __NPY_SORT_H__
+#define __NPY_SORT_H__
+
+/* Python include is for future object sorts */
+#include <Python.h>
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h>
+
+#define NPY_ENOMEM 1
+#define NPY_ECOMP 2
+
+static NPY_INLINE int npy_get_msb(npy_uintp unum)
+{
+ int depth_limit = 0;
+ while (unum >>= 1) {
+ depth_limit++;
+ }
+ return depth_limit;
+}
+
+
+/*
+ *****************************************************************************
+ ** NUMERIC SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta#
+ */
+
+int quicksort_@suff@(void *vec, npy_intp cnt, void *null);
+int heapsort_@suff@(void *vec, npy_intp cnt, void *null);
+int mergesort_@suff@(void *vec, npy_intp cnt, void *null);
+int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+/**end repeat**/
+
+
+
+/*
+ *****************************************************************************
+ ** STRING SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #suff = string, unicode#
+ */
+
+int quicksort_@suff@(void *vec, npy_intp cnt, void *arr);
+int heapsort_@suff@(void *vec, npy_intp cnt, void *arr);
+int mergesort_@suff@(void *vec, npy_intp cnt, void *arr);
+int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** GENERIC SORT **
+ *****************************************************************************
+ */
+
+
+int npy_quicksort(void *vec, npy_intp cnt, void *arr);
+int npy_heapsort(void *vec, npy_intp cnt, void *arr);
+int npy_mergesort(void *vec, npy_intp cnt, void *arr);
+int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/common/python_xerbla.c b/contrib/python/numpy/py2/numpy/core/src/common/python_xerbla.c
new file mode 100644
index 00000000000..bdf0b9058f7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/common/python_xerbla.c
@@ -0,0 +1,51 @@
+#include "Python.h"
+
+/*
+ * From f2c.h, this should be safe unless fortran is set to use 64
+ * bit integers. We don't seem to have any good way to detect that.
+ */
+typedef int integer;
+
+/*
+ From the original manpage:
+ --------------------------
+ XERBLA is an error handler for the LAPACK routines.
+ It is called by an LAPACK routine if an input parameter has an invalid value.
+ A message is printed and execution stops.
+
+ Instead of printing a message and stopping the execution, a
+ ValueError is raised with the message.
+
+ Parameters:
+ -----------
+ srname: Subroutine name to use in error message, maximum six characters.
+ Spaces at the end are skipped.
+ info: Number of the invalid parameter.
+*/
+
+int xerbla_(char *srname, integer *info)
+{
+ static const char format[] = "On entry to %.*s" \
+ " parameter number %d had an illegal value";
+ char buf[sizeof(format) + 6 + 4]; /* 6 for name, 4 for param. num. */
+
+ int len = 0; /* length of subroutine name*/
+#ifdef WITH_THREAD
+ PyGILState_STATE save;
+#endif
+
+ while( len<6 && srname[len]!='\0' )
+ len++;
+ while( len && srname[len-1]==' ' )
+ len--;
+#ifdef WITH_THREAD
+ save = PyGILState_Ensure();
+#endif
+ PyOS_snprintf(buf, sizeof(buf), format, len, srname, *info);
+ PyErr_SetString(PyExc_ValueError, buf);
+#ifdef WITH_THREAD
+ PyGILState_Release(save);
+#endif
+
+ return 0;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/common/templ_common.h.src b/contrib/python/numpy/py2/numpy/core/src/common/templ_common.h.src
new file mode 100644
index 00000000000..a65a0075866
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/common/templ_common.h.src
@@ -0,0 +1,46 @@
+#ifndef __NPY_TYPED_COMMON_INC
+#define __NPY_TYPED_COMMON_INC
+
+/* utility functions that profit from templates */
+
+#include "numpy/npy_common.h"
+
+/**begin repeat
+ * #name = int, uint, long, ulong,
+ * longlong, ulonglong, intp#
+ * #type = npy_int, npy_uint, npy_long, npy_ulong,
+ * npy_longlong, npy_ulonglong, npy_intp#
+ * #MAX = NPY_MAX_INT, NPY_MAX_UINT, NPY_MAX_LONG, NPY_MAX_ULONG,
+ * NPY_MAX_LONGLONG, NPY_MAX_ULONGLONG, NPY_MAX_INTP#
+ */
+
+/*
+ * writes result of a * b into r
+ * returns 1 if a * b overflowed else returns 0
+ *
+ * These functions are not designed to work if either a or b is negative, but
+ * that is not checked. Could use absolute values and adjust the sign if that
+ * functionality was desired.
+ */
+static NPY_INLINE int
+npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+ return __builtin_mul_overflow(a, b, r);
+#else
+ const @type@ half_sz = ((@type@)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+ *r = a * b;
+ /*
+ * avoid expensive division on common no overflow case
+ */
+ if (NPY_UNLIKELY((a | b) >= half_sz) &&
+ a != 0 && b > @MAX@ / a) {
+ return 1;
+ }
+ return 0;
+#endif
+}
+/**end repeat**/
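+
+/*
+ * Illustrative sketch (not part of the template above): a typical use is
+ * guarding a buffer-size computation against overflow.  The variable names
+ * here are hypothetical.
+ *
+ *     npy_intp nbytes;
+ *     if (npy_mul_with_overflow_intp(&nbytes, n_elements, elem_size)) {
+ *         PyErr_SetString(PyExc_MemoryError, "array size overflow");
+ *         return -1;
+ *     }
+ */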
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/_multiarray_tests.c.src b/contrib/python/numpy/py2/numpy/core/src/multiarray/_multiarray_tests.c.src
new file mode 100644
index 00000000000..9061c05184f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -0,0 +1,2103 @@
+/* -*-c-*- */
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <Python.h>
+#define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
+#include "numpy/arrayobject.h"
+#include "numpy/arrayscalars.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "common.h"
+#include "mem_overlap.h"
+#include "npy_extint128.h"
+#include "common.h"
+
+
+#if defined(MS_WIN32) || defined(__CYGWIN__)
+#define EXPORT(x) __declspec(dllexport) x
+#else
+#define EXPORT(x) x
+#endif
+
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+/* test PyArray_IsPythonScalar, before including private py3 compat header */
+static PyObject *
+IsPythonScalar(PyObject * dummy, PyObject *args)
+{
+ PyObject *arg = NULL;
+ if (!PyArg_ParseTuple(args, "O", &arg)) {
+ return NULL;
+ }
+ if (PyArray_IsPythonScalar(arg)) {
+ Py_RETURN_TRUE;
+ }
+ else {
+ Py_RETURN_FALSE;
+ }
+}
+
+#include "npy_pycompat.h"
+
+/** Function to test calling via ctypes */
+EXPORT(void*) forward_pointer(void *x)
+{
+ return x;
+}
+
+/*
+ * TODO:
+ * - Handle mode
+ */
+
+/**begin repeat
+ * #name = double, int#
+ * #type = npy_double, npy_int#
+ * #typenum = NPY_DOUBLE, NPY_INT#
+ */
+static int copy_@name@(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *niterx,
+ npy_intp *bounds,
+ PyObject **out)
+{
+ npy_intp i, j;
+ @type@ *ptr;
+ npy_intp odims[NPY_MAXDIMS];
+ PyArrayObject *aout;
+
+ /*
+ * For each point in itx, copy the current neighborhood into an array which
+ * is appended at the output list
+ */
+ for (i = 0; i < itx->size; ++i) {
+ PyArrayNeighborhoodIter_Reset(niterx);
+
+ for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+ odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+ }
+ aout = (PyArrayObject*)PyArray_SimpleNew(
+ PyArray_NDIM(itx->ao), odims, @typenum@);
+ if (aout == NULL) {
+ return -1;
+ }
+
+ ptr = (@type@*)PyArray_DATA(aout);
+
+ for (j = 0; j < niterx->size; ++j) {
+ *ptr = *((@type@*)niterx->dataptr);
+ PyArrayNeighborhoodIter_Next(niterx);
+ ptr += 1;
+ }
+
+ PyList_Append(*out, (PyObject*)aout);
+ Py_DECREF(aout);
+ PyArray_ITER_NEXT(itx);
+ }
+
+ return 0;
+}
+/**end repeat**/
+
+static int copy_object(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *niterx,
+ npy_intp *bounds,
+ PyObject **out)
+{
+ npy_intp i, j;
+ npy_intp odims[NPY_MAXDIMS];
+ PyArrayObject *aout;
+ PyArray_CopySwapFunc *copyswap = PyArray_DESCR(itx->ao)->f->copyswap;
+ npy_int itemsize = PyArray_ITEMSIZE(itx->ao);
+
+ /*
+ * For each point in itx, copy the current neighborhood into an array which
+ * is appended at the output list
+ */
+ for (i = 0; i < itx->size; ++i) {
+ PyArrayNeighborhoodIter_Reset(niterx);
+
+ for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+ odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+ }
+ aout = (PyArrayObject*)PyArray_SimpleNew(PyArray_NDIM(itx->ao), odims, NPY_OBJECT);
+ if (aout == NULL) {
+ return -1;
+ }
+
+ for (j = 0; j < niterx->size; ++j) {
+ copyswap(PyArray_BYTES(aout) + j * itemsize, niterx->dataptr, 0, NULL);
+ PyArrayNeighborhoodIter_Next(niterx);
+ }
+
+ PyList_Append(*out, (PyObject*)aout);
+ Py_DECREF(aout);
+ PyArray_ITER_NEXT(itx);
+ }
+
+ return 0;
+}
+
+static PyObject*
+test_neighborhood_iterator(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ PyObject *x, *fill, *out, *b;
+ PyArrayObject *ax, *afill;
+ PyArrayIterObject *itx;
+ int i, typenum, mode, st;
+ npy_intp bounds[NPY_MAXDIMS*2];
+ PyArrayNeighborhoodIterObject *niterx;
+
+ if (!PyArg_ParseTuple(args, "OOOi", &x, &b, &fill, &mode)) {
+ return NULL;
+ }
+
+ if (!PySequence_Check(b)) {
+ return NULL;
+ }
+
+ typenum = PyArray_ObjectType(x, 0);
+ typenum = PyArray_ObjectType(fill, typenum);
+
+ ax = (PyArrayObject*)PyArray_FromObject(x, typenum, 1, 10);
+ if (ax == NULL) {
+ return NULL;
+ }
+ if (PySequence_Size(b) != 2 * PyArray_NDIM(ax)) {
+ PyErr_SetString(PyExc_ValueError,
+ "bounds sequence size not compatible with x input");
+ goto clean_ax;
+ }
+
+ out = PyList_New(0);
+ if (out == NULL) {
+ goto clean_ax;
+ }
+
+ itx = (PyArrayIterObject*)PyArray_IterNew(x);
+ if (itx == NULL) {
+ goto clean_out;
+ }
+
+ /* Compute boundaries for the neighborhood iterator */
+ for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
+ PyObject* bound;
+ bound = PySequence_GetItem(b, i);
+ if (bound == NULL) {
+ goto clean_itx;
+ }
+ if (!PyInt_Check(bound)) {
+ PyErr_SetString(PyExc_ValueError,
+ "bound not long");
+ Py_DECREF(bound);
+ goto clean_itx;
+ }
+ bounds[i] = PyInt_AsLong(bound);
+ Py_DECREF(bound);
+ }
+
+ /* Create the neighborhood iterator */
+ afill = NULL;
+ if (mode == NPY_NEIGHBORHOOD_ITER_CONSTANT_PADDING) {
+ afill = (PyArrayObject *)PyArray_FromObject(fill, typenum, 0, 0);
+ if (afill == NULL) {
+ goto clean_itx;
+ }
+ }
+
+ niterx = (PyArrayNeighborhoodIterObject*)PyArray_NeighborhoodIterNew(
+ (PyArrayIterObject*)itx, bounds, mode, afill);
+ if (niterx == NULL) {
+ goto clean_afill;
+ }
+
+ switch (typenum) {
+ case NPY_OBJECT:
+ st = copy_object(itx, niterx, bounds, &out);
+ break;
+ case NPY_INT:
+ st = copy_int(itx, niterx, bounds, &out);
+ break;
+ case NPY_DOUBLE:
+ st = copy_double(itx, niterx, bounds, &out);
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError,
+ "Type not supported");
+ goto clean_niterx;
+ }
+
+ if (st) {
+ goto clean_niterx;
+ }
+
+ Py_DECREF(niterx);
+ Py_XDECREF(afill);
+ Py_DECREF(itx);
+
+ Py_DECREF(ax);
+
+ return out;
+
+clean_niterx:
+ Py_DECREF(niterx);
+clean_afill:
+ Py_XDECREF(afill);
+clean_itx:
+ Py_DECREF(itx);
+clean_out:
+ Py_DECREF(out);
+clean_ax:
+ Py_DECREF(ax);
+ return NULL;
+}
+
+static int
+copy_double_double(PyArrayNeighborhoodIterObject *itx,
+ PyArrayNeighborhoodIterObject *niterx,
+ npy_intp *bounds,
+ PyObject **out)
+{
+ npy_intp i, j;
+ double *ptr;
+ npy_intp odims[NPY_MAXDIMS];
+ PyArrayObject *aout;
+
+ /*
+ * For each point in itx, copy the current neighborhood into an array which
+ * is appended at the output list
+ */
+ PyArrayNeighborhoodIter_Reset(itx);
+ for (i = 0; i < itx->size; ++i) {
+ for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+ odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+ }
+ aout = (PyArrayObject*)PyArray_SimpleNew(
+ PyArray_NDIM(itx->ao), odims, NPY_DOUBLE);
+ if (aout == NULL) {
+ return -1;
+ }
+
+ ptr = (double*)PyArray_DATA(aout);
+
+ PyArrayNeighborhoodIter_Reset(niterx);
+ for (j = 0; j < niterx->size; ++j) {
+ *ptr = *((double*)niterx->dataptr);
+ ptr += 1;
+ PyArrayNeighborhoodIter_Next(niterx);
+ }
+ PyList_Append(*out, (PyObject*)aout);
+ Py_DECREF(aout);
+ PyArrayNeighborhoodIter_Next(itx);
+ }
+ return 0;
+}
+
+static PyObject*
+test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ PyObject *x, *out, *b1, *b2;
+ PyArrayObject *ax;
+ PyArrayIterObject *itx;
+ int i, typenum, mode1, mode2, st;
+ npy_intp bounds[NPY_MAXDIMS*2];
+ PyArrayNeighborhoodIterObject *niterx1, *niterx2;
+
+ if (!PyArg_ParseTuple(args, "OOiOi", &x, &b1, &mode1, &b2, &mode2)) {
+ return NULL;
+ }
+
+ if (!PySequence_Check(b1) || !PySequence_Check(b2)) {
+ return NULL;
+ }
+
+ typenum = PyArray_ObjectType(x, 0);
+
+ ax = (PyArrayObject*)PyArray_FromObject(x, typenum, 1, 10);
+ if (ax == NULL) {
+ return NULL;
+ }
+ if (PySequence_Size(b1) != 2 * PyArray_NDIM(ax)) {
+ PyErr_SetString(PyExc_ValueError,
+ "bounds sequence 1 size not compatible with x input");
+ goto clean_ax;
+ }
+ if (PySequence_Size(b2) != 2 * PyArray_NDIM(ax)) {
+ PyErr_SetString(PyExc_ValueError,
+ "bounds sequence 2 size not compatible with x input");
+ goto clean_ax;
+ }
+
+ out = PyList_New(0);
+ if (out == NULL) {
+ goto clean_ax;
+ }
+
+ itx = (PyArrayIterObject*)PyArray_IterNew(x);
+ if (itx == NULL) {
+ goto clean_out;
+ }
+
+ /* Compute boundaries for the neighborhood iterator */
+ for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
+ PyObject* bound;
+ bound = PySequence_GetItem(b1, i);
+ if (bound == NULL) {
+ goto clean_itx;
+ }
+ if (!PyInt_Check(bound)) {
+ PyErr_SetString(PyExc_ValueError,
+ "bound not long");
+ Py_DECREF(bound);
+ goto clean_itx;
+ }
+ bounds[i] = PyInt_AsLong(bound);
+ Py_DECREF(bound);
+ }
+
+ /* Create the neighborhood iterator */
+ niterx1 = (PyArrayNeighborhoodIterObject*)PyArray_NeighborhoodIterNew(
+ (PyArrayIterObject*)itx, bounds,
+ mode1, NULL);
+ if (niterx1 == NULL) {
+ goto clean_out;
+ }
+
+ for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
+ PyObject* bound;
+ bound = PySequence_GetItem(b2, i);
+ if (bound == NULL) {
+ goto clean_itx;
+ }
+ if (!PyInt_Check(bound)) {
+ PyErr_SetString(PyExc_ValueError,
+ "bound not long");
+ Py_DECREF(bound);
+ goto clean_itx;
+ }
+ bounds[i] = PyInt_AsLong(bound);
+ Py_DECREF(bound);
+ }
+
+ niterx2 = (PyArrayNeighborhoodIterObject*)PyArray_NeighborhoodIterNew(
+ (PyArrayIterObject*)niterx1, bounds,
+ mode2, NULL);
+    if (niterx2 == NULL) {
+ goto clean_niterx1;
+ }
+
+ switch (typenum) {
+ case NPY_DOUBLE:
+ st = copy_double_double(niterx1, niterx2, bounds, &out);
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError,
+ "Type not supported");
+ goto clean_niterx2;
+ }
+
+ if (st) {
+ goto clean_niterx2;
+ }
+
+ Py_DECREF(niterx2);
+ Py_DECREF(niterx1);
+ Py_DECREF(itx);
+ Py_DECREF(ax);
+ return out;
+
+clean_niterx2:
+ Py_DECREF(niterx2);
+clean_niterx1:
+ Py_DECREF(niterx1);
+clean_itx:
+ Py_DECREF(itx);
+clean_out:
+ Py_DECREF(out);
+clean_ax:
+ Py_DECREF(ax);
+ return NULL;
+}
+
+/* PyDataMem_SetHook tests */
+static int malloc_free_counts[2];
+static PyDataMem_EventHookFunc *old_hook = NULL;
+static void *old_data;
+
+static void test_hook(void *old, void *new, size_t size, void *user_data)
+{
+ int* counters = (int *) user_data;
+ if (old == NULL) {
+ counters[0]++; /* malloc counter */
+ }
+ if (size == 0) {
+ counters[1]++; /* free counter */
+ }
+}
+
+static PyObject*
+test_pydatamem_seteventhook_start(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+ malloc_free_counts[0] = malloc_free_counts[1] = 0;
+ old_hook = PyDataMem_SetEventHook(test_hook, (void *) malloc_free_counts, &old_data);
+ Py_RETURN_NONE;
+}
+
+static PyObject*
+test_pydatamem_seteventhook_end(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+ PyDataMem_EventHookFunc *my_hook;
+ void *my_data;
+
+ my_hook = PyDataMem_SetEventHook(old_hook, old_data, &my_data);
+ if ((my_hook != test_hook) || (my_data != (void *) malloc_free_counts)) {
+ PyErr_SetString(PyExc_ValueError,
+ "hook/data was not the expected test hook");
+ return NULL;
+ }
+
+ if (malloc_free_counts[0] == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "malloc count is zero after test");
+ return NULL;
+ }
+ if (malloc_free_counts[1] == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "free count is zero after test");
+ return NULL;
+ }
+
+ Py_RETURN_NONE;
+}
+
+
+typedef void (*inplace_map_binop)(PyArrayMapIterObject *, PyArrayIterObject *);
+
+static void npy_float64_inplace_add(PyArrayMapIterObject *mit, PyArrayIterObject *it)
+{
+ int index = mit->size;
+ while (index--) {
+ ((npy_float64*)mit->dataptr)[0] = ((npy_float64*)mit->dataptr)[0] + ((npy_float64*)it->dataptr)[0];
+
+ PyArray_MapIterNext(mit);
+ PyArray_ITER_NEXT(it);
+ }
+}
+
+inplace_map_binop addition_funcs[] = {
+npy_float64_inplace_add,
+NULL};
+
+int type_numbers[] = {
+NPY_FLOAT64,
+-1000};
+
+
+
+static int
+map_increment(PyArrayMapIterObject *mit, PyObject *op, inplace_map_binop add_inplace)
+{
+ PyArrayObject *arr = NULL;
+ PyArrayIterObject *it;
+ PyArray_Descr *descr;
+
+ if (mit->ait == NULL) {
+ return -1;
+ }
+ descr = PyArray_DESCR(mit->ait->ao);
+ Py_INCREF(descr);
+ arr = (PyArrayObject *)PyArray_FromAny(op, descr,
+ 0, 0, NPY_ARRAY_FORCECAST, NULL);
+ if (arr == NULL) {
+ return -1;
+ }
+
+ if ((mit->subspace != NULL) && (mit->consec)) {
+ PyArray_MapIterSwapAxes(mit, (PyArrayObject **)&arr, 0);
+ if (arr == NULL) {
+ return -1;
+ }
+ }
+
+ if ((it = (PyArrayIterObject *)\
+ PyArray_BroadcastToShape((PyObject *)arr, mit->dimensions,
+ mit->nd)) == NULL) {
+ Py_DECREF(arr);
+
+ return -1;
+ }
+
+ (*add_inplace)(mit, it);
+
+ Py_DECREF(arr);
+ Py_DECREF(it);
+ return 0;
+}
+
+
+static PyObject *
+inplace_increment(PyObject *dummy, PyObject *args)
+{
+ PyObject *arg_a = NULL, *index=NULL, *inc=NULL;
+ PyArrayObject *a;
+ inplace_map_binop add_inplace = NULL;
+ int type_number = -1;
+ int i =0;
+ PyArrayMapIterObject * mit;
+
+ if (!PyArg_ParseTuple(args, "OOO", &arg_a, &index,
+ &inc)) {
+ return NULL;
+ }
+ if (!PyArray_Check(arg_a)) {
+ PyErr_SetString(PyExc_ValueError, "needs an ndarray as first argument");
+ return NULL;
+ }
+ a = (PyArrayObject *) arg_a;
+
+ if (PyArray_FailUnlessWriteable(a, "input/output array") < 0) {
+ return NULL;
+ }
+
+ if (PyArray_NDIM(a) == 0) {
+ PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed.");
+ return NULL;
+ }
+ type_number = PyArray_TYPE(a);
+
+ while (type_numbers[i] >= 0 && addition_funcs[i] != NULL){
+ if (type_number == type_numbers[i]) {
+ add_inplace = addition_funcs[i];
+ break;
+ }
+ i++ ;
+ }
+
+ if (add_inplace == NULL) {
+ PyErr_SetString(PyExc_TypeError, "unsupported type for a");
+ return NULL;
+ }
+
+ mit = (PyArrayMapIterObject *) PyArray_MapIterArray(a, index);
+ if (mit == NULL) {
+ goto fail;
+ }
+
+ if (map_increment(mit, inc, add_inplace) != 0) {
+ goto fail;
+ }
+
+ Py_DECREF(mit);
+
+ Py_RETURN_NONE;
+
+fail:
+ Py_XDECREF(mit);
+
+ return NULL;
+}
+
+/* check no elision for avoided increfs */
+static PyObject *
+incref_elide(PyObject *dummy, PyObject *args)
+{
+ PyObject *arg = NULL, *res, *tup;
+ if (!PyArg_ParseTuple(args, "O", &arg)) {
+ return NULL;
+ }
+
+ /* refcount 1 array but should not be elided */
+ arg = PyArray_NewCopy((PyArrayObject*)arg, NPY_KEEPORDER);
+ res = PyNumber_Add(arg, arg);
+
+ /* return original copy, should be equal to input */
+ tup = PyTuple_Pack(2, arg, res);
+ Py_DECREF(arg);
+ Py_DECREF(res);
+ return tup;
+}
+
+/* check no elision for get from list without incref */
+static PyObject *
+incref_elide_l(PyObject *dummy, PyObject *args)
+{
+ PyObject *arg = NULL, *r, *res;
+ if (!PyArg_ParseTuple(args, "O", &arg)) {
+ return NULL;
+ }
+ /* get item without increasing refcount, item may still be on the python
+ * stack but above the inaccessible top */
+ r = PyList_GetItem(arg, 4);
+ res = PyNumber_Add(r, r);
+
+ return res;
+}
+
+/* used to test NPY_CHAR usage emits deprecation warning */
+static PyObject*
+npy_char_deprecation(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+ PyArray_Descr * descr = PyArray_DescrFromType(NPY_CHAR);
+ return (PyObject *)descr;
+}
+
+/* used to test UPDATEIFCOPY usage emits deprecation warning */
+static PyObject*
+npy_updateifcopy_deprecation(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ int flags;
+ PyObject* array;
+ if (!PyArray_Check(args)) {
+ PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+ return NULL;
+ }
+ flags = NPY_ARRAY_CARRAY | NPY_ARRAY_UPDATEIFCOPY;
+ array = PyArray_FromArray((PyArrayObject*)args, NULL, flags);
+ if (array == NULL)
+ return NULL;
+ PyArray_ResolveWritebackIfCopy((PyArrayObject*)array);
+ Py_DECREF(array);
+ Py_RETURN_NONE;
+}
+
+/* used to create array with WRITEBACKIFCOPY flag */
+static PyObject*
+npy_create_writebackifcopy(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ int flags;
+ PyObject* array;
+ if (!PyArray_Check(args)) {
+ PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+ return NULL;
+ }
+ flags = NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEBACKIFCOPY;
+ array = PyArray_FromArray((PyArrayObject*)args, NULL, flags);
+ if (array == NULL)
+ return NULL;
+ return array;
+}
+
+/* used to test WRITEBACKIFCOPY without resolution emits runtime warning */
+static PyObject*
+npy_abuse_writebackifcopy(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ int flags;
+ PyObject* array;
+ if (!PyArray_Check(args)) {
+ PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+ return NULL;
+ }
+ flags = NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEBACKIFCOPY;
+ array = PyArray_FromArray((PyArrayObject*)args, NULL, flags);
+ if (array == NULL)
+ return NULL;
+ Py_DECREF(array); /* calls array_dealloc even on PyPy */
+ Py_RETURN_NONE;
+}
+
+/* resolve WRITEBACKIFCOPY */
+static PyObject*
+npy_resolve(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ if (!PyArray_Check(args)) {
+ PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+ return NULL;
+ }
+ PyArray_ResolveWritebackIfCopy((PyArrayObject*)args);
+ Py_RETURN_NONE;
+}
+
+/* discard WRITEBACKIFCOPY */
+static PyObject*
+npy_discard(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+ if (!PyArray_Check(args)) {
+ PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+ return NULL;
+ }
+ PyArray_DiscardWritebackIfCopy((PyArrayObject*)args);
+ Py_RETURN_NONE;
+}
+
+#if !defined(NPY_PY3K)
+static PyObject *
+int_subclass(PyObject *dummy, PyObject *args)
+{
+
+ PyObject *result = NULL;
+ PyObject *scalar_object = NULL;
+
+ if (!PyArg_UnpackTuple(args, "test_int_subclass", 1, 1, &scalar_object))
+ return NULL;
+
+ if (PyInt_Check(scalar_object))
+ result = Py_True;
+ else
+ result = Py_False;
+
+ Py_INCREF(result);
+
+ return result;
+
+}
+#endif
+
+
+/*
+ * Create python string from a FLAG and or the corresponding PyBuf flag
+ * for the use in get_buffer_info.
+ */
+#define GET_PYBUF_FLAG(FLAG) \
+ buf_flag = PyUnicode_FromString(#FLAG); \
+ flag_matches = PyObject_RichCompareBool(buf_flag, tmp, Py_EQ); \
+ Py_DECREF(buf_flag); \
+ if (flag_matches == 1) { \
+ Py_DECREF(tmp); \
+ flags |= PyBUF_##FLAG; \
+ continue; \
+ } \
+ else if (flag_matches == -1) { \
+ Py_DECREF(tmp); \
+ return NULL; \
+ }
+
+
+/*
+ * Get information for a buffer through PyBuf_GetBuffer with the
+ * corresponding flags or'ed. Note that the python caller has to
+ * make sure that or'ing those flags actually makes sense.
+ * More information should probably be returned for future tests.
+ */
+static PyObject *
+get_buffer_info(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ PyObject *buffer_obj, *pyflags;
+ PyObject *tmp, *buf_flag;
+ Py_buffer buffer;
+ PyObject *shape, *strides;
+ Py_ssize_t i, n;
+ int flag_matches;
+ int flags = 0;
+
+ if (!PyArg_ParseTuple(args, "OO", &buffer_obj, &pyflags)) {
+ return NULL;
+ }
+
+ n = PySequence_Length(pyflags);
+ if (n < 0) {
+ return NULL;
+ }
+
+ for (i=0; i < n; i++) {
+ tmp = PySequence_GetItem(pyflags, i);
+ if (tmp == NULL) {
+ return NULL;
+ }
+
+ GET_PYBUF_FLAG(SIMPLE);
+ GET_PYBUF_FLAG(WRITABLE);
+ GET_PYBUF_FLAG(STRIDES);
+ GET_PYBUF_FLAG(ND);
+ GET_PYBUF_FLAG(C_CONTIGUOUS);
+ GET_PYBUF_FLAG(F_CONTIGUOUS);
+ GET_PYBUF_FLAG(ANY_CONTIGUOUS);
+ GET_PYBUF_FLAG(INDIRECT);
+ GET_PYBUF_FLAG(FORMAT);
+ GET_PYBUF_FLAG(STRIDED);
+ GET_PYBUF_FLAG(STRIDED_RO);
+ GET_PYBUF_FLAG(RECORDS);
+ GET_PYBUF_FLAG(RECORDS_RO);
+ GET_PYBUF_FLAG(FULL);
+ GET_PYBUF_FLAG(FULL_RO);
+ GET_PYBUF_FLAG(CONTIG);
+ GET_PYBUF_FLAG(CONTIG_RO);
+
+ Py_DECREF(tmp);
+
+ /* One of the flags must match */
+ PyErr_SetString(PyExc_ValueError, "invalid flag used.");
+ return NULL;
+ }
+
+ if (PyObject_GetBuffer(buffer_obj, &buffer, flags) < 0) {
+ return NULL;
+ }
+
+ if (buffer.shape == NULL) {
+ Py_INCREF(Py_None);
+ shape = Py_None;
+ }
+ else {
+ shape = PyTuple_New(buffer.ndim);
+ for (i=0; i < buffer.ndim; i++) {
+ PyTuple_SET_ITEM(shape, i, PyLong_FromSsize_t(buffer.shape[i]));
+ }
+ }
+
+ if (buffer.strides == NULL) {
+ Py_INCREF(Py_None);
+ strides = Py_None;
+ }
+ else {
+ strides = PyTuple_New(buffer.ndim);
+ for (i=0; i < buffer.ndim; i++) {
+ PyTuple_SET_ITEM(strides, i, PyLong_FromSsize_t(buffer.strides[i]));
+ }
+ }
+
+ PyBuffer_Release(&buffer);
+ return Py_BuildValue("(NN)", shape, strides);
+}
+
+#undef GET_PYBUF_FLAG
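+
+/*
+ * Illustrative usage sketch (an assumption about the test-suite side, not
+ * part of the C sources): the helper above is typically driven from Python
+ * with a buffer exporter and a list of PyBUF_* flag names, e.g.
+ *
+ *     import numpy as np
+ *     from numpy.core._multiarray_tests import get_buffer_info
+ *     shape, strides = get_buffer_info(np.arange(6).reshape(2, 3),
+ *                                      ["STRIDES", "C_CONTIGUOUS"])
+ *
+ * Each name is matched by GET_PYBUF_FLAG and the corresponding flags are
+ * OR'ed together before calling PyObject_GetBuffer.
+ */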
+
+/*
+ * Return a new array object wrapping existing C-allocated (dummy) data.
+ * Such an array does not own its data (must not free it), but because it
+ * wraps C data, it also has no base object. Used to test arr.flags.writeable
+ * setting behaviour.
+ */
+static PyObject*
+get_c_wrapping_array(PyObject* NPY_UNUSED(self), PyObject* arg)
+{
+ int writeable, flags;
+ npy_intp zero = 0;
+
+ writeable = PyObject_IsTrue(arg);
+ if (error_converting(writeable)) {
+ return NULL;
+ }
+
+ flags = writeable ? NPY_ARRAY_WRITEABLE : 0;
+ /* Create an empty array (which points to a random place) */
+ return PyArray_NewFromDescr(&PyArray_Type, PyArray_DescrFromType(NPY_INTP),
+ 1, &zero, NULL, &zero, flags, NULL);
+}
+
+
+/*
+ * Test C-api level item getting.
+ */
+static PyObject *
+array_indexing(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ int mode;
+ Py_ssize_t i;
+ PyObject *arr, *op = NULL;
+
+ if (!PyArg_ParseTuple(args, "iOn|O", &mode, &arr, &i, &op)) {
+ return NULL;
+ }
+
+ if (mode == 0) {
+ return PySequence_GetItem(arr, i);
+ }
+ if (mode == 1) {
+ if (PySequence_SetItem(arr, i, op) < 0) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
+ }
+
+ PyErr_SetString(PyExc_ValueError,
+ "invalid mode. 0: item 1: assign");
+ return NULL;
+}
+
+/*
+ * Test C-api PyArray_AsCArray item getter
+ */
+static PyObject *
+test_as_c_array(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ PyArrayObject *array_obj;
+ npy_intp dims[3]; /* max 3-dim */
+ npy_intp i=0, j=0, k=0;
+ npy_intp num_dims = 0;
+ PyArray_Descr *descr = NULL;
+ double *array1 = NULL;
+ double **array2 = NULL;
+ double ***array3 = NULL;
+ double temp = 9999;
+
+ if (!PyArg_ParseTuple(args, "O!l|ll",
+ &PyArray_Type, &array_obj,
+ &i, &j, &k)) {
+ return NULL;
+ }
+
+ if (NULL == array_obj) {
+ return NULL;
+ }
+
+ num_dims = PyArray_NDIM(array_obj);
+ descr = PyArray_DESCR(array_obj);
+
+ switch (num_dims) {
+ case 1:
+ if (PyArray_AsCArray(
+ (PyObject **) &array_obj,
+ (void *) &array1,
+ dims,
+ 1,
+ descr) < 0) {
+ PyErr_SetString(PyExc_RuntimeError, "error converting 1D array");
+ return NULL;
+ }
+ temp = array1[i];
+ PyArray_Free((PyObject *) array_obj, (void *) array1);
+ break;
+ case 2:
+ if (PyArray_AsCArray(
+ (PyObject **) &array_obj,
+ (void **) &array2,
+ dims,
+ 2,
+ descr) < 0) {
+ PyErr_SetString(PyExc_RuntimeError, "error converting 2D array");
+ return NULL;
+ }
+ temp = array2[i][j];
+ PyArray_Free((PyObject *) array_obj, (void *) array2);
+ break;
+ case 3:
+ if (PyArray_AsCArray(
+ (PyObject **) &array_obj,
+ (void ***) &array3,
+ dims,
+ 3,
+ descr) < 0) {
+ PyErr_SetString(PyExc_RuntimeError, "error converting 3D array");
+ return NULL;
+ }
+ temp = array3[i][j][k];
+ PyArray_Free((PyObject *) array_obj, (void *) array3);
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError, "array.ndim not in [1, 3]");
+ return NULL;
+ }
+ return Py_BuildValue("f", temp);
+}
+
+/*
+ * Test nditer of too large arrays using remove axis, etc.
+ */
+static PyObject *
+test_nditer_too_large(PyObject *NPY_UNUSED(self), PyObject *args) {
+ NpyIter *iter;
+ PyObject *array_tuple, *arr;
+ PyArrayObject *arrays[NPY_MAXARGS];
+ npy_uint32 op_flags[NPY_MAXARGS];
+ Py_ssize_t nop;
+ int i, axis, mode;
+
+ npy_intp index[NPY_MAXARGS] = {0};
+ char *msg;
+
+ if (!PyArg_ParseTuple(args, "Oii", &array_tuple, &axis, &mode)) {
+ return NULL;
+ }
+
+ if (!PyTuple_CheckExact(array_tuple)) {
+ PyErr_SetString(PyExc_ValueError, "tuple required as first argument");
+ return NULL;
+ }
+ nop = PyTuple_Size(array_tuple);
+ if (nop > NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError, "tuple must be smaller then maxargs");
+ return NULL;
+ }
+
+ for (i=0; i < nop; i++) {
+ arr = PyTuple_GET_ITEM(array_tuple, i);
+ if (!PyArray_CheckExact(arr)) {
+ PyErr_SetString(PyExc_ValueError, "require base class ndarray");
+ return NULL;
+ }
+ arrays[i] = (PyArrayObject *)arr;
+ op_flags[i] = NPY_ITER_READONLY;
+ }
+
+ iter = NpyIter_MultiNew(nop, arrays, NPY_ITER_MULTI_INDEX | NPY_ITER_RANGED,
+ NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
+
+ if (iter == NULL) {
+ return NULL;
+ }
+
+    /* Remove an axis (if negative, do not remove any) */
+ if (axis >= 0) {
+ if (!NpyIter_RemoveAxis(iter, axis)) {
+ goto fail;
+ }
+ }
+
+ switch (mode) {
+ /* Test IterNext getting */
+ case 0:
+ if (NpyIter_GetIterNext(iter, NULL) == NULL) {
+ goto fail;
+ }
+ break;
+ case 1:
+ if (NpyIter_GetIterNext(iter, &msg) == NULL) {
+ PyErr_SetString(PyExc_ValueError, msg);
+ goto fail;
+ }
+ break;
+ /* Test Multi Index removal */
+ case 2:
+ if (!NpyIter_RemoveMultiIndex(iter)) {
+ goto fail;
+ }
+ break;
+ /* Test GotoMultiIndex (just 0 hardcoded) */
+ case 3:
+ if (!NpyIter_GotoMultiIndex(iter, index)) {
+ goto fail;
+ }
+ break;
+ /* Test setting iterrange (hardcoded range of 0, 1) */
+ case 4:
+ if (!NpyIter_ResetToIterIndexRange(iter, 0, 1, NULL)) {
+ goto fail;
+ }
+ break;
+ case 5:
+ if (!NpyIter_ResetToIterIndexRange(iter, 0, 1, &msg)) {
+ PyErr_SetString(PyExc_ValueError, msg);
+ goto fail;
+ }
+ break;
+ /* Do nothing */
+ default:
+ break;
+ }
+
+ NpyIter_Deallocate(iter);
+ Py_RETURN_NONE;
+ fail:
+ NpyIter_Deallocate(iter);
+ return NULL;
+}
+
+static PyObject *
+array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+ PyObject *A = NULL;
+ PyObject *U = NULL;
+ Py_ssize_t b_input = 0;
+ Py_ssize_t max_work = -1;
+ int simplify = 0;
+ int require_ub_nontrivial = 0;
+ static char *kwlist[] = {"A", "U", "b", "max_work", "simplify",
+ "require_ub_nontrivial", NULL};
+
+ diophantine_term_t terms[2*NPY_MAXDIMS+2];
+ npy_int64 x[2*NPY_MAXDIMS+2];
+ npy_int64 b;
+ unsigned int nterms, j;
+ mem_overlap_t result = MEM_OVERLAP_YES;
+ PyObject *retval = NULL;
+ NPY_BEGIN_THREADS_DEF;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O!n|nii", kwlist,
+ &PyTuple_Type, &A,
+ &PyTuple_Type, &U,
+ &b_input, &max_work, &simplify,
+ &require_ub_nontrivial)) {
+ return NULL;
+ }
+
+ if (PyTuple_GET_SIZE(A) > (Py_ssize_t)ARRAY_SIZE(terms)) {
+ PyErr_SetString(PyExc_ValueError, "too many terms in equation");
+ goto fail;
+ }
+
+ nterms = PyTuple_GET_SIZE(A);
+
+ if (PyTuple_GET_SIZE(U) != nterms) {
+ PyErr_SetString(PyExc_ValueError, "A, U must be tuples of equal length");
+ goto fail;
+ }
+
+ for (j = 0; j < nterms; ++j) {
+ terms[j].a = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(A, j));
+ if (error_converting(terms[j].a)) {
+ goto fail;
+ }
+ terms[j].ub = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(U, j));
+ if (error_converting(terms[j].ub)) {
+ goto fail;
+ }
+ }
+
+ b = b_input;
+
+ NPY_BEGIN_THREADS;
+ if (simplify && !require_ub_nontrivial) {
+ if (diophantine_simplify(&nterms, terms, b)) {
+ result = MEM_OVERLAP_OVERFLOW;
+ }
+ }
+ if (result == MEM_OVERLAP_YES) {
+ result = solve_diophantine(nterms, terms, b, max_work, require_ub_nontrivial, x);
+ }
+ NPY_END_THREADS;
+
+ if (result == MEM_OVERLAP_YES) {
+ retval = PyTuple_New(nterms);
+ if (retval == NULL) {
+ goto fail;
+ }
+
+ for (j = 0; j < nterms; ++j) {
+ PyObject *obj;
+#if defined(NPY_PY3K)
+ obj = PyLong_FromSsize_t(x[j]);
+#else
+ obj = PyInt_FromSsize_t(x[j]);
+#endif
+ if (obj == NULL) {
+ goto fail;
+ }
+ PyTuple_SET_ITEM(retval, j, obj);
+ }
+ }
+ else if (result == MEM_OVERLAP_NO) {
+ retval = Py_None;
+ Py_INCREF(retval);
+ }
+ else if (result == MEM_OVERLAP_ERROR) {
+ PyErr_SetString(PyExc_ValueError, "Invalid arguments");
+ }
+ else if (result == MEM_OVERLAP_OVERFLOW) {
+ PyErr_SetString(PyExc_OverflowError, "Integer overflow");
+ }
+ else if (result == MEM_OVERLAP_TOO_HARD) {
+ PyErr_SetString(PyExc_RuntimeError, "Too much work done");
+ }
+ else {
+ PyErr_SetString(PyExc_RuntimeError, "Unknown error");
+ }
+
+ return retval;
+
+fail:
+ Py_XDECREF(retval);
+ return NULL;
+}
+
+
+static PyObject *
+array_internal_overlap(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+ PyArrayObject * self = NULL;
+ static char *kwlist[] = {"self", "max_work", NULL};
+
+ mem_overlap_t result;
+ Py_ssize_t max_work = NPY_MAY_SHARE_EXACT;
+ NPY_BEGIN_THREADS_DEF;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|n", kwlist,
+ PyArray_Converter, &self,
+ &max_work)) {
+ return NULL;
+ }
+
+ if (max_work < -2) {
+ PyErr_SetString(PyExc_ValueError, "Invalid value for max_work");
+ goto fail;
+ }
+
+ NPY_BEGIN_THREADS;
+ result = solve_may_have_internal_overlap(self, max_work);
+ NPY_END_THREADS;
+
+ Py_XDECREF(self);
+
+ if (result == MEM_OVERLAP_NO) {
+ Py_RETURN_FALSE;
+ }
+ else if (result == MEM_OVERLAP_YES) {
+ Py_RETURN_TRUE;
+ }
+ else if (result == MEM_OVERLAP_OVERFLOW) {
+ PyErr_SetString(PyExc_OverflowError,
+ "Integer overflow in computing overlap");
+ return NULL;
+ }
+ else if (result == MEM_OVERLAP_TOO_HARD) {
+ PyErr_SetString(PyExc_ValueError,
+ "Exceeded max_work");
+ return NULL;
+ }
+ else {
+        /* Should not normally happen */
+ PyErr_SetString(PyExc_RuntimeError,
+ "Error in computing overlap");
+ return NULL;
+ }
+
+fail:
+ Py_XDECREF(self);
+ return NULL;
+}
+
+
+static PyObject *
+pylong_from_int128(npy_extint128_t value)
+{
+ PyObject *val_64 = NULL, *val = NULL, *tmp = NULL, *tmp2 = NULL;
+
+ val_64 = PyLong_FromLong(64);
+ if (val_64 == NULL) {
+ goto fail;
+ }
+
+ val = PyLong_FromUnsignedLongLong(value.hi);
+ if (val == NULL) {
+ goto fail;
+ }
+
+ tmp = PyNumber_Lshift(val, val_64);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ Py_DECREF(val);
+ val = tmp;
+
+ tmp = PyLong_FromUnsignedLongLong(value.lo);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ tmp2 = PyNumber_Or(val, tmp);
+ if (tmp2 == NULL) {
+ goto fail;
+ }
+
+ Py_DECREF(val);
+ Py_DECREF(tmp);
+
+ val = NULL;
+ tmp = NULL;
+
+ if (value.sign < 0) {
+ val = PyNumber_Negative(tmp2);
+ if (val == NULL) {
+ goto fail;
+ }
+ Py_DECREF(tmp2);
+ return val;
+ }
+ else {
+ val = tmp2;
+ }
+ return val;
+
+fail:
+ Py_XDECREF(val_64);
+ Py_XDECREF(tmp);
+ Py_XDECREF(tmp2);
+ Py_XDECREF(val);
+ return NULL;
+}
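+
+/*
+ * Worked example (illustrative): for value = {.sign = -1, .hi = 1, .lo = 2}
+ * the code above computes (1 << 64) | 2 = 18446744073709551618 and then
+ * negates it, returning the Python int -18446744073709551618.
+ */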
+
+
+static int
+int128_from_pylong(PyObject *obj, npy_extint128_t *result)
+{
+ PyObject *long_obj = NULL, *val_64 = NULL, *val_0 = NULL,
+ *mask_64 = NULL, *max_128 = NULL, *hi_bits = NULL,
+ *lo_bits = NULL, *tmp = NULL;
+ int cmp;
+ int negative_zero = 0;
+
+ if (PyBool_Check(obj)) {
+ /* False means negative zero */
+ negative_zero = 1;
+ }
+
+ long_obj = PyObject_CallFunction((PyObject*)&PyLong_Type, "O", obj);
+ if (long_obj == NULL) {
+ goto fail;
+ }
+
+ val_0 = PyLong_FromLong(0);
+ if (val_0 == NULL) {
+ goto fail;
+ }
+
+ val_64 = PyLong_FromLong(64);
+ if (val_64 == NULL) {
+ goto fail;
+ }
+
+ mask_64 = PyLong_FromUnsignedLongLong(0xffffffffffffffffULL);
+ if (mask_64 == NULL) {
+ goto fail;
+ }
+
+ tmp = PyNumber_Lshift(mask_64, val_64);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ max_128 = PyNumber_Or(tmp, mask_64);
+ if (max_128 == NULL) {
+ goto fail;
+ }
+ Py_DECREF(tmp);
+ tmp = NULL;
+
+ cmp = PyObject_RichCompareBool(long_obj, val_0, Py_LT);
+ if (cmp == -1) {
+ goto fail;
+ }
+ else if (cmp == 1) {
+ tmp = PyNumber_Negative(long_obj);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ Py_DECREF(long_obj);
+ long_obj = tmp;
+ tmp = NULL;
+ result->sign = -1;
+ }
+ else {
+ result->sign = 1;
+ }
+
+ cmp = PyObject_RichCompareBool(long_obj, max_128, Py_GT);
+ if (cmp == 1) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ goto fail;
+ }
+ else if (cmp == -1) {
+ goto fail;
+ }
+
+ hi_bits = PyNumber_Rshift(long_obj, val_64);
+ if (hi_bits == NULL) {
+ goto fail;
+ }
+
+ lo_bits = PyNumber_And(long_obj, mask_64);
+ if (lo_bits == NULL) {
+ goto fail;
+ }
+
+ result->hi = PyLong_AsUnsignedLongLong(hi_bits);
+ if (result->hi == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+ goto fail;
+ }
+
+ result->lo = PyLong_AsUnsignedLongLong(lo_bits);
+ if (result->lo == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+ goto fail;
+ }
+
+ if (negative_zero && result->hi == 0 && result->lo == 0) {
+ result->sign = -1;
+ }
+
+ Py_XDECREF(long_obj);
+ Py_XDECREF(val_64);
+ Py_XDECREF(val_0);
+ Py_XDECREF(mask_64);
+ Py_XDECREF(max_128);
+ Py_XDECREF(hi_bits);
+ Py_XDECREF(lo_bits);
+ Py_XDECREF(tmp);
+ return 0;
+
+fail:
+ Py_XDECREF(long_obj);
+ Py_XDECREF(val_64);
+ Py_XDECREF(val_0);
+ Py_XDECREF(mask_64);
+ Py_XDECREF(max_128);
+ Py_XDECREF(hi_bits);
+ Py_XDECREF(lo_bits);
+ Py_XDECREF(tmp);
+ return -1;
+}
+
+
+static PyObject *
+extint_safe_binop(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PY_LONG_LONG a, b, c;
+ int op;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "LLi", &a, &b, &op)) {
+ return NULL;
+ }
+ if (op == 1) {
+ c = safe_add(a, b, &overflow);
+ }
+ else if (op == 2) {
+ c = safe_sub(a, b, &overflow);
+ }
+ else if (op == 3) {
+ c = safe_mul(a, b, &overflow);
+ }
+ else {
+ PyErr_SetString(PyExc_ValueError, "invalid op");
+ return NULL;
+ }
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return PyLong_FromLongLong(c);
+}
+
+
+static PyObject *
+extint_to_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PY_LONG_LONG a;
+ if (!PyArg_ParseTuple(args, "L", &a)) {
+ return NULL;
+ }
+ return pylong_from_int128(to_128(a));
+}
+
+
+static PyObject *
+extint_to_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a;
+ PY_LONG_LONG r;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ r = to_64(a, &overflow);
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return PyLong_FromLongLong(r);
+}
+
+
+static PyObject *
+extint_mul_64_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PY_LONG_LONG a, b;
+ npy_extint128_t c;
+ if (!PyArg_ParseTuple(args, "LL", &a, &b)) {
+ return NULL;
+ }
+ c = mul_64_64(a, b);
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_add_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *b_obj;
+ npy_extint128_t a, b, c;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+ return NULL;
+ }
+ c = add_128(a, b, &overflow);
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_sub_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *b_obj;
+ npy_extint128_t a, b, c;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+ return NULL;
+ }
+ c = sub_128(a, b, &overflow);
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_neg_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ b = neg_128(a);
+ return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_shl_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ b = shl_128(a);
+ return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_shr_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ b = shr_128(a);
+ return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_gt_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *b_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+ return NULL;
+ }
+ if (gt_128(a, b)) {
+ Py_RETURN_TRUE;
+ }
+ else {
+ Py_RETURN_FALSE;
+ }
+}
+
+
+static PyObject *
+extint_divmod_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *ret = NULL, *tmp = NULL;
+ npy_extint128_t a, c;
+ PY_LONG_LONG b;
+ npy_int64 mod;
+ if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+ goto fail;
+ }
+ if (b <= 0) {
+ PyErr_SetString(PyExc_ValueError, "");
+ goto fail;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ goto fail;
+ }
+
+ c = divmod_128_64(a, b, &mod);
+
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        goto fail;
+    }
+
+ tmp = pylong_from_int128(c);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ PyTuple_SET_ITEM(ret, 0, tmp);
+
+ tmp = PyLong_FromLongLong(mod);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ PyTuple_SET_ITEM(ret, 1, tmp);
+ return ret;
+
+fail:
+ Py_XDECREF(ret);
+ Py_XDECREF(tmp);
+ return NULL;
+}
+
+
+static PyObject *
+extint_floordiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, c;
+ PY_LONG_LONG b;
+ if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+ return NULL;
+ }
+ if (b <= 0) {
+ PyErr_SetString(PyExc_ValueError, "");
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ c = floordiv_128_64(a, b);
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_ceildiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, c;
+ PY_LONG_LONG b;
+ if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+ return NULL;
+ }
+ if (b <= 0) {
+ PyErr_SetString(PyExc_ValueError, "");
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ c = ceildiv_128_64(a, b);
+ return pylong_from_int128(c);
+}
+
+struct TestStruct1 {
+ npy_uint8 a;
+ npy_complex64 b;
+};
+
+struct TestStruct2 {
+ npy_uint32 a;
+ npy_complex64 b;
+};
+
+struct TestStruct3 {
+ npy_uint8 a;
+ struct TestStruct1 b;
+};
+
+static PyObject *
+get_struct_alignments(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *ret = PyTuple_New(3);
+ PyObject *alignment, *size, *val;
+
+/**begin repeat
+ * #N = 1,2,3#
+ */
+ alignment = PyInt_FromLong(_ALIGN(struct TestStruct@N@));
+ size = PyInt_FromLong(sizeof(struct TestStruct@N@));
+ val = PyTuple_Pack(2, alignment, size);
+ Py_DECREF(alignment);
+ Py_DECREF(size);
+ if (val == NULL) {
+ return NULL;
+ }
+ PyTuple_SET_ITEM(ret, @N@-1, val);
+/**end repeat**/
+ return ret;
+}
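+
+/*
+ * Note: the "begin repeat" blocks in this .c.src file are expanded at build
+ * time by numpy's conv_template preprocessor; in the function above @N@ is
+ * substituted with 1, 2 and 3, producing one (alignment, size) tuple per
+ * TestStruct.
+ */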
+
+
+static char get_fpu_mode_doc[] = (
+ "get_fpu_mode()\n"
+ "\n"
+ "Get the current FPU control word, in a platform-dependent format.\n"
+ "Returns None if not implemented on current platform.");
+
+static PyObject *
+get_fpu_mode(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ if (!PyArg_ParseTuple(args, "")) {
+ return NULL;
+ }
+
+#if defined(_MSC_VER)
+ {
+ unsigned int result = 0;
+ result = _controlfp(0, 0);
+ return PyLong_FromLongLong(result);
+ }
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+ {
+ unsigned short cw = 0;
+ __asm__("fstcw %w0" : "=m" (cw));
+ return PyLong_FromLongLong(cw);
+ }
+#else
+ Py_RETURN_NONE;
+#endif
+}
+
+/*
+ * npymath wrappers
+ */
+
+/**begin repeat
+ * #name = cabs, carg#
+ */
+
+/**begin repeat1
+ * #itype = npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #ITYPE = NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE#
+ * #otype = npy_float, npy_double, npy_longdouble#
+ * #OTYPE = NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE#
+ * #suffix= f, , l#
+ */
+
+static PyObject *
+call_npy_@name@@suffix@(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ PyObject *z_py = NULL, *z_arr = NULL, *w_arr = NULL;
+
+ if (!PyArg_ParseTuple(args, "O", &z_py)) {
+ return NULL;
+ }
+
+ z_arr = PyArray_FROMANY(z_py, @ITYPE@, 0, 0, NPY_ARRAY_CARRAY_RO);
+ if (z_arr == NULL) {
+ return NULL;
+ }
+
+ w_arr = PyArray_SimpleNew(0, NULL, @OTYPE@);
+ if (w_arr == NULL) {
+ Py_DECREF(z_arr);
+ return NULL;
+ }
+
+ *(@otype@*)PyArray_DATA((PyArrayObject *)w_arr) =
+ npy_@name@@suffix@(*(@itype@*)PyArray_DATA((PyArrayObject *)z_arr));
+
+ Py_DECREF(z_arr);
+ return w_arr;
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/**begin repeat
+ * #name = log10, cosh, sinh, tan, tanh#
+ */
+
+/**begin repeat1
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE#
+ * #suffix= f, , l#
+ */
+
+static PyObject *
+call_npy_@name@@suffix@(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ PyObject *z_py = NULL, *z_arr = NULL, *w_arr = NULL;
+
+ if (!PyArg_ParseTuple(args, "O", &z_py)) {
+ return NULL;
+ }
+
+ z_arr = PyArray_FROMANY(z_py, @TYPE@, 0, 0, NPY_ARRAY_CARRAY_RO);
+ if (z_arr == NULL) {
+ return NULL;
+ }
+
+ w_arr = PyArray_SimpleNew(0, NULL, @TYPE@);
+ if (w_arr == NULL) {
+ Py_DECREF(z_arr);
+ return NULL;
+ }
+
+ *(@type@*)PyArray_DATA((PyArrayObject *)w_arr) =
+ npy_@name@@suffix@(*(@type@*)PyArray_DATA((PyArrayObject *)z_arr));
+
+ Py_DECREF(z_arr);
+ return w_arr;
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/*
+ * For development/testing purposes, it's convenient to have access to the
+ * system printf for floats. This is a very simple printf interface.
+ */
+PyObject *
+PrintFloat_Printf_g(PyObject *obj, int precision)
+{
+ char str[1024];
+
+ if (PyArray_IsScalar(obj, Half)) {
+ npy_half x = ((PyHalfScalarObject *)obj)->obval;
+ PyOS_snprintf(str, sizeof(str), "%.*g", precision,
+ npy_half_to_double(x));
+ }
+ else if (PyArray_IsScalar(obj, Float)) {
+ npy_float x = ((PyFloatScalarObject *)obj)->obval;
+ PyOS_snprintf(str, sizeof(str), "%.*g", precision, x);
+ }
+ else if (PyArray_IsScalar(obj, Double)) {
+ npy_double x = ((PyDoubleScalarObject *)obj)->obval;
+ PyOS_snprintf(str, sizeof(str), "%.*g", precision, x);
+ /* would be better to use lg, but not available in C90 */
+ }
+ else if (PyArray_IsScalar(obj, LongDouble)) {
+ npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval;
+ PyOS_snprintf(str, sizeof(str), "%.*Lg", precision, x);
+ }
+    else {
+ double val = PyFloat_AsDouble(obj);
+ if (error_converting(val)) {
+ return NULL;
+ }
+ PyOS_snprintf(str, sizeof(str), "%.*g", precision, val);
+ }
+
+ return PyUString_FromString(str);
+}
+
+
+static PyObject *
+printf_float_g(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+{
+ PyObject *obj;
+ int precision;
+
+ if (!PyArg_ParseTuple(args,"Oi:format_float_OSprintf_g", &obj,
+ &precision)) {
+ return NULL;
+ }
+
+ if (precision < 0) {
+ PyErr_SetString(PyExc_TypeError, "precision must be non-negative");
+ return NULL;
+ }
+
+ return PrintFloat_Printf_g(obj, precision);
+}
+
+static PyObject *
+getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+ PyObject *ret;
+ PyObject *ops = PyArray_GetNumericOps();
+ if (ops == NULL) {
+ return NULL;
+ }
+ ret = PyLong_FromLong(PyArray_SetNumericOps(ops));
+ Py_DECREF(ops);
+ return ret;
+}
+
+static PyMethodDef Multiarray_TestsMethods[] = {
+ {"IsPythonScalar",
+ IsPythonScalar,
+ METH_VARARGS, NULL},
+ {"test_neighborhood_iterator",
+ test_neighborhood_iterator,
+ METH_VARARGS, NULL},
+ {"test_neighborhood_iterator_oob",
+ test_neighborhood_iterator_oob,
+ METH_VARARGS, NULL},
+ {"test_pydatamem_seteventhook_start",
+ test_pydatamem_seteventhook_start,
+ METH_NOARGS, NULL},
+ {"test_pydatamem_seteventhook_end",
+ test_pydatamem_seteventhook_end,
+ METH_NOARGS, NULL},
+ {"test_inplace_increment",
+ inplace_increment,
+ METH_VARARGS, NULL},
+ {"incref_elide",
+ incref_elide,
+ METH_VARARGS, NULL},
+ {"incref_elide_l",
+ incref_elide_l,
+ METH_VARARGS, NULL},
+ {"npy_char_deprecation",
+ npy_char_deprecation,
+ METH_NOARGS, NULL},
+ {"npy_updateifcopy_deprecation",
+ npy_updateifcopy_deprecation,
+ METH_O, NULL},
+ {"npy_create_writebackifcopy",
+ npy_create_writebackifcopy,
+ METH_O, NULL},
+ {"npy_abuse_writebackifcopy",
+ npy_abuse_writebackifcopy,
+ METH_O, NULL},
+ {"npy_resolve",
+ npy_resolve,
+ METH_O, NULL},
+ {"npy_discard",
+ npy_discard,
+ METH_O, NULL},
+#if !defined(NPY_PY3K)
+ {"test_int_subclass",
+ int_subclass,
+ METH_VARARGS, NULL},
+#endif
+ {"get_buffer_info",
+ get_buffer_info,
+ METH_VARARGS, NULL},
+ {"get_c_wrapping_array",
+ get_c_wrapping_array,
+ METH_O, NULL},
+ {"array_indexing",
+ array_indexing,
+ METH_VARARGS, NULL},
+ {"test_as_c_array",
+ test_as_c_array,
+ METH_VARARGS, NULL},
+ {"test_nditer_too_large",
+ test_nditer_too_large,
+ METH_VARARGS, NULL},
+ {"solve_diophantine",
+ (PyCFunction)array_solve_diophantine,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"internal_overlap",
+ (PyCFunction)array_internal_overlap,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"extint_safe_binop",
+ extint_safe_binop,
+ METH_VARARGS, NULL},
+ {"extint_to_128",
+ extint_to_128,
+ METH_VARARGS, NULL},
+ {"extint_to_64",
+ extint_to_64,
+ METH_VARARGS, NULL},
+ {"extint_mul_64_64",
+ extint_mul_64_64,
+ METH_VARARGS, NULL},
+ {"extint_add_128",
+ extint_add_128,
+ METH_VARARGS, NULL},
+ {"extint_sub_128",
+ extint_sub_128,
+ METH_VARARGS, NULL},
+ {"extint_neg_128",
+ extint_neg_128,
+ METH_VARARGS, NULL},
+ {"extint_shl_128",
+ extint_shl_128,
+ METH_VARARGS, NULL},
+ {"extint_shr_128",
+ extint_shr_128,
+ METH_VARARGS, NULL},
+ {"extint_gt_128",
+ extint_gt_128,
+ METH_VARARGS, NULL},
+ {"extint_divmod_128_64",
+ extint_divmod_128_64,
+ METH_VARARGS, NULL},
+ {"extint_floordiv_128_64",
+ extint_floordiv_128_64,
+ METH_VARARGS, NULL},
+ {"extint_ceildiv_128_64",
+ extint_ceildiv_128_64,
+ METH_VARARGS, NULL},
+ {"get_fpu_mode",
+ get_fpu_mode,
+ METH_VARARGS, get_fpu_mode_doc},
+ {"getset_numericops",
+ getset_numericops,
+ METH_NOARGS, NULL},
+/**begin repeat
+ * #name = cabs, carg#
+ */
+
+/**begin repeat1
+ * #suffix = f, , l#
+ */
+ {"npy_@name@@suffix@",
+ call_npy_@name@@suffix@,
+ METH_VARARGS, NULL},
+/**end repeat1**/
+
+/**end repeat**/
+
+/**begin repeat
+ * #name = log10, cosh, sinh, tan, tanh#
+ */
+
+/**begin repeat1
+ * #suffix= f, , l#
+ */
+ {"npy_@name@@suffix@",
+ call_npy_@name@@suffix@,
+ METH_VARARGS, NULL},
+/**end repeat1**/
+
+/**end repeat**/
+ {"format_float_OSprintf_g",
+ (PyCFunction)printf_float_g,
+ METH_VARARGS , NULL},
+ {"get_struct_alignments",
+ get_struct_alignments,
+ METH_VARARGS, NULL},
+ {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+
+#if defined(NPY_PY3K)
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_multiarray_tests",
+ NULL,
+ -1,
+ Multiarray_TestsMethods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+#if defined(NPY_PY3K)
+#define RETVAL m
+PyMODINIT_FUNC PyInit__multiarray_tests(void)
+#else
+#define RETVAL
+PyMODINIT_FUNC
+init_multiarray_tests(void)
+#endif
+{
+ PyObject *m;
+
+#if defined(NPY_PY3K)
+ m = PyModule_Create(&moduledef);
+#else
+ m = Py_InitModule("_multiarray_tests", Multiarray_TestsMethods);
+#endif
+ if (m == NULL) {
+ return RETVAL;
+ }
+ import_array();
+ if (PyErr_Occurred()) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _multiarray_tests module.");
+ }
+ return RETVAL;
+}
+
+NPY_NO_EXPORT int
+test_not_exported(void)
+{
+ return 1;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/arraytypes.c.src b/contrib/python/numpy/py2/numpy/core/src/multiarray/arraytypes.c.src
new file mode 100644
index 00000000000..d921b9d9023
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/arraytypes.c.src
@@ -0,0 +1,4921 @@
+/* -*- c -*- */
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+#include <limits.h>
+#include <assert.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
+
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/arrayscalars.h"
+#include "npy_pycompat.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+
+#include "npy_config.h"
+#include "npy_sort.h"
+#include "common.h"
+#include "ctors.h"
+#include "lowlevel_strided_loops.h"
+#include "usertypes.h"
+#include "_datetime.h"
+#include "arrayobject.h"
+#include "alloc.h"
+#include "typeinfo.h"
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+#include <emmintrin.h>
+#endif
+
+#include "npy_longdouble.h"
+#include "numpyos.h"
+#include <string.h>
+
+#include "cblasfuncs.h"
+#include "npy_cblas.h"
+#include "buffer.h"
+
+/* check for sequences, but ignore the types numpy considers scalars */
+static NPY_INLINE npy_bool
+PySequence_NoString_Check(PyObject *op) {
+ return
+ PySequence_Check(op) &&
+ !PyString_Check(op) &&
+ !PyUnicode_Check(op) &&
+ !PyArray_IsZeroDim(op);
+}
+
+/*
+ *****************************************************************************
+ ** PYTHON TYPES TO C TYPES **
+ *****************************************************************************
+ */
+
+static double
+MyPyFloat_AsDouble(PyObject *obj)
+{
+ double ret = 0;
+ PyObject *num;
+
+ if (obj == Py_None) {
+ return NPY_NAN;
+ }
+ num = PyNumber_Float(obj);
+ if (num == NULL) {
+ return NPY_NAN;
+ }
+ ret = PyFloat_AsDouble(num);
+ Py_DECREF(num);
+ return ret;
+}
+
+static npy_half
+MyPyFloat_AsHalf(PyObject *obj)
+{
+ return npy_double_to_half(MyPyFloat_AsDouble(obj));
+}
+
+static PyObject *
+MyPyFloat_FromHalf(npy_half h)
+{
+ return PyFloat_FromDouble(npy_half_to_double(h));
+}
+
+/* Handle case of assigning from an array scalar in setitem */
+static int
+convert_to_scalar_and_retry(PyObject *op, void *ov, void *vap,
+ int (*setitem)(PyObject *op, void *ov, void *vap))
+{
+ PyObject *temp;
+
+ assert(PyArray_IsZeroDim(op));
+ temp = PyArray_ToScalar(PyArray_BYTES((PyArrayObject *)op),
+ (PyArrayObject *)op);
+ if (temp == NULL) {
+ return -1;
+ }
+ else {
+ int res = setitem(temp, ov, vap);
+ Py_DECREF(temp);
+ return res;
+ }
+}
+
+
+/**begin repeat
+ *
+ * #Type = Long, LongLong#
+ * #type = npy_long, npy_longlong#
+ */
+static @type@
+MyPyLong_As@Type@ (PyObject *obj)
+{
+ @type@ ret;
+ PyObject *num = PyNumber_Long(obj);
+
+ if (num == NULL) {
+ return -1;
+ }
+ ret = PyLong_As@Type@(num);
+ Py_DECREF(num);
+ return ret;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #Type = Long, LongLong#
+ * #type = npy_ulong, npy_ulonglong#
+ */
+static @type@
+MyPyLong_AsUnsigned@Type@ (PyObject *obj)
+{
+ @type@ ret;
+ PyObject *num = PyNumber_Long(obj);
+
+ if (num == NULL) {
+ return -1;
+ }
+ ret = PyLong_AsUnsigned@Type@(num);
+ if (PyErr_Occurred()) {
+ PyErr_Clear();
+ ret = PyLong_As@Type@(num);
+ }
+ Py_DECREF(num);
+ return ret;
+}
+
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** GETITEM AND SETITEM **
+ *****************************************************************************
+ */
+
+#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+/*
+ * Disable harmless compiler warning "4116: unnamed type definition in
+ * parentheses" which is caused by the _ALIGN macro.
+ */
+#if defined(_MSC_VER)
+#pragma warning(disable:4116)
+#endif
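+
+/*
+ * Note: _ALIGN relies on the classic offsetof trick -- the offset of `v` in
+ * `struct {char c; type v;}` equals the required alignment of `type`.  On a
+ * typical x86-64 build, for instance, _ALIGN(npy_double) evaluates to 8 and
+ * _ALIGN(npy_bool) to 1.
+ */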
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, LONG, UINT, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE#
+ * #func1 = PyBool_FromLong, PyInt_FromLong*6, PyLong_FromUnsignedLong*2,
+ * PyLong_FromLongLong, PyLong_FromUnsignedLongLong,
+ * MyPyFloat_FromHalf, PyFloat_FromDouble*2#
+ * #func2 = PyObject_IsTrue, MyPyLong_AsLong*6, MyPyLong_AsUnsignedLong*2,
+ * MyPyLong_AsLongLong, MyPyLong_AsUnsignedLongLong,
+ * MyPyFloat_AsHalf, MyPyFloat_AsDouble*2#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_long, npy_uint, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double#
+ * #type1 = long*7, npy_ulong*2, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double#
+ * #kind = Bool, Byte, UByte, Short, UShort, Int, Long, UInt, ULong,
+ * LongLong, ULongLong, Half, Float, Double#
+*/
+static PyObject *
+@TYPE@_getitem(void *input, void *vap)
+{
+ PyArrayObject *ap = vap;
+ char *ip = input;
+ @type@ t1;
+
+ if ((ap == NULL) || PyArray_ISBEHAVED_RO(ap)) {
+ t1 = *((@type@ *)ip);
+ return @func1@((@type1@)t1);
+ }
+ else {
+ PyArray_DESCR(ap)->f->copyswap(&t1, ip, PyArray_ISBYTESWAPPED(ap), ap);
+ return @func1@((@type1@)t1);
+ }
+}
+
+static int
+@TYPE@_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ @type@ temp; /* ensures alignment */
+
+ if (PyArray_IsScalar(op, @kind@)) {
+ temp = ((Py@kind@ScalarObject *)op)->obval;
+ }
+ else {
+ temp = (@type@)@func2@(op);
+ }
+ if (PyErr_Occurred()) {
+ PyObject *type, *value, *traceback;
+ PyErr_Fetch(&type, &value, &traceback);
+ if (PySequence_NoString_Check(op)) {
+ PyErr_SetString(PyExc_ValueError,
+ "setting an array element with a sequence.");
+ Py_DECREF(type);
+ Py_XDECREF(value);
+ Py_XDECREF(traceback);
+ }
+ else {
+ PyErr_Restore(type, value, traceback);
+ }
+ return -1;
+ }
+ if (ap == NULL || PyArray_ISBEHAVED(ap)) {
+ assert(npy_is_aligned(ov, _ALIGN(@type@)));
+ *((@type@ *)ov)=temp;
+ }
+ else {
+ PyArray_DESCR(ap)->f->copyswap(ov, &temp, PyArray_ISBYTESWAPPED(ap),
+ ap);
+ }
+ return 0;
+}
+
+/**end repeat**/
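+
+/*
+ * Note on the getitem/setitem pairs above: for behaved arrays (aligned and
+ * in machine byte order) the element is read or written directly; otherwise
+ * the dtype's copyswap() is used so that unaligned or byte-swapped buffers
+ * are still handled correctly.
+ */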
+
+/**begin repeat
+ *
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type = npy_float, npy_double#
+ */
+static PyObject *
+@TYPE@_getitem(void *input, void *vap)
+{
+ PyArrayObject *ap = vap;
+ char *ip = input;
+ @type@ t1, t2;
+
+ if ((ap == NULL) || PyArray_ISBEHAVED_RO(ap)) {
+ return PyComplex_FromDoubles((double)((@type@ *)ip)[0],
+ (double)((@type@ *)ip)[1]);
+ }
+ else {
+ int size = sizeof(@type@);
+
+ npy_bool swap = PyArray_ISBYTESWAPPED(ap);
+ copy_and_swap(&t1, ip, size, 1, 0, swap);
+ copy_and_swap(&t2, ip + size, size, 1, 0, swap);
+ return PyComplex_FromDoubles((double)t1, (double)t2);
+ }
+}
+
+/**end repeat**/
+
+
+
+/**begin repeat
+ *
+ * #NAME = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #ftype = npy_float, npy_double, npy_longdouble#
+ * #kind = CFloat, CDouble, CLongDouble#
+ */
+static int
+@NAME@_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ Py_complex oop;
+ @type@ temp;
+ int rsize;
+
+ if (PyArray_IsZeroDim(op)) {
+ return convert_to_scalar_and_retry(op, ov, vap, @NAME@_setitem);
+ }
+
+ if (PyArray_IsScalar(op, @kind@)){
+ temp = ((Py@kind@ScalarObject *)op)->obval;
+ }
+ else {
+ if (op == Py_None) {
+ oop.real = NPY_NAN;
+ oop.imag = NPY_NAN;
+ }
+ else {
+ oop = PyComplex_AsCComplex (op);
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ }
+ temp.real = (@ftype@) oop.real;
+ temp.imag = (@ftype@) oop.imag;
+ }
+
+ memcpy(ov, &temp, PyArray_DESCR(ap)->elsize);
+ if (PyArray_ISBYTESWAPPED(ap)) {
+ byte_swap_vector(ov, 2, sizeof(@ftype@));
+ }
+ rsize = sizeof(@ftype@);
+ copy_and_swap(ov, &temp, rsize, 2, rsize, PyArray_ISBYTESWAPPED(ap));
+ return 0;
+}
+
+/**end repeat**/
+
+static NPY_INLINE npy_longdouble
+string_to_long_double(PyObject*op)
+{
+ char *s;
+ char *end;
+ npy_longdouble temp;
+ PyObject* b;
+
+    /* Convert Python long objects to a longdouble directly, avoiding the
+     * precision or range loss that a round-trip through a double would incur.
+ */
+ if ((PyLong_Check(op) && !PyBool_Check(op))
+#if !defined(NPY_PY3K)
+ || (PyInt_Check(op) && !PyBool_Check(op))
+#endif
+ ) {
+ return npy_longdouble_from_PyLong(op);
+ }
+
+ if (PyUnicode_Check(op)) {
+ b = PyUnicode_AsUTF8String(op);
+ if (!b) {
+ return 0;
+ }
+ }
+ else {
+ b = op;
+ Py_XINCREF(b);
+ }
+ s = PyBytes_AsString(b);
+ if (s) {
+ errno = 0;
+ temp = NumPyOS_ascii_strtold(s, &end);
+ if (errno == ERANGE) {
+ if (PyErr_Warn(PyExc_RuntimeWarning,
+ "overflow encountered in conversion from string") < 0) {
+ Py_XDECREF(b);
+ return 0;
+ }
+ /* strtold returns INFINITY of the correct sign. */
+ }
+ else if (errno) {
+ PyErr_Format(PyExc_ValueError,
+ "invalid literal for long double: %s (%s)",
+ s,
+ strerror(errno));
+ Py_XDECREF(b);
+ return 0;
+ }
+
+ /* Extra characters at the end of the string, or nothing parsed */
+ if (end == s || *end) {
+ PyErr_Format(PyExc_ValueError,
+ "invalid literal for long double: %s",
+ s);
+ Py_XDECREF(b);
+ return 0;
+ }
+ Py_XDECREF(b);
+ }
+ else {
+ /* Probably wasn't a string, try converting it via a python double */
+ PyErr_Clear();
+ Py_XDECREF(b);
+ temp = (npy_longdouble) MyPyFloat_AsDouble(op);
+ }
+ return temp;
+}
+
+/*
+ * These return array scalars, which are different from the other data types.
+ */
+
+static PyObject *
+LONGDOUBLE_getitem(void *ip, void *ap)
+{
+ return PyArray_Scalar(ip, PyArray_DESCR((PyArrayObject *)ap), NULL);
+}
+
+static int
+LONGDOUBLE_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ /* ensure alignment */
+ npy_longdouble temp;
+
+ if (PyArray_IsZeroDim(op)) {
+ return convert_to_scalar_and_retry(op, ov, vap, LONGDOUBLE_setitem);
+ }
+
+ if (PyArray_IsScalar(op, LongDouble)) {
+ temp = ((PyLongDoubleScalarObject *)op)->obval;
+ }
+ else {
+ /* In case something funny happened in PyArray_IsScalar */
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ temp = string_to_long_double(op);
+ }
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ if (ap == NULL || PyArray_ISBEHAVED(ap)) {
+ *((npy_longdouble *)ov) = temp;
+ }
+ else {
+ copy_and_swap(ov, &temp, PyArray_DESCR(ap)->elsize, 1, 0,
+ PyArray_ISBYTESWAPPED(ap));
+ }
+ return 0;
+}
+
+static PyObject *
+CLONGDOUBLE_getitem(void *ip, void *ap)
+{
+ return PyArray_Scalar(ip, PyArray_DESCR((PyArrayObject *)ap), NULL);
+}
+
+/* UNICODE */
+static PyObject *
+UNICODE_getitem(void *ip, void *vap)
+{
+ PyArrayObject *ap = vap;
+ Py_ssize_t size = PyArray_ITEMSIZE(ap);
+ int swap = PyArray_ISBYTESWAPPED(ap);
+ int align = !PyArray_ISALIGNED(ap);
+
+ return (PyObject *)PyUnicode_FromUCS4(ip, size, swap, align);
+}
+
+static int
+UNICODE_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ PyObject *temp;
+ Py_UNICODE *ptr;
+ int datalen;
+#ifndef Py_UNICODE_WIDE
+ char *buffer;
+#endif
+
+ if (PyArray_IsZeroDim(op)) {
+ return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
+ }
+
+ if (PySequence_NoString_Check(op)) {
+ PyErr_SetString(PyExc_ValueError,
+ "setting an array element with a sequence");
+ return -1;
+ }
+#if defined(NPY_PY3K)
+ if (PyBytes_Check(op)) {
+ /* Try to decode from ASCII */
+ temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
+ if (temp == NULL) {
+ return -1;
+ }
+ }
+ else if ((temp=PyObject_Str(op)) == NULL) {
+#else
+ if ((temp=PyObject_Unicode(op)) == NULL) {
+#endif
+ return -1;
+ }
+ ptr = PyUnicode_AS_UNICODE(temp);
+ if ((ptr == NULL) || (PyErr_Occurred())) {
+ Py_DECREF(temp);
+ return -1;
+ }
+ datalen = PyUnicode_GET_DATA_SIZE(temp);
+
+#ifdef Py_UNICODE_WIDE
+ memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
+#else
+ if (!PyArray_ISALIGNED(ap)) {
+ buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
+ if (buffer == NULL) {
+ Py_DECREF(temp);
+ PyErr_NoMemory();
+ return -1;
+ }
+ }
+ else {
+ buffer = ov;
+ }
+ datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
+ datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
+ datalen <<= 2;
+ if (!PyArray_ISALIGNED(ap)) {
+ memcpy(ov, buffer, datalen);
+ PyArray_free(buffer);
+ }
+#endif
+ /* Fill in the rest of the space with 0 */
+ if (PyArray_DESCR(ap)->elsize > datalen) {
+ memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
+ }
+ if (PyArray_ISBYTESWAPPED(ap)) {
+ byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
+ }
+ Py_DECREF(temp);
+ return 0;
+}
+
+/* STRING
+ *
+ * Handles both NULL-terminated and non-NULL-terminated cases;
+ * trailing NULLs are truncated in the returned string.
+ */
+static PyObject *
+STRING_getitem(void *ip, void *vap)
+{
+ PyArrayObject *ap = vap;
+ /* Will eliminate NULLs at the end */
+ char *ptr;
+ int size = PyArray_DESCR(ap)->elsize;
+
+ ptr = (char *)ip + size - 1;
+ while (size > 0 && *ptr-- == '\0') {
+ size--;
+ }
+ return PyBytes_FromStringAndSize(ip,size);
+}
+
+static int
+STRING_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ char *ptr;
+ Py_ssize_t len;
+ PyObject *temp = NULL;
+
+ if (PyArray_IsZeroDim(op)) {
+ return convert_to_scalar_and_retry(op, ov, vap, STRING_setitem);
+ }
+
+ if (PySequence_NoString_Check(op)) {
+ PyErr_SetString(PyExc_ValueError,
+ "setting an array element with a sequence");
+ return -1;
+ }
+#if defined(NPY_PY3K)
+ if (PyUnicode_Check(op)) {
+        /* Assume ASCII codec -- behaves similarly to Python 2 */
+ temp = PyUnicode_AsASCIIString(op);
+ if (temp == NULL) {
+ return -1;
+ }
+ }
+ else if (PyBytes_Check(op) || PyMemoryView_Check(op)) {
+ temp = PyObject_Bytes(op);
+ if (temp == NULL) {
+ return -1;
+ }
+ }
+ else {
+ /* Emulate similar casting behavior as on Python 2 */
+ PyObject *str;
+ str = PyObject_Str(op);
+ if (str == NULL) {
+ return -1;
+ }
+ temp = PyUnicode_AsASCIIString(str);
+ Py_DECREF(str);
+ if (temp == NULL) {
+ return -1;
+ }
+ }
+#else
+ if ((temp = PyObject_Str(op)) == NULL) {
+ return -1;
+ }
+#endif
+ if (PyBytes_AsStringAndSize(temp, &ptr, &len) < 0) {
+ Py_DECREF(temp);
+ return -1;
+ }
+ memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize,len));
+ /*
+     * If the string is shorter than the array element,
+     * fill the rest of the element with NUL bytes.
+ */
+ if (PyArray_DESCR(ap)->elsize > len) {
+ memset((char *)ov + len, 0, (PyArray_DESCR(ap)->elsize - len));
+ }
+ Py_DECREF(temp);
+ return 0;
+}
+
+/* OBJECT */
+
+#define __ALIGNED(obj, sz) ((((size_t) obj) % (sz))==0)
+
+static PyObject *
+OBJECT_getitem(void *ip, void *NPY_UNUSED(ap))
+{
+ PyObject *obj;
+ NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ if (obj == NULL) {
+ Py_RETURN_NONE;
+ }
+ else {
+ Py_INCREF(obj);
+ return obj;
+ }
+}
+
+
+static int
+OBJECT_setitem(PyObject *op, void *ov, void *NPY_UNUSED(ap))
+{
+ PyObject *obj;
+
+ NPY_COPY_PYOBJECT_PTR(&obj, ov);
+
+ Py_INCREF(op);
+ Py_XDECREF(obj);
+
+ NPY_COPY_PYOBJECT_PTR(ov, &op);
+
+ return PyErr_Occurred() ? -1 : 0;
+}
+
+/* VOID */
+
+static PyObject *
+VOID_getitem(void *input, void *vap)
+{
+ PyArrayObject *ap = vap;
+ char *ip = input;
+ PyArray_Descr* descr;
+
+ descr = PyArray_DESCR(ap);
+ if (PyDataType_HASFIELDS(descr)) {
+ PyObject *key;
+ PyObject *names;
+ int i, n;
+ PyObject *ret;
+ PyObject *tup;
+ int savedflags;
+
+ /* get the names from the fields dictionary*/
+ names = descr->names;
+ n = PyTuple_GET_SIZE(names);
+ ret = PyTuple_New(n);
+ savedflags = PyArray_FLAGS(ap);
+ for (i = 0; i < n; i++) {
+ npy_intp offset;
+ PyArray_Descr *new;
+ key = PyTuple_GET_ITEM(names, i);
+ tup = PyDict_GetItem(descr->fields, key);
+ if (_unpack_field(tup, &new, &offset) < 0) {
+ Py_DECREF(ret);
+ ((PyArrayObject_fields *)ap)->descr = descr;
+ return NULL;
+ }
+ /*
+ * TODO: temporarily modifying the array like this
+ * is bad coding style, should be changed.
+ */
+ ((PyArrayObject_fields *)ap)->descr = new;
+ /* update alignment based on offset */
+ if ((new->alignment > 1)
+ && ((((npy_intp)(ip+offset)) % new->alignment) != 0)) {
+ PyArray_CLEARFLAGS(ap, NPY_ARRAY_ALIGNED);
+ }
+ else {
+ PyArray_ENABLEFLAGS(ap, NPY_ARRAY_ALIGNED);
+ }
+ PyTuple_SET_ITEM(ret, i, PyArray_GETITEM(ap, ip+offset));
+ ((PyArrayObject_fields *)ap)->flags = savedflags;
+ }
+ ((PyArrayObject_fields *)ap)->descr = descr;
+ return ret;
+ }
+
+ if (descr->subarray) {
+ /* return an array of the basic type */
+ PyArray_Dims shape = {NULL, -1};
+ PyArrayObject *ret;
+
+ if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) {
+ npy_free_cache_dim_obj(shape);
+ PyErr_SetString(PyExc_ValueError,
+ "invalid shape in fixed-type tuple.");
+ return NULL;
+ }
+ Py_INCREF(descr->subarray->base);
+ ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+ &PyArray_Type, descr->subarray->base,
+ shape.len, shape.ptr, NULL, ip,
+ PyArray_FLAGS(ap) & ~NPY_ARRAY_F_CONTIGUOUS,
+ NULL, (PyObject *)ap);
+ npy_free_cache_dim_obj(shape);
+ return (PyObject *)ret;
+ }
+
+ return PyBytes_FromStringAndSize(ip, descr->elsize);
+}
+
+
+NPY_NO_EXPORT int PyArray_CopyObject(PyArrayObject *, PyObject *);
+
+/* Given a structured PyArrayObject arr, index i and structured datatype descr,
+ * modify the dtype of arr to contain a single field corresponding to the ith
+ * field of descr, recompute the alignment flag, and return the offset of the
+ * field (in offset_p). This is useful in preparation for calling copyswap on
+ * individual fields of a numpy structure, in VOID_setitem. Compare to inner
+ * loops in VOID_getitem and VOID_nonzero.
+ *
+ * WARNING: Clobbers arr's dtype and alignment flag.
+ */
+NPY_NO_EXPORT int
+_setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr,
+ npy_intp *offset_p, char *dstdata)
+{
+ PyObject *key;
+ PyObject *tup;
+ PyArray_Descr *new;
+ npy_intp offset;
+
+ key = PyTuple_GET_ITEM(descr->names, i);
+ tup = PyDict_GetItem(descr->fields, key);
+ if (_unpack_field(tup, &new, &offset) < 0) {
+ return -1;
+ }
+
+ ((PyArrayObject_fields *)(arr))->descr = new;
+ if ((new->alignment > 1) &&
+ ((((uintptr_t)dstdata + offset) % new->alignment) != 0)) {
+ PyArray_CLEARFLAGS(arr, NPY_ARRAY_ALIGNED);
+ }
+ else {
+ PyArray_ENABLEFLAGS(arr, NPY_ARRAY_ALIGNED);
+ }
+
+ *offset_p = offset;
+ return 0;
+}
+
+/* Helper function for VOID_setitem, which uses the copyswap or casting code to
+ * copy structured datatypes between numpy arrays or scalars.
+ */
+static int
+_copy_and_return_void_setitem(PyArray_Descr *dstdescr, char *dstdata,
+ PyArray_Descr *srcdescr, char *srcdata){
+ PyArrayObject_fields dummy_struct;
+ PyArrayObject *dummy = (PyArrayObject *)&dummy_struct;
+ npy_int names_size = PyTuple_GET_SIZE(dstdescr->names);
+ npy_intp offset;
+ npy_int i;
+ int ret;
+
+ /* Fast path if dtypes are equal */
+ if (PyArray_EquivTypes(srcdescr, dstdescr)) {
+ for (i = 0; i < names_size; i++) {
+ /* neither line can ever fail, in principle */
+ if (_setup_field(i, dstdescr, dummy, &offset, dstdata)) {
+ return -1;
+ }
+ PyArray_DESCR(dummy)->f->copyswap(dstdata + offset,
+ srcdata + offset, 0, dummy);
+ }
+ return 0;
+ }
+
+ /* Slow path */
+ ret = PyArray_CastRawArrays(1, srcdata, dstdata, 0, 0,
+ srcdescr, dstdescr, 0);
+ if (ret != NPY_SUCCEED) {
+ return -1;
+ }
+ return 0;
+}
+
+static int
+VOID_setitem(PyObject *op, void *input, void *vap)
+{
+ char *ip = input;
+ PyArrayObject *ap = vap;
+ PyArray_Descr *descr;
+ int flags;
+ int itemsize=PyArray_DESCR(ap)->elsize;
+ int res;
+
+ descr = PyArray_DESCR(ap);
+ flags = PyArray_FLAGS(ap);
+ if (PyDataType_HASFIELDS(descr)) {
+ PyObject *errmsg;
+ npy_int i;
+ npy_intp offset;
+ int failed = 0;
+
+ /* If op is 0d-ndarray or numpy scalar, directly get dtype & data ptr */
+ if (PyArray_Check(op)) {
+ PyArrayObject *oparr = (PyArrayObject *)op;
+ if (PyArray_SIZE(oparr) != 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "setting an array element with a sequence.");
+ return -1;
+ }
+ return _copy_and_return_void_setitem(descr, ip,
+ PyArray_DESCR(oparr), PyArray_DATA(oparr));
+ }
+ else if (PyArray_IsScalar(op, Void)) {
+ PyArray_Descr *srcdescr = ((PyVoidScalarObject *)op)->descr;
+ char *srcdata = ((PyVoidScalarObject *)op)->obval;
+ return _copy_and_return_void_setitem(descr, ip, srcdescr, srcdata);
+ }
+ else if (PyTuple_Check(op)) {
+ /* if it's a tuple, copy field-by-field to ap, */
+ npy_intp names_size = PyTuple_GET_SIZE(descr->names);
+
+ if (names_size != PyTuple_Size(op)) {
+ errmsg = PyUString_FromFormat(
+ "could not assign tuple of length %zd to structure "
+ "with %" NPY_INTP_FMT " fields.",
+ PyTuple_Size(op), names_size);
+ PyErr_SetObject(PyExc_ValueError, errmsg);
+ Py_DECREF(errmsg);
+ return -1;
+ }
+
+ for (i = 0; i < names_size; i++) {
+ PyObject *item;
+
+ /* temporarily make ap have only this field */
+ if (_setup_field(i, descr, ap, &offset, ip) == -1) {
+ failed = 1;
+ break;
+ }
+ item = PyTuple_GetItem(op, i);
+ if (item == NULL) {
+ failed = 1;
+ break;
+ }
+ /* use setitem to set this field */
+ if (PyArray_SETITEM(ap, ip + offset, item) < 0) {
+ failed = 1;
+ break;
+ }
+ }
+ }
+ else {
+ /* Otherwise must be non-void scalar. Try to assign to each field */
+ npy_intp names_size = PyTuple_GET_SIZE(descr->names);
+
+ for (i = 0; i < names_size; i++) {
+ /* temporarily make ap have only this field */
+ if (_setup_field(i, descr, ap, &offset, ip) == -1) {
+ failed = 1;
+ break;
+ }
+ /* use setitem to set this field */
+ if (PyArray_SETITEM(ap, ip + offset, op) < 0) {
+ failed = 1;
+ break;
+ }
+ }
+ }
+
+ /* reset clobbered attributes */
+ ((PyArrayObject_fields *)(ap))->descr = descr;
+ ((PyArrayObject_fields *)(ap))->flags = flags;
+
+ if (failed) {
+ return -1;
+ }
+ return 0;
+ }
+ else if (PyDataType_HASSUBARRAY(descr)) {
+ /* copy into an array of the same basic type */
+ PyArray_Dims shape = {NULL, -1};
+ PyArrayObject *ret;
+ if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) {
+ npy_free_cache_dim_obj(shape);
+ PyErr_SetString(PyExc_ValueError,
+ "invalid shape in fixed-type tuple.");
+ return -1;
+ }
+ Py_INCREF(descr->subarray->base);
+ ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+ &PyArray_Type, descr->subarray->base,
+ shape.len, shape.ptr, NULL, ip,
+ PyArray_FLAGS(ap), NULL, (PyObject *)ap);
+ npy_free_cache_dim_obj(shape);
+ if (!ret) {
+ return -1;
+ }
+ res = PyArray_CopyObject(ret, op);
+ Py_DECREF(ret);
+ return res;
+ }
+
+ /*
+ * Fall through case - non-structured void datatype. This is a very
+ * undiscerning case: It interprets any object as a buffer
+ * and reads as many bytes as possible, padding with 0.
+ */
+#if defined(NPY_PY3K)
+ {
+ Py_buffer view;
+
+ if (PyObject_GetBuffer(op, &view, PyBUF_SIMPLE) < 0) {
+ return -1;
+ }
+ memcpy(ip, view.buf, PyArray_MIN(view.len, itemsize));
+ if (itemsize > view.len) {
+ memset(ip + view.len, 0, itemsize - view.len);
+ }
+ PyBuffer_Release(&view);
+ _dealloc_cached_buffer_info(op);
+ }
+#else
+ {
+ const void *buffer;
+ Py_ssize_t buflen;
+
+ if (PyObject_AsReadBuffer(op, &buffer, &buflen) < 0) {
+ return -1;
+ }
+ memcpy(ip, buffer, PyArray_MIN(buflen, itemsize));
+ if (itemsize > buflen) {
+ memset(ip + buflen, 0, itemsize - buflen);
+ }
+ }
+#endif
+ return 0;
+}
+
+static PyObject *
+DATETIME_getitem(void *ip, void *vap)
+{
+ PyArrayObject *ap = vap;
+ npy_datetime dt;
+ PyArray_DatetimeMetaData *meta = NULL;
+
+ /* Get the datetime units metadata */
+ meta = get_datetime_metadata_from_dtype(PyArray_DESCR(ap));
+ if (meta == NULL) {
+ return NULL;
+ }
+
+ if ((ap == NULL) || PyArray_ISBEHAVED_RO(ap)) {
+ dt = *((npy_datetime *)ip);
+ }
+ else {
+ PyArray_DESCR(ap)->f->copyswap(&dt, ip, PyArray_ISBYTESWAPPED(ap), ap);
+ }
+
+ return convert_datetime_to_pyobject(dt, meta);
+}
+
+
+static PyObject *
+TIMEDELTA_getitem(void *ip, void *vap)
+{
+ PyArrayObject *ap = vap;
+ npy_timedelta td;
+ PyArray_DatetimeMetaData *meta = NULL;
+
+ /* Get the datetime units metadata */
+ meta = get_datetime_metadata_from_dtype(PyArray_DESCR(ap));
+ if (meta == NULL) {
+ return NULL;
+ }
+
+ if ((ap == NULL) || PyArray_ISBEHAVED_RO(ap)) {
+ td = *((npy_timedelta *)ip);
+ }
+ else {
+ PyArray_DESCR(ap)->f->copyswap(&td, ip, PyArray_ISBYTESWAPPED(ap), ap);
+ }
+
+ return convert_timedelta_to_pyobject(td, meta);
+}
+
+static int
+DATETIME_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ /* ensure alignment */
+ npy_datetime temp = 0;
+ PyArray_DatetimeMetaData *meta = NULL;
+
+ /* Get the datetime units metadata */
+ meta = get_datetime_metadata_from_dtype(PyArray_DESCR(ap));
+ if (meta == NULL) {
+ return -1;
+ }
+
+ /* Convert the object into a NumPy datetime */
+ if (convert_pyobject_to_datetime(meta, op,
+ NPY_SAME_KIND_CASTING, &temp) < 0) {
+ return -1;
+ }
+
+ /* Copy the value into the output */
+ if (ap == NULL || PyArray_ISBEHAVED(ap)) {
+ *((npy_datetime *)ov)=temp;
+ }
+ else {
+ PyArray_DESCR(ap)->f->copyswap(ov, &temp, PyArray_ISBYTESWAPPED(ap),
+ ap);
+ }
+
+ return 0;
+}
+
+static int
+TIMEDELTA_setitem(PyObject *op, void *ov, void *vap)
+{
+ PyArrayObject *ap = vap;
+ /* ensure alignment */
+ npy_timedelta temp = 0;
+ PyArray_DatetimeMetaData *meta = NULL;
+
+ /* Get the datetime units metadata */
+ meta = get_datetime_metadata_from_dtype(PyArray_DESCR(ap));
+ if (meta == NULL) {
+ return -1;
+ }
+
+ /* Convert the object into a NumPy datetime */
+ if (convert_pyobject_to_timedelta(meta, op,
+ NPY_SAME_KIND_CASTING, &temp) < 0) {
+ return -1;
+ }
+
+ /* Copy the value into the output */
+ if (ap == NULL || PyArray_ISBEHAVED(ap)) {
+ *((npy_timedelta *)ov)=temp;
+ }
+ else {
+ PyArray_DESCR(ap)->f->copyswap(ov, &temp, PyArray_ISBYTESWAPPED(ap),
+ ap);
+ }
+
+ return 0;
+}
+
+
+/*
+ *****************************************************************************
+ ** TYPE TO TYPE CONVERSIONS **
+ *****************************************************************************
+ */
+
+
+/* Assumes contiguous, and aligned, from and to */
+
+
+/**begin repeat
+ *
+ * #TOTYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, FLOAT, DOUBLE, LONGDOUBLE, DATETIME,
+ * TIMEDELTA#
+ * #totype = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ */
+
+/**begin repeat1
+ *
+ * #FROMTYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, FLOAT, DOUBLE, LONGDOUBLE, DATETIME,
+ * TIMEDELTA#
+ * #fromtype = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ */
+static void
+@FROMTYPE@_to_@TOTYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @fromtype@ *ip = input;
+ @totype@ *op = output;
+
+ while (n--) {
+ *op++ = (@totype@)*ip++;
+ }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ *
+ * #FROMTYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #fromtype = npy_float, npy_double, npy_longdouble#
+ */
+static void
+@FROMTYPE@_to_@TOTYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @fromtype@ *ip = input;
+ @totype@ *op = output;
+
+ while (n--) {
+ *op++ = (@totype@)*ip;
+ ip += 2;
+ }
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/**begin repeat
+ *
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, LONGDOUBLE, DATETIME,
+ * TIMEDELTA#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ */
+
+static void
+@TYPE@_to_HALF(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @type@ *ip = input;
+ npy_half *op = output;
+
+ while (n--) {
+ *op++ = npy_float_to_half((float)(*ip++));
+ }
+}
+
+static void
+HALF_to_@TYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const npy_half *ip = input;
+ @type@ *op = output;
+
+ while (n--) {
+ *op++ = (@type@)npy_half_to_float(*ip++);
+ }
+}
+
+/**end repeat**/
+#if NPY_SIZEOF_SHORT == 2
+#define HALF_to_HALF SHORT_to_SHORT
+#elif NPY_SIZEOF_INT == 2
+#define HALF_to_HALF INT_to_INT
+#endif
+
+/**begin repeat
+ *
+ * #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ * #name = float, double, float, double#
+ * #itype = npy_uint32, npy_uint64, npy_uint32, npy_uint64#
+ * #iscomplex = 0, 0, 1, 1#
+ */
+
+static void
+@TYPE@_to_HALF(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @itype@ *ip = input;
+ npy_half *op = output;
+
+ while (n--) {
+ *op++ = npy_@name@bits_to_halfbits(*ip);
+#if @iscomplex@
+ ip += 2;
+#else
+ ip++;
+#endif
+ }
+}
+
+static void
+HALF_to_@TYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const npy_half *ip = input;
+ @itype@ *op = output;
+
+ while (n--) {
+ *op++ = npy_halfbits_to_@name@bits(*ip++);
+#if @iscomplex@
+ *op++ = 0;
+#endif
+ }
+}
+
+/**end repeat**/
+
+static void
+CLONGDOUBLE_to_HALF(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const npy_longdouble *ip = input;
+ npy_half *op = output;
+
+ while (n--) {
+        *op++ = npy_double_to_half((double) (*ip));
+ ip += 2;
+ }
+}
+
+static void
+HALF_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const npy_half *ip = input;
+ npy_longdouble *op = output;
+
+ while (n--) {
+ *op++ = npy_half_to_double(*ip++);
+ *op++ = 0;
+ }
+}
+
+/**begin repeat
+ *
+ * #FROMTYPE = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #fromtype = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ */
+static void
+@FROMTYPE@_to_BOOL(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @fromtype@ *ip = input;
+ npy_bool *op = output;
+
+ while (n--) {
+ *op++ = (npy_bool)(*ip++ != NPY_FALSE);
+ }
+}
+/**end repeat**/
+
+static void
+HALF_to_BOOL(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const npy_half *ip = input;
+ npy_bool *op = output;
+
+ while (n--) {
+ *op++ = (npy_bool)(!npy_half_iszero(*ip++));
+ }
+}
+
+/**begin repeat
+ *
+ * #FROMTYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #fromtype = npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+static void
+@FROMTYPE@_to_BOOL(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @fromtype@ *ip = input;
+ npy_bool *op = output;
+
+ while (n--) {
+ *op = (npy_bool)((ip->real != NPY_FALSE) ||
+ (ip->imag != NPY_FALSE));
+ op++;
+ ip++;
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #TOTYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #totype = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ * #one = 1*10, NPY_HALF_ONE, 1*5#
+ * #zero = 0*10, NPY_HALF_ZERO, 0*5#
+ */
+static void
+BOOL_to_@TOTYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const npy_bool *ip = input;
+ @totype@ *op = output;
+
+ while (n--) {
+ *op++ = (@totype@)((*ip++ != NPY_FALSE) ? @one@ : @zero@);
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #TOTYPE = CFLOAT, CDOUBLE,CLONGDOUBLE#
+ * #totype = npy_float, npy_double, npy_longdouble#
+ */
+
+/**begin repeat1
+ * #FROMTYPE = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #fromtype = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ */
+static void
+@FROMTYPE@_to_@TOTYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @fromtype@ *ip = input;
+ @totype@ *op = output;
+
+ while (n--) {
+ *op++ = (@totype@)*ip++;
+ *op++ = 0.0;
+ }
+
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #TOTYPE = CFLOAT,CDOUBLE,CLONGDOUBLE#
+ * #totype = npy_float, npy_double, npy_longdouble#
+ */
+
+/**begin repeat1
+ * #FROMTYPE = CFLOAT,CDOUBLE,CLONGDOUBLE#
+ * #fromtype = npy_float, npy_double, npy_longdouble#
+ */
+static void
+@FROMTYPE@_to_@TOTYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+ const @fromtype@ *ip = input;
+ @totype@ *op = output;
+
+ n <<= 1;
+ while (n--) {
+ *op++ = (@totype@)*ip++;
+ }
+}
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #FROMTYPE = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * STRING, UNICODE, VOID, OBJECT,
+ * DATETIME, TIMEDELTA#
+ * #fromtype = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_char, npy_char, npy_char, PyObject *,
+ * npy_datetime, npy_timedelta#
+ * #skip = 1*18, PyArray_DESCR(aip)->elsize*3, 1*3#
+ */
+static void
+@FROMTYPE@_to_OBJECT(void *input, void *output, npy_intp n,
+ void *vaip, void *NPY_UNUSED(aop))
+{
+ @fromtype@ *ip = input;
+ PyObject **op = output;
+ PyArrayObject *aip = vaip;
+
+ npy_intp i;
+ int skip = @skip@;
+ PyObject *tmp;
+ for (i = 0; i < n; i++, ip +=skip, op++) {
+ tmp = *op;
+ *op = @FROMTYPE@_getitem(ip, aip);
+ Py_XDECREF(tmp);
+ }
+}
+/**end repeat**/
+
+#define _NPY_UNUSEDBOOL NPY_UNUSED
+#define _NPY_UNUSEDBYTE NPY_UNUSED
+#define _NPY_UNUSEDUBYTE NPY_UNUSED
+#define _NPY_UNUSEDSHORT NPY_UNUSED
+#define _NPY_UNUSEDUSHORT NPY_UNUSED
+#define _NPY_UNUSEDINT NPY_UNUSED
+#define _NPY_UNUSEDUINT NPY_UNUSED
+#define _NPY_UNUSEDLONG NPY_UNUSED
+#define _NPY_UNUSEDULONG NPY_UNUSED
+#define _NPY_UNUSEDLONGLONG NPY_UNUSED
+#define _NPY_UNUSEDULONGLONG NPY_UNUSED
+#define _NPY_UNUSEDHALF NPY_UNUSED
+#define _NPY_UNUSEDFLOAT NPY_UNUSED
+#define _NPY_UNUSEDDOUBLE NPY_UNUSED
+#define _NPY_UNUSEDLONGDOUBLE NPY_UNUSED
+#define _NPY_UNUSEDCFLOAT NPY_UNUSED
+#define _NPY_UNUSEDCDOUBLE NPY_UNUSED
+#define _NPY_UNUSEDCLONGDOUBLE NPY_UNUSED
+#define _NPY_UNUSEDDATETIME NPY_UNUSED
+#define _NPY_UNUSEDTIMEDELTA NPY_UNUSED
+#define _NPY_UNUSEDHALF NPY_UNUSED
+#define _NPY_UNUSEDSTRING
+#define _NPY_UNUSEDVOID
+#define _NPY_UNUSEDUNICODE
+
+/**begin repeat
+ *
+ * #TOTYPE = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * STRING, UNICODE, VOID,
+ * DATETIME, TIMEDELTA#
+ * #totype = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_char, npy_char, npy_char,
+ * npy_datetime, npy_timedelta#
+ * #skip = 1*18, PyArray_DESCR(aop)->elsize*3, 1*2#
+ */
+static void
+OBJECT_to_@TOTYPE@(void *input, void *output, npy_intp n,
+ void *NPY_UNUSED(aip), void *aop)
+{
+ PyObject **ip = input;
+ @totype@ *op = output;
+
+ npy_intp i;
+ int skip = @skip@;
+
+ for (i = 0; i < n; i++, ip++, op += skip) {
+ if (*ip == NULL) {
+ if (@TOTYPE@_setitem(Py_False, op, aop) < 0) {
+ return;
+ }
+ }
+ else {
+ if (@TOTYPE@_setitem(*ip, op, aop) < 0) {
+ return;
+ }
+ }
+ }
+}
+/**end repeat**/
+
+
+/**begin repeat
+ *
+ * #from = STRING*23, UNICODE*23, VOID*23#
+ * #fromtyp = npy_char*69#
+ * #to = (BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * STRING, UNICODE, VOID,
+ * DATETIME, TIMEDELTA)*3#
+ * #totyp = (npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_char, npy_char, npy_char,
+ * npy_datetime, npy_timedelta)*3#
+ * #oskip = 1*18,(PyArray_DESCR(aop)->elsize)*3,1*2,
+ * 1*18,(PyArray_DESCR(aop)->elsize)*3,1*2,
+ * 1*18,(PyArray_DESCR(aop)->elsize)*3,1*2#
+ * #convert = 1*18, 0*3, 1*2,
+ * 1*18, 0*3, 1*2,
+ * 0*23#
+ * #convstr = (Int*9, Long*2, Float*4, Complex*3, Tuple*3, Long*2)*3#
+ */
+
+#if @convert@
+
+#define IS_@from@
+
+static void
+@from@_to_@to@(void *input, void *output, npy_intp n,
+ void *vaip, void *aop)
+{
+ @fromtyp@ *ip = input;
+ @totyp@ *op = output;
+ PyArrayObject *aip = vaip;
+
+ npy_intp i;
+ int skip = PyArray_DESCR(aip)->elsize;
+ int oskip = @oskip@;
+
+ for (i = 0; i < n; i++, ip+=skip, op+=oskip) {
+ PyObject *new;
+ PyObject *temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+ if (temp == NULL) {
+ return;
+ }
+
+#if defined(NPY_PY3K) && defined(IS_STRING)
+        /* On Python 3, decode the bytes scalar to str before converting */
+ new = PyUnicode_FromEncodedObject(temp, "ascii", "strict");
+ Py_DECREF(temp);
+ temp = new;
+ if (temp == NULL) {
+ return;
+ }
+#endif
+ /* convert from Python object to needed one */
+ {
+ PyObject *args;
+
+ /* call out to the Python builtin given by convstr */
+ args = Py_BuildValue("(N)", temp);
+#if defined(NPY_PY3K)
+#define PyInt_Type PyLong_Type
+#endif
+ new = Py@convstr@_Type.tp_new(&Py@convstr@_Type, args, NULL);
+#if defined(NPY_PY3K)
+#undef PyInt_Type
+#endif
+ Py_DECREF(args);
+ temp = new;
+ if (temp == NULL) {
+ return;
+ }
+ }
+
+ if (@to@_setitem(temp, op, aop)) {
+ Py_DECREF(temp);
+ return;
+ }
+ Py_DECREF(temp);
+ }
+}
+
+#undef IS_@from@
+
+#else
+
+static void
+@from@_to_@to@(void *input, void *output, npy_intp n,
+ void *vaip, void *aop)
+{
+ @fromtyp@ *ip = input;
+ @totyp@ *op = output;
+ PyArrayObject *aip = vaip;
+
+ npy_intp i;
+ int skip = PyArray_DESCR(aip)->elsize;
+ int oskip = @oskip@;
+
+ for (i = 0; i < n; i++, ip+=skip, op+=oskip) {
+ PyObject *temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+ if (temp == NULL) {
+ return;
+ }
+ if (@to@_setitem(temp, op, aop)) {
+ Py_DECREF(temp);
+ return;
+ }
+ Py_DECREF(temp);
+ }
+}
+
+#endif
+
+/**end repeat**/
+
+
+/**begin repeat
+ *
+ * #to = STRING*20, UNICODE*20, VOID*20#
+ * #totyp = npy_char*20, npy_char*20, npy_char*20#
+ * #from = (BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA)*3#
+ * #fromtyp = (npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_datetime, npy_timedelta)*3#
+ */
+static void
+@from@_to_@to@(void *input, void *output, npy_intp n,
+ void *vaip, void *vaop)
+{
+ @fromtyp@ *ip = input;
+ @totyp@ *op = output;
+ PyArrayObject *aip = vaip;
+ PyArrayObject *aop = vaop;
+
+ npy_intp i;
+ PyObject *temp = NULL;
+ int skip = 1;
+ int oskip = PyArray_DESCR(aop)->elsize;
+ for (i = 0; i < n; i++, ip += skip, op += oskip) {
+ temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+ if (temp == NULL) {
+ Py_INCREF(Py_False);
+ temp = Py_False;
+ }
+ if (@to@_setitem(temp, op, aop)) {
+ Py_DECREF(temp);
+ return;
+ }
+ Py_DECREF(temp);
+ }
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** SCAN **
+ *****************************************************************************
+ */
+
+
+/*
+ * The first ignore argument is for backwards compatibility.
+ * Should be removed when the API version is bumped up.
+ */
+
+/**begin repeat
+ * #fname = SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #format = "hd", "hu", "d", "u",
+ * "ld", "lu", NPY_LONGLONG_FMT, NPY_ULONGLONG_FMT#
+ */
+static int
+@fname@_scan(FILE *fp, @type@ *ip, void *NPY_UNUSED(ignore),
+ PyArray_Descr *NPY_UNUSED(ignored))
+{
+ return fscanf(fp, "%"@format@, ip);
+}
+/**end repeat**/
+
+/**begin repeat
+ * #fname = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ */
+static int
+@fname@_scan(FILE *fp, @type@ *ip, void *NPY_UNUSED(ignore),
+ PyArray_Descr *NPY_UNUSED(ignored))
+{
+ double result;
+ int ret;
+
+ ret = NumPyOS_ascii_ftolf(fp, &result);
+ *ip = (@type@) result;
+ return ret;
+}
+/**end repeat**/
+
+static int
+LONGDOUBLE_scan(FILE *fp, npy_longdouble *ip, void *NPY_UNUSED(ignore),
+ PyArray_Descr *NPY_UNUSED(ignored))
+{
+ long double result;
+ int ret;
+
+ ret = NumPyOS_ascii_ftoLf(fp, &result);
+ *ip = (npy_longdouble) result;
+ return ret;
+}
+
+static int
+HALF_scan(FILE *fp, npy_half *ip, void *NPY_UNUSED(ignore),
+ PyArray_Descr *NPY_UNUSED(ignored))
+{
+ double result;
+ int ret;
+
+ ret = NumPyOS_ascii_ftolf(fp, &result);
+ *ip = npy_double_to_half(result);
+ return ret;
+}
+
+/**begin repeat
+ * #fname = BYTE, UBYTE#
+ * #type = npy_byte, npy_ubyte#
+ * #btype = npy_int, npy_uint#
+ * #format = "d", "u"#
+ */
+static int
+@fname@_scan(FILE *fp, @type@ *ip, void *NPY_UNUSED(ignore),
+ PyArray_Descr *NPY_UNUSED(ignore2))
+{
+ @btype@ temp;
+ int num;
+
+ num = fscanf(fp, "%"@format@, &temp);
+ *ip = (@type@) temp;
+ return num;
+}
+/**end repeat**/
+
+static int
+BOOL_scan(FILE *fp, npy_bool *ip, void *NPY_UNUSED(ignore),
+ PyArray_Descr *NPY_UNUSED(ignore2))
+{
+ double result;
+ int ret;
+
+ ret = NumPyOS_ascii_ftolf(fp, &result);
+ *ip = (npy_bool) (result != 0.0);
+ return ret;
+}
+
+/**begin repeat
+ * #fname = CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, STRING, UNICODE, VOID,
+ * DATETIME, TIMEDELTA#
+ */
+
+#define @fname@_scan NULL
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** FROMSTR **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_datetime, npy_timedelta#
+ * #func = (PyOS_strtol, PyOS_strtoul)*4, NumPyOS_strtoll, NumPyOS_strtoull,
+ * NumPyOS_strtoll*2#
+ * #btype = (npy_long, npy_ulong)*4, npy_longlong, npy_ulonglong,
+ * npy_longlong*2#
+ */
+static int
+@fname@_fromstr(char *str, void *ip, char **endptr,
+ PyArray_Descr *NPY_UNUSED(ignore))
+{
+ @btype@ result;
+
+ result = @func@(str, endptr, 10);
+ *(@type@ *)ip = result;
+ return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #fname = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ */
+static int
+@fname@_fromstr(char *str, void *ip, char **endptr,
+ PyArray_Descr *NPY_UNUSED(ignore))
+{
+ double result;
+
+ result = NumPyOS_ascii_strtod(str, endptr);
+ *(@type@ *)ip = result;
+ return 0;
+}
+/**end repeat**/
+
+static int
+LONGDOUBLE_fromstr(char *str, void *ip, char **endptr,
+ PyArray_Descr *NPY_UNUSED(ignore))
+{
+ long double result;
+
+ result = NumPyOS_ascii_strtold(str, endptr);
+ *(npy_longdouble *)ip = result;
+ return 0;
+}
+
+static int
+HALF_fromstr(char *str, void *ip, char **endptr,
+ PyArray_Descr *NPY_UNUSED(ignore))
+{
+ double result;
+
+ result = NumPyOS_ascii_strtod(str, endptr);
+ *(npy_half *)ip = npy_double_to_half(result);
+ return 0;
+}
+
+static int
+BOOL_fromstr(char *str, void *ip, char **endptr,
+ PyArray_Descr *NPY_UNUSED(ignore))
+{
+ double result;
+
+ result = NumPyOS_ascii_strtod(str, endptr);
+ *(npy_bool *)ip = (result != 0.0);
+ return 0;
+}
+
+/**begin repeat
+ * #fname = CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, STRING, UNICODE, VOID#
+ */
+
+#define @fname@_fromstr NULL
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** COPYSWAPN **
+ *****************************************************************************
+ */
+
+
+static NPY_INLINE void
+_basic_copyn(void *dst, npy_intp dstride, void *src, npy_intp sstride,
+ npy_intp n, int elsize) {
+ if (src == NULL) {
+ return;
+ }
+ if (sstride == elsize && dstride == elsize) {
+ memcpy(dst, src, n*elsize);
+ }
+ else {
+ _unaligned_strided_byte_copy(dst, dstride, src, sstride,
+ n, elsize);
+ }
+}
+
+static NPY_INLINE void
+_basic_copy(void *dst, void *src, int elsize) {
+ if (src == NULL) {
+ return;
+ }
+ memcpy(dst, src, elsize);
+}
+
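+/*
+ * Note (added for clarity): the helpers above implement the copy step that
+ * every copyswap/copyswapn function below shares.  A NULL src skips the
+ * copy, so the callers effectively byte-swap dst in place in that case.
+ */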
+
+/**begin repeat
+ *
+ * #fname = SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #fsize = SHORT, SHORT, INT, INT,
+ * LONG, LONG, LONGLONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ */
+static void
+@fname@_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+ npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+ /* copy first if needed */
+ _basic_copyn(dst, dstride, src, sstride, n, sizeof(@type@));
+ if (swap) {
+ _strided_byte_swap(dst, dstride, n, sizeof(@type@));
+ }
+}
+
+static void
+@fname@_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+ /* copy first if needed */
+ _basic_copy(dst, src, sizeof(@type@));
+
+ if (swap) {
+ char *a, *b, c;
+
+ a = (char *)dst;
+#if NPY_SIZEOF_@fsize@ == 2
+ b = a + 1;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 4
+ b = a + 3;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 8
+ b = a + 7;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 10
+ b = a + 9;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 12
+ b = a + 11;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 16
+ b = a + 15;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#else
+ {
+ int i, nn;
+
+ b = a + (NPY_SIZEOF_@fsize@-1);
+ nn = NPY_SIZEOF_@fsize@ / 2;
+ for (i = 0; i < nn; i++) {
+ c = *a;
+ *a++ = *b;
+ *b-- = c;
+ }
+ }
+#endif
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #fname = BOOL,
+ * BYTE, UBYTE#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte#
+ */
+static void
+@fname@_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+ npy_intp n, int NPY_UNUSED(swap), void *NPY_UNUSED(arr))
+{
+ /* copy first if needed */
+ _basic_copyn(dst, dstride, src, sstride, n, sizeof(@type@));
+ /* ignore swap */
+}
+
+static void
+@fname@_copyswap (void *dst, void *src, int NPY_UNUSED(swap),
+ void *NPY_UNUSED(arr))
+{
+ /* copy first if needed */
+ _basic_copy(dst, src, sizeof(@type@));
+ /* ignore swap */
+}
+
+/**end repeat**/
+
+
+
+/**begin repeat
+ *
+ * #fname = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #fsize = FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+*/
+static void
+@fname@_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+ npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+ /* copy first if needed */
+ _basic_copyn(dst, dstride, src, sstride, n, sizeof(@type@));
+
+ if (swap) {
+ _strided_byte_swap(dst, dstride, n, NPY_SIZEOF_@fsize@);
+ _strided_byte_swap(((char *)dst + NPY_SIZEOF_@fsize@), dstride,
+ n, NPY_SIZEOF_@fsize@);
+ }
+}
+
+static void
+@fname@_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+ /* copy first if needed */
+ _basic_copy(dst, src, sizeof(@type@));
+
+ if (swap) {
+ char *a, *b, c;
+ a = (char *)dst;
+#if NPY_SIZEOF_@fsize@ == 4
+ b = a + 3;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+ a += 2;
+ b = a + 3;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 8
+ b = a + 7;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+ a += 4;
+ b = a + 7;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 10
+ b = a + 9;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+ a += 5;
+ b = a + 9;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 12
+ b = a + 11;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+ a += 6;
+ b = a + 11;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#elif NPY_SIZEOF_@fsize@ == 16
+ b = a + 15;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+ a += 8;
+ b = a + 15;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b-- = c;
+ c = *a; *a++ = *b; *b = c;
+#else
+ {
+ int i, nn;
+
+ b = a + (NPY_SIZEOF_@fsize@ - 1);
+ nn = NPY_SIZEOF_@fsize@ / 2;
+ for (i = 0; i < nn; i++) {
+ c = *a;
+ *a++ = *b;
+ *b-- = c;
+ }
+ a += nn;
+ b = a + (NPY_SIZEOF_@fsize@ - 1);
+ for (i = 0; i < nn; i++) {
+ c = *a;
+ *a++ = *b;
+ *b-- = c;
+ }
+ }
+#endif
+ }
+}
+
+/**end repeat**/
+
+static void
+OBJECT_copyswapn(PyObject **dst, npy_intp dstride, PyObject **src,
+ npy_intp sstride, npy_intp n, int NPY_UNUSED(swap),
+ void *NPY_UNUSED(arr))
+{
+ npy_intp i;
+ if (src != NULL) {
+ if (__ALIGNED(dst, sizeof(PyObject **))
+ && __ALIGNED(src, sizeof(PyObject **))
+ && __ALIGNED(dstride, sizeof(PyObject **))
+ && __ALIGNED(sstride, sizeof(PyObject **))) {
+ dstride /= sizeof(PyObject **);
+ sstride /= sizeof(PyObject **);
+ for (i = 0; i < n; i++) {
+ Py_XINCREF(*src);
+ Py_XDECREF(*dst);
+ *dst = *src;
+ dst += dstride;
+ src += sstride;
+ }
+ }
+ else {
+ unsigned char *dstp, *srcp;
+ PyObject *tmp;
+ dstp = (unsigned char*)dst;
+ srcp = (unsigned char*)src;
+ for (i = 0; i < n; i++) {
+ NPY_COPY_PYOBJECT_PTR(&tmp, srcp);
+ Py_XINCREF(tmp);
+ NPY_COPY_PYOBJECT_PTR(&tmp, dstp);
+ Py_XDECREF(tmp);
+ NPY_COPY_PYOBJECT_PTR(dstp, srcp);
+ dstp += dstride;
+ srcp += sstride;
+ }
+ }
+ }
+ /* ignore swap */
+ return;
+}
+
+static void
+OBJECT_copyswap(PyObject **dst, PyObject **src, int NPY_UNUSED(swap),
+ void *NPY_UNUSED(arr))
+{
+
+ if (src != NULL) {
+ if (__ALIGNED(dst,sizeof(PyObject **)) &&
+ __ALIGNED(src,sizeof(PyObject **))) {
+ Py_XINCREF(*src);
+ Py_XDECREF(*dst);
+ *dst = *src;
+ }
+ else {
+ PyObject *tmp;
+ NPY_COPY_PYOBJECT_PTR(&tmp, src);
+ Py_XINCREF(tmp);
+ NPY_COPY_PYOBJECT_PTR(&tmp, dst);
+ Py_XDECREF(tmp);
+ NPY_COPY_PYOBJECT_PTR(dst, src);
+ }
+ }
+}
+
+/* ignore swap */
+static void
+STRING_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
+ npy_intp n, int NPY_UNUSED(swap), PyArrayObject *arr)
+{
+ if (arr == NULL) {
+ return;
+ }
+ _basic_copyn(dst, dstride, src, sstride, n, PyArray_DESCR(arr)->elsize);
+ return;
+}
+
+/* */
+static void
+VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
+ npy_intp n, int swap, PyArrayObject *arr)
+{
+ PyArray_Descr *descr;
+
+ if (arr == NULL) {
+ return;
+ }
+
+ descr = PyArray_DESCR(arr);
+
+ if (PyArray_HASFIELDS(arr)) {
+ PyObject *key, *value;
+
+ Py_ssize_t pos = 0;
+
+ while (PyDict_Next(descr->fields, &pos, &key, &value)) {
+ npy_intp offset;
+ PyArray_Descr * new;
+ if (NPY_TITLE_KEY(key, value)) {
+ continue;
+ }
+ if (_unpack_field(value, &new, &offset) < 0) {
+ ((PyArrayObject_fields *)arr)->descr = descr;
+ return;
+ }
+ /*
+ * TODO: temporarily modifying the array like this
+ * is bad coding style, should be changed.
+ */
+ ((PyArrayObject_fields *)arr)->descr = new;
+ new->f->copyswapn(dst+offset, dstride,
+ (src != NULL ? src+offset : NULL),
+ sstride, n, swap, arr);
+ }
+ ((PyArrayObject_fields *)arr)->descr = descr;
+ return;
+ }
+ if (PyDataType_HASSUBARRAY(descr)) {
+ PyArray_Descr *new;
+ npy_intp num;
+ npy_intp i;
+ int subitemsize;
+ char *dstptr, *srcptr;
+ /*
+         * In certain cases the subarray copy can be optimized: when no
+         * byte swapping is necessary and the subarray's data can safely be
+         * copied as raw bytes (no objects, fields, subarrays, or user dtypes).
+ */
+ npy_bool can_optimize_subarray = (!swap &&
+ !PyDataType_HASFIELDS(descr->subarray->base) &&
+ !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+ !PyDataType_REFCHK(descr->subarray->base) &&
+ (descr->subarray->base->type_num < NPY_NTYPES));
+
+ if (can_optimize_subarray) {
+ _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
+ return;
+ }
+
+ new = descr->subarray->base;
+ /*
+ * TODO: temporarily modifying the array like this
+ * is bad coding style, should be changed.
+ */
+ ((PyArrayObject_fields *)arr)->descr = new;
+ dstptr = dst;
+ srcptr = src;
+ subitemsize = new->elsize;
+ if (subitemsize == 0) {
+ /* There cannot be any elements, so return */
+ return;
+ }
+ num = descr->elsize / subitemsize;
+ for (i = 0; i < n; i++) {
+ new->f->copyswapn(dstptr, subitemsize, srcptr,
+ subitemsize, num, swap, arr);
+ dstptr += dstride;
+ if (srcptr) {
+ srcptr += sstride;
+ }
+ }
+ ((PyArrayObject_fields *)arr)->descr = descr;
+ return;
+ }
+ /* Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+ _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
+ return;
+}
+
+static void
+VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
+{
+ PyArray_Descr *descr;
+
+ if (arr == NULL) {
+ return;
+ }
+
+ descr = PyArray_DESCR(arr);
+
+ if (PyArray_HASFIELDS(arr)) {
+ PyObject *key, *value;
+ Py_ssize_t pos = 0;
+
+ while (PyDict_Next(descr->fields, &pos, &key, &value)) {
+ npy_intp offset;
+ PyArray_Descr * new;
+ if (NPY_TITLE_KEY(key, value)) {
+ continue;
+ }
+ if (_unpack_field(value, &new, &offset) < 0) {
+ ((PyArrayObject_fields *)arr)->descr = descr;
+ return;
+ }
+ /*
+ * TODO: temporarily modifying the array like this
+ * is bad coding style, should be changed.
+ */
+ ((PyArrayObject_fields *)arr)->descr = new;
+ new->f->copyswap(dst+offset,
+ (src != NULL ? src+offset : NULL),
+ swap, arr);
+ }
+ ((PyArrayObject_fields *)arr)->descr = descr;
+ return;
+ }
+ if (PyDataType_HASSUBARRAY(descr)) {
+ PyArray_Descr *new;
+ npy_intp num;
+ int subitemsize;
+ /*
+         * In certain cases the subarray copy can be optimized: when no
+         * byte swapping is necessary and the subarray's data can safely be
+         * copied as raw bytes (no objects, fields, subarrays, or user dtypes).
+ */
+ npy_bool can_optimize_subarray = (!swap &&
+ !PyDataType_HASFIELDS(descr->subarray->base) &&
+ !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+ !PyDataType_REFCHK(descr->subarray->base) &&
+ (descr->subarray->base->type_num < NPY_NTYPES));
+
+ if (can_optimize_subarray) {
+ _basic_copy(dst, src, descr->elsize);
+ return;
+ }
+
+ new = descr->subarray->base;
+ /*
+ * TODO: temporarily modifying the array like this
+ * is bad coding style, should be changed.
+ */
+ ((PyArrayObject_fields *)arr)->descr = new;
+ subitemsize = new->elsize;
+ if (subitemsize == 0) {
+ /* There cannot be any elements, so return */
+ return;
+ }
+ num = descr->elsize / subitemsize;
+ new->f->copyswapn(dst, subitemsize, src,
+ subitemsize, num, swap, arr);
+ ((PyArrayObject_fields *)arr)->descr = descr;
+ return;
+ }
+ /* Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+ _basic_copy(dst, src, descr->elsize);
+ return;
+}
+
+
+static void
+UNICODE_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
+ npy_intp n, int swap, PyArrayObject *arr)
+{
+ int itemsize;
+
+ if (arr == NULL) {
+ return;
+ }
+ itemsize = PyArray_DESCR(arr)->elsize;
+ _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+
+ if (swap) {
+ int i;
+ char *_dst;
+ itemsize = itemsize / 4;
+
+ while (n > 0) {
+ _dst = dst;
+ for (i=0; i < itemsize; i++) {
+ npy_bswap4_unaligned(_dst);
+ _dst += 4;
+ }
+ dst += dstride;
+ --n;
+ }
+ }
+}
+
+
+static void
+STRING_copyswap(char *dst, char *src, int NPY_UNUSED(swap), PyArrayObject *arr)
+{
+ if (arr == NULL) {
+ return;
+ }
+ /* copy first if needed */
+ _basic_copy(dst, src, PyArray_DESCR(arr)->elsize);
+}
+
+static void
+UNICODE_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
+{
+ int itemsize;
+
+ if (arr == NULL) {
+ return;
+ }
+ itemsize = PyArray_DESCR(arr)->elsize;
+ _basic_copy(dst, src, itemsize);
+
+ if (swap) {
+ int i;
+ char *_dst;
+ itemsize = itemsize / 4;
+
+ _dst = dst;
+ for (i=0; i < itemsize; i++) {
+ npy_bswap4_unaligned(_dst);
+ _dst += 4;
+ }
+ }
+}
+
+
+/*
+ *****************************************************************************
+ ** NONZERO **
+ *****************************************************************************
+ */
+
+#define _NONZERO(a) ((a) != 0)
+
+/**begin repeat
+ *
+ * #fname = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ * #isfloat = 0*11, 1*4, 0*2#
+ * #nonzero = _NONZERO*11, !npy_half_iszero, _NONZERO*5#
+ */
+static npy_bool
+@fname@_nonzero (char *ip, PyArrayObject *ap)
+{
+ if (ap == NULL || PyArray_ISBEHAVED_RO(ap)) {
+ @type@ *ptmp = (@type@ *)ip;
+ return (npy_bool) @nonzero@(*ptmp);
+ }
+ else {
+ /*
+ * Don't worry about swapping for integer types,
+ * since we are just testing for equality with 0.
+ * For float types, the signed zeros require us to swap.
+ */
+ @type@ tmp;
+#if @isfloat@
+ PyArray_DESCR(ap)->f->copyswap(&tmp, ip, PyArray_ISBYTESWAPPED(ap),
+ ap);
+#else
+ memcpy(&tmp, ip, sizeof(@type@));
+#endif
+ return (npy_bool) @nonzero@(tmp);
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #fname = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+static npy_bool
+@fname@_nonzero (char *ip, PyArrayObject *ap)
+{
+ if (ap == NULL || PyArray_ISBEHAVED_RO(ap)) {
+ @type@ *ptmp = (@type@ *)ip;
+ return (npy_bool) ((ptmp->real != 0) || (ptmp->imag != 0));
+ }
+ else {
+ @type@ tmp;
+ PyArray_DESCR(ap)->f->copyswap(&tmp, ip, PyArray_ISBYTESWAPPED(ap),
+ ap);
+ return (npy_bool) ((tmp.real != 0) || (tmp.imag != 0));
+ }
+}
+/**end repeat**/
+
+
+#define WHITESPACE " \t\n\r\v\f"
+#define WHITELEN 6
+
+static npy_bool
+Py_STRING_ISSPACE(char ch)
+{
+ char white[] = WHITESPACE;
+ int j;
+ npy_bool space = NPY_FALSE;
+
+ for (j = 0; j < WHITELEN; j++) {
+ if (ch == white[j]) {
+ space = NPY_TRUE;
+ break;
+ }
+ }
+ return space;
+}
+
+static npy_bool
+STRING_nonzero (char *ip, PyArrayObject *ap)
+{
+ int len = PyArray_DESCR(ap)->elsize;
+ int i;
+ npy_bool nonz = NPY_FALSE;
+ npy_bool seen_null = NPY_FALSE;
+
+ for (i = 0; i < len; i++) {
+ if (*ip == '\0') {
+ seen_null = NPY_TRUE;
+ }
+ else if (seen_null || !Py_STRING_ISSPACE(*ip)) {
+ nonz = NPY_TRUE;
+ break;
+ }
+ ip++;
+ }
+ return nonz;
+}
+
+#ifdef Py_UNICODE_WIDE
+#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
+#else
+#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
+#endif
+
+static npy_bool
+UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
+{
+ int len = PyArray_DESCR(ap)->elsize >> 2;
+ int i;
+ npy_bool nonz = NPY_FALSE;
+ npy_bool seen_null = NPY_FALSE;
+ char *buffer = NULL;
+
+ if (PyArray_ISBYTESWAPPED(ap) || !PyArray_ISALIGNED(ap)) {
+ buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
+ if (buffer == NULL) {
+ return nonz;
+ }
+ memcpy(buffer, ip, PyArray_DESCR(ap)->elsize);
+ if (PyArray_ISBYTESWAPPED(ap)) {
+ byte_swap_vector(buffer, len, 4);
+ }
+ ip = (npy_ucs4 *)buffer;
+ }
+
+ for (i = 0; i < len; i++) {
+ if (*ip == '\0') {
+ seen_null = NPY_TRUE;
+ }
+ else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
+ nonz = NPY_TRUE;
+ break;
+ }
+ ip++;
+ }
+ PyArray_free(buffer);
+ return nonz;
+}
+
+static npy_bool
+OBJECT_nonzero (PyObject **ip, PyArrayObject *ap)
+{
+
+ if (PyArray_ISALIGNED(ap)) {
+ if (*ip == NULL) {
+ return NPY_FALSE;
+ }
+ return (npy_bool) PyObject_IsTrue(*ip);
+ }
+ else {
+ PyObject *obj;
+ NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ if (obj == NULL) {
+ return NPY_FALSE;
+ }
+ return (npy_bool) PyObject_IsTrue(obj);
+ }
+}
+
+/*
+ * if we have fields, then nonzero only if all sub-fields are nonzero.
+ */
+static npy_bool
+VOID_nonzero (char *ip, PyArrayObject *ap)
+{
+ int i;
+ int len;
+ npy_bool nonz = NPY_FALSE;
+
+ if (PyArray_HASFIELDS(ap)) {
+ PyArray_Descr *descr;
+ PyObject *key, *value;
+ int savedflags;
+ Py_ssize_t pos = 0;
+
+ descr = PyArray_DESCR(ap);
+ savedflags = PyArray_FLAGS(ap);
+ while (PyDict_Next(descr->fields, &pos, &key, &value)) {
+ PyArray_Descr * new;
+ npy_intp offset;
+ if (NPY_TITLE_KEY(key, value)) {
+ continue;
+ }
+ if (_unpack_field(value, &new, &offset) < 0) {
+ PyErr_Clear();
+ continue;
+ }
+ /*
+ * TODO: temporarily modifying the array like this
+ * is bad coding style, should be changed.
+ */
+ ((PyArrayObject_fields *)ap)->descr = new;
+ ((PyArrayObject_fields *)ap)->flags = savedflags;
+ if ((new->alignment > 1) && !__ALIGNED(ip + offset,
+ new->alignment)) {
+ PyArray_CLEARFLAGS(ap, NPY_ARRAY_ALIGNED);
+ }
+ else {
+ PyArray_ENABLEFLAGS(ap, NPY_ARRAY_ALIGNED);
+ }
+ if (new->f->nonzero(ip+offset, ap)) {
+ nonz = NPY_TRUE;
+ break;
+ }
+ }
+ ((PyArrayObject_fields *)ap)->descr = descr;
+ ((PyArrayObject_fields *)ap)->flags = savedflags;
+ return nonz;
+ }
+ len = PyArray_DESCR(ap)->elsize;
+ for (i = 0; i < len; i++) {
+ if (*ip != '\0') {
+ nonz = NPY_TRUE;
+ break;
+ }
+ ip++;
+ }
+ return nonz;
+}
+
+#undef __ALIGNED
+
+
+/*
+ *****************************************************************************
+ ** COMPARE **
+ *****************************************************************************
+ */
+
+
+/* boolean type */
+
+static int
+BOOL_compare(npy_bool *ip1, npy_bool *ip2, PyArrayObject *NPY_UNUSED(ap))
+{
+ return (*ip1 ? (*ip2 ? 0 : 1) : (*ip2 ? -1 : 0));
+}
+
+
+/* integer types */
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_datetime, npy_timedelta#
+ */
+
+static int
+@TYPE@_compare (@type@ *pa, @type@ *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+ const @type@ a = *pa;
+ const @type@ b = *pb;
+
+ return a < b ? -1 : a == b ? 0 : 1;
+}
+
+/**end repeat**/
+
+
+/* float types */
+
+/*
+ * The real/complex comparison functions are compatible with the new sort
+ * order for nans introduced in numpy 1.4.0. All nan values now compare
+ * larger than non-nan values and are sorted to the end. The comparison
+ * order is:
+ *
+ * Real: [R, nan]
+ * Complex: [R + Rj, R + nanj, nan + Rj, nan + nanj]
+ *
+ * where complex values with the same nan placements are sorted according
+ * to the non-nan part if it exists. If both the real and imaginary parts
+ * of complex types are non-nan the order is the same as the real parts
+ * unless they happen to be equal, in which case the order is that of the
+ * imaginary parts.
+ */
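+/*
+ * Illustration (added for clarity; assumed example, not from the original
+ * sources): sorting the doubles [3.0, nan, 1.0] yields [1.0, 3.0, nan];
+ * for complex values, (1 + nan*j) sorts before (nan + 1j), and both sort
+ * before (nan + nan*j).
+ */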
+
+/**begin repeat
+ *
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_float, npy_double, npy_longdouble#
+ */
+
+#define LT(a,b) ((a) < (b) || ((b) != (b) && (a) ==(a)))
+
+static int
+@TYPE@_compare(@type@ *pa, @type@ *pb)
+{
+ const @type@ a = *pa;
+ const @type@ b = *pb;
+ int ret;
+
+ if (LT(a,b)) {
+ ret = -1;
+ }
+ else if (LT(b,a)) {
+ ret = 1;
+ }
+ else {
+ ret = 0;
+ }
+ return ret;
+}
+
+
+static int
+C@TYPE@_compare(@type@ *pa, @type@ *pb)
+{
+ const @type@ ar = pa[0];
+ const @type@ ai = pa[1];
+ const @type@ br = pb[0];
+ const @type@ bi = pb[1];
+ int ret;
+
+ if (ar < br) {
+ if (ai == ai || bi != bi) {
+ ret = -1;
+ }
+ else {
+ ret = 1;
+ }
+ }
+ else if (br < ar) {
+ if (bi == bi || ai != ai) {
+ ret = 1;
+ }
+ else {
+ ret = -1;
+ }
+ }
+ else if (ar == br || (ar != ar && br != br)) {
+ if (LT(ai,bi)) {
+ ret = -1;
+ }
+ else if (LT(bi,ai)) {
+ ret = 1;
+ }
+ else {
+ ret = 0;
+ }
+ }
+ else if (ar == ar) {
+ ret = -1;
+ }
+ else {
+ ret = 1;
+ }
+
+ return ret;
+}
+
+#undef LT
+
+/**end repeat**/
+
+static int
+HALF_compare (npy_half *pa, npy_half *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+ npy_half a = *pa, b = *pb;
+ npy_bool a_isnan, b_isnan;
+ int ret;
+
+ a_isnan = npy_half_isnan(a);
+ b_isnan = npy_half_isnan(b);
+
+ if (a_isnan) {
+ ret = b_isnan ? 0 : -1;
+ }
+ else if (b_isnan) {
+ ret = 1;
+ }
+ else if(npy_half_lt_nonan(a, b)) {
+ ret = -1;
+ }
+ else if(npy_half_lt_nonan(b, a)) {
+ ret = 1;
+ }
+ else {
+ ret = 0;
+ }
+
+ return ret;
+}
+
+
+/* object type */
+
+static int
+OBJECT_compare(PyObject **ip1, PyObject **ip2, PyArrayObject *NPY_UNUSED(ap))
+{
+ /*
+ * ALIGNMENT NOTE: It seems that PyArray_Sort is already handling
+ * the alignment of pointers, so it doesn't need to be handled
+ * here.
+ */
+
+ int ret;
+ /*
+ * work around gh-3879, we cannot abort an in-progress quicksort
+ * so at least do not raise again
+ */
+ if (PyErr_Occurred()) {
+ return 0;
+ }
+ if ((*ip1 == NULL) || (*ip2 == NULL)) {
+ if (ip1 == ip2) {
+ return 1;
+ }
+ if (ip1 == NULL) {
+ return -1;
+ }
+ return 1;
+ }
+
+ ret = PyObject_RichCompareBool(*ip1, *ip2, Py_LT);
+ if (ret < 0) {
+ /* error occurred, avoid the next call to PyObject_RichCompareBool */
+ return 0;
+ }
+ if (ret == 1) {
+ return -1;
+ }
+ else if (PyObject_RichCompareBool(*ip1, *ip2, Py_GT) == 1) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+}
+
+
+/* string type */
+
+static int
+STRING_compare(char *ip1, char *ip2, PyArrayObject *ap)
+{
+ const unsigned char *c1 = (unsigned char *)ip1;
+ const unsigned char *c2 = (unsigned char *)ip2;
+ const size_t len = PyArray_DESCR(ap)->elsize;
+ int i;
+
+ i = memcmp(c1, c2, len);
+ if (i > 0) {
+ return 1;
+ }
+ else if (i < 0) {
+ return -1;
+ }
+ return 0;
+}
+
+
+/* unicode type */
+
+static int
+UNICODE_compare(npy_ucs4 *ip1, npy_ucs4 *ip2,
+ PyArrayObject *ap)
+{
+ int itemsize = PyArray_DESCR(ap)->elsize;
+
+ if (itemsize < 0) {
+ return 0;
+ }
+ itemsize /= sizeof(npy_ucs4);
+ while (itemsize-- > 0) {
+ npy_ucs4 c1 = *ip1++;
+ npy_ucs4 c2 = *ip2++;
+ if (c1 != c2) {
+ return (c1 < c2) ? -1 : 1;
+ }
+ }
+ return 0;
+}
+
+
+/* void type */
+
+/*
+ * If fields are defined, compare on the first field; if equal, compare
+ * on the second field, and so on, until a comparison is not equal or the
+ * fields are exhausted.
+ *
+ * Data must be aligned before being passed on to sub-comparisons.
+ * It must also be byte-swapped, if needed, before being passed on.
+ */
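+/*
+ * Illustration (added for clarity; the dtype below is a hypothetical
+ * example): for a structured dtype [('a', '<i4'), ('b', '<f8')], the
+ * records (1, 2.0) and (1, 3.0) compare equal on field 'a', so the result
+ * is decided by field 'b', giving (1, 2.0) < (1, 3.0).
+ */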
+static int
+VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
+{
+ PyArray_Descr *descr;
+ PyObject *names, *key;
+ PyObject *tup;
+ PyArrayObject_fields dummy_struct;
+ PyArrayObject *dummy = (PyArrayObject *)&dummy_struct;
+ char *nip1, *nip2;
+ int i, res = 0, swap = 0;
+
+ if (!PyArray_HASFIELDS(ap)) {
+ return STRING_compare(ip1, ip2, ap);
+ }
+ descr = PyArray_DESCR(ap);
+ /*
+ * Compare on the first-field. If equal, then
+ * compare on the second-field, etc.
+ */
+ names = descr->names;
+ for (i = 0; i < PyTuple_GET_SIZE(names); i++) {
+ PyArray_Descr *new;
+ npy_intp offset;
+ key = PyTuple_GET_ITEM(names, i);
+ tup = PyDict_GetItem(descr->fields, key);
+ if (_unpack_field(tup, &new, &offset) < 0) {
+ goto finish;
+ }
+ /* descr is the only field checked by compare or copyswap */
+ dummy_struct.descr = new;
+ swap = PyArray_ISBYTESWAPPED(dummy);
+ nip1 = ip1 + offset;
+ nip2 = ip2 + offset;
+ if (swap || new->alignment > 1) {
+ if (swap || !npy_is_aligned(nip1, new->alignment)) {
+ /* create buffer and copy */
+ nip1 = npy_alloc_cache(new->elsize);
+ if (nip1 == NULL) {
+ goto finish;
+ }
+ memcpy(nip1, ip1 + offset, new->elsize);
+ if (swap)
+ new->f->copyswap(nip1, NULL, swap, dummy);
+ }
+ if (swap || !npy_is_aligned(nip2, new->alignment)) {
+ /* create buffer and copy */
+ nip2 = npy_alloc_cache(new->elsize);
+ if (nip2 == NULL) {
+ if (nip1 != ip1 + offset) {
+ npy_free_cache(nip1, new->elsize);
+ }
+ goto finish;
+ }
+ memcpy(nip2, ip2 + offset, new->elsize);
+ if (swap)
+ new->f->copyswap(nip2, NULL, swap, dummy);
+ }
+ }
+ res = new->f->compare(nip1, nip2, dummy);
+ if (swap || new->alignment > 1) {
+ if (nip1 != ip1 + offset) {
+ npy_free_cache(nip1, new->elsize);
+ }
+ if (nip2 != ip2 + offset) {
+ npy_free_cache(nip2, new->elsize);
+ }
+ }
+ if (res != 0) {
+ break;
+ }
+ }
+
+finish:
+ return res;
+}
+
+
+/*
+ *****************************************************************************
+ ** ARGFUNC **
+ *****************************************************************************
+ */
+
+#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b))
+
+static int
+BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+
+{
+ npy_intp i = 0;
+    /*
+     * A memcmp-style scan, as used by logical_and, may be slower for
+     * small arrays on i386, so an explicit SSE2 loop is used below instead.
+     */
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+ const __m128i zero = _mm_setzero_si128();
+ for (; i < n - (n % 32); i+=32) {
+ __m128i d1 = _mm_loadu_si128((__m128i*)&ip[i]);
+ __m128i d2 = _mm_loadu_si128((__m128i*)&ip[i + 16]);
+ d1 = _mm_cmpeq_epi8(d1, zero);
+ d2 = _mm_cmpeq_epi8(d2, zero);
+ if (_mm_movemask_epi8(_mm_min_epu8(d1, d2)) != 0xFFFF) {
+ break;
+ }
+ }
+#endif
+ for (; i < n; i++) {
+ if (ip[i]) {
+ *max_ind = i;
+ return 0;
+ }
+ }
+ *max_ind = 0;
+ return 0;
+}
+
+/**begin repeat
+ *
+ * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ * #isfloat = 0*10, 1*7, 0*2#
+ * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2#
+ * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
+ * #iscomplex = 0*14, 1*3, 0*2#
+ * #incr = ip++*14, ip+=2*3, ip++*2#
+ */
+static int
+@fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+{
+ npy_intp i;
+ @type@ mp = *ip;
+#if @iscomplex@
+ @type@ mp_im = ip[1];
+#endif
+
+ *max_ind = 0;
+
+#if @isfloat@
+ if (@isnan@(mp)) {
+ /* nan encountered; it's maximal */
+ return 0;
+ }
+#endif
+#if @iscomplex@
+ if (@isnan@(mp_im)) {
+ /* nan encountered; it's maximal */
+ return 0;
+ }
+#endif
+
+ for (i = 1; i < n; i++) {
+ @incr@;
+ /*
+         * Propagate nans, as max() and min() do
+ */
+#if @iscomplex@
+ /* Lexical order for complex numbers */
+ if ((ip[0] > mp) || ((ip[0] == mp) && (ip[1] > mp_im))
+ || @isnan@(ip[0]) || @isnan@(ip[1])) {
+ mp = ip[0];
+ mp_im = ip[1];
+ *max_ind = i;
+ if (@isnan@(mp) || @isnan@(mp_im)) {
+ /* nan encountered, it's maximal */
+ break;
+ }
+ }
+#else
+ if (!@le@(*ip, mp)) { /* negated, for correct nan handling */
+ mp = *ip;
+ *max_ind = i;
+#if @isfloat@
+ if (@isnan@(mp)) {
+ /* nan encountered, it's maximal */
+ break;
+ }
+#endif
+ }
+#endif
+ }
+ return 0;
+}
+
+/**end repeat**/
+
+static int
+BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+
+{
+ npy_bool * p = memchr(ip, 0, n * sizeof(*ip));
+ if (p == NULL) {
+ *min_ind = 0;
+ return 0;
+ }
+ *min_ind = p - ip;
+ return 0;
+}
+
+/**begin repeat
+ *
+ * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #isfloat = 0*10, 1*7#
+ * #isnan = nop*10, npy_half_isnan, npy_isnan*6#
+ * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*6#
+ * #iscomplex = 0*14, 1*3#
+ * #incr = ip++*14, ip+=2*3#
+ */
+static int
+@fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+{
+ npy_intp i;
+ @type@ mp = *ip;
+#if @iscomplex@
+ @type@ mp_im = ip[1];
+#endif
+
+ *min_ind = 0;
+
+#if @isfloat@
+ if (@isnan@(mp)) {
+ /* nan encountered; it's minimal */
+ return 0;
+ }
+#endif
+#if @iscomplex@
+ if (@isnan@(mp_im)) {
+ /* nan encountered; it's minimal */
+ return 0;
+ }
+#endif
+
+ for (i = 1; i < n; i++) {
+ @incr@;
+ /*
+         * Propagate nans, as max() and min() do
+ */
+#if @iscomplex@
+ /* Lexical order for complex numbers */
+ if ((mp > ip[0]) || ((ip[0] == mp) && (mp_im > ip[1]))
+ || @isnan@(ip[0]) || @isnan@(ip[1])) {
+ mp = ip[0];
+ mp_im = ip[1];
+ *min_ind = i;
+ if (@isnan@(mp) || @isnan@(mp_im)) {
+ /* nan encountered, it's minimal */
+ break;
+ }
+ }
+#else
+ if (!@le@(mp, *ip)) { /* negated, for correct nan handling */
+ mp = *ip;
+ *min_ind = i;
+#if @isfloat@
+ if (@isnan@(mp)) {
+ /* nan encountered, it's minimal */
+ break;
+ }
+#endif
+ }
+#endif
+ }
+ return 0;
+}
+
+/**end repeat**/
+
+#undef _LESS_THAN_OR_EQUAL
+
+/**begin repeat
+ *
+ * #fname = DATETIME, TIMEDELTA#
+ * #type = npy_datetime, npy_timedelta#
+ */
+static int
+@fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * NPY_DATETIME_NAT is smaller than every other value, so we skip
+     * it for consistency with min().
+     */
+ npy_intp i;
+ @type@ mp = NPY_DATETIME_NAT;
+
+ i = 0;
+ while (i < n && mp == NPY_DATETIME_NAT) {
+ mp = ip[i];
+ i++;
+ }
+ if (i == n) {
+ /* All NaTs: return 0 */
+ *min_ind = 0;
+ return 0;
+ }
+ *min_ind = i - 1;
+ for (; i < n; i++) {
+ if (mp > ip[i] && ip[i] != NPY_DATETIME_NAT) {
+ mp = ip[i];
+ *min_ind = i;
+ }
+ }
+ return 0;
+}
+
+/**end repeat**/
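+/*
+ * Worked example (comment added for clarity): for the timedelta values
+ * [NaT, 5, 3] the loop above skips the leading NaT, starts from index 1,
+ * and returns *min_ind == 2.
+ */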
+
+static int
+OBJECT_argmax(PyObject **ip, npy_intp n, npy_intp *max_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+{
+ npy_intp i;
+
+ *max_ind = 0;
+ /* Skip over all leading NULL entries */
+ for (i = 0; i < n && ip[i] == NULL; ++i);
+ if (i < n) {
+ /* Found first non-NULL entry */
+ PyObject *mp = ip[i];
+ *max_ind = i;
+ for (i = i + 1; i < n; ++i) {
+ PyObject *val = ip[i];
+ if (val != NULL) {
+ int greater_than = PyObject_RichCompareBool(val, mp, Py_GT);
+
+ if (greater_than < 0) {
+ return 0;
+ }
+ if (greater_than) {
+ mp = val;
+ *max_ind = i;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**begin repeat
+ *
+ * #fname = STRING, UNICODE#
+ * #type = npy_char, npy_ucs4#
+ */
+static int
+@fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)
+{
+ npy_intp i;
+ int elsize = PyArray_DESCR(aip)->elsize;
+ @type@ *mp = (@type@ *)PyArray_malloc(elsize);
+
+ if (mp == NULL) {
+ return 0;
+ }
+ memcpy(mp, ip, elsize);
+ *max_ind = 0;
+ for (i = 1; i < n; i++) {
+ ip += elsize / sizeof(@type@);
+ if (@fname@_compare(ip, mp, aip) > 0) {
+ memcpy(mp, ip, elsize);
+ *max_ind = i;
+ }
+ }
+ PyArray_free(mp);
+ return 0;
+}
+
+/**end repeat**/
+
+#define VOID_argmax NULL
+
+static int
+OBJECT_argmin(PyObject **ip, npy_intp n, npy_intp *min_ind,
+ PyArrayObject *NPY_UNUSED(aip))
+{
+ npy_intp i;
+
+ *min_ind = 0;
+ /* Skip over all leading NULL entries */
+ for (i = 0; i < n && ip[i] == NULL; ++i);
+ if (i < n) {
+ /* Found first non-NULL entry */
+ PyObject *mp = ip[i];
+ *min_ind = i;
+ for (i = i + 1; i < n ; ++i) {
+ PyObject *val = ip[i];
+ if (val != NULL) {
+ int less_than = PyObject_RichCompareBool(val, mp, Py_LT);
+
+ if (less_than < 0) {
+ return 0;
+ }
+ if (less_than) {
+ mp = val;
+ *min_ind = i;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**begin repeat
+ *
+ * #fname = STRING, UNICODE#
+ * #type = npy_char, npy_ucs4#
+ */
+static int
+@fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind, PyArrayObject *aip)
+{
+ npy_intp i;
+ int elsize = PyArray_DESCR(aip)->elsize;
+ @type@ *mp = (@type@ *)PyArray_malloc(elsize);
+
+    if (mp == NULL) {
+        return 0;
+    }
+    memcpy(mp, ip, elsize);
+    *min_ind = 0;
+    for (i = 1; i < n; i++) {
+        ip += elsize / sizeof(@type@);
+        if (@fname@_compare(mp, ip, aip) > 0) {
+            memcpy(mp, ip, elsize);
+            *min_ind = i;
+        }
+    }
+ PyArray_free(mp);
+ return 0;
+}
+
+/**end repeat**/
+
+
+#define VOID_argmin NULL
+
+
+/*
+ *****************************************************************************
+ ** DOT **
+ *****************************************************************************
+ */
+
+/*
+ * dot means inner product
+ */
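+/*
+ * Sketch of the contract (comment added for clarity): each of the *_dot
+ * functions below computes, in effect,
+ *
+ *     op[0] = sum over i of ip1[i*is1] * ip2[i*is2]
+ *
+ * where is1 and is2 are byte strides, which is why the pointers are
+ * advanced as char pointers.
+ */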
+
+/************************** MAYBE USE CBLAS *********************************/
+
+
+/**begin repeat
+ *
+ * #name = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ * #prefix = s, d#
+ */
+NPY_NO_EXPORT void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+ npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+ int is1b = blas_stride(is1, sizeof(@type@));
+ int is2b = blas_stride(is2, sizeof(@type@));
+
+ if (is1b && is2b)
+ {
+ double sum = 0.; /* double for stability */
+
+ while (n > 0) {
+ int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+
+ sum += cblas_@prefix@dot(chunk,
+ (@type@ *) ip1, is1b,
+ (@type@ *) ip2, is2b);
+ /* use char strides here */
+ ip1 += chunk * is1;
+ ip2 += chunk * is2;
+ n -= chunk;
+ }
+ *((@type@ *)op) = (@type@)sum;
+ }
+ else
+#endif
+ {
+ @type@ sum = (@type@)0; /* could make this double */
+ npy_intp i;
+
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ const @type@ ip1r = *((@type@ *)ip1);
+ const @type@ ip2r = *((@type@ *)ip2);
+
+ sum += ip1r * ip2r;
+ }
+ *((@type@ *)op) = sum;
+ }
+}
+/**end repeat**/
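+/*
+ * Note (added for clarity; the exact chunk constant is defined elsewhere):
+ * the CBLAS branch above processes at most NPY_CBLAS_CHUNK elements per
+ * call, presumably because the legacy CBLAS interface takes an int length
+ * while npy_intp may exceed the int range.
+ */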
+
+/**begin repeat
+ *
+ * #name = CFLOAT, CDOUBLE#
+ * #ctype = npy_cfloat, npy_cdouble#
+ * #type = npy_float, npy_double#
+ * #prefix = c, z#
+ */
+NPY_NO_EXPORT void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+ char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+ int is1b = blas_stride(is1, sizeof(@ctype@));
+ int is2b = blas_stride(is2, sizeof(@ctype@));
+
+ if (is1b && is2b) {
+ double sum[2] = {0., 0.}; /* double for stability */
+
+ while (n > 0) {
+ int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+ @type@ tmp[2];
+
+            cblas_@prefix@dotu_sub(chunk, ip1, is1b, ip2, is2b, tmp);
+ sum[0] += (double)tmp[0];
+ sum[1] += (double)tmp[1];
+ /* use char strides here */
+ ip1 += chunk * is1;
+ ip2 += chunk * is2;
+ n -= chunk;
+ }
+ ((@type@ *)op)[0] = (@type@)sum[0];
+ ((@type@ *)op)[1] = (@type@)sum[1];
+ }
+ else
+#endif
+ {
+ @type@ sumr = (@type@)0.0;
+ @type@ sumi = (@type@)0.0;
+ npy_intp i;
+
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ const @type@ ip1r = ((@type@ *)ip1)[0];
+ const @type@ ip1i = ((@type@ *)ip1)[1];
+ const @type@ ip2r = ((@type@ *)ip2)[0];
+ const @type@ ip2i = ((@type@ *)ip2)[1];
+
+ sumr += ip1r * ip2r - ip1i * ip2i;
+ sumi += ip1r * ip2i + ip1i * ip2r;
+ }
+ ((@type@ *)op)[0] = sumr;
+ ((@type@ *)op)[1] = sumi;
+ }
+}
+
+/**end repeat**/
+
+/**************************** NO CBLAS VERSIONS *****************************/
+
+static void
+BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+ void *NPY_UNUSED(ignore))
+{
+ npy_bool tmp = NPY_FALSE;
+ npy_intp i;
+
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ if ((*((npy_bool *)ip1) != 0) && (*((npy_bool *)ip2) != 0)) {
+ tmp = NPY_TRUE;
+ break;
+ }
+ }
+ *((npy_bool *)op) = tmp;
+}
+
+/**begin repeat
+ *
+ * #name = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * LONGDOUBLE, DATETIME, TIMEDELTA#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_longdouble, npy_datetime, npy_timedelta#
+ * #out = npy_long, npy_ulong, npy_long, npy_ulong, npy_long, npy_ulong,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_longdouble, npy_datetime, npy_timedelta#
+ */
+static void
+@name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+ void *NPY_UNUSED(ignore))
+{
+ @out@ tmp = (@out@)0;
+ npy_intp i;
+
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ tmp += (@out@)(*((@type@ *)ip1)) *
+ (@out@)(*((@type@ *)ip2));
+ }
+ *((@type@ *)op) = (@type@) tmp;
+}
+/**end repeat**/
+
+static void
+HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+ npy_intp n, void *NPY_UNUSED(ignore))
+{
+ float tmp = 0.0f;
+ npy_intp i;
+
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ tmp += npy_half_to_float(*((npy_half *)ip1)) *
+ npy_half_to_float(*((npy_half *)ip2));
+ }
+ *((npy_half *)op) = npy_float_to_half(tmp);
+}
+
+static void
+CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+ char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+ npy_longdouble tmpr = 0.0L;
+ npy_longdouble tmpi = 0.0L;
+ npy_intp i;
+
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ const npy_longdouble ip1r = ((npy_longdouble *)ip1)[0];
+ const npy_longdouble ip1i = ((npy_longdouble *)ip1)[1];
+ const npy_longdouble ip2r = ((npy_longdouble *)ip2)[0];
+ const npy_longdouble ip2i = ((npy_longdouble *)ip2)[1];
+
+ tmpr += ip1r * ip2r - ip1i * ip2i;
+ tmpi += ip1r * ip2i + ip1i * ip2r;
+ }
+ ((npy_longdouble *)op)[0] = tmpr;
+ ((npy_longdouble *)op)[1] = tmpi;
+}
+
+static void
+OBJECT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+ void *NPY_UNUSED(ignore))
+{
+ /*
+ * ALIGNMENT NOTE: np.dot, np.inner etc. enforce that the array is
+ * BEHAVED before getting to this point, so unaligned pointers aren't
+ * handled here.
+ */
+ npy_intp i;
+ PyObject *tmp1, *tmp2, *tmp = NULL;
+ PyObject **tmp3;
+ for (i = 0; i < n; i++, ip1 += is1, ip2 += is2) {
+ if ((*((PyObject **)ip1) == NULL) || (*((PyObject **)ip2) == NULL)) {
+ tmp1 = Py_False;
+ Py_INCREF(Py_False);
+ }
+ else {
+ tmp1 = PyNumber_Multiply(*((PyObject **)ip1), *((PyObject **)ip2));
+ if (!tmp1) {
+ Py_XDECREF(tmp);
+ return;
+ }
+ }
+ if (i == 0) {
+ tmp = tmp1;
+ }
+ else {
+ tmp2 = PyNumber_Add(tmp, tmp1);
+ Py_XDECREF(tmp);
+ Py_XDECREF(tmp1);
+ if (!tmp2) {
+ return;
+ }
+ tmp = tmp2;
+ }
+ }
+ tmp3 = (PyObject**) op;
+ tmp2 = *tmp3;
+ *((PyObject **)op) = tmp;
+ Py_XDECREF(tmp2);
+}
+
+
+/*
+ *****************************************************************************
+ ** FILL **
+ *****************************************************************************
+ */
+
+
+#define BOOL_fill NULL
+
+/* this requires buffer to be filled with objects or NULL */
+static int
+OBJECT_fill(PyObject **buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+ int retval = 0;
+ npy_intp i;
+ PyObject *start = buffer[0];
+ PyObject *delta = buffer[1];
+ PyObject *second;
+
+ delta = PyNumber_Subtract(delta, start);
+ if (!delta) {
+ return -1;
+ }
+ second = start = PyNumber_Add(start, delta);
+ if (!start) {
+ goto error;
+ }
+ buffer += 2;
+
+ for (i = 2; i < length; i++, buffer++) {
+ start = PyNumber_Add(start, delta);
+ if (!start) {
+ goto error;
+ }
+ Py_XDECREF(*buffer);
+ *buffer = start;
+ }
+ goto finish;
+
+error:
+ retval = -1;
+
+finish:
+ Py_XDECREF(second);
+ Py_DECREF(delta);
+ return retval;
+}
+
+/**begin repeat
+ *
+ * #NAME = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+*/
+static int
+@NAME@_fill(@type@ *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+ npy_intp i;
+ @type@ start = buffer[0];
+ @type@ delta = buffer[1];
+
+ delta -= start;
+ for (i = 2; i < length; ++i) {
+ buffer[i] = start + i*delta;
+ }
+ return 0;
+}
+/**end repeat**/
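+/*
+ * Worked example (comment added for clarity): with buffer[0] = 2 and
+ * buffer[1] = 5, delta becomes 3 and the fill produces 2, 5, 8, 11, ...,
+ * i.e. buffer[i] = buffer[0] + i * (buffer[1] - buffer[0]).
+ */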
+
+static int
+HALF_fill(npy_half *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+ npy_intp i;
+ float start = npy_half_to_float(buffer[0]);
+ float delta = npy_half_to_float(buffer[1]);
+
+ delta -= start;
+ for (i = 2; i < length; ++i) {
+ buffer[i] = npy_float_to_half(start + i*delta);
+ }
+ return 0;
+}
+
+/**begin repeat
+ *
+ * #NAME = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+*/
+static int
+@NAME@_fill(@type@ *buffer, npy_intp length, void *NPY_UNUSED(ignore))
+{
+ npy_intp i;
+ @type@ start;
+ @type@ delta;
+
+ start.real = buffer->real;
+ start.imag = buffer->imag;
+ delta.real = buffer[1].real;
+ delta.imag = buffer[1].imag;
+ delta.real -= start.real;
+ delta.imag -= start.imag;
+ buffer += 2;
+ for (i = 2; i < length; i++, buffer++) {
+ buffer->real = start.real + i*delta.real;
+ buffer->imag = start.imag + i*delta.imag;
+ }
+ return 0;
+}
+/**end repeat**/
+
+
+/* this requires buffer to be filled with objects or NULL */
+static void
+OBJECT_fillwithscalar(PyObject **buffer, npy_intp length, PyObject **value,
+ void *NPY_UNUSED(ignored))
+{
+ npy_intp i;
+ PyObject *val = *value;
+ for (i = 0; i < length; i++) {
+ Py_XINCREF(val);
+ Py_XDECREF(buffer[i]);
+ buffer[i] = val;
+ }
+}
+/**begin repeat
+ *
+ * #NAME = BOOL, BYTE, UBYTE#
+ * #type = npy_bool, npy_byte, npy_ubyte#
+ */
+static void
+@NAME@_fillwithscalar(@type@ *buffer, npy_intp length, @type@ *value,
+ void *NPY_UNUSED(ignored))
+{
+ memset(buffer, *value, length);
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #NAME = SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_datetime, npy_timedelta#
+ */
+static void
+@NAME@_fillwithscalar(@type@ *buffer, npy_intp length, @type@ *value,
+ void *NPY_UNUSED(ignored))
+{
+ npy_intp i;
+ @type@ val = *value;
+
+ for (i = 0; i < length; ++i) {
+ buffer[i] = val;
+ }
+}
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** FASTCLIP **
+ *****************************************************************************
+ */
+
+#define _LESS_THAN(a, b) ((a) < (b))
+#define _GREATER_THAN(a, b) ((a) > (b))
+
+/*
+ * In fastclip, 'b' was already checked for NaN, so the half comparison
+ * only needs to check 'a' for NaN.
+ */
+
+#define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b))
+#define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a))
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_datetime, npy_timedelta#
+ * #isfloat = 0*11, 1*4, 0*2#
+ * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
+ * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
+ * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5#
+ */
+static void
+@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
+{
+ npy_intp i;
+ @type@ max_val = 0, min_val = 0;
+
+ if (max != NULL) {
+ max_val = *max;
+#if @isfloat@
+ /* NaNs result in no clipping, so optimize the case away */
+ if (@isnan@(max_val)) {
+ if (min == NULL) {
+ memmove(out, in, ni * sizeof(@type@));
+ return;
+ }
+ max = NULL;
+ }
+#endif
+ }
+ if (min != NULL) {
+ min_val = *min;
+#if @isfloat@
+ if (@isnan@(min_val)) {
+ if (max == NULL) {
+ memmove(out, in, ni * sizeof(@type@));
+ return;
+ }
+ min = NULL;
+ }
+#endif
+ }
+ if (max == NULL) {
+ for (i = 0; i < ni; i++) {
+ if (@lt@(in[i], min_val)) {
+ out[i] = min_val;
+ }
+ else {
+ out[i] = in[i];
+ }
+ }
+ }
+ else if (min == NULL) {
+ for (i = 0; i < ni; i++) {
+ if (@gt@(in[i], max_val)) {
+ out[i] = max_val;
+ }
+ else {
+ out[i] = in[i];
+ }
+ }
+ }
+ else {
+ /*
+ * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
+ * manner, see: https://github.com/numpy/numpy/issues/7601
+ */
+ #if (_MSC_VER == 1900)
+ #pragma loop( no_vector )
+ #endif
+ for (i = 0; i < ni; i++) {
+ if (@lt@(in[i], min_val)) {
+ out[i] = min_val;
+ }
+ else if (@gt@(in[i], max_val)) {
+ out[i] = max_val;
+ }
+ else {
+ out[i] = in[i];
+ }
+ }
+ }
+}
+/**end repeat**/
+
+#undef _LESS_THAN
+#undef _GREATER_THAN
+#undef _HALF_LESS_THAN
+#undef _HALF_GREATER_THAN
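+
+/*
+ * Worked example (editorial sketch) of the NaN handling above: clipping
+ * in = {1.0, 5.0, 9.0} with min = 3.0 and max = NaN first detects the NaN
+ * upper bound, drops it (max = NULL), and then applies only the lower bound,
+ * giving out = {3.0, 5.0, 9.0}.  If both bounds are NaN, the input is simply
+ * copied through with memmove.
+ */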
+
+/**begin repeat
+ *
+ * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+static void
+@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
+{
+ npy_intp i;
+ @type@ max_val, min_val;
+
+ if (max != NULL) {
+ max_val = *max;
+ }
+ if (min != NULL) {
+ min_val = *min;
+ }
+ if (max == NULL) {
+ for (i = 0; i < ni; i++) {
+ if (PyArray_CLT(in[i],min_val)) {
+ out[i] = min_val;
+ }
+ else {
+ out[i] = in[i];
+ }
+ }
+ }
+ else if (min == NULL) {
+ for (i = 0; i < ni; i++) {
+ if (PyArray_CGT(in[i], max_val)) {
+ out[i] = max_val;
+ }
+ else {
+ out[i] = in[i];
+ }
+ }
+ }
+ else {
+ for (i = 0; i < ni; i++) {
+ if (PyArray_CLT(in[i], min_val)) {
+ out[i] = min_val;
+ }
+ else if (PyArray_CGT(in[i], max_val)) {
+ out[i] = max_val;
+ }
+ else {
+ out[i] = in[i];
+ }
+ }
+ }
+}
+
+/**end repeat**/
+
+#define OBJECT_fastclip NULL
+
+
+/*
+ *****************************************************************************
+ ** FASTPUTMASK **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_datetime, npy_timedelta#
+*/
+static void
+@name@_fastputmask(@type@ *in, npy_bool *mask, npy_intp ni, @type@ *vals,
+ npy_intp nv)
+{
+ npy_intp i, j;
+
+ if (nv == 1) {
+ @type@ s_val = *vals;
+ for (i = 0; i < ni; i++) {
+ if (mask[i]) {
+ in[i] = s_val;
+ }
+ }
+ }
+ else {
+ for (i = 0, j = 0; i < ni; i++, j++) {
+ if (j >= nv) {
+ j = 0;
+ }
+ if (mask[i]) {
+ in[i] = vals[j];
+ }
+ }
+ }
+ return;
+}
+/**end repeat**/
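+
+/*
+ * Worked example (editorial sketch): fastputmask cycles through 'vals' in
+ * lockstep with the data index, whether or not the mask selects an element.
+ * With in = {0, 0, 0, 0, 0}, mask = {1, 0, 1, 1, 0} and vals = {7, 8}
+ * (nv == 2), j runs 0, 1, 0, 1, 0 and the result is {7, 0, 7, 8, 0},
+ * matching the repeating behaviour documented for np.putmask's 'values'
+ * argument.
+ */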
+
+#define OBJECT_fastputmask NULL
+
+
+/*
+ *****************************************************************************
+ ** FASTTAKE **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_datetime, npy_timedelta#
+*/
+static int
+@name@_fasttake(@type@ *dest, @type@ *src, npy_intp *indarray,
+ npy_intp nindarray, npy_intp n_outer,
+ npy_intp m_middle, npy_intp nelem,
+ NPY_CLIPMODE clipmode)
+{
+ npy_intp i, j, k, tmp;
+ NPY_BEGIN_THREADS_DEF;
+
+ NPY_BEGIN_THREADS;
+
+ switch(clipmode) {
+ case NPY_RAISE:
+ for (i = 0; i < n_outer; i++) {
+ for (j = 0; j < m_middle; j++) {
+ tmp = indarray[j];
+ /*
+ * We don't know what axis we're operating on,
+ * so don't report it in case of an error.
+ */
+ if (check_and_adjust_index(&tmp, nindarray, -1, _save) < 0) {
+ return 1;
+ }
+ if (NPY_LIKELY(nelem == 1)) {
+ *dest++ = *(src + tmp);
+ }
+ else {
+ for (k = 0; k < nelem; k++) {
+ *dest++ = *(src + tmp*nelem + k);
+ }
+ }
+ }
+ src += nelem*nindarray;
+ }
+ break;
+ case NPY_WRAP:
+ for (i = 0; i < n_outer; i++) {
+ for (j = 0; j < m_middle; j++) {
+ tmp = indarray[j];
+ if (tmp < 0) {
+ while (tmp < 0) {
+ tmp += nindarray;
+ }
+ }
+ else if (tmp >= nindarray) {
+ while (tmp >= nindarray) {
+ tmp -= nindarray;
+ }
+ }
+ if (NPY_LIKELY(nelem == 1)) {
+ *dest++ = *(src+tmp);
+ }
+ else {
+ for (k = 0; k < nelem; k++) {
+ *dest++ = *(src+tmp*nelem+k);
+ }
+ }
+ }
+ src += nelem*nindarray;
+ }
+ break;
+ case NPY_CLIP:
+ for (i = 0; i < n_outer; i++) {
+ for (j = 0; j < m_middle; j++) {
+ tmp = indarray[j];
+ if (tmp < 0) {
+ tmp = 0;
+ }
+ else if (tmp >= nindarray) {
+ tmp = nindarray - 1;
+ }
+ if (NPY_LIKELY(nelem == 1)) {
+ *dest++ = *(src + tmp);
+ }
+ else {
+ for (k = 0; k < nelem; k++) {
+ *dest++ = *(src + tmp*nelem + k);
+ }
+ }
+ }
+ src += nelem*nindarray;
+ }
+ break;
+ }
+
+ NPY_END_THREADS;
+ return 0;
+}
+/**end repeat**/
+
+#define OBJECT_fasttake NULL
+
+/*
+ *****************************************************************************
+ ** small correlate **
+ *****************************************************************************
+ */
+
+/*
+ * Compute the correlation of data with small kernels.
+ * Calling a BLAS dot product for the inner loop of the correlation is
+ * overkill for small kernels; it is faster to compute it directly.
+ * Intended to be used by _pyarray_correlate, so no input verification is
+ * done; in particular it does not handle the boundaries, which should be
+ * handled by the caller.
+ * Returns 0 if the kernel is considered too large or the types are not
+ * supported, in which case the regular array dot should be used to process
+ * the data.
+ *
+ * d_, dstride, nd, dtype: data pointer, its stride in bytes, number of
+ * elements and type of data
+ * k_, kstride, nk, ktype: kernel pointer, its stride in bytes, number of
+ * elements and type of data
+ * out_, ostride: output data pointer and its stride in bytes
+ */
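+/*
+ * Worked example (editorial sketch): with dtype == ktype == NPY_DOUBLE,
+ * unit strides, d pointing at {1, 2, 3, 4} with nd = 3 (the caller
+ * guarantees the extra trailing element) and k = {1, 1} (nk = 2), the
+ * unrolled kernel below computes
+ *     out[i] = d[i]*k1 + d[i+1]*k2
+ * giving out = {3, 5, 7}.  Boundary handling is left to the caller, as
+ * noted above.
+ */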
+NPY_NO_EXPORT int
+small_correlate(const char * d_, npy_intp dstride,
+ npy_intp nd, enum NPY_TYPES dtype,
+ const char * k_, npy_intp kstride,
+ npy_intp nk, enum NPY_TYPES ktype,
+ char * out_, npy_intp ostride)
+{
+ /* only handle small kernels and uniform types */
+ if (nk > 11 || dtype != ktype) {
+ return 0;
+ }
+
+ switch (dtype) {
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double#
+ * #TYPE = NPY_FLOAT, NPY_DOUBLE#
+ */
+ case @TYPE@:
+ {
+ npy_intp i;
+ const @type@ * d = (@type@*)d_;
+ const @type@ * k = (@type@*)k_;
+ @type@ * out = (@type@*)out_;
+ dstride /= sizeof(@type@);
+ kstride /= sizeof(@type@);
+ ostride /= sizeof(@type@);
+ /* unroll inner loop to optimize register usage of the kernel*/
+ switch (nk) {
+/**begin repeat1
+ * #ksz_outer = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11# */
+ case @ksz_outer@:
+ {
+/**begin repeat2
+ * #ksz = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11# */
+#if @ksz@ <= @ksz_outer@
+ /* load kernel */
+ const @type@ k@ksz@ = k[(@ksz@ - 1) * kstride];
+#endif
+/**end repeat2**/
+ for (i = 0; i < nd; i++) {
+ @type@ s = 0;
+/**begin repeat2
+ * #ksz = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11# */
+#if @ksz@ <= @ksz_outer@
+ s += d[(i + @ksz@ - 1) * dstride] * k@ksz@;
+#endif
+/**end repeat2**/
+ out[i * ostride] = s;
+ }
+ return 1;
+ }
+/**end repeat1**/
+ default:
+ return 0;
+ }
+ }
+/**end repeat**/
+ default:
+ return 0;
+ }
+}
+
+/*
+*/
+
+/* A clone function for the datetime dtype c_metadata */
+static NpyAuxData *
+_datetime_dtype_metadata_clone(NpyAuxData *data)
+{
+ PyArray_DatetimeDTypeMetaData *newdata =
+ (PyArray_DatetimeDTypeMetaData *)PyArray_malloc(
+ sizeof(*newdata));
+ if (newdata == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ memcpy(newdata, data, sizeof(*newdata));
+
+ return (NpyAuxData *)newdata;
+}
+
+/*
+ * Allocate and initialize a PyArray_DatetimeDTypeMetaData object
+ */
+static NpyAuxData*
+_create_datetime_metadata(NPY_DATETIMEUNIT base, int num)
+{
+ PyArray_DatetimeDTypeMetaData *data;
+
+ /* Allocate memory for the metadata */
+ data = PyArray_malloc(sizeof(*data));
+ if (data == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ /* Initialize the base aux data */
+ memset(data, 0, sizeof(PyArray_DatetimeDTypeMetaData));
+ data->base.free = (NpyAuxData_FreeFunc *)PyArray_free;
+ data->base.clone = _datetime_dtype_metadata_clone;
+
+ data->meta.base = base;
+ data->meta.num = num;
+
+ return (NpyAuxData*)data;
+}
+
+
+/*
+ *****************************************************************************
+ ** SETUP FUNCTION POINTERS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ *
+ * #from = VOID, STRING, UNICODE#
+ * #suff = void, string, unicode#
+ * #sort = 0, 1, 1#
+ * #align = char, char, npy_ucs4#
+ * #NAME = Void, String, Unicode#
+ * #endian = |, |, =#
+ * #flags = 0, 0, NPY_NEEDS_INIT#
+ */
+static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
+ {
+ @from@_to_BOOL,
+ @from@_to_BYTE,
+ @from@_to_UBYTE,
+ @from@_to_SHORT,
+ @from@_to_USHORT,
+ @from@_to_INT,
+ @from@_to_UINT,
+ @from@_to_LONG,
+ @from@_to_ULONG,
+ @from@_to_LONGLONG,
+ @from@_to_ULONGLONG,
+ @from@_to_FLOAT,
+ @from@_to_DOUBLE,
+ @from@_to_LONGDOUBLE,
+ @from@_to_CFLOAT,
+ @from@_to_CDOUBLE,
+ @from@_to_CLONGDOUBLE,
+ @from@_to_OBJECT,
+ @from@_to_STRING,
+ @from@_to_UNICODE,
+ @from@_to_VOID
+ },
+ @from@_getitem,
+ @from@_setitem,
+ (PyArray_CopySwapNFunc*)@from@_copyswapn,
+ (PyArray_CopySwapFunc*)@from@_copyswap,
+ (PyArray_CompareFunc*)@from@_compare,
+ (PyArray_ArgFunc*)@from@_argmax,
+ (PyArray_DotFunc*)NULL,
+ (PyArray_ScanFunc*)@from@_scan,
+ @from@_fromstr,
+ (PyArray_NonzeroFunc*)@from@_nonzero,
+ (PyArray_FillFunc*)NULL,
+ (PyArray_FillWithScalarFunc*)NULL,
+#if @sort@
+ {
+ quicksort_@suff@,
+ heapsort_@suff@,
+ mergesort_@suff@
+ },
+ {
+ aquicksort_@suff@,
+ aheapsort_@suff@,
+ amergesort_@suff@
+ },
+#else
+ {
+ NULL, NULL, NULL
+ },
+ {
+ NULL, NULL, NULL
+ },
+#endif
+ NULL,
+ (PyArray_ScalarKindFunc*)NULL,
+ NULL,
+ NULL,
+ (PyArray_FastClipFunc *)NULL,
+ (PyArray_FastPutmaskFunc *)NULL,
+ (PyArray_FastTakeFunc *)NULL,
+ (PyArray_ArgFunc*)@from@_argmin
+};
+
+/*
+ * FIXME: check for PY3K
+ */
+static PyArray_Descr @from@_Descr = {
+ PyObject_HEAD_INIT(&PyArrayDescr_Type)
+ /* typeobj */
+ &Py@NAME@ArrType_Type,
+ /* kind */
+ NPY_@from@LTR,
+ /* type */
+ NPY_@from@LTR,
+ /* byteorder */
+ '@endian@',
+ /* flags, unicode needs init as py3.3 does not like printing garbage */
+ @flags@,
+ /* type_num */
+ NPY_@from@,
+ /* elsize */
+ 0,
+ /* alignment */
+ _ALIGN(@align@),
+ /* subarray */
+ NULL,
+ /* fields */
+ NULL,
+ /* names */
+ NULL,
+ /* f */
+ &_Py@NAME@_ArrFuncs,
+ /* metadata */
+ NULL,
+ /* c_metadata */
+ NULL,
+ /* hash */
+ -1,
+};
+
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #from = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, DATETIME, TIMEDELTA#
+ * #suff = bool,
+ * byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, datetime, timedelta#
+ * #sort = 1*18, 0*1, 1*2#
+ * #fromtype = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * PyObject *, npy_datetime, npy_timedelta#
+ * #NAME = Bool,
+ * Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble,
+ * Object, Datetime, Timedelta#
+ * #kind = GENBOOL,
+ * SIGNED, UNSIGNED, SIGNED, UNSIGNED, SIGNED, UNSIGNED,
+ * SIGNED, UNSIGNED, SIGNED, UNSIGNED,
+ * FLOATING, FLOATING, FLOATING, FLOATING,
+ * COMPLEX, COMPLEX, COMPLEX,
+ * OBJECT, DATETIME, TIMEDELTA#
+ * #endian = |*3, =*15, |, =*2#
+ * #isobject= 0*18,NPY_OBJECT_DTYPE_FLAGS,0*2#
+ */
+
+static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
+ {
+ @from@_to_BOOL,
+ @from@_to_BYTE,
+ @from@_to_UBYTE,
+ @from@_to_SHORT,
+ @from@_to_USHORT,
+ @from@_to_INT,
+ @from@_to_UINT,
+ @from@_to_LONG,
+ @from@_to_ULONG,
+ @from@_to_LONGLONG,
+ @from@_to_ULONGLONG,
+ @from@_to_FLOAT,
+ @from@_to_DOUBLE,
+ @from@_to_LONGDOUBLE,
+ @from@_to_CFLOAT,
+ @from@_to_CDOUBLE,
+ @from@_to_CLONGDOUBLE,
+ @from@_to_OBJECT,
+ @from@_to_STRING,
+ @from@_to_UNICODE,
+ @from@_to_VOID
+ },
+ @from@_getitem,
+ @from@_setitem,
+ (PyArray_CopySwapNFunc*)@from@_copyswapn,
+ (PyArray_CopySwapFunc*)@from@_copyswap,
+ (PyArray_CompareFunc*)@from@_compare,
+ (PyArray_ArgFunc*)@from@_argmax,
+ (PyArray_DotFunc*)@from@_dot,
+ (PyArray_ScanFunc*)@from@_scan,
+ @from@_fromstr,
+ (PyArray_NonzeroFunc*)@from@_nonzero,
+ (PyArray_FillFunc*)@from@_fill,
+ (PyArray_FillWithScalarFunc*)@from@_fillwithscalar,
+#if @sort@
+ {
+ quicksort_@suff@,
+ heapsort_@suff@,
+ mergesort_@suff@
+ },
+ {
+ aquicksort_@suff@,
+ aheapsort_@suff@,
+ amergesort_@suff@
+ },
+#else
+ {
+ NULL, NULL, NULL
+ },
+ {
+ NULL, NULL, NULL
+ },
+#endif
+ NULL,
+ (PyArray_ScalarKindFunc*)NULL,
+ NULL,
+ NULL,
+ (PyArray_FastClipFunc*)@from@_fastclip,
+ (PyArray_FastPutmaskFunc*)@from@_fastputmask,
+ (PyArray_FastTakeFunc*)@from@_fasttake,
+ (PyArray_ArgFunc*)@from@_argmin
+};
+
+/*
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr @from@_Descr = {
+ PyObject_HEAD_INIT(&PyArrayDescr_Type)
+ /* typeobj */
+ &Py@NAME@ArrType_Type,
+ /* kind */
+ NPY_@kind@LTR,
+ /* type */
+ NPY_@from@LTR,
+ /* byteorder */
+ '@endian@',
+ /* flags */
+ @isobject@,
+ /* type_num */
+ NPY_@from@,
+ /* elsize */
+ sizeof(@fromtype@),
+ /* alignment */
+ _ALIGN(@fromtype@),
+ /* subarray */
+ NULL,
+ /* fields */
+ NULL,
+ /* names */
+ NULL,
+ /* f */
+ &_Py@NAME@_ArrFuncs,
+ /* metadata */
+ NULL,
+ /* c_metadata */
+ NULL,
+ /* hash */
+ -1,
+};
+
+/**end repeat**/
+
+#define _MAX_LETTER 128
+static char _letter_to_num[_MAX_LETTER];
+
+static PyArray_Descr *_builtin_descrs[] = {
+ &BOOL_Descr,
+ &BYTE_Descr,
+ &UBYTE_Descr,
+ &SHORT_Descr,
+ &USHORT_Descr,
+ &INT_Descr,
+ &UINT_Descr,
+ &LONG_Descr,
+ &ULONG_Descr,
+ &LONGLONG_Descr,
+ &ULONGLONG_Descr,
+ &FLOAT_Descr,
+ &DOUBLE_Descr,
+ &LONGDOUBLE_Descr,
+ &CFLOAT_Descr,
+ &CDOUBLE_Descr,
+ &CLONGDOUBLE_Descr,
+ &OBJECT_Descr,
+ &STRING_Descr,
+ &UNICODE_Descr,
+ &VOID_Descr,
+ &DATETIME_Descr,
+ &TIMEDELTA_Descr,
+ &HALF_Descr
+};
+
+/*NUMPY_API
+ * Get the PyArray_Descr structure for a type.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_DescrFromType(int type)
+{
+ PyArray_Descr *ret = NULL;
+
+ if (type < NPY_NTYPES) {
+ ret = _builtin_descrs[type];
+ }
+ else if (type == NPY_NOTYPE) {
+ /*
+ * This needs to not raise an error so
+ * that PyArray_DescrFromType(NPY_NOTYPE)
+ * works for backwards-compatible C-API
+ */
+ return NULL;
+ }
+ else if ((type == NPY_CHAR) || (type == NPY_CHARLTR)) {
+ if (type == NPY_CHAR) {
+ /*
+ * warning added 2017-04-25, 1.13
+ * deprecated in 1.7
+ * */
+ if (DEPRECATE("The NPY_CHAR type_num is deprecated. "
+ "Please port your code to use "
+ "NPY_STRING instead.") < 0) {
+ return NULL;
+ }
+ }
+ ret = PyArray_DescrNew(_builtin_descrs[NPY_STRING]);
+ if (ret == NULL) {
+ return NULL;
+ }
+ ret->elsize = 1;
+ ret->type = NPY_CHARLTR;
+ return ret;
+ }
+ else if (PyTypeNum_ISUSERDEF(type)) {
+ ret = userdescrs[type - NPY_USERDEF];
+ }
+ else {
+ int num = NPY_NTYPES;
+ if (type < _MAX_LETTER) {
+ num = (int) _letter_to_num[type];
+ }
+ if (num >= NPY_NTYPES) {
+ ret = NULL;
+ }
+ else {
+ ret = _builtin_descrs[num];
+ }
+ }
+ if (ret == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Invalid data-type for array");
+ }
+ else {
+ Py_INCREF(ret);
+ }
+
+ return ret;
+}
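+
+/*
+ * Usage sketch (editorial, not part of the original source): the returned
+ * descriptor is a new reference that the caller must release, e.g.
+ *
+ *     PyArray_Descr *d = PyArray_DescrFromType(NPY_DOUBLE);
+ *     if (d == NULL) {
+ *         return NULL;   // error set, unless the type was NPY_NOTYPE
+ *     }
+ *     ... use d ...
+ *     Py_DECREF(d);
+ */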
+
+/*
+ *****************************************************************************
+ ** SETUP TYPE INFO **
+ *****************************************************************************
+ */
+
+
+/*
+ * This function is called during numpy module initialization,
+ * and is used to initialize internal dtype tables.
+ */
+NPY_NO_EXPORT int
+set_typeinfo(PyObject *dict)
+{
+ PyObject *infodict, *s;
+ int i;
+
+ PyArray_Descr *dtype;
+ PyObject *cobj, *key;
+
+ /*
+ * Add cast functions for the new types
+ */
+
+ /**begin repeat
+ *
+ * #name1 = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, STRING, UNICODE, VOID,
+ * DATETIME,TIMEDELTA#
+ */
+
+ /**begin repeat1
+ *
+ * #name2 = HALF, DATETIME, TIMEDELTA#
+ */
+
+ dtype = _builtin_descrs[NPY_@name1@];
+ if (dtype->f->castdict == NULL) {
+ dtype->f->castdict = PyDict_New();
+ if (dtype->f->castdict == NULL) {
+ return -1;
+ }
+ }
+ key = PyInt_FromLong(NPY_@name2@);
+ if (key == NULL) {
+ return -1;
+ }
+ cobj = NpyCapsule_FromVoidPtr((void *)@name1@_to_@name2@, NULL);
+ if (cobj == NULL) {
+ Py_DECREF(key);
+ return -1;
+ }
+ if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+ Py_DECREF(key);
+ Py_DECREF(cobj);
+ return -1;
+ }
+ Py_DECREF(key);
+ Py_DECREF(cobj);
+
+ /**end repeat1**/
+
+ /**end repeat**/
+
+ _builtin_descrs[NPY_DATETIME]->c_metadata = _create_datetime_metadata(
+ NPY_DATETIME_DEFAULTUNIT, 1);
+ if (_builtin_descrs[NPY_DATETIME]->c_metadata == NULL) {
+ return -1;
+ }
+ _builtin_descrs[NPY_TIMEDELTA]->c_metadata = _create_datetime_metadata(
+ NPY_DATETIME_DEFAULTUNIT, 1);
+    if (_builtin_descrs[NPY_TIMEDELTA]->c_metadata == NULL) {
+ return -1;
+ }
+
+ for (i = 0; i < _MAX_LETTER; i++) {
+ _letter_to_num[i] = NPY_NTYPES;
+ }
+
+ /**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * INTP, UINTP,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, STRING, UNICODE, VOID,
+ * DATETIME,TIMEDELTA#
+ */
+
+ _letter_to_num[NPY_@name@LTR] = NPY_@name@;
+
+ /**end repeat**/
+
+ _letter_to_num[NPY_STRINGLTR2] = NPY_STRING;
+
+ /**begin repeat
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, STRING, UNICODE, VOID,
+ * DATETIME, TIMEDELTA#
+ */
+
+ @name@_Descr.fields = Py_None;
+
+ /**end repeat**/
+
+
+ /**begin repeat
+ * #name = STRING, UNICODE, VOID#
+ */
+
+ PyDataType_MAKEUNSIZED(&@name@_Descr);
+
+ /**end repeat**/
+
+ /* Set a dictionary with type information */
+ infodict = PyDict_New();
+ if (infodict == NULL) return -1;
+
+
+ /**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * INTP, UINTP,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #uname = BOOL,
+ * BYTE*2, SHORT*2, INT*2,
+ * INTP*2,
+ * LONG*2, LONGLONG*2#
+ * #Name = Bool,
+ * Byte, UByte, Short, UShort, Int, UInt,
+ * Intp, UIntp,
+ * Long, ULong, LongLong, ULongLong#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_intp, npy_uintp,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #max= 1,
+ * NPY_MAX_BYTE, NPY_MAX_UBYTE, NPY_MAX_SHORT,
+ * NPY_MAX_USHORT, NPY_MAX_INT, PyLong_FromUnsignedLong(NPY_MAX_UINT),
+ * PyLong_FromLongLong((npy_longlong) NPY_MAX_INTP),
+ * PyLong_FromUnsignedLongLong((npy_ulonglong) NPY_MAX_UINTP),
+ * NPY_MAX_LONG,
+ * PyLong_FromUnsignedLong((npy_ulong) NPY_MAX_ULONG),
+ * PyLong_FromLongLong((npy_longlong) NPY_MAX_LONGLONG),
+ * PyLong_FromUnsignedLongLong((npy_ulonglong) NPY_MAX_ULONGLONG)#
+ * #min = 0, NPY_MIN_BYTE, 0, NPY_MIN_SHORT, 0, NPY_MIN_INT, 0,
+ * PyLong_FromLongLong((npy_longlong) NPY_MIN_INTP),
+ * 0, NPY_MIN_LONG, 0,
+ * PyLong_FromLongLong((npy_longlong) NPY_MIN_LONGLONG), 0#
+ * #cx = i*6, N, N, N, l, N, N, N#
+ * #cn = i*7, N, i, l, i, N, i#
+ */
+
+ s = PyArray_typeinforanged(
+ NPY_@name@LTR, NPY_@name@, NPY_BITSOF_@uname@, _ALIGN(@type@),
+ Py_BuildValue("@cx@", @max@),
+ Py_BuildValue("@cn@", @min@),
+ &Py@Name@ArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "@name@", s);
+ Py_DECREF(s);
+
+
+ /**end repeat**/
+
+
+ /**begin repeat
+ *
+ * #type = npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #name = HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #Name = Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ */
+ s = PyArray_typeinfo(
+ NPY_@name@LTR, NPY_@name@, NPY_BITSOF_@name@,
+ _ALIGN(@type@), &Py@Name@ArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "@name@", s);
+ Py_DECREF(s);
+
+ /**end repeat**/
+
+ s = PyArray_typeinfo(
+ NPY_OBJECTLTR, NPY_OBJECT, sizeof(PyObject *) * CHAR_BIT,
+ _ALIGN(PyObject *),
+ &PyObjectArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "OBJECT", s);
+ Py_DECREF(s);
+ s = PyArray_typeinfo(
+ NPY_STRINGLTR, NPY_STRING, 0, _ALIGN(char),
+ &PyStringArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "STRING", s);
+ Py_DECREF(s);
+ s = PyArray_typeinfo(
+ NPY_UNICODELTR, NPY_UNICODE, 0, _ALIGN(npy_ucs4),
+ &PyUnicodeArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "UNICODE", s);
+ Py_DECREF(s);
+ s = PyArray_typeinfo(
+ NPY_VOIDLTR, NPY_VOID, 0, _ALIGN(char),
+ &PyVoidArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "VOID", s);
+ Py_DECREF(s);
+ s = PyArray_typeinforanged(
+ NPY_DATETIMELTR, NPY_DATETIME, NPY_BITSOF_DATETIME,
+ _ALIGN(npy_datetime),
+ MyPyLong_FromInt64(NPY_MAX_DATETIME),
+ MyPyLong_FromInt64(NPY_MIN_DATETIME),
+ &PyDatetimeArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "DATETIME", s);
+ Py_DECREF(s);
+ s = PyArray_typeinforanged(
+ NPY_TIMEDELTALTR, NPY_TIMEDELTA, NPY_BITSOF_TIMEDELTA,
+ _ALIGN(npy_timedelta),
+ MyPyLong_FromInt64(NPY_MAX_TIMEDELTA),
+ MyPyLong_FromInt64(NPY_MIN_TIMEDELTA),
+ &PyTimedeltaArrType_Type
+ );
+ if (s == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(infodict, "TIMEDELTA", s);
+ Py_DECREF(s);
+
+#define SETTYPE(name) \
+ Py_INCREF(&Py##name##ArrType_Type); \
+ PyDict_SetItemString(infodict, #name, \
+ (PyObject *)&Py##name##ArrType_Type)
+
+ SETTYPE(Generic);
+ SETTYPE(Number);
+ SETTYPE(Integer);
+ SETTYPE(Inexact);
+ SETTYPE(SignedInteger);
+ SETTYPE(UnsignedInteger);
+ SETTYPE(Floating);
+ SETTYPE(ComplexFloating);
+ SETTYPE(Flexible);
+ SETTYPE(Character);
+
+#undef SETTYPE
+
+ PyDict_SetItemString(dict, "typeinfo", infodict);
+ Py_DECREF(infodict);
+ return 0;
+}
+
+#undef _MAX_LETTER
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/einsum.c.src b/contrib/python/numpy/py2/numpy/core/src/multiarray/einsum.c.src
new file mode 100644
index 00000000000..58af4409192
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/einsum.c.src
@@ -0,0 +1,2890 @@
+/*
+ * This file contains the implementation of the 'einsum' function,
+ * which provides an einstein-summation operation.
+ *
+ * Copyright (c) 2011 by Mark Wiebe ([email protected])
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <numpy/npy_common.h>
+#include <numpy/arrayobject.h>
+#include <numpy/halffloat.h>
+#include <npy_pycompat.h>
+
+#include <ctype.h>
+
+#include "convert.h"
+#include "common.h"
+#include "ctors.h"
+
+#ifdef NPY_HAVE_SSE_INTRINSICS
+#define EINSUM_USE_SSE1 1
+#else
+#define EINSUM_USE_SSE1 0
+#endif
+
+/*
+ * TODO: Only some SSE2 for float64 is implemented.
+ */
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+#define EINSUM_USE_SSE2 1
+#else
+#define EINSUM_USE_SSE2 0
+#endif
+
+#if EINSUM_USE_SSE1
+#include <xmmintrin.h>
+#endif
+
+#if EINSUM_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
+
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_EINSUM_DBG_TRACING 0
+
+#if NPY_EINSUM_DBG_TRACING
+#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
+#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s, p1, p2, p3);
+#else
+#define NPY_EINSUM_DBG_PRINT(s)
+#define NPY_EINSUM_DBG_PRINT1(s, p1)
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
+#endif
+/**********************************************/
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_float, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #to = ,,,,,
+ * ,,,,,
+ * npy_float_to_half,,,,
+ * ,,#
+ * #from = ,,,,,
+ * ,,,,,
+ * npy_half_to_float,,,,
+ * ,,#
+ * #complex = 0*5,
+ * 0*5,
+ * 0*4,
+ * 1*3#
+ * #float32 = 0*5,
+ * 0*5,
+ * 0,1,0,0,
+ * 0*3#
+ * #float64 = 0*5,
+ * 0*5,
+ * 0,0,1,0,
+ * 0*3#
+ */
+
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+static void
+@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data_out += stride_out;
+# elif @nop@ == 2
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+# elif @nop@ == 3
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
+ ((@temptype@ *)data_out)[0];
+ ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
+ ((@temptype@ *)data_out)[1];
+ data0 += stride0;
+ data_out += stride_out;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+}
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_one(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data_out = (@type@ *)dataptr[1];
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else /* complex */
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 2 && !@complex@
+
+static void
+@name@_sum_of_products_contig_two(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
+ EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
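+
+/*
+ * Editorial note on the specialization names that follow: the tokens encode
+ * the memory pattern of each operand in order, so "stride0_contig_outcontig"
+ * for the two-operand case means operand 0 is a single broadcast scalar
+ * (stride 0), operand 1 is contiguous, and the output is contiguous.  The
+ * inner loop then reduces to out[i] += value0 * data1[i], which is the form
+ * the SSE paths below vectorize.
+ */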
+
+/* Some extra specializations for the two operand case */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value0_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value0_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value0_sse = _mm_set_ps1(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value0_sse = _mm_set1_pd(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value1_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value1_sse = _mm_set_ps1(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 3 && !@complex@
+
+static void
+@name@_sum_of_products_contig_three(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data2 = (@type@ *)dataptr[2];
+ @type@ *data_out = (@type@ *)dataptr[3];
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data2 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ if (count-- == 0) {
+ return;
+ }
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+#else /* @nop@ > 3 || @complex */
+
+static void
+@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+#else /* complex */
+# if @nop@ <= 3
+# define _SUMPROD_NOP @nop@
+# else
+# define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+# undef _SUMPROD_NOP
+#endif
+ }
+}
+
+#endif /* functions for various @nop@ */
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+ @temptype@ *data0 = (@temptype@ *)dataptr[0];
+#else
+ @temptype@ accum = 0;
+ @type@ *data0 = (@type@ *)dataptr[0];
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ accum += @from@(data0[@i@]);
+#else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+#endif
+/**end repeat2**/
+ case 0:
+#if @complex@
+ ((@temptype@ *)dataptr[1])[0] += accum_re;
+ ((@temptype@ *)dataptr[1])[1] += accum_im;
+#else
+ *((@type@ *)dataptr[1]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[1])));
+#endif
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+# if !@complex@
+ accum += @from@(data0[@i@]);
+# else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+# endif
+/**end repeat2**/
+#endif
+
+#if !@complex@
+ data0 += 8;
+#else
+ data0 += 8*2;
+#endif
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#endif /* @nop@ == 1 */
+
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+#else
+ @temptype@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ accum += @from@(*(@type@ *)data0);
+ data0 += stride0;
+# elif @nop@ == 2
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1);
+ data0 += stride0;
+ data1 += stride1;
+# elif @nop@ == 3
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2);
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ accum += temp;
+ for (i = 0; i < nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ accum_re += ((@temptype@ *)data0)[0];
+ accum_im += ((@temptype@ *)data0)[1];
+ data0 += stride0;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ accum_re += re;
+ accum_im += im;
+ for (i = 0; i < _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+
+#if @complex@
+# if @nop@ <= 3
+ ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
+ ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
+# else
+ ((@temptype@ *)dataptr[nop])[0] += accum_re;
+ ((@temptype@ *)dataptr[nop])[1] += accum_im;
+# endif
+#else
+# if @nop@ <= 3
+ *((@type@ *)dataptr[@nop@]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[@nop@])));
+# else
+ *((@type@ *)dataptr[nop]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[nop])));
+# endif
+#endif
+
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/* Do OR of ANDs for the boolean type */
+
+/**begin repeat
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+
+static void
+bool_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ *(npy_bool *)data_out = *(npy_bool *)data0 ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data_out += stride_out;
+#elif @nop@ == 2
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+#elif @nop@ == 3
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+}
+
+static void
+bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+#endif
+
+#if (@nop@ <= 3)
+/* This is placed before the main loop to make small counts faster */
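+/*
+ * The switch cases below intentionally fall through (no breaks): entering at
+ * 'case count' handles elements count-1 down to 0, and 'case 0' returns.
+ */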
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat1
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+# if @nop@ == 1
+ ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 2
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 3
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@] &&
+ ((npy_bool *)data2)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# endif
+/**end repeat1**/
+ case 0:
+ return;
+ }
+#endif
+
+/* Unroll the loop by 8 for fixed-size nop */
+#if (@nop@ <= 3)
+ while (count >= 8) {
+ count -= 8;
+#else
+ while (count--) {
+#endif
+
+# if @nop@ == 1
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 2
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 3
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@)) &&
+ (*((npy_bool *)data2 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data2 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(npy_bool);
+ }
+# endif
+ }
+
+ /* If the loop was unrolled, we need to finish it off */
+#if (@nop@ <= 3)
+ goto finish_after_unrolled_loop;
+#endif
+}
+
+static void
+bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+ npy_bool accum = 0;
+
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ accum = *(npy_bool *)data0 || accum;
+ data0 += stride0;
+#elif @nop@ == 2
+ accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
+ data0 += stride0;
+ data1 += stride1;
+#elif @nop@ == 3
+ accum = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) || accum;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ accum = temp || accum;
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+
+# if @nop@ <= 3
+ *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
+# else
+ *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
+# endif
+}
+
+/**end repeat**/
+
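+/*
+ * Inner loop signature: the int argument is the number of input operands
+ * (nop); the dataptr and stride arrays have nop+1 entries, with entry nop
+ * describing the output; the final argument is the element count.
+ */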
+typedef void (*sum_of_products_fn)(int, char **, npy_intp *, npy_intp);
+
+/* These tables need to match up with the type enum */
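+/*
+ * The @use@ flag in each repeat block selects which type numbers get a
+ * generated inner loop; NULL entries either fall through to a more general
+ * table or make get_sum_of_products_function return NULL (unsupported dtype).
+ */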
+static sum_of_products_fn
+_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+ &@name@_sum_of_products_contig_outstride0_one,
+#else
+ NULL,
+#endif
+/**end repeat**/
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_stride0_contig_outstride0_two,
+ &@name@_sum_of_products_stride0_contig_outcontig_two,
+ &@name@_sum_of_products_contig_stride0_outstride0_two,
+ &@name@_sum_of_products_contig_stride0_outcontig_two,
+ &@name@_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+ {NULL, NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_outstride0_any,
+ &@name@_sum_of_products_outstride0_one,
+ &@name@_sum_of_products_outstride0_two,
+ &@name@_sum_of_products_outstride0_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_contig_any,
+ &@name@_sum_of_products_contig_one,
+ &@name@_sum_of_products_contig_two,
+ &@name@_sum_of_products_contig_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_any,
+ &@name@_sum_of_products_one,
+ &@name@_sum_of_products_two,
+ &@name@_sum_of_products_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _unspecialized_table */
+
+static sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp *fixed_strides)
+{
+ int iop;
+
+ if (type_num >= NPY_NTYPES) {
+ return NULL;
+ }
+
+ /* contiguous reduction */
+ if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
+ sum_of_products_fn ret =
+ _contig_outstride0_unary_specialization_table[type_num];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+
+ /* nop of 2 has more specializations */
+ if (nop == 2) {
+ /* Encode the zero/contiguous strides */
+ int code;
+ code = (fixed_strides[0] == 0) ? 0 :
+ (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
+ code += (fixed_strides[1] == 0) ? 0 :
+ (fixed_strides[1] == itemsize) ? 2*1 : 8;
+ code += (fixed_strides[2] == 0) ? 0 :
+ (fixed_strides[2] == itemsize) ? 1 : 8;
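+        /*
+         * Each operand contributes 0 for a zero stride, 4/2/1 (op0/op1/out)
+         * for a contiguous stride, or 8 otherwise; codes 2..6 index the five
+         * entries of _binary_specialization_table, while any general stride
+         * pushes the code out of that range.
+         */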
+ if (code >= 2 && code < 7) {
+ sum_of_products_fn ret =
+ _binary_specialization_table[type_num][code-2];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+ }
+
+ /* Inner loop with an output stride of 0 */
+ if (fixed_strides[nop] == 0) {
+ return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* Check for all contiguous */
+ for (iop = 0; iop < nop + 1; ++iop) {
+ if (fixed_strides[iop] != itemsize) {
+ break;
+ }
+ }
+
+ /* Contiguous loop */
+ if (iop == nop + 1) {
+ return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* None of the above specializations caught it, general loops */
+ return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
+}
+
+
+/*
+ * Parses the subscripts for one operand into an output of 'ndim'
+ * labels. The resulting 'op_labels' array will have:
+ * - the ASCII code of the label for the first occurrence of a label;
+ * - the (negative) offset to the first occurrence of the label for
+ * repeated labels;
+ * - zero for broadcast dimensions, if subscripts has an ellipsis.
+ * For example:
+ * - subscripts="abbcbc", ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]
+ * - subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]
+ */
+
+static int
+parse_operand_subscripts(char *subscripts, int length,
+ int ndim, int iop, char *op_labels,
+ char *label_counts, int *min_label, int *max_label)
+{
+ int i;
+ int idim = 0;
+ int ellipsis = -1;
+
+ /* Process all labels for this operand */
+ for (i = 0; i < length; ++i) {
+ int label = subscripts[i];
+
+ /* A proper label for an axis. */
+ if (label > 0 && isalpha(label)) {
+ /* Check we don't exceed the operator dimensions. */
+ if (idim >= ndim) {
+ PyErr_Format(PyExc_ValueError,
+ "einstein sum subscripts string contains "
+ "too many subscripts for operand %d", iop);
+ return -1;
+ }
+
+ op_labels[idim++] = label;
+ if (label < *min_label) {
+ *min_label = label;
+ }
+ if (label > *max_label) {
+ *max_label = label;
+ }
+ label_counts[label]++;
+ }
+ /* The beginning of the ellipsis. */
+ else if (label == '.') {
+ /* Check it's a proper ellipsis. */
+ if (ellipsis != -1 || i + 2 >= length
+ || subscripts[++i] != '.' || subscripts[++i] != '.') {
+ PyErr_Format(PyExc_ValueError,
+ "einstein sum subscripts string contains a "
+ "'.' that is not part of an ellipsis ('...') "
+ "in operand %d", iop);
+ return -1;
+ }
+
+ ellipsis = idim;
+ }
+ else if (label != ' ') {
+ PyErr_Format(PyExc_ValueError,
+ "invalid subscript '%c' in einstein sum "
+ "subscripts string, subscripts must "
+ "be letters", (char)label);
+ return -1;
+ }
+ }
+
+ /* No ellipsis found, labels must match dimensions exactly. */
+ if (ellipsis == -1) {
+ if (idim != ndim) {
+ PyErr_Format(PyExc_ValueError,
+ "operand has more dimensions than subscripts "
+ "given in einstein sum, but no '...' ellipsis "
+ "provided to broadcast the extra dimensions.");
+ return -1;
+ }
+ }
+ /* Ellipsis found, may have to add broadcast dimensions. */
+ else if (idim < ndim) {
+ /* Move labels after ellipsis to the end. */
+ for (i = 0; i < idim - ellipsis; ++i) {
+ op_labels[ndim - i - 1] = op_labels[idim - i - 1];
+ }
+ /* Set all broadcast dimensions to zero. */
+ for (i = 0; i < ndim - idim; ++i) {
+ op_labels[ellipsis + i] = 0;
+ }
+ }
+
+ /*
+ * Find any labels duplicated for this operand, and turn them
+ * into negative offsets to the axis to merge with.
+ *
+ * In C, the char type may be signed or unsigned, but with
+     * two's complement arithmetic the char is OK either way here, and
+ * later where it matters the char is cast to a signed char.
+ */
+ for (idim = 0; idim < ndim - 1; ++idim) {
+ int label = (signed char)op_labels[idim];
+ /* If it is a proper label, find any duplicates of it. */
+ if (label > 0) {
+ /* Search for the next matching label. */
+ char *next = memchr(op_labels + idim + 1, label, ndim - idim - 1);
+
+ while (next != NULL) {
+ /* The offset from next to op_labels[idim] (negative). */
+ *next = (char)((op_labels + idim) - next);
+ /* Search for the next matching label. */
+ next = memchr(next + 1, label, op_labels + ndim - 1 - next);
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Parses the subscripts for the output operand into an output that
+ * includes 'ndim_broadcast' unlabeled dimensions, and returns the total
+ * number of output dimensions, or -1 if there is an error. Similarly
+ * to parse_operand_subscripts, the 'out_labels' array will have, for
+ * each dimension:
+ * - the ASCII code of the corresponding label;
+ * - zero for broadcast dimensions, if subscripts has an ellipsis.
+ */
+static int
+parse_output_subscripts(char *subscripts, int length,
+ int ndim_broadcast,
+ const char *label_counts, char *out_labels)
+{
+ int i, bdim;
+ int ndim = 0;
+ int ellipsis = 0;
+
+ /* Process all the output labels. */
+ for (i = 0; i < length; ++i) {
+ int label = subscripts[i];
+
+ /* A proper label for an axis. */
+ if (label > 0 && isalpha(label)) {
+ /* Check that it doesn't occur again. */
+ if (memchr(subscripts + i + 1, label, length - i - 1) != NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "einstein sum subscripts string includes "
+ "output subscript '%c' multiple times",
+ (char)label);
+ return -1;
+ }
+ /* Check that it was used in the inputs. */
+ if (label_counts[label] == 0) {
+ PyErr_Format(PyExc_ValueError,
+ "einstein sum subscripts string included "
+ "output subscript '%c' which never appeared "
+ "in an input", (char)label);
+ return -1;
+ }
+ /* Check that there is room in out_labels for this label. */
+ if (ndim >= NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "einstein sum subscripts string contains "
+ "too many subscripts in the output");
+ return -1;
+ }
+
+ out_labels[ndim++] = label;
+ }
+ /* The beginning of the ellipsis. */
+ else if (label == '.') {
+ /* Check it is a proper ellipsis. */
+ if (ellipsis || i + 2 >= length
+ || subscripts[++i] != '.' || subscripts[++i] != '.') {
+ PyErr_SetString(PyExc_ValueError,
+ "einstein sum subscripts string "
+ "contains a '.' that is not part of "
+ "an ellipsis ('...') in the output");
+ return -1;
+ }
+ /* Check there is room in out_labels for broadcast dims. */
+ if (ndim + ndim_broadcast > NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "einstein sum subscripts string contains "
+ "too many subscripts in the output");
+ return -1;
+ }
+
+ ellipsis = 1;
+ for (bdim = 0; bdim < ndim_broadcast; ++bdim) {
+ out_labels[ndim++] = 0;
+ }
+ }
+ else if (label != ' ') {
+ PyErr_Format(PyExc_ValueError,
+ "invalid subscript '%c' in einstein sum "
+ "subscripts string, subscripts must "
+ "be letters", (char)label);
+ return -1;
+ }
+ }
+
+ /* If no ellipsis was found there should be no broadcast dimensions. */
+ if (!ellipsis && ndim_broadcast > 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "output has more dimensions than subscripts "
+ "given in einstein sum, but no '...' ellipsis "
+ "provided to broadcast the extra dimensions.");
+ return -1;
+ }
+
+ return ndim;
+}
+
+
+/*
+ * When there's just one operand and no reduction we can return a view
+ * into 'op'. This calculates the view and stores it in 'ret', if
+ * possible. Returns -1 on error, 0 otherwise. Note that a 0 return
+ * does not mean that a view was successfully created.
+ */
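+/*
+ * For example, with subscripts "ii->i" the returned view is the diagonal of
+ * 'op': a single dimension whose stride is the sum of the two axis strides.
+ */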
+static int
+get_single_op_view(PyArrayObject *op, char *labels,
+ int ndim_output, char *output_labels,
+ PyArrayObject **ret)
+{
+ npy_intp new_strides[NPY_MAXDIMS];
+ npy_intp new_dims[NPY_MAXDIMS];
+ char *out_label;
+ int label, i, idim, ndim, ibroadcast = 0;
+
+ ndim = PyArray_NDIM(op);
+
+ /* Initialize the dimensions and strides to zero */
+ for (idim = 0; idim < ndim_output; ++idim) {
+ new_dims[idim] = 0;
+ new_strides[idim] = 0;
+ }
+
+ /* Match the labels in the operand with the output labels */
+ for (idim = 0; idim < ndim; ++idim) {
+ /*
+ * The char type may be either signed or unsigned, we
+ * need it to be signed here.
+ */
+ label = (signed char)labels[idim];
+ /* If this label says to merge axes, get the actual label */
+ if (label < 0) {
+ label = labels[idim+label];
+ }
+ /* If the label is 0, it's an unlabeled broadcast dimension */
+ if (label == 0) {
+ /* The next output label that's a broadcast dimension */
+ for (; ibroadcast < ndim_output; ++ibroadcast) {
+ if (output_labels[ibroadcast] == 0) {
+ break;
+ }
+ }
+ if (ibroadcast == ndim_output) {
+ PyErr_SetString(PyExc_ValueError,
+ "output had too few broadcast dimensions");
+ return -1;
+ }
+ new_dims[ibroadcast] = PyArray_DIM(op, idim);
+ new_strides[ibroadcast] = PyArray_STRIDE(op, idim);
+ ++ibroadcast;
+ }
+ else {
+ /* Find the position for this dimension in the output */
+ out_label = (char *)memchr(output_labels, label,
+ ndim_output);
+ /* If it's not found, reduction -> can't return a view */
+ if (out_label == NULL) {
+ break;
+ }
+ /* Update the dimensions and strides of the output */
+ i = out_label - output_labels;
+ if (new_dims[i] != 0 && new_dims[i] != PyArray_DIM(op, idim)) {
+ PyErr_Format(PyExc_ValueError,
+ "dimensions in single operand for collapsing "
+ "index '%c' don't match (%d != %d)",
+ label, (int)new_dims[i], (int)PyArray_DIM(op, idim));
+ return -1;
+ }
+ new_dims[i] = PyArray_DIM(op, idim);
+ new_strides[i] += PyArray_STRIDE(op, idim);
+ }
+ }
+ /* If we processed all the input axes, return a view */
+ if (idim == ndim) {
+ Py_INCREF(PyArray_DESCR(op));
+ *ret = (PyArrayObject *)PyArray_NewFromDescr_int(
+ Py_TYPE(op), PyArray_DESCR(op),
+ ndim_output, new_dims, new_strides, PyArray_DATA(op),
+ PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
+ (PyObject *)op, (PyObject *)op,
+ 0, 0);
+
+ if (*ret == NULL) {
+ return -1;
+ }
+ return 0;
+ }
+
+ /* Return success, but that we couldn't make a view */
+ *ret = NULL;
+ return 0;
+}
+
+
+/*
+ * The char type may be either signed or unsigned, we need it to be
+ * signed here.
+ */
+static int
+_any_labels_are_negative(signed char *labels, int ndim)
+{
+ int idim;
+
+ for (idim = 0; idim < ndim; ++idim) {
+ if (labels[idim] < 0) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Given the labels for an operand array, returns a view of the array
+ * with all repeated labels collapsed into a single dimension along
+ * the corresponding diagonal. The labels are also updated to match
+ * the dimensions of the new array. If no label is repeated, the original
+ * array has its reference count incremented and is returned unchanged.
+ */
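+/*
+ * For example, an operand labeled "ii" with shape (n, n) comes back as a
+ * 1-d diagonal view of length n whose stride is the sum of the two
+ * original strides.
+ */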
+static PyArrayObject *
+get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
+{
+ npy_intp new_strides[NPY_MAXDIMS];
+ npy_intp new_dims[NPY_MAXDIMS];
+ int idim, icombine;
+ int icombinemap[NPY_MAXDIMS];
+ int ndim = PyArray_NDIM(op);
+ PyArrayObject *ret = NULL;
+
+ /* A fast path to avoid unnecessary calculations. */
+ if (!_any_labels_are_negative((signed char *)labels, ndim)) {
+ Py_INCREF(op);
+
+ return op;
+ }
+
+ /* Combine repeated labels. */
+ icombine = 0;
+ for(idim = 0; idim < ndim; ++idim) {
+ /*
+ * The char type may be either signed or unsigned, we
+ * need it to be signed here.
+ */
+ int label = (signed char)labels[idim];
+ npy_intp dim = PyArray_DIM(op, idim);
+ npy_intp stride = PyArray_STRIDE(op, idim);
+
+ /* A label seen for the first time, add it to the op view. */
+ if (label >= 0) {
+ /*
+ * icombinemap maps dimensions in the original array to
+ * their position in the combined dimensions view.
+ */
+ icombinemap[idim] = icombine;
+ new_dims[icombine] = dim;
+ new_strides[icombine] = stride;
+ ++icombine;
+ }
+ /* A repeated label, find the original one and merge them. */
+ else {
+ int i = icombinemap[idim + label];
+
+ icombinemap[idim] = -1;
+ if (new_dims[i] != dim) {
+ char orig_label = labels[idim + label];
+ PyErr_Format(PyExc_ValueError,
+ "dimensions in operand %d for collapsing "
+ "index '%c' don't match (%d != %d)",
+ iop, orig_label, (int)new_dims[i], (int)dim);
+ return NULL;
+ }
+ new_strides[i] += stride;
+ }
+ }
+
+ /* Overwrite labels to match the new operand view. */
+ for (idim = 0; idim < ndim; ++idim) {
+ int i = icombinemap[idim];
+
+ if (i >= 0) {
+ labels[i] = labels[idim];
+ }
+ }
+
+ /* The number of dimensions of the combined view. */
+ ndim = icombine;
+
+ /* Create a view of the operand with the compressed dimensions. */
+ Py_INCREF(PyArray_DESCR(op));
+ ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+ Py_TYPE(op), PyArray_DESCR(op),
+ ndim, new_dims, new_strides, PyArray_DATA(op),
+ PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
+ (PyObject *)op, (PyObject *)op);
+
+ return ret;
+}
+
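+/*
+ * Fills 'axes' so that axes[i] is the operand axis whose label matches iter
+ * dimension i, or -1 where the operand is broadcast over that dimension.
+ * For example, operand labels "ij" with iter_labels "jik" give axes = {1, 0, -1}.
+ */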
+static int
+prepare_op_axes(int ndim, int iop, char *labels, int *axes,
+ int ndim_iter, char *iter_labels)
+{
+ int i, label, ibroadcast;
+
+ ibroadcast = ndim-1;
+ for (i = ndim_iter-1; i >= 0; --i) {
+ label = iter_labels[i];
+ /*
+ * If it's an unlabeled broadcast dimension, choose
+ * the next broadcast dimension from the operand.
+ */
+ if (label == 0) {
+ while (ibroadcast >= 0 && labels[ibroadcast] != 0) {
+ --ibroadcast;
+ }
+ /*
+ * If we used up all the operand broadcast dimensions,
+ * extend it with a "newaxis"
+ */
+ if (ibroadcast < 0) {
+ axes[i] = -1;
+ }
+ /* Otherwise map to the broadcast axis */
+ else {
+ axes[i] = ibroadcast;
+ --ibroadcast;
+ }
+ }
+ /* It's a labeled dimension, find the matching one */
+ else {
+ char *match = memchr(labels, label, ndim);
+ /* If the op doesn't have the label, broadcast it */
+ if (match == NULL) {
+ axes[i] = -1;
+ }
+ /* Otherwise use it */
+ else {
+ axes[i] = match - labels;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+unbuffered_loop_nop1_ndim2(NpyIter *iter)
+{
+ npy_intp coord, shape[2], strides[2][2];
+ char *ptrs[2][2], *ptr;
+ sum_of_products_fn sop;
+ NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+ NpyIter_DebugPrint(iter);
+#endif
+ NPY_EINSUM_DBG_PRINT("running hand-coded 1-op 2-dim loop\n");
+
+ NpyIter_GetShape(iter, shape);
+ memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
+ 2*sizeof(npy_intp));
+ memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
+ 2*sizeof(npy_intp));
+ memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
+ 2*sizeof(char *));
+ memcpy(ptrs[1], ptrs[0], 2*sizeof(char*));
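+    /*
+     * ptrs[0] are the working pointers passed to the inner loop; ptrs[1]
+     * track the position along the outer dimension. Both are advanced
+     * together after each inner-loop call.
+     */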
+
+ sop = get_sum_of_products_function(1,
+ NpyIter_GetDescrArray(iter)[0]->type_num,
+ NpyIter_GetDescrArray(iter)[0]->elsize,
+ strides[0]);
+
+ if (sop == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid data type for einsum");
+ return -1;
+ }
+
+ /*
+ * Since the iterator wasn't tracking coordinates, the
+ * loop provided by the iterator is in Fortran-order.
+ */
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+ for (coord = shape[1]; coord > 0; --coord) {
+ sop(1, ptrs[0], strides[0], shape[0]);
+
+ ptr = ptrs[1][0] + strides[1][0];
+ ptrs[0][0] = ptrs[1][0] = ptr;
+ ptr = ptrs[1][1] + strides[1][1];
+ ptrs[0][1] = ptrs[1][1] = ptr;
+ }
+ NPY_END_THREADS;
+
+ return 0;
+}
+
+static int
+unbuffered_loop_nop1_ndim3(NpyIter *iter)
+{
+ npy_intp coords[2], shape[3], strides[3][2];
+ char *ptrs[3][2], *ptr;
+ sum_of_products_fn sop;
+ NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+ NpyIter_DebugPrint(iter);
+#endif
+ NPY_EINSUM_DBG_PRINT("running hand-coded 1-op 3-dim loop\n");
+
+ NpyIter_GetShape(iter, shape);
+ memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
+ 2*sizeof(npy_intp));
+ memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
+ 2*sizeof(npy_intp));
+ memcpy(strides[2], NpyIter_GetAxisStrideArray(iter, 2),
+ 2*sizeof(npy_intp));
+ memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
+ 2*sizeof(char *));
+ memcpy(ptrs[1], ptrs[0], 2*sizeof(char*));
+ memcpy(ptrs[2], ptrs[0], 2*sizeof(char*));
+
+ sop = get_sum_of_products_function(1,
+ NpyIter_GetDescrArray(iter)[0]->type_num,
+ NpyIter_GetDescrArray(iter)[0]->elsize,
+ strides[0]);
+
+ if (sop == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid data type for einsum");
+ return -1;
+ }
+
+ /*
+ * Since the iterator wasn't tracking coordinates, the
+ * loop provided by the iterator is in Fortran-order.
+ */
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+ for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
+ for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
+ sop(1, ptrs[0], strides[0], shape[0]);
+
+ ptr = ptrs[1][0] + strides[1][0];
+ ptrs[0][0] = ptrs[1][0] = ptr;
+ ptr = ptrs[1][1] + strides[1][1];
+ ptrs[0][1] = ptrs[1][1] = ptr;
+ }
+ ptr = ptrs[2][0] + strides[2][0];
+ ptrs[0][0] = ptrs[1][0] = ptrs[2][0] = ptr;
+ ptr = ptrs[2][1] + strides[2][1];
+ ptrs[0][1] = ptrs[1][1] = ptrs[2][1] = ptr;
+ }
+ NPY_END_THREADS;
+
+ return 0;
+}
+
+static int
+unbuffered_loop_nop2_ndim2(NpyIter *iter)
+{
+ npy_intp coord, shape[2], strides[2][3];
+ char *ptrs[2][3], *ptr;
+ sum_of_products_fn sop;
+ NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+ NpyIter_DebugPrint(iter);
+#endif
+ NPY_EINSUM_DBG_PRINT("running hand-coded 2-op 2-dim loop\n");
+
+ NpyIter_GetShape(iter, shape);
+ memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
+ 3*sizeof(npy_intp));
+ memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
+ 3*sizeof(npy_intp));
+ memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
+ 3*sizeof(char *));
+ memcpy(ptrs[1], ptrs[0], 3*sizeof(char*));
+
+ sop = get_sum_of_products_function(2,
+ NpyIter_GetDescrArray(iter)[0]->type_num,
+ NpyIter_GetDescrArray(iter)[0]->elsize,
+ strides[0]);
+
+ if (sop == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid data type for einsum");
+ return -1;
+ }
+
+ /*
+ * Since the iterator wasn't tracking coordinates, the
+ * loop provided by the iterator is in Fortran-order.
+ */
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+ for (coord = shape[1]; coord > 0; --coord) {
+ sop(2, ptrs[0], strides[0], shape[0]);
+
+ ptr = ptrs[1][0] + strides[1][0];
+ ptrs[0][0] = ptrs[1][0] = ptr;
+ ptr = ptrs[1][1] + strides[1][1];
+ ptrs[0][1] = ptrs[1][1] = ptr;
+ ptr = ptrs[1][2] + strides[1][2];
+ ptrs[0][2] = ptrs[1][2] = ptr;
+ }
+ NPY_END_THREADS;
+
+ return 0;
+}
+
+static int
+unbuffered_loop_nop2_ndim3(NpyIter *iter)
+{
+ npy_intp coords[2], shape[3], strides[3][3];
+ char *ptrs[3][3], *ptr;
+ sum_of_products_fn sop;
+ NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+ NpyIter_DebugPrint(iter);
+#endif
+ NPY_EINSUM_DBG_PRINT("running hand-coded 2-op 3-dim loop\n");
+
+ NpyIter_GetShape(iter, shape);
+ memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
+ 3*sizeof(npy_intp));
+ memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
+ 3*sizeof(npy_intp));
+ memcpy(strides[2], NpyIter_GetAxisStrideArray(iter, 2),
+ 3*sizeof(npy_intp));
+ memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
+ 3*sizeof(char *));
+ memcpy(ptrs[1], ptrs[0], 3*sizeof(char*));
+ memcpy(ptrs[2], ptrs[0], 3*sizeof(char*));
+
+ sop = get_sum_of_products_function(2,
+ NpyIter_GetDescrArray(iter)[0]->type_num,
+ NpyIter_GetDescrArray(iter)[0]->elsize,
+ strides[0]);
+
+ if (sop == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid data type for einsum");
+ return -1;
+ }
+
+ /*
+ * Since the iterator wasn't tracking coordinates, the
+ * loop provided by the iterator is in Fortran-order.
+ */
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+ for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
+ for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
+ sop(2, ptrs[0], strides[0], shape[0]);
+
+ ptr = ptrs[1][0] + strides[1][0];
+ ptrs[0][0] = ptrs[1][0] = ptr;
+ ptr = ptrs[1][1] + strides[1][1];
+ ptrs[0][1] = ptrs[1][1] = ptr;
+ ptr = ptrs[1][2] + strides[1][2];
+ ptrs[0][2] = ptrs[1][2] = ptr;
+ }
+ ptr = ptrs[2][0] + strides[2][0];
+ ptrs[0][0] = ptrs[1][0] = ptrs[2][0] = ptr;
+ ptr = ptrs[2][1] + strides[2][1];
+ ptrs[0][1] = ptrs[1][1] = ptrs[2][1] = ptr;
+ ptr = ptrs[2][2] + strides[2][2];
+ ptrs[0][2] = ptrs[1][2] = ptrs[2][2] = ptr;
+ }
+ NPY_END_THREADS;
+
+ return 0;
+}
+
+
+/*NUMPY_API
+ * This function provides summation of array elements according to
+ * the Einstein summation convention. For example:
+ * - trace(a) -> einsum("ii", a)
+ * - transpose(a) -> einsum("ji", a)
+ * - multiply(a,b) -> einsum(",", a, b)
+ * - inner(a,b) -> einsum("i,i", a, b)
+ * - outer(a,b) -> einsum("i,j", a, b)
+ * - matvec(a,b) -> einsum("ij,j", a, b)
+ * - matmat(a,b) -> einsum("ij,jk", a, b)
+ *
+ * subscripts: The string of subscripts for einstein summation.
+ * nop: The number of operands
+ * op_in: The array of operands
+ * dtype: Either NULL, or the data type to force the calculation as.
+ * order: The order for the calculation/the output axes.
+ * casting: What kind of casts should be permitted.
+ * out: Either NULL, or an array into which the output should be placed.
+ *
+ * By default, the labels get placed in alphabetical order
+ * at the end of the output. So, if c = einsum("i,j", a, b)
+ * then c[i,j] == a[i]*b[j], but if c = einsum("j,i", a, b)
+ * then c[i,j] = a[j]*b[i].
+ *
+ * Alternatively, you can control the output order or prevent
+ * an axis from being summed/force an axis to be summed by providing
+ * indices for the output. This allows us to turn 'trace' into
+ * 'diag', for example.
+ * - diag(a) -> einsum("ii->i", a)
+ * - sum(a, axis=0) -> einsum("i...->", a)
+ *
+ * Subscripts at the beginning and end may be specified by
+ * putting an ellipsis "..." in the middle. For example,
+ * the function einsum("i...i", a) takes the diagonal of
+ * the first and last dimensions of the operand, and
+ * einsum("ij...,jk...->ik...") takes the matrix product using
+ * the first two indices of each operand instead of the last two.
+ *
+ * When there is only one operand, no axes being summed, and
+ * no output parameter, this function returns a view
+ * into the operand instead of making a copy.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_EinsteinSum(char *subscripts, npy_intp nop,
+ PyArrayObject **op_in,
+ PyArray_Descr *dtype,
+ NPY_ORDER order, NPY_CASTING casting,
+ PyArrayObject *out)
+{
+ int iop, label, min_label = 127, max_label = 0;
+ char label_counts[128];
+ char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
+ char output_labels[NPY_MAXDIMS], *iter_labels;
+ int idim, ndim_output, ndim_broadcast, ndim_iter;
+
+ PyArrayObject *op[NPY_MAXARGS], *ret = NULL;
+ PyArray_Descr *op_dtypes_array[NPY_MAXARGS], **op_dtypes;
+
+ int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+ int *op_axes[NPY_MAXARGS];
+ npy_uint32 iter_flags, op_flags[NPY_MAXARGS];
+
+ NpyIter *iter;
+ sum_of_products_fn sop;
+ npy_intp fixed_strides[NPY_MAXARGS];
+
+ /* nop+1 (+1 is for the output) must fit in NPY_MAXARGS */
+ if (nop >= NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many operands provided to einstein sum function");
+ return NULL;
+ }
+ else if (nop < 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "not enough operands provided to einstein sum function");
+ return NULL;
+ }
+
+ /* Parse the subscripts string into label_counts and op_labels */
+ memset(label_counts, 0, sizeof(label_counts));
+ for (iop = 0; iop < nop; ++iop) {
+ int length = (int)strcspn(subscripts, ",-");
+
+ if (iop == nop-1 && subscripts[length] == ',') {
+ PyErr_SetString(PyExc_ValueError,
+ "more operands provided to einstein sum function "
+ "than specified in the subscripts string");
+ return NULL;
+ }
+ else if(iop < nop-1 && subscripts[length] != ',') {
+ PyErr_SetString(PyExc_ValueError,
+ "fewer operands provided to einstein sum function "
+ "than specified in the subscripts string");
+ return NULL;
+ }
+
+ if (parse_operand_subscripts(subscripts, length,
+ PyArray_NDIM(op_in[iop]),
+ iop, op_labels[iop], label_counts,
+ &min_label, &max_label) < 0) {
+ return NULL;
+ }
+
+ /* Move subscripts to the start of the labels for the next op */
+ subscripts += length;
+ if (iop < nop-1) {
+ subscripts++;
+ }
+ }
+
+ /*
+ * Find the number of broadcast dimensions, which is the maximum
+ * number of labels == 0 in an op_labels array.
+ */
+ ndim_broadcast = 0;
+ for (iop = 0; iop < nop; ++iop) {
+ npy_intp count_zeros = 0;
+ int ndim;
+ char *labels = op_labels[iop];
+
+ ndim = PyArray_NDIM(op_in[iop]);
+ for (idim = 0; idim < ndim; ++idim) {
+ if (labels[idim] == 0) {
+ ++count_zeros;
+ }
+ }
+
+ if (count_zeros > ndim_broadcast) {
+ ndim_broadcast = count_zeros;
+ }
+ }
+
+ /*
+ * If there is no output signature, fill output_labels and ndim_output
+ * using each label that appeared once, in alphabetical order.
+ */
+ if (subscripts[0] == '\0') {
+ /* If no output was specified, always broadcast left, as usual. */
+ for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
+ output_labels[ndim_output] = 0;
+ }
+ for (label = min_label; label <= max_label; ++label) {
+ if (label_counts[label] == 1) {
+ if (ndim_output < NPY_MAXDIMS) {
+ output_labels[ndim_output++] = label;
+ }
+ else {
+ PyErr_SetString(PyExc_ValueError,
+ "einstein sum subscript string has too many "
+ "distinct labels");
+ return NULL;
+ }
+ }
+ }
+ }
+ else {
+ if (subscripts[0] != '-' || subscripts[1] != '>') {
+ PyErr_SetString(PyExc_ValueError,
+ "einstein sum subscript string does not "
+ "contain proper '->' output specified");
+ return NULL;
+ }
+ subscripts += 2;
+
+ /* Parse the output subscript string. */
+ ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
+ ndim_broadcast, label_counts,
+ output_labels);
+ if (ndim_output < 0) {
+ return NULL;
+ }
+ }
+
+ if (out != NULL && PyArray_NDIM(out) != ndim_output) {
+ PyErr_Format(PyExc_ValueError,
+ "out parameter does not have the correct number of "
+ "dimensions, has %d but should have %d",
+ (int)PyArray_NDIM(out), (int)ndim_output);
+ return NULL;
+ }
+
+ /*
+ * If there's just one operand and no output parameter,
+ * first try remapping the axes to the output to return
+ * a view instead of a copy.
+ */
+ if (nop == 1 && out == NULL) {
+ ret = NULL;
+
+ if (get_single_op_view(op_in[0], op_labels[0], ndim_output,
+ output_labels, &ret) < 0) {
+ return NULL;
+ }
+
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+
+ /* Set all the op references to NULL */
+ for (iop = 0; iop < nop; ++iop) {
+ op[iop] = NULL;
+ }
+
+ /*
+ * Process all the input ops, combining dimensions into their
+ * diagonal where specified.
+ */
+ for (iop = 0; iop < nop; ++iop) {
+ char *labels = op_labels[iop];
+
+ op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
+ if (op[iop] == NULL) {
+ goto fail;
+ }
+ }
+
+ /* Set the output op */
+ op[nop] = out;
+
+ /*
+ * Set up the labels for the iterator (output + combined labels).
+ * Can just share the output_labels memory, because iter_labels
+ * is output_labels with some more labels appended.
+ */
+ iter_labels = output_labels;
+ ndim_iter = ndim_output;
+ for (label = min_label; label <= max_label; ++label) {
+ if (label_counts[label] > 0 &&
+ memchr(output_labels, label, ndim_output) == NULL) {
+ if (ndim_iter >= NPY_MAXDIMS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many subscripts in einsum");
+ goto fail;
+ }
+ iter_labels[ndim_iter++] = label;
+ }
+ }
+
+ /* Set up the op_axes for the iterator */
+ for (iop = 0; iop < nop; ++iop) {
+ op_axes[iop] = op_axes_arrays[iop];
+
+ if (prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
+ op_axes[iop], ndim_iter, iter_labels) < 0) {
+ goto fail;
+ }
+ }
+
+ /* Set up the op_dtypes if dtype was provided */
+ if (dtype == NULL) {
+ op_dtypes = NULL;
+ }
+ else {
+ op_dtypes = op_dtypes_array;
+ for (iop = 0; iop <= nop; ++iop) {
+ op_dtypes[iop] = dtype;
+ }
+ }
+
+ /* Set the op_axes for the output */
+ op_axes[nop] = op_axes_arrays[nop];
+ for (idim = 0; idim < ndim_output; ++idim) {
+ op_axes[nop][idim] = idim;
+ }
+ for (idim = ndim_output; idim < ndim_iter; ++idim) {
+ op_axes[nop][idim] = -1;
+ }
+
+ /* Set the iterator per-op flags */
+
+ for (iop = 0; iop < nop; ++iop) {
+ op_flags[iop] = NPY_ITER_READONLY|
+ NPY_ITER_NBO|
+ NPY_ITER_ALIGNED;
+ }
+ op_flags[nop] = NPY_ITER_READWRITE|
+ NPY_ITER_NBO|
+ NPY_ITER_ALIGNED|
+ NPY_ITER_ALLOCATE|
+ NPY_ITER_NO_BROADCAST;
+ iter_flags = NPY_ITER_EXTERNAL_LOOP|
+ NPY_ITER_BUFFERED|
+ NPY_ITER_DELAY_BUFALLOC|
+ NPY_ITER_GROWINNER|
+ NPY_ITER_REDUCE_OK|
+ NPY_ITER_REFS_OK|
+ NPY_ITER_ZEROSIZE_OK;
+ if (out != NULL) {
+ iter_flags |= NPY_ITER_COPY_IF_OVERLAP;
+ }
+ if (dtype == NULL) {
+ iter_flags |= NPY_ITER_COMMON_DTYPE;
+ }
+
+ /* Allocate the iterator */
+ iter = NpyIter_AdvancedNew(nop+1, op, iter_flags, order, casting, op_flags,
+ op_dtypes, ndim_iter, op_axes, NULL, 0);
+
+ if (iter == NULL) {
+ goto fail;
+ }
+
+ /* Initialize the output to all zeros */
+ ret = NpyIter_GetOperandArray(iter)[nop];
+ if (PyArray_AssignZero(ret, NULL) < 0) {
+ goto fail;
+ }
+
+ /***************************/
+ /*
+ * Acceleration for some specific loop structures. Note
+ * that with axis coalescing, inputs with more dimensions can
+ * be reduced to fit into these patterns.
+ */
+ if (!NpyIter_RequiresBuffering(iter)) {
+ int ndim = NpyIter_GetNDim(iter);
+ switch (nop) {
+ case 1:
+ if (ndim == 2) {
+ if (unbuffered_loop_nop1_ndim2(iter) < 0) {
+ goto fail;
+ }
+ goto finish;
+ }
+ else if (ndim == 3) {
+ if (unbuffered_loop_nop1_ndim3(iter) < 0) {
+ goto fail;
+ }
+ goto finish;
+ }
+ break;
+ case 2:
+ if (ndim == 2) {
+ if (unbuffered_loop_nop2_ndim2(iter) < 0) {
+ goto fail;
+ }
+ goto finish;
+ }
+ else if (ndim == 3) {
+ if (unbuffered_loop_nop2_ndim3(iter) < 0) {
+ goto fail;
+ }
+ goto finish;
+ }
+ break;
+ }
+ }
+ /***************************/
+
+ if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) {
+ goto fail;
+ }
+
+ /*
+ * Get an inner loop function, specializing it based on
+ * the strides that are fixed for the whole loop.
+ */
+ NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+ sop = get_sum_of_products_function(nop,
+ NpyIter_GetDescrArray(iter)[0]->type_num,
+ NpyIter_GetDescrArray(iter)[0]->elsize,
+ fixed_strides);
+
+#if NPY_EINSUM_DBG_TRACING
+ NpyIter_DebugPrint(iter);
+#endif
+
+ /* Finally, the main loop */
+ if (sop == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid data type for einsum");
+ }
+ else if (NpyIter_GetIterSize(iter) != 0) {
+ NpyIter_IterNextFunc *iternext;
+ char **dataptr;
+ npy_intp *stride;
+ npy_intp *countptr;
+ NPY_BEGIN_THREADS_DEF;
+
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ goto fail;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter);
+ countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+ NPY_BEGIN_THREADS_NDITER(iter);
+ NPY_EINSUM_DBG_PRINT("Einsum loop\n");
+ do {
+ sop(nop, dataptr, stride, *countptr);
+ } while(iternext(iter));
+ NPY_END_THREADS;
+
+ /* If the API was needed, it may have thrown an error */
+ if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) {
+ goto fail;
+ }
+ }
+
+finish:
+ if (out != NULL) {
+ ret = out;
+ }
+ Py_INCREF(ret);
+
+ NpyIter_Deallocate(iter);
+ for (iop = 0; iop < nop; ++iop) {
+ Py_DECREF(op[iop]);
+ }
+
+ return ret;
+
+fail:
+ for (iop = 0; iop < nop; ++iop) {
+ Py_XDECREF(op[iop]);
+ }
+
+ return NULL;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/contrib/python/numpy/py2/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
new file mode 100644
index 00000000000..16bacf1abc4
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -0,0 +1,1785 @@
+/*
+ * This file contains low-level loops for copying and byte-swapping
+ * strided data.
+ *
+ * Copyright (c) 2010 by Mark Wiebe ([email protected])
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <numpy/arrayobject.h>
+#include <numpy/npy_cpu.h>
+#include <numpy/halffloat.h>
+
+#include "lowlevel_strided_loops.h"
+#include "array_assign.h"
+
+
+/*
+ * The x86 platform works with unaligned access, but the compiler is allowed to
+ * assume all data is aligned to its size by the C standard. This means it can
+ * vectorize instructions, peeling only by the size of the type; if the data is
+ * not aligned to this size, one ends up with data not correctly aligned for SSE
+ * instructions (16 bytes).
+ * So this flag can only be enabled if autovectorization is disabled.
+ */
+#if NPY_CPU_HAVE_UNALIGNED_ACCESS
+# define NPY_USE_UNALIGNED_ACCESS 0
+#else
+# define NPY_USE_UNALIGNED_ACCESS 0
+#endif
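+/*
+ * Note that both branches above define the flag to 0 (see the comment
+ * above): unaligned data always takes the unaligned code paths, even on
+ * CPUs that tolerate unaligned access.
+ */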
+
+#define _NPY_NOP1(x) (x)
+#define _NPY_NOP2(x) (x)
+#define _NPY_NOP4(x) (x)
+#define _NPY_NOP8(x) (x)
+
+#define _NPY_SWAP2(x) npy_bswap2(x)
+
+#define _NPY_SWAP4(x) npy_bswap4(x)
+
+#define _NPY_SWAP_PAIR4(x) (((((npy_uint32)x)&0xffu) << 8) | \
+ ((((npy_uint32)x)&0xff00u) >> 8) | \
+ ((((npy_uint32)x)&0xff0000u) << 8) | \
+ ((((npy_uint32)x)&0xff000000u) >> 8))
+
+#define _NPY_SWAP8(x) npy_bswap8(x)
+
+#define _NPY_SWAP_PAIR8(x) (((((npy_uint64)x)&0xffULL) << 24) | \
+ ((((npy_uint64)x)&0xff00ULL) << 8) | \
+ ((((npy_uint64)x)&0xff0000ULL) >> 8) | \
+ ((((npy_uint64)x)&0xff000000ULL) >> 24) | \
+ ((((npy_uint64)x)&0xff00000000ULL) << 24) | \
+ ((((npy_uint64)x)&0xff0000000000ULL) << 8) | \
+ ((((npy_uint64)x)&0xff000000000000ULL) >> 8) | \
+ ((((npy_uint64)x)&0xff00000000000000ULL) >> 24))
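+
+/*
+ * The _NPY_SWAP_PAIR* macros byte-swap each half of the value independently,
+ * as needed when an element is a pair of smaller values (e.g. the real and
+ * imaginary parts of a complex number).
+ */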
+
+#define _NPY_SWAP_INPLACE2(x) npy_bswap2_unaligned(x)
+
+#define _NPY_SWAP_INPLACE4(x) npy_bswap4_unaligned(x)
+
+#define _NPY_SWAP_INPLACE8(x) npy_bswap8_unaligned(x)
+
+#define _NPY_SWAP_INPLACE16(x) { \
+ char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \
+ a = (x)[1]; (x)[1] = (x)[14]; (x)[14] = a; \
+ a = (x)[2]; (x)[2] = (x)[13]; (x)[13] = a; \
+ a = (x)[3]; (x)[3] = (x)[12]; (x)[12] = a; \
+ a = (x)[4]; (x)[4] = (x)[11]; (x)[11] = a; \
+ a = (x)[5]; (x)[5] = (x)[10]; (x)[10] = a; \
+ a = (x)[6]; (x)[6] = (x)[9]; (x)[9] = a; \
+ a = (x)[7]; (x)[7] = (x)[8]; (x)[8] = a; \
+ }
+
+/************* STRIDED COPYING/SWAPPING SPECIALIZED FUNCTIONS *************/
+
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ * #elsize_half = 0, 1, 2, 4, 8#
+ * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint64#
+ */
+/**begin repeat1
+ * #oper = strided_to_strided, strided_to_contig,
+ * contig_to_strided, contig_to_contig#
+ * #src_contig = 0, 0, 1 ,1#
+ * #dst_contig = 0, 1, 0 ,1#
+ */
+/**begin repeat2
+ * #swap = _NPY_NOP, _NPY_NOP, _NPY_SWAP_INPLACE, _NPY_SWAP,
+ * _NPY_SWAP_INPLACE, _NPY_SWAP_PAIR#
+ * #prefix = , _aligned, _swap, _aligned_swap, _swap_pair, _aligned_swap_pair#
+ * #is_aligned = 0, 1, 0, 1, 0, 1#
+ * #minelsize = 1, 1, 2, 2, 4, 4#
+ * #is_swap = 0, 0, 1, 1, 2, 2#
+ */
+
+#if (@elsize@ >= @minelsize@) && \
+ (@elsize@ > 1 || @is_aligned@) && \
+ (!NPY_USE_UNALIGNED_ACCESS || @is_aligned@)
+
+
+#if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
+/*
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ */
+static void
+#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
+ NPY_GCC_UNROLL_LOOPS
+#endif
+@prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ NpyAuxData *NPY_UNUSED(data))
+{
+#if @is_aligned@
+ /* sanity check */
+ assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+ assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
+#endif
+ /*printf("fn @prefix@_@oper@_size@elsize@\n");*/
+ while (N > 0) {
+#if @is_aligned@
+
+ /* aligned copy and swap */
+# if @elsize@ != 16
+ (*((@type@ *)dst)) = @swap@@elsize@(*((@type@ *)src));
+# else
+# if @is_swap@ == 0
+ (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+ (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+# elif @is_swap@ == 1
+ (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+ (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+# elif @is_swap@ == 2
+ (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+ (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+# endif
+# endif
+
+#else
+
+ /* unaligned copy and swap */
+ memmove(dst, src, @elsize@);
+# if @is_swap@ == 1
+ @swap@@elsize@(dst);
+# elif @is_swap@ == 2
+ @swap@@elsize_half@(dst);
+ @swap@@elsize_half@(dst + @elsize_half@);
+# endif
+
+#endif
+
+#if @dst_contig@
+ dst += @elsize@;
+#else
+ dst += dst_stride;
+#endif
+
+#if @src_contig@
+ src += @elsize@;
+#else
+ src += src_stride;
+#endif
+
+ --N;
+ }
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0.
+ * Interestingly, unrolling here, as above, is only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (@src_contig@ == 0) && @is_aligned@
+static NPY_GCC_OPT_3 void
+@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
+ npy_intp dst_stride,
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ NpyAuxData *NPY_UNUSED(data))
+{
+#if @elsize@ != 16
+# if !(@elsize@ == 1 && @dst_contig@)
+ @type@ temp;
+# endif
+#else
+ npy_uint64 temp0, temp1;
+#endif
+ if (N == 0) {
+ return;
+ }
+#if @is_aligned@ && @elsize@ != 16
+ /* sanity check */
+ assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+ assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
+#endif
+#if @elsize@ == 1 && @dst_contig@
+ memset(dst, *src, N);
+#else
+
+# if @elsize@ != 16
+ temp = @swap@@elsize@(*((@type@ *)src));
+# else
+# if @is_swap@ == 0
+ temp0 = (*((npy_uint64 *)src));
+ temp1 = (*((npy_uint64 *)src + 1));
+# elif @is_swap@ == 1
+ temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+ temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+# elif @is_swap@ == 2
+ temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+ temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+# endif
+# endif
+
+ while (N > 0) {
+# if @elsize@ != 16
+ *((@type@ *)dst) = temp;
+# else
+ *((npy_uint64 *)dst) = temp0;
+ *((npy_uint64 *)dst + 1) = temp1;
+# endif
+# if @dst_contig@
+ dst += @elsize@;
+# else
+ dst += dst_stride;
+# endif
+ --N;
+ }
+#endif/* @elsize == 1 && @dst_contig@ -- else */
+}
+#endif/* (@src_contig@ == 0) && @is_aligned@ */
+
+#endif/* @elsize@ >= @minelsize@ */
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+static void
+_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ NpyAuxData *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memmove(dst, src, src_itemsize);
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_swap_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ NpyAuxData *NPY_UNUSED(data))
+{
+ char *a, *b, c;
+
+ while (N > 0) {
+ memmove(dst, src, src_itemsize);
+ /* general in-place swap */
+ a = dst;
+ b = dst + src_itemsize - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ NpyAuxData *NPY_UNUSED(data))
+{
+ char *a, *b, c;
+ npy_intp itemsize_half = src_itemsize / 2;
+
+ while (N > 0) {
+ memmove(dst, src, src_itemsize);
+ /* general in-place swap */
+ a = dst;
+ b = dst + itemsize_half - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ /* general in-place swap */
+ a = dst + itemsize_half;
+ b = dst + 2*itemsize_half - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp src_itemsize,
+ NpyAuxData *NPY_UNUSED(data))
+{
+ memmove(dst, src, src_itemsize*N);
+}
+
+
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
+PyArray_GetStridedCopyFn(int aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+ if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return
+ &_aligned_strided_to_contig_size@elsize@_srcstride0;
+/**end repeat**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ return &_contig_to_contig;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_strided_to_contig_size@elsize@;
+/**end repeat**/
+ }
+ }
+
+ return &_strided_to_strided;
+ }
+ /* general dst */
+ else {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return
+ &_aligned_strided_to_strided_size@elsize@_srcstride0;
+/**end repeat**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_contig_to_strided_size@elsize@;
+/**end repeat**/
+ }
+
+ return &_strided_to_strided;
+ }
+ else {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_strided_to_strided_size@elsize@;
+/**end repeat**/
+ }
+ }
+ }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+ }
+ else {
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ return &_contig_to_contig;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_strided_to_contig_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_strided_to_contig_size@elsize@;
+/**end repeat**/
+ }
+ }
+
+ return &_strided_to_strided;
+ }
+ /* general dst */
+ else {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_contig_to_strided_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_contig_to_strided_size@elsize@;
+/**end repeat**/
+ }
+
+ return &_strided_to_strided;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_strided_to_strided_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_strided_to_strided_size@elsize@;
+/**end repeat**/
+ }
+ }
+ }
+ }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ return &_strided_to_strided;
+}
+
+/*
+ * PyArray_GetStridedCopySwapFn and PyArray_GetStridedCopySwapPairFn are
+ * nearly identical, so can do a repeat for them.
+ */
+/**begin repeat
+ * #function = PyArray_GetStridedCopySwapFn, PyArray_GetStridedCopySwapPairFn#
+ * #tag = , _pair#
+ * #not_pair = 1, 0#
+ */
+
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
+@function@(int aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+ if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return
+ &_aligned_swap@tag@_strided_to_contig_size@elsize@_srcstride0;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_contig_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_strided_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+ /* general dst */
+ else {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return
+ &_aligned_swap@tag@_strided_to_strided_size@elsize@_srcstride0;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_contig_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+
+ return &_swap@tag@_strided_to_strided;
+ }
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_strided_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+ }
+ else {
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_contig_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_strided_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+
+ return &_swap@tag@_strided_to_strided;
+ }
+ /* general dst */
+ else {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_contig_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+
+ return &_swap@tag@_strided_to_strided;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_strided_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+ }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ return &_swap@tag@_strided_to_strided;
+}
+
+/**end repeat**/
+
+/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/
+
+/**begin repeat
+ *
+ * #NAME1 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name1 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type1 = npy_bool,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #rtype1 = npy_bool,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #is_bool1 = 1, 0*17#
+ * #is_half1 = 0*11, 1, 0*6#
+ * #is_float1 = 0*12, 1, 0, 0, 1, 0, 0#
+ * #is_double1 = 0*13, 1, 0, 0, 1, 0#
+ * #is_complex1 = 0*15, 1*3#
+ */
+
+/**begin repeat1
+ *
+ * #NAME2 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name2 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type2 = npy_bool,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #rtype2 = npy_bool,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #is_bool2 = 1, 0*17#
+ * #is_half2 = 0*11, 1, 0*6#
+ * #is_float2 = 0*12, 1, 0, 0, 1, 0, 0#
+ * #is_double2 = 0*13, 1, 0, 0, 1, 0#
+ * #is_complex2 = 0*15, 1*3#
+ */
+
+/**begin repeat2
+ * #prefix = _aligned,,_aligned_contig,_contig#
+ * #aligned = 1,0,1,0#
+ * #contig = 0,0,1,1#
+ */
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !@aligned@)
+
+/* For half types, don't use actual double/float types in conversion */
+#if @is_half1@ || @is_half2@
+
+# if @is_float1@
+# define _TYPE1 npy_uint32
+# elif @is_double1@
+# define _TYPE1 npy_uint64
+# else
+# define _TYPE1 @rtype1@
+# endif
+
+# if @is_float2@
+# define _TYPE2 npy_uint32
+# elif @is_double2@
+# define _TYPE2 npy_uint64
+# else
+# define _TYPE2 @rtype2@
+# endif
+
+#else
+
+#define _TYPE1 @rtype1@
+#define _TYPE2 @rtype2@
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if @is_half1@
+
+# if @is_float2@
+# define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+# elif @is_double2@
+# define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+# elif @is_half2@
+# define _CONVERT_FN(x) (x)
+# elif @is_bool2@
+# define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+# else
+# define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+# endif
+
+#elif @is_half2@
+
+# if @is_float1@
+# define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+# elif @is_double1@
+# define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+# else
+# define _CONVERT_FN(x) npy_float_to_half((float)x)
+# endif
+
+#else
+
+# if @is_bool2@ || @is_bool1@
+# define _CONVERT_FN(x) ((npy_bool)(x != 0))
+# else
+# define _CONVERT_FN(x) ((_TYPE2)x)
+# endif
+
+#endif
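+
+/*
+ * With the definitions above, half <-> float/double conversions operate
+ * directly on the integer bit patterns (npy_halfbits_to_floatbits etc.);
+ * other combinations involving half go through npy_half_to_float /
+ * npy_float_to_half.
+ */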
+
+static NPY_GCC_OPT_3 void
+@prefix@_cast_@name1@_to_@name2@(
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ NpyAuxData *NPY_UNUSED(data))
+{
+#if @is_complex1@
+ _TYPE1 src_value[2];
+#elif !@aligned@
+ _TYPE1 src_value;
+#endif
+#if @is_complex2@
+ _TYPE2 dst_value[2];
+#elif !@aligned@
+ _TYPE2 dst_value;
+#endif
+
+#if @aligned@
+ /* sanity check */
+ assert(N == 0 || npy_is_aligned(src, _ALIGN(_TYPE1)));
+ assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2)));
+#endif
+
+ /*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
+
+ while (N--) {
+#if @aligned@
+# if @is_complex1@
+ src_value[0] = ((_TYPE1 *)src)[0];
+ src_value[1] = ((_TYPE1 *)src)[1];
+# endif
+#else
+ memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if @is_complex1@
+# if @is_complex2@
+ dst_value[0] = _CONVERT_FN(src_value[0]);
+ dst_value[1] = _CONVERT_FN(src_value[1]);
+# elif !@aligned@
+# if @is_bool2@
+ dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+# else
+ dst_value = _CONVERT_FN(src_value[0]);
+# endif
+# else
+# if @is_bool2@
+ *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+# else
+ *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+# endif
+# endif
+#else
+# if @is_complex2@
+# if !@aligned@
+ dst_value[0] = _CONVERT_FN(src_value);
+# else
+ dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+# endif
+ dst_value[1] = 0;
+# elif !@aligned@
+ dst_value = _CONVERT_FN(src_value);
+# else
+ *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+# endif
+#endif
+
+#if @aligned@
+# if @is_complex2@
+ ((_TYPE2 *)dst)[0] = dst_value[0];
+ ((_TYPE2 *)dst)[1] = dst_value[1];
+# endif
+#else
+ memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if @contig@
+ dst += sizeof(@type2@);
+ src += sizeof(@type1@);
+#else
+ dst += dst_stride;
+ src += src_stride;
+#endif
+ }
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+/**end repeat2**/
+
+/**end repeat1**/
+
+/**end repeat**/
+
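+/*
+ * Note: this returns NULL when either type number is not one of the builtin
+ * numeric types enumerated below; callers must handle that case.
+ */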
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
+PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
+ npy_intp dst_stride,
+ int src_type_num, int dst_type_num)
+{
+ switch (src_type_num) {
+/**begin repeat
+ *
+ * #NAME1 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name1 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type1 = npy_bool,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+
+ case NPY_@NAME1@:
+ /*printf("test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/
+ switch (dst_type_num) {
+/**begin repeat1
+ *
+ * #NAME2 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name2 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type2 = npy_bool,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+
+ case NPY_@NAME2@:
+ /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
+# if NPY_USE_UNALIGNED_ACCESS
+ if (src_stride == sizeof(@type1@) &&
+ dst_stride == sizeof(@type2@)) {
+ return &_aligned_contig_cast_@name1@_to_@name2@;
+ }
+ else {
+ return &_aligned_cast_@name1@_to_@name2@;
+ }
+# else
+ if (src_stride == sizeof(@type1@) &&
+ dst_stride == sizeof(@type2@)) {
+ return aligned ?
+ &_aligned_contig_cast_@name1@_to_@name2@ :
+ &_contig_cast_@name1@_to_@name2@;
+ }
+ else {
+ return aligned ? &_aligned_cast_@name1@_to_@name2@ :
+ &_cast_@name1@_to_@name2@;
+ }
+# endif
+
+/**end repeat1**/
+ }
+ /*printf("switched test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/
+
+/**end repeat**/
+ }
+
+ return NULL;
+}
+
+
+/****************** PRIMITIVE FLAT TO/FROM NDIM FUNCTIONS ******************/
+
+/* See documentation of arguments in lowlevel_strided_loops.h */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferNDimToStrided(npy_intp ndim,
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp *src_strides, npy_intp src_strides_inc,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_StridedUnaryOp *stransfer,
+ NpyAuxData *data)
+{
+ npy_intp i, M, N, coord0, shape0, src_stride0, coord1, shape1, src_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ src_stride0 = src_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride, src, src_stride0, count, src_itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride, src, src_stride0, N, src_itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ src_stride1 = (src_strides + src_strides_inc)[0];
+ src = src - coord0*src_stride0 + src_stride1;
+ dst += N*dst_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ src += src_stride1;
+ dst += shape0*dst_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, src_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ src_strides += 2*src_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].src_stride = src_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ src_strides += src_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the src pointer from the dimension 0 and 1 loop */
+ src = src - shape1*src_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ src += it[i].src_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ src -= it[i].src_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ src += src_stride1;
+ dst += shape0*dst_stride;
+ }
+ }
+ }
+}
+
+/* See documentation of arguments in lowlevel_strided_loops.h */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferStridedToNDim(npy_intp ndim,
+ char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc,
+ char *src, npy_intp src_stride,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_StridedUnaryOp *stransfer,
+ NpyAuxData *data)
+{
+ npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ dst_stride0 = dst_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride0, src, src_stride, count, src_itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride0, src, src_stride, N, src_itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ dst_stride1 = (dst_strides + dst_strides_inc)[0];
+ dst = dst - coord0*dst_stride0 + dst_stride1;
+ src += N*src_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, dst_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ dst_strides += 2*dst_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].dst_stride = dst_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ dst_strides += dst_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the dst pointer from the dimension 0 and 1 loop */
+ dst = dst - shape1*dst_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ dst += it[i].dst_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ dst -= it[i].dst_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ }
+ }
+ }
+}
+
+/* See documentation of arguments in lowlevel_strided_loops.h */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
+ char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc,
+ char *src, npy_intp src_stride,
+ npy_uint8 *mask, npy_intp mask_stride,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_MaskedStridedUnaryOp *stransfer,
+ NpyAuxData *data)
+{
+ npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ dst_stride0 = dst_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride0,
+ src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride0,
+ src, src_stride,
+ mask, mask_stride,
+ N, src_itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ dst_stride1 = (dst_strides + dst_strides_inc)[0];
+ dst = dst - coord0*dst_stride0 + dst_stride1;
+ src += N*src_stride;
+ mask += N*mask_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0,
+ src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0,
+ src, src_stride,
+ mask, mask_stride,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ mask += shape0*mask_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, dst_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ dst_strides += 2*dst_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].dst_stride = dst_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ dst_strides += dst_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the dst pointer from the dimension 0 and 1 loop */
+ dst = dst - shape1*dst_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ dst += it[i].dst_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ dst -= it[i].dst_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0,
+ src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0,
+ src, src_stride,
+ mask, mask_stride,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ mask += shape0*mask_stride;
+ }
+ }
+ }
+}
+
+
+/***************************************************************************/
+/****************** MapIter (Advanced indexing) Get/Set ********************/
+/***************************************************************************/
+
+/**begin repeat
+ * #name = set, get#
+ * #isget = 0, 1#
+ */
+
+/*
+ * Advanced indexing iteration of arrays when there is a single indexing
+ * array which has the same memory order as the value array and both
+ * can be trivially iterated (single stride, aligned, no casting necessary).
+ */
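+/*
+ * In Python terms this corresponds roughly to result = self[ind] (get) or
+ * self[ind] = result (set), with a single one-dimensional integer index
+ * array.
+ */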
+NPY_NO_EXPORT int
+mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
+ PyArrayObject *result)
+{
+ char *base_ptr, *ind_ptr, *result_ptr;
+ npy_intp self_stride, ind_stride, result_stride;
+ npy_intp fancy_dim = PyArray_DIM(self, 0);
+
+ npy_intp itersize;
+
+ int is_aligned = IsUintAligned(self) && IsUintAligned(result);
+ int needs_api = PyDataType_REFCHK(PyArray_DESCR(self));
+
+ PyArray_CopySwapFunc *copyswap = PyArray_DESCR(self)->f->copyswap;
+ NPY_BEGIN_THREADS_DEF;
+
+ base_ptr = PyArray_BYTES(self);
+ self_stride = PyArray_STRIDE(self, 0);
+
+ PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(ind, result, itersize,
+ ind_ptr, result_ptr,
+ ind_stride, result_stride)
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS_THRESHOLDED(PyArray_SIZE(ind));
+ }
+#if !@isget@
+ /* Check the indices beforehand */
+ while (itersize--) {
+ npy_intp indval = *((npy_intp*)ind_ptr);
+ if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+ return -1;
+ }
+ ind_ptr += ind_stride;
+ }
+
+ /*
+     * Reset ind_ptr and itersize; due to broadcasting, itersize is always
+     * the size of ind.
+ */
+ ind_ptr = PyArray_BYTES(ind);
+ itersize = PyArray_SIZE(ind);
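+
+    /*
+     * The indices were fully validated above, so the main loop below only
+     * needs to wrap negative values.
+     */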
+#endif
+
+ /* Optimization for aligned types that do not need the api */
+ switch ((is_aligned && !needs_api) ? PyArray_ITEMSIZE(self) : 0) {
+
+/**begin repeat1
+ * #elsize = 1, 2, 4, 8, 0#
+ * #copytype = npy_uint8, npy_uint16, npy_uint32, npy_uint64, 0#
+ */
+
+#if @elsize@
+ case @elsize@:
+#else
+ default:
+#endif
+ while (itersize--) {
+ char * self_ptr;
+ npy_intp indval = *((npy_intp*)ind_ptr);
+ assert(npy_is_aligned(ind_ptr, _UINT_ALIGN(npy_intp)));
+#if @isget@
+ if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+ return -1;
+ }
+#else
+ if (indval < 0) {
+ indval += fancy_dim;
+ }
+#endif
+ self_ptr = base_ptr + indval * self_stride;
+
+#if @isget@
+#if @elsize@
+ assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+ assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
+ *(@copytype@ *)result_ptr = *(@copytype@ *)self_ptr;
+#else
+ copyswap(result_ptr, self_ptr, 0, self);
+#endif
+
+#else /* !@isget@ */
+#if @elsize@
+ assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+ assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
+ *(@copytype@ *)self_ptr = *(@copytype@ *)result_ptr;
+#else
+ copyswap(self_ptr, result_ptr, 0, self);
+#endif
+#endif
+
+ ind_ptr += ind_stride;
+ result_ptr += result_stride;
+ }
+ break;
+
+/**end repeat1**/
+ }
+
+ NPY_END_THREADS;
+
+ return 0;
+}
+
+
+/*
+ * General advanced indexing iteration.
+ */
+NPY_NO_EXPORT int
+mapiter_@name@(PyArrayMapIterObject *mit)
+{
+ npy_intp *counter, count;
+ int i, is_aligned;
+
+ /* Cached mit info */
+ int numiter = mit->numiter;
+ int needs_api = mit->needs_api;
+ /* Constant information */
+ npy_intp fancy_dims[NPY_MAXDIMS];
+ npy_intp fancy_strides[NPY_MAXDIMS];
+#if @isget@
+ int iteraxis;
+#endif
+
+ char *baseoffset = mit->baseoffset;
+ char **outer_ptrs = mit->outer_ptrs;
+ npy_intp *outer_strides = mit->outer_strides;
+ PyArrayObject *array= mit->array;
+
+ /* Fill constant information */
+#if @isget@
+ iteraxis = mit->iteraxes[0];
+#endif
+ for (i = 0; i < numiter; i++) {
+ fancy_dims[i] = mit->fancy_dims[i];
+ fancy_strides[i] = mit->fancy_strides[i];
+ }
+
+ /*
+     * Alignment information (swapping is never needed, since we buffer);
+     * we could also check whether extra_op is buffered, but it rarely matters.
+ */
+
+ is_aligned = IsUintAligned(array) && IsUintAligned(mit->extra_op);
+
+ if (mit->size == 0) {
+ return 0;
+ }
+
+ if (mit->subspace_iter == NULL) {
+ /*
+         * Item-by-item copy situation; the operand is buffered,
+         * so use copyswap.
+ */
+ PyArray_CopySwapFunc *copyswap = PyArray_DESCR(array)->f->copyswap;
+
+ /* We have only one iterator handling everything */
+ counter = NpyIter_GetInnerLoopSizePtr(mit->outer);
+
+ /************ Optimized inner loops without subspace *************/
+
+/**begin repeat1
+ * #one_iter = 1, 0#
+ * #numiter = 1, numiter#
+ */
+
+#if @one_iter@
+ if (numiter == 1) {
+#else
+ else {
+#endif
+ NPY_BEGIN_THREADS_DEF;
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ /* Optimization for aligned types that do not need the api */
+ switch ((is_aligned && !needs_api) ? PyArray_ITEMSIZE(array) : 0) {
+
+/**begin repeat2
+ * #elsize = 1, 2, 4, 8, 0#
+ * #copytype = npy_uint8, npy_uint16, npy_uint32, npy_uint64, 0#
+ */
+
+#if @elsize@
+ case @elsize@:
+#else
+ default:
+#endif
+ /* Outer iteration (safe because mit->size != 0) */
+ do {
+#if !@isget@
+ /*
+ * When the API is needed the casting might fail
+ * TODO: (only if buffering is enabled).
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
+#endif
+ count = *counter;
+ while (count--) {
+ char * self_ptr = baseoffset;
+ for (i=0; i < @numiter@; i++) {
+ npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+ assert(npy_is_aligned(outer_ptrs[i],
+ _UINT_ALIGN(npy_intp)));
+
+#if @isget@ && @one_iter@
+ if (check_and_adjust_index(&indval, fancy_dims[i],
+ iteraxis, _save) < 0 ) {
+ return -1;
+ }
+#else
+ if (indval < 0) {
+ indval += fancy_dims[i];
+ }
+#endif
+ self_ptr += indval * fancy_strides[i];
+
+ /* advance indexing arrays */
+ outer_ptrs[i] += outer_strides[i];
+ }
+
+#if @isget@
+#if @elsize@
+ assert(npy_is_aligned(outer_ptrs[i],
+ _UINT_ALIGN(@copytype@)));
+ assert(npy_is_aligned(self_ptr,
+ _UINT_ALIGN(@copytype@)));
+ *(@copytype@ *)(outer_ptrs[i]) = *(@copytype@ *)self_ptr;
+#else
+ copyswap(outer_ptrs[i], self_ptr, 0, array);
+#endif
+#else /* !@isget@ */
+#if @elsize@
+ assert(npy_is_aligned(outer_ptrs[i],
+ _UINT_ALIGN(@copytype@)));
+ assert(npy_is_aligned(self_ptr,
+ _UINT_ALIGN(@copytype@)));
+ *(@copytype@ *)self_ptr = *(@copytype@ *)(outer_ptrs[i]);
+#else
+ copyswap(self_ptr, outer_ptrs[i], 0, array);
+#endif
+#endif
+ /* advance extra operand */
+ outer_ptrs[i] += outer_strides[i];
+ }
+ } while (mit->outer_next(mit->outer));
+
+ break;
+
+/**end repeat2**/
+ }
+ NPY_END_THREADS;
+ }
+/**end repeat1**/
+ }
+
+ /******************* Nested Iteration Situation *******************/
+ else {
+ char *subspace_baseptrs[2];
+ char **subspace_ptrs = mit->subspace_ptrs;
+ npy_intp *subspace_strides = mit->subspace_strides;
+ int is_subiter_trivial = 0; /* has three states */
+ npy_intp reset_offsets[2] = {0, 0};
+
+ /* Use strided transfer functions for the inner loop */
+ PyArray_StridedUnaryOp *stransfer = NULL;
+ NpyAuxData *transferdata = NULL;
+ npy_intp fixed_strides[2];
+
+#if @isget@
+ npy_intp src_itemsize = PyArray_ITEMSIZE(array);
+#else
+ npy_intp src_itemsize = PyArray_ITEMSIZE(mit->extra_op);
+#endif
+
+ /*
+ * Get a dtype transfer function, since there are no
+ * buffers, this is safe.
+ */
+ NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides);
+
+ if (PyArray_GetDTypeTransferFunction(is_aligned,
+#if @isget@
+ fixed_strides[0], fixed_strides[1],
+ PyArray_DESCR(array), PyArray_DESCR(mit->extra_op),
+#else
+ fixed_strides[1], fixed_strides[0],
+ PyArray_DESCR(mit->extra_op), PyArray_DESCR(array),
+#endif
+ 0,
+ &stransfer, &transferdata,
+ &needs_api) != NPY_SUCCEED) {
+ return -1;
+ }
+
+ counter = NpyIter_GetInnerLoopSizePtr(mit->subspace_iter);
+ if (*counter == PyArray_SIZE(mit->subspace)) {
+ /*
+             * The subspace is trivially iterable;
+             * manipulate pointers to avoid expensive resetting.
+ */
+ is_subiter_trivial = 1;
+ }
+/**begin repeat1
+ * #one_iter = 1, 0#
+ * #numiter = 1, numiter#
+ */
+
+#if @one_iter@
+ if (numiter == 1) {
+#else
+ else {
+#endif
+ NPY_BEGIN_THREADS_DEF;
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ /* Outer iteration (safe because mit->size != 0) */
+ do {
+ char * self_ptr = baseoffset;
+ for (i=0; i < @numiter@; i++) {
+ npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+
+#if @isget@ && @one_iter@
+ if (check_and_adjust_index(&indval, fancy_dims[i],
+ iteraxis, _save) < 0 ) {
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
+#else
+ if (indval < 0) {
+ indval += fancy_dims[i];
+ }
+#endif
+
+ self_ptr += indval * fancy_strides[i];
+ }
+
+ /*
+ * Resetting is slow, so try to avoid resetting
+ * if subspace iteration is trivial.
+ * Watch out: reset_offsets are kept outside of the loop,
+ * assuming the subspaces of different external iterations
+ * share the same structure.
+ */
+ if (is_subiter_trivial <= 1) {
+ /* slower resetting: first iteration or non-trivial subspace */
+
+ char * errmsg = NULL;
+ subspace_baseptrs[0] = self_ptr;
+ subspace_baseptrs[1] = mit->extra_op_ptrs[0];
+
+ /* (can't really fail, since no buffering necessary) */
+ if (!NpyIter_ResetBasePointers(mit->subspace_iter,
+ subspace_baseptrs,
+ &errmsg)) {
+ NPY_END_THREADS;
+ PyErr_SetString(PyExc_ValueError, errmsg);
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
+ if (is_subiter_trivial != 0) {
+ /* reset_offsets are nonzero for negative strides.*/
+ reset_offsets[0] = subspace_ptrs[0] - self_ptr;
+ reset_offsets[1] = subspace_ptrs[1] - mit->extra_op_ptrs[0];
+
+ /* use the faster adjustment further on */
+ is_subiter_trivial ++;
+ }
+ }
+ else {
+ /*
+ * faster resetting if the subspace iteration is trivial.
+ * reset_offsets are zero for positive strides,
+ * for negative strides this shifts the pointer to the last
+ * item.
+ */
+ subspace_ptrs[0] = self_ptr + reset_offsets[0];
+ subspace_ptrs[1] = mit->extra_op_ptrs[0] + reset_offsets[1];
+ }
+
+#if !@isget@
+ /*
+ * When the API is needed the casting might fail
+ * TODO: Could only check if casting is unsafe, or even just
+ * not at all...
+ */
+ if (needs_api && PyErr_Occurred()) {
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
+#endif
+
+ do {
+
+#if @isget@
+ stransfer(subspace_ptrs[1], subspace_strides[1],
+ subspace_ptrs[0], subspace_strides[0],
+ *counter, src_itemsize, transferdata);
+#else
+ stransfer(subspace_ptrs[0], subspace_strides[0],
+ subspace_ptrs[1], subspace_strides[1],
+ *counter, src_itemsize, transferdata);
+#endif
+ } while (mit->subspace_next(mit->subspace_iter));
+
+ mit->extra_op_next(mit->extra_op_iter);
+ } while (mit->outer_next(mit->outer));
+ NPY_END_THREADS;
+ }
+/**end repeat1**/
+
+ NPY_AUXDATA_FREE(transferdata);
+ }
+ return 0;
+}
+
+/**end repeat**/
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/nditer_templ.c.src b/contrib/python/numpy/py2/numpy/core/src/multiarray/nditer_templ.c.src
new file mode 100644
index 00000000000..0f0d5997230
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/nditer_templ.c.src
@@ -0,0 +1,615 @@
+/*
+ * This file implements the API functions for NumPy's nditer that
+ * are specialized using the templating system.
+ *
+ * Copyright (c) 2010-2011 by Mark Wiebe ([email protected])
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+/* Indicate that this .c file is allowed to include the header */
+#define NPY_ITERATOR_IMPLEMENTATION_CODE
+#include "nditer_impl.h"
+
+/* SPECIALIZED iternext functions that handle the non-buffering part */
+
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_EXLOOP,
+ * NPY_ITFLAG_RANGE,
+ * NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX#
+ * #tag_itflags = 0, IND, NOINN, RNG, RNGuIND#
+ */
+/**begin repeat1
+ * #const_ndim = 1, 2, NPY_MAXDIMS#
+ * #tag_ndim = 1, 2, ANY#
+ */
+/**begin repeat2
+ * #const_nop = 1, 2, NPY_MAXDIMS#
+ * #tag_nop = 1, 2, ANY#
+ */
+
+/* Specialized iternext (@const_itflags@,@tag_ndim@,@tag_nop@) */
+static int
+npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_nop@(
+ NpyIter *iter)
+{
+#if !(@const_itflags@&NPY_ITFLAG_EXLOOP) || (@const_ndim@ > 1)
+ const npy_uint32 itflags = @const_itflags@;
+# if @const_ndim@ >= NPY_MAXDIMS
+ int idim, ndim = NIT_NDIM(iter);
+# endif
+# if @const_nop@ < NPY_MAXDIMS
+ const int nop = @const_nop@;
+# else
+ int nop = NIT_NOP(iter);
+# endif
+
+ NpyIter_AxisData *axisdata0;
+ npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if @const_ndim@ > 1
+ NpyIter_AxisData *axisdata1;
+ npy_intp sizeof_axisdata;
+#endif
+#if @const_ndim@ > 2
+ NpyIter_AxisData *axisdata2;
+#endif
+
+#if (@const_itflags@&NPY_ITFLAG_RANGE)
+ /* When ranged iteration is enabled, use the iterindex */
+ if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+ return 0;
+ }
+#endif
+
+#if @const_ndim@ > 1
+ sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+# if !(@const_itflags@&NPY_ITFLAG_EXLOOP) || (@const_ndim@ > 1)
+ axisdata0 = NIT_AXISDATA(iter);
+# endif
+# if !(@const_itflags@&NPY_ITFLAG_EXLOOP)
+ /* Increment index 0 */
+ NAD_INDEX(axisdata0)++;
+ /* Increment pointer 0 */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+ }
+# endif
+
+#if @const_ndim@ == 1
+
+# if !(@const_itflags@&NPY_ITFLAG_EXLOOP)
+ /* Finished when the index equals the shape */
+ return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+# else
+ return 0;
+# endif
+
+#else
+
+# if !(@const_itflags@&NPY_ITFLAG_EXLOOP)
+ if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+ return 1;
+ }
+# endif
+
+ axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+ /* Increment index 1 */
+ NAD_INDEX(axisdata1)++;
+ /* Increment pointer 1 */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+ }
+
+ if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+ /* Reset the 1st index to 0 */
+ NAD_INDEX(axisdata0) = 0;
+ /* Reset the 1st pointer to the value of the 2nd */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+ }
+ return 1;
+ }
+
+# if @const_ndim@ == 2
+ return 0;
+# else
+
+ axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+ /* Increment index 2 */
+ NAD_INDEX(axisdata2)++;
+ /* Increment pointer 2 */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+ }
+
+ if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+ /* Reset the 1st and 2nd indices to 0 */
+ NAD_INDEX(axisdata0) = 0;
+ NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+ NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+ }
+ return 1;
+ }
+
+ for (idim = 3; idim < ndim; ++idim) {
+ NIT_ADVANCE_AXISDATA(axisdata2, 1);
+ /* Increment the index */
+ NAD_INDEX(axisdata2)++;
+ /* Increment the pointer */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+ }
+
+
+ if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+ /* Reset the indices and pointers of all previous axisdatas */
+ axisdata1 = axisdata2;
+ do {
+ NIT_ADVANCE_AXISDATA(axisdata1, -1);
+ /* Reset the index to 0 */
+ NAD_INDEX(axisdata1) = 0;
+ /* Reset the pointer to the updated value */
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ NAD_PTRS(axisdata1)[istrides] =
+ NAD_PTRS(axisdata2)[istrides];
+ }
+ } while (axisdata1 != axisdata0);
+
+ return 1;
+ }
+ }
+
+ return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
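+/*
+ * The repeats above generate one specialized iternext function per
+ * (itflags, ndim, nop) combination listed; NpyIter_GetIterNext below selects
+ * among them at runtime.
+ */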
+
+/**begin repeat
+ * #const_nop = 1, 2, 3, 4, NPY_MAXDIMS#
+ * #tag_nop = 1, 2, 3, 4, ANY#
+ */
+
+/*
+ * Iternext function that handles the reduction buffering part. This
+ * is done with a double loop to avoid frequent re-buffering.
+ */
+static int
+npyiter_buffered_reduce_iternext_iters@tag_nop@(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ /*int ndim = NIT_NDIM(iter);*/
+#if @const_nop@ >= NPY_MAXDIMS
+ int nop = NIT_NOP(iter);
+#else
+ const int nop = @const_nop@;
+#endif
+
+ int iop;
+
+ NpyIter_AxisData *axisdata;
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+ char **ptrs;
+ char *prev_dataptrs[NPY_MAXARGS];
+
+ ptrs = NBF_PTRS(bufferdata);
+
+ /*
+ * If the iterator handles the inner loop, need to increment all
+ * the indices and pointers
+ */
+ if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+ /* Increment within the buffer */
+ if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+ npy_intp *strides;
+
+ strides = NBF_STRIDES(bufferdata);
+ for (iop = 0; iop < nop; ++iop) {
+ ptrs[iop] += strides[iop];
+ }
+ return 1;
+ }
+ }
+ else {
+ NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);
+ }
+
+ NPY_IT_DBG_PRINT1("Iterator: Finished iteration %d of outer reduce loop\n",
+ (int)NBF_REDUCE_POS(bufferdata));
+ /* The outer increment for the reduce double loop */
+ if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+ npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+ char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+ for (iop = 0; iop < nop; ++iop) {
+ char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop];
+ ptrs[iop] = ptr;
+ reduce_outerptrs[iop] = ptr;
+ }
+ NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+ return 1;
+ }
+
+ /* Save the previously used data pointers */
+ axisdata = NIT_AXISDATA(iter);
+ memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
+
+ /* Write back to the arrays */
+ npyiter_copy_from_buffers(iter);
+
+ /* Check if we're past the end */
+ if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+ NBF_SIZE(bufferdata) = 0;
+ return 0;
+ }
+ /* Increment to the next buffer */
+ else {
+ npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+ }
+
+ /* Prepare the next buffers and set iterend/size */
+ npyiter_copy_to_buffers(iter, prev_dataptrs);
+
+ return 1;
+}
+
+/**end repeat**/
+
+/* iternext function that handles the buffering part */
+static int
+npyiter_buffered_iternext(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ /*int ndim = NIT_NDIM(iter);*/
+ int nop = NIT_NOP(iter);
+
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+ /*
+ * If the iterator handles the inner loop, need to increment all
+ * the indices and pointers
+ */
+ if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+ /* Increment within the buffer */
+ if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+ int iop;
+ npy_intp *strides;
+ char **ptrs;
+
+ strides = NBF_STRIDES(bufferdata);
+ ptrs = NBF_PTRS(bufferdata);
+ for (iop = 0; iop < nop; ++iop) {
+ ptrs[iop] += strides[iop];
+ }
+ return 1;
+ }
+ }
+ else {
+ NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);
+ }
+
+ /* Write back to the arrays */
+ npyiter_copy_from_buffers(iter);
+
+ /* Check if we're past the end */
+ if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+ NBF_SIZE(bufferdata) = 0;
+ return 0;
+ }
+ /* Increment to the next buffer */
+ else {
+ npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+ }
+
+ /* Prepare the next buffers and set iterend/size */
+ npyiter_copy_to_buffers(iter, NULL);
+
+ return 1;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+/* Specialization of iternext for when the iteration size is 1 */
+static int
+npyiter_iternext_sizeone(NpyIter *iter)
+{
+ return 0;
+}
+
+/*NUMPY_API
+ * Compute the specialized iteration function for an iterator
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT NpyIter_IterNextFunc *
+NpyIter_GetIterNext(NpyIter *iter, char **errmsg)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ int ndim = NIT_NDIM(iter);
+ int nop = NIT_NOP(iter);
+
+ if (NIT_ITERSIZE(iter) < 0) {
+ if (errmsg == NULL) {
+ PyErr_SetString(PyExc_ValueError, "iterator is too large");
+ }
+ else {
+ *errmsg = "iterator is too large";
+ }
+ return NULL;
+ }
+
+ /*
+     * When there is just one iteration and buffering is disabled,
+     * the iternext function is very simple.
+ */
+ if (itflags&NPY_ITFLAG_ONEITERATION) {
+ return &npyiter_iternext_sizeone;
+ }
+
+ /*
+     * If buffering is enabled, use one of the buffered iternext functions.
+ */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ if (itflags&NPY_ITFLAG_REDUCE) {
+ switch (nop) {
+ case 1:
+ return &npyiter_buffered_reduce_iternext_iters1;
+ case 2:
+ return &npyiter_buffered_reduce_iternext_iters2;
+ case 3:
+ return &npyiter_buffered_reduce_iternext_iters3;
+ case 4:
+ return &npyiter_buffered_reduce_iternext_iters4;
+ default:
+ return &npyiter_buffered_reduce_iternext_itersANY;
+ }
+ }
+ else {
+ return &npyiter_buffered_iternext;
+ }
+ }
+
+ /*
+ * Ignore all the flags that don't affect the iterator memory
+ * layout or the iternext function. Currently only HASINDEX,
+ * EXLOOP, and RANGE affect them here.
+ */
+ itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_EXLOOP|NPY_ITFLAG_RANGE);
+
+ /* Switch statements let the compiler optimize this most effectively */
+ switch (itflags) {
+ /*
+ * The combinations HASINDEX|EXLOOP and RANGE|EXLOOP are excluded
+ * by the New functions
+ */
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_EXLOOP,
+ * NPY_ITFLAG_RANGE,
+ * NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX#
+ * #tag_itflags = 0, IND, NOINN, RNG, RNGuIND#
+ */
+ case @const_itflags@:
+ switch (ndim) {
+/**begin repeat1
+ * #const_ndim = 1, 2#
+ * #tag_ndim = 1, 2#
+ */
+ case @const_ndim@:
+ switch (nop) {
+/**begin repeat2
+ * #const_nop = 1, 2#
+ * #tag_nop = 1, 2#
+ */
+ case @const_nop@:
+ return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_nop@;
+/**end repeat2**/
+ /* Not specialized on nop */
+ default:
+ return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_itersANY;
+ }
+/**end repeat1**/
+ /* Not specialized on ndim */
+ default:
+ switch (nop) {
+/**begin repeat1
+ * #const_nop = 1, 2#
+ * #tag_nop = 1, 2#
+ */
+ case @const_nop@:
+ return &npyiter_iternext_itflags@tag_itflags@_dimsANY_iters@tag_nop@;
+/**end repeat1**/
+ /* Not specialized on nop */
+ default:
+ return &npyiter_iternext_itflags@tag_itflags@_dimsANY_itersANY;
+ }
+ }
+/**end repeat**/
+ }
+ /* The switch above should have caught all the possibilities. */
+ if (errmsg == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "GetIterNext internal iterator error - unexpected "
+ "itflags/ndim/nop combination (%04x/%d/%d)",
+ (int)itflags, (int)ndim, (int)nop);
+ }
+ else {
+ *errmsg = "GetIterNext internal iterator error - unexpected "
+ "itflags/ndim/nop combination";
+ }
+ return NULL;
+}
+
+
+/* SPECIALIZED getindex functions */
+
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP,
+ * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF#
+ */
+static void
+npyiter_get_multi_index_itflags@tag_itflags@(
+ NpyIter *iter, npy_intp *out_multi_index)
+{
+ const npy_uint32 itflags = @const_itflags@;
+ int idim, ndim = NIT_NDIM(iter);
+ int nop = NIT_NOP(iter);
+
+ npy_intp sizeof_axisdata;
+ NpyIter_AxisData *axisdata;
+#if !((@const_itflags@)&NPY_ITFLAG_IDENTPERM)
+ npy_int8 *perm = NIT_PERM(iter);
+#endif
+
+ axisdata = NIT_AXISDATA(iter);
+ sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#if ((@const_itflags@)&NPY_ITFLAG_IDENTPERM)
+ out_multi_index += ndim-1;
+ for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+ NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+ *out_multi_index = NAD_INDEX(axisdata);
+ }
+#elif !((@const_itflags@)&NPY_ITFLAG_NEGPERM)
+ for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+ npy_int8 p = perm[idim];
+ out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+ }
+#else
+ for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+ npy_int8 p = perm[idim];
+ if (p < 0) {
+ /* If the perm entry is negative, reverse the index */
+ out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+ }
+ else {
+ out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+ }
+ }
+#endif /* not ident perm */
+}
+/**end repeat**/
+
+/*NUMPY_API
+ * Compute a specialized get_multi_index function for the iterator
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT NpyIter_GetMultiIndexFunc *
+NpyIter_GetGetMultiIndex(NpyIter *iter, char **errmsg)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ int ndim = NIT_NDIM(iter);
+ int nop = NIT_NOP(iter);
+
+ /* These flags must be correct */
+ if ((itflags&(NPY_ITFLAG_HASMULTIINDEX|NPY_ITFLAG_DELAYBUF)) !=
+ NPY_ITFLAG_HASMULTIINDEX) {
+ if (!(itflags&NPY_ITFLAG_HASMULTIINDEX)) {
+ if (errmsg == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot retrieve a GetMultiIndex function for an "
+ "iterator that doesn't track a multi-index.");
+ }
+ else {
+ *errmsg = "Cannot retrieve a GetMultiIndex function for an "
+ "iterator that doesn't track a multi-index.";
+ }
+ return NULL;
+ }
+ else {
+ if (errmsg == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot retrieve a GetMultiIndex function for an "
+ "iterator that used DELAY_BUFALLOC before a Reset call");
+ }
+ else {
+ *errmsg = "Cannot retrieve a GetMultiIndex function for an "
+ "iterator that used DELAY_BUFALLOC before a "
+ "Reset call";
+ }
+ return NULL;
+ }
+ }
+
+ /*
+ * Only these flags affect the iterator memory layout or
+ * the get_multi_index behavior. IDENTPERM and NEGPERM are mutually
+ * exclusive, so that reduces the number of cases slightly.
+ */
+ itflags &= (NPY_ITFLAG_HASINDEX |
+ NPY_ITFLAG_IDENTPERM |
+ NPY_ITFLAG_NEGPERM |
+ NPY_ITFLAG_BUFFER);
+
+ switch (itflags) {
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP,
+ * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF#
+ */
+ case @const_itflags@:
+ return npyiter_get_multi_index_itflags@tag_itflags@;
+/**end repeat**/
+ }
+ /* The switch above should have caught all the possibilities. */
+ if (errmsg == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "GetGetMultiIndex internal iterator error - unexpected "
+ "itflags/ndim/nop combination (%04x/%d/%d)",
+ (int)itflags, (int)ndim, (int)nop);
+ }
+ else {
+ *errmsg = "GetGetMultiIndex internal iterator error - unexpected "
+ "itflags/ndim/nop combination";
+ }
+ return NULL;
+
+}
+
+#undef NPY_ITERATOR_IMPLEMENTATION_CODE
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/refcount.h b/contrib/python/numpy/py2/numpy/core/src/multiarray/refcount.h
new file mode 100644
index 00000000000..761d53dd0d7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/refcount.h
@@ -0,0 +1,19 @@
+#ifndef _NPY_PRIVATE_REFCOUNT_H_
+#define _NPY_PRIVATE_REFCOUNT_H_
+
+NPY_NO_EXPORT void
+PyArray_Item_INCREF(char *data, PyArray_Descr *descr);
+
+NPY_NO_EXPORT void
+PyArray_Item_XDECREF(char *data, PyArray_Descr *descr);
+
+NPY_NO_EXPORT int
+PyArray_INCREF(PyArrayObject *mp);
+
+NPY_NO_EXPORT int
+PyArray_XDECREF(PyArrayObject *mp);
+
+NPY_NO_EXPORT void
+PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj);
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/multiarray/scalartypes.c.src b/contrib/python/numpy/py2/numpy/core/src/multiarray/scalartypes.c.src
new file mode 100644
index 00000000000..52de31289d0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/multiarray/scalartypes.c.src
@@ -0,0 +1,4496 @@
+/* -*- c -*- */
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#ifndef _MULTIARRAYMODULE
+#define _MULTIARRAYMODULE
+#endif
+
+#include "numpy/arrayobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "numpy/arrayscalars.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_config.h"
+#include "mapping.h"
+#include "ctors.h"
+#include "usertypes.h"
+#include "numpyos.h"
+#include "common.h"
+#include "scalartypes.h"
+#include "_datetime.h"
+#include "datetime_strings.h"
+#include "alloc.h"
+#include "npy_import.h"
+#include "dragon4.h"
+#include "npy_longdouble.h"
+#include "buffer.h"
+
+#include <stdlib.h>
+
+#include "binop_override.h"
+
+NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[] = {
+ {PyObject_HEAD_INIT(&PyBoolArrType_Type) 0},
+ {PyObject_HEAD_INIT(&PyBoolArrType_Type) 1},
+};
+
+/* TimeInteger is deleted, but still here to fill the API slot */
+NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type;
+
+/*
+ * Inheritance is established later when tp_bases is set (or tp_base for
+ * single inheritance)
+ */
+
+/**begin repeat
+ * #name = number, integer, signedinteger, unsignedinteger, inexact,
+ * floating, complexfloating, flexible, character#
+ * #NAME = Number, Integer, SignedInteger, UnsignedInteger, Inexact,
+ * Floating, ComplexFloating, Flexible, Character#
+ */
+NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(NULL, 0)
+#else
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+#endif
+ "numpy.@name@", /* tp_name*/
+ sizeof(PyObject), /* tp_basicsize*/
+ 0, /* tp_itemsize */
+ /* methods */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ 0, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+/**end repeat**/
+
+static PyObject *
+gentype_alloc(PyTypeObject *type, Py_ssize_t nitems)
+{
+ PyObject *obj;
+ const size_t size = _PyObject_VAR_SIZE(type, nitems + 1);
+
+ obj = (PyObject *)PyObject_Malloc(size);
+ /*
+ * Fixme. Need to check for no memory.
+ * If we don't need to zero memory, we could use
+ * PyObject_{New, NewVar} for this whole function.
+ */
+ memset(obj, 0, size);
+ if (type->tp_itemsize == 0) {
+ PyObject_Init(obj, type);
+ }
+ else {
+ (void) PyObject_InitVar((PyVarObject *)obj, type, nitems);
+ }
+ return obj;
+}
+
+static void
+gentype_dealloc(PyObject *v)
+{
+ _dealloc_cached_buffer_info(v);
+ Py_TYPE(v)->tp_free(v);
+}
+
+static void
+gentype_free(PyObject *v)
+{
+ /*
+     * Have an explicit tp_free to enforce inheritance from it.
+     * PyObject_Free is also the tp_free of PyBaseObject, so Python does not
+     * COPYSLOT it; instead it would take the slot from the next parent
+     * (PyInt), which uses a different allocator.
+ */
+ PyObject_Free(v);
+}
+
+
+static PyObject *
+gentype_power(PyObject *m1, PyObject *m2, PyObject *modulo)
+{
+ if (modulo != Py_None) {
+ /* modular exponentiation is not implemented (gh-8804) */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_power, gentype_power);
+ return PyArray_Type.tp_as_number->nb_power(m1, m2, Py_None);
+}
+
+static PyObject *
+gentype_generic_method(PyObject *self, PyObject *args, PyObject *kwds,
+ char *str)
+{
+ PyObject *arr, *meth, *ret;
+
+ arr = PyArray_FromScalar(self, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+ meth = PyObject_GetAttrString(arr, str);
+ if (meth == NULL) {
+ Py_DECREF(arr);
+ return NULL;
+ }
+ if (kwds == NULL) {
+ ret = PyObject_CallObject(meth, args);
+ }
+ else {
+ ret = PyObject_Call(meth, args, kwds);
+ }
+ Py_DECREF(meth);
+ Py_DECREF(arr);
+ if (ret && PyArray_Check(ret)) {
+ return PyArray_Return((PyArrayObject *)ret);
+ }
+ else {
+ return ret;
+ }
+}
+
+static PyObject *
+gentype_add(PyObject *m1, PyObject* m2)
+{
+ /* special case str.__radd__, which should not call array_add */
+ if (PyString_Check(m1) || PyUnicode_Check(m1)) {
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+ BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_add, gentype_add);
+ return PyArray_Type.tp_as_number->nb_add(m1, m2);
+}
+
+/**begin repeat
+ *
+ * #name = subtract, remainder, divmod, lshift, rshift,
+ * and, xor, or, floor_divide, true_divide#
+ */
+static PyObject *
+gentype_@name@(PyObject *m1, PyObject *m2)
+{
+ BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_@name@, gentype_@name@);
+ return PyArray_Type.tp_as_number->nb_@name@(m1, m2);
+}
+
+/**end repeat**/
+
+#if !defined(NPY_PY3K)
+/**begin repeat
+ *
+ * #name = divide#
+ */
+static PyObject *
+gentype_@name@(PyObject *m1, PyObject *m2)
+{
+ BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_@name@, gentype_@name@);
+ return PyArray_Type.tp_as_number->nb_@name@(m1, m2);
+}
+/**end repeat**/
+#endif
+
+/* Get a nested slot, or NULL if absent */
+#define GET_NESTED_SLOT(type, group, slot) \
+ ((type)->group == NULL ? NULL : (type)->group->slot)
+
+static PyObject *
+gentype_multiply(PyObject *m1, PyObject *m2)
+{
+ /*
+ * If the other object supports sequence repeat and not number multiply
+ * we fall back on the python builtin to invoke the sequence repeat, rather
+ * than promoting both arguments to ndarray.
+ * This covers a list repeat by numpy scalars.
+     * A Python-defined class will always have only the nb_multiply slot, and
+     * some classes may have neither defined. For the latter we need
+ * to give the normal case a chance to convert the object to ndarray.
+ * Probably no class has both defined, but if they do, prefer number.
+ */
+ if (!PyArray_IsScalar(m1, Generic) &&
+ GET_NESTED_SLOT(Py_TYPE(m1), tp_as_sequence, sq_repeat) != NULL &&
+ GET_NESTED_SLOT(Py_TYPE(m1), tp_as_number, nb_multiply) == NULL) {
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+ if (!PyArray_IsScalar(m2, Generic) &&
+ GET_NESTED_SLOT(Py_TYPE(m2), tp_as_sequence, sq_repeat) != NULL &&
+ GET_NESTED_SLOT(Py_TYPE(m2), tp_as_number, nb_multiply) == NULL) {
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+ /* All normal cases are handled by PyArray's multiply */
+ BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_multiply, gentype_multiply);
+ return PyArray_Type.tp_as_number->nb_multiply(m1, m2);
+}
+
+/**begin repeat
+ *
+ * #name = positive, negative, absolute, invert, int, float#
+ */
+static PyObject *
+gentype_@name@(PyObject *m1)
+{
+ PyObject *arr, *ret;
+
+ arr = PyArray_FromScalar(m1, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+ ret = Py_TYPE(arr)->tp_as_number->nb_@name@(arr);
+ Py_DECREF(arr);
+ return ret;
+}
+/**end repeat**/
+
+#if !defined(NPY_PY3K)
+/**begin repeat
+ *
+ * #name = long, oct, hex#
+ */
+static PyObject *
+gentype_@name@(PyObject *m1)
+{
+ PyObject *arr, *ret;
+
+ arr = PyArray_FromScalar(m1, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+ ret = Py_TYPE(arr)->tp_as_number->nb_@name@(arr);
+ Py_DECREF(arr);
+ return ret;
+}
+/**end repeat**/
+#endif
+
+static int
+gentype_nonzero_number(PyObject *m1)
+{
+ PyObject *arr;
+ int ret;
+
+ arr = PyArray_FromScalar(m1, NULL);
+ if (arr == NULL) {
+ return -1;
+ }
+#if defined(NPY_PY3K)
+ ret = Py_TYPE(arr)->tp_as_number->nb_bool(arr);
+#else
+ ret = Py_TYPE(arr)->tp_as_number->nb_nonzero(arr);
+#endif
+ Py_DECREF(arr);
+ return ret;
+}
+
+static PyObject *
+genint_type_str(PyObject *self)
+{
+ PyObject *item, *item_str;
+ item = gentype_generic_method(self, NULL, NULL, "item");
+ if (item == NULL) {
+ return NULL;
+ }
+
+ item_str = PyObject_Str(item);
+ Py_DECREF(item);
+ return item_str;
+}
+
+/*
+ * The __format__ method for PEP 3101.
+ */
+static PyObject *
+gentype_format(PyObject *self, PyObject *args)
+{
+ PyObject *format_spec;
+ PyObject *obj, *ret;
+
+#if defined(NPY_PY3K)
+ if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) {
+ return NULL;
+ }
+#else
+ if (!PyArg_ParseTuple(args, "O:__format__", &format_spec)) {
+ return NULL;
+ }
+
+ if (!PyUnicode_Check(format_spec) && !PyString_Check(format_spec)) {
+ PyErr_SetString(PyExc_TypeError,
+ "format must be a string");
+ return NULL;
+ }
+#endif
+
+ /*
+ * Convert to an appropriate Python type and call its format.
+ * TODO: For some types, like long double, this isn't right,
+ * because it throws away precision.
+ */
+ if (Py_TYPE(self) == &PyBoolArrType_Type) {
+ obj = PyBool_FromLong(((PyBoolScalarObject *)self)->obval);
+ }
+ else if (PyArray_IsScalar(self, Integer)) {
+#if defined(NPY_PY3K)
+ obj = Py_TYPE(self)->tp_as_number->nb_int(self);
+#else
+ obj = Py_TYPE(self)->tp_as_number->nb_long(self);
+#endif
+ }
+ else if (PyArray_IsScalar(self, Floating)) {
+ obj = Py_TYPE(self)->tp_as_number->nb_float(self);
+ }
+ else if (PyArray_IsScalar(self, ComplexFloating)) {
+ double val[2];
+ PyArray_Descr *dtype = PyArray_DescrFromScalar(self);
+
+ if (dtype == NULL) {
+ return NULL;
+ }
+ if (PyArray_CastScalarDirect(self, dtype, &val[0], NPY_CDOUBLE) < 0) {
+ Py_DECREF(dtype);
+ return NULL;
+ }
+ obj = PyComplex_FromDoubles(val[0], val[1]);
+ Py_DECREF(dtype);
+ }
+ else {
+ obj = PyObject_Str(self);
+ }
+
+ if (obj == NULL) {
+ return NULL;
+ }
+
+ ret = PyObject_Format(obj, format_spec);
+ Py_DECREF(obj);
+ return ret;
+}
+
+#ifdef FORCE_NO_LONG_DOUBLE_FORMATTING
+#undef NPY_LONGDOUBLE_FMT
+#define NPY_LONGDOUBLE_FMT NPY_DOUBLE_FMT
+#endif
+
+/**begin repeat
+ * #name = half, float, double, longdouble#
+ * #Name = Half, Float, Double, LongDouble#
+ * #NAME = HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_half, npy_float, npy_double, npy_longdouble#
+ * #suff = h, f, d, l#
+ */
+
+NPY_NO_EXPORT PyObject *
+format_@name@(@type@ val, npy_bool scientific,
+ int precision, int sign, TrimMode trim,
+ int pad_left, int pad_right, int exp_digits)
+{
+ if (scientific) {
+ return Dragon4_Scientific_@Name@(&val,
+ DigitMode_Unique, precision,
+ sign, trim, pad_left, exp_digits);
+ }
+ else {
+ return Dragon4_Positional_@Name@(&val,
+ DigitMode_Unique, CutoffMode_TotalLength, precision,
+ sign, trim, pad_left, pad_right);
+ }
+}
+
+
+/**end repeat**/
+
+/*
+ * Override repr and str of array-scalar strings and unicode: remove
+ * trailing NUL bytes and then call the corresponding functions
+ * of string and unicode.
+ */
+
+/**begin repeat
+ * #name = string*2,unicode*2#
+ * #form = (repr,str)*2#
+ * #Name = String*2,Unicode*2#
+ * #NAME = STRING*2,UNICODE*2#
+ * #extra = AndSize*2,,#
+ * #type = npy_char*2, Py_UNICODE*2#
+ */
+static PyObject *
+@name@type_@form@(PyObject *self)
+{
+ const @type@ *dptr, *ip;
+ int len;
+ PyObject *new;
+ PyObject *ret;
+
+ ip = dptr = Py@Name@_AS_@NAME@(self);
+ len = Py@Name@_GET_SIZE(self);
+ dptr += len-1;
+ while(len > 0 && *dptr-- == 0) {
+ len--;
+ }
+ new = Py@Name@_From@Name@@extra@(ip, len);
+ if (new == NULL) {
+ return PyUString_FromString("");
+ }
+ ret = Py@Name@_Type.tp_@form@(new);
+ Py_DECREF(new);
+ return ret;
+}
+/**end repeat**/
+
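+/*
+ * Sketch of the effect of the NUL stripping above (illustrative; the exact
+ * quoting differs between Python 2 and Python 3):
+ *
+ *   >>> import numpy as np
+ *   >>> repr(np.bytes_(b'ab\x00'))   # trailing NUL bytes do not appear
+ *   "b'ab'"
+ */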
+
+/*
+ * Convert an array of bytes to a string representation much like bytes.__repr__,
+ * but convert all bytes (including ASCII) to the `\x00` notation with
+ * uppercase hex codes (FF not ff).
+ *
+ * Largely copied from _Py_strhex_impl in the CPython implementation.
+ */
+static NPY_INLINE PyObject *
+_void_to_hex(const char* argbuf, const Py_ssize_t arglen,
+ const char *schars, const char *bprefix, const char *echars)
+{
+ PyObject *retval;
+ int extrachars, slen;
+ char *retbuf;
+ Py_ssize_t i, j;
+ char const *hexdigits = "0123456789ABCDEF";
+
+ extrachars = strlen(schars) + strlen(echars);
+ slen = extrachars + arglen*(2 + strlen(bprefix));
+
+ if (arglen > (PY_SSIZE_T_MAX / 2) - extrachars) {
+ return PyErr_NoMemory();
+ }
+
+ retbuf = (char *)PyMem_Malloc(slen);
+ if (!retbuf) {
+ return PyErr_NoMemory();
+ }
+
+ memcpy(retbuf, schars, strlen(schars));
+ j = strlen(schars);
+
+ for (i = 0; i < arglen; i++) {
+ unsigned char c;
+ memcpy(&retbuf[j], bprefix, strlen(bprefix));
+ j += strlen(bprefix);
+ c = (argbuf[i] >> 4) & 0xf;
+ retbuf[j++] = hexdigits[c];
+ c = argbuf[i] & 0xf;
+ retbuf[j++] = hexdigits[c];
+ }
+ memcpy(&retbuf[j], echars, strlen(echars));
+
+ retval = PyUString_FromStringAndSize(retbuf, slen);
+ PyMem_Free(retbuf);
+
+ return retval;
+}
+
+static PyObject *
+_void_scalar_repr(PyObject *obj) {
+ static PyObject *reprfunc = NULL;
+ npy_cache_import("numpy.core.arrayprint",
+ "_void_scalar_repr", &reprfunc);
+ if (reprfunc == NULL) {
+ return NULL;
+ }
+ return PyObject_CallFunction(reprfunc, "O", obj);
+}
+
+static PyObject *
+voidtype_repr(PyObject *self)
+{
+ PyVoidScalarObject *s = (PyVoidScalarObject*) self;
+ if (PyDataType_HASFIELDS(s->descr)) {
+ return _void_scalar_repr(self);
+ }
+ return _void_to_hex(s->obval, s->descr->elsize, "void(b'", "\\x", "')");
+}
+
+static PyObject *
+voidtype_str(PyObject *self)
+{
+ PyVoidScalarObject *s = (PyVoidScalarObject*) self;
+ if (PyDataType_HASFIELDS(s->descr)) {
+ return _void_scalar_repr(self);
+ }
+ return _void_to_hex(s->obval, s->descr->elsize, "b'", "\\x", "'");
+}
+
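+/*
+ * Illustrative repr/str of an unstructured void scalar produced by the two
+ * functions above (assuming np.void accepts a bytes argument, as it does for
+ * plain 'V' data; hex digits are uppercase per _void_to_hex):
+ *
+ *   >>> import numpy as np
+ *   >>> repr(np.void(b'\x0f\xab'))
+ *   "void(b'\\x0F\\xAB')"
+ *   >>> str(np.void(b'\x0f\xab'))
+ *   "b'\\x0F\\xAB'"
+ */
+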
+static PyObject *
+datetimetype_repr(PyObject *self)
+{
+ PyDatetimeScalarObject *scal;
+ npy_datetimestruct dts;
+ PyObject *ret;
+ char iso[NPY_DATETIME_MAX_ISO8601_STRLEN];
+ NPY_DATETIMEUNIT unit;
+
+ if (!PyArray_IsScalar(self, Datetime)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Called NumPy datetime repr on a non-datetime type");
+ return NULL;
+ }
+
+ scal = (PyDatetimeScalarObject *)self;
+
+ if (convert_datetime_to_datetimestruct(&scal->obmeta,
+ scal->obval, &dts) < 0) {
+ return NULL;
+ }
+
+ unit = scal->obmeta.base;
+ if (make_iso_8601_datetime(&dts, iso, sizeof(iso), 0, 0,
+ unit, -1, NPY_SAFE_CASTING) < 0) {
+ return NULL;
+ }
+
+ /*
+ * For straight units or generic units, the unit will be deduced
+ * from the string, so it's not necessary to specify it.
+ */
+ if ((scal->obmeta.num == 1 && scal->obmeta.base != NPY_FR_h) ||
+ scal->obmeta.base == NPY_FR_GENERIC) {
+ ret = PyUString_FromString("numpy.datetime64('");
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString(iso));
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString("')"));
+ }
+ else {
+ ret = PyUString_FromString("numpy.datetime64('");
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString(iso));
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString("','"));
+ ret = append_metastr_to_string(&scal->obmeta, 1, ret);
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString("')"));
+ }
+
+ return ret;
+}
+
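+/*
+ * Roughly, the repr above omits the unit when it can be deduced from the
+ * ISO string, and appends it otherwise (illustrative outputs):
+ *
+ *   >>> import numpy as np
+ *   >>> repr(np.datetime64('2011-03-15'))           # unit 'D' deduced
+ *   "numpy.datetime64('2011-03-15')"
+ *   >>> repr(np.datetime64('2011-03-15T00', 'h'))   # hour unit is spelled out
+ *   "numpy.datetime64('2011-03-15T00','h')"
+ */
+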
+static PyObject *
+timedeltatype_repr(PyObject *self)
+{
+ PyTimedeltaScalarObject *scal;
+ PyObject *ret;
+
+ if (!PyArray_IsScalar(self, Timedelta)) {
+ PyErr_SetString(PyExc_RuntimeError,
+                "Called NumPy timedelta repr on a non-timedelta type");
+ return NULL;
+ }
+
+ scal = (PyTimedeltaScalarObject *)self;
+
+ /* The value */
+ if (scal->obval == NPY_DATETIME_NAT) {
+ ret = PyUString_FromString("numpy.timedelta64('NaT'");
+ }
+ else {
+ /*
+ * Can't use "%lld" if HAVE_LONG_LONG is not defined
+ */
+#if defined(HAVE_LONG_LONG)
+ ret = PyUString_FromFormat("numpy.timedelta64(%lld",
+ (long long)scal->obval);
+#else
+ ret = PyUString_FromFormat("numpy.timedelta64(%ld",
+ (long)scal->obval);
+#endif
+ }
+ /* The metadata unit */
+ if (scal->obmeta.base == NPY_FR_GENERIC) {
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString(")"));
+ }
+ else {
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString(",'"));
+ ret = append_metastr_to_string(&scal->obmeta, 1, ret);
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString("')"));
+ }
+
+ return ret;
+}
+
+static PyObject *
+datetimetype_str(PyObject *self)
+{
+ PyDatetimeScalarObject *scal;
+ npy_datetimestruct dts;
+ char iso[NPY_DATETIME_MAX_ISO8601_STRLEN];
+ NPY_DATETIMEUNIT unit;
+
+ if (!PyArray_IsScalar(self, Datetime)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Called NumPy datetime str on a non-datetime type");
+ return NULL;
+ }
+
+ scal = (PyDatetimeScalarObject *)self;
+
+ if (convert_datetime_to_datetimestruct(&scal->obmeta, scal->obval,
+ &dts) < 0) {
+ return NULL;
+ }
+
+ unit = scal->obmeta.base;
+ if (make_iso_8601_datetime(&dts, iso, sizeof(iso), 0, 0,
+ unit, -1, NPY_SAFE_CASTING) < 0) {
+ return NULL;
+ }
+
+ return PyUString_FromString(iso);
+}
+
+static char *_datetime_verbose_strings[NPY_DATETIME_NUMUNITS] = {
+ "years",
+ "months",
+ "weeks",
+ "<invalid>",
+ "days",
+ "hours",
+ "minutes",
+ "seconds",
+ "milliseconds",
+ "microseconds",
+ "nanoseconds",
+ "picoseconds",
+ "femtoseconds",
+ "attoseconds",
+ "generic time units"
+};
+
+static PyObject *
+timedeltatype_str(PyObject *self)
+{
+ PyTimedeltaScalarObject *scal;
+ PyObject *ret;
+ char *basestr = "invalid";
+
+ if (!PyArray_IsScalar(self, Timedelta)) {
+ PyErr_SetString(PyExc_RuntimeError,
+                "Called NumPy timedelta str on a non-timedelta type");
+ return NULL;
+ }
+
+ scal = (PyTimedeltaScalarObject *)self;
+
+ if (scal->obmeta.base >= 0 && scal->obmeta.base < NPY_DATETIME_NUMUNITS) {
+ basestr = _datetime_verbose_strings[scal->obmeta.base];
+ }
+ else {
+ PyErr_SetString(PyExc_RuntimeError,
+ "NumPy datetime metadata is corrupted");
+ return NULL;
+ }
+
+ if (scal->obval == NPY_DATETIME_NAT) {
+ ret = PyUString_FromString("NaT");
+ }
+ else {
+ /*
+ * Can't use "%lld" if HAVE_LONG_LONG is not defined
+ */
+#if defined(HAVE_LONG_LONG)
+ ret = PyUString_FromFormat("%lld ",
+ (long long)(scal->obval * scal->obmeta.num));
+#else
+ ret = PyUString_FromFormat("%ld ",
+ (long)(scal->obval * scal->obmeta.num));
+#endif
+ PyUString_ConcatAndDel(&ret,
+ PyUString_FromString(basestr));
+ }
+
+ return ret;
+}
+
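+/*
+ * Illustrative timedelta64 formatting with timedeltatype_repr and
+ * timedeltatype_str above (repr keeps the short unit code, str uses the
+ * verbose unit names):
+ *
+ *   >>> import numpy as np
+ *   >>> repr(np.timedelta64(5, 'D'))
+ *   "numpy.timedelta64(5,'D')"
+ *   >>> str(np.timedelta64(5, 'ms'))
+ *   '5 milliseconds'
+ *   >>> str(np.timedelta64('NaT'))
+ *   'NaT'
+ */
+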
+/*
+ * float type str and repr
+ *
+ * These functions will return NULL if PyString creation fails.
+ */
+
+
+/*
+ * *** BEGIN LEGACY PRINTING MODE CODE ***
+ *
+ * This code is legacy code needed to reproduce the printing behavior of
+ * scalars in numpy 1.13. One day we hope to remove it.
+ */
+
+/* determines whether legacy printing mode is enabled; the global is set in multiarraymodule.c */
+extern int npy_legacy_print_mode;
+
+#define HALFPREC_REPR 5
+#define HALFPREC_STR 5
+#define FLOATPREC_REPR 8
+#define FLOATPREC_STR 6
+#define DOUBLEPREC_REPR 17
+#define DOUBLEPREC_STR 12
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+#define LONGDOUBLEPREC_REPR DOUBLEPREC_REPR
+#define LONGDOUBLEPREC_STR DOUBLEPREC_STR
+#else /* More than probably needed on Intel FP */
+#define LONGDOUBLEPREC_REPR 20
+#define LONGDOUBLEPREC_STR 12
+#endif
+
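+/*
+ * With these precisions, legacy (numpy 1.13) printing looks roughly like
+ * the following (illustrative; enabled via np.set_printoptions):
+ *
+ *   >>> import numpy as np
+ *   >>> np.set_printoptions(legacy='1.13')
+ *   >>> repr(np.float64(0.1))    # DOUBLEPREC_REPR = 17 significant digits
+ *   '0.10000000000000001'
+ *   >>> str(np.float64(0.1))     # DOUBLEPREC_STR = 12 significant digits
+ *   '0.1'
+ */
+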
+/**begin repeat
+ * #kind = str, repr#
+ * #KIND = STR, REPR#
+ */
+
+/**begin repeat1
+ * #name = cfloat, cdouble, clongdouble#
+ * #NAME = FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #suff = f, d, l#
+ */
+
+#define _FMT1 "%%.%i" NPY_@NAME@_FMT
+#define _FMT2 "%%+.%i" NPY_@NAME@_FMT
+
+static PyObject*
+legacy_@name@_format@kind@(@type@ val)
+{
+ /* XXX: Find a correct size here for format string */
+ char format[64], buf[100], *res;
+
+ /*
+     * Ideally, we should handle this nan/inf stuff in NumPyOS_ascii_format*
+ */
+ if (val.real == 0.0 && npy_signbit(val.real) == 0) {
+ PyOS_snprintf(format, sizeof(format), _FMT1, @NAME@PREC_@KIND@);
+ res = NumPyOS_ascii_format@suff@(buf, sizeof(buf) - 1, format, val.imag, 0);
+ if (res == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+ return NULL;
+ }
+ if (!npy_isfinite(val.imag)) {
+ strncat(buf, "*", 1);
+ }
+ strncat(buf, "j", 1);
+ }
+ else {
+ char re[64], im[64];
+ if (npy_isfinite(val.real)) {
+ PyOS_snprintf(format, sizeof(format), _FMT1, @NAME@PREC_@KIND@);
+ res = NumPyOS_ascii_format@suff@(re, sizeof(re), format,
+ val.real, 0);
+ if (res == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+ return NULL;
+ }
+ }
+ else {
+ if (npy_isnan(val.real)) {
+ strcpy(re, "nan");
+ }
+ else if (val.real > 0){
+ strcpy(re, "inf");
+ }
+ else {
+ strcpy(re, "-inf");
+ }
+ }
+
+
+ if (npy_isfinite(val.imag)) {
+ PyOS_snprintf(format, sizeof(format), _FMT2, @NAME@PREC_@KIND@);
+ res = NumPyOS_ascii_format@suff@(im, sizeof(im), format,
+ val.imag, 0);
+ if (res == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+ return NULL;
+ }
+ }
+ else {
+ if (npy_isnan(val.imag)) {
+ strcpy(im, "+nan");
+ }
+ else if (val.imag > 0){
+ strcpy(im, "+inf");
+ }
+ else {
+ strcpy(im, "-inf");
+ }
+ if (!npy_isfinite(val.imag)) {
+ strncat(im, "*", 1);
+ }
+ }
+ PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+ }
+
+ return PyUString_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #name = float, double, longdouble#
+ * #Name = Float, Double, LongDouble#
+ * #NAME = FLOAT, DOUBLE, LONGDOUBLE#
+ * #suff = f, d, l#
+ */
+
+#define _FMT1 "%%.%i" NPY_@NAME@_FMT
+
+static PyObject *
+legacy_@name@_format@kind@(npy_@name@ val){
+ /* XXX: Find a correct size here for format string */
+ char format[64], buf[100], *res;
+ size_t i, cnt;
+
+ PyOS_snprintf(format, sizeof(format), _FMT1, @NAME@PREC_@KIND@);
+ res = NumPyOS_ascii_format@suff@(buf, sizeof(buf), format, val, 0);
+ if (res == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+ return NULL;
+ }
+
+ /* If nothing but digits after sign, append ".0" */
+ cnt = strlen(buf);
+ for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+ if (!isdigit(Py_CHARMASK(buf[i]))) {
+ break;
+ }
+ }
+ if (i == cnt && sizeof(buf) >= cnt + 3) {
+ strcpy(&buf[cnt],".0");
+ }
+
+ return PyUString_FromString(buf);
+}
+
+#undef _FMT1
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/*
+ * *** END LEGACY PRINTING MODE CODE ***
+ */
+
+
+/**begin repeat
+ * #kind = str, repr#
+ */
+
+/**begin repeat1
+ * #name = float, double, longdouble#
+ * #Name = Float, Double, LongDouble#
+ * #NAME = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+
+/* helper function to choose scientific or positional (fractional) output, based on a cutoff */
+static PyObject *
+@name@type_@kind@_either(npy_@name@ val, TrimMode trim_pos, TrimMode trim_sci,
+ npy_bool sign)
+{
+ npy_@name@ absval;
+
+ if (npy_legacy_print_mode == 113) {
+ return legacy_@name@_format@kind@(val);
+ }
+
+ absval = val < 0 ? -val : val;
+
+ if (absval == 0 || (absval < 1.e16L && absval >= 1.e-4L) ) {
+ return format_@name@(val, 0, -1, sign, trim_pos, -1, -1, -1);
+ }
+ return format_@name@(val, 1, -1, sign, trim_sci, -1, -1, -1);
+}
+
+static PyObject *
+@name@type_@kind@(PyObject *self)
+{
+ return @name@type_@kind@_either(((Py@Name@ScalarObject *)self)->obval,
+ TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+c@name@type_@kind@(PyObject *self)
+{
+ PyObject *rstr, *istr, *ret;
+ npy_c@name@ val = ((PyC@Name@ScalarObject *)self)->obval;
+ TrimMode trim = TrimMode_DptZeros;
+
+ if (npy_legacy_print_mode == 113) {
+ return legacy_c@name@_format@kind@(val);
+ }
+
+ if (val.real == 0.0 && npy_signbit(val.real) == 0) {
+ istr = @name@type_@kind@_either(val.imag, trim, trim, 0);
+ if (istr == NULL) {
+ return NULL;
+ }
+
+ PyUString_ConcatAndDel(&istr, PyUString_FromString("j"));
+ return istr;
+ }
+
+ if (npy_isfinite(val.real)) {
+ rstr = @name@type_@kind@_either(val.real, trim, trim, 0);
+ if (rstr == NULL) {
+ return NULL;
+ }
+ }
+ else if (npy_isnan(val.real)) {
+ rstr = PyUString_FromString("nan");
+ }
+ else if (val.real > 0){
+ rstr = PyUString_FromString("inf");
+ }
+ else {
+ rstr = PyUString_FromString("-inf");
+ }
+
+ if (npy_isfinite(val.imag)) {
+ istr = @name@type_@kind@_either(val.imag, trim, trim, 1);
+ if (istr == NULL) {
+ return NULL;
+ }
+ }
+ else if (npy_isnan(val.imag)) {
+ istr = PyUString_FromString("+nan");
+ }
+ else if (val.imag > 0){
+ istr = PyUString_FromString("+inf");
+ }
+ else {
+ istr = PyUString_FromString("-inf");
+ }
+
+ ret = PyUString_FromString("(");
+ PyUString_ConcatAndDel(&ret, rstr);
+ PyUString_ConcatAndDel(&ret, istr);
+ PyUString_ConcatAndDel(&ret, PyUString_FromString("j)"));
+ return ret;
+}
+
+#undef PREC
+
+/**end repeat1**/
+
+
+static PyObject *
+halftype_@kind@(PyObject *self)
+{
+ npy_half val = ((PyHalfScalarObject *)self)->obval;
+ float floatval = npy_half_to_float(val);
+ float absval;
+
+ if (npy_legacy_print_mode == 113) {
+ return legacy_float_format@kind@(floatval);
+ }
+
+ absval = floatval < 0 ? -floatval : floatval;
+
+ if (absval == 0 || (absval < 1.e16 && absval >= 1.e-4) ) {
+ return format_half(val, 0, -1, 0, TrimMode_LeaveOneZero, -1, -1, -1);
+ }
+ return format_half(val, 1, -1, 0, TrimMode_DptZeros, -1, -1, -1);
+}
+
+
+/**end repeat**/
+
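+/*
+ * The floating-point str/repr helpers generated above switch between
+ * positional and scientific notation at the 1e-4 / 1e16 cutoffs, e.g.
+ * (illustrative):
+ *
+ *   >>> import numpy as np
+ *   >>> repr(np.float64(1234.5))
+ *   '1234.5'
+ *   >>> repr(np.float64(1e16))
+ *   '1e+16'
+ *   >>> repr(np.float64(1e-5))
+ *   '1e-05'
+ */
+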
+/**begin repeat
+ * #char = ,c#
+ * #CHAR = ,C#
+ * #POST = ,.real#
+ */
+static PyObject *
+@char@longdoubletype_float(PyObject *self)
+{
+ npy_longdouble val = PyArrayScalar_VAL(self, @CHAR@LongDouble)@POST@;
+ return PyFloat_FromDouble((double) val);
+}
+
+static PyObject *
+@char@longdoubletype_long(PyObject *self)
+{
+ npy_longdouble val = PyArrayScalar_VAL(self, @CHAR@LongDouble)@POST@;
+ return npy_longdouble_to_PyLong(val);
+}
+
+#if !defined(NPY_PY3K)
+
+/**begin repeat1
+ * #name = int, hex, oct#
+ */
+static PyObject *
+@char@longdoubletype_@name@(PyObject *self)
+{
+ PyObject *ret;
+ PyObject *obj = @char@longdoubletype_long(self);
+ if (obj == NULL) {
+ return NULL;
+ }
+ ret = Py_TYPE(obj)->tp_as_number->nb_@name@(obj);
+ Py_DECREF(obj);
+ return ret;
+}
+/**end repeat1**/
+
+#endif /* !defined(NPY_PY3K) */
+
+/**end repeat**/
+
+static PyNumberMethods gentype_as_number = {
+ (binaryfunc)gentype_add, /*nb_add*/
+ (binaryfunc)gentype_subtract, /*nb_subtract*/
+ (binaryfunc)gentype_multiply, /*nb_multiply*/
+#if !defined(NPY_PY3K)
+ (binaryfunc)gentype_divide, /*nb_divide*/
+#endif
+ (binaryfunc)gentype_remainder, /*nb_remainder*/
+ (binaryfunc)gentype_divmod, /*nb_divmod*/
+ (ternaryfunc)gentype_power, /*nb_power*/
+    (unaryfunc)gentype_negative,             /*nb_negative*/
+ (unaryfunc)gentype_positive, /*nb_pos*/
+ (unaryfunc)gentype_absolute, /*(unaryfunc)gentype_abs,*/
+ (inquiry)gentype_nonzero_number, /*nb_nonzero*/
+ (unaryfunc)gentype_invert, /*nb_invert*/
+ (binaryfunc)gentype_lshift, /*nb_lshift*/
+ (binaryfunc)gentype_rshift, /*nb_rshift*/
+ (binaryfunc)gentype_and, /*nb_and*/
+ (binaryfunc)gentype_xor, /*nb_xor*/
+ (binaryfunc)gentype_or, /*nb_or*/
+#if !defined(NPY_PY3K)
+ 0, /*nb_coerce*/
+#endif
+ (unaryfunc)gentype_int, /*nb_int*/
+#if defined(NPY_PY3K)
+ 0, /*nb_reserved*/
+#else
+ (unaryfunc)gentype_long, /*nb_long*/
+#endif
+ (unaryfunc)gentype_float, /*nb_float*/
+#if !defined(NPY_PY3K)
+ (unaryfunc)gentype_oct, /*nb_oct*/
+ (unaryfunc)gentype_hex, /*nb_hex*/
+#endif
+ 0, /*inplace_add*/
+ 0, /*inplace_subtract*/
+ 0, /*inplace_multiply*/
+#if !defined(NPY_PY3K)
+ 0, /*inplace_divide*/
+#endif
+ 0, /*inplace_remainder*/
+ 0, /*inplace_power*/
+ 0, /*inplace_lshift*/
+ 0, /*inplace_rshift*/
+ 0, /*inplace_and*/
+ 0, /*inplace_xor*/
+ 0, /*inplace_or*/
+ (binaryfunc)gentype_floor_divide, /*nb_floor_divide*/
+ (binaryfunc)gentype_true_divide, /*nb_true_divide*/
+ 0, /*nb_inplace_floor_divide*/
+ 0, /*nb_inplace_true_divide*/
+ (unaryfunc)NULL, /*nb_index*/
+#if PY_VERSION_HEX >= 0x03050000
+    0,                                       /*nb_matrix_multiply*/
+    0,                                       /*nb_inplace_matrix_multiply*/
+#endif
+};
+
+
+static PyObject *
+gentype_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+ PyObject *arr, *ret;
+
+ /*
+     * If the other object is None, False is always right. This avoids
+     * the array-None comparison, at least until that deprecation is
+     * resolved. After that, this may be removed and numpy False would
+     * be returned.
+     *
+     * NOTE: np.equal(NaT, None) evaluates to TRUE! This is an
+     *       inconsistency, which may have to be considered
+     *       when the deprecation is finished.
+ */
+ if (other == Py_None) {
+ if (cmp_op == Py_EQ) {
+ Py_RETURN_FALSE;
+ }
+ if (cmp_op == Py_NE) {
+ Py_RETURN_TRUE;
+ }
+ }
+
+ arr = PyArray_FromScalar(self, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+ /*
+ * Call via PyObject_RichCompare to ensure that other.__eq__
+ * has a chance to run when necessary
+ */
+ ret = PyObject_RichCompare(arr, other, cmp_op);
+ Py_DECREF(arr);
+ return ret;
+}
+
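+/*
+ * As a consequence of the special case above, comparing any array scalar
+ * with None short-circuits (illustrative):
+ *
+ *   >>> import numpy as np
+ *   >>> np.float64(1.0) == None
+ *   False
+ *   >>> np.float64(1.0) != None
+ *   True
+ */
+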
+static PyObject *
+gentype_ndim_get(PyObject *NPY_UNUSED(self))
+{
+ return PyInt_FromLong(0);
+}
+
+static PyObject *
+gentype_flags_get(PyObject *NPY_UNUSED(self))
+{
+ return PyArray_NewFlagsObject(NULL);
+}
+
+static PyObject *
+voidtype_flags_get(PyVoidScalarObject *self)
+{
+ PyObject *flagobj;
+ flagobj = PyArrayFlags_Type.tp_alloc(&PyArrayFlags_Type, 0);
+ if (flagobj == NULL) {
+ return NULL;
+ }
+ ((PyArrayFlagsObject *)flagobj)->arr = NULL;
+ ((PyArrayFlagsObject *)flagobj)->flags = self->flags;
+ return flagobj;
+}
+
+static PyObject *
+voidtype_dtypedescr_get(PyVoidScalarObject *self)
+{
+ Py_INCREF(self->descr);
+ return (PyObject *)self->descr;
+}
+
+
+static PyObject *
+inttype_numerator_get(PyObject *self)
+{
+ Py_INCREF(self);
+ return self;
+}
+
+
+static PyObject *
+inttype_denominator_get(PyObject *self)
+{
+ return PyInt_FromLong(1);
+}
+
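+/*
+ * These two getters make integer scalars behave like Python's
+ * numbers.Rational for the numerator/denominator protocol (illustrative):
+ *
+ *   >>> import numpy as np
+ *   >>> np.int64(5).numerator, np.int64(5).denominator
+ *   (5, 1)
+ */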
+
+static PyObject *
+gentype_data_get(PyObject *self)
+{
+#if defined(NPY_PY3K)
+ return PyMemoryView_FromObject(self);
+#else
+ return PyBuffer_FromObject(self, 0, Py_END_OF_BUFFER);
+#endif
+}
+
+
+static PyObject *
+gentype_itemsize_get(PyObject *self)
+{
+ PyArray_Descr *typecode;
+ PyObject *ret;
+ int elsize;
+
+ typecode = PyArray_DescrFromScalar(self);
+ elsize = typecode->elsize;
+#ifndef Py_UNICODE_WIDE
+ if (typecode->type_num == NPY_UNICODE) {
+ elsize >>= 1;
+ }
+#endif
+ ret = PyInt_FromLong((long) elsize);
+ Py_DECREF(typecode);
+ return ret;
+}
+
+static PyObject *
+gentype_size_get(PyObject *NPY_UNUSED(self))
+{
+ return PyInt_FromLong(1);
+}
+
+static PyObject *
+gentype_sizeof(PyObject *self)
+{
+ Py_ssize_t nbytes;
+ PyObject * isz = gentype_itemsize_get(self);
+ if (isz == NULL) {
+ return NULL;
+ }
+ nbytes = PyLong_AsLong(isz) + Py_TYPE(self)->tp_basicsize +
+ Py_SIZE(self) * Py_TYPE(self)->tp_itemsize;
+ Py_DECREF(isz);
+ return PyLong_FromSsize_t(nbytes);
+}
+
+#if PY_VERSION_HEX >= 0x03000000
+NPY_NO_EXPORT void
+gentype_struct_free(PyObject *ptr)
+{
+ PyArrayInterface *arrif;
+ PyObject *context;
+
+ arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL);
+ context = (PyObject *)PyCapsule_GetContext(ptr);
+ Py_DECREF(context);
+ Py_XDECREF(arrif->descr);
+ PyArray_free(arrif->shape);
+ PyArray_free(arrif);
+}
+#else
+NPY_NO_EXPORT void
+gentype_struct_free(void *ptr, void *arg)
+{
+ PyArrayInterface *arrif = (PyArrayInterface *)ptr;
+ Py_DECREF((PyObject *)arg);
+ Py_XDECREF(arrif->descr);
+ PyArray_free(arrif->shape);
+ PyArray_free(arrif);
+}
+#endif
+
+static PyObject *
+gentype_struct_get(PyObject *self)
+{
+ PyArrayObject *arr;
+ PyArrayInterface *inter;
+ PyObject *ret;
+
+ arr = (PyArrayObject *)PyArray_FromScalar(self, NULL);
+ inter = (PyArrayInterface *)PyArray_malloc(sizeof(PyArrayInterface));
+ inter->two = 2;
+ inter->nd = 0;
+ inter->flags = PyArray_FLAGS(arr);
+ inter->flags &= ~(NPY_ARRAY_UPDATEIFCOPY | NPY_ARRAY_WRITEBACKIFCOPY |
+ NPY_ARRAY_OWNDATA);
+ inter->flags |= NPY_ARRAY_NOTSWAPPED;
+ inter->typekind = PyArray_DESCR(arr)->kind;
+ inter->itemsize = PyArray_DESCR(arr)->elsize;
+ inter->strides = NULL;
+ inter->shape = NULL;
+ inter->data = PyArray_DATA(arr);
+ inter->descr = NULL;
+
+ ret = NpyCapsule_FromVoidPtrAndDesc(inter, arr, gentype_struct_free);
+ return ret;
+}
+
+static PyObject *
+gentype_priority_get(PyObject *NPY_UNUSED(self))
+{
+ return PyFloat_FromDouble(NPY_SCALAR_PRIORITY);
+}
+
+static PyObject *
+gentype_shape_get(PyObject *NPY_UNUSED(self))
+{
+ return PyTuple_New(0);
+}
+
+
+static PyObject *
+gentype_interface_get(PyObject *self)
+{
+ PyArrayObject *arr;
+ PyObject *inter;
+
+ arr = (PyArrayObject *)PyArray_FromScalar(self, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+ inter = PyObject_GetAttrString((PyObject *)arr, "__array_interface__");
+ if (inter != NULL) {
+ PyDict_SetItemString(inter, "__ref", (PyObject *)arr);
+ }
+ Py_DECREF(arr);
+ return inter;
+}
+
+
+
+static PyObject *
+gentype_typedescr_get(PyObject *self)
+{
+ return (PyObject *)PyArray_DescrFromScalar(self);
+}
+
+
+static PyObject *
+gentype_base_get(PyObject *NPY_UNUSED(self))
+{
+ Py_RETURN_NONE;
+}
+
+static PyObject *
+voidtype_base_get(PyVoidScalarObject *self)
+{
+ if (self->base == NULL) {
+ Py_RETURN_NONE;
+ }
+ else {
+ Py_INCREF(self->base);
+ return self->base;
+ }
+}
+
+
+static PyArray_Descr *
+_realdescr_fromcomplexscalar(PyObject *self, int *typenum)
+{
+ if (PyArray_IsScalar(self, CDouble)) {
+ *typenum = NPY_CDOUBLE;
+ return PyArray_DescrFromType(NPY_DOUBLE);
+ }
+ if (PyArray_IsScalar(self, CFloat)) {
+ *typenum = NPY_CFLOAT;
+ return PyArray_DescrFromType(NPY_FLOAT);
+ }
+ if (PyArray_IsScalar(self, CLongDouble)) {
+ *typenum = NPY_CLONGDOUBLE;
+ return PyArray_DescrFromType(NPY_LONGDOUBLE);
+ }
+ return NULL;
+}
+
+static PyObject *
+gentype_real_get(PyObject *self)
+{
+ PyArray_Descr *typecode;
+ PyObject *ret;
+ int typenum;
+
+ if (PyArray_IsScalar(self, ComplexFloating)) {
+ void *ptr;
+ typecode = _realdescr_fromcomplexscalar(self, &typenum);
+ ptr = scalar_value(self, NULL);
+ ret = PyArray_Scalar(ptr, typecode, NULL);
+ Py_DECREF(typecode);
+ return ret;
+ }
+ else if (PyArray_IsScalar(self, Object)) {
+ PyObject *obj = ((PyObjectScalarObject *)self)->obval;
+ ret = PyObject_GetAttrString(obj, "real");
+ if (ret != NULL) {
+ return ret;
+ }
+ PyErr_Clear();
+ }
+ Py_INCREF(self);
+ return (PyObject *)self;
+}
+
+static PyObject *
+gentype_imag_get(PyObject *self)
+{
+ PyArray_Descr *typecode=NULL;
+ PyObject *ret;
+ int typenum;
+
+ if (PyArray_IsScalar(self, ComplexFloating)) {
+ char *ptr;
+ typecode = _realdescr_fromcomplexscalar(self, &typenum);
+ ptr = (char *)scalar_value(self, NULL);
+ ret = PyArray_Scalar(ptr + typecode->elsize, typecode, NULL);
+ }
+ else if (PyArray_IsScalar(self, Object)) {
+ PyObject *obj = ((PyObjectScalarObject *)self)->obval;
+ PyArray_Descr *newtype;
+ ret = PyObject_GetAttrString(obj, "imag");
+ if (ret == NULL) {
+ PyErr_Clear();
+ obj = PyInt_FromLong(0);
+ newtype = PyArray_DescrFromType(NPY_OBJECT);
+ ret = PyArray_Scalar((char *)&obj, newtype, NULL);
+ Py_DECREF(newtype);
+ Py_DECREF(obj);
+ }
+ }
+ else {
+ char *temp;
+ int elsize;
+ typecode = PyArray_DescrFromScalar(self);
+ elsize = typecode->elsize;
+ temp = npy_alloc_cache_zero(elsize);
+ ret = PyArray_Scalar(temp, typecode, NULL);
+ npy_free_cache(temp, elsize);
+ }
+
+ Py_XDECREF(typecode);
+ return ret;
+}
+
+static PyObject *
+gentype_flat_get(PyObject *self)
+{
+ PyObject *ret, *arr;
+
+ arr = PyArray_FromScalar(self, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+ ret = PyArray_IterNew(arr);
+ Py_DECREF(arr);
+ return ret;
+}
+
+
+static PyObject *
+gentype_transpose_get(PyObject *self)
+{
+ Py_INCREF(self);
+ return self;
+}
+
+
+static PyGetSetDef gentype_getsets[] = {
+ {"ndim",
+ (getter)gentype_ndim_get,
+ (setter) 0,
+ "number of array dimensions",
+ NULL},
+ {"flags",
+ (getter)gentype_flags_get,
+ (setter)0,
+ "integer value of flags",
+ NULL},
+ {"shape",
+ (getter)gentype_shape_get,
+ (setter)0,
+ "tuple of array dimensions",
+ NULL},
+ {"strides",
+ (getter)gentype_shape_get,
+ (setter) 0,
+ "tuple of bytes steps in each dimension",
+ NULL},
+ {"data",
+ (getter)gentype_data_get,
+ (setter) 0,
+ "pointer to start of data",
+ NULL},
+ {"itemsize",
+ (getter)gentype_itemsize_get,
+ (setter)0,
+ "length of one element in bytes",
+ NULL},
+ {"size",
+ (getter)gentype_size_get,
+ (setter)0,
+ "number of elements in the gentype",
+ NULL},
+ {"nbytes",
+ (getter)gentype_itemsize_get,
+ (setter)0,
+ "length of item in bytes",
+ NULL},
+ {"base",
+ (getter)gentype_base_get,
+ (setter)0,
+ "base object",
+ NULL},
+ {"dtype",
+ (getter)gentype_typedescr_get,
+ NULL,
+ "get array data-descriptor",
+ NULL},
+ {"real",
+ (getter)gentype_real_get,
+ (setter)0,
+ "real part of scalar",
+ NULL},
+ {"imag",
+ (getter)gentype_imag_get,
+ (setter)0,
+ "imaginary part of scalar",
+ NULL},
+ {"flat",
+ (getter)gentype_flat_get,
+ (setter)0,
+ "a 1-d view of scalar",
+ NULL},
+ {"T",
+ (getter)gentype_transpose_get,
+ (setter)0,
+ "transpose",
+ NULL},
+ {"__array_interface__",
+ (getter)gentype_interface_get,
+ NULL,
+ "Array protocol: Python side",
+ NULL},
+ {"__array_struct__",
+ (getter)gentype_struct_get,
+ NULL,
+ "Array protocol: struct",
+ NULL},
+ {"__array_priority__",
+ (getter)gentype_priority_get,
+ NULL,
+ "Array priority.",
+ NULL},
+ {NULL, NULL, NULL, NULL, NULL} /* Sentinel */
+};
+
+
+/* 0-dim array from scalar object */
+
+static char doc_getarray[] = "sc.__array__(dtype) returns a 0-dim array from "
+                             "the scalar with the specified dtype";
+
+static PyObject *
+gentype_getarray(PyObject *scalar, PyObject *args)
+{
+ PyArray_Descr *outcode=NULL;
+ PyObject *ret;
+
+ if (!PyArg_ParseTuple(args, "|O&:__array__", &PyArray_DescrConverter,
+ &outcode)) {
+ Py_XDECREF(outcode);
+ return NULL;
+ }
+ ret = PyArray_FromScalar(scalar, outcode);
+ return ret;
+}
+
+static char doc_sc_wraparray[] = "sc.__array_wrap__(obj) returns a scalar from an array";
+
+static PyObject *
+gentype_wraparray(PyObject *NPY_UNUSED(scalar), PyObject *args)
+{
+ PyObject *obj;
+ PyArrayObject *arr;
+
+ if (PyTuple_Size(args) < 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "only accepts 1 argument.");
+ return NULL;
+ }
+ obj = PyTuple_GET_ITEM(args, 0);
+ if (!PyArray_Check(obj)) {
+ PyErr_SetString(PyExc_TypeError,
+ "can only be called with ndarray object");
+ return NULL;
+ }
+ arr = (PyArrayObject *)obj;
+
+ return PyArray_Scalar(PyArray_DATA(arr),
+ PyArray_DESCR(arr), (PyObject *)arr);
+}
+
+/*
+ * These gentype_* functions do not take keyword arguments.
+ * The proper flag is METH_VARARGS.
+ */
+/**begin repeat
+ *
+ * #name = tolist, item, __deepcopy__, __copy__,
+ * swapaxes, conj, conjugate, nonzero,
+ * fill, transpose, newbyteorder#
+ */
+static PyObject *
+gentype_@name@(PyObject *self, PyObject *args)
+{
+ return gentype_generic_method(self, args, NULL, "@name@");
+}
+/**end repeat**/
+
+static PyObject *
+gentype_itemset(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+ PyErr_SetString(PyExc_ValueError, "array-scalars are immutable");
+ return NULL;
+}
+
+static Py_ssize_t
+gentype_getreadbuf(PyObject *, Py_ssize_t, void **);
+
+static PyObject *
+gentype_byteswap(PyObject *self, PyObject *args, PyObject *kwds)
+{
+ npy_bool inplace = NPY_FALSE;
+ static char *kwlist[] = {"inplace", NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&:byteswap", kwlist,
+ PyArray_BoolConverter, &inplace)) {
+ return NULL;
+ }
+ if (inplace) {
+ PyErr_SetString(PyExc_ValueError,
+ "cannot byteswap a scalar in-place");
+ return NULL;
+ }
+ else {
+ /* get the data, copyswap it and pass it to a new Array scalar */
+ char *data;
+ PyArray_Descr *descr;
+ PyObject *new;
+ char *newmem;
+
+ gentype_getreadbuf(self, 0, (void **)&data);
+ descr = PyArray_DescrFromScalar(self);
+ newmem = PyObject_Malloc(descr->elsize);
+ if (newmem == NULL) {
+ Py_DECREF(descr);
+ return PyErr_NoMemory();
+ }
+ else {
+ descr->f->copyswap(newmem, data, 1, NULL);
+ }
+ new = PyArray_Scalar(newmem, descr, NULL);
+ PyObject_Free(newmem);
+ Py_DECREF(descr);
+ return new;
+ }
+}
+
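+/*
+ * Illustrative byteswap behaviour for scalars (a new scalar is returned;
+ * in-place swapping is rejected):
+ *
+ *   >>> import numpy as np
+ *   >>> np.int16(1).byteswap()
+ *   256
+ *   >>> np.int16(1).byteswap(True)
+ *   Traceback (most recent call last):
+ *       ...
+ *   ValueError: cannot byteswap a scalar in-place
+ */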
+
+/*
+ * These gentype_* functions take keyword arguments.
+ * The proper flag is METH_VARARGS | METH_KEYWORDS.
+ */
+/**begin repeat
+ *
+ * #name = take, getfield, put, repeat, tofile, mean, trace, diagonal, clip,
+ * std, var, sum, cumsum, prod, cumprod, compress, sort, argsort,
+ * round, argmax, argmin, max, min, ptp, any, all, astype, resize,
+ * reshape, choose, tostring, tobytes, copy, searchsorted, view,
+ * flatten, ravel, squeeze#
+ */
+static PyObject *
+gentype_@name@(PyObject *self, PyObject *args, PyObject *kwds)
+{
+ return gentype_generic_method(self, args, kwds, "@name@");
+}
+/**end repeat**/
+
+static PyObject *
+voidtype_getfield(PyVoidScalarObject *self, PyObject *args, PyObject *kwds)
+{
+ /* Use ndarray's getfield to obtain the field safely */
+ return gentype_generic_method((PyObject *)self, args, kwds, "getfield");
+}
+
+static PyObject *
+gentype_setfield(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args),
+ PyObject *NPY_UNUSED(kwds))
+{
+ PyErr_SetString(PyExc_TypeError,
+ "Can't set fields in a non-void array scalar.");
+ return NULL;
+}
+
+static PyObject *
+voidtype_setfield(PyVoidScalarObject *self, PyObject *args, PyObject *kwds)
+{
+ /*
+ * We would like to use ndarray's setfield because it performs safety
+ * checks on the field datatypes and because it broadcasts properly.
+ * However, as a special case, void-scalar assignment broadcasts
+ * differently from ndarrays when assigning to an object field: Assignment
+ * to an ndarray object field broadcasts, but assignment to a void-scalar
+ * object-field should not, in order to allow nested ndarrays.
+ * These lines should then behave identically:
+ *
+ * b = np.zeros(1, dtype=[('x', 'O')])
+ * b[0]['x'] = arange(3) # uses voidtype_setfield
+ * b['x'][0] = arange(3) # uses ndarray setitem
+ *
+ * Ndarray's setfield would try to broadcast the lhs. Instead we use
+ * ndarray getfield to get the field safely, then setitem with an empty
+ * tuple to set the value without broadcast. Note we also want subarrays to
+ * be set properly, ie
+ *
+ * a = np.zeros(1, dtype=[('x', 'i', 5)])
+ * a[0]['x'] = 1
+ *
+ * sets all values to 1. "getfield + setitem with empty tuple" takes
+ * care of both object arrays and subarrays.
+ */
+ PyObject *getfield_args, *value, *arr, *meth, *arr_field, *emptytuple;
+
+ value = PyTuple_GetItem(args, 0);
+ if (value == NULL) {
+ return NULL;
+ }
+ getfield_args = PyTuple_GetSlice(args, 1, 3);
+ if (getfield_args == NULL) {
+ return NULL;
+ }
+
+ /* 1. Convert to 0-d array and use getfield */
+ arr = PyArray_FromScalar((PyObject*)self, NULL);
+ if (arr == NULL) {
+ Py_DECREF(getfield_args);
+ return NULL;
+ }
+ meth = PyObject_GetAttrString(arr, "getfield");
+ if (meth == NULL) {
+ Py_DECREF(getfield_args);
+ Py_DECREF(arr);
+ return NULL;
+ }
+ if (kwds == NULL) {
+ arr_field = PyObject_CallObject(meth, getfield_args);
+ }
+ else {
+ arr_field = PyObject_Call(meth, getfield_args, kwds);
+ }
+ Py_DECREF(getfield_args);
+ Py_DECREF(meth);
+ Py_DECREF(arr);
+
+    if (arr_field == NULL) {
+ return NULL;
+ }
+
+ /* 2. Assign the value using setitem with empty tuple. */
+ emptytuple = PyTuple_New(0);
+ if (PyObject_SetItem(arr_field, emptytuple, value) < 0) {
+ Py_DECREF(arr_field);
+ Py_DECREF(emptytuple);
+ return NULL;
+ }
+ Py_DECREF(emptytuple);
+ Py_DECREF(arr_field);
+
+ Py_RETURN_NONE;
+}
+
+
+static PyObject *
+gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+ PyObject *ret = NULL, *obj = NULL, *mod = NULL;
+#if defined(NPY_PY3K)
+ Py_buffer view;
+#endif
+ const char *buffer;
+ Py_ssize_t buflen;
+
+ /* Return a tuple of (callable object, arguments) */
+ ret = PyTuple_New(2);
+ if (ret == NULL) {
+ return NULL;
+ }
+
+#if defined(NPY_PY3K)
+ if (PyArray_IsScalar(self, Unicode)) {
+ /* Unicode on Python 3 does not expose the buffer interface */
+ buffer = PyUnicode_AS_DATA(self);
+ buflen = PyUnicode_GET_DATA_SIZE(self);
+ }
+ else if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) {
+ buffer = view.buf;
+ buflen = view.len;
+ /*
+ * In Python 3 both of the deprecated functions PyObject_AsWriteBuffer and
+ * PyObject_AsReadBuffer that this code replaces release the buffer. It is
+ * up to the object that supplies the buffer to guarantee that the buffer
+ * sticks around after the release.
+ */
+ PyBuffer_Release(&view);
+ _dealloc_cached_buffer_info(self);
+ }
+ else {
+ Py_DECREF(ret);
+ return NULL;
+ }
+#else
+ if (PyObject_AsReadBuffer(self, (const void **)&buffer, &buflen)<0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+#endif
+
+ mod = PyImport_ImportModule("numpy.core._multiarray_umath");
+ if (mod == NULL) {
+ return NULL;
+ }
+ obj = PyObject_GetAttrString(mod, "scalar");
+ Py_DECREF(mod);
+ if (obj == NULL) {
+ return NULL;
+ }
+ PyTuple_SET_ITEM(ret, 0, obj);
+ obj = PyObject_GetAttrString((PyObject *)self, "dtype");
+ if (PyArray_IsScalar(self, Object)) {
+ mod = ((PyObjectScalarObject *)self)->obval;
+ PyTuple_SET_ITEM(ret, 1, Py_BuildValue("NO", obj, mod));
+ }
+ else {
+#ifndef Py_UNICODE_WIDE
+ /*
+ * We need to expand the buffer so that we always write
+ * UCS4 to disk for pickle of unicode scalars.
+ *
+ * This could be in a unicode_reduce function, but
+ * that would require re-factoring.
+ */
+ int alloc = 0;
+ char *tmp;
+ int newlen;
+
+ if (PyArray_IsScalar(self, Unicode)) {
+ tmp = PyArray_malloc(buflen*2);
+ if (tmp == NULL) {
+ Py_DECREF(ret);
+ return PyErr_NoMemory();
+ }
+ alloc = 1;
+ newlen = PyUCS2Buffer_AsUCS4((Py_UNICODE *)buffer,
+ (npy_ucs4 *)tmp,
+ buflen / 2, buflen / 2);
+ buflen = newlen*4;
+ buffer = tmp;
+ }
+#endif
+ mod = PyBytes_FromStringAndSize(buffer, buflen);
+ if (mod == NULL) {
+ Py_DECREF(ret);
+#ifndef Py_UNICODE_WIDE
+ ret = NULL;
+ goto fail;
+#else
+ return NULL;
+#endif
+ }
+ PyTuple_SET_ITEM(ret, 1,
+ Py_BuildValue("NN", obj, mod));
+#ifndef Py_UNICODE_WIDE
+fail:
+ if (alloc) PyArray_free((char *)buffer);
+#endif
+ }
+ return ret;
+}
+
+/* ignores everything */
+static PyObject *
+gentype_setstate(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+ Py_RETURN_NONE;
+}
+
+static PyObject *
+gentype_dump(PyObject *self, PyObject *args)
+{
+ PyObject *file = NULL;
+ int ret;
+
+ if (!PyArg_ParseTuple(args, "O:dump", &file)) {
+ return NULL;
+ }
+ ret = PyArray_Dump(self, file, 2);
+ if (ret < 0) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
+}
+
+static PyObject *
+gentype_dumps(PyObject *self, PyObject *args)
+{
+ if (!PyArg_ParseTuple(args, "")) {
+ return NULL;
+ }
+ return PyArray_Dumps(self, 2);
+}
+
+
+/* setting flags cannot be done for scalars */
+static PyObject *
+gentype_setflags(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args),
+ PyObject *NPY_UNUSED(kwds))
+{
+ Py_RETURN_NONE;
+}
+
+/*
+ * casting complex numbers (that don't inherit from Python complex)
+ * to Python complex
+ */
+
+/**begin repeat
+ * #name = cfloat, clongdouble#
+ * #Name = CFloat, CLongDouble#
+ */
+static PyObject *
+@name@_complex(PyObject *self, PyObject *NPY_UNUSED(args),
+ PyObject *NPY_UNUSED(kwds))
+{
+ return PyComplex_FromDoubles(PyArrayScalar_VAL(self, @Name@).real,
+ PyArrayScalar_VAL(self, @Name@).imag);
+}
+/**end repeat**/
+
+/*
+ * need to fill in doc-strings for these methods on import -- copy from
+ * array docstrings
+ */
+static PyMethodDef gentype_methods[] = {
+ {"tolist",
+ (PyCFunction)gentype_tolist,
+ METH_VARARGS, NULL},
+ {"item",
+ (PyCFunction)gentype_item,
+ METH_VARARGS, NULL},
+ {"itemset",
+ (PyCFunction)gentype_itemset,
+ METH_VARARGS, NULL},
+ {"tobytes",
+ (PyCFunction)gentype_tobytes,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"tofile",
+ (PyCFunction)gentype_tofile,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"tostring",
+ (PyCFunction)gentype_tostring,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"byteswap",
+ (PyCFunction)gentype_byteswap,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"astype",
+ (PyCFunction)gentype_astype,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"getfield",
+ (PyCFunction)gentype_getfield,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"setfield",
+ (PyCFunction)gentype_setfield,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"copy",
+ (PyCFunction)gentype_copy,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"resize",
+ (PyCFunction)gentype_resize,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"__array__",
+ (PyCFunction)gentype_getarray,
+ METH_VARARGS, doc_getarray},
+ {"__array_wrap__",
+ (PyCFunction)gentype_wraparray,
+ METH_VARARGS, doc_sc_wraparray},
+
+ /* for the sys module */
+ {"__sizeof__",
+ (PyCFunction)gentype_sizeof,
+ METH_NOARGS, NULL},
+
+ /* for the copy module */
+ {"__copy__",
+ (PyCFunction)gentype___copy__,
+ METH_VARARGS, NULL},
+ {"__deepcopy__",
+ (PyCFunction)gentype___deepcopy__,
+ METH_VARARGS, NULL},
+
+ {"__reduce__",
+ (PyCFunction) gentype_reduce,
+ METH_VARARGS, NULL},
+ /* For consistency does nothing */
+ {"__setstate__",
+ (PyCFunction) gentype_setstate,
+ METH_VARARGS, NULL},
+
+ {"dumps",
+ (PyCFunction) gentype_dumps,
+ METH_VARARGS, NULL},
+ {"dump",
+ (PyCFunction) gentype_dump,
+ METH_VARARGS, NULL},
+
+ /* Methods for array */
+ {"fill",
+ (PyCFunction)gentype_fill,
+ METH_VARARGS, NULL},
+ {"transpose",
+ (PyCFunction)gentype_transpose,
+ METH_VARARGS, NULL},
+ {"take",
+ (PyCFunction)gentype_take,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"put",
+ (PyCFunction)gentype_put,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"repeat",
+ (PyCFunction)gentype_repeat,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"choose",
+ (PyCFunction)gentype_choose,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"sort",
+ (PyCFunction)gentype_sort,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"argsort",
+ (PyCFunction)gentype_argsort,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"searchsorted",
+ (PyCFunction)gentype_searchsorted,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"argmax",
+ (PyCFunction)gentype_argmax,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"argmin",
+ (PyCFunction)gentype_argmin,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"reshape",
+ (PyCFunction)gentype_reshape,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"squeeze",
+ (PyCFunction)gentype_squeeze,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"view",
+ (PyCFunction)gentype_view,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"swapaxes",
+ (PyCFunction)gentype_swapaxes,
+ METH_VARARGS, NULL},
+ {"max",
+ (PyCFunction)gentype_max,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"min",
+ (PyCFunction)gentype_min,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"ptp",
+ (PyCFunction)gentype_ptp,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"mean",
+ (PyCFunction)gentype_mean,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"trace",
+ (PyCFunction)gentype_trace,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"diagonal",
+ (PyCFunction)gentype_diagonal,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"clip",
+ (PyCFunction)gentype_clip,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"conj",
+ (PyCFunction)gentype_conj,
+ METH_VARARGS, NULL},
+ {"conjugate",
+ (PyCFunction)gentype_conjugate,
+ METH_VARARGS, NULL},
+ {"nonzero",
+ (PyCFunction)gentype_nonzero,
+ METH_VARARGS, NULL},
+ {"std",
+ (PyCFunction)gentype_std,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"var",
+ (PyCFunction)gentype_var,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"sum",
+ (PyCFunction)gentype_sum,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"cumsum",
+ (PyCFunction)gentype_cumsum,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"prod",
+ (PyCFunction)gentype_prod,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"cumprod",
+ (PyCFunction)gentype_cumprod,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"all",
+ (PyCFunction)gentype_all,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"any",
+ (PyCFunction)gentype_any,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"compress",
+ (PyCFunction)gentype_compress,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"flatten",
+ (PyCFunction)gentype_flatten,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"ravel",
+ (PyCFunction)gentype_ravel,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"round",
+ (PyCFunction)gentype_round,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+#if defined(NPY_PY3K)
+ /* Hook for the round() builtin */
+ {"__round__",
+ (PyCFunction)gentype_round,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+#endif
+ /* For the format function */
+ {"__format__",
+ gentype_format,
+ METH_VARARGS,
+ "NumPy array scalar formatter"},
+ {"setflags",
+ (PyCFunction)gentype_setflags,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"newbyteorder",
+ (PyCFunction)gentype_newbyteorder,
+ METH_VARARGS, NULL},
+ {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+static PyGetSetDef voidtype_getsets[] = {
+ {"flags",
+ (getter)voidtype_flags_get,
+ (setter)0,
+ "integer value of flags",
+ NULL},
+ {"dtype",
+ (getter)voidtype_dtypedescr_get,
+ (setter)0,
+ "dtype object",
+ NULL},
+ {"base",
+ (getter)voidtype_base_get,
+ (setter)0,
+ "base object",
+ NULL},
+ {NULL, NULL, NULL, NULL, NULL}
+};
+
+static PyMethodDef voidtype_methods[] = {
+ {"getfield",
+ (PyCFunction)voidtype_getfield,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"setfield",
+ (PyCFunction)voidtype_setfield,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {NULL, NULL, 0, NULL}
+};
+
+static PyGetSetDef inttype_getsets[] = {
+ {"numerator",
+ (getter)inttype_numerator_get,
+ (setter)0,
+ "numerator of value (the value itself)",
+ NULL},
+ {"denominator",
+ (getter)inttype_denominator_get,
+ (setter)0,
+ "denominator of value (1)",
+ NULL},
+ {NULL, NULL, NULL, NULL, NULL}
+};
+
+/**begin repeat
+ * #name = cfloat,clongdouble#
+ */
+static PyMethodDef @name@type_methods[] = {
+ {"__complex__",
+ (PyCFunction)@name@_complex,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {NULL, NULL, 0, NULL}
+};
+/**end repeat**/
+
+/************* As_mapping functions for void array scalar ************/
+
+static Py_ssize_t
+voidtype_length(PyVoidScalarObject *self)
+{
+ if (!PyDataType_HASFIELDS(self->descr)) {
+ return 0;
+ }
+ else {
+ /* return the number of fields */
+ return (Py_ssize_t) PyTuple_GET_SIZE(self->descr->names);
+ }
+}
+
+static PyObject *
+voidtype_subscript(PyVoidScalarObject *self, PyObject *ind);
+
+static PyObject *
+voidtype_item(PyVoidScalarObject *self, Py_ssize_t n)
+{
+ npy_intp m;
+ PyObject *flist=NULL;
+
+ if (!(PyDataType_HASFIELDS(self->descr))) {
+ PyErr_SetString(PyExc_IndexError,
+ "can't index void scalar without fields");
+ return NULL;
+ }
+ flist = self->descr->names;
+ m = PyTuple_GET_SIZE(flist);
+ if (n < 0) {
+ n += m;
+ }
+ if (n < 0 || n >= m) {
+ PyErr_Format(PyExc_IndexError, "invalid index (%d)", (int) n);
+ return NULL;
+ }
+
+ return voidtype_subscript(self, PyTuple_GetItem(flist, n));
+}
+
+/* get field by name or number */
+static PyObject *
+voidtype_subscript(PyVoidScalarObject *self, PyObject *ind)
+{
+ npy_intp n;
+ PyObject *ret, *res;
+
+ /* structured voids will accept an integer index */
+ if (PyDataType_HASFIELDS(self->descr)) {
+ n = PyArray_PyIntAsIntp(ind);
+ if (!error_converting(n)) {
+ return voidtype_item(self, (Py_ssize_t)n);
+ }
+ PyErr_Clear();
+ }
+
+ res = PyArray_FromScalar((PyObject*)self, NULL);
+
+ /* ellipsis should return 0d array */
+    if (ind == Py_Ellipsis) {
+ return res;
+ }
+
+ /*
+ * other cases (field names, empty tuple) will return either
+ * scalar or non-0d array. Compute this using ndarray subscript.
+ */
+ ret = array_subscript((PyArrayObject *)res, ind);
+ Py_DECREF(res);
+ return PyArray_Return((PyArrayObject*)ret);
+}
+
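+/*
+ * Illustrative indexing of a structured void scalar via the mapping above
+ * (field name, field number, or Ellipsis for a 0-d array view):
+ *
+ *   >>> import numpy as np
+ *   >>> v = np.array([(1, 2.0)], dtype=[('a', 'i4'), ('b', 'f8')])[0]
+ *   >>> v['a'], v[1]
+ *   (1, 2.0)
+ *   >>> v[...].shape
+ *   ()
+ */
+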
+static int
+voidtype_ass_subscript(PyVoidScalarObject *self, PyObject *ind, PyObject *val);
+
+static int
+voidtype_ass_item(PyVoidScalarObject *self, Py_ssize_t n, PyObject *val)
+{
+ npy_intp m;
+ PyObject *flist=NULL;
+
+ if (!(PyDataType_HASFIELDS(self->descr))) {
+ PyErr_SetString(PyExc_IndexError,
+ "can't index void scalar without fields");
+ return -1;
+ }
+
+ flist = self->descr->names;
+ m = PyTuple_GET_SIZE(flist);
+ if (n < 0) {
+ n += m;
+ }
+ if (n < 0 || n >= m) {
+ PyErr_Format(PyExc_IndexError, "invalid index (%d)", (int) n);
+ return -1;
+ }
+
+ return voidtype_ass_subscript(self, PyTuple_GetItem(flist, n), val);
+}
+
+static int
+voidtype_ass_subscript(PyVoidScalarObject *self, PyObject *ind, PyObject *val)
+{
+ npy_intp n;
+ char *msg = "invalid index";
+ PyObject *args;
+
+ if (!PyDataType_HASFIELDS(self->descr)) {
+ PyErr_SetString(PyExc_IndexError,
+ "can't index void scalar without fields");
+ return -1;
+ }
+
+ if (!val) {
+ PyErr_SetString(PyExc_ValueError,
+ "cannot delete scalar field");
+ return -1;
+ }
+
+ if (PyBaseString_Check(ind)) {
+ /*
+ * Much like in voidtype_setfield, we cannot simply use ndarray's
+ * __setitem__ since assignment to void scalars should not broadcast
+ * the lhs. Instead we get a view through __getitem__ and then assign
+ * the value using setitem with an empty tuple (which treats both
+ * object arrays and subarrays properly).
+ *
+ * Also we do not want to use voidtype_setfield here, since we do
+ * not need to do the (slow) view safety checks, since we already
+ * know the dtype/offset are safe.
+ */
+
+ PyObject *arr, *arr_field, *meth, *emptytuple;
+
+ /* 1. Convert to 0-d array and use getitem */
+ arr = PyArray_FromScalar((PyObject*)self, NULL);
+ if (arr == NULL) {
+ return -1;
+ }
+ meth = PyObject_GetAttrString(arr, "__getitem__");
+ if (meth == NULL) {
+ Py_DECREF(arr);
+ return -1;
+ }
+ args = Py_BuildValue("(O)", ind);
+ arr_field = PyObject_CallObject(meth, args);
+ Py_DECREF(meth);
+ Py_DECREF(arr);
+ Py_DECREF(args);
+
+        if (arr_field == NULL) {
+ return -1;
+ }
+
+ /* 2. Assign the value using setitem with empty tuple. */
+ emptytuple = PyTuple_New(0);
+ if (PyObject_SetItem(arr_field, emptytuple, val) < 0) {
+ Py_DECREF(arr_field);
+ Py_DECREF(emptytuple);
+ return -1;
+ }
+ Py_DECREF(emptytuple);
+ Py_DECREF(arr_field);
+ return 0;
+ }
+
+ /* try to convert it to a number */
+ n = PyArray_PyIntAsIntp(ind);
+ if (error_converting(n)) {
+ goto fail;
+ }
+ return voidtype_ass_item(self, (Py_ssize_t)n, val);
+
+fail:
+ PyErr_SetString(PyExc_IndexError, msg);
+ return -1;
+}
+
+static PyMappingMethods voidtype_as_mapping = {
+ (lenfunc)voidtype_length, /*mp_length*/
+ (binaryfunc)voidtype_subscript, /*mp_subscript*/
+ (objobjargproc)voidtype_ass_subscript, /*mp_ass_subscript*/
+};
+
+
+static PySequenceMethods voidtype_as_sequence = {
+ (lenfunc)voidtype_length, /*sq_length*/
+ 0, /*sq_concat*/
+ 0, /*sq_repeat*/
+ (ssizeargfunc)voidtype_item, /*sq_item*/
+ 0, /*sq_slice*/
+ (ssizeobjargproc)voidtype_ass_item, /*sq_ass_item*/
+ 0, /* ssq_ass_slice */
+ 0, /* sq_contains */
+ 0, /* sq_inplace_concat */
+ 0, /* sq_inplace_repeat */
+};
+
+
+static Py_ssize_t
+gentype_getreadbuf(PyObject *self, Py_ssize_t segment, void **ptrptr)
+{
+ int numbytes;
+ PyArray_Descr *outcode;
+
+ if (segment != 0) {
+ PyErr_SetString(PyExc_SystemError,
+ "Accessing non-existent array segment");
+ return -1;
+ }
+
+ outcode = PyArray_DescrFromScalar(self);
+ numbytes = outcode->elsize;
+ *ptrptr = (void *)scalar_value(self, outcode);
+
+#ifndef Py_UNICODE_WIDE
+ if (outcode->type_num == NPY_UNICODE) {
+ numbytes >>= 1;
+ }
+#endif
+ Py_DECREF(outcode);
+ return numbytes;
+}
+
+#if !defined(NPY_PY3K)
+static Py_ssize_t
+gentype_getsegcount(PyObject *self, Py_ssize_t *lenp)
+{
+ PyArray_Descr *outcode;
+
+ outcode = PyArray_DescrFromScalar(self);
+ if (lenp) {
+ *lenp = outcode->elsize;
+#ifndef Py_UNICODE_WIDE
+ if (outcode->type_num == NPY_UNICODE) {
+ *lenp >>= 1;
+ }
+#endif
+ }
+ Py_DECREF(outcode);
+ return 1;
+}
+
+static Py_ssize_t
+gentype_getcharbuf(PyObject *self, Py_ssize_t segment, constchar **ptrptr)
+{
+ if (PyArray_IsScalar(self, String) ||
+ PyArray_IsScalar(self, Unicode)) {
+ return gentype_getreadbuf(self, segment, (void **)ptrptr);
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "Non-character array cannot be interpreted "\
+ "as character buffer.");
+ return -1;
+ }
+}
+#endif /* !defined(NPY_PY3K) */
+
+static PyBufferProcs gentype_as_buffer = {
+#if !defined(NPY_PY3K)
+ gentype_getreadbuf, /* bf_getreadbuffer*/
+ NULL, /* bf_getwritebuffer*/
+ gentype_getsegcount, /* bf_getsegcount*/
+ gentype_getcharbuf, /* bf_getcharbuffer*/
+#endif
+ gentype_getbuffer, /* bf_getbuffer */
+ NULL, /* bf_releasebuffer */
+};
+
+
+#if defined(NPY_PY3K)
+#define BASEFLAGS Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE
+#define LEAFFLAGS Py_TPFLAGS_DEFAULT
+#else
+#define BASEFLAGS Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES
+#define LEAFFLAGS Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES
+#endif
+
+NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(NULL, 0)
+#else
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+#endif
+ "numpy.generic", /* tp_name*/
+ sizeof(PyObject), /* tp_basicsize*/
+ 0, /* tp_itemsize */
+ /* methods */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ 0, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+
+static void
+void_dealloc(PyVoidScalarObject *v)
+{
+ _dealloc_cached_buffer_info((PyObject *)v);
+
+ if (v->flags & NPY_ARRAY_OWNDATA) {
+ npy_free_cache(v->obval, Py_SIZE(v));
+ }
+ Py_XDECREF(v->descr);
+ Py_XDECREF(v->base);
+ Py_TYPE(v)->tp_free(v);
+}
+
+static void
+object_arrtype_dealloc(PyObject *v)
+{
+ Py_XDECREF(((PyObjectScalarObject *)v)->obval);
+ Py_TYPE(v)->tp_free(v);
+}
+
+/*
+ * string and unicode inherit from Python Type first and so GET_ITEM
+ * is different to get to the Python Type.
+ *
+ * ok is a work-around for a bug in complex_new that doesn't allocate
+ * memory from the sub-types memory allocator.
+ */
+
+#define _WORK(num) \
+ if (type->tp_bases && (PyTuple_GET_SIZE(type->tp_bases)==2)) { \
+ PyTypeObject *sup; \
+ /* We are inheriting from a Python type as well so \
+ give it first dibs on conversion */ \
+ sup = (PyTypeObject *)PyTuple_GET_ITEM(type->tp_bases, num); \
+ /* Prevent recursion */ \
+ if (thisfunc != sup->tp_new) { \
+ robj = sup->tp_new(type, args, kwds); \
+ if (robj != NULL) goto finish; \
+ if (PyTuple_GET_SIZE(args)!=1) return NULL; \
+ PyErr_Clear(); \
+ } \
+ /* now do default conversion */ \
+ }
+
+#define _WORK1 _WORK(1)
+#define _WORKz _WORK(0)
+#define _WORK0
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong, ubyte, ushort, uint, ulong,
+ * ulonglong, half, float, double, longdouble, cfloat, cdouble,
+ * clongdouble, string, unicode, object#
+ * #Name = Byte, Short, Int, Long, LongLong, UByte, UShort, UInt, ULong,
+ * ULongLong, Half, Float, Double, LongDouble, CFloat, CDouble,
+ * CLongDouble, String, Unicode, Object#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG, UBYTE, USHORT, UINT, ULONG,
+ * ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE, CFLOAT, CDOUBLE,
+ * CLONGDOUBLE, STRING, UNICODE, OBJECT#
+ * #work = 0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,z,z,0#
+ * #default = 0*17,1*2,2#
+ */
+
+#define _NPY_UNUSED2_1
+#define _NPY_UNUSED2_z
+#define _NPY_UNUSED2_0 NPY_UNUSED
+#define _NPY_UNUSED1_0
+#define _NPY_UNUSED1_1
+#define _NPY_UNUSED1_2 NPY_UNUSED
+
+static PyObject *
+@name@_arrtype_new(PyTypeObject *_NPY_UNUSED1_@default@(type), PyObject *args, PyObject *_NPY_UNUSED2_@work@(kwds))
+{
+ PyObject *obj = NULL;
+ PyObject *robj;
+ PyArrayObject *arr;
+ PyArray_Descr *typecode = NULL;
+#if (@work@ != 0) || (@default@ == 1)
+ void *thisfunc = (void *)@name@_arrtype_new;
+#endif
+#if !(@default@ == 2)
+ int itemsize;
+ void *dest, *src;
+#endif
+
+ /*
+ * allow base-class (if any) to do conversion
+ * If successful, this will jump to finish:
+ */
+ _WORK@work@
+
+ /* TODO: include type name in error message, which is not @name@ */
+ if (!PyArg_ParseTuple(args, "|O", &obj)) {
+ return NULL;
+ }
+ typecode = PyArray_DescrFromType(NPY_@TYPE@);
+ if (typecode == NULL) {
+ return NULL;
+ }
+ /*
+ * typecode is new reference and stolen by
+ * PyArray_FromAny but not PyArray_Scalar
+ */
+ if (obj == NULL) {
+#if @default@ == 0
+ robj = PyArray_Scalar(NULL, typecode, NULL);
+ if (robj == NULL) {
+ Py_DECREF(typecode);
+ return NULL;
+ }
+ memset(&((Py@Name@ScalarObject *)robj)->obval, 0, sizeof(npy_@name@));
+#elif @default@ == 1
+ robj = PyArray_Scalar(NULL, typecode, NULL);
+#elif @default@ == 2
+ Py_INCREF(Py_None);
+ robj = Py_None;
+#endif
+ Py_DECREF(typecode);
+ goto finish;
+ }
+
+ /*
+ * It is expected at this point that robj is a PyArrayScalar
+ * (even for Object Data Type)
+ */
+ arr = (PyArrayObject *)PyArray_FromAny(obj, typecode,
+ 0, 0, NPY_ARRAY_FORCECAST, NULL);
+ if ((arr == NULL) || (PyArray_NDIM(arr) > 0)) {
+ return (PyObject *)arr;
+ }
+ /* 0-d array */
+ robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+ Py_DECREF(arr);
+
+finish:
+ /*
+ * In OBJECT case, robj is no longer a
+ * PyArrayScalar at this point but the
+ * remaining code assumes it is
+ */
+#if @default@ == 2
+ return robj;
+#else
+ /* Normal return */
+ if ((robj == NULL) || (Py_TYPE(robj) == type)) {
+ return robj;
+ }
+
+ /*
+ * This return path occurs when the requested type is not created
+ * but another scalar object is created instead (i.e. when
+ * the base-class does the conversion in _WORK macro)
+ */
+
+ /* Need to allocate new type and copy data-area over */
+ if (type->tp_itemsize) {
+ itemsize = PyBytes_GET_SIZE(robj);
+ }
+ else {
+ itemsize = 0;
+ }
+ obj = type->tp_alloc(type, itemsize);
+ if (obj == NULL) {
+ Py_DECREF(robj);
+ return NULL;
+ }
+ /* typecode will be NULL */
+ typecode = PyArray_DescrFromType(NPY_@TYPE@);
+ dest = scalar_value(obj, typecode);
+ src = scalar_value(robj, typecode);
+ Py_DECREF(typecode);
+#if @default@ == 0
+ *((npy_@name@ *)dest) = *((npy_@name@ *)src);
+#elif @default@ == 1 /* unicode and strings */
+ if (itemsize == 0) { /* unicode */
+#if PY_VERSION_HEX >= 0x03030000
+ itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+#else
+ itemsize = ((PyUnicodeObject *)robj)->length * sizeof(Py_UNICODE);
+#endif
+ }
+ memcpy(dest, src, itemsize);
+ /* @default@ == 2 won't get here */
+#endif
+ Py_DECREF(robj);
+ return obj;
+#endif
+}
+/**end repeat**/
+
+#undef _WORK1
+#undef _WORKz
+#undef _WORK0
+#undef _WORK
+
+/**begin repeat
+ * #name = datetime, timedelta#
+ * #Name = Datetime, Timedelta#
+ * #NAME = DATETIME, TIMEDELTA#
+ * #is_datetime = 1, 0#
+ */
+
+static PyObject *
+@name@_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ PyObject *obj = NULL, *meta_obj = NULL;
+ Py@Name@ScalarObject *ret;
+
+ /* TODO: include type name in error message, which is not @name@ */
+ if (!PyArg_ParseTuple(args, "|OO", &obj, &meta_obj)) {
+ return NULL;
+ }
+
+ /* Allocate the return scalar */
+ ret = (Py@Name@ScalarObject *)Py@Name@ArrType_Type.tp_alloc(
+ &Py@Name@ArrType_Type, 0);
+ if (ret == NULL) {
+ return NULL;
+ }
+
+    /* Incorporate the metadata if it is provided */
+ if (meta_obj != NULL) {
+ /* Parse the provided metadata input */
+ if (convert_pyobject_to_datetime_metadata(meta_obj, &ret->obmeta)
+ < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ }
+ else {
+ /*
+         * A unit of -1 signals that convert_pyobject_to_datetime
+         * should populate the metadata from the input object.
+ */
+ ret->obmeta.base = -1;
+ }
+
+ if (obj == NULL) {
+ if (ret->obmeta.base == -1) {
+ ret->obmeta.base = NPY_DATETIME_DEFAULTUNIT;
+ ret->obmeta.num = 1;
+ }
+
+ /* Make datetime default to NaT, timedelta default to zero */
+#if @is_datetime@
+ ret->obval = NPY_DATETIME_NAT;
+#else
+ ret->obval = 0;
+#endif
+ }
+ else if (convert_pyobject_to_@name@(&ret->obmeta, obj,
+ NPY_SAME_KIND_CASTING, &ret->obval) < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+
+ return (PyObject *)ret;
+}
+/**end repeat**/
+
+/* bool->tp_new only returns Py_True or Py_False */
+static PyObject *
+bool_arrtype_new(PyTypeObject *NPY_UNUSED(type), PyObject *args, PyObject *NPY_UNUSED(kwds))
+{
+ PyObject *obj = NULL;
+ PyArrayObject *arr;
+
+ if (!PyArg_ParseTuple(args, "|O:bool_", &obj)) {
+ return NULL;
+ }
+ if (obj == NULL) {
+ PyArrayScalar_RETURN_FALSE;
+ }
+ if (obj == Py_False) {
+ PyArrayScalar_RETURN_FALSE;
+ }
+ if (obj == Py_True) {
+ PyArrayScalar_RETURN_TRUE;
+ }
+ arr = (PyArrayObject *)PyArray_FROM_OTF(obj,
+ NPY_BOOL, NPY_ARRAY_FORCECAST);
+ if (arr && 0 == PyArray_NDIM(arr)) {
+ npy_bool val = *((npy_bool *)PyArray_DATA(arr));
+ Py_DECREF(arr);
+ PyArrayScalar_RETURN_BOOL_FROM_LONG(val);
+ }
+ return PyArray_Return((PyArrayObject *)arr);
+}
+
+static PyObject *
+bool_arrtype_and(PyObject *a, PyObject *b)
+{
+ if (PyArray_IsScalar(a, Bool) && PyArray_IsScalar(b, Bool)) {
+ PyArrayScalar_RETURN_BOOL_FROM_LONG
+ ((a == PyArrayScalar_True) & (b == PyArrayScalar_True));
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_and(a, b);
+}
+
+static PyObject *
+bool_arrtype_or(PyObject *a, PyObject *b)
+{
+ if (PyArray_IsScalar(a, Bool) && PyArray_IsScalar(b, Bool)) {
+ PyArrayScalar_RETURN_BOOL_FROM_LONG
+ ((a == PyArrayScalar_True)|(b == PyArrayScalar_True));
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_or(a, b);
+}
+
+static PyObject *
+bool_arrtype_xor(PyObject *a, PyObject *b)
+{
+ if (PyArray_IsScalar(a, Bool) && PyArray_IsScalar(b, Bool)) {
+ PyArrayScalar_RETURN_BOOL_FROM_LONG
+ ((a == PyArrayScalar_True)^(b == PyArrayScalar_True));
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_xor(a, b);
+}
+
+static int
+bool_arrtype_nonzero(PyObject *a)
+{
+ return a == PyArrayScalar_True;
+}
+
+/**begin repeat
+ * #name = byte, short, int, long, ubyte, ushort, longlong, uint,
+ * ulong, ulonglong#
+ * #Name = Byte, Short, Int, Long, UByte, UShort, LongLong, UInt,
+ * ULong, ULongLong#
+ * #type = PyInt_FromLong*6, PyLong_FromLongLong*1,
+ * PyLong_FromUnsignedLong*2, PyLong_FromUnsignedLongLong#
+ */
+static PyNumberMethods @name@_arrtype_as_number;
+static PyObject *
+@name@_index(PyObject *self)
+{
+ return @type@(PyArrayScalar_VAL(self, @Name@));
+}
+/**end repeat**/
+
+/**begin repeat
+ * #name = half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #NAME = Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ */
+static PyNumberMethods @name@_arrtype_as_number;
+/**end repeat**/
+
+static PyObject *
+bool_index(PyObject *a)
+{
+ if (DEPRECATE(
+ "In future, it will be an error for 'np.bool_' scalars to be "
+ "interpreted as an index") < 0) {
+ return NULL;
+ }
+ else {
+ return PyInt_FromLong(PyArrayScalar_VAL(a, Bool));
+ }
+}
+
+/* Arithmetic methods -- only so we can override &, |, ^. */
+NPY_NO_EXPORT PyNumberMethods bool_arrtype_as_number = {
+ 0, /* nb_add */
+ 0, /* nb_subtract */
+ 0, /* nb_multiply */
+#if defined(NPY_PY3K)
+#else
+ 0, /* nb_divide */
+#endif
+ 0, /* nb_remainder */
+ 0, /* nb_divmod */
+ 0, /* nb_power */
+ 0, /* nb_negative */
+ 0, /* nb_positive */
+ 0, /* nb_absolute */
+ (inquiry)bool_arrtype_nonzero, /* nb_nonzero / nb_bool */
+ 0, /* nb_invert */
+ 0, /* nb_lshift */
+ 0, /* nb_rshift */
+ (binaryfunc)bool_arrtype_and, /* nb_and */
+ (binaryfunc)bool_arrtype_xor, /* nb_xor */
+ (binaryfunc)bool_arrtype_or, /* nb_or */
+#if defined(NPY_PY3K)
+#else
+ 0, /* nb_coerce */
+#endif
+ 0, /* nb_int */
+#if defined(NPY_PY3K)
+ 0, /* nb_reserved */
+#else
+ 0, /* nb_long */
+#endif
+ 0, /* nb_float */
+#if defined(NPY_PY3K)
+#else
+ 0, /* nb_oct */
+ 0, /* nb_hex */
+#endif
+ /* Added in release 2.0 */
+ 0, /* nb_inplace_add */
+ 0, /* nb_inplace_subtract */
+ 0, /* nb_inplace_multiply */
+#if defined(NPY_PY3K)
+#else
+ 0, /* nb_inplace_divide */
+#endif
+ 0, /* nb_inplace_remainder */
+ 0, /* nb_inplace_power */
+ 0, /* nb_inplace_lshift */
+ 0, /* nb_inplace_rshift */
+ 0, /* nb_inplace_and */
+ 0, /* nb_inplace_xor */
+ 0, /* nb_inplace_or */
+ /* Added in release 2.2 */
+ /* The following require the Py_TPFLAGS_HAVE_CLASS flag */
+ 0, /* nb_floor_divide */
+ 0, /* nb_true_divide */
+ 0, /* nb_inplace_floor_divide */
+ 0, /* nb_inplace_true_divide */
+ /* Added in release 2.5 */
+ 0, /* nb_index */
+};
+
+static PyObject *
+void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *NPY_UNUSED(kwds))
+{
+ PyObject *obj, *arr;
+ PyObject *new = NULL;
+
+ if (!PyArg_ParseTuple(args, "O:void", &obj)) {
+ return NULL;
+ }
+ /*
+     * For a VOID scalar, first see if obj is an integer or long;
+     * if so, create new zero-filled memory of that size for the scalar.
+ */
+ if (PyLong_Check(obj) || PyInt_Check(obj) ||
+ PyArray_IsScalar(obj, Integer) ||
+ (PyArray_Check(obj) &&
+ PyArray_NDIM((PyArrayObject *)obj)==0 &&
+ PyArray_ISINTEGER((PyArrayObject *)obj))) {
+#if defined(NPY_PY3K)
+ new = Py_TYPE(obj)->tp_as_number->nb_int(obj);
+#else
+ new = Py_TYPE(obj)->tp_as_number->nb_long(obj);
+#endif
+ }
+ if (new && PyLong_Check(new)) {
+ PyObject *ret;
+ char *destptr;
+ npy_ulonglong memu = PyLong_AsUnsignedLongLong(new);
+ Py_DECREF(new);
+ if (PyErr_Occurred() || (memu > NPY_MAX_INT)) {
+ PyErr_Clear();
+ PyErr_Format(PyExc_OverflowError,
+ "size must be non-negative and not greater than %d",
+ (int) NPY_MAX_INT);
+ return NULL;
+ }
+ destptr = npy_alloc_cache_zero(memu);
+ if (destptr == NULL) {
+ return PyErr_NoMemory();
+ }
+ ret = type->tp_alloc(type, 0);
+ if (ret == NULL) {
+ npy_free_cache(destptr, memu);
+ return PyErr_NoMemory();
+ }
+ ((PyVoidScalarObject *)ret)->obval = destptr;
+ Py_SIZE((PyVoidScalarObject *)ret) = (int) memu;
+ ((PyVoidScalarObject *)ret)->descr =
+ PyArray_DescrNewFromType(NPY_VOID);
+ ((PyVoidScalarObject *)ret)->descr->elsize = (int) memu;
+ ((PyVoidScalarObject *)ret)->flags = NPY_ARRAY_BEHAVED |
+ NPY_ARRAY_OWNDATA;
+ ((PyVoidScalarObject *)ret)->base = NULL;
+ return ret;
+ }
+
+ arr = PyArray_FROM_OTF(obj, NPY_VOID, NPY_ARRAY_FORCECAST);
+ return PyArray_Return((PyArrayObject *)arr);
+}
+
+
+/**************** Define Hash functions ********************/
+
+/**begin repeat
+ * #lname = bool, ubyte, ushort#
+ * #name = Bool,UByte, UShort#
+ */
+static npy_hash_t
+@lname@_arrtype_hash(PyObject *obj)
+{
+ return (npy_hash_t)(((Py@name@ScalarObject *)obj)->obval);
+}
+/**end repeat**/
+
+/**begin repeat
+ * #lname = byte, short, uint#
+ * #name = Byte, Short, UInt#
+ */
+static npy_hash_t
+@lname@_arrtype_hash(PyObject *obj)
+{
+ npy_hash_t x = (npy_hash_t)(((Py@name@ScalarObject *)obj)->obval);
+ if (x == -1) {
+ x = -2;
+ }
+ return x;
+}
+/**end repeat**/
+
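+/*
+ * ulong values may not fit in a signed npy_hash_t on all platforms, so
+ * hash via a temporary Python int to stay consistent with Python's own
+ * integer hashing.
+ */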
+static npy_hash_t
+ulong_arrtype_hash(PyObject *obj)
+{
+ PyObject * l = PyLong_FromUnsignedLong(((PyULongScalarObject*)obj)->obval);
+ npy_hash_t x = PyObject_Hash(l);
+ Py_DECREF(l);
+ return x;
+}
+
+#if (NPY_SIZEOF_INT != NPY_SIZEOF_LONG) || defined(NPY_PY3K)
+static npy_hash_t
+int_arrtype_hash(PyObject *obj)
+{
+ npy_hash_t x = (npy_hash_t)(((PyIntScalarObject *)obj)->obval);
+ if (x == -1) {
+ x = -2;
+ }
+ return x;
+}
+#endif
+
+#if defined(NPY_PY3K)
+static npy_hash_t
+long_arrtype_hash(PyObject *obj)
+{
+ PyObject * l = PyLong_FromLong(((PyLongScalarObject*)obj)->obval);
+ npy_hash_t x = PyObject_Hash(l);
+ Py_DECREF(l);
+ return x;
+}
+#endif
+
+/**begin repeat
+ * #char = ,u#
+ * #Char = ,U#
+ * #Word = ,Unsigned#
+ */
+static NPY_INLINE npy_hash_t
+@char@longlong_arrtype_hash(PyObject *obj)
+{
+ PyObject * l = PyLong_From@Word@LongLong(
+ ((Py@Char@LongLongScalarObject*)obj)->obval);
+ npy_hash_t x = PyObject_Hash(l);
+ Py_DECREF(l);
+ return x;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #lname = datetime, timedelta#
+ * #name = Datetime, Timedelta#
+ */
+#if NPY_SIZEOF_HASH_T==NPY_SIZEOF_DATETIME
+static npy_hash_t
+@lname@_arrtype_hash(PyObject *obj)
+{
+ npy_hash_t x = (npy_hash_t)(((Py@name@ScalarObject *)obj)->obval);
+ if (x == -1) {
+ x = -2;
+ }
+ return x;
+}
+#elif NPY_SIZEOF_LONGLONG==NPY_SIZEOF_DATETIME
+static npy_hash_t
+@lname@_arrtype_hash(PyObject *obj)
+{
+ npy_hash_t y;
+ npy_longlong x = (((Py@name@ScalarObject *)obj)->obval);
+
+ if ((x <= LONG_MAX)) {
+ y = (npy_hash_t) x;
+ }
+ else {
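+        /*
+         * Value exceeds LONG_MAX: fold the two long-sized halves into a
+         * single hash (same 1000003 multiplier as the complex hash below).
+         */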
+ union Mask {
+ long hashvals[2];
+ npy_longlong v;
+ } both;
+
+ both.v = x;
+ y = both.hashvals[0] + (1000003)*both.hashvals[1];
+ }
+ if (y == -1) {
+ y = -2;
+ }
+ return y;
+}
+#endif
+/**end repeat**/
+
+
+
+/* Casting to double is the wrong thing to do for longdouble (it can lose precision), but.... */
+
+/**begin repeat
+ * #lname = float, longdouble#
+ * #name = Float, LongDouble#
+ */
+static npy_hash_t
+@lname@_arrtype_hash(PyObject *obj)
+{
+ return _Py_HashDouble((double) ((Py@name@ScalarObject *)obj)->obval);
+}
+
+/* borrowed from complex_hash */
+static npy_hash_t
+c@lname@_arrtype_hash(PyObject *obj)
+{
+ npy_hash_t hashreal, hashimag, combined;
+ hashreal = _Py_HashDouble((double)
+ (((PyC@name@ScalarObject *)obj)->obval).real);
+
+ if (hashreal == -1) {
+ return -1;
+ }
+ hashimag = _Py_HashDouble((double)
+ (((PyC@name@ScalarObject *)obj)->obval).imag);
+ if (hashimag == -1) {
+ return -1;
+ }
+ combined = hashreal + 1000003 * hashimag;
+ if (combined == -1) {
+ combined = -2;
+ }
+ return combined;
+}
+/**end repeat**/
+
+static npy_hash_t
+half_arrtype_hash(PyObject *obj)
+{
+ return _Py_HashDouble(npy_half_to_double(((PyHalfScalarObject *)obj)->obval));
+}
+
+static npy_hash_t
+object_arrtype_hash(PyObject *obj)
+{
+ return PyObject_Hash(((PyObjectScalarObject *)obj)->obval);
+}
+
+/*
+ * We used to hash just the pointer; now use the tuple-hash algorithm,
+ * calling voidtype_item to obtain each element object.
+ */
+static npy_hash_t
+void_arrtype_hash(PyObject *obj)
+{
+ npy_hash_t x, y;
+ Py_ssize_t len, n;
+ PyVoidScalarObject *p;
+ PyObject *element;
+ npy_hash_t mult = 1000003L;
+ x = 0x345678L;
+ p = (PyVoidScalarObject *)obj;
+ /* Cannot hash mutable void scalars */
+ if (p->flags & NPY_ARRAY_WRITEABLE) {
+ PyErr_SetString(PyExc_TypeError, "unhashable type: 'writeable void-scalar'");
+ return -1;
+ }
+ len = voidtype_length(p);
+ for (n=0; n < len; n++) {
+ element = voidtype_item(p, n);
+ y = PyObject_Hash(element);
+ Py_DECREF(element);
+ if (y == -1)
+ return -1;
+ x = (x ^ y) * mult;
+ mult += (npy_hash_t)(82520L + len + len);
+ }
+ x += 97531L;
+ if (x == -1)
+ x = -2;
+ return x;
+}
+
+/* object arrtype getattro and setattro */
+static PyObject *
+object_arrtype_getattro(PyObjectScalarObject *obj, PyObject *attr) {
+ PyObject *res;
+
+ /* first look in object and then hand off to generic type */
+
+ res = PyObject_GenericGetAttr(obj->obval, attr);
+ if (res) {
+ return res;
+ }
+ PyErr_Clear();
+ return PyObject_GenericGetAttr((PyObject *)obj, attr);
+}
+
+static int
+object_arrtype_setattro(PyObjectScalarObject *obj, PyObject *attr, PyObject *val) {
+ int res;
+ /* first look in object and then hand off to generic type */
+
+ res = PyObject_GenericSetAttr(obj->obval, attr, val);
+ if (res >= 0) {
+ return res;
+ }
+ PyErr_Clear();
+ return PyObject_GenericSetAttr((PyObject *)obj, attr, val);
+}
+
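+/*
+ * The sequence, mapping, buffer and call slots below simply delegate to
+ * the Python object wrapped in obval.
+ */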
+static PyObject *
+object_arrtype_concat(PyObjectScalarObject *self, PyObject *other)
+{
+ return PySequence_Concat(self->obval, other);
+}
+
+static Py_ssize_t
+object_arrtype_length(PyObjectScalarObject *self)
+{
+ return PyObject_Length(self->obval);
+}
+
+static PyObject *
+object_arrtype_repeat(PyObjectScalarObject *self, Py_ssize_t count)
+{
+ return PySequence_Repeat(self->obval, count);
+}
+
+static PyObject *
+object_arrtype_subscript(PyObjectScalarObject *self, PyObject *key)
+{
+ return PyObject_GetItem(self->obval, key);
+}
+
+static int
+object_arrtype_ass_subscript(PyObjectScalarObject *self, PyObject *key,
+ PyObject *value)
+{
+ return PyObject_SetItem(self->obval, key, value);
+}
+
+static int
+object_arrtype_contains(PyObjectScalarObject *self, PyObject *ob)
+{
+ return PySequence_Contains(self->obval, ob);
+}
+
+static PyObject *
+object_arrtype_inplace_concat(PyObjectScalarObject *self, PyObject *o)
+{
+ return PySequence_InPlaceConcat(self->obval, o);
+}
+
+static PyObject *
+object_arrtype_inplace_repeat(PyObjectScalarObject *self, Py_ssize_t count)
+{
+ return PySequence_InPlaceRepeat(self->obval, count);
+}
+
+static PySequenceMethods object_arrtype_as_sequence = {
+ (lenfunc)object_arrtype_length, /*sq_length*/
+ (binaryfunc)object_arrtype_concat, /*sq_concat*/
+ (ssizeargfunc)object_arrtype_repeat, /*sq_repeat*/
+ 0, /*sq_item*/
+ 0, /*sq_slice*/
+ 0, /* sq_ass_item */
+ 0, /* sq_ass_slice */
+ (objobjproc)object_arrtype_contains, /* sq_contains */
+ (binaryfunc)object_arrtype_inplace_concat, /* sq_inplace_concat */
+ (ssizeargfunc)object_arrtype_inplace_repeat, /* sq_inplace_repeat */
+};
+
+static PyMappingMethods object_arrtype_as_mapping = {
+ (lenfunc)object_arrtype_length,
+ (binaryfunc)object_arrtype_subscript,
+ (objobjargproc)object_arrtype_ass_subscript,
+};
+
+#if !defined(NPY_PY3K)
+static Py_ssize_t
+object_arrtype_getsegcount(PyObjectScalarObject *self, Py_ssize_t *lenp)
+{
+ Py_ssize_t newlen;
+ int cnt;
+ PyBufferProcs *pb = Py_TYPE(self->obval)->tp_as_buffer;
+
+ if (pb == NULL ||
+ pb->bf_getsegcount == NULL ||
+ (cnt = (*pb->bf_getsegcount)(self->obval, &newlen)) != 1) {
+ return 0;
+ }
+ if (lenp) {
+ *lenp = newlen;
+ }
+ return cnt;
+}
+
+static Py_ssize_t
+object_arrtype_getreadbuf(PyObjectScalarObject *self, Py_ssize_t segment, void **ptrptr)
+{
+ PyBufferProcs *pb = Py_TYPE(self->obval)->tp_as_buffer;
+
+ if (pb == NULL ||
+ pb->bf_getreadbuffer == NULL ||
+ pb->bf_getsegcount == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "expected a readable buffer object");
+ return -1;
+ }
+ return (*pb->bf_getreadbuffer)(self->obval, segment, ptrptr);
+}
+
+static Py_ssize_t
+object_arrtype_getwritebuf(PyObjectScalarObject *self, Py_ssize_t segment, void **ptrptr)
+{
+ PyBufferProcs *pb = Py_TYPE(self->obval)->tp_as_buffer;
+
+ if (pb == NULL ||
+ pb->bf_getwritebuffer == NULL ||
+ pb->bf_getsegcount == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "expected a writeable buffer object");
+ return -1;
+ }
+ return (*pb->bf_getwritebuffer)(self->obval, segment, ptrptr);
+}
+
+static Py_ssize_t
+object_arrtype_getcharbuf(PyObjectScalarObject *self, Py_ssize_t segment,
+ constchar **ptrptr)
+{
+ PyBufferProcs *pb = Py_TYPE(self->obval)->tp_as_buffer;
+
+ if (pb == NULL ||
+ pb->bf_getcharbuffer == NULL ||
+ pb->bf_getsegcount == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "expected a character buffer object");
+ return -1;
+ }
+ return (*pb->bf_getcharbuffer)(self->obval, segment, ptrptr);
+}
+#endif
+
+static int
+object_arrtype_getbuffer(PyObjectScalarObject *self, Py_buffer *view, int flags)
+{
+ PyBufferProcs *pb = Py_TYPE(self->obval)->tp_as_buffer;
+ if (pb == NULL || pb->bf_getbuffer == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "expected a readable buffer object");
+ return -1;
+ }
+ return (*pb->bf_getbuffer)(self->obval, view, flags);
+}
+
+static void
+object_arrtype_releasebuffer(PyObjectScalarObject *self, Py_buffer *view)
+{
+ PyBufferProcs *pb = Py_TYPE(self->obval)->tp_as_buffer;
+ if (pb == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "expected a readable buffer object");
+ return;
+ }
+ if (pb->bf_releasebuffer != NULL) {
+ (*pb->bf_releasebuffer)(self->obval, view);
+ }
+}
+
+static PyBufferProcs object_arrtype_as_buffer = {
+#if !defined(NPY_PY3K)
+ (readbufferproc)object_arrtype_getreadbuf,
+ (writebufferproc)object_arrtype_getwritebuf,
+ (segcountproc)object_arrtype_getsegcount,
+ (charbufferproc)object_arrtype_getcharbuf,
+#endif
+ (getbufferproc)object_arrtype_getbuffer,
+ (releasebufferproc)object_arrtype_releasebuffer,
+};
+
+static PyObject *
+object_arrtype_call(PyObjectScalarObject *obj, PyObject *args, PyObject *kwds)
+{
+ return PyObject_Call(obj->obval, args, kwds);
+}
+
+NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(NULL, 0)
+#else
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+#endif
+ "numpy.object_", /* tp_name*/
+ sizeof(PyObjectScalarObject), /* tp_basicsize*/
+ 0, /* tp_itemsize */
+ (destructor)object_arrtype_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ &object_arrtype_as_sequence, /* tp_as_sequence */
+ &object_arrtype_as_mapping, /* tp_as_mapping */
+ 0, /* tp_hash */
+ (ternaryfunc)object_arrtype_call, /* tp_call */
+ 0, /* tp_str */
+ (getattrofunc)object_arrtype_getattro, /* tp_getattro */
+ (setattrofunc)object_arrtype_setattro, /* tp_setattro */
+ &object_arrtype_as_buffer, /* tp_as_buffer */
+ 0, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+
+static PyObject *
+gen_arrtype_subscript(PyObject *self, PyObject *key)
+{
+ /*
+ * Only [...], [...,<???>], [<???>, ...],
+ * is allowed for indexing a scalar
+ *
+ * These return a new N-d array with a copy of
+ * the data where N is the number of None's in <???>.
+ */
+ PyObject *res, *ret;
+
+ res = PyArray_FromScalar(self, NULL);
+
+ ret = array_subscript((PyArrayObject *)res, key);
+ Py_DECREF(res);
+ if (ret == NULL) {
+ PyErr_SetString(PyExc_IndexError,
+ "invalid index to scalar variable.");
+ }
+ return ret;
+}
+
+
+#define NAME_bool "bool"
+#define NAME_void "void"
+#if defined(NPY_PY3K)
+#define NAME_string "bytes"
+#define NAME_unicode "str"
+#else
+#define NAME_string "string"
+#define NAME_unicode "unicode"
+#endif
+
+/**begin repeat
+ * #name = bool, string, unicode, void#
+ * #NAME = Bool, String, Unicode, Void#
+ * #ex = _,_,_,#
+ */
+NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(NULL, 0)
+#else
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+#endif
+ "numpy." NAME_@name@ "@ex@", /* tp_name*/
+ sizeof(Py@NAME@ScalarObject), /* tp_basicsize*/
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ 0, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+/**end repeat**/
+
+#undef NAME_bool
+#undef NAME_void
+#undef NAME_string
+#undef NAME_unicode
+
+/**begin repeat
+ * #NAME = Byte, Short, Int, Long, LongLong, UByte, UShort, UInt, ULong,
+ * ULongLong, Half, Float, Double, LongDouble, Datetime, Timedelta#
+ * #name = int*5, uint*5, float*4, datetime, timedelta#
+ * #CNAME = (CHAR, SHORT, INT, LONG, LONGLONG)*2, HALF, FLOAT, DOUBLE,
+ * LONGDOUBLE, DATETIME, TIMEDELTA#
+ */
+#if NPY_BITSOF_@CNAME@ == 8
+#define _THIS_SIZE "8"
+#elif NPY_BITSOF_@CNAME@ == 16
+#define _THIS_SIZE "16"
+#elif NPY_BITSOF_@CNAME@ == 32
+#define _THIS_SIZE "32"
+#elif NPY_BITSOF_@CNAME@ == 64
+#define _THIS_SIZE "64"
+#elif NPY_BITSOF_@CNAME@ == 80
+#define _THIS_SIZE "80"
+#elif NPY_BITSOF_@CNAME@ == 96
+#define _THIS_SIZE "96"
+#elif NPY_BITSOF_@CNAME@ == 128
+#define _THIS_SIZE "128"
+#elif NPY_BITSOF_@CNAME@ == 256
+#define _THIS_SIZE "256"
+#endif
+NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(NULL, 0)
+#else
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+#endif
+ "numpy.@name@" _THIS_SIZE, /* tp_name*/
+ sizeof(Py@NAME@ScalarObject), /* tp_basicsize*/
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ 0, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+
+#undef _THIS_SIZE
+/**end repeat**/
+
+
+static PyMappingMethods gentype_as_mapping = {
+ NULL,
+ (binaryfunc)gen_arrtype_subscript,
+ NULL
+};
+
+
+/**begin repeat
+ * #NAME = CFloat, CDouble, CLongDouble#
+ * #name = complex*3#
+ * #CNAME = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+#if NPY_BITSOF_@CNAME@ == 16
+#define _THIS_SIZE "32"
+#elif NPY_BITSOF_@CNAME@ == 32
+#define _THIS_SIZE "64"
+#elif NPY_BITSOF_@CNAME@ == 64
+#define _THIS_SIZE "128"
+#elif NPY_BITSOF_@CNAME@ == 80
+#define _THIS_SIZE "160"
+#elif NPY_BITSOF_@CNAME@ == 96
+#define _THIS_SIZE "192"
+#elif NPY_BITSOF_@CNAME@ == 128
+#define _THIS_SIZE "256"
+#elif NPY_BITSOF_@CNAME@ == 256
+#define _THIS_SIZE "512"
+#endif
+
+NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(0, 0)
+#else
+ PyObject_HEAD_INIT(0)
+ 0, /* ob_size */
+#endif
+ "numpy.@name@" _THIS_SIZE, /* tp_name*/
+ sizeof(Py@NAME@ScalarObject), /* tp_basicsize*/
+ 0, /* tp_itemsize*/
+ 0, /* tp_dealloc*/
+ 0, /* tp_print*/
+ 0, /* tp_getattr*/
+ 0, /* tp_setattr*/
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ 0, /* tp_repr*/
+ 0, /* tp_as_number*/
+ 0, /* tp_as_sequence*/
+ 0, /* tp_as_mapping*/
+ 0, /* tp_hash */
+ 0, /* tp_call*/
+ 0, /* tp_str*/
+ 0, /* tp_getattro*/
+ 0, /* tp_setattro*/
+ 0, /* tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /* tp_flags*/
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+#undef _THIS_SIZE
+
+/**end repeat**/
+
+/*
+ * This table maps the built-in type numbers to their scalar
+ * type numbers. Note that signed integers are mapped to INTNEG_SCALAR,
+ * which is different from what PyArray_ScalarKind returns.
+ */
+NPY_NO_EXPORT signed char
+_npy_scalar_kinds_table[NPY_NTYPES];
+
+/*
+ * This table maps a scalar kind (excluding NPY_NOSCALAR)
+ * to the smallest type number of that kind.
+ */
+NPY_NO_EXPORT signed char
+_npy_smallest_type_of_kind_table[NPY_NSCALARKINDS];
+
+/*
+ * This table gives the type of the same kind, but next in the sequence
+ * of sizes.
+ */
+NPY_NO_EXPORT signed char
+_npy_next_larger_type_table[NPY_NTYPES];
+
+/*
+ * This table describes safe casting for small type numbers,
+ * and is used by PyArray_CanCastSafely.
+ */
+NPY_NO_EXPORT unsigned char
+_npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES];
+
+/*
+ * This table gives the smallest-size and smallest-kind type to which
+ * the input types may be safely cast, according to _npy_can_cast_safely.
+ */
+NPY_NO_EXPORT signed char
+_npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES];
+
+NPY_NO_EXPORT void
+initialize_casting_tables(void)
+{
+ int i, j;
+
+ _npy_smallest_type_of_kind_table[NPY_BOOL_SCALAR] = NPY_BOOL;
+ _npy_smallest_type_of_kind_table[NPY_INTPOS_SCALAR] = NPY_UBYTE;
+ _npy_smallest_type_of_kind_table[NPY_INTNEG_SCALAR] = NPY_BYTE;
+ _npy_smallest_type_of_kind_table[NPY_FLOAT_SCALAR] = NPY_HALF;
+ _npy_smallest_type_of_kind_table[NPY_COMPLEX_SCALAR] = NPY_CFLOAT;
+ _npy_smallest_type_of_kind_table[NPY_OBJECT_SCALAR] = NPY_OBJECT;
+
+ /* Default for built-in types is object scalar */
+ memset(_npy_scalar_kinds_table, NPY_OBJECT_SCALAR,
+ sizeof(_npy_scalar_kinds_table));
+    /* Default for the next-larger type is -1, signalling that there is no larger type */
+ memset(_npy_next_larger_type_table, -1,
+ sizeof(_npy_next_larger_type_table));
+
+ /* Compile-time loop of scalar kinds */
+
+ /**begin repeat
+ * #NAME = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #BIGGERTYPE = -1,
+ * NPY_SHORT, NPY_USHORT, NPY_INT, NPY_UINT, NPY_LONG, NPY_ULONG,
+ * NPY_LONGLONG, NPY_ULONGLONG, -1, -1,
+ * NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE, -1,
+ * NPY_CDOUBLE, NPY_CLONGDOUBLE, -1#
+ * #SCKIND = BOOL,
+ * (INTNEG, INTPOS)*5,
+ * FLOAT*4,
+ * COMPLEX*3#
+ */
+
+ _npy_scalar_kinds_table[NPY_@NAME@] = NPY_@SCKIND@_SCALAR;
+ _npy_next_larger_type_table[NPY_@NAME@] = @BIGGERTYPE@;
+
+ /**end repeat**/
+
+ memset(_npy_can_cast_safely_table, 0, sizeof(_npy_can_cast_safely_table));
+
+ for (i = 0; i < NPY_NTYPES; ++i) {
+ /* Identity */
+ _npy_can_cast_safely_table[i][i] = 1;
+ if (i != NPY_DATETIME) {
+ /*
+ * Bool -> <Anything> except datetime (since
+ * it conceptually has no zero)
+ */
+ _npy_can_cast_safely_table[NPY_BOOL][i] = 1;
+ }
+ /* <Anything> -> Object */
+ _npy_can_cast_safely_table[i][NPY_OBJECT] = 1;
+ /* <Anything> -> Void */
+ _npy_can_cast_safely_table[i][NPY_VOID] = 1;
+ }
+
+ _npy_can_cast_safely_table[NPY_STRING][NPY_UNICODE] = 1;
+ _npy_can_cast_safely_table[NPY_BOOL][NPY_TIMEDELTA] = 1;
+
+#ifndef NPY_SIZEOF_BYTE
+#define NPY_SIZEOF_BYTE 1
+#endif
+
+ /* Compile-time loop of casting rules */
+
+ /**begin repeat
+ * #FROM_NAME = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #FROM_BASENAME = BYTE, BYTE, SHORT, SHORT, INT, INT,
+ * LONG, LONG, LONGLONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #from_isint = 1, 0, 1, 0, 1, 0, 1, 0,
+ * 1, 0, 0, 0, 0, 0,
+ * 0, 0, 0#
+ * #from_isuint = 0, 1, 0, 1, 0, 1, 0, 1,
+ * 0, 1, 0, 0, 0, 0,
+ * 0, 0, 0#
+ * #from_isfloat = 0, 0, 0, 0, 0, 0, 0, 0,
+ * 0, 0, 1, 1, 1, 1,
+ * 0, 0, 0#
+ * #from_iscomplex = 0, 0, 0, 0, 0, 0, 0, 0,
+ * 0, 0, 0, 0, 0, 0,
+ * 1, 1, 1#
+ */
+
+#define _FROM_BSIZE NPY_SIZEOF_@FROM_BASENAME@
+#define _FROM_NUM (NPY_@FROM_NAME@)
+
+ _npy_can_cast_safely_table[_FROM_NUM][NPY_STRING] = 1;
+ _npy_can_cast_safely_table[_FROM_NUM][NPY_UNICODE] = 1;
+
+ /* Allow casts from any integer to the TIMEDELTA type */
+#if @from_isint@ || @from_isuint@
+ _npy_can_cast_safely_table[_FROM_NUM][NPY_TIMEDELTA] = 1;
+#endif
+
+ /**begin repeat1
+ * #TO_NAME = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #TO_BASENAME = BYTE, BYTE, SHORT, SHORT, INT, INT,
+ * LONG, LONG, LONGLONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #to_isint = 1, 0, 1, 0, 1, 0, 1, 0,
+ * 1, 0, 0, 0, 0, 0,
+ * 0, 0, 0#
+ * #to_isuint = 0, 1, 0, 1, 0, 1, 0, 1,
+ * 0, 1, 0, 0, 0, 0,
+ * 0, 0, 0#
+ * #to_isfloat = 0, 0, 0, 0, 0, 0, 0, 0,
+ * 0, 0, 1, 1, 1, 1,
+ * 0, 0, 0#
+ * #to_iscomplex = 0, 0, 0, 0, 0, 0, 0, 0,
+ * 0, 0, 0, 0, 0, 0,
+ * 1, 1, 1#
+ */
+#define _TO_BSIZE NPY_SIZEOF_@TO_BASENAME@
+#define _TO_NUM (NPY_@TO_NAME@)
+
+ /*
+ * NOTE: _FROM_BSIZE and _TO_BSIZE are the sizes of the "base type"
+ * which is the same as the size of the type except for
+ * complex, where it is the size of the real type.
+ */
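+    /*
+     * Example, assuming 4-byte int/float and 8-byte double: int -> double
+     * is safe (_FROM_BSIZE 4 < 8 and _TO_BSIZE 8 > 4), while int -> float
+     * is not, since _TO_BSIZE 4 is not greater than 4.
+     */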
+
+#if @from_isint@
+
+# if @to_isint@ && (_TO_BSIZE >= _FROM_BSIZE)
+ /* int -> int */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_isfloat@ && (_FROM_BSIZE < 8) && (_TO_BSIZE > _FROM_BSIZE)
+ /* int -> float */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_isfloat@ && (_FROM_BSIZE >= 8) && (_TO_BSIZE >= _FROM_BSIZE)
+ /* int -> float */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_iscomplex@ && (_FROM_BSIZE < 8) && (_TO_BSIZE > _FROM_BSIZE)
+ /* int -> complex */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_iscomplex@ && (_FROM_BSIZE >= 8) && (_TO_BSIZE >= _FROM_BSIZE)
+ /* int -> complex */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# endif
+
+#elif @from_isuint@
+
+# if @to_isint@ && (_TO_BSIZE > _FROM_BSIZE)
+ /* uint -> int */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_isuint@ && (_TO_BSIZE >= _FROM_BSIZE)
+ /* uint -> uint */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_isfloat@ && (_FROM_BSIZE < 8) && (_TO_BSIZE > _FROM_BSIZE)
+ /* uint -> float */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_isfloat@ && (_FROM_BSIZE >= 8) && (_TO_BSIZE >= _FROM_BSIZE)
+ /* uint -> float */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_iscomplex@ && (_FROM_BSIZE < 8) && (_TO_BSIZE > _FROM_BSIZE)
+ /* uint -> complex */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_iscomplex@ && (_FROM_BSIZE >= 8) && (_TO_BSIZE >= _FROM_BSIZE)
+ /* uint -> complex */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# endif
+
+
+#elif @from_isfloat@
+
+# if @to_isfloat@ && (_TO_BSIZE >= _FROM_BSIZE)
+ /* float -> float */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# elif @to_iscomplex@ && (_TO_BSIZE >= _FROM_BSIZE)
+ /* float -> complex */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# endif
+
+#elif @from_iscomplex@
+
+# if @to_iscomplex@ && (_TO_BSIZE >= _FROM_BSIZE)
+ /* complex -> complex */
+ _npy_can_cast_safely_table[_FROM_NUM][_TO_NUM] = 1;
+# endif
+
+#endif
+
+#undef _TO_NUM
+#undef _TO_BSIZE
+
+/**end repeat1**/
+
+#undef _FROM_NUM
+#undef _FROM_BSIZE
+
+/**end repeat**/
+
+ /*
+ * Now that the _can_cast_safely table is finished, we can
+ * use it to build the _type_promotion table
+ */
+ for (i = 0; i < NPY_NTYPES; ++i) {
+ _npy_type_promotion_table[i][i] = i;
+ /* Don't let number promote to string/unicode/void/datetime/timedelta */
+ if (i == NPY_STRING || i == NPY_UNICODE || i == NPY_VOID ||
+ i == NPY_DATETIME || i == NPY_TIMEDELTA) {
+ /* Promoting these types requires examining their contents */
+ _npy_type_promotion_table[i][i] = -1;
+ for (j = i + 1; j < NPY_NTYPES; ++j) {
+ _npy_type_promotion_table[i][j] = -1;
+ _npy_type_promotion_table[j][i] = -1;
+ }
+ /* Except they can convert to OBJECT */
+ _npy_type_promotion_table[i][NPY_OBJECT] = NPY_OBJECT;
+ _npy_type_promotion_table[NPY_OBJECT][i] = NPY_OBJECT;
+ }
+ else {
+ for (j = i + 1; j < NPY_NTYPES; ++j) {
+ /* Don't let number promote to string/unicode/void */
+ if (j == NPY_STRING || j == NPY_UNICODE || j == NPY_VOID) {
+ _npy_type_promotion_table[i][j] = -1;
+ _npy_type_promotion_table[j][i] = -1;
+ }
+ else if (_npy_can_cast_safely_table[i][j]) {
+ _npy_type_promotion_table[i][j] = j;
+ _npy_type_promotion_table[j][i] = j;
+ }
+ else if (_npy_can_cast_safely_table[j][i]) {
+ _npy_type_promotion_table[i][j] = i;
+ _npy_type_promotion_table[j][i] = i;
+ }
+ else {
+ int k, iskind, jskind, skind;
+ iskind = _npy_scalar_kinds_table[i];
+ jskind = _npy_scalar_kinds_table[j];
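+                    /*
+                     * Example: short vs ushort. Neither casts safely to
+                     * the other, so the loop below walks up from the
+                     * larger kind (signed ints) until it finds a type
+                     * both can cast to safely (NPY_INT on typical
+                     * platforms).
+                     */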
+ /* If there's no kind (void/string/etc) */
+ if (iskind == NPY_NOSCALAR || jskind == NPY_NOSCALAR) {
+ k = -1;
+ }
+ else {
+ /* Start with the type of larger kind */
+ if (iskind > jskind) {
+ skind = iskind;
+ k = i;
+ }
+ else {
+ skind = jskind;
+ k = j;
+ }
+ for (;;) {
+ /* Try the next larger type of this kind */
+ k = _npy_next_larger_type_table[k];
+
+ /* If there is no larger, try a larger kind */
+ if (k < 0) {
+ ++skind;
+ /* Use -1 to signal no promoted type found */
+ if (skind < NPY_NSCALARKINDS) {
+ k = _npy_smallest_type_of_kind_table[skind];
+ }
+ else {
+ k = -1;
+ break;
+ }
+ }
+
+ if (_npy_can_cast_safely_table[i][k] &&
+ _npy_can_cast_safely_table[j][k]) {
+ break;
+ }
+ }
+ }
+ _npy_type_promotion_table[i][j] = k;
+ _npy_type_promotion_table[j][i] = k;
+ }
+ }
+ }
+ }
+}
+
+#ifndef NPY_PY3K
+/*
+ * In python2, the `float` and `complex` types still implement the obsolete
+ * "tp_print" method, which uses CPython's float-printing routines to print the
+ * float. Numpy's float_/cfloat inherit from Python float/complex, but
+ * override their tp_repr and tp_str methods. In order to avoid an inconsistency
+ * with the inherited tp_print, we need to override it too.
+ *
+ * In python3 the tp_print method is reserved/unused.
+ */
+static int
+doubletype_print(PyObject *o, FILE *fp, int flags)
+{
+ int ret;
+ PyObject *to_print;
+ if (flags & Py_PRINT_RAW) {
+ to_print = PyObject_Str(o);
+ }
+ else {
+ to_print = PyObject_Repr(o);
+ }
+
+ if (to_print == NULL) {
+ return -1;
+ }
+
+ ret = PyObject_Print(to_print, fp, Py_PRINT_RAW);
+ Py_DECREF(to_print);
+ return ret;
+}
+#endif
+
+static PyNumberMethods longdoubletype_as_number;
+static PyNumberMethods clongdoubletype_as_number;
+static void init_basetypes(void);
+
+
+NPY_NO_EXPORT void
+initialize_numeric_types(void)
+{
+ init_basetypes();
+ PyGenericArrType_Type.tp_dealloc = (destructor)gentype_dealloc;
+ PyGenericArrType_Type.tp_as_number = &gentype_as_number;
+ PyGenericArrType_Type.tp_as_buffer = &gentype_as_buffer;
+ PyGenericArrType_Type.tp_as_mapping = &gentype_as_mapping;
+ PyGenericArrType_Type.tp_flags = BASEFLAGS;
+ PyGenericArrType_Type.tp_methods = gentype_methods;
+ PyGenericArrType_Type.tp_getset = gentype_getsets;
+ PyGenericArrType_Type.tp_new = NULL;
+ PyGenericArrType_Type.tp_alloc = gentype_alloc;
+ PyGenericArrType_Type.tp_free = (freefunc)gentype_free;
+ PyGenericArrType_Type.tp_richcompare = gentype_richcompare;
+
+ PyBoolArrType_Type.tp_as_number = &bool_arrtype_as_number;
+ /*
+     * Need to add dummy versions with filled-in nb_index in order for
+     * PyType_Ready to fill in the .__index__() method; also fill the
+     * array_type_as_number struct with reasonable defaults.
+ */
+
+ /**begin repeat
+ * #name = byte, short, int, long, longlong, ubyte, ushort,
+ * uint, ulong, ulonglong#
+ * #NAME = Byte, Short, Int, Long, LongLong, UByte, UShort,
+ * UInt, ULong, ULongLong#
+ */
+ @name@_arrtype_as_number = gentype_as_number;
+ Py@NAME@ArrType_Type.tp_as_number = &@name@_arrtype_as_number;
+ Py@NAME@ArrType_Type.tp_as_number->nb_index = (unaryfunc)@name@_index;
+
+ /**end repeat**/
+
+ /**begin repeat
+ * #name = half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #NAME = Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ */
+ @name@_arrtype_as_number = gentype_as_number;
+ Py@NAME@ArrType_Type.tp_as_number = &@name@_arrtype_as_number;
+
+ /**end repeat**/
+
+#ifndef NPY_PY3K
+ PyDoubleArrType_Type.tp_print = &doubletype_print;
+ PyCDoubleArrType_Type.tp_print = &doubletype_print;
+#endif
+
+
+ PyBoolArrType_Type.tp_as_number->nb_index = (unaryfunc)bool_index;
+
+ PyStringArrType_Type.tp_alloc = NULL;
+ PyStringArrType_Type.tp_free = NULL;
+
+ PyStringArrType_Type.tp_repr = stringtype_repr;
+ PyStringArrType_Type.tp_str = stringtype_str;
+
+ PyUnicodeArrType_Type.tp_repr = unicodetype_repr;
+ PyUnicodeArrType_Type.tp_str = unicodetype_str;
+
+ PyVoidArrType_Type.tp_methods = voidtype_methods;
+ PyVoidArrType_Type.tp_getset = voidtype_getsets;
+ PyVoidArrType_Type.tp_as_mapping = &voidtype_as_mapping;
+ PyVoidArrType_Type.tp_as_sequence = &voidtype_as_sequence;
+ PyVoidArrType_Type.tp_repr = voidtype_repr;
+ PyVoidArrType_Type.tp_str = voidtype_str;
+
+ PyIntegerArrType_Type.tp_getset = inttype_getsets;
+
+ /**begin repeat
+ * #NAME= Number, Integer, SignedInteger, UnsignedInteger, Inexact,
+ * Floating, ComplexFloating, Flexible, Character#
+ */
+
+ Py@NAME@ArrType_Type.tp_flags = BASEFLAGS;
+
+ /**end repeat**/
+
+ /**begin repeat
+ * #name = bool, byte, short, int, long, longlong, ubyte, ushort, uint,
+ * ulong, ulonglong, half, float, double, longdouble, cfloat,
+ * cdouble, clongdouble, string, unicode, void, object, datetime,
+ * timedelta#
+ * #NAME = Bool, Byte, Short, Int, Long, LongLong, UByte, UShort, UInt,
+ * ULong, ULongLong, Half, Float, Double, LongDouble, CFloat,
+ * CDouble, CLongDouble, String, Unicode, Void, Object, Datetime,
+ * Timedelta#
+ */
+
+ Py@NAME@ArrType_Type.tp_flags = BASEFLAGS;
+ Py@NAME@ArrType_Type.tp_new = @name@_arrtype_new;
+ Py@NAME@ArrType_Type.tp_richcompare = gentype_richcompare;
+
+ /**end repeat**/
+
+ /**begin repeat
+ * #name = bool, byte, short, ubyte, ushort, uint, ulong, ulonglong,
+ * half, float, longdouble, cfloat, clongdouble, void, object,
+ * datetime, timedelta#
+ * #NAME = Bool, Byte, Short, UByte, UShort, UInt, ULong, ULongLong,
+ * Half, Float, LongDouble, CFloat, CLongDouble, Void, Object,
+ * Datetime, Timedelta#
+ */
+
+ Py@NAME@ArrType_Type.tp_hash = @name@_arrtype_hash;
+
+ /**end repeat**/
+
+ /**begin repeat
+ * #name = cfloat, clongdouble#
+ * #NAME = CFloat, CLongDouble#
+ */
+
+ Py@NAME@ArrType_Type.tp_methods = @name@type_methods;
+
+ /**end repeat**/
+
+#if (NPY_SIZEOF_INT != NPY_SIZEOF_LONG) || defined(NPY_PY3K)
+ /* We won't be inheriting from Python Int type. */
+ PyIntArrType_Type.tp_hash = int_arrtype_hash;
+#endif
+
+#if defined(NPY_PY3K)
+ /* We won't be inheriting from Python Int type. */
+ PyLongArrType_Type.tp_hash = long_arrtype_hash;
+#endif
+
+#if (NPY_SIZEOF_LONG != NPY_SIZEOF_LONGLONG) || defined(NPY_PY3K)
+ /* We won't be inheriting from Python Int type. */
+ PyLongLongArrType_Type.tp_hash = longlong_arrtype_hash;
+#endif
+
+ /**begin repeat
+ * #name = repr, str#
+ */
+
+ PyHalfArrType_Type.tp_@name@ = halftype_@name@;
+
+ PyFloatArrType_Type.tp_@name@ = floattype_@name@;
+ PyCFloatArrType_Type.tp_@name@ = cfloattype_@name@;
+
+ PyDoubleArrType_Type.tp_@name@ = doubletype_@name@;
+ PyCDoubleArrType_Type.tp_@name@ = cdoubletype_@name@;
+
+ PyDatetimeArrType_Type.tp_@name@ = datetimetype_@name@;
+ PyTimedeltaArrType_Type.tp_@name@ = timedeltatype_@name@;
+
+ /**end repeat**/
+
+
+ /**begin repeat
+ * #Type = Bool, Byte, UByte, Short, UShort, Int, UInt, Long,
+ * ULong, LongLong, ULongLong#
+ */
+
+ /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+ Py@Type@ArrType_Type.tp_str = genint_type_str;
+ Py@Type@ArrType_Type.tp_repr = genint_type_str;
+
+ /**end repeat**/
+
+
+
+ /**begin repeat
+ * #char = ,c#
+ * #CHAR = ,C#
+ */
+
+ /*
+ * These need to be coded specially because longdouble/clongdouble getitem
+ * does not return a normal Python type
+ */
+ @char@longdoubletype_as_number.nb_float = @char@longdoubletype_float;
+#if defined(NPY_PY3K)
+ @char@longdoubletype_as_number.nb_int = @char@longdoubletype_long;
+#else
+ @char@longdoubletype_as_number.nb_int = @char@longdoubletype_int;
+ @char@longdoubletype_as_number.nb_long = @char@longdoubletype_long;
+ @char@longdoubletype_as_number.nb_hex = @char@longdoubletype_hex;
+ @char@longdoubletype_as_number.nb_oct = @char@longdoubletype_oct;
+#endif
+
+ Py@CHAR@LongDoubleArrType_Type.tp_as_number = &@char@longdoubletype_as_number;
+ Py@CHAR@LongDoubleArrType_Type.tp_repr = @char@longdoubletype_repr;
+ Py@CHAR@LongDoubleArrType_Type.tp_str = @char@longdoubletype_str;
+
+ /**end repeat**/
+
+ PyStringArrType_Type.tp_itemsize = sizeof(char);
+ PyVoidArrType_Type.tp_dealloc = (destructor) void_dealloc;
+
+ PyArrayIter_Type.tp_iter = PyObject_SelfIter;
+ PyArrayMapIter_Type.tp_iter = PyObject_SelfIter;
+}
+
+typedef struct {
+ PyTypeObject * type;
+ int typenum;
+} scalar_type;
+
+static scalar_type typeobjects[] = {
+ {&PyBoolArrType_Type, NPY_BOOL},
+ {&PyByteArrType_Type, NPY_BYTE},
+ {&PyUByteArrType_Type, NPY_UBYTE},
+ {&PyShortArrType_Type, NPY_SHORT},
+ {&PyUShortArrType_Type, NPY_USHORT},
+ {&PyIntArrType_Type, NPY_INT},
+ {&PyUIntArrType_Type, NPY_UINT},
+ {&PyLongArrType_Type, NPY_LONG},
+ {&PyULongArrType_Type, NPY_ULONG},
+ {&PyLongLongArrType_Type, NPY_LONGLONG},
+ {&PyULongLongArrType_Type, NPY_ULONGLONG},
+ {&PyFloatArrType_Type, NPY_FLOAT},
+ {&PyDoubleArrType_Type, NPY_DOUBLE},
+ {&PyLongDoubleArrType_Type, NPY_LONGDOUBLE},
+ {&PyCFloatArrType_Type, NPY_CFLOAT},
+ {&PyCDoubleArrType_Type, NPY_CDOUBLE},
+ {&PyCLongDoubleArrType_Type, NPY_CLONGDOUBLE},
+ {&PyObjectArrType_Type, NPY_OBJECT},
+ {&PyStringArrType_Type, NPY_STRING},
+ {&PyUnicodeArrType_Type, NPY_UNICODE},
+ {&PyVoidArrType_Type, NPY_VOID},
+ {&PyDatetimeArrType_Type, NPY_DATETIME},
+ {&PyTimedeltaArrType_Type, NPY_TIMEDELTA},
+ {&PyHalfArrType_Type, NPY_HALF}
+};
+
+static int compare_types(const void * a_, const void * b_)
+{
+ const PyTypeObject * a = ((const scalar_type *)a_)->type;
+ const PyTypeObject * b = ((const scalar_type *)b_)->type;
+ if (a < b) {
+ return -1;
+ }
+ else if (a > b) {
+ return 1;
+ }
+ return 0;
+}
+
+static void init_basetypes(void)
+{
+ qsort(typeobjects, sizeof(typeobjects) / sizeof(typeobjects[0]),
+ sizeof(typeobjects[0]),
+ compare_types);
+}
+
+
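+/*
+ * Binary search by type-object address in the typeobjects table above.
+ * This relies on init_basetypes() having sorted the table with
+ * compare_types(), which orders entries by pointer value.
+ */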
+NPY_NO_EXPORT int
+get_typeobj_idx(PyTypeObject * obj)
+{
+ npy_intp imin = 0, imax = sizeof(typeobjects) / sizeof(typeobjects[0]) - 1;
+ while (imax >= imin)
+ {
+ npy_intp imid = ((imax - imin) / 2) + imin;
+ if(typeobjects[imid].type == obj) {
+ return imid;
+ }
+ else if (typeobjects[imid].type < obj) {
+ imin = imid + 1;
+ }
+ else {
+ imax = imid - 1;
+ }
+ }
+
+ return -1;
+}
+
+NPY_NO_EXPORT int
+is_anyscalar_exact(PyObject *obj)
+{
+ return get_typeobj_idx(Py_TYPE(obj)) >= 0;
+}
+
+NPY_NO_EXPORT int
+_typenum_fromtypeobj(PyObject *type, int user)
+{
+ int typenum, i;
+
+ typenum = NPY_NOTYPE;
+ i = get_typeobj_idx((PyTypeObject*)type);
+ if (i >= 0) {
+ typenum = typeobjects[i].typenum;
+ }
+
+ if (!user) {
+ return typenum;
+ }
+ /* Search any registered types */
+ i = 0;
+ while (i < NPY_NUMUSERTYPES) {
+ if (type == (PyObject *)(userdescrs[i]->typeobj)) {
+ typenum = i + NPY_USERDEF;
+ break;
+ }
+ i++;
+ }
+ return typenum;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/npymath/ieee754.c.src b/contrib/python/numpy/py2/numpy/core/src/npymath/ieee754.c.src
new file mode 100644
index 00000000000..d960838c8fa
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npymath/ieee754.c.src
@@ -0,0 +1,841 @@
+/* -*- c -*- */
+/*
+ * vim:syntax=c
+ *
+ * Low-level routines related to IEEE-754 format
+ */
+#include "npy_math_common.h"
+#include "npy_math_private.h"
+#include "numpy/utils.h"
+
+#ifndef HAVE_COPYSIGN
+double npy_copysign(double x, double y)
+{
+ npy_uint32 hx, hy;
+ GET_HIGH_WORD(hx, x);
+ GET_HIGH_WORD(hy, y);
+ SET_HIGH_WORD(x, (hx & 0x7fffffff) | (hy & 0x80000000));
+ return x;
+}
+#endif
+
+/*
+ * The code below is for compilers that do not yet provide C11
+ * compatibility (gcc 4.5 and older).
+ */
+#ifndef LDBL_TRUE_MIN
+#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+#if !defined(HAVE_DECL_SIGNBIT)
+#include "_signbit.c"
+
+int _npy_signbit_f(float x)
+{
+ return _npy_signbit_d((double) x);
+}
+
+int _npy_signbit_ld(long double x)
+{
+ return _npy_signbit_d((double) x);
+}
+#endif
+
+/*
+ * FIXME: There is a lot of redundancy between _next* and npy_nextafter*.
+ * refactor this at some point
+ *
+ * p >= 0, return x + nulp
+ * p < 0, return x - nulp
+ */
+static double _next(double x, int p)
+{
+ volatile double t;
+ npy_int32 hx, hy, ix;
+ npy_uint32 lx;
+
+ EXTRACT_WORDS(hx, lx, x);
+ ix = hx & 0x7fffffff; /* |x| */
+
+ if (((ix >= 0x7ff00000) && ((ix - 0x7ff00000) | lx) != 0)) /* x is nan */
+ return x;
+ if ((ix | lx) == 0) { /* x == 0 */
+ if (p >= 0) {
+ INSERT_WORDS(x, 0x0, 1); /* return +minsubnormal */
+ } else {
+ INSERT_WORDS(x, 0x80000000, 1); /* return -minsubnormal */
+ }
+ t = x * x;
+ if (t == x)
+ return t;
+ else
+ return x; /* raise underflow flag */
+ }
+ if (p < 0) { /* x -= ulp */
+ if (lx == 0)
+ hx -= 1;
+ lx -= 1;
+ } else { /* x += ulp */
+ lx += 1;
+ if (lx == 0)
+ hx += 1;
+ }
+ hy = hx & 0x7ff00000;
+ if (hy >= 0x7ff00000)
+ return x + x; /* overflow */
+ if (hy < 0x00100000) { /* underflow */
+ t = x * x;
+ if (t != x) { /* raise underflow flag */
+ INSERT_WORDS(x, hx, lx);
+ return x;
+ }
+ }
+ INSERT_WORDS(x, hx, lx);
+ return x;
+}
+
+static float _nextf(float x, int p)
+{
+ volatile float t;
+ npy_int32 hx, hy, ix;
+
+ GET_FLOAT_WORD(hx, x);
+ ix = hx & 0x7fffffff; /* |x| */
+
+ if ((ix > 0x7f800000)) /* x is nan */
+ return x;
+ if (ix == 0) { /* x == 0 */
+ if (p >= 0) {
+ SET_FLOAT_WORD(x, 0x0 | 1); /* return +minsubnormal */
+ } else {
+ SET_FLOAT_WORD(x, 0x80000000 | 1); /* return -minsubnormal */
+ }
+ t = x * x;
+ if (t == x)
+ return t;
+ else
+ return x; /* raise underflow flag */
+ }
+ if (p < 0) { /* x -= ulp */
+ hx -= 1;
+ } else { /* x += ulp */
+ hx += 1;
+ }
+ hy = hx & 0x7f800000;
+ if (hy >= 0x7f800000)
+ return x + x; /* overflow */
+ if (hy < 0x00800000) { /* underflow */
+ t = x * x;
+ if (t != x) { /* raise underflow flag */
+ SET_FLOAT_WORD(x, hx);
+ return x;
+ }
+ }
+ SET_FLOAT_WORD(x, hx);
+ return x;
+}
+
+#if defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) || \
+ defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)
+
+/*
+ * FIXME: this is ugly and untested. The asm part only works with gcc, and we
+ * should consolidate the GET_LDOUBLE* / SET_LDOUBLE macros
+ */
+#define math_opt_barrier(x) \
+ ({ __typeof (x) __x = x; __asm ("" : "+m" (__x)); __x; })
+#define math_force_eval(x) __asm __volatile ("" : : "m" (x))
+
+/* only works for big endian */
+typedef union
+{
+ npy_longdouble value;
+ struct
+ {
+ npy_uint64 msw;
+ npy_uint64 lsw;
+ } parts64;
+ struct
+ {
+ npy_uint32 w0, w1, w2, w3;
+ } parts32;
+} ieee854_long_double_shape_type;
+
+/* Get two 64 bit ints from a long double. */
+
+#define GET_LDOUBLE_WORDS64(ix0,ix1,d) \
+do { \
+ ieee854_long_double_shape_type qw_u; \
+ qw_u.value = (d); \
+ (ix0) = qw_u.parts64.msw; \
+ (ix1) = qw_u.parts64.lsw; \
+} while (0)
+
+/* Set a long double from two 64 bit ints. */
+
+#define SET_LDOUBLE_WORDS64(d,ix0,ix1) \
+do { \
+ ieee854_long_double_shape_type qw_u; \
+ qw_u.parts64.msw = (ix0); \
+ qw_u.parts64.lsw = (ix1); \
+ (d) = qw_u.value; \
+} while (0)
+
+static npy_longdouble _nextl(npy_longdouble x, int p)
+{
+ npy_int64 hx,ihx,ilx;
+ npy_uint64 lx;
+ npy_longdouble u;
+
+ GET_LDOUBLE_WORDS64(hx, lx, x);
+ ihx = hx & 0x7fffffffffffffffLL; /* |hx| */
+ ilx = lx & 0x7fffffffffffffffLL; /* |lx| */
+
+ if(((ihx & 0x7ff0000000000000LL)==0x7ff0000000000000LL)&&
+ ((ihx & 0x000fffffffffffffLL)!=0)) {
+ return x; /* signal the nan */
+ }
+ if(ihx == 0 && ilx == 0) { /* x == 0 */
+ SET_LDOUBLE_WORDS64(x, p, 0ULL);/* return +-minsubnormal */
+ u = x * x;
+ if (u == x) {
+ return u;
+ } else {
+ return x; /* raise underflow flag */
+ }
+ }
+
+ if(p < 0) { /* p < 0, x -= ulp */
+ if((hx==0xffefffffffffffffLL)&&(lx==0xfc8ffffffffffffeLL))
+ return x+x; /* overflow, return -inf */
+ if (hx >= 0x7ff0000000000000LL) {
+ SET_LDOUBLE_WORDS64(u,0x7fefffffffffffffLL,0x7c8ffffffffffffeLL);
+ return u;
+ }
+ if(ihx <= 0x0360000000000000LL) { /* x <= LDBL_MIN */
+ u = math_opt_barrier (x);
+ x -= LDBL_TRUE_MIN;
+ if (ihx < 0x0360000000000000LL
+ || (hx > 0 && (npy_int64) lx <= 0)
+ || (hx < 0 && (npy_int64) lx > 1)) {
+ u = u * u;
+ math_force_eval (u); /* raise underflow flag */
+ }
+ return x;
+ }
+ if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
+ SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+ u *= 0x1.0000000000000p-105L;
+ } else
+ SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+ return x - u;
+ } else { /* p >= 0, x += ulp */
+ if((hx==0x7fefffffffffffffLL)&&(lx==0x7c8ffffffffffffeLL))
+ return x+x; /* overflow, return +inf */
+ if ((npy_uint64) hx >= 0xfff0000000000000ULL) {
+ SET_LDOUBLE_WORDS64(u,0xffefffffffffffffLL,0xfc8ffffffffffffeLL);
+ return u;
+ }
+ if(ihx <= 0x0360000000000000LL) { /* x <= LDBL_MIN */
+ u = math_opt_barrier (x);
+ x += LDBL_TRUE_MIN;
+ if (ihx < 0x0360000000000000LL
+ || (hx > 0 && (npy_int64) lx < 0 && lx != 0x8000000000000001LL)
+ || (hx < 0 && (npy_int64) lx >= 0)) {
+ u = u * u;
+ math_force_eval (u); /* raise underflow flag */
+ }
+ if (x == 0.0L) /* handle negative LDBL_TRUE_MIN case */
+ x = -0.0L;
+ return x;
+ }
+ if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
+ SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+ u *= 0x1.0000000000000p-105L;
+ } else
+ SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+ return x + u;
+ }
+}
+#else
+static npy_longdouble _nextl(npy_longdouble x, int p)
+{
+ volatile npy_longdouble t;
+ union IEEEl2bitsrep ux;
+
+ ux.e = x;
+
+ if ((GET_LDOUBLE_EXP(ux) == 0x7fff &&
+ ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) | GET_LDOUBLE_MANL(ux)) != 0)) {
+ return ux.e; /* x is nan */
+ }
+ if (ux.e == 0.0) {
+ SET_LDOUBLE_MANH(ux, 0); /* return +-minsubnormal */
+ SET_LDOUBLE_MANL(ux, 1);
+ if (p >= 0) {
+ SET_LDOUBLE_SIGN(ux, 0);
+ } else {
+ SET_LDOUBLE_SIGN(ux, 1);
+ }
+ t = ux.e * ux.e;
+ if (t == ux.e) {
+ return t;
+ } else {
+ return ux.e; /* raise underflow flag */
+ }
+ }
+ if (p < 0) { /* x -= ulp */
+ if (GET_LDOUBLE_MANL(ux) == 0) {
+ if ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) == 0) {
+ SET_LDOUBLE_EXP(ux, GET_LDOUBLE_EXP(ux) - 1);
+ }
+ SET_LDOUBLE_MANH(ux,
+ (GET_LDOUBLE_MANH(ux) - 1) |
+ (GET_LDOUBLE_MANH(ux) & LDBL_NBIT));
+ }
+ SET_LDOUBLE_MANL(ux, GET_LDOUBLE_MANL(ux) - 1);
+ } else { /* x += ulp */
+ SET_LDOUBLE_MANL(ux, GET_LDOUBLE_MANL(ux) + 1);
+ if (GET_LDOUBLE_MANL(ux) == 0) {
+ SET_LDOUBLE_MANH(ux,
+ (GET_LDOUBLE_MANH(ux) + 1) |
+ (GET_LDOUBLE_MANH(ux) & LDBL_NBIT));
+ if ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) == 0) {
+ SET_LDOUBLE_EXP(ux, GET_LDOUBLE_EXP(ux) + 1);
+ }
+ }
+ }
+ if (GET_LDOUBLE_EXP(ux) == 0x7fff) {
+ return ux.e + ux.e; /* overflow */
+ }
+ if (GET_LDOUBLE_EXP(ux) == 0) { /* underflow */
+ if (LDBL_NBIT) {
+ SET_LDOUBLE_MANH(ux, GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT);
+ }
+ t = ux.e * ux.e;
+ if (t != ux.e) { /* raise underflow flag */
+ return ux.e;
+ }
+ }
+
+ return ux.e;
+}
+#endif
+
+/*
+ * nextafter code taken from BSD math lib, the code contains the following
+ * notice:
+ *
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#ifndef HAVE_NEXTAFTER
+double npy_nextafter(double x, double y)
+{
+ volatile double t;
+ npy_int32 hx, hy, ix, iy;
+ npy_uint32 lx, ly;
+
+ EXTRACT_WORDS(hx, lx, x);
+ EXTRACT_WORDS(hy, ly, y);
+ ix = hx & 0x7fffffff; /* |x| */
+ iy = hy & 0x7fffffff; /* |y| */
+
+ if (((ix >= 0x7ff00000) && ((ix - 0x7ff00000) | lx) != 0) || /* x is nan */
+ ((iy >= 0x7ff00000) && ((iy - 0x7ff00000) | ly) != 0)) /* y is nan */
+ return x + y;
+ if (x == y)
+ return y; /* x=y, return y */
+ if ((ix | lx) == 0) { /* x == 0 */
+ INSERT_WORDS(x, hy & 0x80000000, 1); /* return +-minsubnormal */
+ t = x * x;
+ if (t == x)
+ return t;
+ else
+ return x; /* raise underflow flag */
+ }
+ if (hx >= 0) { /* x > 0 */
+ if (hx > hy || ((hx == hy) && (lx > ly))) { /* x > y, x -= ulp */
+ if (lx == 0)
+ hx -= 1;
+ lx -= 1;
+ } else { /* x < y, x += ulp */
+ lx += 1;
+ if (lx == 0)
+ hx += 1;
+ }
+ } else { /* x < 0 */
+ if (hy >= 0 || hx > hy || ((hx == hy) && (lx > ly))) { /* x < y, x -= ulp */
+ if (lx == 0)
+ hx -= 1;
+ lx -= 1;
+ } else { /* x > y, x += ulp */
+ lx += 1;
+ if (lx == 0)
+ hx += 1;
+ }
+ }
+ hy = hx & 0x7ff00000;
+ if (hy >= 0x7ff00000)
+ return x + x; /* overflow */
+ if (hy < 0x00100000) { /* underflow */
+ t = x * x;
+ if (t != x) { /* raise underflow flag */
+ INSERT_WORDS(y, hx, lx);
+ return y;
+ }
+ }
+ INSERT_WORDS(x, hx, lx);
+ return x;
+}
+#endif
+
+#ifndef HAVE_NEXTAFTERF
+float npy_nextafterf(float x, float y)
+{
+ volatile float t;
+ npy_int32 hx, hy, ix, iy;
+
+ GET_FLOAT_WORD(hx, x);
+ GET_FLOAT_WORD(hy, y);
+ ix = hx & 0x7fffffff; /* |x| */
+ iy = hy & 0x7fffffff; /* |y| */
+
+ if ((ix > 0x7f800000) || /* x is nan */
+ (iy > 0x7f800000)) /* y is nan */
+ return x + y;
+ if (x == y)
+ return y; /* x=y, return y */
+ if (ix == 0) { /* x == 0 */
+ SET_FLOAT_WORD(x, (hy & 0x80000000) | 1); /* return +-minsubnormal */
+ t = x * x;
+ if (t == x)
+ return t;
+ else
+ return x; /* raise underflow flag */
+ }
+ if (hx >= 0) { /* x > 0 */
+ if (hx > hy) { /* x > y, x -= ulp */
+ hx -= 1;
+ } else { /* x < y, x += ulp */
+ hx += 1;
+ }
+ } else { /* x < 0 */
+ if (hy >= 0 || hx > hy) { /* x < y, x -= ulp */
+ hx -= 1;
+ } else { /* x > y, x += ulp */
+ hx += 1;
+ }
+ }
+ hy = hx & 0x7f800000;
+ if (hy >= 0x7f800000)
+ return x + x; /* overflow */
+ if (hy < 0x00800000) { /* underflow */
+ t = x * x;
+ if (t != x) { /* raise underflow flag */
+ SET_FLOAT_WORD(y, hx);
+ return y;
+ }
+ }
+ SET_FLOAT_WORD(x, hx);
+ return x;
+}
+#endif
+
+#ifndef HAVE_NEXTAFTERL
+npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
+{
+ volatile npy_longdouble t;
+ union IEEEl2bitsrep ux;
+ union IEEEl2bitsrep uy;
+
+ ux.e = x;
+ uy.e = y;
+
+ if ((GET_LDOUBLE_EXP(ux) == 0x7fff &&
+ ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) | GET_LDOUBLE_MANL(ux)) != 0) ||
+ (GET_LDOUBLE_EXP(uy) == 0x7fff &&
+ ((GET_LDOUBLE_MANH(uy) & ~LDBL_NBIT) | GET_LDOUBLE_MANL(uy)) != 0)) {
+ return ux.e + uy.e; /* x or y is nan */
+ }
+ if (ux.e == uy.e) {
+ return uy.e; /* x=y, return y */
+ }
+ if (ux.e == 0.0) {
+ SET_LDOUBLE_MANH(ux, 0); /* return +-minsubnormal */
+ SET_LDOUBLE_MANL(ux, 1);
+ SET_LDOUBLE_SIGN(ux, GET_LDOUBLE_SIGN(uy));
+ t = ux.e * ux.e;
+ if (t == ux.e) {
+ return t;
+ } else {
+ return ux.e; /* raise underflow flag */
+ }
+ }
+ if ((ux.e > 0.0) ^ (ux.e < uy.e)) { /* x -= ulp */
+ if (GET_LDOUBLE_MANL(ux) == 0) {
+ if ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) == 0) {
+ SET_LDOUBLE_EXP(ux, GET_LDOUBLE_EXP(ux) - 1);
+ }
+ SET_LDOUBLE_MANH(ux,
+ (GET_LDOUBLE_MANH(ux) - 1) |
+ (GET_LDOUBLE_MANH(ux) & LDBL_NBIT));
+ }
+ SET_LDOUBLE_MANL(ux, GET_LDOUBLE_MANL(ux) - 1);
+ } else { /* x += ulp */
+ SET_LDOUBLE_MANL(ux, GET_LDOUBLE_MANL(ux) + 1);
+ if (GET_LDOUBLE_MANL(ux) == 0) {
+ SET_LDOUBLE_MANH(ux,
+ (GET_LDOUBLE_MANH(ux) + 1) |
+ (GET_LDOUBLE_MANH(ux) & LDBL_NBIT));
+ if ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) == 0) {
+ SET_LDOUBLE_EXP(ux, GET_LDOUBLE_EXP(ux) + 1);
+ }
+ }
+ }
+ if (GET_LDOUBLE_EXP(ux) == 0x7fff) {
+ return ux.e + ux.e; /* overflow */
+ }
+ if (GET_LDOUBLE_EXP(ux) == 0) { /* underflow */
+ if (LDBL_NBIT) {
+ SET_LDOUBLE_MANH(ux, GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT);
+ }
+ t = ux.e * ux.e;
+ if (t != ux.e) { /* raise underflow flag */
+ return ux.e;
+ }
+ }
+
+ return ux.e;
+}
+#endif
+
+/**begin repeat
+ * #suff = f,,l#
+ * #SUFF = F,,L#
+ * #type = npy_float, npy_double, npy_longdouble#
+ */
+@type@ npy_spacing@suff@(@type@ x)
+{
+ /* XXX: npy isnan/isinf may be optimized by bit twiddling */
+ if (npy_isinf(x)) {
+ return NPY_NAN@SUFF@;
+ }
+
+ return _next@suff@(x, 1) - x;
+}
+/**end repeat**/
+
+/*
+ * Decorate all the math functions which are available on the current platform
+ */
+
+#ifdef HAVE_NEXTAFTERF
+float npy_nextafterf(float x, float y)
+{
+ return nextafterf(x, y);
+}
+#endif
+
+#ifdef HAVE_NEXTAFTER
+double npy_nextafter(double x, double y)
+{
+ return nextafter(x, y);
+}
+#endif
+
+#ifdef HAVE_NEXTAFTERL
+npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
+{
+ return nextafterl(x, y);
+}
+#endif
+
+int npy_clear_floatstatus()
+{
+    char x = 0;
+    return npy_clear_floatstatus_barrier(&x);
+}
+int npy_get_floatstatus()
+{
+    char x = 0;
+    return npy_get_floatstatus_barrier(&x);
+}
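+
+/*
+ * Illustrative usage sketch (not compiled, the helper name is made up):
+ * clear the sticky flags, run a computation, then test which exceptions it
+ * raised.  The address passed as the "barrier" argument only serves to keep
+ * the compiler from moving the status calls across the computation.
+ */
+#if 0
+static int _raised_divbyzero_example(double num, double den)
+{
+    double r;
+    npy_clear_floatstatus_barrier((char*)&r);
+    r = num / den;                           /* may raise divide-by-zero */
+    return npy_get_floatstatus_barrier((char*)&r) & NPY_FPE_DIVIDEBYZERO;
+}
+#endif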
+
+/*
+ * Functions to set the floating point status word.
+ */
+
+#if (defined(__unix__) || defined(unix)) && !defined(USG)
+#include <sys/param.h>
+#endif
+
+
+/*
+ * Define floating point status functions. We must define
+ * npy_get_floatstatus_barrier, npy_clear_floatstatus_barrier,
+ * npy_set_floatstatus_{divbyzero, overflow, underflow, invalid}
+ * for all supported platforms.
+ */
+
+
+/* Solaris --------------------------------------------------------*/
+/* --------ignoring SunOS ieee_flags approach, someone else can
+** deal with that! */
+#if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \
+ (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \
+ defined(__NetBSD__)
+#include <ieeefp.h>
+
+int npy_get_floatstatus_barrier(char * param)
+{
+ int fpstatus = fpgetsticky();
+ /*
+ * By using a volatile, the compiler cannot reorder this call
+ */
+ if (param != NULL) {
+ volatile char NPY_UNUSED(c) = *(char*)param;
+ }
+ return ((FP_X_DZ & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus_barrier(char * param)
+{
+ int fpstatus = npy_get_floatstatus_barrier(param);
+ fpsetsticky(0);
+
+ return fpstatus;
+}
+
+void npy_set_floatstatus_divbyzero(void)
+{
+ fpsetsticky(FP_X_DZ);
+}
+
+void npy_set_floatstatus_overflow(void)
+{
+ fpsetsticky(FP_X_OFL);
+}
+
+void npy_set_floatstatus_underflow(void)
+{
+ fpsetsticky(FP_X_UFL);
+}
+
+void npy_set_floatstatus_invalid(void)
+{
+ fpsetsticky(FP_X_INV);
+}
+
+#elif defined(_AIX)
+#include <float.h>
+#include <fpxcp.h>
+
+int npy_get_floatstatus_barrier(char *param)
+{
+ int fpstatus = fp_read_flag();
+ /*
+ * By using a volatile, the compiler cannot reorder this call
+ */
+ if (param != NULL) {
+ volatile char NPY_UNUSED(c) = *(char*)param;
+ }
+ return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus_barrier(char * param)
+{
+ int fpstatus = npy_get_floatstatus_barrier(param);
+ fp_swap_flag(0);
+
+ return fpstatus;
+}
+
+void npy_set_floatstatus_divbyzero(void)
+{
+ fp_raise_xcp(FP_DIV_BY_ZERO);
+}
+
+void npy_set_floatstatus_overflow(void)
+{
+ fp_raise_xcp(FP_OVERFLOW);
+}
+
+void npy_set_floatstatus_underflow(void)
+{
+ fp_raise_xcp(FP_UNDERFLOW);
+}
+
+void npy_set_floatstatus_invalid(void)
+{
+ fp_raise_xcp(FP_INVALID);
+}
+
+#elif defined(_MSC_VER) || (defined(__osf__) && defined(__alpha))
+
+/*
+ * By using a volatile floating point value,
+ * the compiler is forced to actually do the requested
+ * operations because of potential concurrency.
+ *
+ * We shouldn't write multiple values to a single
+ * global here, because that would cause
+ * a race condition.
+ */
+static volatile double _npy_floatstatus_x,
+ _npy_floatstatus_zero = 0.0, _npy_floatstatus_big = 1e300,
+ _npy_floatstatus_small = 1e-300, _npy_floatstatus_inf;
+
+void npy_set_floatstatus_divbyzero(void)
+{
+ _npy_floatstatus_x = 1.0 / _npy_floatstatus_zero;
+}
+
+void npy_set_floatstatus_overflow(void)
+{
+ _npy_floatstatus_x = _npy_floatstatus_big * 1e300;
+}
+
+void npy_set_floatstatus_underflow(void)
+{
+ _npy_floatstatus_x = _npy_floatstatus_small * 1e-300;
+}
+
+void npy_set_floatstatus_invalid(void)
+{
+ _npy_floatstatus_inf = NPY_INFINITY;
+ _npy_floatstatus_x = _npy_floatstatus_inf - NPY_INFINITY;
+}
+
+/* MS Windows -----------------------------------------------------*/
+#if defined(_MSC_VER)
+
+#include <float.h>
+
+int npy_get_floatstatus_barrier(char *param)
+{
+ /*
+ * By using a volatile, the compiler cannot reorder this call
+ */
+#if defined(_WIN64)
+ int fpstatus = _statusfp();
+#else
+ /* windows enables sse on 32 bit, so check both flags */
+ int fpstatus, fpstatus2;
+ _statusfp2(&fpstatus, &fpstatus2);
+ fpstatus |= fpstatus2;
+#endif
+ if (param != NULL) {
+ volatile char NPY_UNUSED(c) = *(char*)param;
+ }
+ return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus_barrier(char *param)
+{
+ int fpstatus = npy_get_floatstatus_barrier(param);
+ _clearfp();
+
+ return fpstatus;
+}
+
+/* OSF/Alpha (Tru64) ---------------------------------------------*/
+#elif defined(__osf__) && defined(__alpha)
+
+#include <machine/fpu.h>
+
+int npy_get_floatstatus_barrier(char *param)
+{
+ unsigned long fpstatus = ieee_get_fp_control();
+ /*
+ * By using a volatile, the compiler cannot reorder this call
+ */
+ if (param != NULL) {
+ volatile char NPY_UNUSED(c) = *(char*)param;
+ }
+ return ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus_barrier(char *param)
+{
+ int fpstatus = npy_get_floatstatus_barrier(param);
+ /* clear status bits as well as disable exception mode if on */
+ ieee_set_fp_control(0);
+
+ return fpstatus;
+}
+
+#endif
+/* End of defined(_MSC_VER) || (defined(__osf__) && defined(__alpha)) */
+
+#else
+/* General GCC code, should work on most platforms */
+# include <fenv.h>
+
+int npy_get_floatstatus_barrier(char* param)
+{
+ int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |
+ FE_UNDERFLOW | FE_INVALID);
+ /*
+ * By using a volatile, the compiler cannot reorder this call
+ */
+ if (param != NULL) {
+ volatile char NPY_UNUSED(c) = *(char*)param;
+ }
+
+ return ((FE_DIVBYZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((FE_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((FE_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((FE_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus_barrier(char * param)
+{
+ /* testing float status is 50-100 times faster than clearing on x86 */
+ int fpstatus = npy_get_floatstatus_barrier(param);
+ if (fpstatus != 0) {
+ feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |
+ FE_UNDERFLOW | FE_INVALID);
+ }
+
+ return fpstatus;
+}
+
+
+void npy_set_floatstatus_divbyzero(void)
+{
+ feraiseexcept(FE_DIVBYZERO);
+}
+
+void npy_set_floatstatus_overflow(void)
+{
+ feraiseexcept(FE_OVERFLOW);
+}
+
+void npy_set_floatstatus_underflow(void)
+{
+ feraiseexcept(FE_UNDERFLOW);
+}
+
+void npy_set_floatstatus_invalid(void)
+{
+ feraiseexcept(FE_INVALID);
+}
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_complex.c.src b/contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_complex.c.src
new file mode 100644
index 00000000000..cf427dad809
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_complex.c.src
@@ -0,0 +1,1811 @@
+/*
+ * vim: syntax=c
+ *
+ * Implement some C99-compatible complex math functions
+ *
+ * Most of the code is taken from the msun library in FreeBSD (HEAD @ 4th
+ * October 2013), under the following license:
+ *
+ * Copyright (c) 2007, 2011 David Schultz <[email protected]>
+ * Copyright (c) 2012 Stephen Montgomery-Smith <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include "npy_math_common.h"
+#include "npy_math_private.h"
+#include <numpy/utils.h>
+
+/*
+ * Hack inherited from BSD, the intent is to set the FPU inexact
+ * flag in an efficient way. The flag is IEEE specific. See
+ * https://github.com/freebsd/freebsd/blob/4c6378299/lib/msun/src/catrig.c#L42
+ */
+#define raise_inexact() do { \
+ volatile npy_float NPY_UNUSED(junk) = 1 + tiny; \
+} while (0)
+
+
+static const volatile npy_float tiny = 3.9443045e-31f;
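+
+/*
+ * Why the hack works (sketch): tiny is far below FLT_EPSILON/2, so
+ * 1 + tiny rounds back to exactly 1 and the addition is, by definition,
+ * inexact; the volatile qualifiers keep the compiler from folding the
+ * operation away.  On a C99 platform the same effect could be obtained
+ * directly (illustrative alternative, not used here):
+ */
+#if 0
+#include <fenv.h>
+static void _raise_inexact_c99(void)
+{
+    feraiseexcept(FE_INEXACT);    /* set the IEEE inexact flag explicitly */
+}
+#endif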
+
+
+/**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #ctype = npy_cfloat,npy_cdouble,npy_clongdouble#
+ * #c = f, , l#
+ * #C = F, , L#
+ * #TMAX = FLT_MAX, DBL_MAX, LDBL_MAX#
+ * #TMIN = FLT_MIN, DBL_MIN, LDBL_MIN#
+ * #TMANT_DIG = FLT_MANT_DIG, DBL_MANT_DIG, LDBL_MANT_DIG#
+ * #TEPS = FLT_EPSILON, DBL_EPSILON, LDBL_EPSILON#
+ * #precision = 1, 2, 3#
+ */
+
+/*==========================================================
+ * Constants
+ *=========================================================*/
+static const @ctype@ c_1@c@ = {1.0@C@, 0.0};
+static const @ctype@ c_half@c@ = {0.5@C@, 0.0};
+static const @ctype@ c_i@c@ = {0.0, 1.0@C@};
+static const @ctype@ c_ihalf@c@ = {0.0, 0.5@C@};
+
+/*==========================================================
+ * Helper functions
+ *
+ * These are necessary because we do not count on using a
+ * C99 compiler.
+ *=========================================================*/
+static NPY_INLINE
+@ctype@
+cadd@c@(@ctype@ a, @ctype@ b)
+{
+ return npy_cpack@c@(npy_creal@c@(a) + npy_creal@c@(b),
+ npy_cimag@c@(a) + npy_cimag@c@(b));
+}
+
+static NPY_INLINE
+@ctype@
+csub@c@(@ctype@ a, @ctype@ b)
+{
+ return npy_cpack@c@(npy_creal@c@(a) - npy_creal@c@(b),
+ npy_cimag@c@(a) - npy_cimag@c@(b));
+}
+
+static NPY_INLINE
+@ctype@
+cmul@c@(@ctype@ a, @ctype@ b)
+{
+ @type@ ar, ai, br, bi;
+ ar = npy_creal@c@(a);
+ ai = npy_cimag@c@(a);
+ br = npy_creal@c@(b);
+ bi = npy_cimag@c@(b);
+ return npy_cpack@c@(ar*br - ai*bi, ar*bi + ai*br);
+}
+
+static NPY_INLINE
+@ctype@
+cdiv@c@(@ctype@ a, @ctype@ b)
+{
+ @type@ ar, ai, br, bi, abs_br, abs_bi;
+ ar = npy_creal@c@(a);
+ ai = npy_cimag@c@(a);
+ br = npy_creal@c@(b);
+ bi = npy_cimag@c@(b);
+ abs_br = npy_fabs@c@(br);
+ abs_bi = npy_fabs@c@(bi);
+
+ if (abs_br >= abs_bi) {
+ if (abs_br == 0 && abs_bi == 0) {
+            /* divide by zero should yield a complex inf or nan */
+ return npy_cpack@c@(ar/abs_br, ai/abs_bi);
+ }
+ else {
+ @type@ rat = bi/br;
+ @type@ scl = 1.0@C@/(br+bi*rat);
+ return npy_cpack@c@((ar + ai*rat)*scl, (ai - ar*rat)*scl);
+ }
+ }
+ else {
+ @type@ rat = br/bi;
+ @type@ scl = 1.0@C@/(bi + br*rat);
+ return npy_cpack@c@((ar*rat + ai)*scl, (ai*rat - ar)*scl);
+ }
+}
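+
+/*
+ * The branching on |br| >= |bi| above is Smith-style scaling for complex
+ * division: the textbook formula divides by br*br + bi*bi, which already
+ * overflows once |b| exceeds about sqrt(DBL_MAX) ~ 1.3e154 in the double
+ * case.  Dividing numerator and denominator by the larger component of b
+ * first keeps every intermediate on the order of the final result; e.g.
+ * for b = 1e200 + 1e200i, rat = 1 and scl = 1/(2e200) are both
+ * representable even though br*br is not.
+ */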
+
+static NPY_INLINE
+@ctype@
+cneg@c@(@ctype@ a)
+{
+ return npy_cpack@c@(-npy_creal@c@(a), -npy_cimag@c@(a));
+}
+
+static NPY_INLINE
+@ctype@
+cmuli@c@(@ctype@ a)
+{
+ return npy_cpack@c@(-npy_cimag@c@(a), npy_creal@c@(a));
+}
+
+/*==========================================================
+ * Custom implementation of missing complex C99 functions
+ *=========================================================*/
+
+#ifndef HAVE_CABS@C@
+@type@
+npy_cabs@c@(@ctype@ z)
+{
+ return npy_hypot@c@(npy_creal@c@(z), npy_cimag@c@(z));
+}
+#endif
+
+#ifndef HAVE_CARG@C@
+@type@
+npy_carg@c@(@ctype@ z)
+{
+ return npy_atan2@c@(npy_cimag@c@(z), npy_creal@c@(z));
+}
+#endif
+
+/*
+ * cexp and (ccos, csin)h functions need to calculate exp scaled by another
+ * number. This can be difficult if exp(x) overflows. By doing it this way,
+ * we don't risk overflowing exp. This likely raises floating-point exceptions,
+ * if we decide that we care.
+ *
+ * This is only useful over a limited range (see below) and expects that the
+ * input values are in this range.
+ *
+ * This is based on the technique used in FreeBSD's __frexp_exp and
+ * __ldexp_(c)exp functions by David Schultz.
+ *
+ * SCALED_CEXP_LOWER = log(FLT_MAX)
+ * SCALED_CEXP_UPPER = log(2) + log(FLT_MAX) - log(FLT_TRUE_MIN),
+ * where FLT_TRUE_MIN is the smallest possible subnormal number.
+ */
+
+#define SCALED_CEXP_LOWERF 88.722839f
+#define SCALED_CEXP_UPPERF 192.69492f
+#define SCALED_CEXP_LOWER 710.47586007394386
+#define SCALED_CEXP_UPPER 1454.9159319953251
+#define SCALED_CEXP_LOWERL 11357.216553474703895L
+#define SCALED_CEXP_UPPERL 22756.021937783004509L
+
+#if !defined(HAVE_CSINH@C@) || \
+ !defined(HAVE_CCOSH@C@) || \
+ !defined(HAVE_CEXP@C@)
+
+static
+@ctype@
+_npy_scaled_cexp@c@(@type@ x, @type@ y, npy_int expt)
+{
+#if @precision@ == 1
+ const npy_int k = 235;
+#endif
+#if @precision@ == 2
+ const npy_int k = 1799;
+#endif
+#if @precision@ == 3
+ const npy_int k = 19547;
+#endif
+ const @type@ kln2 = k * NPY_LOGE2@c@;
+ @type@ mant, mantcos, mantsin;
+ npy_int ex, excos, exsin;
+
+ mant = npy_frexp@c@(npy_exp@c@(x - kln2), &ex);
+ mantcos = npy_frexp@c@(npy_cos@c@(y), &excos);
+ mantsin = npy_frexp@c@(npy_sin@c@(y), &exsin);
+
+ expt += ex + k;
+ return npy_cpack@c@( npy_ldexp@c@(mant * mantcos, expt + excos),
+ npy_ldexp@c@(mant * mantsin, expt + exsin));
+}
+
+#endif
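+
+/*
+ * A worked double-precision instance of the identity used above
+ * (illustrative only, hypothetical helper): exp(x) = exp(x - k*ln2) * 2**k.
+ * With k = 1799, an argument as large as x ~ 1454 becomes x - k*ln2 ~ 207,
+ * safely below the overflow threshold of exp(); the factor 2**k is folded
+ * back in with frexp()/ldexp().
+ */
+#if 0
+static double _scaled_exp_example(double x)
+{
+    const int k = 1799;
+    int ex;
+    double mant = npy_frexp(npy_exp(x - k * NPY_LOGE2), &ex);
+    /* equals exp(x) whenever exp(x) is representable */
+    return npy_ldexp(mant, ex + k);
+}
+#endif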
+
+#ifndef HAVE_CEXP@C@
+
+@ctype@
+npy_cexp@c@(@ctype@ z)
+{
+ @type@ x, c, s;
+ @type@ r, i;
+ @ctype@ ret;
+
+ r = npy_creal@c@(z);
+ i = npy_cimag@c@(z);
+
+ if (npy_isfinite(r)) {
+ if (r >= SCALED_CEXP_LOWER@C@ && r <= SCALED_CEXP_UPPER@C@) {
+ ret = _npy_scaled_cexp@c@(r, i, 0);
+ }
+ else {
+ x = npy_exp@c@(r);
+
+ c = npy_cos@c@(i);
+ s = npy_sin@c@(i);
+
+ if (npy_isfinite(i)) {
+ ret = npy_cpack@c@(x * c, x * s);
+ }
+ else {
+ ret = npy_cpack@c@(NPY_NAN@C@, npy_copysign@c@(NPY_NAN@C@, i));
+ }
+ }
+
+ }
+ else if (npy_isnan(r)) {
+ /* r is nan */
+ if (i == 0) {
+ ret = z;
+ }
+ else {
+ ret = npy_cpack@c@(r, npy_copysign@c@(NPY_NAN@C@, i));
+ }
+ }
+ else {
+ /* r is +- inf */
+ if (r > 0) {
+ if (i == 0) {
+ ret = npy_cpack@c@(r, i);
+ }
+ else if (npy_isfinite(i)) {
+ c = npy_cos@c@(i);
+ s = npy_sin@c@(i);
+
+ ret = npy_cpack@c@(r * c, r * s);
+ }
+ else {
+ /* x = +inf, y = +-inf | nan */
+ npy_set_floatstatus_invalid();
+ ret = npy_cpack@c@(r, NPY_NAN@C@);
+ }
+ }
+ else {
+ if (npy_isfinite(i)) {
+ x = npy_exp@c@(r);
+ c = npy_cos@c@(i);
+ s = npy_sin@c@(i);
+
+ ret = npy_cpack@c@(x * c, x * s);
+ }
+ else {
+ /* x = -inf, y = nan | +i inf */
+ ret = npy_cpack@c@(0, 0);
+ }
+ }
+ }
+
+ return ret;
+}
+#endif
+
+#ifndef HAVE_CLOG@C@
+/* algorithm from cpython, rev. d86f5686cef9
+ *
+ * The usual formula for the real part is log(hypot(z.real, z.imag)).
+ * There are four situations where this formula is potentially
+ * problematic:
+ *
+ * (1) the absolute value of z is subnormal. Then hypot is subnormal,
+ * so has fewer than the usual number of bits of accuracy, hence may
+ * have large relative error. This then gives a large absolute error
+ * in the log. This can be solved by rescaling z by a suitable power
+ * of 2.
+ *
+ * (2) the absolute value of z is greater than DBL_MAX (e.g. when both
+ * z.real and z.imag are within a factor of 1/sqrt(2) of DBL_MAX)
+ * Again, rescaling solves this.
+ *
+ * (3) the absolute value of z is close to 1. In this case it's
+ * difficult to achieve good accuracy, at least in part because a
+ * change of 1ulp in the real or imaginary part of z can result in a
+ * change of billions of ulps in the correctly rounded answer.
+ *
+ * (4) z = 0. The simplest thing to do here is to call the
+ * floating-point log with an argument of 0, and let its behaviour
+ * (returning -infinity, signaling a floating-point exception, setting
+ * errno, or whatever) determine that of c_log. So the usual formula
+ * is fine here.
+*/
+@ctype@
+npy_clog@c@(@ctype@ z)
+{
+ @type@ ax = npy_fabs@c@(npy_creal@c@(z));
+ @type@ ay = npy_fabs@c@(npy_cimag@c@(z));
+ @type@ rr, ri;
+
+ if (ax > @TMAX@/4 || ay > @TMAX@/4) {
+ rr = npy_log@c@(npy_hypot@c@(ax/2, ay/2)) + NPY_LOGE2@c@;
+ }
+ else if (ax < @TMIN@ && ay < @TMIN@) {
+ if (ax > 0 || ay > 0) {
+ /* catch cases where hypot(ax, ay) is subnormal */
+ rr = npy_log@c@(npy_hypot@c@(npy_ldexp@c@(ax, @TMANT_DIG@),
+ npy_ldexp@c@(ay, @TMANT_DIG@))) - @TMANT_DIG@*NPY_LOGE2@c@;
+ }
+ else {
+ /* log(+/-0 +/- 0i) */
+ /* raise divide-by-zero floating point exception */
+ rr = -1.0@c@ / npy_creal@c@(z);
+ rr = npy_copysign@c@(rr, -1);
+ ri = npy_carg@c@(z);
+ return npy_cpack@c@(rr, ri);
+ }
+ }
+ else {
+ @type@ h = npy_hypot@c@(ax, ay);
+ if (0.71 <= h && h <= 1.73) {
+ @type@ am = ax > ay ? ax : ay; /* max(ax, ay) */
+ @type@ an = ax > ay ? ay : ax; /* min(ax, ay) */
+ rr = npy_log1p@c@((am-1)*(am+1)+an*an)/2;
+ }
+ else {
+ rr = npy_log@c@(h);
+ }
+ }
+ ri = npy_carg@c@(z);
+
+ return npy_cpack@c@(rr, ri);
+}
+#endif
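+
+/*
+ * A worked double-precision instance of case (1) above (illustrative only,
+ * hypothetical helper): when both components are subnormal, hypot() loses
+ * precision, but scaling by 2**53 (DBL_MANT_DIG) makes them normal, and the
+ * scaling is undone exactly by subtracting 53*ln2 from the logarithm.
+ */
+#if 0
+static double _log_hypot_subnormal_example(double ax, double ay)
+{
+    /* assumes 0 < ax, ay < DBL_MIN */
+    return npy_log(npy_hypot(npy_ldexp(ax, 53), npy_ldexp(ay, 53)))
+           - 53 * NPY_LOGE2;
+}
+#endif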
+
+#ifndef HAVE_CSQRT@C@
+
+/* We risk spurious overflow for components >= DBL_MAX / (1 + sqrt(2)). */
+#define THRESH (@TMAX@ / (1 + NPY_SQRT2@c@))
+
+@ctype@
+npy_csqrt@c@(@ctype@ z)
+{
+ @ctype@ result;
+ @type@ a, b;
+ @type@ t;
+ int scale;
+
+ a = npy_creal@c@(z);
+ b = npy_cimag@c@(z);
+
+ /* Handle special cases. */
+ if (a == 0 && b == 0) {
+ return (npy_cpack@c@(0, b));
+ }
+ if (npy_isinf(b)) {
+ return (npy_cpack@c@(NPY_INFINITY@C@, b));
+ }
+ if (npy_isnan(a)) {
+ t = (b - b) / (b - b); /* raise invalid if b is not a NaN */
+ return (npy_cpack@c@(a, t)); /* return NaN + NaN i */
+ }
+ if (npy_isinf(a)) {
+ /*
+ * csqrt(inf + NaN i) = inf + NaN i
+ * csqrt(inf + y i) = inf + 0 i
+ * csqrt(-inf + NaN i) = NaN +- inf i
+ * csqrt(-inf + y i) = 0 + inf i
+ */
+ if (npy_signbit(a)) {
+ return (npy_cpack@c@(npy_fabs@c@(b - b), npy_copysign@c@(a, b)));
+ }
+ else {
+ return (npy_cpack@c@(a, npy_copysign@c@(b - b, b)));
+ }
+ }
+ /*
+ * The remaining special case (b is NaN) is handled just fine by
+ * the normal code path below.
+ */
+
+ /* Scale to avoid overflow. */
+ if (npy_fabs@c@(a) >= THRESH || npy_fabs@c@(b) >= THRESH) {
+ a *= 0.25;
+ b *= 0.25;
+ scale = 1;
+ }
+ else {
+ scale = 0;
+ }
+
+ /* Algorithm 312, CACM vol 10, Oct 1967. */
+ if (a >= 0) {
+ t = npy_sqrt@c@((a + npy_hypot@c@(a, b)) * 0.5@c@);
+ result = npy_cpack@c@(t, b / (2 * t));
+ }
+ else {
+ t = npy_sqrt@c@((-a + npy_hypot@c@(a, b)) * 0.5@c@);
+ result = npy_cpack@c@(npy_fabs@c@(b) / (2 * t), npy_copysign@c@(t, b));
+ }
+
+ /* Rescale. */
+ if (scale) {
+        return (npy_cpack@c@(npy_creal@c@(result) * 2, npy_cimag@c@(result) * 2));
+ }
+ else {
+ return (result);
+ }
+}
+#undef THRESH
+#endif
+
+/*
+ * Always use this function because of the multiplication for small
+ * integer powers, but in the body use cpow if it is available.
+ */
+
+/* private function for use in npy_pow{f, ,l} */
+#ifdef HAVE_CPOW@C@
+static @ctype@
+sys_cpow@c@(@ctype@ x, @ctype@ y)
+{
+ __@ctype@_to_c99_cast xcast;
+ __@ctype@_to_c99_cast ycast;
+ __@ctype@_to_c99_cast ret;
+ xcast.npy_z = x;
+ ycast.npy_z = y;
+ ret.c99_z = cpow@c@(xcast.c99_z, ycast.c99_z);
+ return ret.npy_z;
+}
+#endif
+
+
+@ctype@
+npy_cpow@c@ (@ctype@ a, @ctype@ b)
+{
+ npy_intp n;
+ @type@ ar = npy_creal@c@(a);
+ @type@ br = npy_creal@c@(b);
+ @type@ ai = npy_cimag@c@(a);
+ @type@ bi = npy_cimag@c@(b);
+ @ctype@ r;
+
+ if (br == 0. && bi == 0.) {
+ return npy_cpack@c@(1., 0.);
+ }
+ if (ar == 0. && ai == 0.) {
+ if (br > 0 && bi == 0) {
+ return npy_cpack@c@(0., 0.);
+ }
+ else {
+ volatile @type@ tmp = NPY_INFINITY@C@;
+ /*
+ * NB: there are four complex zeros; c0 = (+-0, +-0), so that
+ * unlike for reals, c0**p, with `p` negative is in general
+ * ill-defined.
+ *
+ * c0**z with z complex is also ill-defined.
+ */
+ r = npy_cpack@c@(NPY_NAN@C@, NPY_NAN@C@);
+
+ /* Raise invalid */
+ tmp -= NPY_INFINITY@C@;
+ ar = tmp;
+ return r;
+ }
+ }
+ if (bi == 0 && (n=(npy_intp)br) == br) {
+ if (n == 1) {
+ /* unroll: handle inf better */
+ return npy_cpack@c@(ar, ai);
+ }
+ else if (n == 2) {
+ /* unroll: handle inf better */
+ return cmul@c@(a, a);
+ }
+ else if (n == 3) {
+ /* unroll: handle inf better */
+ return cmul@c@(a, cmul@c@(a, a));
+ }
+ else if (n > -100 && n < 100) {
+ @ctype@ p, aa;
+ npy_intp mask = 1;
+ if (n < 0) {
+ n = -n;
+ }
+ aa = c_1@c@;
+ p = npy_cpack@c@(ar, ai);
+ while (1) {
+ if (n & mask) {
+ aa = cmul@c@(aa,p);
+ }
+ mask <<= 1;
+ if (n < mask || mask <= 0) {
+ break;
+ }
+ p = cmul@c@(p,p);
+ }
+ r = npy_cpack@c@(npy_creal@c@(aa), npy_cimag@c@(aa));
+ if (br < 0) {
+ r = cdiv@c@(c_1@c@, r);
+ }
+ return r;
+ }
+ }
+
+#ifdef HAVE_CPOW@C@
+ return sys_cpow@c@(a, b);
+
+#else
+ {
+ @ctype@ loga = npy_clog@c@(a);
+
+ ar = npy_creal@c@(loga);
+ ai = npy_cimag@c@(loga);
+ return npy_cexp@c@(npy_cpack@c@(ar*br - ai*bi, ar*bi + ai*br));
+ }
+
+#endif
+}
+
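+/*
+ * The small-integer branch above is plain binary (square-and-multiply)
+ * exponentiation.  The same loop on real doubles, as a minimal sketch of
+ * the control flow (illustrative only, hypothetical helper, n > 0):
+ */
+#if 0
+static double _ipow_example(double p, npy_intp n)
+{
+    double acc = 1.0;
+    npy_intp mask = 1;
+    while (1) {
+        if (n & mask) {
+            acc *= p;            /* multiply in the current bit of n */
+        }
+        mask <<= 1;
+        if (n < mask || mask <= 0) {
+            break;               /* no higher bits of n remain */
+        }
+        p *= p;                  /* square for the next bit */
+    }
+    return acc;
+}
+#endif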
+
+#ifndef HAVE_CCOS@C@
+@ctype@
+npy_ccos@c@(@ctype@ z)
+{
+ /* ccos(z) = ccosh(I * z) */
+ return npy_ccosh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CSIN@C@
+@ctype@
+npy_csin@c@(@ctype@ z)
+{
+ /* csin(z) = -I * csinh(I * z) */
+ z = npy_csinh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+ return npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z));
+}
+#endif
+
+#ifndef HAVE_CTAN@C@
+@ctype@
+npy_ctan@c@(@ctype@ z)
+{
+ /* ctan(z) = -I * ctanh(I * z) */
+ z = npy_ctanh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+ return (npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CCOSH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ * = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ *
+ * CCOSH_BIG is chosen such that
+ * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG)
+ * although the exact value assigned to CCOSH_BIG is not so important
+ */
+@ctype@
+npy_ccosh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ const npy_float CCOSH_BIG = 9.0f;
+ const npy_float CCOSH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+ const npy_double CCOSH_BIG = 22.0;
+ const npy_double CCOSH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ const npy_longdouble CCOSH_BIG = 22.0L;
+ const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L;
+#else
+ const npy_longdouble CCOSH_BIG = 24.0L;
+ const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+ @type@ x, y, h, absx;
+ npy_int xfinite, yfinite;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+
+ xfinite = npy_isfinite(x);
+ yfinite = npy_isfinite(y);
+
+ /* Handle the nearly-non-exceptional cases where x and y are finite. */
+ if (xfinite && yfinite) {
+ if (y == 0) {
+ return npy_cpack@c@(npy_cosh@c@(x), x * y);
+ }
+ absx = npy_fabs@c@(x);
+ if (absx < CCOSH_BIG) {
+ /* small x: normal case */
+ return npy_cpack@c@(npy_cosh@c@(x) * npy_cos@c@(y),
+ npy_sinh@c@(x) * npy_sin@c@(y));
+ }
+
+ /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+ if (absx < SCALED_CEXP_LOWER@C@) {
+ /* x < 710: exp(|x|) won't overflow */
+ h = npy_exp@c@(absx) * 0.5@c@;
+ return npy_cpack@c@(h * npy_cos@c@(y),
+ npy_copysign@c@(h, x) * npy_sin@c@(y));
+ }
+ else if (absx < SCALED_CEXP_UPPER@C@) {
+ /* x < 1455: scale to avoid overflow */
+ z = _npy_scaled_cexp@c@(absx, y, -1);
+ return npy_cpack@c@(npy_creal@c@(z),
+ npy_cimag@c@(z) * npy_copysign@c@(1, x));
+ }
+ else {
+ /* x >= 1455: the result always overflows */
+ h = CCOSH_HUGE * x;
+ return npy_cpack@c@(h * h * npy_cos@c@(y), h * npy_sin@c@(y));
+ }
+ }
+
+ /*
+ * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as dNaN. Raise the invalid floating-point exception.
+ *
+ * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as d(NaN).
+ */
+ if (x == 0 && !yfinite) {
+ return npy_cpack@c@(y - y, npy_copysign@c@(0, x * (y - y)));
+ }
+
+ /*
+ * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+ *
+ * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0.
+ * The sign of 0 in the result is unspecified.
+ */
+ if (y == 0 && !xfinite) {
+ return npy_cpack@c@(x * x, npy_copysign@c@(0, x) * y);
+ }
+
+ /*
+ * cosh(x +- I Inf) = dNaN + I dNaN.
+ * Raise the invalid floating-point exception for finite nonzero x.
+ *
+ * cosh(x + I NaN) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero x. Choice = don't raise (except for signaling NaNs).
+ */
+ if (xfinite && !yfinite) {
+ return npy_cpack@c@(y - y, x * (y - y));
+ }
+
+ /*
+ * cosh(+-Inf + I NaN) = +Inf + I d(NaN).
+ *
+ * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+ * The sign of Inf in the result is unspecified. Choice = always +.
+ * Raise the invalid floating-point exception.
+ *
+ * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y)
+ */
+ if (npy_isinf(x)) {
+ if (!yfinite) {
+ return npy_cpack@c@(x * x, x * (y - y));
+ }
+ return npy_cpack@c@((x * x) * npy_cos@c@(y), x * npy_sin@c@(y));
+ }
+
+ /*
+ * cosh(NaN + I NaN) = d(NaN) + I d(NaN).
+ *
+ * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception.
+ * Choice = raise.
+ *
+ * cosh(NaN + I y) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero y. Choice = don't raise (except for signaling NaNs).
+ */
+ return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ * = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+@ctype@
+npy_csinh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ const npy_float CSINH_BIG = 9.0f;
+ const npy_float CSINH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+ const npy_double CSINH_BIG = 22.0;
+ const npy_double CSINH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ const npy_longdouble CSINH_BIG = 22.0L;
+ const npy_longdouble CSINH_HUGE = 8.9884656743115795e+307L;
+#else
+ const npy_longdouble CSINH_BIG = 24.0L;
+ const npy_longdouble CSINH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+ @type@ x, y, h, absx;
+ npy_int xfinite, yfinite;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+
+ xfinite = npy_isfinite(x);
+ yfinite = npy_isfinite(y);
+
+ /* Handle the nearly-non-exceptional cases where x and y are finite. */
+ if (xfinite && yfinite) {
+ if (y == 0) {
+ return npy_cpack@c@(npy_sinh@c@(x), y);
+ }
+ absx = npy_fabs@c@(x);
+ if (absx < CSINH_BIG) {
+ /* small x: normal case */
+ return npy_cpack@c@(npy_sinh@c@(x) * npy_cos@c@(y),
+ npy_cosh@c@(x) * npy_sin@c@(y));
+ }
+
+ /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+ if (absx < SCALED_CEXP_LOWER@C@) {
+ /* x < 710: exp(|x|) won't overflow */
+ h = npy_exp@c@(npy_fabs@c@(x)) * 0.5@c@;
+ return npy_cpack@c@(npy_copysign@c@(h, x) * npy_cos@c@(y),
+ h * npy_sin@c@(y));
+ }
+ else if (x < SCALED_CEXP_UPPER@C@) {
+ /* x < 1455: scale to avoid overflow */
+ z = _npy_scaled_cexp@c@(absx, y, -1);
+ return npy_cpack@c@(npy_creal@c@(z) * npy_copysign@c@(1, x),
+ npy_cimag@c@(z));
+ }
+ else {
+ /* x >= 1455: the result always overflows */
+ h = CSINH_HUGE * x;
+ return npy_cpack@c@(h * npy_cos@c@(y), h * h * npy_sin@c@(y));
+ }
+ }
+
+ /*
+ * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as dNaN. Raise the invalid floating-point exception.
+ *
+ * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as d(NaN).
+ */
+ if (x == 0 && !yfinite) {
+ return npy_cpack@c@(npy_copysign@c@(0, x * (y - y)), y - y);
+ }
+
+ /*
+ * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+ *
+ * sinh(NaN +- I 0) = d(NaN) + I +-0.
+ */
+ if (y == 0 && !xfinite) {
+ if (npy_isnan(x)) {
+ return z;
+ }
+ return npy_cpack@c@(x, npy_copysign@c@(0, y));
+ }
+
+ /*
+ * sinh(x +- I Inf) = dNaN + I dNaN.
+ * Raise the invalid floating-point exception for finite nonzero x.
+ *
+ * sinh(x + I NaN) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero x. Choice = don't raise (except for signaling NaNs).
+ */
+ if (xfinite && !yfinite) {
+ return npy_cpack@c@(y - y, x * (y - y));
+ }
+
+ /*
+ * sinh(+-Inf + I NaN) = +-Inf + I d(NaN).
+ * The sign of Inf in the result is unspecified. Choice = normally
+ * the same as d(NaN).
+ *
+ * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+ * The sign of Inf in the result is unspecified. Choice = always +.
+ * Raise the invalid floating-point exception.
+ *
+ * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y)
+ */
+ if (!xfinite && !npy_isnan(x)) {
+ if (!yfinite) {
+ return npy_cpack@c@(x * x, x * (y - y));
+ }
+ return npy_cpack@c@(x * npy_cos@c@(y),
+ NPY_INFINITY@C@ * npy_sin@c@(y));
+ }
+
+ /*
+ * sinh(NaN + I NaN) = d(NaN) + I d(NaN).
+ *
+ * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception.
+ * Choice = raise.
+ *
+ * sinh(NaN + I y) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero y. Choice = don't raise (except for signaling NaNs).
+ */
+ return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CSINH_BIG
+#undef CSINH_HUGE
+#endif
+
+#ifndef HAVE_CTANH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226600.
+ *
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ * W. Kahan. Branch Cuts for Complex Elementary Functions or Much
+ * Ado About Nothing's Sign Bit. In The State of the Art in
+ * Numerical Analysis, pp. 165 ff. Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ * Let t = tan(x)
+ * beta = 1/cos^2(y)
+ * s = sinh(x)
+ * rho = cosh(x)
+ *
+ * We have:
+ *
+ * tanh(z) = sinh(z) / cosh(z)
+ *
+ * sinh(x) cos(y) + i cosh(x) sin(y)
+ * = ---------------------------------
+ * cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ * cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ * = -------------------------------------
+ * 1 + sinh^2(x) / cos^2(y)
+ *
+ * beta rho s + i t
+ * = ----------------
+ * 1 + beta s^2
+ *
+ * Modifications:
+ *
+ * I omitted the original algorithm's handling of overflow in tan(x) after
+ * verifying with nearpi.c that this can't happen in IEEE single or double
+ * precision. I also handle large x differently.
+ */
+
+#define TANH_HUGE 22.0
+#define TANHF_HUGE 11.0F
+#define TANHL_HUGE 42.0L
+
+@ctype@
+npy_ctanh@c@(@ctype@ z)
+{
+ @type@ x, y;
+ @type@ t, beta, s, rho, denom;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+
+ /*
+ * ctanh(NaN + i 0) = NaN + i 0
+ *
+ * ctanh(NaN + i y) = NaN + i NaN for y != 0
+ *
+ * The imaginary part has the sign of x*sin(2*y), but there's no
+ * special effort to get this right.
+ *
+ * ctanh(+-Inf +- i Inf) = +-1 +- 0
+ *
+ * ctanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite
+ *
+ * The imaginary part of the sign is unspecified. This special
+ * case is only needed to avoid a spurious invalid exception when
+ * y is infinite.
+ */
+ if (!npy_isfinite(x)) {
+ if (npy_isnan(x)) {
+ return npy_cpack@c@(x, (y == 0 ? y : x * y));
+ }
+ return npy_cpack@c@(npy_copysign@c@(1,x),
+ npy_copysign@c@(0,
+ npy_isinf(y) ?
+ y : npy_sin@c@(y) * npy_cos@c@(y)));
+ }
+
+ /*
+ * ctanh(x + i NAN) = NaN + i NaN
+ * ctanh(x +- i Inf) = NaN + i NaN
+ */
+ if (!npy_isfinite(y)) {
+ return (npy_cpack@c@(y - y, y - y));
+ }
+
+ /*
+ * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+ * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+ * We use a modified formula to avoid spurious overflow.
+ */
+ if (npy_fabs@c@(x) >= TANH@C@_HUGE) {
+ @type@ exp_mx = npy_exp@c@(-npy_fabs@c@(x));
+ return npy_cpack@c@(npy_copysign@c@(1, x),
+ 4 * npy_sin@c@(y) * npy_cos@c@(y) *
+ exp_mx * exp_mx);
+ }
+
+ /* Kahan's algorithm */
+ t = npy_tan@c@(y);
+ beta = 1 + t * t; /* = 1 / cos^2(y) */
+ s = npy_sinh@c@(x);
+ rho = npy_sqrt@c@(1 + s * s); /* = cosh(x) */
+ denom = 1 + beta * s * s;
+ return (npy_cpack@c@((beta * rho * s) / denom, t / denom));
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
+#if !defined (HAVE_CACOS@C@) || !defined (HAVE_CASINH@C@)
+/*
+ * Complex inverse trig functions taken from the msun library in FreeBSD
+ * revision 251404
+ *
+ * The algorithm is very close to that in "Implementing the complex arcsine
+ * and arccosine functions using exception handling" by T. E. Hull, Thomas F.
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on
+ * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335,
+ * http://dl.acm.org/citation.cfm?id=275324.
+ *
+ * Throughout we use the convention z = x + I*y.
+ *
+ * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B)
+ * where
+ * A = (|z+I| + |z-I|) / 2
+ * B = (|z+I| - |z-I|) / 2 = y/A
+ *
+ * These formulas become numerically unstable:
+ * (a) for Re(casinh(z)) when z is close to the line segment [-I, I] (that
+ * is, Re(casinh(z)) is close to 0);
+ * (b) for Im(casinh(z)) when z is close to either of the intervals
+ * [I, I*infinity) or (-I*infinity, -I] (that is, |Im(casinh(z))| is
+ * close to PI/2).
+ *
+ * These numerical problems are overcome by defining
+ * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2
+ * Then if A < A_crossover, we use
+ * log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1)))
+ * A-1 = f(x, 1+y) + f(x, 1-y)
+ * and if B > B_crossover, we use
+ * asin(B) = atan2(y, sqrt(A*A - y*y)) = atan2(y, sqrt((A+y)*(A-y)))
+ * A-y = f(x, y+1) + f(x, y-1)
+ * where without loss of generality we have assumed that x and y are
+ * non-negative.
+ *
+ * Much of the difficulty comes because the intermediate computations may
+ * produce overflows or underflows. This is dealt with in the paper by Hull
+ * et al by using exception handling. We do this by detecting when
+ * computations risk underflow or overflow. The hardest part is handling the
+ * underflows when computing f(a, b).
+ *
+ * Note that the function f(a, b) does not appear explicitly in the paper by
+ * Hull et al, but the idea may be found on pages 308 and 309. Introducing the
+ * function f(a, b) allows us to concentrate many of the clever tricks in this
+ * paper into one function.
+ */
+
+/*
+ * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
+ * Pass hypot(a, b) as the third argument.
+ */
+static NPY_INLINE @type@
+_f@c@(@type@ a, @type@ b, @type@ hypot_a_b)
+{
+ if (b < 0) {
+ return ((hypot_a_b - b) / 2);
+ }
+ if (b == 0) {
+ return (a / 2);
+ }
+ return (a * a / (hypot_a_b + b) / 2);
+}
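+
+/*
+ * Why the last return is rewritten (worked example, illustrative only): for
+ * a = 1e-8 and b = 1, hypot(a, b) rounds to exactly 1.0 in double precision,
+ * so the direct form (hypot(a, b) - b) / 2 cancels to 0, while the
+ * algebraically identical a*a / (hypot(a, b) + b) / 2 gives the correct
+ * value of about 2.5e-17.
+ */
+#if 0
+static void _f_cancellation_example(void)
+{
+    double a = 1e-8, b = 1.0, h = npy_hypot(a, b);  /* h == 1.0 exactly */
+    double bad  = (h - b) / 2;                      /* 0.0: all digits lost */
+    double good = a * a / (h + b) / 2;              /* ~2.5e-17: correct */
+    (void)bad; (void)good;
+}
+#endif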
+
+/*
+ * All the hard work is contained in this function.
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON.
+ * Upon return:
+ * rx = Re(casinh(z)) = -Im(cacos(y + I*x)).
+ * B_is_usable is set to 1 if the value of B is usable.
+ * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y.
+ * If returning sqrt_A2my2 has potential to result in an underflow, it is
+ * rescaled, and new_y is similarly rescaled.
+ */
+static NPY_INLINE void
+_do_hard_work@c@(@type@ x, @type@ y, @type@ *rx,
+ npy_int *B_is_usable, @type@ *B, @type@ *sqrt_A2my2, @type@ *new_y)
+{
+#if @precision@ == 1
+ const npy_float A_crossover = 10.0f;
+ const npy_float B_crossover = 0.6417f;
+ const npy_float FOUR_SQRT_MIN = 4.3368086899420177e-19f;
+#endif
+#if @precision@ == 2
+ const npy_double A_crossover = 10.0;
+ const npy_double B_crossover = 0.6417;
+ const npy_double FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#endif
+#if @precision@ == 3
+ const npy_longdouble A_crossover = 10.0l;
+ const npy_longdouble B_crossover = 0.6417l;
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ const npy_longdouble FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#else
+ const npy_longdouble FOUR_SQRT_MIN = 7.3344154702193886625e-2466l;
+#endif
+#endif
+ @type@ R, S, A; /* A, B, R, and S are as in Hull et al. */
+ @type@ Am1, Amy; /* A-1, A-y. */
+
+ R = npy_hypot@c@(x, y + 1); /* |z+I| */
+ S = npy_hypot@c@(x, y - 1); /* |z-I| */
+
+ /* A = (|z+I| + |z-I|) / 2 */
+ A = (R + S) / 2;
+ /*
+ * Mathematically A >= 1. There is a small chance that this will not
+ * be so because of rounding errors. So we will make certain it is
+ * so.
+ */
+ if (A < 1) {
+ A = 1;
+ }
+
+ if (A < A_crossover) {
+ /*
+ * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
+ * rx = log1p(Am1 + sqrt(Am1*(A+1)))
+ */
+ if (y == 1 && x < @TEPS@ * @TEPS@ / 128) {
+ /*
+ * fp is of order x^2, and fm = x/2.
+ * A = 1 (inexactly).
+ */
+ *rx = npy_sqrt@c@(x);
+ }
+ else if (x >= @TEPS@ * npy_fabs@c@(y - 1)) {
+ /*
+ * Underflow will not occur because
+ * x >= DBL_EPSILON^2/128 >= FOUR_SQRT_MIN
+ */
+ Am1 = _f@c@(x, 1 + y, R) + _f@c@(x, 1 - y, S);
+ *rx = npy_log1p@c@(Am1 + npy_sqrt@c@(Am1 * (A + 1)));
+ }
+ else if (y < 1) {
+ /*
+ * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and
+ * A = 1 (inexactly).
+ */
+ *rx = x / npy_sqrt@c@((1 - y) * (1 + y));
+ }
+ else { /* if (y > 1) */
+ /*
+ * A-1 = y-1 (inexactly).
+ */
+ *rx = npy_log1p@c@((y - 1) + npy_sqrt@c@((y - 1) * (y + 1)));
+ }
+ }
+ else {
+ *rx = npy_log@c@(A + npy_sqrt@c@(A * A - 1));
+ }
+
+ *new_y = y;
+
+ if (y < FOUR_SQRT_MIN) {
+ /*
+ * Avoid a possible underflow caused by y/A. For casinh this
+ * would be legitimate, but will be picked up by invoking atan2
+ * later on. For cacos this would not be legitimate.
+ */
+ *B_is_usable = 0;
+ *sqrt_A2my2 = A * (2 / @TEPS@);
+ *new_y = y * (2 / @TEPS@);
+ return;
+ }
+
+ /* B = (|z+I| - |z-I|) / 2 = y/A */
+ *B = y / A;
+ *B_is_usable = 1;
+
+ if (*B > B_crossover) {
+ *B_is_usable = 0;
+ /*
+ * Amy = fp + fm, where fp = f(x, y+1), and fm = f(x, y-1).
+ * sqrt_A2my2 = sqrt(Amy*(A+y))
+ */
+ if (y == 1 && x < @TEPS@ / 128) {
+ /*
+ * fp is of order x^2, and fm = x/2.
+ * A = 1 (inexactly).
+ */
+ *sqrt_A2my2 = npy_sqrt@c@(x) * npy_sqrt@c@((A + y) / 2);
+ }
+ else if (x >= @TEPS@ * npy_fabs@c@(y - 1)) {
+ /*
+ * Underflow will not occur because
+ * x >= DBL_EPSILON/128 >= FOUR_SQRT_MIN
+ * and
+ * x >= DBL_EPSILON^2 >= FOUR_SQRT_MIN
+ */
+ Amy = _f@c@(x, y + 1, R) + _f@c@(x, y - 1, S);
+ *sqrt_A2my2 = npy_sqrt@c@(Amy * (A + y));
+ }
+ else if (y > 1) {
+ /*
+ * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and
+ * A = y (inexactly).
+ *
+ * y < RECIP_EPSILON. So the following
+ * scaling should avoid any underflow problems.
+ */
+ *sqrt_A2my2 = x * (4 / @TEPS@ / @TEPS@) * y /
+ npy_sqrt@c@((y + 1) * (y - 1));
+ *new_y = y * (4 / @TEPS@ / @TEPS@);
+ }
+ else { /* if (y < 1) */
+ /*
+ * fm = 1-y >= DBL_EPSILON, fp is of order x^2, and
+ * A = 1 (inexactly).
+ */
+ *sqrt_A2my2 = npy_sqrt@c@((1 - y) * (1 + y));
+ }
+ }
+}
+
+/*
+ * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
+ */
+static NPY_INLINE void
+_clog_for_large_values@c@(@type@ x, @type@ y,
+ @type@ *rr, @type@ *ri)
+{
+#if @precision@ == 1
+ const npy_float QUARTER_SQRT_MAX = 4.611685743549481e+18f;
+ const npy_float SQRT_MIN = 1.0842021724855044e-19f;
+#endif
+#if @precision@ == 2
+ const npy_double QUARTER_SQRT_MAX = 3.3519519824856489e+153;
+ const npy_double SQRT_MIN = 1.4916681462400413e-154;
+#endif
+#if @precision@ == 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ const npy_longdouble QUARTER_SQRT_MAX = 3.3519519824856489e+153;
+ const npy_longdouble SQRT_MIN = 1.4916681462400413e-154;
+#else
+ const npy_longdouble QUARTER_SQRT_MAX = 2.7268703390485398235e+2465l;
+ const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
+#endif
+#endif
+ @type@ ax, ay, t;
+
+ ax = npy_fabs@c@(x);
+ ay = npy_fabs@c@(y);
+ if (ax < ay) {
+ t = ax;
+ ax = ay;
+ ay = t;
+ }
+
+ /*
+ * Avoid overflow in hypot() when x and y are both very large.
+ * Divide x and y by E, and then add 1 to the logarithm. This depends
+ * on E being larger than sqrt(2).
+ * Dividing by E causes an insignificant loss of accuracy; however
+     * this method is still poor since it is unnecessarily slow.
+ */
+ if (ax > @TMAX@ / 2) {
+ *rr = npy_log@c@(npy_hypot@c@(x / NPY_E@c@, y / NPY_E@c@)) + 1;
+ }
+ /*
+ * Avoid overflow when x or y is large. Avoid underflow when x or
+ * y is small.
+ */
+ else if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN) {
+ *rr = npy_log@c@(npy_hypot@c@(x, y));
+ }
+ else {
+ *rr = npy_log@c@(ax * ax + ay * ay) / 2;
+ }
+ *ri = npy_atan2@c@(y, x);
+}
+#endif
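+
+/*
+ * The "divide by E" branch above, restated as equations (sketch):
+ *     log(hypot(x/e, y/e)) + 1 = log(hypot(x, y)/e) + 1
+ *                              = log(hypot(x, y)) - 1 + 1
+ *                              = log(hypot(x, y)),
+ * and because e > sqrt(2), hypot(x/e, y/e) <= sqrt(2)*max(|x|, |y|)/e is
+ * strictly smaller than max(|x|, |y|), so the intermediate hypot() cannot
+ * overflow even when x and y are both near the largest finite value.
+ */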
+
+#ifndef HAVE_CACOS@C@
+@ctype@
+npy_cacos@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ /* this is sqrt(6*EPS) */
+ const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+ /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+ const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if @precision@ == 2
+ const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+ const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if @precision@ == 3
+ const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+ const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+ const @type@ RECIP_EPSILON = 1.0@c@ / @TEPS@;
+ const @type@ pio2_hi = NPY_PI_2@c@;
+ @type@ x, y, ax, ay, wx, wy, rx, ry, B, sqrt_A2mx2, new_x;
+ npy_int sx, sy;
+ npy_int B_is_usable;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+ sx = npy_signbit(x);
+ sy = npy_signbit(y);
+ ax = npy_fabs@c@(x);
+ ay = npy_fabs@c@(y);
+
+ if (npy_isnan(x) || npy_isnan(y)) {
+ /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+ if (npy_isinf(x)) {
+ return npy_cpack@c@(y + y, -NPY_INFINITY@C@);
+ }
+ /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
+ if (npy_isinf(y)) {
+ return npy_cpack@c@(x + x, -y);
+ }
+ /* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */
+ if (x == 0) {
+ return npy_cpack@c@(pio2_hi + pio2_lo, y + y);
+ }
+ /*
+ * All other cases involving NaN return NaN + I*NaN.
+ * C99 leaves it optional whether to raise invalid if one of
+ * the arguments is not NaN, so we opt not to raise it.
+ */
+ return npy_cpack@c@(NPY_NAN@C@, NPY_NAN@C@);
+ }
+
+ if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+ /* clog...() will raise inexact unless x or y is infinite. */
+ _clog_for_large_values@c@(x, y, &wx, &wy);
+ rx = npy_fabs@c@(wy);
+ ry = wx + NPY_LOGE2@c@;
+ if (sy == 0) {
+ ry = -ry;
+ }
+ return npy_cpack@c@(rx, ry);
+ }
+
+ /* Avoid spuriously raising inexact for z = 1. */
+ if (x == 1 && y == 0) {
+ return npy_cpack@c@(0, -y);
+ }
+
+ /* All remaining cases are inexact. */
+ raise_inexact();
+
+ if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+ return npy_cpack@c@(pio2_hi - (x - pio2_lo), -y);
+ }
+
+ _do_hard_work@c@(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
+ if (B_is_usable) {
+ if (sx == 0) {
+ rx = npy_acos@c@(B);
+ }
+ else {
+ rx = npy_acos@c@(-B);
+ }
+ }
+ else {
+ if (sx == 0) {
+ rx = npy_atan2@c@(sqrt_A2mx2, new_x);
+ }
+ else {
+ rx = npy_atan2@c@(sqrt_A2mx2, -new_x);
+ }
+ }
+ if (sy == 0) {
+ ry = -ry;
+ }
+ return npy_cpack@c@(rx, ry);
+}
+#endif
+
+#ifndef HAVE_CASIN@C@
+@ctype@
+npy_casin@c@(@ctype@ z)
+{
+ /* casin(z) = I * conj( casinh(I * conj(z)) ) */
+ z = npy_casinh@c@(npy_cpack@c@(npy_cimag@c@(z), npy_creal@c@(z)));
+ return npy_cpack@c@(npy_cimag@c@(z), npy_creal@c@(z));
+}
+#endif
+
+#ifndef HAVE_CATAN@C@
+@ctype@
+npy_catan@c@(@ctype@ z)
+{
+ /* catan(z) = I * conj( catanh(I * conj(z)) ) */
+ z = npy_catanh@c@(npy_cpack@c@(npy_cimag@c@(z), npy_creal@c@(z)));
+ return npy_cpack@c@(npy_cimag@c@(z), npy_creal@c@(z));
+}
+#endif
+
+#ifndef HAVE_CACOSH@C@
+@ctype@
+npy_cacosh@c@(@ctype@ z)
+{
+ /*
+ * cacosh(z) = I*cacos(z) or -I*cacos(z)
+ * where the sign is chosen so Re(cacosh(z)) >= 0.
+ */
+ @ctype@ w;
+ @type@ rx, ry;
+
+ w = npy_cacos@c@(z);
+ rx = npy_creal@c@(w);
+ ry = npy_cimag@c@(w);
+ /* cacosh(NaN + I*NaN) = NaN + I*NaN */
+ if (npy_isnan(rx) && npy_isnan(ry)) {
+ return npy_cpack@c@(ry, rx);
+ }
+ /* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */
+ /* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */
+ if (npy_isnan(rx)) {
+ return npy_cpack@c@(npy_fabs@c@(ry), rx);
+ }
+ /* cacosh(0 + I*NaN) = NaN + I*NaN */
+ if (npy_isnan(ry)) {
+ return npy_cpack@c@(ry, ry);
+ }
+ return npy_cpack@c@(npy_fabs@c@(ry), npy_copysign@c@(rx, npy_cimag@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CASINH@C@
+/*
+ * casinh(z) = z + O(z^3) as z -> 0
+ *
+ * casinh(z) = sign(x)*clog(sign(x)*z) + O(1/z^2) as z -> infinity
+ * The above formula works for the imaginary part as well, because
+ * Im(casinh(z)) = sign(x)*atan2(sign(x)*y, fabs(x)) + O(y/z^3)
+ * as z -> infinity, uniformly in y
+ */
+@ctype@
+npy_casinh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ /* this is sqrt(6*EPS) */
+ const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+ /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+ const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if @precision@ == 2
+ const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+ const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if @precision@ == 3
+ const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+ const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+ const @type@ RECIP_EPSILON = 1.0@c@ / @TEPS@;
+ const @type@ pio2_hi = NPY_PI_2@c@;
+ @type@ x, y, ax, ay, wx, wy, rx, ry, B, sqrt_A2my2, new_y;
+ npy_int B_is_usable;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+ ax = npy_fabs@c@(x);
+ ay = npy_fabs@c@(y);
+
+ if (npy_isnan(x) || npy_isnan(y)) {
+ /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+ if (npy_isinf(x)) {
+ return npy_cpack@c@(x, y + y);
+ }
+ /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+ if (npy_isinf(y)) {
+ return npy_cpack@c@(y, x + x);
+ }
+ /* casinh(NaN + I*0) = NaN + I*0 */
+ if (y == 0) {
+ return npy_cpack@c@(x + x, y);
+ }
+ /*
+ * All other cases involving NaN return NaN + I*NaN.
+ * C99 leaves it optional whether to raise invalid if one of
+ * the arguments is not NaN, so we opt not to raise it.
+ */
+ return npy_cpack@c@(NPY_NAN@C@, NPY_NAN@C@);
+ }
+
+ if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+ /* clog...() will raise inexact unless x or y is infinite. */
+ if (npy_signbit(x) == 0) {
+ _clog_for_large_values@c@(x, y, &wx, &wy);
+ wx += NPY_LOGE2@c@;
+ }
+ else {
+ _clog_for_large_values@c@(-x, -y, &wx, &wy);
+ wx += NPY_LOGE2@c@;
+ }
+ return npy_cpack@c@(npy_copysign@c@(wx, x), npy_copysign@c@(wy, y));
+ }
+
+ /* Avoid spuriously raising inexact for z = 0. */
+ if (x == 0 && y == 0) {
+ return (z);
+ }
+
+ /* All remaining cases are inexact. */
+ raise_inexact();
+
+ if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+ return (z);
+ }
+
+ _do_hard_work@c@(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y);
+ if (B_is_usable) {
+ ry = npy_asin@c@(B);
+ }
+ else {
+ ry = npy_atan2@c@(new_y, sqrt_A2my2);
+ }
+ return npy_cpack@c@(npy_copysign@c@(rx, x), npy_copysign@c@(ry, y));
+}
+#endif
+
+#ifndef HAVE_CATANH@C@
+/*
+ * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
+ * Assumes x*x and y*y will not overflow.
+ * Assumes x and y are finite.
+ * Assumes y is non-negative.
+ * Assumes fabs(x) >= DBL_EPSILON.
+ */
+static NPY_INLINE @type@
+_sum_squares@c@(@type@ x, @type@ y)
+{
+#if @precision@ == 1
+const npy_float SQRT_MIN = 1.0842022e-19f;
+#endif
+#if @precision@ == 2
+const npy_double SQRT_MIN = 1.4916681462400413e-154; /* sqrt(DBL_MIN) */
+#endif
+#if @precision@ == 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+const npy_longdouble SQRT_MIN = 1.4916681462400413e-154; /* sqrt(DBL_MIN) */
+#else
+/* this is correct for 80 bit long doubles */
+const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
+#endif
+#endif
+ /* Avoid underflow when y is small. */
+ if (y < SQRT_MIN) {
+ return (x * x);
+ }
+
+ return (x * x + y * y);
+}
+
+/*
+ * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
+ * Assumes x and y are not NaN, and one of x and y is larger than
+ * RECIP_EPSILON. We avoid unwarranted underflow. It is important to not use
+ * the code creal(1/z), because the imaginary part may produce an unwanted
+ * underflow.
+ * This is only called in a context where inexact is always raised before
+ * the call, so no effort is made to avoid or force inexact.
+ */
+#if @precision@ == 1
+#define BIAS (FLT_MAX_EXP - 1)
+#define CUTOFF (FLT_MANT_DIG / 2 + 1)
+static NPY_INLINE npy_float
+_real_part_reciprocalf(npy_float x, npy_float y)
+{
+ npy_float scale;
+ npy_uint32 hx, hy;
+ npy_int32 ix, iy;
+
+ GET_FLOAT_WORD(hx, x);
+ ix = hx & 0x7f800000;
+ GET_FLOAT_WORD(hy, y);
+ iy = hy & 0x7f800000;
+ if (ix - iy >= CUTOFF << 23 || npy_isinf(x)) {
+ return (1 / x);
+ }
+ if (iy - ix >= CUTOFF << 23) {
+ return (x / y / y);
+ }
+ if (ix <= (BIAS + FLT_MAX_EXP / 2 - CUTOFF) << 23) {
+ return (x / (x * x + y * y));
+ }
+ SET_FLOAT_WORD(scale, 0x7f800000 - ix);
+ x *= scale;
+ y *= scale;
+ return (x / (x * x + y * y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if @precision@ == 2
+#define BIAS (DBL_MAX_EXP - 1)
+/* more guard digits are useful iff there is extra precision. */
+#define CUTOFF (DBL_MANT_DIG / 2 + 1) /* just half or 1 guard digit */
+static NPY_INLINE npy_double
+_real_part_reciprocal(npy_double x, npy_double y)
+{
+ npy_double scale;
+ npy_uint32 hx, hy;
+ npy_int32 ix, iy;
+
+ /*
+ * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
+ * example 2.
+ */
+ GET_HIGH_WORD(hx, x);
+ ix = hx & 0x7ff00000;
+ GET_HIGH_WORD(hy, y);
+ iy = hy & 0x7ff00000;
+ if (ix - iy >= CUTOFF << 20 || npy_isinf(x)) {
+ /* +-Inf -> +-0 is special */
+ return (1 / x);
+ }
+ if (iy - ix >= CUTOFF << 20) {
+ /* should avoid double div, but hard */
+ return (x / y / y);
+ }
+ if (ix <= (BIAS + DBL_MAX_EXP / 2 - CUTOFF) << 20) {
+ return (x / (x * x + y * y));
+ }
+ scale = 1;
+ SET_HIGH_WORD(scale, 0x7ff00000 - ix); /* 2**(1-ilogb(x)) */
+ x *= scale;
+ y *= scale;
+ return (x / (x * x + y * y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if @precision@ == 3
+#if !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) && \
+ !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)
+
+#define BIAS (LDBL_MAX_EXP - 1)
+#define CUTOFF (LDBL_MANT_DIG / 2 + 1)
+static NPY_INLINE npy_longdouble
+_real_part_reciprocall(npy_longdouble x,
+ npy_longdouble y)
+{
+ npy_longdouble scale;
+ union IEEEl2bitsrep ux, uy, us;
+ npy_int32 ix, iy;
+
+ ux.e = x;
+ ix = GET_LDOUBLE_EXP(ux);
+ uy.e = y;
+ iy = GET_LDOUBLE_EXP(uy);
+ if (ix - iy >= CUTOFF || npy_isinf(x)) {
+ return (1/x);
+ }
+ if (iy - ix >= CUTOFF) {
+ return (x/y/y);
+ }
+ if (ix <= BIAS + LDBL_MAX_EXP / 2 - CUTOFF) {
+ return (x/(x*x + y*y));
+ }
+ us.e = 1;
+ SET_LDOUBLE_EXP(us, 0x7fff - ix);
+ scale = us.e;
+ x *= scale;
+ y *= scale;
+ return (x/(x*x + y*y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+
+#else
+
+static NPY_INLINE npy_longdouble
+_real_part_reciprocall(npy_longdouble x,
+ npy_longdouble y)
+{
+ return x/(x*x + y*y);
+}
+
+#endif
+#endif
+
+@ctype@
+npy_catanh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ /* this is sqrt(3*EPS) */
+ const npy_float SQRT_3_EPSILON = 5.9801995673e-4f;
+ /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+ const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if @precision@ == 2
+ const npy_double SQRT_3_EPSILON = 2.5809568279517849e-8;
+ const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if @precision@ == 3
+ const npy_longdouble SQRT_3_EPSILON = 5.70316273435758915310e-10l;
+ const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+ const @type@ RECIP_EPSILON = 1.0@c@ / @TEPS@;
+ const @type@ pio2_hi = NPY_PI_2@c@;
+ @type@ x, y, ax, ay, rx, ry;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+ ax = npy_fabs@c@(x);
+ ay = npy_fabs@c@(y);
+
+ /* This helps handle many cases. */
+ if (y == 0 && ax <= 1) {
+ return npy_cpack@c@(npy_atanh@c@(x), y);
+ }
+
+ /* To ensure the same accuracy as atan(), and to filter out z = 0. */
+ if (x == 0) {
+ return npy_cpack@c@(x, npy_atan@c@(y));
+ }
+
+ if (npy_isnan(x) || npy_isnan(y)) {
+ /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
+ if (npy_isinf(x)) {
+ return npy_cpack@c@(npy_copysign@c@(0, x), y + y);
+ }
+ /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+ if (npy_isinf(y)) {
+ return npy_cpack@c@(npy_copysign@c@(0, x),
+ npy_copysign@c@(pio2_hi + pio2_lo, y));
+ }
+ /*
+ * All other cases involving NaN return NaN + I*NaN.
+ * C99 leaves it optional whether to raise invalid if one of
+ * the arguments is not NaN, so we opt not to raise it.
+ */
+ return npy_cpack@c@(NPY_NAN@C@, NPY_NAN@C@);
+ }
+
+ if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+ return npy_cpack@c@(_real_part_reciprocal@c@(x, y),
+ npy_copysign@c@(pio2_hi + pio2_lo, y));
+ }
+
+ if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+ /*
+ * z = 0 was filtered out above. All other cases must raise
+         * inexact, but this is the only one that needs to do it
+ * explicitly.
+ */
+ raise_inexact();
+ return (z);
+ }
+
+ if (ax == 1 && ay < @TEPS@) {
+ rx = (NPY_LOGE2@c@ - npy_log@c@(ay)) / 2;
+ }
+ else {
+ rx = npy_log1p@c@(4 * ax / _sum_squares@c@(ax - 1, ay)) / 4;
+ }
+
+ if (ax == 1) {
+ ry = npy_atan2@c@(2, -ay) / 2;
+ }
+ else if (ay < @TEPS@) {
+ ry = npy_atan2@c@(2 * ay, (1 - ax) * (1 + ax)) / 2;
+ }
+ else {
+ ry = npy_atan2@c@(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+ }
+
+ return npy_cpack@c@(npy_copysign@c@(rx, x), npy_copysign@c@(ry, y));
+}
+#endif
+/**end repeat**/
+
+/*==========================================================
+ * Decorate all the functions which are available natively
+ *=========================================================*/
+
+/**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #ctype = npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #c = f, , l#
+ * #C = F, , L#
+ */
+
+/**begin repeat1
+ * #kind = cabs,carg#
+ * #KIND = CABS,CARG#
+ */
+#ifdef HAVE_@KIND@@C@
+@type@
+npy_@kind@@c@(@ctype@ z)
+{
+ __@ctype@_to_c99_cast z1;
+ z1.npy_z = z;
+ return @kind@@c@(z1.c99_z);
+}
+#endif
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = cexp,clog,csqrt,ccos,csin,ctan,ccosh,csinh,ctanh,
+ * cacos,casin,catan,cacosh,casinh,catanh#
+ * #KIND = CEXP,CLOG,CSQRT,CCOS,CSIN,CTAN,CCOSH,CSINH,CTANH,
+ * CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
+ */
+#ifdef HAVE_@KIND@@C@
+@ctype@
+npy_@kind@@c@(@ctype@ z)
+{
+ __@ctype@_to_c99_cast z1;
+ __@ctype@_to_c99_cast ret;
+ z1.npy_z = z;
+ ret.c99_z = @kind@@c@(z1.c99_z);
+ return ret.npy_z;
+}
+#endif
+/**end repeat1**/
+
+
+/**end repeat**/
diff --git a/contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_internal.h.src b/contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_internal.h.src
new file mode 100644
index 00000000000..fa820baac3b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npymath/npy_math_internal.h.src
@@ -0,0 +1,718 @@
+/*
+ * vim:syntax=c
+ * A small module to implement missing C99 math capabilities required by numpy
+ *
+ * Please keep this independent of Python! Only basic types (npy_longdouble)
+ * can be used; otherwise, pure C, without any use of Python facilities.
+ *
+ * How to add a function to this section
+ * -------------------------------------
+ *
+ * Say you want to add `foo`, these are the steps and the reasons for them.
+ *
+ * 1) Add foo to the appropriate list in the configuration system. The
+ * lists can be found in numpy/core/setup.py lines 63-105. Read the
+ * comments that come with them, they are very helpful.
+ *
+ * 2) The configuration system will define a macro HAVE_FOO if your function
+ * can be linked from the math library. The result can depend on the
+ * optimization flags as well as the compiler, so can't be known ahead of
+ * time. If the function can't be linked, then either it is absent, defined
+ * as a macro, or is an intrinsic (hardware) function.
+ *
+ * i) Undefine any possible macros:
+ *
+ * #ifdef foo
+ * #undef foo
+ * #endif
+ *
+ * ii) Avoid declaring any function here as much as possible. Declaring
+ * functions is not portable: some platforms define some functions inline
+ * with a non-standard identifier, for example, or may use another
+ * identifier which changes the calling convention of the function. If you
+ * really have to, ALWAYS declare it for the one platform you are dealing
+ * with:
+ *
+ * Not ok:
+ * double exp(double a);
+ *
+ * Ok:
+ * #ifdef SYMBOL_DEFINED_WEIRD_PLATFORM
+ * double exp(double);
+ * #endif
+ *
+ * Some of the code is taken from msun library in FreeBSD, with the following
+ * notice:
+ *
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
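As a hedged illustration of the pattern the comment above describes (a hypothetical function `foo` with a placeholder body; `HAVE_FOO` would come from the configuration system, and the snippet assumes this file's environment for `NPY_INPLACE`, `npy_exp` and friends), the guard typically looks like this:

    /* hypothetical example only: foo / HAVE_FOO are placeholders */
    #ifdef foo
    #undef foo                       /* i) drop a possible macro definition */
    #endif

    #ifndef HAVE_FOO                 /* configure could not link foo() from libm */
    NPY_INPLACE double npy_foo(double x)
    {
        /* portable fallback built only from functions assumed to exist */
        return npy_exp(x) - 1.0;     /* placeholder body */
    }
    #else                            /* native foo() exists, just wrap it */
    NPY_INPLACE double npy_foo(double x)
    {
        return foo(x);
    }
    #endif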
+#include "npy_math_private.h"
+
+/*
+ *****************************************************************************
+ ** BASIC MATH FUNCTIONS **
+ *****************************************************************************
+ */
+
+/* Original code by Konrad Hinsen. */
+#ifndef HAVE_EXPM1
+NPY_INPLACE double npy_expm1(double x)
+{
+ if (npy_isinf(x) && x > 0) {
+ return x;
+ }
+ else {
+ const double u = npy_exp(x);
+
+ if (u == 1.0) {
+ return x;
+ } else if (u - 1.0 == -1.0) {
+ return -1;
+ } else {
+ return (u - 1.0) * x/npy_log(u);
+ }
+ }
+}
+#endif
+
+#ifndef HAVE_LOG1P
+NPY_INPLACE double npy_log1p(double x)
+{
+ if (npy_isinf(x) && x > 0) {
+ return x;
+ }
+ else {
+ const double u = 1. + x;
+ const double d = u - 1.;
+
+ if (d == 0) {
+ return x;
+ } else {
+ return npy_log(u) * x / d;
+ }
+ }
+}
+#endif
+
+/* Taken from FreeBSD mlib, adapted for numpy
+ *
+ * XXX: we could be a bit faster by reusing high/low words for inf/nan
+ * classification instead of calling npy_isinf/npy_isnan: we should have some
+ * macros for this, though, instead of doing it manually
+ */
+#ifndef HAVE_ATAN2
+/* XXX: we should have this in npy_math.h */
+#define NPY_DBL_EPSILON 1.2246467991473531772E-16
+NPY_INPLACE double npy_atan2(double y, double x)
+{
+ npy_int32 k, m, iy, ix, hx, hy;
+ npy_uint32 lx,ly;
+ double z;
+
+ EXTRACT_WORDS(hx, lx, x);
+ ix = hx & 0x7fffffff;
+ EXTRACT_WORDS(hy, ly, y);
+ iy = hy & 0x7fffffff;
+
+ /* if x or y is nan, return nan */
+ if (npy_isnan(x * y)) {
+ return x + y;
+ }
+
+ if (x == 1.0) {
+ return npy_atan(y);
+ }
+
+ m = 2 * (npy_signbit((x)) != 0) + (npy_signbit((y)) != 0);
+ if (y == 0.0) {
+ switch(m) {
+ case 0:
+ case 1: return y; /* atan(+-0,+anything)=+-0 */
+ case 2: return NPY_PI;/* atan(+0,-anything) = pi */
+ case 3: return -NPY_PI;/* atan(-0,-anything) =-pi */
+ }
+ }
+
+ if (x == 0.0) {
+ return y > 0 ? NPY_PI_2 : -NPY_PI_2;
+ }
+
+ if (npy_isinf(x)) {
+ if (npy_isinf(y)) {
+ switch(m) {
+ case 0: return NPY_PI_4;/* atan(+INF,+INF) */
+ case 1: return -NPY_PI_4;/* atan(-INF,+INF) */
+ case 2: return 3.0*NPY_PI_4;/*atan(+INF,-INF)*/
+ case 3: return -3.0*NPY_PI_4;/*atan(-INF,-INF)*/
+ }
+ } else {
+ switch(m) {
+ case 0: return NPY_PZERO; /* atan(+...,+INF) */
+ case 1: return NPY_NZERO; /* atan(-...,+INF) */
+ case 2: return NPY_PI; /* atan(+...,-INF) */
+ case 3: return -NPY_PI; /* atan(-...,-INF) */
+ }
+ }
+ }
+
+ if (npy_isinf(y)) {
+ return y > 0 ? NPY_PI_2 : -NPY_PI_2;
+ }
+
+ /* compute y/x */
+ k = (iy - ix) >> 20;
+ if (k > 60) { /* |y/x| > 2**60 */
+ z = NPY_PI_2 + 0.5 * NPY_DBL_EPSILON;
+ m &= 1;
+ } else if (hx < 0 && k < -60) {
+ z = 0.0; /* 0 > |y|/x > -2**-60 */
+ } else {
+ z = npy_atan(npy_fabs(y/x)); /* safe to do y/x */
+ }
+
+ switch (m) {
+ case 0: return z ; /* atan(+,+) */
+ case 1: return -z ; /* atan(-,+) */
+ case 2: return NPY_PI - (z - NPY_DBL_EPSILON);/* atan(+,-) */
+ default: /* case 3 */
+ return (z - NPY_DBL_EPSILON) - NPY_PI;/* atan(-,-) */
+ }
+}
+
+#endif
+
+#ifndef HAVE_HYPOT
+NPY_INPLACE double npy_hypot(double x, double y)
+{
+ double yx;
+
+ if (npy_isinf(x) || npy_isinf(y)) {
+ return NPY_INFINITY;
+ }
+
+ if (npy_isnan(x) || npy_isnan(y)) {
+ return NPY_NAN;
+ }
+
+ x = npy_fabs(x);
+ y = npy_fabs(y);
+ if (x < y) {
+ double temp = x;
+ x = y;
+ y = temp;
+ }
+ if (x == 0.) {
+ return 0.;
+ }
+ else {
+ yx = y/x;
+ return x*npy_sqrt(1.+yx*yx);
+ }
+}
+#endif
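The scaling in the fallback above is what keeps the intermediate values representable. A hedged standalone sketch on plain doubles (the name `scaled_hypot` is made up for this illustration) shows the difference against the naive formula:

    #include <math.h>
    #include <stdio.h>

    /* Same scaling idea as npy_hypot above: sqrt(x*x + y*y) overflows for
     * x = y = 1e200, while x * sqrt(1 + (y/x)^2) stays finite. */
    static double scaled_hypot(double x, double y)
    {
        double t;
        x = fabs(x);
        y = fabs(y);
        if (x < y) { t = x; x = y; y = t; }   /* ensure x >= y */
        if (x == 0.0) {
            return 0.0;
        }
        t = y / x;                            /* t <= 1, cannot overflow */
        return x * sqrt(1.0 + t * t);
    }

    int main(void)
    {
        printf("naive:  %g\n", sqrt(1e200 * 1e200 + 1e200 * 1e200)); /* inf */
        printf("scaled: %g\n", scaled_hypot(1e200, 1e200));          /* ~1.41e200 */
        return 0;
    }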
+
+#ifndef HAVE_ACOSH
+NPY_INPLACE double npy_acosh(double x)
+{
+ if (x < 1.0) {
+ return NPY_NAN;
+ }
+
+ if (npy_isfinite(x)) {
+ if (x > 1e8) {
+ return npy_log(x) + NPY_LOGE2;
+ }
+ else {
+ double u = x - 1.0;
+ return npy_log1p(u + npy_sqrt(2*u + u*u));
+ }
+ }
+ return x;
+}
+#endif
+
+#ifndef HAVE_ASINH
+NPY_INPLACE double npy_asinh(double xx)
+{
+ double x, d;
+ int sign;
+ if (xx < 0.0) {
+ sign = -1;
+ x = -xx;
+ }
+ else {
+ sign = 1;
+ x = xx;
+ }
+ if (x > 1e8) {
+ d = x;
+ } else {
+ d = npy_sqrt(x*x + 1);
+ }
+ return sign*npy_log1p(x*(1.0 + x/(d+1)));
+}
+#endif
+
+#ifndef HAVE_ATANH
+NPY_INPLACE double npy_atanh(double x)
+{
+ if (x > 0) {
+ return -0.5*npy_log1p(-2.0*x/(1.0 + x));
+ }
+ else {
+ return 0.5*npy_log1p(2.0*x/(1.0 - x));
+ }
+}
+#endif
+
+#ifndef HAVE_RINT
+#if defined(_MSC_VER) && (_MSC_VER == 1500) && !defined(_WIN64)
+#pragma optimize("", off)
+#endif
+NPY_INPLACE double npy_rint(double x)
+{
+ double y, r;
+
+ y = npy_floor(x);
+ r = x - y;
+
+ if (r > 0.5) {
+ y += 1.0;
+ }
+
+ /* Round to nearest even */
+ if (r == 0.5) {
+ r = y - 2.0*npy_floor(0.5*y);
+ if (r == 1.0) {
+ y += 1.0;
+ }
+ }
+ return y;
+}
+#if defined(_MSC_VER) && (_MSC_VER == 1500) && !defined(_WIN64)
+#pragma optimize("", on)
+#endif
+#endif
+
+#ifndef HAVE_TRUNC
+NPY_INPLACE double npy_trunc(double x)
+{
+ return x < 0 ? npy_ceil(x) : npy_floor(x);
+}
+#endif
+
+#ifndef HAVE_EXP2
+NPY_INPLACE double npy_exp2(double x)
+{
+ return npy_exp(NPY_LOGE2*x);
+}
+#endif
+
+#ifndef HAVE_LOG2
+NPY_INPLACE double npy_log2(double x)
+{
+#ifdef HAVE_FREXP
+ if (!npy_isfinite(x) || x <= 0.) {
+ /* special value result */
+ return npy_log(x);
+ }
+ else {
+ /*
+ * fallback implementation copied from python3.4 math.log2
+ * provides int(log(2**i)) == i for i 1-64 in default rounding mode.
+ *
+ * We want log2(m * 2**e) == log(m) / log(2) + e. Care is needed when
+ * x is just greater than 1.0: in that case e is 1, log(m) is negative,
+ * and we get significant cancellation error from the addition of
+ * log(m) / log(2) to e. The slight rewrite of the expression below
+ * avoids this problem.
+ */
+ int e;
+ double m = frexp(x, &e);
+ if (x >= 1.0) {
+ return log(2.0 * m) / log(2.0) + (e - 1);
+ }
+ else {
+ return log(m) / log(2.0) + e;
+ }
+ }
+#else
+ /* does not provide int(log(2**i)) == i */
+ return NPY_LOG2E * npy_log(x);
+#endif
+}
+#endif
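The cancellation the comment above warns about is easy to reproduce. A hedged standalone check on plain doubles (illustrative only, not part of the diff):

    #include <math.h>
    #include <stdio.h>

    /* For x just above 1, frexp gives m ~ 0.5 and e = 1, so the naive
     * log(m)/log(2) + e adds two nearly cancelling terms, while the
     * rewritten log(2*m)/log(2) + (e - 1) keeps full precision. */
    int main(void)
    {
        double x = 1.0 + 1e-12;
        int e;
        double m = frexp(x, &e);                          /* m ~ 0.5, e = 1 */
        double naive     = log(m) / log(2.0) + e;         /* loses digits   */
        double rewritten = log(2.0 * m) / log(2.0) + (e - 1);
        printf("naive     = %.17g\n", naive);
        printf("rewritten = %.17g\n", rewritten);         /* ~1.4427e-12    */
        return 0;
    }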
+
+/*
+ * if C99 extensions not available then define dummy functions that use the
+ * double versions for
+ *
+ * sin, cos, tan
+ * sinh, cosh, tanh,
+ * fabs, floor, ceil, rint, trunc
+ * sqrt, log10, log, exp, expm1
+ * asin, acos, atan,
+ * asinh, acosh, atanh
+ *
+ * hypot, atan2, pow, fmod, modf
+ * ldexp, frexp
+ *
+ * We assume the above are always available in their double versions.
+ *
+ * NOTE: some facilities may be available only as macros instead of functions.
+ * For simplicity, we define our own functions and undef the macros. We could
+ * instead test for the macro, but I am too lazy to do that for now.
+ */
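For readers unfamiliar with numpy's .src templating: the `@type@`, `@c@` and `@C@` markers in the repeat blocks below are substituted once per iteration. As a rough illustration (not literal generator output), the single-precision expansion of the first block for `sin` comes out as:

    #ifdef sinf
    #undef sinf
    #endif
    #ifndef HAVE_SINF
    NPY_INPLACE npy_float npy_sinf(npy_float x)
    {
        /* fall back to the double version, which is always assumed present */
        return (npy_float) npy_sin((double)x);
    }
    #endif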
+
+/**begin repeat
+ * #type = npy_longdouble, npy_float#
+ * #TYPE = NPY_LONGDOUBLE, FLOAT#
+ * #c = l,f#
+ * #C = L,F#
+ */
+
+/**begin repeat1
+ * #kind = sin,cos,tan,sinh,cosh,tanh,fabs,floor,ceil,rint,trunc,sqrt,log10,
+ * log,exp,expm1,asin,acos,atan,asinh,acosh,atanh,log1p,exp2,log2#
+ * #KIND = SIN,COS,TAN,SINH,COSH,TANH,FABS,FLOOR,CEIL,RINT,TRUNC,SQRT,LOG10,
+ * LOG,EXP,EXPM1,ASIN,ACOS,ATAN,ASINH,ACOSH,ATANH,LOG1P,EXP2,LOG2#
+ */
+
+#ifdef @kind@@c@
+#undef @kind@@c@
+#endif
+#ifndef HAVE_@KIND@@C@
+NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
+{
+ return (@type@) npy_@kind@((double)x);
+}
+#endif
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = atan2,hypot,pow,fmod,copysign#
+ * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
+ */
+#ifdef @kind@@c@
+#undef @kind@@c@
+#endif
+#ifndef HAVE_@KIND@@C@
+NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
+{
+ return (@type@) npy_@kind@((double)x, (double) y);
+}
+#endif
+/**end repeat1**/
+
+#ifdef modf@c@
+#undef modf@c@
+#endif
+#ifndef HAVE_MODF@C@
+NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
+{
+ double niptr;
+ double y = npy_modf((double)x, &niptr);
+ *iptr = (@type@) niptr;
+ return (@type@) y;
+}
+#endif
+
+#ifdef ldexp@c@
+#undef ldexp@c@
+#endif
+#ifndef HAVE_LDEXP@C@
+NPY_INPLACE @type@ npy_ldexp@c@(@type@ x, int exp)
+{
+ return (@type@) npy_ldexp((double)x, exp);
+}
+#endif
+
+#ifdef frexp@c@
+#undef frexp@c@
+#endif
+#ifndef HAVE_FREXP@C@
+NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp)
+{
+ return (@type@) npy_frexp(x, exp);
+}
+#endif
+
+/**end repeat**/
+
+
+/*
+ * Decorate all the math functions which are available on the current platform
+ */
+
+/**begin repeat
+ * #type = npy_longdouble, npy_double, npy_float#
+ * #c = l,,f#
+ * #C = L,,F#
+ */
+/**begin repeat1
+ * #kind = sin,cos,tan,sinh,cosh,tanh,fabs,floor,ceil,rint,trunc,sqrt,log10,
+ * log,exp,expm1,asin,acos,atan,asinh,acosh,atanh,log1p,exp2,log2#
+ * #KIND = SIN,COS,TAN,SINH,COSH,TANH,FABS,FLOOR,CEIL,RINT,TRUNC,SQRT,LOG10,
+ * LOG,EXP,EXPM1,ASIN,ACOS,ATAN,ASINH,ACOSH,ATANH,LOG1P,EXP2,LOG2#
+ */
+#ifdef HAVE_@KIND@@C@
+NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
+{
+ return @kind@@c@(x);
+}
+#endif
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = atan2,hypot,pow,fmod,copysign#
+ * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
+ */
+#ifdef HAVE_@KIND@@C@
+NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
+{
+ return @kind@@c@(x, y);
+}
+#endif
+/**end repeat1**/
+
+#ifdef HAVE_MODF@C@
+NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
+{
+ return modf@c@(x, iptr);
+}
+#endif
+
+#ifdef HAVE_LDEXP@C@
+NPY_INPLACE @type@ npy_ldexp@c@(@type@ x, int exp)
+{
+ return ldexp@c@(x, exp);
+}
+#endif
+
+#ifdef HAVE_FREXP@C@
+NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp)
+{
+ return frexp@c@(x, exp);
+}
+#endif
+
+/* C99 but not mandatory */
+
+#ifndef HAVE_CBRT@C@
+NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
+{
+ /* don't set invalid flag */
+ if (npy_isnan(x)) {
+ return NPY_NAN;
+ }
+ else if (x < 0) {
+ return -npy_pow@c@(-x, 1. / 3.);
+ }
+ else {
+ return npy_pow@c@(x, 1. / 3.);
+ }
+}
+#else
+NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
+{
+ return cbrt@c@(x);
+}
+#endif
+
+/**end repeat**/
+
+
+/*
+ * Non standard functions
+ */
+
+/**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #c = f, ,l#
+ * #C = F, ,L#
+ */
+
+@type@ npy_heaviside@c@(@type@ x, @type@ h0)
+{
+ if (npy_isnan(x)) {
+ return (@type@) NPY_NAN;
+ }
+ else if (x == 0) {
+ return h0;
+ }
+ else if (x < 0) {
+ return (@type@) 0.0;
+ }
+ else {
+ return (@type@) 1.0;
+ }
+}
+
+#define LOGE2 NPY_LOGE2@c@
+#define LOG2E NPY_LOG2E@c@
+#define RAD2DEG (180.0@c@/NPY_PI@c@)
+#define DEG2RAD (NPY_PI@c@/180.0@c@)
+
+NPY_INPLACE @type@ npy_rad2deg@c@(@type@ x)
+{
+ return x*RAD2DEG;
+}
+
+NPY_INPLACE @type@ npy_deg2rad@c@(@type@ x)
+{
+ return x*DEG2RAD;
+}
+
+NPY_INPLACE @type@ npy_log2_1p@c@(@type@ x)
+{
+ return LOG2E*npy_log1p@c@(x);
+}
+
+NPY_INPLACE @type@ npy_exp2_m1@c@(@type@ x)
+{
+ return npy_expm1@c@(LOGE2*x);
+}
+
+NPY_INPLACE @type@ npy_logaddexp@c@(@type@ x, @type@ y)
+{
+ if (x == y) {
+ /* Handles infinities of the same sign without warnings */
+ return x + LOGE2;
+ }
+ else {
+ const @type@ tmp = x - y;
+ if (tmp > 0) {
+ return x + npy_log1p@c@(npy_exp@c@(-tmp));
+ }
+ else if (tmp <= 0) {
+ return y + npy_log1p@c@(npy_exp@c@(tmp));
+ }
+ else {
+ /* NaNs */
+ return tmp;
+ }
+ }
+}
+
+NPY_INPLACE @type@ npy_logaddexp2@c@(@type@ x, @type@ y)
+{
+ if (x == y) {
+ /* Handles infinities of the same sign without warnings */
+ return x + 1;
+ }
+ else {
+ const @type@ tmp = x - y;
+ if (tmp > 0) {
+ return x + npy_log2_1p@c@(npy_exp2@c@(-tmp));
+ }
+ else if (tmp <= 0) {
+ return y + npy_log2_1p@c@(npy_exp2@c@(tmp));
+ }
+ else {
+ /* NaNs */
+ return tmp;
+ }
+ }
+}
+
+/*
+ * Python version of divmod.
+ *
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE @type@
+npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
+{
+ @type@ div, mod, floordiv;
+
+ mod = npy_fmod@c@(a, b);
+
+ if (!b) {
+        /* If b == 0, return the result of fmod. For IEEE it is nan. */
+ *modulus = mod;
+ return mod;
+ }
+
+ /* a - mod should be very nearly an integer multiple of b */
+ div = (a - mod) / b;
+
+ /* adjust fmod result to conform to Python convention of remainder */
+ if (mod) {
+ if ((b < 0) != (mod < 0)) {
+ mod += b;
+ div -= 1.0@c@;
+ }
+ }
+ else {
+ /* if mod is zero ensure correct sign */
+ mod = npy_copysign@c@(0, b);
+ }
+
+ /* snap quotient to nearest integral value */
+ if (div) {
+ floordiv = npy_floor@c@(div);
+ if (div - floordiv > 0.5@c@)
+ floordiv += 1.0@c@;
+ }
+ else {
+ /* if div is zero ensure correct sign */
+ floordiv = npy_copysign@c@(0, a/b);
+ }
+
+ *modulus = mod;
+ return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+
+/**end repeat**/
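npy_divmod above follows the Python convention: the remainder takes the sign of the divisor and floordiv * b + mod == a (up to rounding). A hedged standalone sketch on plain doubles (the helper `py_divmod` is made up for this illustration and is a simplification of the code above):

    #include <math.h>
    #include <stdio.h>

    static double py_divmod(double a, double b, double *mod)
    {
        double m = fmod(a, b);                /* C remainder, sign of a */
        if (m != 0.0 && ((b < 0) != (m < 0))) {
            m += b;                           /* shift into the divisor's sign */
        }
        *mod = m;
        return floor((a - m) / b + 0.5);      /* quotient snapped to an integer */
    }

    int main(void)
    {
        double mod;
        double q = py_divmod(-7.0, 3.0, &mod);
        /* C's fmod(-7, 3) is -1; Python-style divmod gives q = -3, mod = 2 */
        printf("q = %g, mod = %g\n", q, mod);
        return 0;
    }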
+
+/**begin repeat
+ *
+ * #type = npy_uint, npy_ulong, npy_ulonglong#
+ * #c = u,ul,ull#
+ */
+NPY_INPLACE @type@
+npy_gcd@c@(@type@ a, @type@ b)
+{
+ @type@ c;
+ while (a != 0) {
+ c = a;
+ a = b%a;
+ b = c;
+ }
+ return b;
+}
+
+NPY_INPLACE @type@
+npy_lcm@c@(@type@ a, @type@ b)
+{
+ @type@ gcd = npy_gcd@c@(a, b);
+ return gcd == 0 ? 0 : a / gcd * b;
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #type = (npy_int, npy_long, npy_longlong)*2#
+ * #c = (,l,ll)*2#
+ * #func=gcd*3,lcm*3#
+ */
+NPY_INPLACE @type@
+npy_@func@@c@(@type@ a, @type@ b)
+{
+ return npy_@func@u@c@(a < 0 ? -a : a, b < 0 ? -b : b);
+}
+/**end repeat**/
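The gcd/lcm pair above uses the iterative Euclidean algorithm and computes lcm as a / gcd * b, dividing before multiplying so the intermediate value stays small. A hedged standalone sketch (names made up for the illustration):

    #include <stdio.h>

    static unsigned long gcd_ul(unsigned long a, unsigned long b)
    {
        while (a != 0) {                       /* Euclid: gcd(a, b) = gcd(b % a, a) */
            unsigned long c = a;
            a = b % a;
            b = c;
        }
        return b;
    }

    int main(void)
    {
        unsigned long g = gcd_ul(12, 18);      /* 6  */
        unsigned long l = g ? 12 / g * 18 : 0; /* 36, divide before multiply */
        printf("gcd = %lu, lcm = %lu\n", g, l);
        return 0;
    }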
diff --git a/contrib/python/numpy/py2/numpy/core/src/npysort/binsearch.c.src b/contrib/python/numpy/py2/numpy/core/src/npysort/binsearch.c.src
new file mode 100644
index 00000000000..c04e197b7ce
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npysort/binsearch.c.src
@@ -0,0 +1,250 @@
+/* -*- c -*- */
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include "npy_binsearch.h"
+
+#define NOT_USED NPY_UNUSED(unused)
+
+/*
+ *****************************************************************************
+ ** NUMERIC SEARCHES **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE, DATETIME, TIMEDELTA#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
+ */
+
+#define @TYPE@_LTE(a, b) (!@TYPE@_LT((b), (a)))
+
+/**begin repeat1
+ *
+ * #side = left, right#
+ * #CMP = LT, LTE#
+ */
+
+NPY_VISIBILITY_HIDDEN void
+binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
+ PyArrayObject *NOT_USED)
+{
+ npy_intp min_idx = 0;
+ npy_intp max_idx = arr_len;
+ @type@ last_key_val;
+
+ if (key_len == 0) {
+ return;
+ }
+ last_key_val = *(const @type@ *)key;
+
+ for (; key_len > 0; key_len--, key += key_str, ret += ret_str) {
+ const @type@ key_val = *(const @type@ *)key;
+ /*
+ * Updating only one of the indices based on the previous key
+ * gives the search a big boost when keys are sorted, but slightly
+ * slows down things for purely random ones.
+ */
+ if (@TYPE@_LT(last_key_val, key_val)) {
+ max_idx = arr_len;
+ }
+ else {
+ min_idx = 0;
+ max_idx = (max_idx < arr_len) ? (max_idx + 1) : arr_len;
+ }
+
+ last_key_val = key_val;
+
+ while (min_idx < max_idx) {
+ const npy_intp mid_idx = min_idx + ((max_idx - min_idx) >> 1);
+ const @type@ mid_val = *(const @type@ *)(arr + mid_idx*arr_str);
+ if (@TYPE@_@CMP@(mid_val, key_val)) {
+ min_idx = mid_idx + 1;
+ }
+ else {
+ max_idx = mid_idx;
+ }
+ }
+ *(npy_intp *)ret = min_idx;
+ }
+}
+
+NPY_VISIBILITY_HIDDEN int
+argbinsearch_@side@_@suff@(const char *arr, const char *key,
+ const char *sort, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str,
+ npy_intp sort_str, npy_intp ret_str,
+ PyArrayObject *NOT_USED)
+{
+ npy_intp min_idx = 0;
+ npy_intp max_idx = arr_len;
+ @type@ last_key_val;
+
+ if (key_len == 0) {
+ return 0;
+ }
+ last_key_val = *(const @type@ *)key;
+
+ for (; key_len > 0; key_len--, key += key_str, ret += ret_str) {
+ const @type@ key_val = *(const @type@ *)key;
+ /*
+ * Updating only one of the indices based on the previous key
+ * gives the search a big boost when keys are sorted, but slightly
+ * slows down things for purely random ones.
+ */
+ if (@TYPE@_LT(last_key_val, key_val)) {
+ max_idx = arr_len;
+ }
+ else {
+ min_idx = 0;
+ max_idx = (max_idx < arr_len) ? (max_idx + 1) : arr_len;
+ }
+
+ last_key_val = key_val;
+
+ while (min_idx < max_idx) {
+ const npy_intp mid_idx = min_idx + ((max_idx - min_idx) >> 1);
+ const npy_intp sort_idx = *(npy_intp *)(sort + mid_idx*sort_str);
+ @type@ mid_val;
+
+ if (sort_idx < 0 || sort_idx >= arr_len) {
+ return -1;
+ }
+
+ mid_val = *(const @type@ *)(arr + sort_idx*arr_str);
+
+ if (@TYPE@_@CMP@(mid_val, key_val)) {
+ min_idx = mid_idx + 1;
+ }
+ else {
+ max_idx = mid_idx;
+ }
+ }
+ *(npy_intp *)ret = min_idx;
+ }
+ return 0;
+}
+
+/**end repeat1**/
+/**end repeat**/
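The typed searches above compute searchsorted-style insertion points: the left variant returns the first index whose element is not less than the key, the right variant (via @TYPE@_LTE) the first index whose element is strictly greater, and the window trick in the comments narrows [min_idx, max_idx) when consecutive keys arrive in sorted order. A hedged standalone sketch of the left variant on plain ints (illustrative only):

    #include <stdio.h>

    /* First index at which key can be inserted while keeping arr sorted. */
    static long bisect_left(const int *arr, long n, int key)
    {
        long lo = 0, hi = n;
        while (lo < hi) {
            long mid = lo + ((hi - lo) >> 1);
            if (arr[mid] < key) {              /* the right variant would use <= */
                lo = mid + 1;
            }
            else {
                hi = mid;
            }
        }
        return lo;
    }

    int main(void)
    {
        int arr[] = {1, 2, 2, 4, 7};
        printf("%ld\n", bisect_left(arr, 5, 2));   /* 1: before the first 2 */
        printf("%ld\n", bisect_left(arr, 5, 5));   /* 4: between 4 and 7    */
        return 0;
    }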
+
+/*
+ *****************************************************************************
+ ** GENERIC SEARCH **
+ *****************************************************************************
+ */
+
+ /**begin repeat
+ *
+ * #side = left, right#
+ * #CMP = <, <=#
+ */
+
+NPY_VISIBILITY_HIDDEN void
+npy_binsearch_@side@(const char *arr, const char *key, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
+ PyArrayObject *cmp)
+{
+ PyArray_CompareFunc *compare = PyArray_DESCR(cmp)->f->compare;
+ npy_intp min_idx = 0;
+ npy_intp max_idx = arr_len;
+ const char *last_key = key;
+
+ for (; key_len > 0; key_len--, key += key_str, ret += ret_str) {
+ /*
+ * Updating only one of the indices based on the previous key
+ * gives the search a big boost when keys are sorted, but slightly
+ * slows down things for purely random ones.
+ */
+ if (compare(last_key, key, cmp) @CMP@ 0) {
+ max_idx = arr_len;
+ }
+ else {
+ min_idx = 0;
+ max_idx = (max_idx < arr_len) ? (max_idx + 1) : arr_len;
+ }
+
+ last_key = key;
+
+ while (min_idx < max_idx) {
+ const npy_intp mid_idx = min_idx + ((max_idx - min_idx) >> 1);
+ const char *arr_ptr = arr + mid_idx*arr_str;
+
+ if (compare(arr_ptr, key, cmp) @CMP@ 0) {
+ min_idx = mid_idx + 1;
+ }
+ else {
+ max_idx = mid_idx;
+ }
+ }
+ *(npy_intp *)ret = min_idx;
+ }
+}
+
+NPY_VISIBILITY_HIDDEN int
+npy_argbinsearch_@side@(const char *arr, const char *key,
+ const char *sort, char *ret,
+ npy_intp arr_len, npy_intp key_len,
+ npy_intp arr_str, npy_intp key_str,
+ npy_intp sort_str, npy_intp ret_str,
+ PyArrayObject *cmp)
+{
+ PyArray_CompareFunc *compare = PyArray_DESCR(cmp)->f->compare;
+ npy_intp min_idx = 0;
+ npy_intp max_idx = arr_len;
+ const char *last_key = key;
+
+ for (; key_len > 0; key_len--, key += key_str, ret += ret_str) {
+ /*
+ * Updating only one of the indices based on the previous key
+ * gives the search a big boost when keys are sorted, but slightly
+ * slows down things for purely random ones.
+ */
+ if (compare(last_key, key, cmp) @CMP@ 0) {
+ max_idx = arr_len;
+ }
+ else {
+ min_idx = 0;
+ max_idx = (max_idx < arr_len) ? (max_idx + 1) : arr_len;
+ }
+
+ last_key = key;
+
+ while (min_idx < max_idx) {
+ const npy_intp mid_idx = min_idx + ((max_idx - min_idx) >> 1);
+ const npy_intp sort_idx = *(npy_intp *)(sort + mid_idx*sort_str);
+ const char *arr_ptr;
+
+ if (sort_idx < 0 || sort_idx >= arr_len) {
+ return -1;
+ }
+
+ arr_ptr = arr + sort_idx*arr_str;
+
+ if (compare(arr_ptr, key, cmp) @CMP@ 0) {
+ min_idx = mid_idx + 1;
+ }
+ else {
+ max_idx = mid_idx;
+ }
+ }
+ *(npy_intp *)ret = min_idx;
+ }
+ return 0;
+}
+
+/**end repeat**/
diff --git a/contrib/python/numpy/py2/numpy/core/src/npysort/heapsort.c.src b/contrib/python/numpy/py2/numpy/core/src/npysort/heapsort.c.src
new file mode 100644
index 00000000000..c2e3b63cbef
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npysort/heapsort.c.src
@@ -0,0 +1,402 @@
+/* -*- c -*- */
+
+/*
+ * The purpose of this module is to add faster sort functions
+ * that are type-specific. This is done by altering the
+ * function table for the builtin descriptors.
+ *
+ * These sorting functions are copied almost directly from numarray
+ * with a few modifications (complex comparisons compare the imaginary
+ * part if the real parts are equal, for example), and the names
+ * are changed.
+ *
+ * The original sorting code is due to Charles R. Harris who wrote
+ * it for numarray.
+ */
+
+/*
+ * Quick sort is usually the fastest, but the worst case scenario can
+ * be slower than the merge and heap sorts. The merge sort requires
+ * extra memory and so for large arrays may not be useful.
+ *
+ * The merge sort is *stable*, meaning that equal elements keep
+ * their original relative order, so it can be used to
+ * implement lexicographic sorting on multiple keys.
+ *
+ * The heap sort is included for completeness.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include <stdlib.h>
+
+#define NOT_USED NPY_UNUSED(unused)
+#define PYA_QS_STACK 100
+#define SMALL_QUICKSORT 15
+#define SMALL_MERGESORT 20
+#define SMALL_STRING 16
+
+
+/*
+ *****************************************************************************
+ ** NUMERIC SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE, DATETIME, TIMEDELTA#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
+ */
+
+int
+heapsort_@suff@(void *start, npy_intp n, void *NOT_USED)
+{
+ @type@ tmp, *a;
+ npy_intp i,j,l;
+
+ /* The array needs to be offset by one for heapsort indexing */
+ a = (@type@ *)start - 1;
+
+ for (l = n>>1; l > 0; --l) {
+ tmp = a[l];
+ for (i = l, j = l<<1; j <= n;) {
+ if (j < n && @TYPE@_LT(a[j], a[j+1])) {
+ j += 1;
+ }
+ if (@TYPE@_LT(tmp, a[j])) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ for (; n > 1;) {
+ tmp = a[n];
+ a[n] = a[1];
+ n -= 1;
+ for (i = 1, j = 2; j <= n;) {
+ if (j < n && @TYPE@_LT(a[j], a[j+1])) {
+ j++;
+ }
+ if (@TYPE@_LT(tmp, a[j])) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ return 0;
+}
+
+
+int
+aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *NOT_USED)
+{
+ @type@ *v = vv;
+ npy_intp *a, i,j,l, tmp;
+ /* The arrays need to be offset by one for heapsort indexing */
+ a = tosort - 1;
+
+ for (l = n>>1; l > 0; --l) {
+ tmp = a[l];
+ for (i = l, j = l<<1; j <= n;) {
+ if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) {
+ j += 1;
+ }
+ if (@TYPE@_LT(v[tmp], v[a[j]])) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ for (; n > 1;) {
+ tmp = a[n];
+ a[n] = a[1];
+ n -= 1;
+ for (i = 1, j = 2; j <= n;) {
+ if (j < n && @TYPE@_LT(v[a[j]], v[a[j+1]])) {
+ j++;
+ }
+ if (@TYPE@_LT(v[tmp], v[a[j]])) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** STRING SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = STRING, UNICODE#
+ * #suff = string, unicode#
+ * #type = npy_char, npy_ucs4#
+ */
+
+int
+heapsort_@suff@(void *start, npy_intp n, void *varr)
+{
+ PyArrayObject *arr = varr;
+ size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+ @type@ *tmp = malloc(PyArray_ITEMSIZE(arr));
+ @type@ *a = (@type@ *)start - len;
+ npy_intp i, j, l;
+
+ if (tmp == NULL) {
+ return -NPY_ENOMEM;
+ }
+
+ for (l = n>>1; l > 0; --l) {
+ @TYPE@_COPY(tmp, a + l*len, len);
+ for (i = l, j = l<<1; j <= n;) {
+ if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len))
+ j += 1;
+ if (@TYPE@_LT(tmp, a + j*len, len)) {
+ @TYPE@_COPY(a + i*len, a + j*len, len);
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ @TYPE@_COPY(a + i*len, tmp, len);
+ }
+
+ for (; n > 1;) {
+ @TYPE@_COPY(tmp, a + n*len, len);
+ @TYPE@_COPY(a + n*len, a + len, len);
+ n -= 1;
+ for (i = 1, j = 2; j <= n;) {
+ if (j < n && @TYPE@_LT(a + j*len, a + (j+1)*len, len))
+ j++;
+ if (@TYPE@_LT(tmp, a + j*len, len)) {
+ @TYPE@_COPY(a + i*len, a + j*len, len);
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ @TYPE@_COPY(a + i*len, tmp, len);
+ }
+
+ free(tmp);
+ return 0;
+}
+
+
+int
+aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *varr)
+{
+ @type@ *v = vv;
+ PyArrayObject *arr = varr;
+ size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+ npy_intp *a, i,j,l, tmp;
+
+ /* The array needs to be offset by one for heapsort indexing */
+ a = tosort - 1;
+
+ for (l = n>>1; l > 0; --l) {
+ tmp = a[l];
+ for (i = l, j = l<<1; j <= n;) {
+ if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len))
+ j += 1;
+ if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ for (; n > 1;) {
+ tmp = a[n];
+ a[n] = a[1];
+ n -= 1;
+ for (i = 1, j = 2; j <= n;) {
+ if (j < n && @TYPE@_LT(v + a[j]*len, v + a[j+1]*len, len))
+ j++;
+ if (@TYPE@_LT(v + tmp*len, v + a[j]*len, len)) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** GENERIC SORT **
+ *****************************************************************************
+ */
+
+
+int
+npy_heapsort(void *start, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ npy_intp elsize = PyArray_ITEMSIZE(arr);
+ PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ char *tmp = malloc(elsize);
+ char *a = (char *)start - elsize;
+ npy_intp i, j, l;
+
+ if (tmp == NULL) {
+ return -NPY_ENOMEM;
+ }
+
+ for (l = num >> 1; l > 0; --l) {
+ GENERIC_COPY(tmp, a + l*elsize, elsize);
+ for (i = l, j = l << 1; j <= num;) {
+ if (j < num && cmp(a + j*elsize, a + (j+1)*elsize, arr) < 0) {
+ ++j;
+ }
+ if (cmp(tmp, a + j*elsize, arr) < 0) {
+ GENERIC_COPY(a + i*elsize, a + j*elsize, elsize);
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ GENERIC_COPY(a + i*elsize, tmp, elsize);
+ }
+
+ for (; num > 1;) {
+ GENERIC_COPY(tmp, a + num*elsize, elsize);
+ GENERIC_COPY(a + num*elsize, a + elsize, elsize);
+ num -= 1;
+ for (i = 1, j = 2; j <= num;) {
+ if (j < num && cmp(a + j*elsize, a + (j+1)*elsize, arr) < 0) {
+ ++j;
+ }
+ if (cmp(tmp, a + j*elsize, arr) < 0) {
+ GENERIC_COPY(a + i*elsize, a + j*elsize, elsize);
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ GENERIC_COPY(a + i*elsize, tmp, elsize);
+ }
+
+ free(tmp);
+ return 0;
+}
+
+
+int
+npy_aheapsort(void *vv, npy_intp *tosort, npy_intp n, void *varr)
+{
+ char *v = vv;
+ PyArrayObject *arr = varr;
+ npy_intp elsize = PyArray_ITEMSIZE(arr);
+ PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ npy_intp *a, i, j, l, tmp;
+
+ /* The array needs to be offset by one for heapsort indexing */
+ a = tosort - 1;
+
+ for (l = n >> 1; l > 0; --l) {
+ tmp = a[l];
+ for (i = l, j = l<<1; j <= n;) {
+ if (j < n && cmp(v + a[j]*elsize, v + a[j+1]*elsize, arr) < 0) {
+ ++j;
+ }
+ if (cmp(v + tmp*elsize, v + a[j]*elsize, arr) < 0) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ for (; n > 1;) {
+ tmp = a[n];
+ a[n] = a[1];
+ n -= 1;
+ for (i = 1, j = 2; j <= n;) {
+ if (j < n && cmp(v + a[j]*elsize, v + a[j+1]*elsize, arr) < 0) {
+ ++j;
+ }
+ if (cmp(v + tmp*elsize, v + a[j]*elsize, arr) < 0) {
+ a[i] = a[j];
+ i = j;
+ j += j;
+ }
+ else {
+ break;
+ }
+ }
+ a[i] = tmp;
+ }
+
+ return 0;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/npysort/mergesort.c.src b/contrib/python/numpy/py2/numpy/core/src/npysort/mergesort.c.src
new file mode 100644
index 00000000000..6f659617a73
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npysort/mergesort.c.src
@@ -0,0 +1,511 @@
+/* -*- c -*- */
+
+/*
+ * The purpose of this module is to add faster sort functions
+ * that are type-specific. This is done by altering the
+ * function table for the builtin descriptors.
+ *
+ * These sorting functions are copied almost directly from numarray
+ * with a few modifications (complex comparisons compare the imaginary
+ * part if the real parts are equal, for example), and the names
+ * are changed.
+ *
+ * The original sorting code is due to Charles R. Harris who wrote
+ * it for numarray.
+ */
+
+/*
+ * Quick sort is usually the fastest, but the worst case scenario can
+ * be slower than the merge and heap sorts. The merge sort requires
+ * extra memory and so for large arrays may not be useful.
+ *
+ * The merge sort is *stable*, meaning that equal elements keep
+ * their original relative order, so it can be used to
+ * implement lexicographic sorting on multiple keys.
+ *
+ * The heap sort is included for completeness.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include <stdlib.h>
+
+#define NOT_USED NPY_UNUSED(unused)
+#define PYA_QS_STACK 100
+#define SMALL_QUICKSORT 15
+#define SMALL_MERGESORT 20
+#define SMALL_STRING 16
+
+
+/*
+ *****************************************************************************
+ ** NUMERIC SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE, DATETIME, TIMEDELTA#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
+ */
+
+static void
+mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw)
+{
+ @type@ vp, *pi, *pj, *pk, *pm;
+
+ if (pr - pl > SMALL_MERGESORT) {
+ /* merge sort */
+ pm = pl + ((pr - pl) >> 1);
+ mergesort0_@suff@(pl, pm, pw);
+ mergesort0_@suff@(pm, pr, pw);
+ for (pi = pw, pj = pl; pj < pm;) {
+ *pi++ = *pj++;
+ }
+ pi = pw + (pm - pl);
+ pj = pw;
+ pk = pl;
+ while (pj < pi && pm < pr) {
+ if (@TYPE@_LT(*pm, *pj)) {
+ *pk++ = *pm++;
+ }
+ else {
+ *pk++ = *pj++;
+ }
+ }
+ while(pj < pi) {
+ *pk++ = *pj++;
+ }
+ }
+ else {
+ /* insertion sort */
+ for (pi = pl + 1; pi < pr; ++pi) {
+ vp = *pi;
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && @TYPE@_LT(vp, *pk)) {
+ *pj-- = *pk--;
+ }
+ *pj = vp;
+ }
+ }
+}
+
+
+int
+mergesort_@suff@(void *start, npy_intp num, void *NOT_USED)
+{
+ @type@ *pl, *pr, *pw;
+
+ pl = start;
+ pr = pl + num;
+ pw = malloc((num/2) * sizeof(@type@));
+ if (pw == NULL) {
+ return -NPY_ENOMEM;
+ }
+ mergesort0_@suff@(pl, pr, pw);
+
+ free(pw);
+ return 0;
+}
+
+
+static void
+amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw)
+{
+ @type@ vp;
+ npy_intp vi, *pi, *pj, *pk, *pm;
+
+ if (pr - pl > SMALL_MERGESORT) {
+ /* merge sort */
+ pm = pl + ((pr - pl) >> 1);
+ amergesort0_@suff@(pl, pm, v, pw);
+ amergesort0_@suff@(pm, pr, v, pw);
+ for (pi = pw, pj = pl; pj < pm;) {
+ *pi++ = *pj++;
+ }
+ pi = pw + (pm - pl);
+ pj = pw;
+ pk = pl;
+ while (pj < pi && pm < pr) {
+ if (@TYPE@_LT(v[*pm], v[*pj])) {
+ *pk++ = *pm++;
+ }
+ else {
+ *pk++ = *pj++;
+ }
+ }
+ while(pj < pi) {
+ *pk++ = *pj++;
+ }
+ }
+ else {
+ /* insertion sort */
+ for (pi = pl + 1; pi < pr; ++pi) {
+ vi = *pi;
+ vp = v[vi];
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && @TYPE@_LT(vp, v[*pk])) {
+ *pj-- = *pk--;
+ }
+ *pj = vi;
+ }
+ }
+}
+
+
+int
+amergesort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *NOT_USED)
+{
+ npy_intp *pl, *pr, *pw;
+
+ pl = tosort;
+ pr = pl + num;
+ pw = malloc((num/2) * sizeof(npy_intp));
+ if (pw == NULL) {
+ return -NPY_ENOMEM;
+ }
+ amergesort0_@suff@(pl, pr, v, pw);
+ free(pw);
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** STRING SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = STRING, UNICODE#
+ * #suff = string, unicode#
+ * #type = npy_char, npy_ucs4#
+ */
+
+static void
+mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len)
+{
+ @type@ *pi, *pj, *pk, *pm;
+
+ if ((size_t)(pr - pl) > SMALL_MERGESORT*len) {
+ /* merge sort */
+ pm = pl + (((pr - pl)/len) >> 1)*len;
+ mergesort0_@suff@(pl, pm, pw, vp, len);
+ mergesort0_@suff@(pm, pr, pw, vp, len);
+ @TYPE@_COPY(pw, pl, pm - pl);
+ pi = pw + (pm - pl);
+ pj = pw;
+ pk = pl;
+ while (pj < pi && pm < pr) {
+ if (@TYPE@_LT(pm, pj, len)) {
+ @TYPE@_COPY(pk, pm, len);
+ pm += len;
+ pk += len;
+ }
+ else {
+ @TYPE@_COPY(pk, pj, len);
+ pj += len;
+ pk += len;
+ }
+ }
+ @TYPE@_COPY(pk, pj, pi - pj);
+ }
+ else {
+ /* insertion sort */
+ for (pi = pl + len; pi < pr; pi += len) {
+ @TYPE@_COPY(vp, pi, len);
+ pj = pi;
+ pk = pi - len;
+ while (pj > pl && @TYPE@_LT(vp, pk, len)) {
+ @TYPE@_COPY(pj, pk, len);
+ pj -= len;
+ pk -= len;
+ }
+ @TYPE@_COPY(pj, vp, len);
+ }
+ }
+}
+
+
+int
+mergesort_@suff@(void *start, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ size_t elsize = PyArray_ITEMSIZE(arr);
+ size_t len = elsize / sizeof(@type@);
+ @type@ *pl, *pr, *pw, *vp;
+ int err = 0;
+
+ /* Items that have zero size don't make sense to sort */
+ if (elsize == 0) {
+ return 0;
+ }
+
+ pl = start;
+ pr = pl + num*len;
+ pw = malloc((num/2) * elsize);
+ if (pw == NULL) {
+ err = -NPY_ENOMEM;
+ goto fail_0;
+ }
+ vp = malloc(elsize);
+ if (vp == NULL) {
+ err = -NPY_ENOMEM;
+ goto fail_1;
+ }
+ mergesort0_@suff@(pl, pr, pw, vp, len);
+
+ free(vp);
+fail_1:
+ free(pw);
+fail_0:
+ return err;
+}
+
+
+static void
+amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, size_t len)
+{
+ @type@ *vp;
+ npy_intp vi, *pi, *pj, *pk, *pm;
+
+ if (pr - pl > SMALL_MERGESORT) {
+ /* merge sort */
+ pm = pl + ((pr - pl) >> 1);
+ amergesort0_@suff@(pl, pm, v, pw, len);
+ amergesort0_@suff@(pm, pr, v, pw, len);
+ for (pi = pw, pj = pl; pj < pm;) {
+ *pi++ = *pj++;
+ }
+ pi = pw + (pm - pl);
+ pj = pw;
+ pk = pl;
+ while (pj < pi && pm < pr) {
+ if (@TYPE@_LT(v + (*pm)*len, v + (*pj)*len, len)) {
+ *pk++ = *pm++;
+ }
+ else {
+ *pk++ = *pj++;
+ }
+ }
+ while (pj < pi) {
+ *pk++ = *pj++;
+ }
+ }
+ else {
+ /* insertion sort */
+ for (pi = pl + 1; pi < pr; ++pi) {
+ vi = *pi;
+ vp = v + vi*len;
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
+ *pj-- = *pk--;
+ }
+ *pj = vi;
+ }
+ }
+}
+
+
+int
+amergesort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ size_t elsize = PyArray_ITEMSIZE(arr);
+ size_t len = elsize / sizeof(@type@);
+ npy_intp *pl, *pr, *pw;
+
+ /* Items that have zero size don't make sense to sort */
+ if (elsize == 0) {
+ return 0;
+ }
+
+ pl = tosort;
+ pr = pl + num;
+ pw = malloc((num/2) * sizeof(npy_intp));
+ if (pw == NULL) {
+ return -NPY_ENOMEM;
+ }
+ amergesort0_@suff@(pl, pr, v, pw, len);
+ free(pw);
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** GENERIC SORT **
+ *****************************************************************************
+ */
+
+
+static void
+npy_mergesort0(char *pl, char *pr, char *pw, char *vp, npy_intp elsize,
+ PyArray_CompareFunc *cmp, PyArrayObject *arr)
+{
+ char *pi, *pj, *pk, *pm;
+
+ if (pr - pl > SMALL_MERGESORT*elsize) {
+ /* merge sort */
+ pm = pl + (((pr - pl)/elsize) >> 1)*elsize;
+ npy_mergesort0(pl, pm, pw, vp, elsize, cmp, arr);
+ npy_mergesort0(pm, pr, pw, vp, elsize, cmp, arr);
+ GENERIC_COPY(pw, pl, pm - pl);
+ pi = pw + (pm - pl);
+ pj = pw;
+ pk = pl;
+ while (pj < pi && pm < pr) {
+ if (cmp(pm, pj, arr) < 0) {
+ GENERIC_COPY(pk, pm, elsize);
+ pm += elsize;
+ pk += elsize;
+ }
+ else {
+ GENERIC_COPY(pk, pj, elsize);
+ pj += elsize;
+ pk += elsize;
+ }
+ }
+ GENERIC_COPY(pk, pj, pi - pj);
+ }
+ else {
+ /* insertion sort */
+ for (pi = pl + elsize; pi < pr; pi += elsize) {
+ GENERIC_COPY(vp, pi, elsize);
+ pj = pi;
+ pk = pi - elsize;
+ while (pj > pl && cmp(vp, pk, arr) < 0) {
+ GENERIC_COPY(pj, pk, elsize);
+ pj -= elsize;
+ pk -= elsize;
+ }
+ GENERIC_COPY(pj, vp, elsize);
+ }
+ }
+}
+
+
+int
+npy_mergesort(void *start, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ npy_intp elsize = PyArray_ITEMSIZE(arr);
+ PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ char *pl = start;
+ char *pr = pl + num*elsize;
+ char *pw;
+ char *vp;
+ int err = -NPY_ENOMEM;
+
+ /* Items that have zero size don't make sense to sort */
+ if (elsize == 0) {
+ return 0;
+ }
+
+ pw = malloc((num >> 1) *elsize);
+ vp = malloc(elsize);
+
+ if (pw != NULL && vp != NULL) {
+ npy_mergesort0(pl, pr, pw, vp, elsize, cmp, arr);
+ err = 0;
+ }
+
+ free(vp);
+ free(pw);
+
+ return err;
+}
+
+
+static void
+npy_amergesort0(npy_intp *pl, npy_intp *pr, char *v, npy_intp *pw,
+ npy_intp elsize, PyArray_CompareFunc *cmp, PyArrayObject *arr)
+{
+ char *vp;
+ npy_intp vi, *pi, *pj, *pk, *pm;
+
+ if (pr - pl > SMALL_MERGESORT) {
+ /* merge sort */
+ pm = pl + ((pr - pl) >> 1);
+ npy_amergesort0(pl, pm, v, pw, elsize, cmp, arr);
+ npy_amergesort0(pm, pr, v, pw, elsize, cmp, arr);
+ for (pi = pw, pj = pl; pj < pm;) {
+ *pi++ = *pj++;
+ }
+ pi = pw + (pm - pl);
+ pj = pw;
+ pk = pl;
+ while (pj < pi && pm < pr) {
+ if (cmp(v + (*pm)*elsize, v + (*pj)*elsize, arr) < 0) {
+ *pk++ = *pm++;
+ }
+ else {
+ *pk++ = *pj++;
+ }
+ }
+ while (pj < pi) {
+ *pk++ = *pj++;
+ }
+ }
+ else {
+ /* insertion sort */
+ for (pi = pl + 1; pi < pr; ++pi) {
+ vi = *pi;
+ vp = v + vi*elsize;
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && cmp(vp, v + (*pk)*elsize, arr) < 0) {
+ *pj-- = *pk--;
+ }
+ *pj = vi;
+ }
+ }
+}
+
+
+int
+npy_amergesort(void *v, npy_intp *tosort, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ npy_intp elsize = PyArray_ITEMSIZE(arr);
+ PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ npy_intp *pl, *pr, *pw;
+
+ /* Items that have zero size don't make sense to sort */
+ if (elsize == 0) {
+ return 0;
+ }
+
+ pl = tosort;
+ pr = pl + num;
+ pw = malloc((num >> 1) * sizeof(npy_intp));
+ if (pw == NULL) {
+ return -NPY_ENOMEM;
+ }
+ npy_amergesort0(pl, pr, v, pw, elsize, cmp, arr);
+ free(pw);
+
+ return 0;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/npysort/quicksort.c.src b/contrib/python/numpy/py2/numpy/core/src/npysort/quicksort.c.src
new file mode 100644
index 00000000000..49a2c490680
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npysort/quicksort.c.src
@@ -0,0 +1,634 @@
+/* -*- c -*- */
+
+/*
+ * The purpose of this module is to add faster sort functions
+ * that are type-specific. This is done by altering the
+ * function table for the builtin descriptors.
+ *
+ * These sorting functions are copied almost directly from numarray
+ * with a few modifications (complex comparisons compare the imaginary
+ * part if the real parts are equal, for example), and the names
+ * are changed.
+ *
+ * The original sorting code is due to Charles R. Harris who wrote
+ * it for numarray.
+ */
+
+/*
+ * Quick sort is usually the fastest, but the worst case scenario is O(N^2) so
+ * the code switches to the O(NlogN) worst case heapsort if not enough progress
+ * is made on the large side of the two quicksort partitions. This improves the
+ * worst case while still retaining the speed of quicksort for the common case.
+ * This is a variant known as introsort.
+ *
+ *
+ * def introsort(lower, higher, recursion_limit=log2(higher - lower + 1) * 2):
+ * # sort remainder with heapsort if we are not making enough progress
+ * # we arbitrarily choose 2 * log(n) as the cutoff point
+ * if recursion_limit < 0:
+ * heapsort(lower, higher)
+ * return
+ *
+ * if lower < higher:
+ * pivot_pos = partition(lower, higher)
+ * # recurse into smaller first and leave larger on stack
+ * # this limits the required stack space
+ * if (pivot_pos - lower > higher - pivot_pos):
+ * quicksort(pivot_pos + 1, higher, recursion_limit - 1)
+ * quicksort(lower, pivot_pos, recursion_limit - 1)
+ * else:
+ * quicksort(lower, pivot_pos, recursion_limit - 1)
+ * quicksort(pivot_pos + 1, higher, recursion_limit - 1)
+ *
+ *
+ * The code below implements this converted to an iteration. As an additional
+ * minor optimization it skips the recursion depth check on the smaller
+ * partition, since that partition is always less than half of the remaining
+ * data and will thus terminate quickly enough.
+ */
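The recursion budget used below is cdepth = npy_get_msb(num) * 2, i.e. roughly 2 * floor(log2(n)) badly balanced partitions before heapsort takes over. A hedged standalone sketch of that arithmetic (the local `msb` helper stands in for npy_get_msb):

    #include <stdio.h>

    static int msb(unsigned long n)            /* floor(log2(n)) for n > 0 */
    {
        int b = -1;
        while (n) { n >>= 1; b++; }
        return b;
    }

    int main(void)
    {
        unsigned long num = 1000000;
        printf("depth budget for %lu elements: %d\n", num, msb(num) * 2); /* 38 */
        return 0;
    }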
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include <stdlib.h>
+
+#define NOT_USED NPY_UNUSED(unused)
+/*
+ * pushing largest partition has upper bound of log2(n) space
+ * we store two pointers each time
+ */
+#define PYA_QS_STACK (NPY_BITSOF_INTP * 2)
+#define SMALL_QUICKSORT 15
+#define SMALL_MERGESORT 20
+#define SMALL_STRING 16
+
+
+/*
+ *****************************************************************************
+ ** NUMERIC SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE, DATETIME, TIMEDELTA#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble, datetime, timedelta#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
+ */
+
+int
+quicksort_@suff@(void *start, npy_intp num, void *NOT_USED)
+{
+ @type@ vp;
+ @type@ *pl = start;
+ @type@ *pr = pl + num - 1;
+ @type@ *stack[PYA_QS_STACK];
+ @type@ **sptr = stack;
+ @type@ *pm, *pi, *pj, *pk;
+ int depth[PYA_QS_STACK];
+ int * psdepth = depth;
+ int cdepth = npy_get_msb(num) * 2;
+
+ for (;;) {
+ if (NPY_UNLIKELY(cdepth < 0)) {
+ heapsort_@suff@(pl, pr - pl + 1, NULL);
+ goto stack_pop;
+ }
+ while ((pr - pl) > SMALL_QUICKSORT) {
+ /* quicksort partition */
+ pm = pl + ((pr - pl) >> 1);
+ if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
+ if (@TYPE@_LT(*pr, *pm)) @TYPE@_SWAP(*pr, *pm);
+ if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
+ vp = *pm;
+ pi = pl;
+ pj = pr - 1;
+ @TYPE@_SWAP(*pm, *pj);
+ for (;;) {
+ do ++pi; while (@TYPE@_LT(*pi, vp));
+ do --pj; while (@TYPE@_LT(vp, *pj));
+ if (pi >= pj) {
+ break;
+ }
+ @TYPE@_SWAP(*pi,*pj);
+ }
+ pk = pr - 1;
+ @TYPE@_SWAP(*pi, *pk);
+ /* push largest partition on stack */
+ if (pi - pl < pr - pi) {
+ *sptr++ = pi + 1;
+ *sptr++ = pr;
+ pr = pi - 1;
+ }
+ else {
+ *sptr++ = pl;
+ *sptr++ = pi - 1;
+ pl = pi + 1;
+ }
+ *psdepth++ = --cdepth;
+ }
+
+ /* insertion sort */
+ for (pi = pl + 1; pi <= pr; ++pi) {
+ vp = *pi;
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && @TYPE@_LT(vp, *pk)) {
+ *pj-- = *pk--;
+ }
+ *pj = vp;
+ }
+stack_pop:
+ if (sptr == stack) {
+ break;
+ }
+ pr = *(--sptr);
+ pl = *(--sptr);
+ cdepth = *(--psdepth);
+ }
+
+ return 0;
+}
+
+
+int
+aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *NOT_USED)
+{
+ @type@ *v = vv;
+ @type@ vp;
+ npy_intp *pl = tosort;
+ npy_intp *pr = tosort + num - 1;
+ npy_intp *stack[PYA_QS_STACK];
+ npy_intp **sptr = stack;
+ npy_intp *pm, *pi, *pj, *pk, vi;
+ int depth[PYA_QS_STACK];
+ int * psdepth = depth;
+ int cdepth = npy_get_msb(num) * 2;
+
+ for (;;) {
+ if (NPY_UNLIKELY(cdepth < 0)) {
+ aheapsort_@suff@(vv, pl, pr - pl + 1, NULL);
+ goto stack_pop;
+ }
+ while ((pr - pl) > SMALL_QUICKSORT) {
+ /* quicksort partition */
+ pm = pl + ((pr - pl) >> 1);
+ if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl);
+ if (@TYPE@_LT(v[*pr],v[*pm])) INTP_SWAP(*pr, *pm);
+ if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl);
+ vp = v[*pm];
+ pi = pl;
+ pj = pr - 1;
+ INTP_SWAP(*pm, *pj);
+ for (;;) {
+ do ++pi; while (@TYPE@_LT(v[*pi], vp));
+ do --pj; while (@TYPE@_LT(vp, v[*pj]));
+ if (pi >= pj) {
+ break;
+ }
+ INTP_SWAP(*pi, *pj);
+ }
+ pk = pr - 1;
+ INTP_SWAP(*pi,*pk);
+ /* push largest partition on stack */
+ if (pi - pl < pr - pi) {
+ *sptr++ = pi + 1;
+ *sptr++ = pr;
+ pr = pi - 1;
+ }
+ else {
+ *sptr++ = pl;
+ *sptr++ = pi - 1;
+ pl = pi + 1;
+ }
+ *psdepth++ = --cdepth;
+ }
+
+ /* insertion sort */
+ for (pi = pl + 1; pi <= pr; ++pi) {
+ vi = *pi;
+ vp = v[vi];
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && @TYPE@_LT(vp, v[*pk])) {
+ *pj-- = *pk--;
+ }
+ *pj = vi;
+ }
+stack_pop:
+ if (sptr == stack) {
+ break;
+ }
+ pr = *(--sptr);
+ pl = *(--sptr);
+ cdepth = *(--psdepth);
+ }
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** STRING SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = STRING, UNICODE#
+ * #suff = string, unicode#
+ * #type = npy_char, npy_ucs4#
+ */
+
+int
+quicksort_@suff@(void *start, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ const size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+ @type@ *vp;
+ @type@ *pl = start;
+ @type@ *pr = pl + (num - 1)*len;
+ @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk;
+ int depth[PYA_QS_STACK];
+ int * psdepth = depth;
+ int cdepth = npy_get_msb(num) * 2;
+
+ /* Items that have zero size don't make sense to sort */
+ if (len == 0) {
+ return 0;
+ }
+
+ vp = malloc(PyArray_ITEMSIZE(arr));
+ if (vp == NULL) {
+ return -NPY_ENOMEM;
+ }
+
+ for (;;) {
+ if (NPY_UNLIKELY(cdepth < 0)) {
+ heapsort_@suff@(pl, (pr - pl) / len + 1, varr);
+ goto stack_pop;
+ }
+ while ((size_t)(pr - pl) > SMALL_QUICKSORT*len) {
+ /* quicksort partition */
+ pm = pl + (((pr - pl)/len) >> 1)*len;
+ if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
+ if (@TYPE@_LT(pr, pm, len)) @TYPE@_SWAP(pr, pm, len);
+ if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
+ @TYPE@_COPY(vp, pm, len);
+ pi = pl;
+ pj = pr - len;
+ @TYPE@_SWAP(pm, pj, len);
+ for (;;) {
+ do pi += len; while (@TYPE@_LT(pi, vp, len));
+ do pj -= len; while (@TYPE@_LT(vp, pj, len));
+ if (pi >= pj) {
+ break;
+ }
+ @TYPE@_SWAP(pi, pj, len);
+ }
+ pk = pr - len;
+ @TYPE@_SWAP(pi, pk, len);
+ /* push largest partition on stack */
+ if (pi - pl < pr - pi) {
+ *sptr++ = pi + len;
+ *sptr++ = pr;
+ pr = pi - len;
+ }
+ else {
+ *sptr++ = pl;
+ *sptr++ = pi - len;
+ pl = pi + len;
+ }
+ *psdepth++ = --cdepth;
+ }
+
+ /* insertion sort */
+ for (pi = pl + len; pi <= pr; pi += len) {
+ @TYPE@_COPY(vp, pi, len);
+ pj = pi;
+ pk = pi - len;
+ while (pj > pl && @TYPE@_LT(vp, pk, len)) {
+ @TYPE@_COPY(pj, pk, len);
+ pj -= len;
+ pk -= len;
+ }
+ @TYPE@_COPY(pj, vp, len);
+ }
+stack_pop:
+ if (sptr == stack) {
+ break;
+ }
+ pr = *(--sptr);
+ pl = *(--sptr);
+ cdepth = *(--psdepth);
+ }
+
+ free(vp);
+ return 0;
+}
+
+
+int
+aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *varr)
+{
+ @type@ *v = vv;
+ PyArrayObject *arr = varr;
+ size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
+ @type@ *vp;
+ npy_intp *pl = tosort;
+ npy_intp *pr = tosort + num - 1;
+ npy_intp *stack[PYA_QS_STACK];
+ npy_intp **sptr=stack;
+ npy_intp *pm, *pi, *pj, *pk, vi;
+ int depth[PYA_QS_STACK];
+ int * psdepth = depth;
+ int cdepth = npy_get_msb(num) * 2;
+
+ /* Items that have zero size don't make sense to sort */
+ if (len == 0) {
+ return 0;
+ }
+
+ for (;;) {
+ if (NPY_UNLIKELY(cdepth < 0)) {
+ aheapsort_@suff@(vv, pl, pr - pl + 1, varr);
+ goto stack_pop;
+ }
+ while ((pr - pl) > SMALL_QUICKSORT) {
+ /* quicksort partition */
+ pm = pl + ((pr - pl) >> 1);
+ if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
+ if (@TYPE@_LT(v + (*pr)*len, v + (*pm)*len, len)) INTP_SWAP(*pr, *pm);
+ if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
+ vp = v + (*pm)*len;
+ pi = pl;
+ pj = pr - 1;
+ INTP_SWAP(*pm,*pj);
+ for (;;) {
+ do ++pi; while (@TYPE@_LT(v + (*pi)*len, vp, len));
+ do --pj; while (@TYPE@_LT(vp, v + (*pj)*len, len));
+ if (pi >= pj) {
+ break;
+ }
+ INTP_SWAP(*pi,*pj);
+ }
+ pk = pr - 1;
+ INTP_SWAP(*pi,*pk);
+ /* push largest partition on stack */
+ if (pi - pl < pr - pi) {
+ *sptr++ = pi + 1;
+ *sptr++ = pr;
+ pr = pi - 1;
+ }
+ else {
+ *sptr++ = pl;
+ *sptr++ = pi - 1;
+ pl = pi + 1;
+ }
+ *psdepth++ = --cdepth;
+ }
+
+ /* insertion sort */
+ for (pi = pl + 1; pi <= pr; ++pi) {
+ vi = *pi;
+ vp = v + vi*len;
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
+ *pj-- = *pk--;
+ }
+ *pj = vi;
+ }
+stack_pop:
+ if (sptr == stack) {
+ break;
+ }
+ pr = *(--sptr);
+ pl = *(--sptr);
+ cdepth = *(--psdepth);
+ }
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** GENERIC SORT **
+ *****************************************************************************
+ */
+
+
+int
+npy_quicksort(void *start, npy_intp num, void *varr)
+{
+ PyArrayObject *arr = varr;
+ npy_intp elsize = PyArray_ITEMSIZE(arr);
+ PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ char *vp;
+ char *pl = start;
+ char *pr = pl + (num - 1)*elsize;
+ char *stack[PYA_QS_STACK];
+ char **sptr = stack;
+ char *pm, *pi, *pj, *pk;
+ int depth[PYA_QS_STACK];
+ int * psdepth = depth;
+ int cdepth = npy_get_msb(num) * 2;
+
+ /* Items that have zero size don't make sense to sort */
+ if (elsize == 0) {
+ return 0;
+ }
+
+ vp = malloc(elsize);
+ if (vp == NULL) {
+ return -NPY_ENOMEM;
+ }
+
+ for (;;) {
+ if (NPY_UNLIKELY(cdepth < 0)) {
+ npy_heapsort(pl, (pr - pl) / elsize + 1, varr);
+ goto stack_pop;
+ }
+ while(pr - pl > SMALL_QUICKSORT*elsize) {
+ /* quicksort partition */
+ pm = pl + (((pr - pl) / elsize) >> 1) * elsize;
+ if (cmp(pm, pl, arr) < 0) {
+ GENERIC_SWAP(pm, pl, elsize);
+ }
+ if (cmp(pr, pm, arr) < 0) {
+ GENERIC_SWAP(pr, pm, elsize);
+ }
+ if (cmp(pm, pl, arr) < 0) {
+ GENERIC_SWAP(pm, pl, elsize);
+ }
+ GENERIC_COPY(vp, pm, elsize);
+ pi = pl;
+ pj = pr - elsize;
+ GENERIC_SWAP(pm, pj, elsize);
+ /*
+ * Generic comparisons may be buggy, so don't rely on the sentinels
+ * to keep the pointers from going out of bounds.
+ */
+ for (;;) {
+ do {
+ pi += elsize;
+ } while (cmp(pi, vp, arr) < 0 && pi < pj);
+ do {
+ pj -= elsize;
+ } while (cmp(vp, pj, arr) < 0 && pi < pj);
+ if (pi >= pj) {
+ break;
+ }
+ GENERIC_SWAP(pi, pj, elsize);
+ }
+ pk = pr - elsize;
+ GENERIC_SWAP(pi, pk, elsize);
+ /* push largest partition on stack */
+ if (pi - pl < pr - pi) {
+ *sptr++ = pi + elsize;
+ *sptr++ = pr;
+ pr = pi - elsize;
+ }
+ else {
+ *sptr++ = pl;
+ *sptr++ = pi - elsize;
+ pl = pi + elsize;
+ }
+ *psdepth++ = --cdepth;
+ }
+
+ /* insertion sort */
+ for (pi = pl + elsize; pi <= pr; pi += elsize) {
+ GENERIC_COPY(vp, pi, elsize);
+ pj = pi;
+ pk = pi - elsize;
+ while (pj > pl && cmp(vp, pk, arr) < 0) {
+ GENERIC_COPY(pj, pk, elsize);
+ pj -= elsize;
+ pk -= elsize;
+ }
+ GENERIC_COPY(pj, vp, elsize);
+ }
+stack_pop:
+ if (sptr == stack) {
+ break;
+ }
+ pr = *(--sptr);
+ pl = *(--sptr);
+ cdepth = *(--psdepth);
+ }
+
+ free(vp);
+ return 0;
+}
+
+
+int
+npy_aquicksort(void *vv, npy_intp* tosort, npy_intp num, void *varr)
+{
+ char *v = vv;
+ PyArrayObject *arr = varr;
+ npy_intp elsize = PyArray_ITEMSIZE(arr);
+ PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ char *vp;
+ npy_intp *pl = tosort;
+ npy_intp *pr = tosort + num - 1;
+ npy_intp *stack[PYA_QS_STACK];
+ npy_intp **sptr = stack;
+ npy_intp *pm, *pi, *pj, *pk, vi;
+ int depth[PYA_QS_STACK];
+ int * psdepth = depth;
+ int cdepth = npy_get_msb(num) * 2;
+
+ /* Items that have zero size don't make sense to sort */
+ if (elsize == 0) {
+ return 0;
+ }
+
+ for (;;) {
+ if (NPY_UNLIKELY(cdepth < 0)) {
+ npy_aheapsort(vv, pl, pr - pl + 1, varr);
+ goto stack_pop;
+ }
+ while ((pr - pl) > SMALL_QUICKSORT) {
+ /* quicksort partition */
+ pm = pl + ((pr - pl) >> 1);
+ if (cmp(v + (*pm)*elsize, v + (*pl)*elsize, arr) < 0) {
+ INTP_SWAP(*pm, *pl);
+ }
+ if (cmp(v + (*pr)*elsize, v + (*pm)*elsize, arr) < 0) {
+ INTP_SWAP(*pr, *pm);
+ }
+ if (cmp(v + (*pm)*elsize, v + (*pl)*elsize, arr) < 0) {
+ INTP_SWAP(*pm, *pl);
+ }
+ vp = v + (*pm)*elsize;
+ pi = pl;
+ pj = pr - 1;
+ INTP_SWAP(*pm,*pj);
+ for (;;) {
+ do {
+ ++pi;
+ } while (cmp(v + (*pi)*elsize, vp, arr) < 0 && pi < pj);
+ do {
+ --pj;
+ } while (cmp(vp, v + (*pj)*elsize, arr) < 0 && pi < pj);
+ if (pi >= pj) {
+ break;
+ }
+ INTP_SWAP(*pi,*pj);
+ }
+ pk = pr - 1;
+ INTP_SWAP(*pi,*pk);
+ /* push largest partition on stack */
+ if (pi - pl < pr - pi) {
+ *sptr++ = pi + 1;
+ *sptr++ = pr;
+ pr = pi - 1;
+ }
+ else {
+ *sptr++ = pl;
+ *sptr++ = pi - 1;
+ pl = pi + 1;
+ }
+ *psdepth++ = --cdepth;
+ }
+
+ /* insertion sort */
+ for (pi = pl + 1; pi <= pr; ++pi) {
+ vi = *pi;
+ vp = v + vi*elsize;
+ pj = pi;
+ pk = pi - 1;
+ while (pj > pl && cmp(vp, v + (*pk)*elsize, arr) < 0) {
+ *pj-- = *pk--;
+ }
+ *pj = vi;
+ }
+stack_pop:
+ if (sptr == stack) {
+ break;
+ }
+ pr = *(--sptr);
+ pl = *(--sptr);
+ cdepth = *(--psdepth);
+ }
+
+ return 0;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/npysort/selection.c.src b/contrib/python/numpy/py2/numpy/core/src/npysort/selection.c.src
new file mode 100644
index 00000000000..1e0934558a5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/npysort/selection.c.src
@@ -0,0 +1,418 @@
+/* -*- c -*- */
+
+/*
+ *
+ * The code is loosely based on the quickselect from
+ * Nicolas Devillard - 1998 public domain
+ * http://ndevilla.free.fr/median/median/
+ *
+ * Quick select with median of 3 pivot is usually the fastest,
+ * but the worst case scenario can be quadratic complexity,
+ * e.g. np.roll(np.arange(x), x / 2)
+ * To avoid this, if it recurses too much it falls back to the
+ * worst-case linear median-of-medians (groups of 5) pivot strategy.
+ */
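For orientation, a hedged standalone quickselect sketch on plain ints (illustrative only: it uses a last-element Lomuto partition and has no median-of-3 or median-of-medians fallback, so unlike the code below its worst case stays quadratic):

    #include <stdio.h>

    static void swap_int(int *a, int *b) { int t = *a; *a = *b; *b = t; }

    /* Rearrange v[lo..hi] (inclusive) so that v[k] holds the kth smallest. */
    static void quickselect(int *v, long lo, long hi, long k)
    {
        while (lo < hi) {
            int pivot = v[hi];
            long store = lo, i;
            for (i = lo; i < hi; i++) {        /* Lomuto partition */
                if (v[i] < pivot) {
                    swap_int(&v[i], &v[store]);
                    store++;
                }
            }
            swap_int(&v[store], &v[hi]);       /* pivot lands at `store` */
            if (k == store) {
                return;
            }
            else if (k < store) {
                hi = store - 1;
            }
            else {
                lo = store + 1;
            }
        }
    }

    int main(void)
    {
        int v[] = {9, 1, 8, 2, 7, 3};
        quickselect(v, 0, 5, 2);
        printf("3rd smallest: %d\n", v[2]);    /* 3 */
        return 0;
    }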
+
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include "numpy/npy_math.h"
+#include "npy_partition.h"
+#include <stdlib.h>
+
+#define NOT_USED NPY_UNUSED(unused)
+
+
+/*
+ *****************************************************************************
+ ** NUMERIC SORTS **
+ *****************************************************************************
+ */
+
+
+static NPY_INLINE void store_pivot(npy_intp pivot, npy_intp kth,
+ npy_intp * pivots, npy_intp * npiv)
+{
+ if (pivots == NULL) {
+ return;
+ }
+
+ /*
+     * If pivot is the requested kth, store it, overwriting other pivots if
+     * required. This must be done so the iterative partition can work without
+     * manually shifting the lower data offset by kth each time.
+ */
+ if (pivot == kth && *npiv == NPY_MAX_PIVOT_STACK) {
+ pivots[*npiv - 1] = pivot;
+ }
+ /*
+ * we only need pivots larger than current kth, larger pivots are not
+ * useful as partitions on smaller kth would reorder the stored pivots
+ */
+ else if (pivot >= kth && *npiv < NPY_MAX_PIVOT_STACK) {
+ pivots[*npiv] = pivot;
+ (*npiv) += 1;
+ }
+}
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
+ * npy_cdouble, npy_clongdouble#
+ * #inexact = 0*11, 1*7#
+ */
+
+static npy_intp
+amedian_of_median5_@suff@(@type@ *v, npy_intp* tosort, const npy_intp num,
+ npy_intp * pivots,
+ npy_intp * npiv);
+
+static npy_intp
+median_of_median5_@suff@(@type@ *v, const npy_intp num,
+ npy_intp * pivots,
+ npy_intp * npiv);
+
+/**begin repeat1
+ * #name = , a#
+ * #idx = , tosort#
+ * #arg = 0, 1#
+ */
+#if @arg@
+/* helper macros to avoid duplication of direct/indirect selection */
+#define IDX(x) tosort[x]
+#define SORTEE(x) tosort[x]
+#define SWAP INTP_SWAP
+#define MEDIAN3_SWAP(v, tosort, low, mid, high) \
+ amedian3_swap_@suff@(v, tosort, low, mid, high)
+#define MEDIAN5(v, tosort, subleft) \
+ amedian5_@suff@(v, tosort + subleft)
+#define UNGUARDED_PARTITION(v, tosort, pivot, ll, hh) \
+ aunguarded_partition_@suff@(v, tosort, pivot, ll, hh)
+#define INTROSELECT(v, tosort, num, kth, pivots, npiv) \
+ aintroselect_@suff@(v, tosort, nmed, nmed / 2, pivots, npiv, NULL)
+#define DUMBSELECT(v, tosort, left, num, kth) \
+ adumb_select_@suff@(v, tosort + left, num, kth)
+#else
+#define IDX(x) (x)
+#define SORTEE(x) v[x]
+#define SWAP @TYPE@_SWAP
+#define MEDIAN3_SWAP(v, tosort, low, mid, high) \
+ median3_swap_@suff@(v, low, mid, high)
+#define MEDIAN5(v, tosort, subleft) \
+ median5_@suff@(v + subleft)
+#define UNGUARDED_PARTITION(v, tosort, pivot, ll, hh) \
+ unguarded_partition_@suff@(v, pivot, ll, hh)
+#define INTROSELECT(v, tosort, num, kth, pivots, npiv) \
+ introselect_@suff@(v, nmed, nmed / 2, pivots, npiv, NULL)
+#define DUMBSELECT(v, tosort, left, num, kth) \
+ dumb_select_@suff@(v + left, num, kth)
+#endif
+
+
+/*
+ * median of 3 pivot strategy
+ * gets min and median and moves median to low and min to low + 1
+ * for efficient partitioning, see unguarded_partition
+ */
+static NPY_INLINE void
+@name@median3_swap_@suff@(@type@ * v,
+#if @arg@
+ npy_intp * tosort,
+#endif
+ npy_intp low, npy_intp mid, npy_intp high)
+{
+ if (@TYPE@_LT(v[IDX(high)], v[IDX(mid)]))
+ SWAP(SORTEE(high), SORTEE(mid));
+ if (@TYPE@_LT(v[IDX(high)], v[IDX(low)]))
+ SWAP(SORTEE(high), SORTEE(low));
+ /* move pivot to low */
+ if (@TYPE@_LT(v[IDX(low)], v[IDX(mid)]))
+ SWAP(SORTEE(low), SORTEE(mid));
+ /* move 3-lowest element to low + 1 */
+ SWAP(SORTEE(mid), SORTEE(low + 1));
+}
+
+
+/* select index of median of five elements */
+static npy_intp @name@median5_@suff@(
+#if @arg@
+ const @type@ * v, npy_intp * tosort
+#else
+ @type@ * v
+#endif
+ )
+{
+ /* could be optimized as we only need the index (no swaps) */
+ if (@TYPE@_LT(v[IDX(1)], v[IDX(0)])) {
+ SWAP(SORTEE(1), SORTEE(0));
+ }
+ if (@TYPE@_LT(v[IDX(4)], v[IDX(3)])) {
+ SWAP(SORTEE(4), SORTEE(3));
+ }
+ if (@TYPE@_LT(v[IDX(3)], v[IDX(0)])) {
+ SWAP(SORTEE(3), SORTEE(0));
+ }
+ if (@TYPE@_LT(v[IDX(4)], v[IDX(1)])) {
+ SWAP(SORTEE(4), SORTEE(1));
+ }
+ if (@TYPE@_LT(v[IDX(2)], v[IDX(1)])) {
+ SWAP(SORTEE(2), SORTEE(1));
+ }
+ if (@TYPE@_LT(v[IDX(3)], v[IDX(2)])) {
+ if (@TYPE@_LT(v[IDX(3)], v[IDX(1)])) {
+ return 1;
+ }
+ else {
+ return 3;
+ }
+ }
+ else {
+ /* v[1] and v[2] swapped into order above */
+ return 2;
+ }
+}
+
+
+/*
+ * partition and return the index where the pivot belongs
+ * the data must have the following property to avoid bound checks:
+ * ll ... hh
+ * lower-than-pivot [x x x x] larger-than-pivot
+ */
+static NPY_INLINE void
+@name@unguarded_partition_@suff@(@type@ * v,
+#if @arg@
+ npy_intp * tosort,
+#endif
+ const @type@ pivot,
+ npy_intp * ll, npy_intp * hh)
+{
+ for (;;) {
+ do (*ll)++; while (@TYPE@_LT(v[IDX(*ll)], pivot));
+ do (*hh)--; while (@TYPE@_LT(pivot, v[IDX(*hh)]));
+
+ if (*hh < *ll)
+ break;
+
+ SWAP(SORTEE(*ll), SORTEE(*hh));
+ }
+}
+
+
+/*
+ * select the median of medians of blocks of 5
+ * if used as the partition pivot it splits the range into at least 30%/70%,
+ * allowing linear-time worst-case quickselect
+ */
+static npy_intp
+@name@median_of_median5_@suff@(@type@ *v,
+#if @arg@
+ npy_intp* tosort,
+#endif
+ const npy_intp num,
+ npy_intp * pivots,
+ npy_intp * npiv)
+{
+ npy_intp i, subleft;
+ npy_intp right = num - 1;
+ npy_intp nmed = (right + 1) / 5;
+ for (i = 0, subleft = 0; i < nmed; i++, subleft += 5) {
+ npy_intp m = MEDIAN5(v, tosort, subleft);
+ SWAP(SORTEE(subleft + m), SORTEE(i));
+ }
+
+ if (nmed > 2)
+ INTROSELECT(v, tosort, nmed, nmed / 2, pivots, npiv);
+ return nmed / 2;
+}
+
+
+/*
+ * N^2 selection, fast only for very small kth
+ * useful for multiple partitions at nearby kth values
+ * (e.g. even-element median, interpolating percentile)
+ */
+static int
+@name@dumb_select_@suff@(@type@ *v,
+#if @arg@
+ npy_intp * tosort,
+#endif
+ npy_intp num, npy_intp kth)
+{
+ npy_intp i;
+ for (i = 0; i <= kth; i++) {
+ npy_intp minidx = i;
+ @type@ minval = v[IDX(i)];
+ npy_intp k;
+ for (k = i + 1; k < num; k++) {
+ if (@TYPE@_LT(v[IDX(k)], minval)) {
+ minidx = k;
+ minval = v[IDX(k)];
+ }
+ }
+ SWAP(SORTEE(i), SORTEE(minidx));
+ }
+
+ return 0;
+}
+
+
+/*
+ * iterative median of 3 quickselect with cutoff to median-of-medians-of5
+ * receives a stack of already computed pivots in v to minimize the
+ * size of the partition in which kth is searched
+ *
+ * area that needs partitioning in [...]
+ * kth 0: [8 7 6 5 4 3 2 1 0] -> med3 partitions elements [4, 2, 0]
+ * 0 1 2 3 4 8 7 5 6 -> pop requested kth -> stack [4, 2]
+ * kth 3: 0 1 2 [3] 4 8 7 5 6 -> stack [4]
+ * kth 5: 0 1 2 3 4 [8 7 5 6] -> stack [6]
+ * kth 8: 0 1 2 3 4 5 6 [8 7] -> stack []
+ *
+ */
+int
+@name@introselect_@suff@(@type@ *v,
+#if @arg@
+ npy_intp* tosort,
+#endif
+ npy_intp num, npy_intp kth,
+ npy_intp * pivots,
+ npy_intp * npiv,
+ void *NOT_USED)
+{
+ npy_intp low = 0;
+ npy_intp high = num - 1;
+ int depth_limit;
+
+ if (npiv == NULL)
+ pivots = NULL;
+
+ while (pivots != NULL && *npiv > 0) {
+ if (pivots[*npiv - 1] > kth) {
+            /* pivot larger than kth; use it as the upper bound */
+ high = pivots[*npiv - 1] - 1;
+ break;
+ }
+ else if (pivots[*npiv - 1] == kth) {
+ /* kth was already found in a previous iteration -> done */
+ return 0;
+ }
+
+ low = pivots[*npiv - 1] + 1;
+
+ /* pop from stack */
+ *npiv -= 1;
+ }
+
+ /*
+ * use a faster O(n*kth) algorithm for very small kth
+ * e.g. for interpolating percentile
+ */
+ if (kth - low < 3) {
+ DUMBSELECT(v, tosort, low, high - low + 1, kth - low);
+ store_pivot(kth, kth, pivots, npiv);
+ return 0;
+ }
+ else if (@inexact@ && kth == num - 1) {
+ /* useful to check if NaN present via partition(d, (x, -1)) */
+ npy_intp k;
+ npy_intp maxidx = low;
+ @type@ maxval = v[IDX(low)];
+ for (k = low + 1; k < num; k++) {
+ if (!@TYPE@_LT(v[IDX(k)], maxval)) {
+ maxidx = k;
+ maxval = v[IDX(k)];
+ }
+ }
+ SWAP(SORTEE(kth), SORTEE(maxidx));
+ return 0;
+ }
+
+ depth_limit = npy_get_msb(num) * 2;
+
+ /* guarantee three elements */
+ for (;low + 1 < high;) {
+ npy_intp ll = low + 1;
+ npy_intp hh = high;
+
+ /*
+         * if we aren't making sufficient progress with median of 3,
+         * fall back to the median-of-medians-of-5 pivot for a linear worst case;
+         * med3 for small sizes is required to do the unguarded partition
+ */
+ if (depth_limit > 0 || hh - ll < 5) {
+ const npy_intp mid = low + (high - low) / 2;
+ /* median of 3 pivot strategy,
+ * swapping for efficient partition */
+ MEDIAN3_SWAP(v, tosort, low, mid, high);
+ }
+ else {
+ npy_intp mid;
+ /* FIXME: always use pivots to optimize this iterative partition */
+#if @arg@
+ mid = ll + amedian_of_median5_@suff@(v, tosort + ll, hh - ll, NULL, NULL);
+#else
+ mid = ll + median_of_median5_@suff@(v + ll, hh - ll, NULL, NULL);
+#endif
+ SWAP(SORTEE(mid), SORTEE(low));
+ /* adapt for the larger partition than med3 pivot */
+ ll--;
+ hh++;
+ }
+
+ depth_limit--;
+
+ /*
+ * find place to put pivot (in low):
+ * previous swapping removes need for bound checks
+ * pivot 3-lowest [x x x] 3-highest
+ */
+ UNGUARDED_PARTITION(v, tosort, v[IDX(low)], &ll, &hh);
+
+ /* move pivot into position */
+ SWAP(SORTEE(low), SORTEE(hh));
+
+ /* kth pivot stored later */
+ if (hh != kth) {
+ store_pivot(hh, kth, pivots, npiv);
+ }
+
+ if (hh >= kth)
+ high = hh - 1;
+ if (hh <= kth)
+ low = ll;
+ }
+
+ /* two elements */
+ if (high == low + 1) {
+ if (@TYPE@_LT(v[IDX(high)], v[IDX(low)])) {
+ SWAP(SORTEE(high), SORTEE(low))
+ }
+ }
+ store_pivot(kth, kth, pivots, npiv);
+
+ return 0;
+}
+
+
+#undef IDX
+#undef SWAP
+#undef SORTEE
+#undef MEDIAN3_SWAP
+#undef MEDIAN5
+#undef UNGUARDED_PARTITION
+#undef INTROSELECT
+#undef DUMBSELECT
+/**end repeat1**/
+
+/**end repeat**/
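The introselect loop above repeatedly partitions around a median-of-3 pivot and
keeps only the side that contains kth, switching to the median-of-medians pivot
when the depth counter runs out. A minimal sketch of that quickselect idea on a
plain int array (illustrative names, not numpy code; the median-of-medians
fallback is replaced by a qsort of the remaining range to keep the sketch short):

#include <stdio.h>
#include <stdlib.h>

static int cmp_int(const void *a, const void *b)
{
    int x = *(const int *)a, y = *(const int *)b;
    return (x > y) - (x < y);
}

static void swap_int(int *a, int *b) { int t = *a; *a = *b; *b = t; }

/* after this call, a[kth] holds the value it would have in sorted order */
static void select_kth(int *a, int n, int kth, int depth)
{
    int lo = 0, hi = n - 1;
    while (lo < hi) {
        int mid, pivot, i, j;
        if (depth-- <= 0) {
            /* degenerate pivots: fall back to a guaranteed method */
            qsort(a + lo, (size_t)(hi - lo + 1), sizeof(int), cmp_int);
            return;
        }
        /* median-of-3 of lo, mid, hi picks the pivot value */
        mid = lo + (hi - lo) / 2;
        if (a[mid] < a[lo]) swap_int(&a[mid], &a[lo]);
        if (a[hi]  < a[mid]) swap_int(&a[hi],  &a[mid]);
        if (a[mid] < a[lo]) swap_int(&a[mid], &a[lo]);
        pivot = a[mid];
        i = lo; j = hi;
        while (i <= j) {
            while (a[i] < pivot) i++;
            while (pivot < a[j]) j--;
            if (i <= j) { swap_int(&a[i], &a[j]); i++; j--; }
        }
        /* keep only the side that contains kth */
        if (kth <= j)      hi = j;
        else if (kth >= i) lo = i;
        else               return;   /* kth is already in its final place */
    }
}

int main(void)
{
    int a[] = {9, 3, 7, 1, 8, 2, 6, 0, 5, 4};
    int n = (int)(sizeof(a) / sizeof(a[0]));
    select_kth(a, n, 4, 8);
    printf("4th smallest: %d\n", a[4]);   /* prints 4 for this input */
    return 0;
}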
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/_operand_flag_tests.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/_operand_flag_tests.c.src
new file mode 100644
index 00000000000..551a9c6329b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/_operand_flag_tests.c.src
@@ -0,0 +1,105 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <Python.h>
+#include <numpy/arrayobject.h>
+#include <numpy/ufuncobject.h>
+#include "numpy/npy_3kcompat.h"
+#include <math.h>
+#include <structmember.h>
+
+
+static PyMethodDef TestMethods[] = {
+ {NULL, NULL, 0, NULL}
+};
+
+
+static void
+inplace_add(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
+{
+ npy_intp i;
+ npy_intp n = dimensions[0];
+ char *in1 = args[0];
+ char *in2 = args[1];
+ npy_intp in1_step = steps[0];
+ npy_intp in2_step = steps[1];
+
+ for (i = 0; i < n; i++) {
+ (*(long *)in1) = *(long*)in1 + *(long*)in2;
+ in1 += in1_step;
+ in2 += in2_step;
+ }
+}
+
+
+/* This is a pointer to the above function */
+PyUFuncGenericFunction funcs[1] = {&inplace_add};
+
+/* These are the input and return dtypes of inplace_add. */
+static char types[2] = {NPY_LONG, NPY_LONG};
+
+static void *data[1] = {NULL};
+
+#if defined(NPY_PY3K)
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_operand_flag_tests",
+ NULL,
+ -1,
+ TestMethods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+
+#define RETVAL m
+PyMODINIT_FUNC PyInit__operand_flag_tests(void)
+{
+#else
+#define RETVAL
+PyMODINIT_FUNC init_operand_flag_tests(void)
+{
+#endif
+ PyObject *m = NULL;
+ PyObject *ufunc;
+
+#if defined(NPY_PY3K)
+ m = PyModule_Create(&moduledef);
+#else
+ m = Py_InitModule("_operand_flag_tests", TestMethods);
+#endif
+ if (m == NULL) {
+ goto fail;
+ }
+
+ import_array();
+ import_umath();
+
+ ufunc = PyUFunc_FromFuncAndData(funcs, data, types, 1, 2, 0,
+ PyUFunc_None, "inplace_add",
+ "inplace_add_docstring", 0);
+
+ /*
+ * Set flags to turn off buffering for first input operand,
+ * so that result can be written back to input operand.
+ */
+ ((PyUFuncObject*)ufunc)->op_flags[0] = NPY_ITER_READWRITE;
+ ((PyUFuncObject*)ufunc)->iter_flags = NPY_ITER_REDUCE_OK;
+ PyModule_AddObject(m, "inplace_add", (PyObject*)ufunc);
+
+ return RETVAL;
+
+fail:
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _operand_flag_tests module.");
+ }
+#if defined(NPY_PY3K)
+ if (m) {
+ Py_DECREF(m);
+ m = NULL;
+ }
+#endif
+ return RETVAL;
+
+}
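inplace_add above follows the standard ufunc inner-loop convention: args[] holds
byte pointers to each operand, steps[] holds byte strides, and dimensions[0] is
the element count. A minimal sketch of that strided-loop convention, driven by
hand so it compiles without numpy headers (ptrdiff_t stands in for npy_intp;
names are illustrative):

#include <stdio.h>
#include <stddef.h>

/* 1-in/1-out strided loop: out[i] = 2 * in[i] */
static void double_loop(char **args, ptrdiff_t *dimensions, ptrdiff_t *steps,
                        void *data)
{
    ptrdiff_t i, n = dimensions[0];
    char *in = args[0], *out = args[1];
    (void)data;
    for (i = 0; i < n; i++) {
        *(double *)out = 2.0 * *(double *)in;
        in += steps[0];              /* advance by the input byte stride */
        out += steps[1];             /* advance by the output byte stride */
    }
}

int main(void)
{
    double in[4] = {1.0, 2.0, 3.0, 4.0}, out[4];
    char *args[2] = {(char *)in, (char *)out};
    ptrdiff_t dims[1] = {4};
    ptrdiff_t steps[2] = {sizeof(double), sizeof(double)};
    int i;
    double_loop(args, dims, steps, NULL);
    for (i = 0; i < 4; i++) {
        printf("%g ", out[i]);       /* prints: 2 4 6 8 */
    }
    printf("\n");
    return 0;
}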
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/_rational_tests.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/_rational_tests.c.src
new file mode 100644
index 00000000000..9e74845df29
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/_rational_tests.c.src
@@ -0,0 +1,1409 @@
+/* Fixed size rational numbers exposed to Python */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <Python.h>
+#include <structmember.h>
+#include <numpy/arrayobject.h>
+#include <numpy/ufuncobject.h>
+#include <numpy/npy_3kcompat.h>
+#include <math.h>
+
+#include "common.h" /* for error_converting */
+
+
+/* Relevant arithmetic exceptions */
+
+/* Uncomment the following line to work around a bug in numpy */
+/* #define ACQUIRE_GIL */
+
+static void
+set_overflow(void) {
+#ifdef ACQUIRE_GIL
+ /* Need to grab the GIL to dodge a bug in numpy */
+ PyGILState_STATE state = PyGILState_Ensure();
+#endif
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_OverflowError,
+ "overflow in rational arithmetic");
+ }
+#ifdef ACQUIRE_GIL
+ PyGILState_Release(state);
+#endif
+}
+
+static void
+set_zero_divide(void) {
+#ifdef ACQUIRE_GIL
+ /* Need to grab the GIL to dodge a bug in numpy */
+ PyGILState_STATE state = PyGILState_Ensure();
+#endif
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ZeroDivisionError,
+ "zero divide in rational arithmetic");
+ }
+#ifdef ACQUIRE_GIL
+ PyGILState_Release(state);
+#endif
+}
+
+/* Integer arithmetic utilities */
+
+static NPY_INLINE npy_int32
+safe_neg(npy_int32 x) {
+ if (x==(npy_int32)1<<31) {
+ set_overflow();
+ }
+ return -x;
+}
+
+static NPY_INLINE npy_int32
+safe_abs32(npy_int32 x) {
+ npy_int32 nx;
+ if (x>=0) {
+ return x;
+ }
+ nx = -x;
+ if (nx<0) {
+ set_overflow();
+ }
+ return nx;
+}
+
+static NPY_INLINE npy_int64
+safe_abs64(npy_int64 x) {
+ npy_int64 nx;
+ if (x>=0) {
+ return x;
+ }
+ nx = -x;
+ if (nx<0) {
+ set_overflow();
+ }
+ return nx;
+}
+
+static NPY_INLINE npy_int64
+gcd(npy_int64 x, npy_int64 y) {
+ x = safe_abs64(x);
+ y = safe_abs64(y);
+ if (x < y) {
+ npy_int64 t = x;
+ x = y;
+ y = t;
+ }
+ while (y) {
+ npy_int64 t;
+ x = x%y;
+ t = x;
+ x = y;
+ y = t;
+ }
+ return x;
+}
+
+static NPY_INLINE npy_int64
+lcm(npy_int64 x, npy_int64 y) {
+ npy_int64 lcm;
+ if (!x || !y) {
+ return 0;
+ }
+ x /= gcd(x,y);
+ lcm = x*y;
+ if (lcm/y!=x) {
+ set_overflow();
+ }
+ return safe_abs64(lcm);
+}
+
+/* Fixed precision rational numbers */
+
+typedef struct {
+ /* numerator */
+ npy_int32 n;
+ /*
+ * denominator minus one: numpy.zeros() uses memset(0) for non-object
+ * types, so need to ensure that rational(0) has all zero bytes
+ */
+ npy_int32 dmm;
+} rational;
+
+static NPY_INLINE rational
+make_rational_int(npy_int64 n) {
+ rational r = {(npy_int32)n,0};
+ if (r.n != n) {
+ set_overflow();
+ }
+ return r;
+}
+
+static rational
+make_rational_slow(npy_int64 n_, npy_int64 d_) {
+ rational r = {0};
+ if (!d_) {
+ set_zero_divide();
+ }
+ else {
+ npy_int64 g = gcd(n_,d_);
+ npy_int32 d;
+ n_ /= g;
+ d_ /= g;
+ r.n = (npy_int32)n_;
+ d = (npy_int32)d_;
+ if (r.n!=n_ || d!=d_) {
+ set_overflow();
+ }
+ else {
+ if (d <= 0) {
+ d = -d;
+ r.n = safe_neg(r.n);
+ }
+ r.dmm = d-1;
+ }
+ }
+ return r;
+}
+
+static NPY_INLINE npy_int32
+d(rational r) {
+ return r.dmm+1;
+}
+
+/* Assumes d_ > 0 */
+static rational
+make_rational_fast(npy_int64 n_, npy_int64 d_) {
+ npy_int64 g = gcd(n_,d_);
+ rational r;
+ n_ /= g;
+ d_ /= g;
+ r.n = (npy_int32)n_;
+ r.dmm = (npy_int32)(d_-1);
+ if (r.n!=n_ || r.dmm+1!=d_) {
+ set_overflow();
+ }
+ return r;
+}
+
+static NPY_INLINE rational
+rational_negative(rational r) {
+ rational x;
+ x.n = safe_neg(r.n);
+ x.dmm = r.dmm;
+ return x;
+}
+
+static NPY_INLINE rational
+rational_add(rational x, rational y) {
+ /*
+     * Note that the numerator computation can never overflow npy_int64,
+     * since each term is strictly under 2**62 (since d > 0).
+ */
+ return make_rational_fast((npy_int64)x.n*d(y)+(npy_int64)d(x)*y.n,
+ (npy_int64)d(x)*d(y));
+}
+
+static NPY_INLINE rational
+rational_subtract(rational x, rational y) {
+ /* We're safe from overflow as with + */
+ return make_rational_fast((npy_int64)x.n*d(y)-(npy_int64)d(x)*y.n,
+ (npy_int64)d(x)*d(y));
+}
+
+static NPY_INLINE rational
+rational_multiply(rational x, rational y) {
+ /* We're safe from overflow as with + */
+ return make_rational_fast((npy_int64)x.n*y.n,(npy_int64)d(x)*d(y));
+}
+
+static NPY_INLINE rational
+rational_divide(rational x, rational y) {
+ return make_rational_slow((npy_int64)x.n*d(y),(npy_int64)d(x)*y.n);
+}
+
+static NPY_INLINE npy_int64
+rational_floor(rational x) {
+ /* Always round down */
+ if (x.n>=0) {
+ return x.n/d(x);
+ }
+ /*
+ * This can be done without casting up to 64 bits, but it requires
+ * working out all the sign cases
+ */
+ return -((-(npy_int64)x.n+d(x)-1)/d(x));
+}
+
+static NPY_INLINE npy_int64
+rational_ceil(rational x) {
+ return -rational_floor(rational_negative(x));
+}
+
+static NPY_INLINE rational
+rational_remainder(rational x, rational y) {
+ return rational_subtract(x, rational_multiply(y,make_rational_int(
+ rational_floor(rational_divide(x,y)))));
+}
+
+static NPY_INLINE rational
+rational_abs(rational x) {
+ rational y;
+ y.n = safe_abs32(x.n);
+ y.dmm = x.dmm;
+ return y;
+}
+
+static NPY_INLINE npy_int64
+rational_rint(rational x) {
+ /*
+ * Round towards nearest integer, moving exact half integers towards
+ * zero
+ */
+ npy_int32 d_ = d(x);
+ return (2*(npy_int64)x.n+(x.n<0?-d_:d_))/(2*(npy_int64)d_);
+}
+
+static NPY_INLINE int
+rational_sign(rational x) {
+ return x.n<0?-1:x.n==0?0:1;
+}
+
+static NPY_INLINE rational
+rational_inverse(rational x) {
+ rational y = {0};
+ if (!x.n) {
+ set_zero_divide();
+ }
+ else {
+ npy_int32 d_;
+ y.n = d(x);
+ d_ = x.n;
+ if (d_ <= 0) {
+ d_ = safe_neg(d_);
+ y.n = -y.n;
+ }
+ y.dmm = d_-1;
+ }
+ return y;
+}
+
+static NPY_INLINE int
+rational_eq(rational x, rational y) {
+ /*
+ * Since we enforce d > 0, and store fractions in reduced form,
+ * equality is easy.
+ */
+ return x.n==y.n && x.dmm==y.dmm;
+}
+
+static NPY_INLINE int
+rational_ne(rational x, rational y) {
+ return !rational_eq(x,y);
+}
+
+static NPY_INLINE int
+rational_lt(rational x, rational y) {
+ return (npy_int64)x.n*d(y) < (npy_int64)y.n*d(x);
+}
+
+static NPY_INLINE int
+rational_gt(rational x, rational y) {
+ return rational_lt(y,x);
+}
+
+static NPY_INLINE int
+rational_le(rational x, rational y) {
+ return !rational_lt(y,x);
+}
+
+static NPY_INLINE int
+rational_ge(rational x, rational y) {
+ return !rational_lt(x,y);
+}
+
+static NPY_INLINE npy_int32
+rational_int(rational x) {
+ return x.n/d(x);
+}
+
+static NPY_INLINE double
+rational_double(rational x) {
+ return (double)x.n/d(x);
+}
+
+static NPY_INLINE int
+rational_nonzero(rational x) {
+ return x.n!=0;
+}
+
+static int
+scan_rational(const char** s, rational* x) {
+ long n,d;
+ int offset;
+ const char* ss;
+ if (sscanf(*s,"%ld%n",&n,&offset)<=0) {
+ return 0;
+ }
+ ss = *s+offset;
+ if (*ss!='/') {
+ *s = ss;
+ *x = make_rational_int(n);
+ return 1;
+ }
+ ss++;
+ if (sscanf(ss,"%ld%n",&d,&offset)<=0 || d<=0) {
+ return 0;
+ }
+ *s = ss+offset;
+ *x = make_rational_slow(n,d);
+ return 1;
+}
+
+/* Expose rational to Python as a numpy scalar */
+
+typedef struct {
+ PyObject_HEAD
+ rational r;
+} PyRational;
+
+static PyTypeObject PyRational_Type;
+
+static NPY_INLINE int
+PyRational_Check(PyObject* object) {
+ return PyObject_IsInstance(object,(PyObject*)&PyRational_Type);
+}
+
+static PyObject*
+PyRational_FromRational(rational x) {
+ PyRational* p = (PyRational*)PyRational_Type.tp_alloc(&PyRational_Type,0);
+ if (p) {
+ p->r = x;
+ }
+ return (PyObject*)p;
+}
+
+static PyObject*
+pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
+ Py_ssize_t size;
+ PyObject* x[2];
+ long n[2]={0,1};
+ int i;
+ rational r;
+ if (kwds && PyDict_Size(kwds)) {
+ PyErr_SetString(PyExc_TypeError,
+ "constructor takes no keyword arguments");
+ return 0;
+ }
+ size = PyTuple_GET_SIZE(args);
+ if (size > 2) {
+ PyErr_SetString(PyExc_TypeError,
+ "expected rational or numerator and optional denominator");
+ return 0;
+ }
+
+ if (size == 1) {
+ x[0] = PyTuple_GET_ITEM(args, 0);
+ if (PyRational_Check(x[0])) {
+ Py_INCREF(x[0]);
+ return x[0];
+ }
+ else if (PyString_Check(x[0])) {
+ const char* s = PyString_AS_STRING(x[0]);
+ rational x;
+ if (scan_rational(&s,&x)) {
+ const char* p;
+ for (p = s; *p; p++) {
+ if (!isspace(*p)) {
+ goto bad;
+ }
+ }
+ return PyRational_FromRational(x);
+ }
+ bad:
+ PyErr_Format(PyExc_ValueError,
+ "invalid rational literal '%s'",s);
+ return 0;
+ }
+ }
+
+ for (i=0; i<size; i++) {
+ PyObject* y;
+ int eq;
+ x[i] = PyTuple_GET_ITEM(args, i);
+ n[i] = PyInt_AsLong(x[i]);
+ if (error_converting(n[i])) {
+ if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+ PyErr_Format(PyExc_TypeError,
+ "expected integer %s, got %s",
+ (i ? "denominator" : "numerator"),
+ x[i]->ob_type->tp_name);
+ }
+ return 0;
+ }
+ /* Check that we had an exact integer */
+ y = PyInt_FromLong(n[i]);
+ if (!y) {
+ return 0;
+ }
+ eq = PyObject_RichCompareBool(x[i],y,Py_EQ);
+ Py_DECREF(y);
+ if (eq<0) {
+ return 0;
+ }
+ if (!eq) {
+ PyErr_Format(PyExc_TypeError,
+ "expected integer %s, got %s",
+ (i ? "denominator" : "numerator"),
+ x[i]->ob_type->tp_name);
+ return 0;
+ }
+ }
+ r = make_rational_slow(n[0],n[1]);
+ if (PyErr_Occurred()) {
+ return 0;
+ }
+ return PyRational_FromRational(r);
+}
+
+/*
+ * Returns Py_NotImplemented on most conversion failures, or raises an
+ * overflow error for ints that are too large
+ */
+#define AS_RATIONAL(dst,object) \
+ { \
+ dst.n = 0; \
+ if (PyRational_Check(object)) { \
+ dst = ((PyRational*)object)->r; \
+ } \
+ else { \
+ PyObject* y_; \
+ int eq_; \
+ long n_ = PyInt_AsLong(object); \
+ if (error_converting(n_)) { \
+ if (PyErr_ExceptionMatches(PyExc_TypeError)) { \
+ PyErr_Clear(); \
+ Py_INCREF(Py_NotImplemented); \
+ return Py_NotImplemented; \
+ } \
+ return 0; \
+ } \
+ y_ = PyInt_FromLong(n_); \
+ if (!y_) { \
+ return 0; \
+ } \
+ eq_ = PyObject_RichCompareBool(object,y_,Py_EQ); \
+ Py_DECREF(y_); \
+ if (eq_<0) { \
+ return 0; \
+ } \
+ if (!eq_) { \
+ Py_INCREF(Py_NotImplemented); \
+ return Py_NotImplemented; \
+ } \
+ dst = make_rational_int(n_); \
+ } \
+ }
+
+static PyObject*
+pyrational_richcompare(PyObject* a, PyObject* b, int op) {
+ rational x, y;
+ int result = 0;
+ AS_RATIONAL(x,a);
+ AS_RATIONAL(y,b);
+ #define OP(py,op) case py: result = rational_##op(x,y); break;
+ switch (op) {
+ OP(Py_LT,lt)
+ OP(Py_LE,le)
+ OP(Py_EQ,eq)
+ OP(Py_NE,ne)
+ OP(Py_GT,gt)
+ OP(Py_GE,ge)
+ };
+ #undef OP
+ return PyBool_FromLong(result);
+}
+
+static PyObject*
+pyrational_repr(PyObject* self) {
+ rational x = ((PyRational*)self)->r;
+ if (d(x)!=1) {
+ return PyUString_FromFormat(
+ "rational(%ld,%ld)",(long)x.n,(long)d(x));
+ }
+ else {
+ return PyUString_FromFormat(
+ "rational(%ld)",(long)x.n);
+ }
+}
+
+static PyObject*
+pyrational_str(PyObject* self) {
+ rational x = ((PyRational*)self)->r;
+ if (d(x)!=1) {
+ return PyString_FromFormat(
+ "%ld/%ld",(long)x.n,(long)d(x));
+ }
+ else {
+ return PyString_FromFormat(
+ "%ld",(long)x.n);
+ }
+}
+
+static npy_hash_t
+pyrational_hash(PyObject* self) {
+ rational x = ((PyRational*)self)->r;
+ /* Use a fairly weak hash as Python expects */
+ long h = 131071*x.n+524287*x.dmm;
+ /* Never return the special error value -1 */
+ return h==-1?2:h;
+}
+
+#define RATIONAL_BINOP_2(name,exp) \
+ static PyObject* \
+ pyrational_##name(PyObject* a, PyObject* b) { \
+ rational x, y, z; \
+ AS_RATIONAL(x,a); \
+ AS_RATIONAL(y,b); \
+ z = exp; \
+ if (PyErr_Occurred()) { \
+ return 0; \
+ } \
+ return PyRational_FromRational(z); \
+ }
+#define RATIONAL_BINOP(name) RATIONAL_BINOP_2(name,rational_##name(x,y))
+RATIONAL_BINOP(add)
+RATIONAL_BINOP(subtract)
+RATIONAL_BINOP(multiply)
+RATIONAL_BINOP(divide)
+RATIONAL_BINOP(remainder)
+RATIONAL_BINOP_2(floor_divide,
+ make_rational_int(rational_floor(rational_divide(x,y))))
+
+#define RATIONAL_UNOP(name,type,exp,convert) \
+ static PyObject* \
+ pyrational_##name(PyObject* self) { \
+ rational x = ((PyRational*)self)->r; \
+ type y = exp; \
+ if (PyErr_Occurred()) { \
+ return 0; \
+ } \
+ return convert(y); \
+ }
+RATIONAL_UNOP(negative,rational,rational_negative(x),PyRational_FromRational)
+RATIONAL_UNOP(absolute,rational,rational_abs(x),PyRational_FromRational)
+RATIONAL_UNOP(int,long,rational_int(x),PyInt_FromLong)
+RATIONAL_UNOP(float,double,rational_double(x),PyFloat_FromDouble)
+
+static PyObject*
+pyrational_positive(PyObject* self) {
+ Py_INCREF(self);
+ return self;
+}
+
+static int
+pyrational_nonzero(PyObject* self) {
+ rational x = ((PyRational*)self)->r;
+ return rational_nonzero(x);
+}
+
+static PyNumberMethods pyrational_as_number = {
+ pyrational_add, /* nb_add */
+ pyrational_subtract, /* nb_subtract */
+ pyrational_multiply, /* nb_multiply */
+#if PY_MAJOR_VERSION < 3
+ pyrational_divide, /* nb_divide */
+#endif
+ pyrational_remainder, /* nb_remainder */
+ 0, /* nb_divmod */
+ 0, /* nb_power */
+ pyrational_negative, /* nb_negative */
+ pyrational_positive, /* nb_positive */
+ pyrational_absolute, /* nb_absolute */
+ pyrational_nonzero, /* nb_nonzero */
+ 0, /* nb_invert */
+ 0, /* nb_lshift */
+ 0, /* nb_rshift */
+ 0, /* nb_and */
+ 0, /* nb_xor */
+ 0, /* nb_or */
+#if PY_MAJOR_VERSION < 3
+ 0, /* nb_coerce */
+#endif
+ pyrational_int, /* nb_int */
+#if PY_MAJOR_VERSION < 3
+ pyrational_int, /* nb_long */
+#else
+ 0, /* reserved */
+#endif
+ pyrational_float, /* nb_float */
+#if PY_MAJOR_VERSION < 3
+ 0, /* nb_oct */
+ 0, /* nb_hex */
+#endif
+
+ 0, /* nb_inplace_add */
+ 0, /* nb_inplace_subtract */
+ 0, /* nb_inplace_multiply */
+#if PY_MAJOR_VERSION < 3
+ 0, /* nb_inplace_divide */
+#endif
+ 0, /* nb_inplace_remainder */
+ 0, /* nb_inplace_power */
+ 0, /* nb_inplace_lshift */
+ 0, /* nb_inplace_rshift */
+ 0, /* nb_inplace_and */
+ 0, /* nb_inplace_xor */
+ 0, /* nb_inplace_or */
+
+ pyrational_floor_divide, /* nb_floor_divide */
+ pyrational_divide, /* nb_true_divide */
+ 0, /* nb_inplace_floor_divide */
+ 0, /* nb_inplace_true_divide */
+ 0, /* nb_index */
+};
+
+static PyObject*
+pyrational_n(PyObject* self, void* closure) {
+ return PyInt_FromLong(((PyRational*)self)->r.n);
+}
+
+static PyObject*
+pyrational_d(PyObject* self, void* closure) {
+ return PyInt_FromLong(d(((PyRational*)self)->r));
+}
+
+static PyGetSetDef pyrational_getset[] = {
+ {(char*)"n",pyrational_n,0,(char*)"numerator",0},
+ {(char*)"d",pyrational_d,0,(char*)"denominator",0},
+ {0} /* sentinel */
+};
+
+static PyTypeObject PyRational_Type = {
+#if defined(NPY_PY3K)
+ PyVarObject_HEAD_INIT(NULL, 0)
+#else
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+#endif
+ "rational", /* tp_name */
+ sizeof(PyRational), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+#if defined(NPY_PY3K)
+ 0, /* tp_reserved */
+#else
+ 0, /* tp_compare */
+#endif
+ pyrational_repr, /* tp_repr */
+ &pyrational_as_number, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ pyrational_hash, /* tp_hash */
+ 0, /* tp_call */
+ pyrational_str, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ "Fixed precision rational numbers", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ pyrational_richcompare, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ pyrational_getset, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ pyrational_new, /* tp_new */
+ 0, /* tp_free */
+ 0, /* tp_is_gc */
+ 0, /* tp_bases */
+ 0, /* tp_mro */
+ 0, /* tp_cache */
+ 0, /* tp_subclasses */
+ 0, /* tp_weaklist */
+ 0, /* tp_del */
+ 0, /* tp_version_tag */
+};
+
+/* NumPy support */
+
+static PyObject*
+npyrational_getitem(void* data, void* arr) {
+ rational r;
+ memcpy(&r,data,sizeof(rational));
+ return PyRational_FromRational(r);
+}
+
+static int
+npyrational_setitem(PyObject* item, void* data, void* arr) {
+ rational r;
+ if (PyRational_Check(item)) {
+ r = ((PyRational*)item)->r;
+ }
+ else {
+ long n = PyInt_AsLong(item);
+ PyObject* y;
+ int eq;
+ if (error_converting(n)) {
+ return -1;
+ }
+ y = PyInt_FromLong(n);
+ if (!y) {
+ return -1;
+ }
+ eq = PyObject_RichCompareBool(item,y,Py_EQ);
+ Py_DECREF(y);
+ if (eq<0) {
+ return -1;
+ }
+ if (!eq) {
+ PyErr_Format(PyExc_TypeError,
+ "expected rational, got %s", item->ob_type->tp_name);
+ return -1;
+ }
+ r = make_rational_int(n);
+ }
+ memcpy(data,&r,sizeof(rational));
+ return 0;
+}
+
+static NPY_INLINE void
+byteswap(npy_int32* x) {
+ char* p = (char*)x;
+ size_t i;
+ for (i = 0; i < sizeof(*x)/2; i++) {
+ size_t j = sizeof(*x)-1-i;
+ char t = p[i];
+ p[i] = p[j];
+ p[j] = t;
+ }
+}
+
+static void
+npyrational_copyswapn(void* dst_, npy_intp dstride, void* src_,
+ npy_intp sstride, npy_intp n, int swap, void* arr) {
+ char *dst = (char*)dst_, *src = (char*)src_;
+ npy_intp i;
+ if (!src) {
+ return;
+ }
+ if (swap) {
+ for (i = 0; i < n; i++) {
+ rational* r = (rational*)(dst+dstride*i);
+ memcpy(r,src+sstride*i,sizeof(rational));
+ byteswap(&r->n);
+ byteswap(&r->dmm);
+ }
+ }
+ else if (dstride == sizeof(rational) && sstride == sizeof(rational)) {
+ memcpy(dst, src, n*sizeof(rational));
+ }
+ else {
+ for (i = 0; i < n; i++) {
+ memcpy(dst + dstride*i, src + sstride*i, sizeof(rational));
+ }
+ }
+}
+
+static void
+npyrational_copyswap(void* dst, void* src, int swap, void* arr) {
+ rational* r;
+ if (!src) {
+ return;
+ }
+ r = (rational*)dst;
+ memcpy(r,src,sizeof(rational));
+ if (swap) {
+ byteswap(&r->n);
+ byteswap(&r->dmm);
+ }
+}
+
+static int
+npyrational_compare(const void* d0, const void* d1, void* arr) {
+ rational x = *(rational*)d0,
+ y = *(rational*)d1;
+ return rational_lt(x,y)?-1:rational_eq(x,y)?0:1;
+}
+
+#define FIND_EXTREME(name,op) \
+ static int \
+ npyrational_##name(void* data_, npy_intp n, \
+ npy_intp* max_ind, void* arr) { \
+ const rational* data; \
+ npy_intp best_i; \
+ rational best_r; \
+ npy_intp i; \
+ if (!n) { \
+ return 0; \
+ } \
+ data = (rational*)data_; \
+ best_i = 0; \
+ best_r = data[0]; \
+ for (i = 1; i < n; i++) { \
+ if (rational_##op(data[i],best_r)) { \
+ best_i = i; \
+ best_r = data[i]; \
+ } \
+ } \
+ *max_ind = best_i; \
+ return 0; \
+ }
+FIND_EXTREME(argmin,lt)
+FIND_EXTREME(argmax,gt)
+
+static void
+npyrational_dot(void* ip0_, npy_intp is0, void* ip1_, npy_intp is1,
+ void* op, npy_intp n, void* arr) {
+ rational r = {0};
+ const char *ip0 = (char*)ip0_, *ip1 = (char*)ip1_;
+ npy_intp i;
+ for (i = 0; i < n; i++) {
+ r = rational_add(r,rational_multiply(*(rational*)ip0,*(rational*)ip1));
+ ip0 += is0;
+ ip1 += is1;
+ }
+ *(rational*)op = r;
+}
+
+static npy_bool
+npyrational_nonzero(void* data, void* arr) {
+ rational r;
+ memcpy(&r,data,sizeof(r));
+ return rational_nonzero(r)?NPY_TRUE:NPY_FALSE;
+}
+
+static int
+npyrational_fill(void* data_, npy_intp length, void* arr) {
+ rational* data = (rational*)data_;
+ rational delta = rational_subtract(data[1],data[0]);
+ rational r = data[1];
+ npy_intp i;
+ for (i = 2; i < length; i++) {
+ r = rational_add(r,delta);
+ data[i] = r;
+ }
+ return 0;
+}
+
+static int
+npyrational_fillwithscalar(void* buffer_, npy_intp length,
+ void* value, void* arr) {
+ rational r = *(rational*)value;
+ rational* buffer = (rational*)buffer_;
+ npy_intp i;
+ for (i = 0; i < length; i++) {
+ buffer[i] = r;
+ }
+ return 0;
+}
+
+static PyArray_ArrFuncs npyrational_arrfuncs;
+
+typedef struct { char c; rational r; } align_test;
+
+PyArray_Descr npyrational_descr = {
+ PyObject_HEAD_INIT(0)
+ &PyRational_Type, /* typeobj */
+ 'V', /* kind */
+ 'r', /* type */
+ '=', /* byteorder */
+ /*
+ * For now, we need NPY_NEEDS_PYAPI in order to make numpy detect our
+ * exceptions. This isn't technically necessary,
+ * since we're careful about thread safety, and hopefully future
+ * versions of numpy will recognize that.
+ */
+ NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM, /* hasobject */
+ 0, /* type_num */
+ sizeof(rational), /* elsize */
+ offsetof(align_test,r), /* alignment */
+ 0, /* subarray */
+ 0, /* fields */
+ 0, /* names */
+ &npyrational_arrfuncs, /* f */
+};
+
+#define DEFINE_CAST(From,To,statement) \
+ static void \
+ npycast_##From##_##To(void* from_, void* to_, npy_intp n, \
+ void* fromarr, void* toarr) { \
+ const From* from = (From*)from_; \
+ To* to = (To*)to_; \
+ npy_intp i; \
+ for (i = 0; i < n; i++) { \
+ From x = from[i]; \
+ statement \
+ to[i] = y; \
+ } \
+ }
+#define DEFINE_INT_CAST(bits) \
+ DEFINE_CAST(npy_int##bits,rational,rational y = make_rational_int(x);) \
+ DEFINE_CAST(rational,npy_int##bits,npy_int32 z = rational_int(x); \
+ npy_int##bits y = z; if (y != z) set_overflow();)
+DEFINE_INT_CAST(8)
+DEFINE_INT_CAST(16)
+DEFINE_INT_CAST(32)
+DEFINE_INT_CAST(64)
+DEFINE_CAST(rational,float,double y = rational_double(x);)
+DEFINE_CAST(rational,double,double y = rational_double(x);)
+DEFINE_CAST(npy_bool,rational,rational y = make_rational_int(x);)
+DEFINE_CAST(rational,npy_bool,npy_bool y = rational_nonzero(x);)
+
+#define BINARY_UFUNC(name,intype0,intype1,outtype,exp) \
+ void name(char** args, npy_intp* dimensions, \
+ npy_intp* steps, void* data) { \
+ npy_intp is0 = steps[0], is1 = steps[1], \
+ os = steps[2], n = *dimensions; \
+ char *i0 = args[0], *i1 = args[1], *o = args[2]; \
+ int k; \
+ for (k = 0; k < n; k++) { \
+ intype0 x = *(intype0*)i0; \
+ intype1 y = *(intype1*)i1; \
+ *(outtype*)o = exp; \
+ i0 += is0; i1 += is1; o += os; \
+ } \
+ }
+#define RATIONAL_BINARY_UFUNC(name,type,exp) \
+ BINARY_UFUNC(rational_ufunc_##name,rational,rational,type,exp)
+RATIONAL_BINARY_UFUNC(add,rational,rational_add(x,y))
+RATIONAL_BINARY_UFUNC(subtract,rational,rational_subtract(x,y))
+RATIONAL_BINARY_UFUNC(multiply,rational,rational_multiply(x,y))
+RATIONAL_BINARY_UFUNC(divide,rational,rational_divide(x,y))
+RATIONAL_BINARY_UFUNC(remainder,rational,rational_remainder(x,y))
+RATIONAL_BINARY_UFUNC(floor_divide,rational,
+ make_rational_int(rational_floor(rational_divide(x,y))))
+PyUFuncGenericFunction rational_ufunc_true_divide = rational_ufunc_divide;
+RATIONAL_BINARY_UFUNC(minimum,rational,rational_lt(x,y)?x:y)
+RATIONAL_BINARY_UFUNC(maximum,rational,rational_lt(x,y)?y:x)
+RATIONAL_BINARY_UFUNC(equal,npy_bool,rational_eq(x,y))
+RATIONAL_BINARY_UFUNC(not_equal,npy_bool,rational_ne(x,y))
+RATIONAL_BINARY_UFUNC(less,npy_bool,rational_lt(x,y))
+RATIONAL_BINARY_UFUNC(greater,npy_bool,rational_gt(x,y))
+RATIONAL_BINARY_UFUNC(less_equal,npy_bool,rational_le(x,y))
+RATIONAL_BINARY_UFUNC(greater_equal,npy_bool,rational_ge(x,y))
+
+BINARY_UFUNC(gcd_ufunc,npy_int64,npy_int64,npy_int64,gcd(x,y))
+BINARY_UFUNC(lcm_ufunc,npy_int64,npy_int64,npy_int64,lcm(x,y))
+
+#define UNARY_UFUNC(name,type,exp) \
+ void rational_ufunc_##name(char** args, npy_intp* dimensions, \
+ npy_intp* steps, void* data) { \
+ npy_intp is = steps[0], os = steps[1], n = *dimensions; \
+ char *i = args[0], *o = args[1]; \
+ int k; \
+ for (k = 0; k < n; k++) { \
+ rational x = *(rational*)i; \
+ *(type*)o = exp; \
+ i += is; o += os; \
+ } \
+ }
+UNARY_UFUNC(negative,rational,rational_negative(x))
+UNARY_UFUNC(absolute,rational,rational_abs(x))
+UNARY_UFUNC(floor,rational,make_rational_int(rational_floor(x)))
+UNARY_UFUNC(ceil,rational,make_rational_int(rational_ceil(x)))
+UNARY_UFUNC(trunc,rational,make_rational_int(x.n/d(x)))
+UNARY_UFUNC(square,rational,rational_multiply(x,x))
+UNARY_UFUNC(rint,rational,make_rational_int(rational_rint(x)))
+UNARY_UFUNC(sign,rational,make_rational_int(rational_sign(x)))
+UNARY_UFUNC(reciprocal,rational,rational_inverse(x))
+UNARY_UFUNC(numerator,npy_int64,x.n)
+UNARY_UFUNC(denominator,npy_int64,d(x))
+
+static NPY_INLINE void
+rational_matrix_multiply(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+ /* pointers to data for input and output arrays */
+ char *ip1 = args[0];
+ char *ip2 = args[1];
+ char *op = args[2];
+
+ /* lengths of core dimensions */
+ npy_intp dm = dimensions[0];
+ npy_intp dn = dimensions[1];
+ npy_intp dp = dimensions[2];
+
+ /* striding over core dimensions */
+ npy_intp is1_m = steps[0];
+ npy_intp is1_n = steps[1];
+ npy_intp is2_n = steps[2];
+ npy_intp is2_p = steps[3];
+ npy_intp os_m = steps[4];
+ npy_intp os_p = steps[5];
+
+ /* core dimensions counters */
+ npy_intp m, p;
+
+ /* calculate dot product for each row/column vector pair */
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+ npyrational_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+
+ /* advance to next column of 2nd input array and output array */
+ ip2 += is2_p;
+ op += os_p;
+ }
+
+ /* reset to first column of 2nd input array and output array */
+ ip2 -= is2_p * p;
+ op -= os_p * p;
+
+ /* advance to next row of 1st input array and output array */
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+
+static void
+rational_gufunc_matrix_multiply(char **args, npy_intp *dimensions,
+ npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* outer dimensions counter */
+ npy_intp N_;
+
+ /* length of flattened outer dimensions */
+ npy_intp dN = dimensions[0];
+
+ /* striding over flattened outer dimensions for input and output arrays */
+ npy_intp s0 = steps[0];
+ npy_intp s1 = steps[1];
+ npy_intp s2 = steps[2];
+
+ /*
+ * loop through outer dimensions, performing matrix multiply on
+ * core dimensions for each loop
+ */
+ for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1, args[2] += s2) {
+ rational_matrix_multiply(args, dimensions+1, steps+3);
+ }
+}
+
+
+static void
+rational_ufunc_test_add(char** args, npy_intp* dimensions,
+ npy_intp* steps, void* data) {
+ npy_intp is0 = steps[0], is1 = steps[1], os = steps[2], n = *dimensions;
+ char *i0 = args[0], *i1 = args[1], *o = args[2];
+ int k;
+ for (k = 0; k < n; k++) {
+ npy_int64 x = *(npy_int64*)i0;
+ npy_int64 y = *(npy_int64*)i1;
+ *(rational*)o = rational_add(make_rational_fast(x, 1),
+ make_rational_fast(y, 1));
+ i0 += is0; i1 += is1; o += os;
+ }
+}
+
+
+static void
+rational_ufunc_test_add_rationals(char** args, npy_intp* dimensions,
+ npy_intp* steps, void* data) {
+ npy_intp is0 = steps[0], is1 = steps[1], os = steps[2], n = *dimensions;
+ char *i0 = args[0], *i1 = args[1], *o = args[2];
+ int k;
+ for (k = 0; k < n; k++) {
+ rational x = *(rational*)i0;
+ rational y = *(rational*)i1;
+ *(rational*)o = rational_add(x, y);
+ i0 += is0; i1 += is1; o += os;
+ }
+}
+
+
+PyMethodDef module_methods[] = {
+ {0} /* sentinel */
+};
+
+#if defined(NPY_PY3K)
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_rational_tests",
+ NULL,
+ -1,
+ module_methods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+#if defined(NPY_PY3K)
+#define RETVAL m
+PyMODINIT_FUNC PyInit__rational_tests(void) {
+#else
+#define RETVAL
+PyMODINIT_FUNC init_rational_tests(void) {
+#endif
+
+ PyObject *m = NULL;
+ PyObject* numpy_str;
+ PyObject* numpy;
+ int npy_rational;
+
+ import_array();
+ if (PyErr_Occurred()) {
+ goto fail;
+ }
+ import_umath();
+ if (PyErr_Occurred()) {
+ goto fail;
+ }
+ numpy_str = PyUString_FromString("numpy");
+ if (!numpy_str) {
+ goto fail;
+ }
+ numpy = PyImport_Import(numpy_str);
+ Py_DECREF(numpy_str);
+ if (!numpy) {
+ goto fail;
+ }
+
+ /* Can't set this until we import numpy */
+ PyRational_Type.tp_base = &PyGenericArrType_Type;
+
+ /* Initialize rational type object */
+ if (PyType_Ready(&PyRational_Type) < 0) {
+ goto fail;
+ }
+
+ /* Initialize rational descriptor */
+ PyArray_InitArrFuncs(&npyrational_arrfuncs);
+ npyrational_arrfuncs.getitem = npyrational_getitem;
+ npyrational_arrfuncs.setitem = npyrational_setitem;
+ npyrational_arrfuncs.copyswapn = npyrational_copyswapn;
+ npyrational_arrfuncs.copyswap = npyrational_copyswap;
+ npyrational_arrfuncs.compare = npyrational_compare;
+ npyrational_arrfuncs.argmin = npyrational_argmin;
+ npyrational_arrfuncs.argmax = npyrational_argmax;
+ npyrational_arrfuncs.dotfunc = npyrational_dot;
+ npyrational_arrfuncs.nonzero = npyrational_nonzero;
+ npyrational_arrfuncs.fill = npyrational_fill;
+ npyrational_arrfuncs.fillwithscalar = npyrational_fillwithscalar;
+ /* Left undefined: scanfunc, fromstr, sort, argsort */
+ Py_TYPE(&npyrational_descr) = &PyArrayDescr_Type;
+ npy_rational = PyArray_RegisterDataType(&npyrational_descr);
+ if (npy_rational<0) {
+ goto fail;
+ }
+
+ /* Support dtype(rational) syntax */
+ if (PyDict_SetItemString(PyRational_Type.tp_dict, "dtype",
+ (PyObject*)&npyrational_descr) < 0) {
+ goto fail;
+ }
+
+ /* Register casts to and from rational */
+ #define REGISTER_CAST(From,To,from_descr,to_typenum,safe) { \
+ PyArray_Descr* from_descr_##From##_##To = (from_descr); \
+ if (PyArray_RegisterCastFunc(from_descr_##From##_##To, \
+ (to_typenum), \
+ npycast_##From##_##To) < 0) { \
+ goto fail; \
+ } \
+ if (safe && PyArray_RegisterCanCast(from_descr_##From##_##To, \
+ (to_typenum), \
+ NPY_NOSCALAR) < 0) { \
+ goto fail; \
+ } \
+ }
+ #define REGISTER_INT_CASTS(bits) \
+ REGISTER_CAST(npy_int##bits, rational, \
+ PyArray_DescrFromType(NPY_INT##bits), npy_rational, 1) \
+ REGISTER_CAST(rational, npy_int##bits, &npyrational_descr, \
+ NPY_INT##bits, 0)
+ REGISTER_INT_CASTS(8)
+ REGISTER_INT_CASTS(16)
+ REGISTER_INT_CASTS(32)
+ REGISTER_INT_CASTS(64)
+ REGISTER_CAST(rational,float,&npyrational_descr,NPY_FLOAT,0)
+ REGISTER_CAST(rational,double,&npyrational_descr,NPY_DOUBLE,1)
+ REGISTER_CAST(npy_bool,rational, PyArray_DescrFromType(NPY_BOOL),
+ npy_rational,1)
+ REGISTER_CAST(rational,npy_bool,&npyrational_descr,NPY_BOOL,0)
+
+ /* Register ufuncs */
+ #define REGISTER_UFUNC(name,...) { \
+ PyUFuncObject* ufunc = \
+ (PyUFuncObject*)PyObject_GetAttrString(numpy, #name); \
+ int _types[] = __VA_ARGS__; \
+ if (!ufunc) { \
+ goto fail; \
+ } \
+ if (sizeof(_types)/sizeof(int)!=ufunc->nargs) { \
+ PyErr_Format(PyExc_AssertionError, \
+ "ufunc %s takes %d arguments, our loop takes %lu", \
+ #name, ufunc->nargs, (unsigned long) \
+ (sizeof(_types)/sizeof(int))); \
+ Py_DECREF(ufunc); \
+ goto fail; \
+ } \
+ if (PyUFunc_RegisterLoopForType((PyUFuncObject*)ufunc, npy_rational, \
+ rational_ufunc_##name, _types, 0) < 0) { \
+ Py_DECREF(ufunc); \
+ goto fail; \
+ } \
+ Py_DECREF(ufunc); \
+ }
+ #define REGISTER_UFUNC_BINARY_RATIONAL(name) \
+ REGISTER_UFUNC(name, {npy_rational, npy_rational, npy_rational})
+ #define REGISTER_UFUNC_BINARY_COMPARE(name) \
+ REGISTER_UFUNC(name, {npy_rational, npy_rational, NPY_BOOL})
+ #define REGISTER_UFUNC_UNARY(name) \
+ REGISTER_UFUNC(name, {npy_rational, npy_rational})
+ /* Binary */
+ REGISTER_UFUNC_BINARY_RATIONAL(add)
+ REGISTER_UFUNC_BINARY_RATIONAL(subtract)
+ REGISTER_UFUNC_BINARY_RATIONAL(multiply)
+ REGISTER_UFUNC_BINARY_RATIONAL(divide)
+ REGISTER_UFUNC_BINARY_RATIONAL(remainder)
+ REGISTER_UFUNC_BINARY_RATIONAL(true_divide)
+ REGISTER_UFUNC_BINARY_RATIONAL(floor_divide)
+ REGISTER_UFUNC_BINARY_RATIONAL(minimum)
+ REGISTER_UFUNC_BINARY_RATIONAL(maximum)
+ /* Comparisons */
+ REGISTER_UFUNC_BINARY_COMPARE(equal)
+ REGISTER_UFUNC_BINARY_COMPARE(not_equal)
+ REGISTER_UFUNC_BINARY_COMPARE(less)
+ REGISTER_UFUNC_BINARY_COMPARE(greater)
+ REGISTER_UFUNC_BINARY_COMPARE(less_equal)
+ REGISTER_UFUNC_BINARY_COMPARE(greater_equal)
+ /* Unary */
+ REGISTER_UFUNC_UNARY(negative)
+ REGISTER_UFUNC_UNARY(absolute)
+ REGISTER_UFUNC_UNARY(floor)
+ REGISTER_UFUNC_UNARY(ceil)
+ REGISTER_UFUNC_UNARY(trunc)
+ REGISTER_UFUNC_UNARY(rint)
+ REGISTER_UFUNC_UNARY(square)
+ REGISTER_UFUNC_UNARY(reciprocal)
+ REGISTER_UFUNC_UNARY(sign)
+
+ /* Create module */
+#if defined(NPY_PY3K)
+ m = PyModule_Create(&moduledef);
+#else
+ m = Py_InitModule("_rational_tests", module_methods);
+#endif
+
+ if (!m) {
+ goto fail;
+ }
+
+ /* Add rational type */
+ Py_INCREF(&PyRational_Type);
+ PyModule_AddObject(m,"rational",(PyObject*)&PyRational_Type);
+
+ /* Create matrix multiply generalized ufunc */
+ {
+ int types2[3] = {npy_rational,npy_rational,npy_rational};
+ PyObject* gufunc = PyUFunc_FromFuncAndDataAndSignature(0,0,0,0,2,1,
+ PyUFunc_None,(char*)"matrix_multiply",
+ (char*)"return result of multiplying two matrices of rationals",
+ 0,"(m,n),(n,p)->(m,p)");
+ if (!gufunc) {
+ goto fail;
+ }
+ if (PyUFunc_RegisterLoopForType((PyUFuncObject*)gufunc, npy_rational,
+ rational_gufunc_matrix_multiply, types2, 0) < 0) {
+ goto fail;
+ }
+ PyModule_AddObject(m,"matrix_multiply",(PyObject*)gufunc);
+ }
+
+ /* Create test ufunc with built in input types and rational output type */
+ {
+ int types3[3] = {NPY_INT64,NPY_INT64,npy_rational};
+
+ PyObject* ufunc = PyUFunc_FromFuncAndData(0,0,0,0,2,1,
+ PyUFunc_None,(char*)"test_add",
+ (char*)"add two matrices of int64 and return rational matrix",0);
+ if (!ufunc) {
+ goto fail;
+ }
+ if (PyUFunc_RegisterLoopForType((PyUFuncObject*)ufunc, npy_rational,
+ rational_ufunc_test_add, types3, 0) < 0) {
+ goto fail;
+ }
+ PyModule_AddObject(m,"test_add",(PyObject*)ufunc);
+ }
+
+ /* Create test ufunc with rational types using RegisterLoopForDescr */
+ {
+ PyObject* ufunc = PyUFunc_FromFuncAndData(0,0,0,0,2,1,
+ PyUFunc_None,(char*)"test_add_rationals",
+ (char*)"add two matrices of rationals and return rational matrix",0);
+ PyArray_Descr* types[3] = {&npyrational_descr,
+ &npyrational_descr,
+ &npyrational_descr};
+
+ if (!ufunc) {
+ goto fail;
+ }
+ if (PyUFunc_RegisterLoopForDescr((PyUFuncObject*)ufunc, &npyrational_descr,
+ rational_ufunc_test_add_rationals, types, 0) < 0) {
+ goto fail;
+ }
+ PyModule_AddObject(m,"test_add_rationals",(PyObject*)ufunc);
+ }
+
+ /* Create numerator and denominator ufuncs */
+ #define NEW_UNARY_UFUNC(name,type,doc) { \
+ int types[2] = {npy_rational,type}; \
+ PyObject* ufunc = PyUFunc_FromFuncAndData(0,0,0,0,1,1, \
+ PyUFunc_None,(char*)#name,(char*)doc,0); \
+ if (!ufunc) { \
+ goto fail; \
+ } \
+ if (PyUFunc_RegisterLoopForType((PyUFuncObject*)ufunc, \
+ npy_rational,rational_ufunc_##name,types,0)<0) { \
+ goto fail; \
+ } \
+ PyModule_AddObject(m,#name,(PyObject*)ufunc); \
+ }
+ NEW_UNARY_UFUNC(numerator,NPY_INT64,"rational number numerator");
+ NEW_UNARY_UFUNC(denominator,NPY_INT64,"rational number denominator");
+
+ /* Create gcd and lcm ufuncs */
+ #define GCD_LCM_UFUNC(name,type,doc) { \
+ static const PyUFuncGenericFunction func[1] = {name##_ufunc}; \
+ static const char types[3] = {type,type,type}; \
+ static void* data[1] = {0}; \
+ PyObject* ufunc = PyUFunc_FromFuncAndData( \
+ (PyUFuncGenericFunction*)func, data,(char*)types, \
+ 1,2,1,PyUFunc_One,(char*)#name,(char*)doc,0); \
+ if (!ufunc) { \
+ goto fail; \
+ } \
+ PyModule_AddObject(m,#name,(PyObject*)ufunc); \
+ }
+ GCD_LCM_UFUNC(gcd,NPY_INT64,"greatest common denominator of two integers");
+ GCD_LCM_UFUNC(lcm,NPY_INT64,"least common multiple of two integers");
+
+ return RETVAL;
+
+fail:
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _rational_tests module.");
+ }
+#if defined(NPY_PY3K)
+ if (m) {
+ Py_DECREF(m);
+ m = NULL;
+ }
+#endif
+ return RETVAL;
+}
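The rational type above stores a numerator and the denominator minus one (dmm),
so an element zeroed by memset reads as 0/1, and every value is kept in lowest
terms with a positive denominator. A minimal sketch of that representation and
of the normalize-on-construction step (overflow and zero-divide checks omitted;
illustrative names, not the module's code):

#include <stdio.h>
#include <stdint.h>

typedef struct { int32_t n; int32_t dmm; } rat;   /* value = n / (dmm + 1) */

static int64_t gcd64(int64_t x, int64_t y)
{
    if (x < 0) x = -x;
    if (y < 0) y = -y;
    while (y) { int64_t t = x % y; x = y; y = t; }
    return x;
}

static rat rat_make(int64_t n, int64_t d)
{
    rat r;
    int64_t g = gcd64(n, d);
    if (g) { n /= g; d /= g; }          /* reduce to lowest terms */
    if (d < 0) { n = -n; d = -d; }      /* keep the denominator positive */
    r.n = (int32_t)n;
    r.dmm = (int32_t)(d - 1);
    return r;
}

static rat rat_add(rat x, rat y)
{
    int64_t dx = (int64_t)x.dmm + 1, dy = (int64_t)y.dmm + 1;
    return rat_make((int64_t)x.n * dy + dx * (int64_t)y.n, dx * dy);
}

int main(void)
{
    rat a = rat_make(1, 6), b = rat_make(1, 3), c = rat_add(a, b);
    printf("%d/%d\n", c.n, c.dmm + 1);   /* prints: 1/2 */
    return 0;
}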
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/_struct_ufunc_tests.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/_struct_ufunc_tests.c.src
new file mode 100644
index 00000000000..5c6e235e01b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/_struct_ufunc_tests.c.src
@@ -0,0 +1,125 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+#include "math.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_3kcompat.h"
+
+
+/*
+ * _struct_ufunc_tests.c.src
+ * This is the C code for creating your own
+ * NumPy ufunc for a structured array dtype.
+ *
+ * Details explaining the Python-C API can be found under
+ * 'Extending and Embedding' and 'Python/C API' at
+ * docs.python.org .
+ */
+
+static PyMethodDef StructUfuncTestMethods[] = {
+ {NULL, NULL, 0, NULL}
+};
+
+/* The loop definition must precede the PyMODINIT_FUNC. */
+
+static void add_uint64_triplet(char **args, npy_intp *dimensions,
+ npy_intp* steps, void* data)
+{
+ npy_intp i;
+ npy_intp is1=steps[0];
+ npy_intp is2=steps[1];
+ npy_intp os=steps[2];
+ npy_intp n=dimensions[0];
+ npy_uint64 *x, *y, *z;
+
+ char *i1=args[0];
+ char *i2=args[1];
+ char *op=args[2];
+
+ for (i = 0; i < n; i++) {
+
+ x = (npy_uint64*)i1;
+ y = (npy_uint64*)i2;
+ z = (npy_uint64*)op;
+
+ z[0] = x[0] + y[0];
+ z[1] = x[1] + y[1];
+ z[2] = x[2] + y[2];
+
+ i1 += is1;
+ i2 += is2;
+ op += os;
+ }
+}
+
+#if defined(NPY_PY3K)
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_struct_ufunc_tests",
+ NULL,
+ -1,
+ StructUfuncTestMethods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+#if defined(NPY_PY3K)
+PyMODINIT_FUNC PyInit__struct_ufunc_tests(void)
+#else
+PyMODINIT_FUNC init_struct_ufunc_tests(void)
+#endif
+{
+ PyObject *m, *add_triplet, *d;
+ PyObject *dtype_dict;
+ PyArray_Descr *dtype;
+ PyArray_Descr *dtypes[3];
+
+#if defined(NPY_PY3K)
+ m = PyModule_Create(&moduledef);
+#else
+ m = Py_InitModule("_struct_ufunc_tests", StructUfuncTestMethods);
+#endif
+
+ if (m == NULL) {
+#if defined(NPY_PY3K)
+ return NULL;
+#else
+ return;
+#endif
+ }
+
+ import_array();
+ import_umath();
+
+ add_triplet = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 2, 1,
+ PyUFunc_None, "add_triplet",
+ "add_triplet_docstring", 0);
+
+ dtype_dict = Py_BuildValue("[(s, s), (s, s), (s, s)]",
+ "f0", "u8", "f1", "u8", "f2", "u8");
+ PyArray_DescrConverter(dtype_dict, &dtype);
+ Py_DECREF(dtype_dict);
+
+ dtypes[0] = dtype;
+ dtypes[1] = dtype;
+ dtypes[2] = dtype;
+
+ PyUFunc_RegisterLoopForDescr((PyUFuncObject *)add_triplet,
+ dtype,
+ &add_uint64_triplet,
+ dtypes,
+ NULL);
+
+ Py_DECREF(dtype);
+ d = PyModule_GetDict(m);
+
+ PyDict_SetItemString(d, "add_triplet", add_triplet);
+ Py_DECREF(add_triplet);
+#if defined(NPY_PY3K)
+ return m;
+#endif
+}
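add_uint64_triplet above treats each structured element as three contiguous
uint64 fields and adds the inputs field by field. A minimal sketch of that
per-field computation, with a plain C struct standing in for the "u8,u8,u8"
dtype built via Py_BuildValue (illustrative names):

#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t f0, f1, f2; } triplet;   /* mirrors fields f0,f1,f2 */

static triplet add_triplet(triplet x, triplet y)
{
    triplet z;
    z.f0 = x.f0 + y.f0;
    z.f1 = x.f1 + y.f1;
    z.f2 = x.f2 + y.f2;
    return z;
}

int main(void)
{
    triplet a = {1, 2, 3}, b = {10, 20, 30};
    triplet c = add_triplet(a, b);
    printf("(%llu, %llu, %llu)\n",
           (unsigned long long)c.f0,
           (unsigned long long)c.f1,
           (unsigned long long)c.f2);    /* prints: (11, 22, 33) */
    return 0;
}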
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/_umath_tests.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/_umath_tests.c.src
new file mode 100644
index 00000000000..6c3bcce7133
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/_umath_tests.c.src
@@ -0,0 +1,643 @@
+/* -*- c -*- */
+
+/*
+ *****************************************************************************
+ ** INCLUDES **
+ *****************************************************************************
+ */
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_config.h"
+
+/*
+ *****************************************************************************
+ ** BASICS **
+ *****************************************************************************
+ */
+
+#define INIT_OUTER_LOOP_1 \
+ npy_intp dN = *dimensions++;\
+ npy_intp N_; \
+ npy_intp s0 = *steps++;
+
+#define INIT_OUTER_LOOP_2 \
+ INIT_OUTER_LOOP_1 \
+ npy_intp s1 = *steps++;
+
+#define INIT_OUTER_LOOP_3 \
+ INIT_OUTER_LOOP_2 \
+ npy_intp s2 = *steps++;
+
+#define INIT_OUTER_LOOP_4 \
+ INIT_OUTER_LOOP_3 \
+ npy_intp s3 = *steps++;
+
+#define BEGIN_OUTER_LOOP_2 \
+ for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1) {
+
+#define BEGIN_OUTER_LOOP_3 \
+ for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1, args[2] += s2) {
+
+#define BEGIN_OUTER_LOOP_4 \
+ for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1, args[2] += s2, args[3] += s3) {
+
+#define END_OUTER_LOOP }
+
+
+/*
+ *****************************************************************************
+ ** UFUNC LOOPS **
+ *****************************************************************************
+ */
+
+char *inner1d_signature = "(i),(i)->()";
+
+/**begin repeat
+
+ #TYPE=LONG,DOUBLE#
+ #typ=npy_long,npy_double#
+*/
+
+/*
+ * This implements the function
+ * out[n] = sum_i { in1[n, i] * in2[n, i] }.
+ */
+
+static void
+@TYPE@_inner1d(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ INIT_OUTER_LOOP_3
+ npy_intp di = dimensions[0];
+ npy_intp i;
+ npy_intp is1=steps[0], is2=steps[1];
+ BEGIN_OUTER_LOOP_3
+ char *ip1=args[0], *ip2=args[1], *op=args[2];
+ @typ@ sum = 0;
+ for (i = 0; i < di; i++) {
+ sum += (*(@typ@ *)ip1) * (*(@typ@ *)ip2);
+ ip1 += is1;
+ ip2 += is2;
+ }
+ *(@typ@ *)op = sum;
+ END_OUTER_LOOP
+}
+
+/**end repeat**/
+
+char *innerwt_signature = "(i),(i),(i)->()";
+
+/**begin repeat
+
+ #TYPE=LONG,DOUBLE#
+ #typ=npy_long,npy_double#
+*/
+
+
+/*
+ * This implements the function
+ * out[n] = sum_i { in1[n, i] * in2[n, i] * in3[n, i] }.
+ */
+
+static void
+@TYPE@_innerwt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ INIT_OUTER_LOOP_4
+ npy_intp di = dimensions[0];
+ npy_intp i;
+ npy_intp is1=steps[0], is2=steps[1], is3=steps[2];
+ BEGIN_OUTER_LOOP_4
+ char *ip1=args[0], *ip2=args[1], *ip3=args[2], *op=args[3];
+ @typ@ sum = 0;
+ for (i = 0; i < di; i++) {
+ sum += (*(@typ@ *)ip1) * (*(@typ@ *)ip2) * (*(@typ@ *)ip3);
+ ip1 += is1;
+ ip2 += is2;
+ ip3 += is3;
+ }
+ *(@typ@ *)op = sum;
+ END_OUTER_LOOP
+}
+
+/**end repeat**/
+
+char *matrix_multiply_signature = "(m,n),(n,p)->(m,p)";
+/* for use with matrix_multiply code, but different signature */
+char *matmul_signature = "(m?,n),(n,p?)->(m?,p?)";
+
+/**begin repeat
+
+ #TYPE=FLOAT,DOUBLE,LONG#
+ #typ=npy_float,npy_double,npy_long#
+*/
+
+/*
+ * This implements the function
+ * out[k, m, p] = sum_n { in1[k, m, n] * in2[k, n, p] }.
+ */
+
+static void
+@TYPE@_matrix_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* no BLAS is available */
+ INIT_OUTER_LOOP_3
+ npy_intp dm = dimensions[0];
+ npy_intp dn = dimensions[1];
+ npy_intp dp = dimensions[2];
+ npy_intp m,n,p;
+ npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+ os_m=steps[4], os_p=steps[5];
+ npy_intp ib1_n = is1_n*dn;
+ npy_intp ib2_n = is2_n*dn;
+ npy_intp ib2_p = is2_p*dp;
+ npy_intp ob_p = os_p *dp;
+ if (dn == 0) {
+ /* No operand, need to zero the output */
+ BEGIN_OUTER_LOOP_3
+ char *op=args[2];
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+ *(@typ@ *)op = 0;
+ op += os_p;
+ }
+ op += os_m - ob_p;
+ }
+ END_OUTER_LOOP
+ return;
+ }
+ BEGIN_OUTER_LOOP_3
+ char *ip1=args[0], *ip2=args[1], *op=args[2];
+ for (m = 0; m < dm; m++) {
+ for (n = 0; n < dn; n++) {
+ @typ@ val1 = (*(@typ@ *)ip1);
+ for (p = 0; p < dp; p++) {
+ if (n == 0) *(@typ@ *)op = 0;
+ *(@typ@ *)op += val1 * (*(@typ@ *)ip2);
+ ip2 += is2_p;
+ op += os_p;
+ }
+ ip2 -= ib2_p;
+ op -= ob_p;
+ ip1 += is1_n;
+ ip2 += is2_n;
+ }
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ ip1 += is1_m;
+ op += os_m;
+ }
+ END_OUTER_LOOP
+}
+
+/**end repeat**/
+
+char *cross1d_signature = "(3),(3)->(3)";
+
+/**begin repeat
+
+ #TYPE=LONG,DOUBLE#
+ #typ=npy_long, npy_double#
+*/
+
+/*
+ * This implements the cross product:
+ * out[n, 0] = in1[n, 1]*in2[n, 2] - in1[n, 2]*in2[n, 1]
+ * out[n, 1] = in1[n, 2]*in2[n, 0] - in1[n, 0]*in2[n, 2]
+ * out[n, 2] = in1[n, 0]*in2[n, 1] - in1[n, 1]*in2[n, 0]
+ */
+static void
+@TYPE@_cross1d(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ INIT_OUTER_LOOP_3
+ npy_intp is1=steps[0], is2=steps[1], os = steps[2];
+ BEGIN_OUTER_LOOP_3
+ @typ@ i1_x = *(@typ@ *)(args[0] + 0*is1);
+ @typ@ i1_y = *(@typ@ *)(args[0] + 1*is1);
+ @typ@ i1_z = *(@typ@ *)(args[0] + 2*is1);
+
+ @typ@ i2_x = *(@typ@ *)(args[1] + 0*is2);
+ @typ@ i2_y = *(@typ@ *)(args[1] + 1*is2);
+ @typ@ i2_z = *(@typ@ *)(args[1] + 2*is2);
+ char *op = args[2];
+
+ *(@typ@ *)op = i1_y * i2_z - i1_z * i2_y;
+ op += os;
+ *(@typ@ *)op = i1_z * i2_x - i1_x * i2_z;
+ op += os;
+ *(@typ@ *)op = i1_x * i2_y - i1_y * i2_x;
+ END_OUTER_LOOP
+}
+
+/**end repeat**/
+
+char *euclidean_pdist_signature = "(n,d)->(p)";
+
+/**begin repeat
+
+ #TYPE=FLOAT,DOUBLE#
+ #typ=npy_float,npy_double#
+ #sqrt_func=sqrtf,sqrt#
+*/
+
+/*
+ * This implements the function
+ * out[j*(2*n-3-j)+k-1] = sum_d { (in1[j, d] - in1[k, d])^2 }
+ * with 0 < k < j < n, i.e. computes all unique pairwise euclidean distances.
+ */
+
+static void
+@TYPE@_euclidean_pdist(char **args, npy_intp *dimensions, npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ INIT_OUTER_LOOP_2
+ npy_intp len_n = *dimensions++;
+ npy_intp len_d = *dimensions++;
+ npy_intp stride_n = *steps++;
+ npy_intp stride_d = *steps++;
+ npy_intp stride_p = *steps;
+
+ assert(len_n * (len_n - 1) / 2 == *dimensions);
+
+ BEGIN_OUTER_LOOP_2
+ const char *data_this = (const char *)args[0];
+ char *data_out = args[1];
+ npy_intp n;
+ for (n = 0; n < len_n; ++n) {
+ const char *data_that = data_this + stride_n;
+ npy_intp nn;
+ for (nn = n + 1; nn < len_n; ++nn) {
+ const char *ptr_this = data_this;
+ const char *ptr_that = data_that;
+ @typ@ out = 0;
+ npy_intp d;
+ for (d = 0; d < len_d; ++d) {
+ const @typ@ delta = *(const @typ@ *)ptr_this -
+ *(const @typ@ *)ptr_that;
+ out += delta * delta;
+ ptr_this += stride_d;
+ ptr_that += stride_d;
+ }
+ *(@typ@ *)data_out = npy_@sqrt_func@(out);
+ data_that += stride_n;
+ data_out += stride_p;
+ }
+ data_this += stride_n;
+ }
+ END_OUTER_LOOP
+}
+
+/**end repeat**/
+
+char *cumsum_signature = "(i)->(i)";
+
+/*
+ * This implements the function
+ * out[n] = sum_i^n in[i]
+ */
+
+/**begin repeat
+
+ #TYPE=LONG,DOUBLE#
+ #typ=npy_long,npy_double#
+*/
+
+static void
+@TYPE@_cumsum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ INIT_OUTER_LOOP_2
+ npy_intp di = dimensions[0];
+ npy_intp i;
+ npy_intp is=steps[0], os=steps[1];
+ BEGIN_OUTER_LOOP_2
+ char *ip=args[0], *op=args[1];
+ @typ@ cumsum = 0;
+ for (i = 0; i < di; i++, ip += is, op += os) {
+ cumsum += (*(@typ@ *)ip);
+ *(@typ@ *)op = cumsum;
+ }
+ END_OUTER_LOOP
+}
+
+/**end repeat**/
+
+/* The following lines were generated using a slightly modified
+ version of code_generators/generate_umath.py and adding these
+ lines to defdict:
+
+defdict = {
+'inner1d' :
+ Ufunc(2, 1, None_,
+ r'''inner on the last dimension and broadcast on the rest \n"
+ " \"(i),(i)->()\" \n''',
+ TD('ld'),
+ ),
+'innerwt' :
+ Ufunc(3, 1, None_,
+ r'''inner1d with a weight argument \n"
+ " \"(i),(i),(i)->()\" \n''',
+ TD('ld'),
+ ),
+}
+
+*/
+
+static PyUFuncGenericFunction inner1d_functions[] = { LONG_inner1d, DOUBLE_inner1d };
+static void *inner1d_data[] = { (void *)NULL, (void *)NULL };
+static char inner1d_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction innerwt_functions[] = { LONG_innerwt, DOUBLE_innerwt };
+static void *innerwt_data[] = { (void *)NULL, (void *)NULL };
+static char innerwt_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction matrix_multiply_functions[] = { LONG_matrix_multiply, FLOAT_matrix_multiply, DOUBLE_matrix_multiply };
+static void *matrix_multiply_data[] = { (void *)NULL, (void *)NULL, (void *)NULL };
+static char matrix_multiply_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction cross1d_functions[] = { LONG_cross1d, DOUBLE_cross1d };
+static void *cross1d_data[] = { (void *)NULL, (void *)NULL };
+static char cross1d_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction euclidean_pdist_functions[] =
+ { FLOAT_euclidean_pdist, DOUBLE_euclidean_pdist };
+static void *euclidean_pdist_data[] = { (void *)NULL, (void *)NULL };
+static char euclidean_pdist_signatures[] = { NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE };
+
+static PyUFuncGenericFunction cumsum_functions[] = { LONG_cumsum, DOUBLE_cumsum };
+static void *cumsum_data[] = { (void *)NULL, (void *)NULL };
+static char cumsum_signatures[] = { NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE };
+
+
+static int
+addUfuncs(PyObject *dictionary) {
+ PyObject *f;
+
+ f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data,
+ inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d",
+ "inner on the last dimension and broadcast on the rest \n"
+ " \"(i),(i)->()\" \n",
+ 0, inner1d_signature);
+ /*
+ * yes, this should not happen, but I (MHvK) just spent an hour looking at
+ * segfaults because I screwed up something that seemed totally unrelated.
+ */
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "inner1d", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(innerwt_functions, innerwt_data,
+ innerwt_signatures, 2, 3, 1, PyUFunc_None, "innerwt",
+ "inner1d with a weight argument \n"
+ " \"(i),(i),(i)->()\" \n",
+ 0, innerwt_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "innerwt", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(matrix_multiply_functions,
+ matrix_multiply_data, matrix_multiply_signatures,
+ 3, 2, 1, PyUFunc_None, "matrix_multiply",
+ "matrix multiplication on last two dimensions \n"
+ " \"(m,n),(n,p)->(m,p)\" \n",
+ 0, matrix_multiply_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "matrix_multiply", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(matrix_multiply_functions,
+ matrix_multiply_data, matrix_multiply_signatures,
+ 3, 2, 1, PyUFunc_None, "matmul",
+ "matmul on last two dimensions, with some being optional\n"
+ " \"(m?,n),(n,p?)->(m?,p?)\" \n",
+ 0, matmul_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "matmul", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(euclidean_pdist_functions,
+                    euclidean_pdist_data, euclidean_pdist_signatures,
+ 2, 1, 1, PyUFunc_None, "euclidean_pdist",
+ "pairwise euclidean distance on last two dimensions \n"
+ " \"(n,d)->(p)\" \n",
+ 0, euclidean_pdist_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "euclidean_pdist", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(cumsum_functions,
+ cumsum_data, cumsum_signatures,
+ 2, 1, 1, PyUFunc_None, "cumsum",
+ "Cumulative sum of the input (n)->(n)\n",
+ 0, cumsum_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "cumsum", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data,
+ inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d_no_doc",
+ NULL,
+ 0, inner1d_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "inner1d_no_doc", f);
+ Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(cross1d_functions, cross1d_data,
+ cross1d_signatures, 2, 2, 1, PyUFunc_None, "cross1d",
+ "cross product on the last dimension and broadcast on the rest \n"\
+ " \"(3),(3)->(3)\" \n",
+ 0, cross1d_signature);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "cross1d", f);
+ Py_DECREF(f);
+
+ return 0;
+}
+
+
+static PyObject *
+UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+ int nin, nout, i;
+ PyObject *signature=NULL, *sig_str=NULL;
+ PyUFuncObject *f=NULL;
+ PyObject *core_num_dims=NULL, *core_dim_ixs=NULL;
+ PyObject *core_dim_flags=NULL, *core_dim_sizes=NULL;
+ int core_enabled;
+ int core_num_ixs = 0;
+
+ if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) {
+ return NULL;
+ }
+
+ if (PyString_Check(signature)) {
+ sig_str = signature;
+ } else if (PyUnicode_Check(signature)) {
+ sig_str = PyUnicode_AsUTF8String(signature);
+ } else {
+ PyErr_SetString(PyExc_ValueError, "signature should be a string");
+ return NULL;
+ }
+
+ f = (PyUFuncObject*)PyUFunc_FromFuncAndDataAndSignature(
+ NULL, NULL, NULL,
+ 0, nin, nout, PyUFunc_None, "no name",
+ "doc:none",
+ 1, PyString_AS_STRING(sig_str));
+ if (sig_str != signature) {
+ Py_DECREF(sig_str);
+ }
+ if (f == NULL) {
+ return NULL;
+ }
+ core_enabled = f->core_enabled;
+ /*
+ * Don't presume core_num_dims and core_dim_ixs are defined;
+ * they currently are even if core_enabled=0, but there's no real
+ * reason they should be. So avoid segfaults if we change our mind.
+ */
+ if (f->core_num_dims != NULL) {
+ core_num_dims = PyTuple_New(f->nargs);
+ if (core_num_dims == NULL) {
+ goto fail;
+ }
+ for (i = 0; i < f->nargs; i++) {
+ PyObject *val = PyLong_FromLong(f->core_num_dims[i]);
+ PyTuple_SET_ITEM(core_num_dims, i, val);
+ core_num_ixs += f->core_num_dims[i];
+ }
+ }
+ else {
+ Py_INCREF(Py_None);
+ core_num_dims = Py_None;
+ }
+ if (f->core_dim_ixs != NULL) {
+ core_dim_ixs = PyTuple_New(core_num_ixs);
+        if (core_dim_ixs == NULL) {
+ goto fail;
+ }
+ for (i = 0; i < core_num_ixs; i++) {
+ PyObject *val = PyLong_FromLong(f->core_dim_ixs[i]);
+ PyTuple_SET_ITEM(core_dim_ixs, i, val);
+ }
+ }
+ else {
+ Py_INCREF(Py_None);
+ core_dim_ixs = Py_None;
+ }
+ if (f->core_dim_flags != NULL) {
+ core_dim_flags = PyTuple_New(f->core_num_dim_ix);
+ if (core_dim_flags == NULL) {
+ goto fail;
+ }
+ for (i = 0; i < f->core_num_dim_ix; i++) {
+ PyObject *val = PyLong_FromLong(f->core_dim_flags[i]);
+ PyTuple_SET_ITEM(core_dim_flags, i, val);
+ }
+ }
+ else {
+ Py_INCREF(Py_None);
+ core_dim_flags = Py_None;
+ }
+ if (f->core_dim_sizes != NULL) {
+ core_dim_sizes = PyTuple_New(f->core_num_dim_ix);
+ if (core_dim_sizes == NULL) {
+ goto fail;
+ }
+ for (i = 0; i < f->core_num_dim_ix; i++) {
+ PyObject *val = PyLong_FromLong(f->core_dim_sizes[i]);
+ PyTuple_SET_ITEM(core_dim_sizes, i, val);
+ }
+ }
+ else {
+ Py_INCREF(Py_None);
+ core_dim_sizes = Py_None;
+ }
+ Py_DECREF(f);
+ return Py_BuildValue("iNNNN", core_enabled, core_num_dims,
+ core_dim_ixs, core_dim_flags, core_dim_sizes);
+
+fail:
+ Py_XDECREF(f);
+ Py_XDECREF(core_num_dims);
+ Py_XDECREF(core_dim_ixs);
+ Py_XDECREF(core_dim_flags);
+ Py_XDECREF(core_dim_sizes);
+ return NULL;
+}
+
+static PyMethodDef UMath_TestsMethods[] = {
+ {"test_signature", UMath_Tests_test_signature, METH_VARARGS,
+ "Test signature parsing of ufunc. \n"
+ "Arguments: nin nout signature \n"
+ "If fails, it returns NULL. Otherwise it returns a tuple of ufunc "
+ "internals. \n",
+ },
+ {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+#if defined(NPY_PY3K)
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_umath_tests",
+ NULL,
+ -1,
+ UMath_TestsMethods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+/* Initialization function for the module */
+#if defined(NPY_PY3K)
+#define RETVAL(x) x
+PyMODINIT_FUNC PyInit__umath_tests(void) {
+#else
+#define RETVAL(x)
+PyMODINIT_FUNC init_umath_tests(void) {
+#endif
+ PyObject *m;
+ PyObject *d;
+ PyObject *version;
+
+#if defined(NPY_PY3K)
+ m = PyModule_Create(&moduledef);
+#else
+ m = Py_InitModule("_umath_tests", UMath_TestsMethods);
+#endif
+ if (m == NULL) {
+ return RETVAL(NULL);
+ }
+
+ import_array();
+ import_ufunc();
+
+ d = PyModule_GetDict(m);
+
+ version = PyString_FromString("0.1");
+ PyDict_SetItemString(d, "__version__", version);
+ Py_DECREF(version);
+
+ /* Load the ufunc operators into the module's namespace */
+ if (addUfuncs(d) < 0) {
+ Py_DECREF(m);
+ PyErr_Print();
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _umath_tests module.");
+ return RETVAL(NULL);
+ }
+
+ return RETVAL(m);
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/funcs.inc.src b/contrib/python/numpy/py2/numpy/core/src/umath/funcs.inc.src
new file mode 100644
index 00000000000..da2ab07f8b3
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/funcs.inc.src
@@ -0,0 +1,432 @@
+/* -*- c -*- */
+
+/*
+ * This file is for the definitions of the non-c99 functions used in ufuncs.
+ * All the complex ufuncs are defined here along with a smattering of real and
+ * object functions.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "npy_pycompat.h"
+#include "npy_import.h"
+
+
+/*
+ *****************************************************************************
+ ** PYTHON OBJECT FUNCTIONS **
+ *****************************************************************************
+ */
+
+static PyObject *
+Py_square(PyObject *o)
+{
+ return PyNumber_Multiply(o, o);
+}
+
+static PyObject *
+Py_get_one(PyObject *NPY_UNUSED(o))
+{
+ return PyInt_FromLong(1);
+}
+
+static PyObject *
+Py_reciprocal(PyObject *o)
+{
+ PyObject *one = PyInt_FromLong(1);
+ PyObject *result;
+
+ if (!one) {
+ return NULL;
+ }
+#if defined(NPY_PY3K)
+ result = PyNumber_TrueDivide(one, o);
+#else
+ result = PyNumber_Divide(one, o);
+#endif
+ Py_DECREF(one);
+ return result;
+}
+
+/*
+ * Define numpy version of PyNumber_Power as binary function.
+ */
+static PyObject *
+npy_ObjectPower(PyObject *x, PyObject *y)
+{
+ return PyNumber_Power(x, y, Py_None);
+}
+
+/**begin repeat
+ * #Kind = Max, Min#
+ * #OP = Py_GE, Py_LE#
+ */
+static PyObject *
+npy_Object@Kind@(PyObject *i1, PyObject *i2)
+{
+ PyObject *result;
+ int cmp;
+
+ cmp = PyObject_RichCompareBool(i1, i2, @OP@);
+ if (cmp < 0) {
+ return NULL;
+ }
+ if (cmp == 1) {
+ result = i1;
+ }
+ else {
+ result = i2;
+ }
+ Py_INCREF(result);
+ return result;
+}
+/**end repeat**/
+
+/* Emulates Python's 'a or b' behavior */
+static PyObject *
+npy_ObjectLogicalOr(PyObject *i1, PyObject *i2)
+{
+ if (i1 == NULL) {
+ Py_XINCREF(i2);
+ return i2;
+ }
+ else if (i2 == NULL) {
+ Py_INCREF(i1);
+ return i1;
+ }
+ else {
+ int retcode = PyObject_IsTrue(i1);
+ if (retcode == -1) {
+ return NULL;
+ }
+ else if (retcode) {
+ Py_INCREF(i1);
+ return i1;
+ }
+ else {
+ Py_INCREF(i2);
+ return i2;
+ }
+ }
+}
+
+/* Emulates Python's 'a and b' behavior */
+static PyObject *
+npy_ObjectLogicalAnd(PyObject *i1, PyObject *i2)
+{
+ if (i1 == NULL) {
+ return NULL;
+ }
+ else if (i2 == NULL) {
+ return NULL;
+ }
+ else {
+ int retcode = PyObject_IsTrue(i1);
+ if (retcode == -1) {
+ return NULL;
+ }
+ else if (!retcode) {
+ Py_INCREF(i1);
+ return i1;
+ }
+ else {
+ Py_INCREF(i2);
+ return i2;
+ }
+ }
+}
+
+
+/* Emulates Python's 'not b' behavior */
+static PyObject *
+npy_ObjectLogicalNot(PyObject *i1)
+{
+ if (i1 == NULL) {
+ return NULL;
+ }
+ else {
+ int retcode = PyObject_Not(i1);
+ if (retcode == -1) {
+ return NULL;
+ }
+ else if (retcode) {
+ Py_INCREF(Py_True);
+ return Py_True;
+ }
+ else {
+ Py_INCREF(Py_False);
+ return Py_False;
+ }
+ }
+}
+
+static PyObject *
+npy_ObjectGCD(PyObject *i1, PyObject *i2)
+{
+ PyObject *gcd = NULL;
+
+ /* use math.gcd if available, and valid on the provided types */
+#if PY_VERSION_HEX >= 0x03050000
+ {
+ static PyObject *math_gcd_func = NULL;
+
+ npy_cache_import("math", "gcd", &math_gcd_func);
+ if (math_gcd_func == NULL) {
+ return NULL;
+ }
+ gcd = PyObject_CallFunction(math_gcd_func, "OO", i1, i2);
+ if (gcd != NULL) {
+ return gcd;
+ }
+ /* silence errors, and fall back on pure-python gcd */
+ PyErr_Clear();
+ }
+#endif
+
+ /* otherwise, use our internal one, written in python */
+ {
+ static PyObject *internal_gcd_func = NULL;
+
+ npy_cache_import("numpy.core._internal", "_gcd", &internal_gcd_func);
+ if (internal_gcd_func == NULL) {
+ return NULL;
+ }
+ gcd = PyObject_CallFunction(internal_gcd_func, "OO", i1, i2);
+ if (gcd == NULL) {
+ return NULL;
+ }
+ /* _gcd has some unusual behaviour regarding sign */
+ return PyNumber_Absolute(gcd);
+ }
+}
+
+static PyObject *
+npy_ObjectLCM(PyObject *i1, PyObject *i2)
+{
+ /* lcm(a, b) = abs(a // gcd(a, b) * b) */
+
+ PyObject *gcd = npy_ObjectGCD(i1, i2);
+ PyObject *tmp;
+ if(gcd == NULL) {
+ return NULL;
+ }
+ /* Floor divide preserves integer types - we know the division will have
+ * no remainder
+ */
+ tmp = PyNumber_FloorDivide(i1, gcd);
+ if(tmp == NULL) {
+ return NULL;
+ }
+
+ tmp = PyNumber_Multiply(tmp, i2);
+ if(tmp == NULL) {
+ return NULL;
+ }
+
+ /* even though we fix gcd to be positive, we need to do it again here */
+ return PyNumber_Absolute(tmp);
+}
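+
+/*
+ * For example, lcm(4, 6): gcd(4, 6) == 2, 4 // 2 == 2, 2 * 6 == 12, and
+ * the final absolute value leaves 12.
+ */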
+
+/*
+ *****************************************************************************
+ ** COMPLEX FUNCTIONS **
+ *****************************************************************************
+ */
+
+
+/*
+ * Don't pass structures between functions (only pointers) because how
+ * structures are passed is compiler dependent and could cause segfaults if
+ * umath_ufunc_object.inc is compiled with a different compiler than an
+ * extension that makes use of the UFUNC API
+ */
+
+/**begin repeat
+ *
+ * #ctype = npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #ftype = npy_float, npy_double, npy_longdouble#
+ * #c = f, ,l#
+ */
+
+static void
+nc_neg@c@(@ctype@ *a, @ctype@ *r)
+{
+ r->real = -a->real;
+ r->imag = -a->imag;
+ return;
+}
+
+static void
+nc_pos@c@(@ctype@ *a, @ctype@ *r)
+{
+ r->real = +a->real;
+ r->imag = +a->imag;
+ return;
+}
+
+static void
+nc_sqrt@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_csqrt@c@(*x);
+ return;
+}
+
+static void
+nc_rint@c@(@ctype@ *x, @ctype@ *r)
+{
+ r->real = npy_rint@c@(x->real);
+ r->imag = npy_rint@c@(x->imag);
+}
+
+static void
+nc_log@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_clog@c@(*x);
+ return;
+}
+
+static void
+nc_log1p@c@(@ctype@ *x, @ctype@ *r)
+{
+ @ftype@ l = npy_hypot@c@(x->real + 1,x->imag);
+ r->imag = npy_atan2@c@(x->imag, x->real + 1);
+ r->real = npy_log@c@(l);
+ return;
+}
+
+static void
+nc_exp@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_cexp@c@(*x);
+ return;
+}
+
+static void
+nc_exp2@c@(@ctype@ *x, @ctype@ *r)
+{
+ @ctype@ a;
+ a.real = x->real*NPY_LOGE2@c@;
+ a.imag = x->imag*NPY_LOGE2@c@;
+ nc_exp@c@(&a, r);
+ return;
+}
+
+static void
+nc_expm1@c@(@ctype@ *x, @ctype@ *r)
+{
+ @ftype@ a = npy_exp@c@(x->real);
+ r->real = a*npy_cos@c@(x->imag) - 1.0@c@;
+ r->imag = a*npy_sin@c@(x->imag);
+ return;
+}
+
+static void
+nc_pow@c@(@ctype@ *a, @ctype@ *b, @ctype@ *r)
+{
+ *r = npy_cpow@c@(*a, *b);
+ return;
+}
+
+static void
+nc_acos@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_cacos@c@(*x);
+ return;
+}
+
+static void
+nc_acosh@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_cacosh@c@(*x);
+ return;
+}
+
+static void
+nc_asin@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_casin@c@(*x);
+ return;
+}
+
+
+static void
+nc_asinh@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_casinh@c@(*x);
+ return;
+}
+
+static void
+nc_atan@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_catan@c@(*x);
+ return;
+}
+
+static void
+nc_atanh@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_catanh@c@(*x);
+ return;
+}
+
+static void
+nc_cos@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_ccos@c@(*x);
+ return;
+}
+
+static void
+nc_cosh@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_ccosh@c@(*x);
+ return;
+}
+
+static void
+nc_log10@c@(@ctype@ *x, @ctype@ *r)
+{
+ nc_log@c@(x, r);
+ r->real *= NPY_LOG10E@c@;
+ r->imag *= NPY_LOG10E@c@;
+ return;
+}
+
+static void
+nc_log2@c@(@ctype@ *x, @ctype@ *r)
+{
+ nc_log@c@(x, r);
+ r->real *= NPY_LOG2E@c@;
+ r->imag *= NPY_LOG2E@c@;
+ return;
+}
+
+static void
+nc_sin@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_csin@c@(*x);
+ return;
+}
+
+static void
+nc_sinh@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_csinh@c@(*x);
+ return;
+}
+
+static void
+nc_tan@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_ctan@c@(*x);
+ return;
+}
+
+static void
+nc_tanh@c@(@ctype@ *x, @ctype@ *r)
+{
+ *r = npy_ctanh@c@(*x);
+ return;
+}
+
+/**end repeat**/
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/loops.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/loops.c.src
new file mode 100644
index 00000000000..975a5e6b833
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/loops.c.src
@@ -0,0 +1,2988 @@
+/* -*- c -*- */
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+
+#include "npy_pycompat.h"
+
+#include "ufunc_object.h"
+
+#include <string.h> /* for memchr */
+
+/*
+ * cutoff blocksize for pairwise summation
+ * decreasing it decreases errors slightly as more pairs are summed, but
+ * also lowers performance; as the inner loop is unrolled eight times, the
+ * cutoff is effectively 16
+ */
+#define PW_BLOCKSIZE 128
+
+
+/*
+ * largest simd vector size in bytes numpy supports
+ * it is currently an extremely large value as it is only used for memory
+ * overlap checks
+ */
+#ifndef NPY_MAX_SIMD_SIZE
+#define NPY_MAX_SIMD_SIZE 1024
+#endif
+
+/*
+ * include vectorized functions and dispatchers
+ * this file is safe to include also for generic builds
+ * platform specific instructions are either masked via the preprocessor or
+ * runtime detected
+ */
+#include "simd.inc"
+
+
+/*
+ *****************************************************************************
+ ** UFUNC LOOPS **
+ *****************************************************************************
+ */
+
+/* unary loop input and output contiguous */
+#define IS_UNARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
+ steps[1] == sizeof(tout))
+
+#define IS_BINARY_REDUCE ((args[0] == args[2])\
+ && (steps[0] == steps[2])\
+ && (steps[0] == 0))
+
+/* binary loop input and output contiguous */
+#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
+ steps[1] == sizeof(tin) && \
+ steps[2] == sizeof(tout))
+/* binary loop input and output contiguous with first scalar */
+#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
+ steps[1] == sizeof(tin) && \
+ steps[2] == sizeof(tout))
+/* binary loop input and output contiguous with second scalar */
+#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
+ steps[1] == 0 && \
+ steps[2] == sizeof(tout))
+
+#define OUTPUT_LOOP\
+ char *op1 = args[1];\
+ npy_intp os1 = steps[1];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ for(i = 0; i < n; i++, op1 += os1)
+
+#define UNARY_LOOP\
+ char *ip1 = args[0], *op1 = args[1];\
+ npy_intp is1 = steps[0], os1 = steps[1];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ for(i = 0; i < n; i++, ip1 += is1, op1 += os1)
+
+/*
+ * loop with contiguous specialization
+ * op should be the code working on `tin in` and
+ * storing the result in `tout * out`
+ * combine with NPY_GCC_OPT_3 to allow autovectorization
+ * should only be used where it's worthwhile to avoid code bloat
+ */
+#define BASE_UNARY_LOOP(tin, tout, op) \
+ UNARY_LOOP { \
+ const tin in = *(tin *)ip1; \
+ tout * out = (tout *)op1; \
+ op; \
+ }
+#define UNARY_LOOP_FAST(tin, tout, op) \
+ do { \
+ /* condition allows compiler to optimize the generic macro */ \
+ if (IS_UNARY_CONT(tin, tout)) { \
+ if (args[0] == args[1]) { \
+ BASE_UNARY_LOOP(tin, tout, op) \
+ } \
+ else { \
+ BASE_UNARY_LOOP(tin, tout, op) \
+ } \
+ } \
+ else { \
+ BASE_UNARY_LOOP(tin, tout, op) \
+ } \
+ } \
+ while (0)
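+
+/*
+ * A typical instantiation (see the integer loops below) looks like
+ *     UNARY_LOOP_FAST(npy_int, npy_int, *out = -in);
+ * the three identical BASE_UNARY_LOOP branches exist only so the compiler
+ * can emit specialized code for the contiguous and in-place cases.
+ */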
+
+#define UNARY_LOOP_TWO_OUT\
+ char *ip1 = args[0], *op1 = args[1], *op2 = args[2];\
+ npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
+
+#define BINARY_LOOP\
+ char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
+ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
+
+/*
+ * loop with contiguous specialization
+ * op should be the code working on `tin in1`, `tin in2` and
+ * storing the result in `tout * out`
+ * combine with NPY_GCC_OPT_3 to allow autovectorization
+ * should only be used where it's worthwhile to avoid code bloat
+ */
+#define BASE_BINARY_LOOP(tin, tout, op) \
+ BINARY_LOOP { \
+ const tin in1 = *(tin *)ip1; \
+ const tin in2 = *(tin *)ip2; \
+ tout * out = (tout *)op1; \
+ op; \
+ }
+/*
+ * unfortunately gcc 6/7 regressed and we need to give it additional hints to
+ * vectorize inplace operations (PR80198)
+ * must only be used after it has been checked that op1 == ip1 or op1 == ip2
+ * TODO: using ivdep might allow other compilers to vectorize too
+ */
+#if __GNUC__ >= 6
+#define IVDEP_LOOP _Pragma("GCC ivdep")
+#else
+#define IVDEP_LOOP
+#endif
+#define BASE_BINARY_LOOP_INP(tin, tout, op) \
+ char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
+ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ IVDEP_LOOP \
+ for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
+ const tin in1 = *(tin *)ip1; \
+ const tin in2 = *(tin *)ip2; \
+ tout * out = (tout *)op1; \
+ op; \
+ }
+#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
+ const tin cin = *(tin *)cinp; \
+ BINARY_LOOP { \
+ const tin vin = *(tin *)vinp; \
+ tout * out = (tout *)op1; \
+ op; \
+ }
+/* PR80198 again, scalar works without the pragma */
+#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
+ const tin cin = *(tin *)cinp; \
+ BINARY_LOOP { \
+ const tin vin = *(tin *)vinp; \
+ tout * out = (tout *)vinp; \
+ op; \
+ }
+#define BINARY_LOOP_FAST(tin, tout, op) \
+ do { \
+ /* condition allows compiler to optimize the generic macro */ \
+ if (IS_BINARY_CONT(tin, tout)) { \
+ if (abs_ptrdiff(args[2], args[0]) == 0 && \
+ abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+ BASE_BINARY_LOOP_INP(tin, tout, op) \
+ } \
+ else if (abs_ptrdiff(args[2], args[1]) == 0 && \
+ abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+ BASE_BINARY_LOOP_INP(tin, tout, op) \
+ } \
+ else { \
+ BASE_BINARY_LOOP(tin, tout, op) \
+ } \
+ } \
+ else if (IS_BINARY_CONT_S1(tin, tout)) { \
+ if (abs_ptrdiff(args[2], args[1]) == 0) { \
+ BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
+ } \
+ else { \
+ BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+ } \
+ } \
+ else if (IS_BINARY_CONT_S2(tin, tout)) { \
+ if (abs_ptrdiff(args[2], args[0]) == 0) { \
+ BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
+ } \
+ else { \
+ BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+ }\
+ } \
+ else { \
+ BASE_BINARY_LOOP(tin, tout, op) \
+ } \
+ } \
+ while (0)
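+
+/*
+ * A typical instantiation (see the integer arithmetic loops below) is
+ *     BINARY_LOOP_FAST(npy_int, npy_int, *out = in1 + in2);
+ * the branches merely select a contiguous, in-place, scalar-first or
+ * scalar-second specialization of the same body so it can be
+ * autovectorized.
+ */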
+
+#define BINARY_REDUCE_LOOP_INNER\
+ char *ip2 = args[1]; \
+ npy_intp is2 = steps[1]; \
+ npy_intp n = dimensions[0]; \
+ npy_intp i; \
+ for(i = 0; i < n; i++, ip2 += is2)
+
+#define BINARY_REDUCE_LOOP(TYPE)\
+ char *iop1 = args[0]; \
+ TYPE io1 = *(TYPE *)iop1; \
+ BINARY_REDUCE_LOOP_INNER
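+
+/*
+ * In a reduction the accumulator is passed as both args[0] and args[2]
+ * with a zero output stride (see IS_BINARY_REDUCE above), so a sum
+ * reduction can be written as
+ *     BINARY_REDUCE_LOOP(npy_int) {
+ *         io1 += *(npy_int *)ip2;
+ *     }
+ *     *((npy_int *)iop1) = io1;
+ */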
+
+#define BINARY_LOOP_TWO_OUT\
+ char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
+ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
+
+/******************************************************************************
+ ** GENERIC FLOAT LOOPS **
+ *****************************************************************************/
+
+
+typedef float halfUnaryFunc(npy_half x);
+typedef float floatUnaryFunc(float x);
+typedef double doubleUnaryFunc(double x);
+typedef npy_longdouble longdoubleUnaryFunc(npy_longdouble x);
+typedef npy_half halfBinaryFunc(npy_half x, npy_half y);
+typedef float floatBinaryFunc(float x, float y);
+typedef double doubleBinaryFunc(double x, double y);
+typedef npy_longdouble longdoubleBinaryFunc(npy_longdouble x, npy_longdouble y);
+
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_e_e(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ halfUnaryFunc *f = (halfUnaryFunc *)func;
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *(npy_half *)op1 = f(in1);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_e_e_As_f_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ floatUnaryFunc *f = (floatUnaryFunc *)func;
+ UNARY_LOOP {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ *(npy_half *)op1 = npy_float_to_half(f(in1));
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_e_e_As_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ doubleUnaryFunc *f = (doubleUnaryFunc *)func;
+ UNARY_LOOP {
+ const double in1 = npy_half_to_double(*(npy_half *)ip1);
+ *(npy_half *)op1 = npy_double_to_half(f(in1));
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_f_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ floatUnaryFunc *f = (floatUnaryFunc *)func;
+ UNARY_LOOP {
+ const float in1 = *(float *)ip1;
+ *(float *)op1 = f(in1);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_f_f_As_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ doubleUnaryFunc *f = (doubleUnaryFunc *)func;
+ UNARY_LOOP {
+ const float in1 = *(float *)ip1;
+ *(float *)op1 = (float)f((double)in1);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ee_e(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ halfBinaryFunc *f = (halfBinaryFunc *)func;
+ BINARY_LOOP {
+ npy_half in1 = *(npy_half *)ip1;
+ npy_half in2 = *(npy_half *)ip2;
+ *(npy_half *)op1 = f(in1, in2);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ee_e_As_ff_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ floatBinaryFunc *f = (floatBinaryFunc *)func;
+ BINARY_LOOP {
+ float in1 = npy_half_to_float(*(npy_half *)ip1);
+ float in2 = npy_half_to_float(*(npy_half *)ip2);
+ *(npy_half *)op1 = npy_float_to_half(f(in1, in2));
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ee_e_As_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ doubleBinaryFunc *f = (doubleBinaryFunc *)func;
+ BINARY_LOOP {
+ double in1 = npy_half_to_double(*(npy_half *)ip1);
+ double in2 = npy_half_to_double(*(npy_half *)ip2);
+ *(npy_half *)op1 = npy_double_to_half(f(in1, in2));
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ff_f(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ floatBinaryFunc *f = (floatBinaryFunc *)func;
+ BINARY_LOOP {
+ float in1 = *(float *)ip1;
+ float in2 = *(float *)ip2;
+ *(float *)op1 = f(in1, in2);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ff_f_As_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ doubleBinaryFunc *f = (doubleBinaryFunc *)func;
+ BINARY_LOOP {
+ float in1 = *(float *)ip1;
+ float in2 = *(float *)ip2;
+ *(float *)op1 = (double)f((double)in1, (double)in2);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ doubleUnaryFunc *f = (doubleUnaryFunc *)func;
+ UNARY_LOOP {
+ double in1 = *(double *)ip1;
+ *(double *)op1 = f(in1);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ doubleBinaryFunc *f = (doubleBinaryFunc *)func;
+ BINARY_LOOP {
+ double in1 = *(double *)ip1;
+ double in2 = *(double *)ip2;
+ *(double *)op1 = f(in1, in2);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_g_g(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ longdoubleUnaryFunc *f = (longdoubleUnaryFunc *)func;
+ UNARY_LOOP {
+ npy_longdouble in1 = *(npy_longdouble *)ip1;
+ *(npy_longdouble *)op1 = f(in1);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_gg_g(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ longdoubleBinaryFunc *f = (longdoubleBinaryFunc *)func;
+ BINARY_LOOP {
+ npy_longdouble in1 = *(npy_longdouble *)ip1;
+ npy_longdouble in2 = *(npy_longdouble *)ip2;
+ *(npy_longdouble *)op1 = f(in1, in2);
+ }
+}
+
+
+
+/******************************************************************************
+ ** GENERIC COMPLEX LOOPS **
+ *****************************************************************************/
+
+
+typedef void cdoubleUnaryFunc(npy_cdouble *x, npy_cdouble *r);
+typedef void cfloatUnaryFunc(npy_cfloat *x, npy_cfloat *r);
+typedef void clongdoubleUnaryFunc(npy_clongdouble *x, npy_clongdouble *r);
+typedef void cdoubleBinaryFunc(npy_cdouble *x, npy_cdouble *y, npy_cdouble *r);
+typedef void cfloatBinaryFunc(npy_cfloat *x, npy_cfloat *y, npy_cfloat *r);
+typedef void clongdoubleBinaryFunc(npy_clongdouble *x, npy_clongdouble *y,
+ npy_clongdouble *r);
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_F_F(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ cfloatUnaryFunc *f = (cfloatUnaryFunc *)func;
+ UNARY_LOOP {
+ npy_cfloat in1 = *(npy_cfloat *)ip1;
+ npy_cfloat *out = (npy_cfloat *)op1;
+ f(&in1, out);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_F_F_As_D_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ cdoubleUnaryFunc *f = (cdoubleUnaryFunc *)func;
+ UNARY_LOOP {
+ npy_cdouble tmp, out;
+ tmp.real = (double)((float *)ip1)[0];
+ tmp.imag = (double)((float *)ip1)[1];
+ f(&tmp, &out);
+ ((float *)op1)[0] = (float)out.real;
+ ((float *)op1)[1] = (float)out.imag;
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_FF_F(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ cfloatBinaryFunc *f = (cfloatBinaryFunc *)func;
+ BINARY_LOOP {
+ npy_cfloat in1 = *(npy_cfloat *)ip1;
+ npy_cfloat in2 = *(npy_cfloat *)ip2;
+ npy_cfloat *out = (npy_cfloat *)op1;
+ f(&in1, &in2, out);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_FF_F_As_DD_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ cdoubleBinaryFunc *f = (cdoubleBinaryFunc *)func;
+ BINARY_LOOP {
+ npy_cdouble tmp1, tmp2, out;
+ tmp1.real = (double)((float *)ip1)[0];
+ tmp1.imag = (double)((float *)ip1)[1];
+ tmp2.real = (double)((float *)ip2)[0];
+ tmp2.imag = (double)((float *)ip2)[1];
+ f(&tmp1, &tmp2, &out);
+ ((float *)op1)[0] = (float)out.real;
+ ((float *)op1)[1] = (float)out.imag;
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_D_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ cdoubleUnaryFunc *f = (cdoubleUnaryFunc *)func;
+ UNARY_LOOP {
+ npy_cdouble in1 = *(npy_cdouble *)ip1;
+ npy_cdouble *out = (npy_cdouble *)op1;
+ f(&in1, out);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_DD_D(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ cdoubleBinaryFunc *f = (cdoubleBinaryFunc *)func;
+ BINARY_LOOP {
+ npy_cdouble in1 = *(npy_cdouble *)ip1;
+ npy_cdouble in2 = *(npy_cdouble *)ip2;
+ npy_cdouble *out = (npy_cdouble *)op1;
+ f(&in1, &in2, out);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_G_G(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ clongdoubleUnaryFunc *f = (clongdoubleUnaryFunc *)func;
+ UNARY_LOOP {
+ npy_clongdouble in1 = *(npy_clongdouble *)ip1;
+ npy_clongdouble *out = (npy_clongdouble *)op1;
+ f(&in1, out);
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_GG_G(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ clongdoubleBinaryFunc *f = (clongdoubleBinaryFunc *)func;
+ BINARY_LOOP {
+ npy_clongdouble in1 = *(npy_clongdouble *)ip1;
+ npy_clongdouble in2 = *(npy_clongdouble *)ip2;
+ npy_clongdouble *out = (npy_clongdouble *)op1;
+ f(&in1, &in2, out);
+ }
+}
+
+
+/******************************************************************************
+ **                             GENERIC OBJECT LOOPS                       **
+ *****************************************************************************/
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_O_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ unaryfunc f = (unaryfunc)func;
+ UNARY_LOOP {
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject **out = (PyObject **)op1;
+ PyObject *ret = f(in1 ? in1 : Py_None);
+ if (ret == NULL) {
+ return;
+ }
+ Py_XDECREF(*out);
+ *out = ret;
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_O_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ char *meth = (char *)func;
+ UNARY_LOOP {
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject **out = (PyObject **)op1;
+ PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None, meth, NULL);
+ if (ret == NULL) {
+ return;
+ }
+ Py_XDECREF(*out);
+ *out = ret;
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_OO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ binaryfunc f = (binaryfunc)func;
+ BINARY_LOOP {
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject *in2 = *(PyObject **)ip2;
+ PyObject **out = (PyObject **)op1;
+ PyObject *ret = f(in1 ? in1 : Py_None, in2 ? in2 : Py_None);
+ if (ret == NULL) {
+ return;
+ }
+ Py_XDECREF(*out);
+ *out = ret;
+ }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_OO_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ char *meth = (char *)func;
+ BINARY_LOOP {
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject *in2 = *(PyObject **)ip2;
+ PyObject **out = (PyObject **)op1;
+ PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None,
+ meth, "(O)", in2);
+ if (ret == NULL) {
+ return;
+ }
+ Py_XDECREF(*out);
+ *out = ret;
+ }
+}
+
+/*
+ * A general-purpose ufunc that deals with a general-purpose Python callable.
+ * func is a structure with nin, nout, and a Python callable function
+ */
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_On_Om(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ npy_intp n = dimensions[0];
+ PyUFunc_PyFuncData *data = (PyUFunc_PyFuncData *)func;
+ int nin = data->nin;
+ int nout = data->nout;
+ PyObject *tocall = data->callable;
+ char *ptrs[NPY_MAXARGS];
+ PyObject *arglist, *result;
+ PyObject *in, **op;
+ npy_intp i, j, ntot;
+
+ ntot = nin+nout;
+
+ for(j = 0; j < ntot; j++) {
+ ptrs[j] = args[j];
+ }
+ for(i = 0; i < n; i++) {
+ arglist = PyTuple_New(nin);
+ if (arglist == NULL) {
+ return;
+ }
+ for(j = 0; j < nin; j++) {
+ in = *((PyObject **)ptrs[j]);
+ if (in == NULL) {
+ in = Py_None;
+ }
+ PyTuple_SET_ITEM(arglist, j, in);
+ Py_INCREF(in);
+ }
+ result = PyEval_CallObject(tocall, arglist);
+ Py_DECREF(arglist);
+ if (result == NULL) {
+ return;
+ }
+ if (nout == 0 && result == Py_None) {
+ /* No output expected, no output received, continue */
+ Py_DECREF(result);
+ }
+ else if (nout == 1) {
+ /* Single output expected, assign and continue */
+ op = (PyObject **)ptrs[nin];
+ Py_XDECREF(*op);
+ *op = result;
+ }
+ else if (PyTuple_Check(result) && nout == PyTuple_Size(result)) {
+ /*
+ * Multiple returns match expected number of outputs, assign
+ * and continue. Will also gobble empty tuples if nout == 0.
+ */
+ for(j = 0; j < nout; j++) {
+ op = (PyObject **)ptrs[j+nin];
+ Py_XDECREF(*op);
+ *op = PyTuple_GET_ITEM(result, j);
+ Py_INCREF(*op);
+ }
+ Py_DECREF(result);
+ }
+ else {
+ /* Mismatch between returns and expected outputs, exit */
+ Py_DECREF(result);
+ return;
+ }
+ for(j = 0; j < ntot; j++) {
+ ptrs[j] += steps[j];
+ }
+ }
+}
+
+/*
+ *****************************************************************************
+ ** BOOLEAN LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
+ * #OP = ==, !=, >, >=, <, <=#
+ **/
+
+NPY_NO_EXPORT void
+BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ npy_bool in1 = *((npy_bool *)ip1) != 0;
+ npy_bool in2 = *((npy_bool *)ip2) != 0;
+ *((npy_bool *)op1)= in1 @OP@ in2;
+ }
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ * #SC = ==, !=#
+ * #and = 1, 0#
+ **/
+
+NPY_NO_EXPORT void
+BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if(IS_BINARY_REDUCE) {
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+ /*
+     * stick with our variant for more reliable performance; the only known
+     * platform where libc outperforms it by ~20% is an i7 with glibc 2.17
+ */
+ if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+#else
+ /* for now only use libc on 32-bit/non-x86 */
+ if (steps[1] == 1) {
+ npy_bool * op = (npy_bool *)args[0];
+#if @and@
+ /* np.all(), search for a zero (false) */
+ if (*op) {
+ *op = memchr(args[1], 0, dimensions[0]) == NULL;
+ }
+#else
+ /*
+             * np.any(): search for a non-zero (true) value by comparing
+             * against blocks of zeros; memcmp is faster than memchr on SSE4
+             * machines with glibc >= 2.12, and memchr can only search for a
+             * specific byte value such as 1
+ */
+ static const npy_bool zero[4096]; /* zero by C standard */
+ npy_uintp i, n = dimensions[0];
+
+ for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+ *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+ }
+ if (!*op && n - i > 0) {
+ *op = memcmp(&args[1][i], zero, n - i) != 0;
+ }
+#endif
+ return;
+ }
+#endif
+ else {
+ BINARY_REDUCE_LOOP(npy_bool) {
+ const npy_bool in2 = *(npy_bool *)ip2;
+ io1 = io1 @OP@ in2;
+ if (io1 @SC@ 0) {
+ break;
+ }
+ }
+ *((npy_bool *)iop1) = io1;
+ }
+ }
+ else {
+ if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+ else {
+ BINARY_LOOP {
+ const npy_bool in1 = *(npy_bool *)ip1;
+ const npy_bool in2 = *(npy_bool *)ip2;
+ *((npy_bool *)op1) = in1 @OP@ in2;
+ }
+ }
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = absolute, logical_not#
+ * #OP = !=, ==#
+ **/
+NPY_NO_EXPORT void
+BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+ else {
+ UNARY_LOOP {
+ npy_bool in1 = *(npy_bool *)ip1;
+ *((npy_bool *)op1) = in1 @OP@ 0;
+ }
+ }
+}
+/**end repeat**/
+
+NPY_NO_EXPORT void
+BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ OUTPUT_LOOP {
+ *((npy_bool *)op1) = 1;
+ }
+}
+
+
+/*
+ *****************************************************************************
+ ** INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
+ * npy_double, npy_double, npy_double, npy_double#
+ * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ */
+
+#define @TYPE@_floor_divide @TYPE@_divide
+#define @TYPE@_fmax @TYPE@_maximum
+#define @TYPE@_fmin @TYPE@_minimum
+
+NPY_NO_EXPORT void
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ OUTPUT_LOOP {
+ *((@type@ *)op1) = 1;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = +in);
+}
+
+/**begin repeat1
+ * #isa = , _avx2#
+ * #ISA = , AVX2#
+ * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2#
+ * #ATTR = , NPY_GCC_TARGET_AVX2#
+ */
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_square@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+#endif
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_reciprocal@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+#endif
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_conjugate@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+#endif
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_negative@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = -in);
+}
+#endif
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_logical_not@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+#endif
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_invert@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+#endif
+
+/**begin repeat2
+ * Arithmetic
+ * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ * left_shift, right_shift#
+ * #OP = +, -,*, &, |, ^, <<, >>#
+ */
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if(IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP(@type@) {
+ io1 @OP@= *(@type@ *)ip2;
+ }
+ *((@type@ *)iop1) = io1;
+ }
+ else {
+ BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+ }
+}
+#endif
+
+/**end repeat2**/
+
+/**begin repeat2
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
+ * logical_and, logical_or#
+ * #OP = ==, !=, >, >=, <, <=, &&, ||#
+ */
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * gcc vectorization of this is not good (PR60575) but manual integer
+ * vectorization is too tedious to be worthwhile
+ */
+ BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+#endif
+
+/**end repeat2**/
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+@TYPE@_logical_xor@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const int t1 = !!*(@type@ *)ip1;
+ const int t2 = !!*(@type@ *)ip2;
+ *((npy_bool *)op1) = (t1 != t2);
+ }
+}
+#endif
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = >, <#
+ **/
+
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP(@type@) {
+ const @type@ in2 = *(@type@ *)ip2;
+ io1 = (io1 @OP@ in2) ? io1 : in2;
+ }
+ *((@type@ *)iop1) = io1;
+ }
+ else {
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
+ }
+ }
+}
+
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_power(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ @type@ in1 = *(@type@ *)ip1;
+ @type@ in2 = *(@type@ *)ip2;
+ @type@ out;
+
+#if @SIGNED@
+ if (in2 < 0) {
+ NPY_ALLOW_C_API_DEF
+ NPY_ALLOW_C_API;
+ PyErr_SetString(PyExc_ValueError,
+ "Integers to negative integer powers are not allowed.");
+ NPY_DISABLE_C_API;
+ return;
+ }
+#endif
+ if (in2 == 0) {
+ *((@type@ *)op1) = 1;
+ continue;
+ }
+ if (in1 == 1) {
+ *((@type@ *)op1) = 1;
+ continue;
+ }
+
+ out = in2 & 1 ? in1 : 1;
+ in2 >>= 1;
+ while (in2 > 0) {
+ in1 *= in1;
+ if (in2 & 1) {
+ out *= in1;
+ }
+ in2 >>= 1;
+ }
+ *((@type@ *) op1) = out;
+ }
+}
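+
+/*
+ * The loop above is exponentiation by squaring; e.g. for in1 = 3 and
+ * in2 = 5, out starts at 3 (low exponent bit set), in1 becomes 9 (next
+ * bit clear), then in1 becomes 81 and out becomes 3 * 81 = 243 == 3**5.
+ */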
+
+NPY_NO_EXPORT void
+@TYPE@_fmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1)= in1 % in2;
+ }
+
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #c = ,,,l,ll#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+NPY_NO_EXPORT void
+@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /*
+ * FIXME: On x86 at least, dividing the smallest representable integer
+         * by -1 causes a SIGFPE (division overflow). We treat this case here
+ * (to avoid a SIGFPE crash at python level), but a good solution would
+ * be to treat integer division problems separately from FPU exceptions
+ * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+ */
+ if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+ *((@type@ *)op1) = in1/in2 - 1;
+ }
+ else {
+ *((@type@ *)op1) = in1/in2;
+ }
+ }
+}
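+
+/*
+ * The adjustment above turns C's truncating division into Python's floor
+ * division; e.g. in1 = -7, in2 = 2: the signs differ and -7 % 2 != 0, so
+ * the result is -7/2 - 1 == -3 - 1 == -4, matching -7 // 2 in Python.
+ */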
+
+NPY_NO_EXPORT void
+@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ /* handle mixed case the way Python does */
+ const @type@ rem = in1 % in2;
+ if ((in1 > 0) == (in2 > 0) || rem == 0) {
+ *((@type@ *)op1) = rem;
+ }
+ else {
+ *((@type@ *)op1) = rem + in2;
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_TWO_OUT {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /* see FIXME note for divide above */
+ if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ *((@type@ *)op2) = 0;
+ }
+ else {
+ /* handle mixed case the way Python does */
+ const @type@ quo = in1 / in2;
+ const @type@ rem = in1 % in2;
+ if ((in1 > 0) == (in2 > 0) || rem == 0) {
+ *((@type@ *)op1) = quo;
+ *((@type@ *)op2) = rem;
+ }
+ else {
+ *((@type@ *)op1) = quo - 1;
+ *((@type@ *)op2) = rem + in2;
+ }
+ }
+ }
+}
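+
+/*
+ * For example, in1 = -7, in2 = 3: C gives quo == -2 and rem == -1; the
+ * signs differ and rem != 0, so the stored result is (quo - 1, rem + in2)
+ * == (-3, 2), matching Python's divmod(-7, 3).
+ */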
+
+/**begin repeat1
+ * #kind = gcd, lcm#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1) = npy_@kind@@c@(in1, in2);
+ }
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #c = u,u,u,ul,ull#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = in1;
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
+}
+
+NPY_NO_EXPORT void
+@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1)= in1/in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1) = in1 % in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_TWO_OUT {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ *((@type@ *)op2) = 0;
+ }
+ else {
+ *((@type@ *)op1)= in1/in2;
+ *((@type@ *)op2) = in1 % in2;
+ }
+ }
+}
+
+/**begin repeat1
+ * #kind = gcd, lcm#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1) = npy_@kind@@c@(in1, in2);
+ }
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** DATETIME LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+TIMEDELTA_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = -in1;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ *((npy_timedelta *)op1) = +in1;
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = (in1 >= 0) ? in1 : -in1;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ *((npy_timedelta *)op1) = in1 > 0 ? 1 : (in1 < 0 ? -1 : 0);
+ }
+}
+
+/**begin repeat
+ * #type = npy_datetime, npy_timedelta#
+ * #TYPE = DATETIME, TIMEDELTA#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_isnat(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = (in1 == NPY_DATETIME_NAT);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ OUTPUT_LOOP {
+ *((@type@ *)op1) = 1;
+ }
+}
+
+/**begin repeat1
+ * #kind = equal, greater, greater_equal, less, less_equal#
+ * #OP = ==, >, >=, <, <=#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((npy_bool *)op1) = (in1 @OP@ in2 &&
+ in1 != NPY_DATETIME_NAT &&
+ in2 != NPY_DATETIME_NAT);
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((npy_bool *)op1) = (in1 != in2 ||
+ in1 == NPY_DATETIME_NAT ||
+ in2 == NPY_DATETIME_NAT);
+ }
+}
+
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = >, <#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((@type@ *)op1) = in2;
+ }
+ else if (in2 == NPY_DATETIME_NAT) {
+ *((@type@ *)op1) = in1;
+ }
+ else {
+ *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
+ }
+ }
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ BINARY_LOOP {
+ const npy_datetime in1 = *(npy_datetime *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_datetime *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_datetime *)op1) = in1 + in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+DATETIME_mM_M_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_datetime in2 = *(npy_datetime *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_datetime *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_datetime *)op1) = in1 + in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 + in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_datetime in1 = *(npy_datetime *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_datetime *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_datetime *)op1) = in1 - in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+DATETIME_MM_m_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_datetime in1 = *(npy_datetime *)ip1;
+ const npy_datetime in2 = *(npy_datetime *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 - in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 - in2;
+ }
+ }
+}
+
+/* Note: Assuming 'q' == NPY_LONGLONG */
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_int64 in2 = *(npy_int64 *)ip2;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 * in2;
+ }
+ }
+}
+
+/* Note: Assuming 'q' == NPY_LONGLONG */
+NPY_NO_EXPORT void
+TIMEDELTA_qm_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_int64 in1 = *(npy_int64 *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in2 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 * in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const double in2 = *(double *)ip2;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ double result = in1 * in2;
+ if (npy_isfinite(result)) {
+ *((npy_timedelta *)op1) = (npy_timedelta)result;
+ }
+ else {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_dm_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const double in1 = *(double *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in2 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ double result = in1 * in2;
+ if (npy_isfinite(result)) {
+ *((npy_timedelta *)op1) = (npy_timedelta)result;
+ }
+ else {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ }
+ }
+}
+
+/* Note: Assuming 'q' == NPY_LONGLONG */
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_int64 in2 = *(npy_int64 *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == 0) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 / in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const double in2 = *(double *)ip2;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ double result = in1 / in2;
+ if (npy_isfinite(result)) {
+ *((npy_timedelta *)op1) = (npy_timedelta)result;
+ }
+ else {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((double *)op1) = NPY_NAN;
+ }
+ else {
+ *((double *)op1) = (double)in1 / (double)in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+                /* handle the mixed-sign case the way Python does */
+ const npy_timedelta rem = in1 % in2;
+ if ((in1 > 0) == (in2 > 0) || rem == 0) {
+ *((npy_timedelta *)op1) = rem;
+ }
+ else {
+ *((npy_timedelta *)op1) = rem + in2;
+ }
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ *((npy_int64 *)op1) = 0;
+ }
+ else if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((npy_int64 *)op1) = 0;
+ }
+ else {
+ if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+ *((npy_int64 *)op1) = in1/in2 - 1;
+ }
+ else {
+ *((npy_int64 *)op1) = in1/in2;
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_qm_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_TWO_OUT {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ *((npy_int64 *)op1) = 0;
+ *((npy_timedelta *)op2) = NPY_DATETIME_NAT;
+ }
+ else if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((npy_int64 *)op1) = 0;
+ *((npy_timedelta *)op2) = NPY_DATETIME_NAT;
+ }
+ else {
+ const npy_int64 quo = in1 / in2;
+ const npy_timedelta rem = in1 % in2;
+ if ((in1 > 0) == (in2 > 0) || rem == 0) {
+ *((npy_int64 *)op1) = quo;
+ *((npy_timedelta *)op2) = rem;
+ }
+ else {
+ *((npy_int64 *)op1) = quo - 1;
+ *((npy_timedelta *)op2) = rem + in2;
+ }
+ }
+ }
+}
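+
+/*
+ * The three loops above all follow Python's floored-division convention:
+ * the remainder takes the sign of the divisor and the quotient is rounded
+ * toward negative infinity, whereas C's native % and / truncate toward
+ * zero.  A minimal scalar sketch of the same adjustment (the helper name
+ * is invented here purely for exposition):
+ *
+ *     static npy_int64
+ *     floored_quotient(npy_int64 a, npy_int64 b)
+ *     {
+ *         npy_int64 q = a / b;               // C truncates toward zero
+ *         if ((a % b != 0) && ((a > 0) != (b > 0))) {
+ *             q -= 1;                        // round down for mixed signs
+ *         }
+ *         return q;
+ *     }
+ *
+ * For example, -7 / 3 is -2 in C but floored_quotient(-7, 3) is -3, and
+ * the matching remainder is -7 - (-3)*3 = 2, which is what the remainder
+ * loop above produces via "rem + in2".
+ */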
+
+/*
+ *****************************************************************************
+ ** FLOAT LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = @scalarf@(in1);
+ }
+ }
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble, npy_float#
+ * #dtype = npy_float, npy_double, npy_longdouble, npy_half#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF#
+ * #c = f, , l, #
+ * #C = F, , L, #
+ * #trf = , , , npy_half_to_float#
+ */
+
+/*
+ * Pairwise summation, rounding error O(lg n) instead of O(n).
+ * The recursion depth is O(lg n) as well.
+ * When updating this, also update the similar complex float summation below.
+ */
+static @type@
+pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride)
+{
+ if (n < 8) {
+ npy_intp i;
+ @type@ res = 0.;
+
+ for (i = 0; i < n; i++) {
+ res += @trf@(*((@dtype@*)(a + i * stride)));
+ }
+ return res;
+ }
+ else if (n <= PW_BLOCKSIZE) {
+ npy_intp i;
+ @type@ r[8], res;
+
+ /*
+ * sum a block with 8 accumulators
+         * an 8-way unroll reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering
+ */
+ r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+ r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+ r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+ r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+ r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+ r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+ r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+ r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
+
+ for (i = 8; i < n - (n % 8); i += 8) {
+            /* small block sizes seem to mess with hardware prefetch */
+ NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3);
+ r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+ r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+ r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+ r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+ r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+ r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+ r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+ r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
+ }
+
+ /* accumulate now to avoid stack spills for single peel loop */
+ res = ((r[0] + r[1]) + (r[2] + r[3])) +
+ ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* handle the remainder that is not a multiple of 8 */
+ for (; i < n; i++) {
+ res += @trf@(*((@dtype@ *)(a + i * stride)));
+ }
+ return res;
+ }
+ else {
+ /* divide by two but avoid non-multiples of unroll factor */
+ npy_intp n2 = n / 2;
+
+ n2 -= n2 % 8;
+ return pairwise_sum_@TYPE@(a, n2, stride) +
+ pairwise_sum_@TYPE@(a + n2 * stride, n - n2, stride);
+ }
+}
+
+/**end repeat**/
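+
+/*
+ * For readers unfamiliar with the technique: pairwise (cascade) summation
+ * keeps the two operands of every addition at a similar magnitude, so the
+ * worst-case rounding error grows with the recursion depth, O(lg n),
+ * rather than with the number of terms as in a naive left-to-right sum.
+ * A bare-bones version of the same idea, without the blocking and
+ * unrolling used above (illustrative only, not part of this file):
+ *
+ *     static double
+ *     pairwise_sum_simple(const double *a, npy_intp n)
+ *     {
+ *         if (n <= 8) {
+ *             double s = 0.0;
+ *             npy_intp i;
+ *             for (i = 0; i < n; i++) {
+ *                 s += a[i];
+ *             }
+ *             return s;
+ *         }
+ *         return pairwise_sum_simple(a, n / 2) +
+ *                pairwise_sum_simple(a + n / 2, n - n / 2);
+ *     }
+ *
+ * The production code above additionally blocks the recursion at
+ * PW_BLOCKSIZE and unrolls by 8 so the compiler can vectorize the inner
+ * loop without changing the summation order.
+ */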
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #c = f, , l#
+ * #C = F, , L#
+ */
+
+/**begin repeat1
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+#if @PW@
+ @type@ * iop1 = (@type@ *)args[0];
+ npy_intp n = dimensions[0];
+
+ *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
+#else
+ BINARY_REDUCE_LOOP(@type@) {
+ io1 @OP@= *(@type@ *)ip2;
+ }
+ *((@type@ *)iop1) = io1;
+#endif
+ }
+ else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1) = in1 @OP@ in2;
+ }
+ }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
+ * logical_and, logical_or#
+ * #OP = ==, !=, <, <=, >, >=, &&, ||#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((npy_bool *)op1) = in1 @OP@ in2;
+ }
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const int t1 = !!*(@type@ *)ip1;
+ const int t2 = !!*(@type@ *)ip2;
+ *((npy_bool *)op1) = (t1 != t2);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = !in1;
+ }
+}
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (!run_@kind@_simd_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = @func@(in1) != 0;
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = npy_spacing@c@(in1);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1)= npy_copysign@c@(in1, in2);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1)= npy_nextafter@c@(in1, in2);
+ }
+}
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = >=, <=#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* */
+ if (IS_BINARY_REDUCE) {
+ if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_REDUCE_LOOP(@type@) {
+ const @type@ in2 = *(@type@ *)ip2;
+ /* Order of operations important for MSVC 2015 */
+ io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
+ }
+ *((@type@ *)iop1) = io1;
+ }
+ }
+ else {
+ BINARY_LOOP {
+ @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /* Order of operations important for MSVC 2015 */
+ in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
+ *((@type@ *)op1) = in1;
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = fmax, fmin#
+ * #OP = >=, <=#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* */
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP(@type@) {
+ const @type@ in2 = *(@type@ *)ip2;
+ /* Order of operations important for MSVC 2015 */
+ io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
+ }
+ *((@type@ *)iop1) = io1;
+ }
+ else {
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /* Order of operations important for MSVC 2015 */
+ *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
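+
+/*
+ * The two families above differ only in which operand the NaN test guards:
+ * maximum/minimum keep the first operand when it is NaN, so any NaN in the
+ * input propagates to the result, while fmax/fmin keep in1 whenever in2 is
+ * NaN, so NaNs are ignored as long as a non-NaN partner is available
+ * (matching C99 fmax/fmin).  Scalar illustration, assuming NPY_NAN:
+ *
+ *     maximum(1.0, NPY_NAN)  ->  NaN   (NaN wins)
+ *     fmax(1.0, NPY_NAN)     ->  1.0   (NaN ignored)
+ *     fmax(NPY_NAN, NPY_NAN) ->  NaN   (nothing else to return)
+ */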
+
+NPY_NO_EXPORT void
+@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ @type@ mod;
+ *((@type@ *)op1) = npy_divmod@c@(in1, in2, &mod);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ npy_divmod@c@(in1, in2, (@type@ *)op1);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_TWO_OUT {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((@type@ *)op1) = npy_divmod@c@(in1, in2, (@type@ *)op2);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ char * margs[] = {args[0], args[0], args[1]};
+ npy_intp msteps[] = {steps[0], steps[0], steps[1]};
+ if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = in1*in1;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ @type@ one = 1.@c@;
+ char * margs[] = {(char*)&one, args[0], args[1]};
+ npy_intp msteps[] = {0, steps[0], steps[1]};
+ if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = 1/in1;
+ }
+ }
+}
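+
+/*
+ * Both @TYPE@_square and @TYPE@_reciprocal above reuse the binary SIMD
+ * kernels by rewriting the unary call as a binary one: square passes the
+ * input twice (x * x), and reciprocal passes a stack scalar 1 with a
+ * stride of 0 as the first operand (1 / x).  A zero stride simply means
+ * the corresponding input pointer never advances, so the same value is
+ * re-read on every iteration -- the usual broadcasting trick.  Roughly:
+ *
+ *     char *ip1 = margs[0], *ip2 = margs[1], *op = margs[2];
+ *     npy_intp i;
+ *     for (i = 0; i < dimensions[0]; i++,
+ *              ip1 += msteps[0], ip2 += msteps[1], op += msteps[2]) {
+ *         // msteps[0] == 0  =>  ip1 keeps pointing at the scalar 1
+ *     }
+ *
+ * (Sketch of the loop shape only; the real iteration is done inside the
+ * run_binary_simd_* helpers and the BINARY_LOOP macro.)
+ */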
+
+NPY_NO_EXPORT void
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ OUTPUT_LOOP {
+ *((@type@ *)op1) = 1;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = in1;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((@type@ *)op1) = tmp + 0;
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = -in1;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = +in1;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* Sign of nan is nan */
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = in1 > 0 ? 1 : (in1 < 0 ? -1 : (in1 == 0 ? 0 : in1));
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_TWO_OUT {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = npy_modf@c@(in1, (@type@ *)op2);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_TWO_OUT {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((@type@ *)op1) = npy_frexp@c@(in1, (int *)op2);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const int in2 = *(int *)ip2;
+ *((@type@ *)op1) = npy_ldexp@c@(in1, in2);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
+ * npy_long != npy_int on many 64-bit platforms, so we need this second loop
+ * to handle the default integer type.
+ */
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const long in2 = *(long *)ip2;
+ if (((int)in2) == in2) {
+ /* Range OK */
+ *((@type@ *)op1) = npy_ldexp@c@(in1, ((int)in2));
+ }
+ else {
+ /*
+             * Outside npy_int range -- ldexp would overflow in this case anyway,
+             * given that the exponent has fewer bits than npy_int.
+ */
+ if (in2 > 0) {
+ *((@type@ *)op1) = npy_ldexp@c@(in1, NPY_MAX_INT);
+ }
+ else {
+ *((@type@ *)op1) = npy_ldexp@c@(in1, NPY_MIN_INT);
+ }
+ }
+ }
+}
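+
+/*
+ * Clamping the out-of-range exponent to NPY_MAX_INT / NPY_MIN_INT is safe
+ * because the largest exponent any supported floating type can represent
+ * is far smaller than NPY_MAX_INT: ldexp with such a huge exponent already
+ * overflows to +/-inf (or underflows to zero for large negative values),
+ * which is the same result the unclamped call would have produced.  For
+ * example, npy_ldexp(1.0, 100000) and npy_ldexp(1.0, NPY_MAX_INT) both
+ * evaluate to +inf for doubles.
+ */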
+
+#define @TYPE@_true_divide @TYPE@_divide
+
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** HALF-FLOAT LOOPS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ */
+NPY_NO_EXPORT void
+HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ char *iop1 = args[0];
+ float io1 = npy_half_to_float(*(npy_half *)iop1);
+#if @PW@
+ npy_intp n = dimensions[0];
+
+ io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
+#else
+ BINARY_REDUCE_LOOP_INNER {
+ io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
+ }
+#endif
+ *((npy_half *)iop1) = npy_float_to_half(io1);
+ }
+ else {
+ BINARY_LOOP {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ const float in2 = npy_half_to_float(*(npy_half *)ip2);
+ *((npy_half *)op1) = npy_float_to_half(in1 @OP@ in2);
+ }
+ }
+}
+/**end repeat**/
+
+#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
+#define _HALF_LOGICAL_OR(a,b) (!npy_half_iszero(a) || !npy_half_iszero(b))
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater,
+ * greater_equal, logical_and, logical_or#
+ * #OP = npy_half_eq, npy_half_ne, npy_half_lt, npy_half_le, npy_half_gt,
+ * npy_half_ge, _HALF_LOGICAL_AND, _HALF_LOGICAL_OR#
+ */
+NPY_NO_EXPORT void
+HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ *((npy_bool *)op1) = @OP@(in1, in2);
+ }
+}
+/**end repeat**/
+#undef _HALF_LOGICAL_AND
+#undef _HALF_LOGICAL_OR
+
+NPY_NO_EXPORT void
+HALF_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const int in1 = !npy_half_iszero(*(npy_half *)ip1);
+ const int in2 = !npy_half_iszero(*(npy_half *)ip2);
+ *((npy_bool *)op1) = (in1 != in2);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_bool *)op1) = npy_half_iszero(in1);
+ }
+}
+
+/**begin repeat
+ * #kind = isnan, isinf, isfinite, signbit#
+ * #func = npy_half_isnan, npy_half_isinf, npy_half_isfinite, npy_half_signbit#
+ **/
+NPY_NO_EXPORT void
+HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_bool *)op1) = @func@(in1) != 0;
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat**/
+
+NPY_NO_EXPORT void
+HALF_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_half *)op1) = npy_half_spacing(in1);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ *((npy_half *)op1)= npy_half_copysign(in1, in2);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ *((npy_half *)op1)= npy_half_nextafter(in1, in2);
+ }
+}
+
+/**begin repeat
+ * #kind = maximum, minimum#
+ * #OP = npy_half_ge, npy_half_le#
+ **/
+NPY_NO_EXPORT void
+HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* */
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in1)) ? in1 : in2;
+ }
+ /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = fmax, fmin#
+ * #OP = npy_half_ge, npy_half_le#
+ **/
+NPY_NO_EXPORT void
+HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* */
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
+ }
+ /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+}
+/**end repeat**/
+
+NPY_NO_EXPORT void
+HALF_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ npy_half mod;
+ *((npy_half *)op1) = npy_half_divmod(in1, in2, &mod);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ npy_half_divmod(in1, in2, (npy_half *)op1);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_TWO_OUT {
+ const npy_half in1 = *(npy_half *)ip1;
+ const npy_half in2 = *(npy_half *)ip2;
+ *((npy_half *)op1) = npy_half_divmod(in1, in2, (npy_half *)op2);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ *((npy_half *)op1) = npy_float_to_half(in1*in1);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ *((npy_half *)op1) = npy_float_to_half(1/in1);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ OUTPUT_LOOP {
+ *((npy_half *)op1) = NPY_HALF_ONE;
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_half *)op1) = in1;
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_half *)op1) = in1&0x7fffu;
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_half *)op1) = in1^0x8000u;
+ }
+}
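+
+/*
+ * HALF_absolute and HALF_negative above operate directly on the IEEE 754
+ * binary16 bit pattern: npy_half is an npy_uint16 whose top bit (0x8000)
+ * is the sign, so clearing it gives |x| and flipping it gives -x, with no
+ * round trip through float and with signed zeros and NaN payloads handled
+ * for free.  Illustration:
+ *
+ *     npy_half h    = 0xC200;           // -3.0 in binary16
+ *     npy_half habs = h & 0x7fffu;      // 0x4200 == +3.0
+ *     npy_half hneg = h ^ 0x8000u;      // 0x4200 == +3.0
+ */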
+
+NPY_NO_EXPORT void
+HALF_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_half *)op1) = +in1;
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* Sign of nan is nan */
+ UNARY_LOOP {
+ const npy_half in1 = *(npy_half *)ip1;
+ *((npy_half *)op1) = npy_half_isnan(in1) ? in1 :
+ (((in1&0x7fffu) == 0) ? 0 :
+ (((in1&0x8000u) == 0) ? NPY_HALF_ONE : NPY_HALF_NEGONE));
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ float temp;
+
+ UNARY_LOOP_TWO_OUT {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ *((npy_half *)op1) = npy_float_to_half(npy_modff(in1, &temp));
+ *((npy_half *)op2) = npy_float_to_half(temp);
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_TWO_OUT {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ *((npy_half *)op1) = npy_float_to_half(npy_frexpf(in1, (int *)op2));
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ const int in2 = *(int *)ip2;
+ *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, in2));
+ }
+}
+
+NPY_NO_EXPORT void
+HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
+ * npy_long != npy_int on many 64-bit platforms, so we need this second loop
+ * to handle the default integer type.
+ */
+ BINARY_LOOP {
+ const float in1 = npy_half_to_float(*(npy_half *)ip1);
+ const long in2 = *(long *)ip2;
+ if (((int)in2) == in2) {
+ /* Range OK */
+ *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, ((int)in2)));
+ }
+ else {
+ /*
+             * Outside npy_int range -- ldexp would overflow in this case anyway,
+             * given that the exponent has fewer bits than npy_int.
+ */
+ if (in2 > 0) {
+ *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MAX_INT));
+ }
+ else {
+ *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MIN_INT));
+ }
+ }
+ }
+}
+
+#define HALF_true_divide HALF_divide
+
+
+/*
+ *****************************************************************************
+ ** COMPLEX LOOPS **
+ *****************************************************************************
+ */
+
+#define CGE(xr,xi,yr,yi) ((xr > yr && !npy_isnan(xi) && !npy_isnan(yi)) \
+ || (xr == yr && xi >= yi))
+#define CLE(xr,xi,yr,yi) ((xr < yr && !npy_isnan(xi) && !npy_isnan(yi)) \
+ || (xr == yr && xi <= yi))
+#define CGT(xr,xi,yr,yi) ((xr > yr && !npy_isnan(xi) && !npy_isnan(yi)) \
+ || (xr == yr && xi > yi))
+#define CLT(xr,xi,yr,yi) ((xr < yr && !npy_isnan(xi) && !npy_isnan(yi)) \
+ || (xr == yr && xi < yi))
+#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi)
+#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi)
+
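+/*
+ * These macros implement the lexicographic ordering (real part first, then
+ * imaginary part) that NumPy uses when comparing complex numbers.  The
+ * extra !npy_isnan() terms keep a NaN imaginary part from slipping through
+ * the "xr > yr" / "xr < yr" shortcut of the ordering macros.  For example,
+ * CGT(1, 5, 1, 2) is true: the real parts tie and 5 > 2.
+ */
+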
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #ftype = npy_float, npy_double, npy_longdouble#
+ * #c = f, , l#
+ * #C = F, , L#
+ */
+
+/* similar to pairwise sum of real floats */
+static void
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
+ npy_intp stride)
+{
+ assert(n % 2 == 0);
+ if (n < 8) {
+ npy_intp i;
+
+ *rr = 0.;
+ *ri = 0.;
+ for (i = 0; i < n; i += 2) {
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
+ }
+ return;
+ }
+ else if (n <= PW_BLOCKSIZE) {
+ npy_intp i;
+ @ftype@ r[8];
+
+ /*
+ * sum a block with 8 accumulators
+         * an 8-way unroll reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering
+ */
+ r[0] = *((@ftype@ *)(a + 0 * stride));
+ r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+ r[2] = *((@ftype@ *)(a + 2 * stride));
+ r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+ r[4] = *((@ftype@ *)(a + 4 * stride));
+ r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+ r[6] = *((@ftype@ *)(a + 6 * stride));
+ r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
+
+ for (i = 8; i < n - (n % 8); i += 8) {
+            /* small block sizes seem to mess with hardware prefetch */
+ NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3);
+ r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+ r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+ r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+ r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+ r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+ r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+ r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+ r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
+ }
+
+ /* accumulate now to avoid stack spills for single peel loop */
+ *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+ *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* handle the remainder that is not a multiple of 8 */
+ for (; i < n; i+=2) {
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
+ }
+ return;
+ }
+ else {
+ /* divide by two but avoid non-multiples of unroll factor */
+ @ftype@ rr1, ri1, rr2, ri2;
+ npy_intp n2 = n / 2;
+
+ n2 -= n2 % 8;
+ pairwise_sum_@TYPE@(&rr1, &ri1, a, n2, stride);
+ pairwise_sum_@TYPE@(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+ *rr = rr1 + rr2;
+ *ri = ri1 + ri2;
+ return;
+ }
+}
+
+/**begin repeat1
+ * arithmetic
+ * #kind = add, subtract#
+ * #OP = +, -#
+ * #PW = 1, 0#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE && @PW@) {
+ npy_intp n = dimensions[0];
+ @ftype@ * or = ((@ftype@ *)args[0]);
+ @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+ @ftype@ rr, ri;
+
+ pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
+ *or @OP@= rr;
+ *oi @OP@= ri;
+ return;
+ }
+ else {
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
+ ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+ }
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
+ ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ const @ftype@ in2r_abs = npy_fabs@c@(in2r);
+ const @ftype@ in2i_abs = npy_fabs@c@(in2i);
+ if (in2r_abs >= in2i_abs) {
+ if (in2r_abs == 0 && in2i_abs == 0) {
+ /* divide by zero should yield a complex inf or nan */
+ ((@ftype@ *)op1)[0] = in1r/in2r_abs;
+ ((@ftype@ *)op1)[1] = in1i/in2i_abs;
+ }
+ else {
+ const @ftype@ rat = in2i/in2r;
+ const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
+ ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
+ ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
+ }
+ }
+ else {
+ const @ftype@ rat = in2r/in2i;
+ const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
+ ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
+ ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
+ }
+ }
+}
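+
+/*
+ * The branch on |in2r| vs |in2i| above is Smith's (1962) algorithm for
+ * complex division: dividing through by the larger-magnitude component
+ * keeps the intermediate ratio "rat" at most 1 in magnitude, which avoids
+ * the spurious overflow or underflow the textbook form
+ *
+ *     (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c*c + d*d)
+ *
+ * hits when c*c + d*d overflows even though the quotient itself is
+ * representable.  Assuming |c| >= |d|, the rewritten form is
+ *
+ *     rat = d / c,  scl = 1 / (c + d*rat)
+ *     re  = (a + b*rat) * scl
+ *     im  = (b - a*rat) * scl
+ *
+ * which is exactly what the first branch computes.
+ */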
+
+NPY_NO_EXPORT void
+@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ if (npy_fabs@c@(in2r) >= npy_fabs@c@(in2i)) {
+ const @ftype@ rat = in2i/in2r;
+ ((@ftype@ *)op1)[0] = npy_floor@c@((in1r + in1i*rat)/(in2r + in2i*rat));
+ ((@ftype@ *)op1)[1] = 0;
+ }
+ else {
+ const @ftype@ rat = in2r/in2i;
+ ((@ftype@ *)op1)[0] = npy_floor@c@((in1r*rat + in1i)/(in2i + in2r*rat));
+ ((@ftype@ *)op1)[1] = 0;
+ }
+ }
+}
+
+/**begin repeat1
+ * #kind= greater, greater_equal, less, less_equal, equal, not_equal#
+ * #OP = CGT, CGE, CLT, CLE, CEQ, CNE#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ *((npy_bool *)op1) = @OP@(in1r,in1i,in2r,in2i);
+ }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ #kind = logical_and, logical_or#
+ #OP1 = ||, ||#
+ #OP2 = &&, ||#
+*/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ *((npy_bool *)op1) = (in1r @OP1@ in1i) @OP2@ (in2r @OP1@ in2i);
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ const npy_bool tmp1 = (in1r || in1i);
+ const npy_bool tmp2 = (in2r || in2i);
+ *((npy_bool *)op1) = tmp1 != tmp2;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ *((npy_bool *)op1) = !(in1r || in1i);
+ }
+}
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #OP = ||, ||, &&#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ *((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i);
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ ((@ftype@ *)op1)[0] = in1r*in1r - in1i*in1i;
+ ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ if (npy_fabs@c@(in1i) <= npy_fabs@c@(in1r)) {
+ const @ftype@ r = in1i/in1r;
+ const @ftype@ d = in1r + in1i*r;
+ ((@ftype@ *)op1)[0] = 1/d;
+ ((@ftype@ *)op1)[1] = -r/d;
+ } else {
+ const @ftype@ r = in1r/in1i;
+ const @ftype@ d = in1r*r + in1i;
+ ((@ftype@ *)op1)[0] = r/d;
+ ((@ftype@ *)op1)[1] = -1/d;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ OUTPUT_LOOP {
+ ((@ftype@ *)op1)[0] = 1;
+ ((@ftype@ *)op1)[1] = 0;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) {
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ ((@ftype@ *)op1)[0] = in1r;
+ ((@ftype@ *)op1)[1] = -in1i;
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ *((@ftype@ *)op1) = npy_atan2@c@(in1i, in1r);
+ }
+}
+
+NPY_NO_EXPORT void
+@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ /* fixme: sign of nan is currently 0 */
+ UNARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ ((@ftype@ *)op1)[0] = CGT(in1r, in1i, 0.0, 0.0) ? 1 :
+ (CLT(in1r, in1i, 0.0, 0.0) ? -1 :
+ (CEQ(in1r, in1i, 0.0, 0.0) ? 0 : NPY_NAN@C@));
+ ((@ftype@ *)op1)[1] = 0;
+ }
+}
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = CGE, CLE#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ if ( !(npy_isnan(in1r) || npy_isnan(in1i) || @OP@(in1r, in1i, in2r, in2i))) {
+ in1r = in2r;
+ in1i = in2i;
+ }
+ ((@ftype@ *)op1)[0] = in1r;
+ ((@ftype@ *)op1)[1] = in1i;
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = fmax, fmin#
+ * #OP = CGE, CLE#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+ const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+ const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+ const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+ if (npy_isnan(in2r) || npy_isnan(in2i) || @OP@(in1r, in1i, in2r, in2i)) {
+ ((@ftype@ *)op1)[0] = in1r;
+ ((@ftype@ *)op1)[1] = in1i;
+ }
+ else {
+ ((@ftype@ *)op1)[0] = in2r;
+ ((@ftype@ *)op1)[1] = in2i;
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+
+#define @TYPE@_true_divide @TYPE@_divide
+
+/**end repeat**/
+
+#undef CGE
+#undef CLE
+#undef CGT
+#undef CLT
+#undef CEQ
+#undef CNE
+
+/*
+ *****************************************************************************
+ ** OBJECT LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
+ * #OP = EQ, NE, GT, GE, LT, LE#
+ * #identity = NPY_TRUE, NPY_FALSE, -1*4#
+ */
+
+/**begin repeat1
+ * #suffix = , _OO_O#
+ * #as_bool = 1, 0#
+ */
+NPY_NO_EXPORT void
+OBJECT@suffix@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) {
+ BINARY_LOOP {
+ PyObject *ret_obj;
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject *in2 = *(PyObject **)ip2;
+
+ in1 = in1 ? in1 : Py_None;
+ in2 = in2 ? in2 : Py_None;
+
+ /*
+ * Do not use RichCompareBool because it includes an identity check for
+ * == and !=. This is wrong for elementwise behaviour, since it means
+ * that NaN can be equal to NaN and an array is equal to itself.
+ */
+ ret_obj = PyObject_RichCompare(in1, in2, Py_@OP@);
+ if (ret_obj == NULL) {
+ return;
+ }
+#if @as_bool@
+ {
+ int ret = PyObject_IsTrue(ret_obj);
+ Py_DECREF(ret_obj);
+ if (ret == -1) {
+ return;
+ }
+ *((npy_bool *)op1) = (npy_bool)ret;
+ }
+#else
+ *((PyObject **)op1) = ret_obj;
+#endif
+ }
+}
+/**end repeat1**/
+/**end repeat**/
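+
+/*
+ * Background for the comment above: PyObject_RichCompareBool(a, b, Py_EQ)
+ * short-circuits to true when a and b are the same object, which is the
+ * identity check being avoided here.  With elementwise semantics that
+ * shortcut would make an object array containing a NaN compare equal to
+ * itself, so the loop calls PyObject_RichCompare and converts the result
+ * with PyObject_IsTrue instead.  Illustrative C-API fragment (exposition
+ * only, not part of this file):
+ *
+ *     PyObject *nan = PyFloat_FromDouble(NPY_NAN);
+ *     int a = PyObject_RichCompareBool(nan, nan, Py_EQ);   // 1: identity
+ *     PyObject *r = PyObject_RichCompare(nan, nan, Py_EQ);
+ *     int b = PyObject_IsTrue(r);                          // 0: nan != nan
+ *     Py_DECREF(r); Py_DECREF(nan);
+ */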
+
+NPY_NO_EXPORT void
+OBJECT_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ PyObject *zero = PyLong_FromLong(0);
+
+ UNARY_LOOP {
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject **out = (PyObject **)op1;
+ PyObject *ret = NULL;
+ int v;
+
+ if (in1 == NULL) {
+ in1 = Py_None;
+ }
+
+ if ((v = PyObject_RichCompareBool(in1, zero, Py_LT)) == 1) {
+ ret = PyLong_FromLong(-1);
+ }
+ else if (v == 0 &&
+ (v = PyObject_RichCompareBool(in1, zero, Py_GT)) == 1) {
+ ret = PyLong_FromLong(1);
+ }
+ else if (v == 0 &&
+ (v = PyObject_RichCompareBool(in1, zero, Py_EQ)) == 1) {
+ ret = PyLong_FromLong(0);
+ }
+ else if (v == 0) {
+ /* in1 is NaN */
+ PyErr_SetString(PyExc_TypeError,
+ "unorderable types for comparison");
+ }
+
+ if (ret == NULL) {
+ break;
+ }
+ Py_XDECREF(*out);
+ *out = ret;
+ }
+ Py_XDECREF(zero);
+}
+
+/*
+ *****************************************************************************
+ ** END LOOPS **
+ *****************************************************************************
+ */
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/loops.h.src b/contrib/python/numpy/py2/numpy/core/src/umath/loops.h.src
new file mode 100644
index 00000000000..5264a6533ee
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/loops.h.src
@@ -0,0 +1,525 @@
+/* -*- c -*- */
+/*
+ * vim:syntax=c
+ */
+
+#ifndef _NPY_UMATH_LOOPS_H_
+#define _NPY_UMATH_LOOPS_H_
+
+#define BOOL_invert BOOL_logical_not
+#define BOOL_negative BOOL_logical_not
+#define BOOL_add BOOL_logical_or
+#define BOOL_bitwise_and BOOL_logical_and
+#define BOOL_bitwise_or BOOL_logical_or
+#define BOOL_logical_xor BOOL_not_equal
+#define BOOL_bitwise_xor BOOL_logical_xor
+#define BOOL_multiply BOOL_logical_and
+#define BOOL_subtract BOOL_logical_xor
+#define BOOL_maximum BOOL_logical_or
+#define BOOL_minimum BOOL_logical_and
+#define BOOL_fmax BOOL_maximum
+#define BOOL_fmin BOOL_minimum
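+
+/*
+ * The aliases above work because, restricted to the values {0, 1}, the
+ * boolean ufuncs collapse onto a handful of logical kernels: add and
+ * maximum behave like OR, multiply and minimum like AND, subtract and
+ * bitwise_xor like XOR (spelled as not_equal here), and invert/negative
+ * like logical_not.  A single loop implementation therefore serves each
+ * group.
+ */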
+
+
+/*
+ *****************************************************************************
+ ** BOOLEAN LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
+ * logical_and, logical_or, absolute, logical_not#
+ **/
+NPY_NO_EXPORT void
+BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+NPY_NO_EXPORT void
+BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+/*
+ *****************************************************************************
+ ** INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+
+/**begin repeat1
+ * both signed and unsigned integer types
+ * #s = , u#
+ * #S = , U#
+ */
+
+#define @S@@TYPE@_floor_divide @S@@TYPE@_divide
+#define @S@@TYPE@_fmax @S@@TYPE@_maximum
+#define @S@@TYPE@_fmin @S@@TYPE@_minimum
+
+NPY_NO_EXPORT void
+@S@@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat2
+ * #isa = , _avx2#
+ */
+
+NPY_NO_EXPORT void
+@S@@TYPE@_square@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_reciprocal@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_conjugate@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_negative@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_logical_not@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_invert@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat3
+ * Arithmetic
+ * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ * left_shift, right_shift#
+ * #OP = +, -,*, &, |, ^, <<, >>#
+ */
+NPY_NO_EXPORT void
+@S@@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+/**begin repeat3
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
+ * logical_and, logical_or#
+ * #OP = ==, !=, >, >=, <, <=, &&, ||#
+ */
+NPY_NO_EXPORT void
+@S@@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+NPY_NO_EXPORT void
+@S@@TYPE@_logical_xor@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/**begin repeat2
+ * #kind = maximum, minimum#
+ * #OP = >, <#
+ **/
+NPY_NO_EXPORT void
+@S@@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+NPY_NO_EXPORT void
+@S@@TYPE@_power(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_fmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_gcd(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@S@@TYPE@_lcm(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** FLOAT LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+NPY_NO_EXPORT void
+@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+/**begin repeat
+ * Float types
+ * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ * #c = f, f, , l#
+ * #C = F, F, , L#
+ */
+
+
+/**begin repeat1
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # OP = +, -, *, /#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
+ * logical_and, logical_or#
+ * #OP = ==, !=, <, <=, >, >=, &&, ||#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
+ * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = >=, <=#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = fmax, fmin#
+ * #OP = >=, <=#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+
+NPY_NO_EXPORT void
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+#define @TYPE@_true_divide @TYPE@_divide
+
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** COMPLEX LOOPS **
+ *****************************************************************************
+ */
+
+#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi));
+#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi));
+#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi));
+#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi));
+#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi);
+#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi);
+
+/**begin repeat
+ * complex types
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #c = f, , l#
+ * #C = F, , L#
+ */
+
+/**begin repeat1
+ * arithmetic
+ * #kind = add, subtract#
+ * #OP = +, -#
+ */
+NPY_NO_EXPORT void
+C@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+C@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #kind= greater, greater_equal, less, less_equal, equal, not_equal#
+ * #OP = CGT, CGE, CLT, CLE, CEQ, CNE#
+ */
+NPY_NO_EXPORT void
+C@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**begin repeat1
+ #kind = logical_and, logical_or#
+ #OP1 = ||, ||#
+ #OP2 = &&, ||#
+*/
+NPY_NO_EXPORT void
+C@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+C@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #OP = ||, ||, &&#
+ **/
+NPY_NO_EXPORT void
+C@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+NPY_NO_EXPORT void
+C@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+C@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+C@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+C@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = CGE, CLE#
+ */
+NPY_NO_EXPORT void
+C@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = fmax, fmin#
+ * #OP = CGE, CLE#
+ */
+NPY_NO_EXPORT void
+C@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+#define C@TYPE@_true_divide C@TYPE@_divide
+
+/**end repeat**/
+
+#undef CGE
+#undef CLE
+#undef CGT
+#undef CLT
+#undef CEQ
+#undef CNE
+
+/*
+ *****************************************************************************
+ ** DATETIME LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+TIMEDELTA_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat
+ * #TYPE = DATETIME, TIMEDELTA#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_isnat(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+/**begin repeat1
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
+ * #OP = ==, !=, >, >=, <, <=#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #OP = >, <#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+
+/**end repeat**/
+
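+/*
+ * Naming convention for the mixed-type loops below (this reading follows
+ * from the loop bodies in loops.c.src): the letters before the underscore
+ * encode the input types and the letter after it the output type, with
+ * M = datetime64, m = timedelta64, q = int64, d = double.  So
+ * DATETIME_Mm_M_add is datetime + timedelta -> datetime, and
+ * TIMEDELTA_mm_qm_divmod is timedelta, timedelta -> int64 quotient plus
+ * timedelta remainder.
+ */
+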
+NPY_NO_EXPORT void
+DATETIME_Mm_M_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DATETIME_mM_M_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_MM_m_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_qm_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_dm_m_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_qm_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/* Special case equivalents to above functions */
+
+#define TIMEDELTA_mq_m_true_divide TIMEDELTA_mq_m_divide
+#define TIMEDELTA_md_m_true_divide TIMEDELTA_md_m_divide
+#define TIMEDELTA_mm_d_true_divide TIMEDELTA_mm_d_divide
+#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide
+#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
+/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
+#define TIMEDELTA_fmin TIMEDELTA_minimum
+#define TIMEDELTA_fmax TIMEDELTA_maximum
+#define DATETIME_fmin DATETIME_minimum
+#define DATETIME_fmax DATETIME_maximum
+
+/*
+ *****************************************************************************
+ ** OBJECT LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
+ * #OP = EQ, NE, GT, GE, LT, LE#
+ */
+/**begin repeat1
+ * #suffix = , _OO_O#
+ */
+NPY_NO_EXPORT void
+OBJECT@suffix@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+/**end repeat**/
+
+NPY_NO_EXPORT void
+OBJECT_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/*
+ *****************************************************************************
+ ** END LOOPS **
+ *****************************************************************************
+ */
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/matmul.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/matmul.c.src
new file mode 100644
index 00000000000..bc00d3562d0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/matmul.c.src
@@ -0,0 +1,503 @@
+/* -*- c -*- */
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_cblas.h"
+#include "arraytypes.h" /* For TYPE_dot functions */
+
+#include <assert.h>
+
+/*
+ *****************************************************************************
+ ** BASICS **
+ *****************************************************************************
+ */
+
+/*
+ * -1 to be conservative, in case blas internally uses a for loop with an
+ * inclusive upper bound
+ */
+#define BLAS_MAXSIZE (NPY_MAX_INT - 1)
+
+/*
+ * Determine if a 2d matrix can be used by BLAS
+ * 1. Strides must not alias or overlap
+ * 2. The faster (second) axis must be contiguous
+ * 3. The slower (first) axis stride, in unit steps, must be larger than
+ * the faster axis dimension
+ */
+static NPY_INLINE npy_bool
+is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
+ npy_intp d1, npy_intp d2, npy_intp itemsize)
+{
+ npy_intp unit_stride1 = byte_stride1 / itemsize;
+ if (byte_stride2 != itemsize) {
+ return NPY_FALSE;
+ }
+    if ((byte_stride1 % itemsize == 0) &&
+ (unit_stride1 >= d2) &&
+ (unit_stride1 <= BLAS_MAXSIZE))
+ {
+ return NPY_TRUE;
+ }
+ return NPY_FALSE;
+}
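+/*
+ * For example, a C-contiguous (3, 4) array of doubles has
+ * byte_stride2 == 8 == itemsize and byte_stride1 == 32, i.e. a unit
+ * stride of 4 >= d2, so it passes; a sliced view whose fast axis is no
+ * longer itemsize-strided fails the first test and the callers fall
+ * back to the non-BLAS loops instead.
+ */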
+
+#if defined(HAVE_CBLAS)
+static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
+static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
+
+/**begin repeat
+ *
+ * #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ * #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #prefix = s, d, c, z#
+ * #step1 = 1.F, 1., &oneF, &oneD#
+ * #step0 = 0.F, 0., &zeroF, &zeroD#
+ */
+NPY_NO_EXPORT void
+@name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+ void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+ void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+ npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
+{
+ /*
+ * Vector matrix multiplication -- Level 2 BLAS
+ * arguments
+ * ip1: contiguous data, m*n shape
+ * ip2: data in c order, n*1 shape
+ * op: data in c order, m shape
+ */
+ enum CBLAS_ORDER order;
+ int M, N, lda;
+
+ assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+ assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@)));
+ M = (int)m;
+ N = (int)n;
+
+ if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
+ order = CblasColMajor;
+ lda = (int)(is1_m / sizeof(@typ@));
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ order = CblasRowMajor;
+ assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
+ lda = (int)(is1_n / sizeof(@typ@));
+ }
+ cblas_@prefix@gemv(order, CblasTrans, N, M, @step1@, ip1, lda, ip2,
+ is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@));
+}
+
+NPY_NO_EXPORT void
+@name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+ void *ip2, npy_intp is2_n, npy_intp is2_p,
+ void *op, npy_intp os_m, npy_intp os_p,
+ npy_intp m, npy_intp n, npy_intp p)
+{
+ /*
+ * matrix matrix multiplication -- Level 3 BLAS
+ */
+ enum CBLAS_ORDER order = CblasRowMajor;
+ enum CBLAS_TRANSPOSE trans1, trans2;
+ int M, N, P, lda, ldb, ldc;
+ assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+ M = (int)m;
+ N = (int)n;
+ P = (int)p;
+
+ assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@)));
+ ldc = (int)(os_m / sizeof(@typ@));
+
+ if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
+ trans1 = CblasNoTrans;
+ lda = (int)(is1_m / sizeof(@typ@));
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
+ trans1 = CblasTrans;
+ lda = (int)(is1_n / sizeof(@typ@));
+ }
+
+ if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) {
+ trans2 = CblasNoTrans;
+ ldb = (int)(is2_n / sizeof(@typ@));
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@)));
+ trans2 = CblasTrans;
+ ldb = (int)(is2_p / sizeof(@typ@));
+ }
+ /*
+ * Use syrk if we have a case of a matrix times its transpose.
+ * Otherwise, use gemm for all other cases.
+ */
+ if (
+ (ip1 == ip2) &&
+ (m == p) &&
+ (is1_m == is2_p) &&
+ (is1_n == is2_n) &&
+ (trans1 != trans2)
+ ) {
+ npy_intp i,j;
+ if (trans1 == CblasNoTrans) {
+ cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
+ ip1, lda, @step0@, op, ldc);
+ }
+ else {
+ cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
+ ip1, ldb, @step0@, op, ldc);
+ }
+ /* Copy the triangle */
+ for (i = 0; i < P; i++) {
+ for (j = i + 1; j < P; j++) {
+ ((@typ@*)op)[j * ldc + i] = ((@typ@*)op)[i * ldc + j];
+ }
+ }
+
+ }
+ else {
+ cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda,
+ ip2, ldb, @step0@, op, ldc);
+ }
+}
+
+/**end repeat**/
+#endif
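+/*
+ * Summary of the CBLAS path above: the general matrix product goes
+ * through {s,d,c,z}gemm, while the special case of a matrix multiplied
+ * by its own transpose (same base pointer, swapped strides, opposite
+ * transposition flags) is routed to {s,d,c,z}syrk, which fills only the
+ * upper triangle; the loop afterwards mirrors it into the lower half.
+ */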
+
+/*
+ * matmul loops
+ * signature is (m?,n),(n,p?)->(m?,p?)
+ */
+
+/**begin repeat
+ * #TYPE = LONGDOUBLE,
+ * FLOAT, DOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG#
+ * #typ = npy_longdouble,
+ * npy_float,npy_double,npy_half,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*10#
+ * #IS_HALF = 0, 0, 0, 1, 0*13#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+ void *_ip2, npy_intp is2_n, npy_intp is2_p,
+ void *_op, npy_intp os_m, npy_intp os_p,
+ npy_intp dm, npy_intp dn, npy_intp dp)
+{
+ npy_intp m, n, p;
+ npy_intp ib1_n, ib2_n, ib2_p, ob_p;
+ char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+ ib1_n = is1_n * dn;
+ ib2_n = is2_n * dn;
+ ib2_p = is2_p * dp;
+ ob_p = os_p * dp;
+
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+#if @IS_COMPLEX@ == 1
+ (*(@typ@ *)op).real = 0;
+ (*(@typ@ *)op).imag = 0;
+#elif @IS_HALF@
+ float sum = 0;
+#else
+ *(@typ@ *)op = 0;
+#endif
+ for (n = 0; n < dn; n++) {
+ @typ@ val1 = (*(@typ@ *)ip1);
+ @typ@ val2 = (*(@typ@ *)ip2);
+#if @IS_HALF@
+ sum += npy_half_to_float(val1) * npy_half_to_float(val2);
+#elif @IS_COMPLEX@ == 1
+ (*(@typ@ *)op).real += (val1.real * val2.real) -
+ (val1.imag * val2.imag);
+ (*(@typ@ *)op).imag += (val1.real * val2.imag) +
+ (val1.imag * val2.real);
+#else
+ *(@typ@ *)op += val1 * val2;
+#endif
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+#if @IS_HALF@
+ *(@typ@ *)op = npy_float_to_half(sum);
+#endif
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+/**end repeat**/
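+/*
+ * The non-BLAS kernel above walks raw byte strides: the inner n-loop
+ * advances ip1 by is1_n and ip2 by is2_n, then rewinds both by
+ * ib1_n / ib2_n before stepping to the next output column
+ * (op += os_p, ip2 += is2_p), and finally rewinds the p-loop before
+ * moving one row down (ip1 += is1_m, op += os_m).  Half precision
+ * accumulates in a float; complex types accumulate real and imaginary
+ * parts separately.
+ */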
+NPY_NO_EXPORT void
+BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+ void *_ip2, npy_intp is2_n, npy_intp is2_p,
+ void *_op, npy_intp os_m, npy_intp os_p,
+ npy_intp dm, npy_intp dn, npy_intp dp)
+{
+ npy_intp m, n, p;
+ npy_intp ib2_p, ob_p;
+ char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+ ib2_p = is2_p * dp;
+ ob_p = os_p * dp;
+
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+ char *ip1tmp = ip1;
+ char *ip2tmp = ip2;
+ *(npy_bool *)op = NPY_FALSE;
+ for (n = 0; n < dn; n++) {
+ npy_bool val1 = (*(npy_bool *)ip1tmp);
+ npy_bool val2 = (*(npy_bool *)ip2tmp);
+ if (val1 != 0 && val2 != 0) {
+ *(npy_bool *)op = NPY_TRUE;
+ break;
+ }
+ ip2tmp += is2_n;
+ ip1tmp += is1_n;
+ }
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
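+/*
+ * Boolean matmul reduces to "is there any n with a[m, n] && b[n, p]":
+ * the inner loop short-circuits as soon as one such pair is found, so
+ * each output element is simply the logical OR of the pairwise ANDs.
+ */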
+
+NPY_NO_EXPORT void
+OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+ void *_ip2, npy_intp is2_n, npy_intp is2_p,
+ void *_op, npy_intp os_m, npy_intp os_p,
+ npy_intp dm, npy_intp dn, npy_intp dp)
+{
+ char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+ npy_intp ib1_n = is1_n * dn;
+ npy_intp ib2_n = is2_n * dn;
+ npy_intp ib2_p = is2_p * dp;
+ npy_intp ob_p = os_p * dp;
+ npy_intp m, p, n;
+
+ PyObject *product, *sum_of_products = NULL;
+
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+ if ( 0 == dn ) {
+ sum_of_products = PyLong_FromLong(0);
+ if (sum_of_products == NULL) {
+ return;
+ }
+ }
+
+ for (n = 0; n < dn; n++) {
+ PyObject *obj1 = *(PyObject**)ip1, *obj2 = *(PyObject**)ip2;
+ if (obj1 == NULL) {
+ obj1 = Py_None;
+ }
+ if (obj2 == NULL) {
+ obj2 = Py_None;
+ }
+
+ product = PyNumber_Multiply(obj1, obj2);
+ if (product == NULL) {
+ Py_XDECREF(sum_of_products);
+ return;
+ }
+
+ if (n == 0) {
+ sum_of_products = product;
+ }
+ else {
+ Py_SETREF(sum_of_products, PyNumber_Add(sum_of_products, product));
+ Py_DECREF(product);
+ if (sum_of_products == NULL) {
+ return;
+ }
+ }
+
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+
+ *((PyObject **)op) = sum_of_products;
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
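+/*
+ * Notes on the object loop above: an empty inner dimension (dn == 0)
+ * produces the Python integer 0 for every output element, NULL input
+ * slots are treated as None, and products/sums are formed with
+ * PyNumber_Multiply / PyNumber_Add so arbitrary Python semantics
+ * apply.  On error the function returns immediately, leaving the
+ * remaining output elements unwritten.
+ */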
+
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL, OBJECT#
+ * #typ = npy_float,npy_double,npy_longdouble, npy_half,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_bool,npy_object#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*12#
+ * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*13#
+ */
+
+
+NPY_NO_EXPORT void
+@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ npy_intp dOuter = *dimensions++;
+ npy_intp iOuter;
+ npy_intp s0 = *steps++;
+ npy_intp s1 = *steps++;
+ npy_intp s2 = *steps++;
+ npy_intp dm = dimensions[0];
+ npy_intp dn = dimensions[1];
+ npy_intp dp = dimensions[2];
+ npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+ os_m=steps[4], os_p=steps[5];
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ npy_intp sz = sizeof(@typ@);
+ npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+ npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+ npy_bool scalar_out = (dm == 1 && dp == 1);
+ npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+ npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+ dp > BLAS_MAXSIZE);
+ npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+ npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+ npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+ npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+ npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+ npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+ npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+ npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+ npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+ is_blasable2d(is1_n, sz, dn, 1, sz));
+ npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+ is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
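+    /*
+     * Rough dispatch picture for the loop below: oversized or
+     * zero-sized core dimensions always take the plain C loop; a
+     * (1, n) @ (n, 1) product uses @TYPE@_dot; a contracted dimension
+     * of 1 (without a 1x1 output) also takes the plain loop; vector @
+     * matrix and matrix @ vector use @TYPE@_gemv when the operands
+     * are BLAS-compatible (the former with the operands swapped);
+     * full matrix @ matrix goes through gemm/syrk when both inputs
+     * and the output qualify, using the transpose equivalence
+     * matmul(a, b, o) == matmul(b.T, a.T, o.T) when only the
+     * Fortran-order output layout fits, and otherwise falls back to
+     * the plain loop.
+     */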
+
+ for (iOuter = 0; iOuter < dOuter; iOuter++,
+ args[0] += s0, args[1] += s1, args[2] += s2) {
+ void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ /*
+         * TODO: refactor this out to an inner_loop_selector, in
+ * PyUFunc_MatmulLoopSelector. But that call does not have access to
+ * n, m, p and strides.
+ */
+ if (too_big_for_blas || any_zero_dim) {
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
+ }
+ else if (special_case) {
+ /* Special case variants that have a 1 in the core dimensions */
+ if (scalar_out) {
+ /* row @ column, 1,1 output */
+ @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+ } else if (scalar_vec){
+ /*
+ * 1,1d @ vector or vector @ 1,1d
+ * could use cblas_Xaxy, but that requires 0ing output
+ * and would not be faster (XXX prove it)
+ */
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
+ } else if (vector_matrix) {
+ /* vector @ matrix, switch ip1, ip2, p and m */
+ @TYPE@_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+ op, os_p, os_m, dp, dn, dm);
+ } else if (matrix_vector) {
+ /* matrix @ vector */
+                @TYPE@_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                            op, os_m, os_p, dm, dn, dp);
+ } else {
+ /* column @ row, 2d output, no blas needed or non-blas-able input */
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
+ }
+ } else {
+ /* matrix @ matrix */
+ if (i1blasable && i2blasable && o_c_blasable) {
+ @TYPE@_matmul_matrixmatrix(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p,
+ dm, dn, dp);
+ } else if (i1blasable && i2blasable && o_f_blasable) {
+ /*
+ * Use transpose equivalence:
+ * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+ */
+ @TYPE@_matmul_matrixmatrix(ip2, is2_p, is2_n,
+ ip1, is1_n, is1_m,
+ op, os_p, os_m,
+ dp, dn, dm);
+ } else {
+ /*
+ * If parameters are castable to int and we copy the
+                 * non-blasable array (or non-C-contiguous output)
+ * we could still use BLAS, see gh-12365.
+ */
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
+ }
+ }
+#else
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
+
+#endif
+ }
+}
+
+/**end repeat**/
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/matmul.h.src b/contrib/python/numpy/py2/numpy/core/src/umath/matmul.h.src
new file mode 100644
index 00000000000..a664b1b4e1f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/matmul.h.src
@@ -0,0 +1,12 @@
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL, OBJECT#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/scalarmath.c.src b/contrib/python/numpy/py2/numpy/core/src/umath/scalarmath.c.src
new file mode 100644
index 00000000000..a7987acda06
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/scalarmath.c.src
@@ -0,0 +1,1704 @@
+/* -*- c -*- */
+
+/* The purpose of this module is to add faster math for array scalars
+   that does not go through the ufunc machinery but still supports
+   error-modes.
+*/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+#include "npy_config.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/arrayscalars.h"
+
+#include "npy_pycompat.h"
+
+#include "numpy/halffloat.h"
+#include "templ_common.h"
+
+#include "binop_override.h"
+#include "npy_longdouble.h"
+
+/* Basic operations:
+ *
+ * BINARY:
+ *
+ * add, subtract, multiply, divide, remainder, divmod, power,
+ * floor_divide, true_divide
+ *
+ * lshift, rshift, and, or, xor (integers only)
+ *
+ * UNARY:
+ *
+ * negative, positive, absolute, nonzero, invert, int, long, float, oct, hex
+ *
+ */
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ */
+static void
+@name@_ctype_add(@type@ a, @type@ b, @type@ *out) {
+ *out = a + b;
+ if ((*out^a) >= 0 || (*out^b) >= 0) {
+ return;
+ }
+ npy_set_floatstatus_overflow();
+ return;
+}
+static void
+@name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) {
+ *out = a - b;
+ if ((*out^a) >= 0 || (*out^~b) >= 0) {
+ return;
+ }
+ npy_set_floatstatus_overflow();
+ return;
+}
+/**end repeat**/
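+/*
+ * The sign-bit tests above detect two's-complement overflow: addition
+ * overflowed iff the result differs in sign from *both* operands, and
+ * subtraction reuses the same test with b replaced by ~b.  E.g. for
+ * npy_byte, 100 + 100 wraps to -56, and -56 differs in sign from both
+ * inputs, so the overflow status flag is raised.
+ */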
+
+/**begin repeat
+ * #name = ubyte, ushort, uint, ulong, ulonglong#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ */
+static void
+@name@_ctype_add(@type@ a, @type@ b, @type@ *out) {
+ *out = a + b;
+ if (*out >= a && *out >= b) {
+ return;
+ }
+ npy_set_floatstatus_overflow();
+ return;
+}
+static void
+@name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) {
+ *out = a - b;
+ if (a >= b) {
+ return;
+ }
+ npy_set_floatstatus_overflow();
+ return;
+}
+/**end repeat**/
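+/*
+ * For the unsigned types the checks are simpler: a + b wrapped exactly
+ * when the result is smaller than either operand (e.g. npy_ubyte
+ * 200 + 100 wraps to 44), and a - b wrapped exactly when a < b.
+ */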
+
+#ifndef NPY_SIZEOF_BYTE
+#define NPY_SIZEOF_BYTE 1
+#endif
+
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort,
+ * int, uint, long, ulong#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort,
+ * npy_int, npy_uint, npy_long, npy_ulong#
+ * #big = npy_int, npy_uint, npy_int, npy_uint,
+ * npy_longlong, npy_ulonglong, npy_longlong, npy_ulonglong#
+ * #NAME = BYTE, UBYTE, SHORT, USHORT,
+ * INT, UINT, LONG, ULONG#
+ * #SIZENAME = BYTE*2, SHORT*2, INT*2, LONG*2#
+ * #SIZE = INT*4,LONGLONG*4#
+ * #neg = (1,0)*4#
+ */
+#if NPY_SIZEOF_@SIZE@ > NPY_SIZEOF_@SIZENAME@
+static void
+@name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) {
+ @big@ temp;
+ temp = ((@big@) a) * ((@big@) b);
+ *out = (@type@) temp;
+#if @neg@
+ if (temp > NPY_MAX_@NAME@ || temp < NPY_MIN_@NAME@)
+#else
+ if (temp > NPY_MAX_@NAME@)
+#endif
+ npy_set_floatstatus_overflow();
+ return;
+}
+#endif
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = int, uint, long, ulong,
+ * longlong, ulonglong#
+ * #type = npy_int, npy_uint, npy_long, npy_ulong,
+ * npy_longlong, npy_ulonglong#
+ * #SIZE = INT*2, LONG*2, LONGLONG*2#
+ */
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_@SIZE@
+static void
+@name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) {
+ if (npy_mul_with_overflow_@name@(out, a, b)) {
+ npy_set_floatstatus_overflow();
+ }
+ return;
+}
+#endif
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #neg = (1,0)*5#
+ */
+static void
+@name@_ctype_divide(@type@ a, @type@ b, @type@ *out) {
+ if (b == 0) {
+ npy_set_floatstatus_divbyzero();
+ *out = 0;
+ }
+#if @neg@
+ else if (b == -1 && a < 0 && a == -a) {
+ npy_set_floatstatus_overflow();
+ *out = a / b;
+ }
+#endif
+ else {
+#if @neg@
+ @type@ tmp;
+ tmp = a / b;
+ if (((a > 0) != (b > 0)) && (a % b != 0)) {
+ tmp--;
+ }
+ *out = tmp;
+#else
+ *out = a / b;
+#endif
+ }
+}
+
+#define @name@_ctype_floor_divide @name@_ctype_divide
+
+static void
+@name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
+ if (a == 0 || b == 0) {
+ if (b == 0) npy_set_floatstatus_divbyzero();
+ *out = 0;
+ return;
+ }
+#if @neg@
+ else if ((a > 0) == (b > 0)) {
+ *out = a % b;
+ }
+ else {
+ /* handled like Python does */
+ *out = a % b;
+ if (*out) *out += b;
+ }
+#else
+ *out = a % b;
+#endif
+}
+/**end repeat**/
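+/*
+ * The signed variants above implement Python's floor semantics rather
+ * than C's truncation: the quotient is rounded toward negative
+ * infinity and the remainder takes the sign of the divisor, e.g.
+ * -7 / 2 -> -4 remainder 1 and 7 / -2 -> -4 remainder -1.  Division by
+ * zero raises the divide-by-zero status and yields 0, and dividing the
+ * most negative value by -1 raises the overflow status.
+ */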
+
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort, int, uint, long,
+ * ulong, longlong, ulonglong#
+ * #otyp = npy_float*4, npy_double*6#
+ */
+#define @name@_ctype_true_divide(a, b, out) \
+ *(out) = ((@otyp@) (a)) / ((@otyp@) (b));
+/**end repeat**/
+
+/* b will always be positive in this call */
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #upc = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ */
+static void
+@name@_ctype_power(@type@ a, @type@ b, @type@ *out) {
+ @type@ tmp;
+
+ if (b == 0) {
+ *out = 1;
+ return;
+ }
+ if (a == 1) {
+ *out = 1;
+ return;
+ }
+
+ tmp = b & 1 ? a : 1;
+ b >>= 1;
+ while (b > 0) {
+ a *= a;
+ if (b & 1) {
+ tmp *= a;
+ }
+ b >>= 1;
+ }
+ *out = tmp;
+}
+/**end repeat**/
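+/*
+ * This is binary exponentiation (square and multiply): for a = 3,
+ * b = 5 the loop computes tmp = 3, a = 9, a = 81, tmp = 3 * 81 = 243.
+ * The caller guarantees b >= 0 (see the note above); wraparound on
+ * overflow is not reported here.
+ */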
+
+
+
+/* QUESTION: Should we check for overflow / underflow in (l,r)shift? */
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ */
+
+/**begin repeat1
+ * #oper = and, xor, or, lshift, rshift#
+ * #op = &, ^, |, <<, >>#
+ */
+
+#define @name@_ctype_@oper@(arg1, arg2, out) *(out) = (arg1) @op@ (arg2)
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/**begin repeat
+ * #name = float, double, longdouble#
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #c = f, , l#
+ */
+#define @name@_ctype_add(a, b, outp) *(outp) = (a) + (b)
+#define @name@_ctype_subtract(a, b, outp) *(outp) = (a) - (b)
+#define @name@_ctype_multiply(a, b, outp) *(outp) = (a) * (b)
+#define @name@_ctype_divide(a, b, outp) *(outp) = (a) / (b)
+#define @name@_ctype_true_divide @name@_ctype_divide
+
+
+static void
+@name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) {
+ @type@ mod;
+
+ *out = npy_divmod@c@(a, b, &mod);
+}
+
+
+static void
+@name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
+ npy_divmod@c@(a, b, out);
+}
+
+
+static void
+@name@_ctype_divmod(@type@ a, @type@ b, @type@ *out1, @type@ *out2) {
+ *out1 = npy_divmod@c@(a, b, out2);
+}
+
+
+/**end repeat**/
+
+#define half_ctype_add(a, b, outp) *(outp) = \
+ npy_float_to_half(npy_half_to_float(a) + npy_half_to_float(b))
+#define half_ctype_subtract(a, b, outp) *(outp) = \
+ npy_float_to_half(npy_half_to_float(a) - npy_half_to_float(b))
+#define half_ctype_multiply(a, b, outp) *(outp) = \
+ npy_float_to_half(npy_half_to_float(a) * npy_half_to_float(b))
+#define half_ctype_divide(a, b, outp) *(outp) = \
+ npy_float_to_half(npy_half_to_float(a) / npy_half_to_float(b))
+#define half_ctype_true_divide half_ctype_divide
+
+
+static void
+half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) {
+ npy_half mod;
+
+ *out = npy_half_divmod(a, b, &mod);
+}
+
+
+static void
+half_ctype_remainder(npy_half a, npy_half b, npy_half *out) {
+ npy_half_divmod(a, b, out);
+}
+
+
+static void
+half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2) {
+ *out1 = npy_half_divmod(a, b, out2);
+}
+
+/**begin repeat
+ * #name = cfloat, cdouble, clongdouble#
+ * #rname = float, double, longdouble#
+ * #rtype = npy_float, npy_double, npy_longdouble#
+ * #c = f,,l#
+ */
+#define @name@_ctype_add(a, b, outp) do{ \
+ (outp)->real = (a).real + (b).real; \
+ (outp)->imag = (a).imag + (b).imag; \
+ } while(0)
+#define @name@_ctype_subtract(a, b, outp) do{ \
+ (outp)->real = (a).real - (b).real; \
+ (outp)->imag = (a).imag - (b).imag; \
+ } while(0)
+#define @name@_ctype_multiply(a, b, outp) do{ \
+ (outp)->real = (a).real * (b).real - (a).imag * (b).imag; \
+ (outp)->imag = (a).real * (b).imag + (a).imag * (b).real; \
+ } while(0)
+/* Algorithm identical to that in loops.c.src, for consistency */
+#define @name@_ctype_divide(a, b, outp) do{ \
+ @rtype@ in1r = (a).real; \
+ @rtype@ in1i = (a).imag; \
+ @rtype@ in2r = (b).real; \
+ @rtype@ in2i = (b).imag; \
+ @rtype@ in2r_abs = npy_fabs@c@(in2r); \
+ @rtype@ in2i_abs = npy_fabs@c@(in2i); \
+ if (in2r_abs >= in2i_abs) { \
+ if (in2r_abs == 0 && in2i_abs == 0) { \
+ /* divide by zero should yield a complex inf or nan */ \
+ (outp)->real = in1r/in2r_abs; \
+ (outp)->imag = in1i/in2i_abs; \
+ } \
+ else { \
+ @rtype@ rat = in2i/in2r; \
+ @rtype@ scl = 1.0@c@/(in2r + in2i*rat); \
+ (outp)->real = (in1r + in1i*rat)*scl; \
+ (outp)->imag = (in1i - in1r*rat)*scl; \
+ } \
+ } \
+ else { \
+ @rtype@ rat = in2r/in2i; \
+ @rtype@ scl = 1.0@c@/(in2i + in2r*rat); \
+ (outp)->real = (in1r*rat + in1i)*scl; \
+ (outp)->imag = (in1i*rat - in1r)*scl; \
+ } \
+ } while(0)
+
+#define @name@_ctype_true_divide @name@_ctype_divide
+
+#define @name@_ctype_floor_divide(a, b, outp) do { \
+ @rname@_ctype_floor_divide( \
+ ((a).real*(b).real + (a).imag*(b).imag), \
+ ((b).real*(b).real + (b).imag*(b).imag), \
+ &((outp)->real)); \
+ (outp)->imag = 0; \
+ } while(0)
+/**end repeat**/
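+/*
+ * The complex division macro above uses the classic scaled (Smith-style)
+ * formulation: it divides through by whichever of |b.real| or |b.imag|
+ * dominates, which avoids spurious overflow/underflow in the
+ * intermediate products, and a zero divisor deliberately produces
+ * complex inf/nan through the 0/0 and x/0 branches.  Complex floor
+ * division keeps only floor(Re(a / b)), computed as
+ * Re(a * conj(b)) / |b|^2, and zeroes the imaginary part.
+ */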
+
+
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong, cfloat, cdouble, clongdouble#
+ */
+#define @name@_ctype_divmod(a, b, out, out2) { \
+ @name@_ctype_floor_divide(a, b, out); \
+ @name@_ctype_remainder(a, b, out2); \
+ }
+/**end repeat**/
+
+
+/**begin repeat
+ * #name = float, double, longdouble#
+ * #type = npy_float, npy_double, npy_longdouble#
+ */
+static npy_@name@ (*_basic_@name@_pow)(@type@ a, @type@ b);
+
+static void
+@name@_ctype_power(@type@ a, @type@ b, @type@ *out)
+{
+ *out = _basic_@name@_pow(a, b);
+}
+/**end repeat**/
+static void
+half_ctype_power(npy_half a, npy_half b, npy_half *out)
+{
+ const npy_float af = npy_half_to_float(a);
+ const npy_float bf = npy_half_to_float(b);
+ const npy_float outf = _basic_float_pow(af,bf);
+ *out = npy_float_to_half(outf);
+}
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * float, double, longdouble#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_float, npy_double, npy_longdouble#
+ * #uns = (0,1)*5,0*3#
+ */
+static void
+@name@_ctype_negative(@type@ a, @type@ *out)
+{
+#if @uns@
+ npy_set_floatstatus_overflow();
+#endif
+ *out = -a;
+}
+/**end repeat**/
+
+static void
+half_ctype_negative(npy_half a, npy_half *out)
+{
+ *out = a^0x8000u;
+}
+
+
+/**begin repeat
+ * #name = cfloat, cdouble, clongdouble#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+static void
+@name@_ctype_negative(@type@ a, @type@ *out)
+{
+ out->real = -a.real;
+ out->imag = -a.imag;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble#
+ */
+static void
+@name@_ctype_positive(@type@ a, @type@ *out)
+{
+ *out = a;
+}
+/**end repeat**/
+
+/*
+ * Get the nc_powf, nc_pow, and nc_powl functions from
+ * the data area of the power ufunc in umathmodule.
+ */
+
+/**begin repeat
+ * #name = cfloat, cdouble, clongdouble#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ */
+static void
+@name@_ctype_positive(@type@ a, @type@ *out)
+{
+ out->real = a.real;
+ out->imag = a.imag;
+}
+
+static void (*_basic_@name@_pow)(@type@ *, @type@ *, @type@ *);
+
+static void
+@name@_ctype_power(@type@ a, @type@ b, @type@ *out)
+{
+ _basic_@name@_pow(&a, &b, out);
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #name = ubyte, ushort, uint, ulong, ulonglong#
+ */
+
+#define @name@_ctype_absolute @name@_ctype_positive
+
+/**end repeat**/
+
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ */
+static void
+@name@_ctype_absolute(@type@ a, @type@ *out)
+{
+ *out = (a < 0 ? -a : a);
+}
+/**end repeat**/
+
+/**begin repeat
+ * #name = float, double, longdouble#
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #c = f,,l#
+ */
+static void
+@name@_ctype_absolute(@type@ a, @type@ *out)
+{
+ *out = npy_fabs@c@(a);
+}
+/**end repeat**/
+
+static void
+half_ctype_absolute(npy_half a, npy_half *out)
+{
+ *out = a&0x7fffu;
+}
+
+/**begin repeat
+ * #name = cfloat, cdouble, clongdouble#
+ * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #rtype = npy_float, npy_double, npy_longdouble#
+ * #c = f,,l#
+ */
+static void
+@name@_ctype_absolute(@type@ a, @rtype@ *out)
+{
+ *out = npy_cabs@c@(a);
+}
+/**end repeat**/
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint, long,
+ * ulong, longlong, ulonglong#
+ */
+
+#define @name@_ctype_invert(a, out) *(out) = ~a;
+
+/**end repeat**/
+
+/*** END OF BASIC CODE ***/
+
+
+/* The general strategy for commutative binary operators is to
+ *
+ * 1) Convert the types to the common type if both are scalars (0 return)
+ * 2) If both are not scalars use ufunc machinery (-2 return)
+ * 3) If both are scalars but cannot be cast to the right type
+ *    return NotImplemented (-1 return)
+ *
+ * 4) Perform the function on the C-type.
+ * 5) If an error condition occurred, check to see
+ * what the current error-handling is and handle the error.
+ *
+ * 6) Construct and return the output scalar.
+ */
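+/*
+ * The conversion helpers below follow a small return-code convention:
+ *    0  the argument(s) were converted to the C type,
+ *   -1  a scalar that cannot be cast safely (mixed types), so the
+ *       operation is redirected to the ndarray number protocol,
+ *   -2  not usable as a scalar here (or a higher-priority object), so
+ *       the generic scalar-type implementation handles it,
+ *   -3  longdouble/clongdouble second-operand failure, answered with
+ *       NotImplemented to avoid recursing through their getitem.
+ */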
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #Name = Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ * #TYPE = NPY_BYTE, NPY_UBYTE, NPY_SHORT, NPY_USHORT, NPY_INT, NPY_UINT,
+ * NPY_LONG, NPY_ULONG, NPY_LONGLONG, NPY_ULONGLONG,
+ * NPY_HALF, NPY_FLOAT, NPY_LONGDOUBLE,
+ * NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE#
+ */
+
+static int
+_@name@_convert_to_ctype(PyObject *a, @type@ *arg1)
+{
+ PyObject *temp;
+
+ if (PyArray_IsScalar(a, @Name@)) {
+ *arg1 = PyArrayScalar_VAL(a, @Name@);
+ return 0;
+ }
+ else if (PyArray_IsScalar(a, Generic)) {
+ PyArray_Descr *descr1;
+
+ if (!PyArray_IsScalar(a, Number)) {
+ return -1;
+ }
+ descr1 = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(a));
+ if (PyArray_CanCastSafely(descr1->type_num, @TYPE@)) {
+ PyArray_CastScalarDirect(a, descr1, arg1, @TYPE@);
+ Py_DECREF(descr1);
+ return 0;
+ }
+ else {
+ Py_DECREF(descr1);
+ return -1;
+ }
+ }
+ else if (PyArray_GetPriority(a, NPY_PRIORITY) > NPY_PRIORITY) {
+ return -2;
+ }
+ else if ((temp = PyArray_ScalarFromObject(a)) != NULL) {
+ int retval = _@name@_convert_to_ctype(temp, arg1);
+
+ Py_DECREF(temp);
+ return retval;
+ }
+ return -2;
+}
+
+/**end repeat**/
+
+
+/* Same as above but added exact checks against known python types for speed */
+
+/**begin repeat
+ * #name = double#
+ * #type = npy_double#
+ * #Name = Double#
+ * #TYPE = NPY_DOUBLE#
+ * #PYCHECKEXACT = PyFloat_CheckExact#
+ * #PYEXTRACTCTYPE = PyFloat_AS_DOUBLE#
+ */
+
+static int
+_@name@_convert_to_ctype(PyObject *a, @type@ *arg1)
+{
+ PyObject *temp;
+
+ if (@PYCHECKEXACT@(a)){
+ *arg1 = @PYEXTRACTCTYPE@(a);
+ return 0;
+ }
+
+ if (PyArray_IsScalar(a, @Name@)) {
+ *arg1 = PyArrayScalar_VAL(a, @Name@);
+ return 0;
+ }
+ else if (PyArray_IsScalar(a, Generic)) {
+ PyArray_Descr *descr1;
+
+ if (!PyArray_IsScalar(a, Number)) {
+ return -1;
+ }
+ descr1 = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(a));
+ if (PyArray_CanCastSafely(descr1->type_num, @TYPE@)) {
+ PyArray_CastScalarDirect(a, descr1, arg1, @TYPE@);
+ Py_DECREF(descr1);
+ return 0;
+ }
+ else {
+ Py_DECREF(descr1);
+ return -1;
+ }
+ }
+ else if (PyArray_GetPriority(a, NPY_PRIORITY) > NPY_PRIORITY) {
+ return -2;
+ }
+ else if ((temp = PyArray_ScalarFromObject(a)) != NULL) {
+ int retval = _@name@_convert_to_ctype(temp, arg1);
+
+ Py_DECREF(temp);
+ return retval;
+ }
+ return -2;
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, cfloat, cdouble#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_cfloat, npy_cdouble#
+ */
+static int
+_@name@_convert2_to_ctypes(PyObject *a, @type@ *arg1,
+ PyObject *b, @type@ *arg2)
+{
+ int ret;
+ ret = _@name@_convert_to_ctype(a, arg1);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = _@name@_convert_to_ctype(b, arg2);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #name = longdouble, clongdouble#
+ * #type = npy_longdouble, npy_clongdouble#
+ */
+
+static int
+_@name@_convert2_to_ctypes(PyObject *a, @type@ *arg1,
+ PyObject *b, @type@ *arg2)
+{
+ int ret;
+ ret = _@name@_convert_to_ctype(a, arg1);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = _@name@_convert_to_ctype(b, arg2);
+ if (ret == -2) {
+ ret = -3;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
+/**end repeat**/
+
+
+#if defined(NPY_PY3K)
+#define CODEGEN_SKIP_divide_FLAG
+#endif
+
+/**begin repeat
+ *
+ * #name = (byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong)*13,
+ * (half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble)*6,
+ * (half, float, double, longdouble)*2#
+ * #Name = (Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong,LongLong,ULongLong)*13,
+ * (Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble)*6,
+ * (Half, Float, Double, LongDouble)*2#
+ * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong)*13,
+ * (npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble)*6,
+ * (npy_half, npy_float, npy_double, npy_longdouble)*2#
+ *
+ * #oper = add*10, subtract*10, multiply*10, divide*10, remainder*10,
+ * divmod*10, floor_divide*10, lshift*10, rshift*10, and*10,
+ * or*10, xor*10, true_divide*10,
+ * add*7, subtract*7, multiply*7, divide*7, floor_divide*7, true_divide*7,
+ * divmod*4, remainder*4#
+ *
+ * #fperr = 1*70,0*50,1*10,
+ * 1*42,
+ * 1*8#
+ * #twoout = 0*50,1*10,0*70,
+ * 0*42,
+ * 1*4,0*4#
+ * #otype = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12,
+ * npy_float*4, npy_double*6,
+ * (npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble)*6,
+ * (npy_half, npy_float, npy_double, npy_longdouble)*2#
+ * #OName = (Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong)*12,
+ * Float*4, Double*6,
+ * (Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble)*6,
+ * (Half, Float, Double, LongDouble)*2#
+ */
+
+#if !defined(CODEGEN_SKIP_@oper@_FLAG)
+
+static PyObject *
+@name@_@oper@(PyObject *a, PyObject *b)
+{
+ PyObject *ret;
+ @type@ arg1, arg2;
+ /*
+ * NOTE: In gcc >= 4.1, the compiler will reorder floating point
+ * operations and floating point error state checks. In
+ * particular, the arithmetic operations were being reordered
+ * so that the errors weren't caught. Declaring this output
+ * variable volatile was the minimal fix for the issue.
+ * (Ticket #1671)
+ */
+ volatile @otype@ out;
+#if @twoout@
+ @otype@ out2;
+ PyObject *obj;
+#endif
+
+#if @fperr@
+ int retstatus;
+ int first;
+#endif
+
+ BINOP_GIVE_UP_IF_NEEDED(a, b, nb_@oper@, @name@_@oper@);
+
+ switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
+ case 0:
+ break;
+ case -1:
+        /* one of them can't be cast safely; must be mixed types */
+ return PyArray_Type.tp_as_number->nb_@oper@(a,b);
+ case -2:
+ /* use default handling */
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b);
+ case -3:
+ /*
+ * special case for longdouble and clongdouble
+ * because they have a recursive getitem in their dtype
+ */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+#if @fperr@
+ npy_clear_floatstatus_barrier((char*)&out);
+#endif
+
+ /*
+ * here we do the actual calculation with arg1 and arg2
+ * as a function call.
+ */
+#if @twoout@
+ @name@_ctype_@oper@(arg1, arg2, (@otype@ *)&out, &out2);
+#else
+ @name@_ctype_@oper@(arg1, arg2, (@otype@ *)&out);
+#endif
+
+#if @fperr@
+ /* Check status flag. If it is set, then look up what to do */
+ retstatus = npy_get_floatstatus_barrier((char*)&out);
+ if (retstatus) {
+ int bufsize, errmask;
+ PyObject *errobj;
+
+ if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask,
+ &errobj) < 0) {
+ return NULL;
+ }
+ first = 1;
+ if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
+ Py_XDECREF(errobj);
+ return NULL;
+ }
+ Py_XDECREF(errobj);
+ }
+#endif
+
+
+#if @twoout@
+ ret = PyTuple_New(2);
+ if (ret == NULL) {
+ return NULL;
+ }
+ obj = PyArrayScalar_New(@OName@);
+ if (obj == NULL) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ PyArrayScalar_ASSIGN(obj, @OName@, out);
+ PyTuple_SET_ITEM(ret, 0, obj);
+ obj = PyArrayScalar_New(@OName@);
+ if (obj == NULL) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ PyArrayScalar_ASSIGN(obj, @OName@, out2);
+ PyTuple_SET_ITEM(ret, 1, obj);
+#else
+ ret = PyArrayScalar_New(@OName@);
+ if (ret == NULL) {
+ return NULL;
+ }
+ PyArrayScalar_ASSIGN(ret, @OName@, out);
+#endif
+ return ret;
+}
+#endif
+
+/**end repeat**/
+
+#undef CODEGEN_SKIP_divide_FLAG
+
+#define _IS_ZERO(x) (x == 0)
+
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ *
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ *
+ * #Name = Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ *
+ * #oname = float*4, double*6, half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ *
+ * #otype = npy_float*4, npy_double*6, npy_half, npy_float,
+ * npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ *
+ * #OName = Float*4, Double*6, Half, Float,
+ * Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ *
+ * #isint = (1,0)*5,0*7#
+ * #cmplx = 0*14,1*3#
+ * #iszero = _IS_ZERO*10, npy_half_iszero, _IS_ZERO*6#
+ * #zero = 0*10, NPY_HALF_ZERO, 0*6#
+ * #one = 1*10, NPY_HALF_ONE, 1*6#
+ */
+
+#if @cmplx@
+static PyObject *
+@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+ PyObject *ret;
+ @type@ arg1, arg2;
+ int retstatus;
+ int first;
+ @type@ out = {@zero@, @zero@};
+
+ BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
+
+ switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
+ case 0:
+ break;
+ case -1:
+        /* can't cast both safely; mixed types? */
+ return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
+ case -2:
+ /* use default handling */
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
+ case -3:
+ default:
+ /*
+ * special case for longdouble and clongdouble
+ * because they have a recursive getitem in their dtype
+ */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ if (modulo != Py_None) {
+ /* modular exponentiation is not implemented (gh-8804) */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ npy_clear_floatstatus_barrier((char*)&out);
+
+ /*
+ * here we do the actual calculation with arg1 and arg2
+ * as a function call.
+ */
+ if (@iszero@(arg2.real) && @iszero@(arg2.imag)) {
+ out.real = @one@;
+ out.imag = @zero@;
+ }
+ else {
+ @name@_ctype_power(arg1, arg2, &out);
+ }
+
+ /* Check status flag. If it is set, then look up what to do */
+ retstatus = npy_get_floatstatus_barrier((char*)&out);
+ if (retstatus) {
+ int bufsize, errmask;
+ PyObject *errobj;
+
+ if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask,
+ &errobj) < 0) {
+ return NULL;
+ }
+ first = 1;
+ if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
+ Py_XDECREF(errobj);
+ return NULL;
+ }
+ Py_XDECREF(errobj);
+ }
+
+ ret = PyArrayScalar_New(@Name@);
+ if (ret == NULL) {
+ return NULL;
+ }
+ PyArrayScalar_ASSIGN(ret, @Name@, out);
+
+ return ret;
+}
+
+#elif @isint@
+
+static PyObject *
+@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+ PyObject *ret;
+ @type@ arg1, arg2, out;
+
+ BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
+
+ switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
+ case 0:
+ break;
+ case -1:
+        /* can't cast both safely; mixed types? */
+ return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
+ case -2:
+ /* use default handling */
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
+ case -3:
+ default:
+ /*
+ * special case for longdouble and clongdouble
+ * because they have a recursive getitem in their dtype
+ */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ if (modulo != Py_None) {
+ /* modular exponentiation is not implemented (gh-8804) */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ npy_clear_floatstatus_barrier((char*)&out);
+
+ /*
+ * here we do the actual calculation with arg1 and arg2
+ * as a function call.
+ */
+ if (arg2 < 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Integers to negative integer powers are not allowed.");
+ return NULL;
+ }
+ @name@_ctype_power(arg1, arg2, &out);
+
+ ret = PyArrayScalar_New(@Name@);
+ if (ret == NULL) {
+ return NULL;
+ }
+ PyArrayScalar_ASSIGN(ret, @Name@, out);
+
+ return ret;
+}
+
+#else
+
+static PyObject *
+@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+ PyObject *ret;
+ @type@ arg1, arg2;
+ int retstatus;
+ int first;
+
+ @type@ out = @zero@;
+
+ BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
+
+ switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
+ case 0:
+ break;
+ case -1:
+        /* can't cast both safely; mixed types? */
+ return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
+ case -2:
+ /* use default handling */
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
+ case -3:
+ default:
+ /*
+ * special case for longdouble and clongdouble
+ * because they have a recursive getitem in their dtype
+ */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ if (modulo != Py_None) {
+ /* modular exponentiation is not implemented (gh-8804) */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ npy_clear_floatstatus_barrier((char*)&out);
+
+ /*
+ * here we do the actual calculation with arg1 and arg2
+ * as a function call.
+ */
+ if (@iszero@(arg2)) {
+ out = @one@;
+ }
+ else {
+ @name@_ctype_power(arg1, arg2, &out);
+ }
+
+ /* Check status flag. If it is set, then look up what to do */
+ retstatus = npy_get_floatstatus_barrier((char*)&out);
+ if (retstatus) {
+ int bufsize, errmask;
+ PyObject *errobj;
+
+ if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask,
+ &errobj) < 0) {
+ return NULL;
+ }
+ first = 1;
+ if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
+ Py_XDECREF(errobj);
+ return NULL;
+ }
+ Py_XDECREF(errobj);
+ }
+
+ ret = PyArrayScalar_New(@Name@);
+ if (ret == NULL) {
+ return NULL;
+ }
+ PyArrayScalar_ASSIGN(ret, @Name@, out);
+
+ return ret;
+}
+
+#endif
+
+/**end repeat**/
+#undef _IS_ZERO
+
+
+/**begin repeat
+ *
+ * #name = cfloat, cdouble, clongdouble#
+ *
+ */
+
+/**begin repeat1
+ *
+ * #oper = divmod, remainder#
+ *
+ */
+
+#define @name@_@oper@ NULL
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = half, float, double, longdouble, cfloat, cdouble, clongdouble#
+ *
+ */
+
+/**begin repeat1
+ *
+ * #oper = lshift, rshift, and, or, xor#
+ *
+ */
+
+#define @name@_@oper@ NULL
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/**begin repeat
+ * #name = (byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble)*3,
+ *
+ * byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong#
+ *
+ * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble)*3,
+ *
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ *
+ * #otype = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble)*2,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble,
+ *
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ *
+ * #OName = (Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble)*2,
+ * Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * Float, Double, LongDouble,
+ *
+ * Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong#
+ *
+ * #oper = negative*17, positive*17, absolute*17, invert*10#
+ */
+static PyObject *
+@name@_@oper@(PyObject *a)
+{
+ @type@ arg1;
+ @otype@ out;
+ PyObject *ret;
+
+ switch(_@name@_convert_to_ctype(a, &arg1)) {
+ case 0:
+ break;
+ case -1:
+        /* can't cast safely; return NotImplemented */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ case -2:
+ /* use default handling */
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ return PyGenericArrType_Type.tp_as_number->nb_@oper@(a);
+ }
+
+ /*
+     * here we do the actual calculation with arg1
+     * as a function call.
+ */
+
+ @name@_ctype_@oper@(arg1, &out);
+
+ ret = PyArrayScalar_New(@OName@);
+ PyArrayScalar_ASSIGN(ret, @OName@, out);
+
+ return ret;
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = half, float, double, longdouble, cfloat, cdouble, clongdouble#
+ */
+
+#define @name@_invert NULL
+
+/**end repeat**/
+
+#if defined(NPY_PY3K)
+#define NONZERO_NAME(prefix) prefix##bool
+#else
+#define NONZERO_NAME(prefix) prefix##nonzero
+#endif
+
+#define _IS_NONZERO(x) (x != 0)
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort, int,
+ * uint, long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #simp = 1*14, 0*3#
+ * #nonzero = _IS_NONZERO*10, !npy_half_iszero, _IS_NONZERO*6#
+ */
+static int
+NONZERO_NAME(@name@_)(PyObject *a)
+{
+ int ret;
+ @type@ arg1;
+
+ if (_@name@_convert_to_ctype(a, &arg1) < 0) {
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ return PyGenericArrType_Type.tp_as_number->NONZERO_NAME(nb_)(a);
+ }
+
+ /*
+     * here we do the actual calculation with arg1
+     * as a function call.
+ */
+
+#if @simp@
+ ret = @nonzero@(arg1);
+#else
+ ret = (@nonzero@(arg1.real) || @nonzero@(arg1.imag));
+#endif
+
+ return ret;
+}
+/**end repeat**/
+#undef _IS_NONZERO
+
+
+static int
+emit_complexwarning(void)
+{
+ static PyObject *cls = NULL;
+ if (cls == NULL) {
+ PyObject *mod;
+ mod = PyImport_ImportModule("numpy.core");
+ assert(mod != NULL);
+ cls = PyObject_GetAttrString(mod, "ComplexWarning");
+ assert(cls != NULL);
+ Py_DECREF(mod);
+ }
+ return PyErr_WarnEx(cls,
+ "Casting complex values to real discards the imaginary part", 1);
+}
+
+/**begin repeat
+ *
+ * #name = byte, ubyte, short, ushort, int,
+ * uint, long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ *
+ * #Name = Byte, UByte, Short, UShort, Int,
+ * UInt, Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ *
+ * #cmplx = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1#
+ * #sign = (signed, unsigned)*5, , , , , , , #
+ * #ctype = long*8, PY_LONG_LONG*2,
+ * double*3, npy_longdouble, double*2, npy_longdouble#
+ * #to_ctype = , , , , , , , , , , npy_half_to_double, , , , , , #
+ * #func = (PyLong_FromLong, PyLong_FromUnsignedLong)*4,
+ * PyLong_FromLongLong, PyLong_FromUnsignedLongLong,
+ * PyLong_FromDouble*3, npy_longdouble_to_PyLong,
+ * PyLong_FromDouble*2, npy_longdouble_to_PyLong#
+ */
+static PyObject *
+@name@_int(PyObject *obj)
+{
+ PyObject *long_result;
+
+#if @cmplx@
+ @sign@ @ctype@ x = @to_ctype@(PyArrayScalar_VAL(obj, @Name@).real);
+#else
+ @sign@ @ctype@ x = @to_ctype@(PyArrayScalar_VAL(obj, @Name@));
+#endif
+
+#if @cmplx@
+ if (emit_complexwarning() < 0) {
+ return NULL;
+ }
+#endif
+
+ long_result = @func@(x);
+ if (long_result == NULL){
+ return NULL;
+ }
+
+#ifndef NPY_PY3K
+ /* Invoke long.__int__ to try to downcast */
+ {
+ PyObject *before_downcast = long_result;
+ long_result = Py_TYPE(long_result)->tp_as_number->nb_int(long_result);
+ Py_DECREF(before_downcast);
+ }
+#endif
+
+ return long_result;
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = (byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble)*2#
+ * #Name = (Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble)*2#
+ * #cmplx = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1)*2#
+ * #to_ctype = (, , , , , , , , , , npy_half_to_double, , , , , , )*2#
+ * #which = long*17, float*17#
+ * #func = (PyLong_FromLongLong, PyLong_FromUnsignedLongLong)*5,
+ * PyLong_FromDouble*3, npy_longdouble_to_PyLong,
+ * PyLong_FromDouble*2, npy_longdouble_to_PyLong,
+ * PyFloat_FromDouble*17#
+ */
+static NPY_INLINE PyObject *
+@name@_@which@(PyObject *obj)
+{
+#if @cmplx@
+ if (emit_complexwarning() < 0) {
+ return NULL;
+ }
+ return @func@(@to_ctype@(PyArrayScalar_VAL(obj, @Name@).real));
+#else
+ return @func@(@to_ctype@(PyArrayScalar_VAL(obj, @Name@)));
+#endif
+}
+/**end repeat**/
+
+#if !defined(NPY_PY3K)
+
+/**begin repeat
+ *
+ * #name = (byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble)*2#
+ * #oper = oct*17, hex*17#
+ * #kind = (int*5, long*5, int*2, long*2, int, long*2)*2#
+ * #cap = (Int*5, Long*5, Int*2, Long*2, Int, Long*2)*2#
+ */
+static PyObject *
+@name@_@oper@(PyObject *obj)
+{
+ PyObject *pyint;
+ pyint = @name@_@kind@(obj);
+ if (pyint == NULL) {
+ return NULL;
+ }
+ return Py@cap@_Type.tp_as_number->nb_@oper@(pyint);
+}
+/**end repeat**/
+
+#endif
+
+/**begin repeat
+ * #oper = le, ge, lt, gt, eq, ne#
+ * #op = <=, >=, <, >, ==, !=#
+ * #halfop = npy_half_le, npy_half_ge, npy_half_lt,
+ * npy_half_gt, npy_half_eq, npy_half_ne#
+ */
+#define def_cmp_@oper@(arg1, arg2) (arg1 @op@ arg2)
+#define cmplx_cmp_@oper@(arg1, arg2) ((arg1.real == arg2.real) ? \
+ arg1.imag @op@ arg2.imag : \
+ arg1.real @op@ arg2.real)
+#define def_half_cmp_@oper@(arg1, arg2) @halfop@(arg1, arg2)
+/**end repeat**/
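+/*
+ * The cmplx_cmp_* macros order complex scalars lexicographically: the
+ * real parts are compared first and the imaginary parts only break
+ * ties, so e.g. (1+5j) < (2+0j) while (1+2j) < (1+5j).  Half-precision
+ * comparisons go through the npy_half_* helpers instead of raw bit
+ * patterns.
+ */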
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #simp = def*10, def_half, def*3, cmplx*3#
+ */
+static PyObject*
+@name@_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+ npy_@name@ arg1, arg2;
+ int out=0;
+
+ RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+
+ switch(_@name@_convert2_to_ctypes(self, &arg1, other, &arg2)) {
+ case 0:
+ break;
+ case -1:
+        /* can't cast both safely; fall through to the generic path */
+ case -2:
+ /* use ufunc */
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+ case -3:
+ /*
+ * special case for longdouble and clongdouble
+ * because they have a recursive getitem in their dtype
+ */
+ Py_INCREF(Py_NotImplemented);
+ return Py_NotImplemented;
+ }
+
+ /* here we do the actual calculation with arg1 and arg2 */
+ switch (cmp_op) {
+ case Py_EQ:
+ out = @simp@_cmp_eq(arg1, arg2);
+ break;
+ case Py_NE:
+ out = @simp@_cmp_ne(arg1, arg2);
+ break;
+ case Py_LE:
+ out = @simp@_cmp_le(arg1, arg2);
+ break;
+ case Py_GE:
+ out = @simp@_cmp_ge(arg1, arg2);
+ break;
+ case Py_LT:
+ out = @simp@_cmp_lt(arg1, arg2);
+ break;
+ case Py_GT:
+ out = @simp@_cmp_gt(arg1, arg2);
+ break;
+ }
+
+ if (out) {
+ PyArrayScalar_RETURN_TRUE;
+ }
+ else {
+ PyArrayScalar_RETURN_FALSE;
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+**/
+static PyNumberMethods @name@_as_number = {
+ (binaryfunc)@name@_add, /*nb_add*/
+ (binaryfunc)@name@_subtract, /*nb_subtract*/
+ (binaryfunc)@name@_multiply, /*nb_multiply*/
+#if !defined(NPY_PY3K)
+ (binaryfunc)@name@_divide, /*nb_divide*/
+#endif
+ (binaryfunc)@name@_remainder, /*nb_remainder*/
+ (binaryfunc)@name@_divmod, /*nb_divmod*/
+ (ternaryfunc)@name@_power, /*nb_power*/
+ (unaryfunc)@name@_negative,
+ (unaryfunc)@name@_positive, /*nb_pos*/
+ (unaryfunc)@name@_absolute, /*nb_abs*/
+#if defined(NPY_PY3K)
+ (inquiry)@name@_bool, /*nb_bool*/
+#else
+ (inquiry)@name@_nonzero, /*nb_nonzero*/
+#endif
+ (unaryfunc)@name@_invert, /*nb_invert*/
+ (binaryfunc)@name@_lshift, /*nb_lshift*/
+ (binaryfunc)@name@_rshift, /*nb_rshift*/
+ (binaryfunc)@name@_and, /*nb_and*/
+ (binaryfunc)@name@_xor, /*nb_xor*/
+ (binaryfunc)@name@_or, /*nb_or*/
+#if !defined(NPY_PY3K)
+ 0, /*nb_coerce*/
+#endif
+ (unaryfunc)@name@_int, /*nb_int*/
+#if defined(NPY_PY3K)
+ (unaryfunc)0, /*nb_reserved*/
+#else
+ (unaryfunc)@name@_long, /*nb_long*/
+#endif
+ (unaryfunc)@name@_float, /*nb_float*/
+#if !defined(NPY_PY3K)
+ (unaryfunc)@name@_oct, /*nb_oct*/
+ (unaryfunc)@name@_hex, /*nb_hex*/
+#endif
+ 0, /*inplace_add*/
+ 0, /*inplace_subtract*/
+ 0, /*inplace_multiply*/
+#if !defined(NPY_PY3K)
+ 0, /*inplace_divide*/
+#endif
+ 0, /*inplace_remainder*/
+ 0, /*inplace_power*/
+ 0, /*inplace_lshift*/
+ 0, /*inplace_rshift*/
+ 0, /*inplace_and*/
+ 0, /*inplace_xor*/
+ 0, /*inplace_or*/
+ (binaryfunc)@name@_floor_divide, /*nb_floor_divide*/
+ (binaryfunc)@name@_true_divide, /*nb_true_divide*/
+ 0, /*nb_inplace_floor_divide*/
+ 0, /*nb_inplace_true_divide*/
+ (unaryfunc)NULL, /*nb_index*/
+#if PY_VERSION_HEX >= 0x03050000
+ 0, /*nb_matrix_multiply*/
+ 0, /*nb_inplace_matrix_multiply*/
+#endif
+};
+/**end repeat**/
+
+NPY_NO_EXPORT void
+add_scalarmath(void)
+{
+ /**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #NAME = Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble#
+ **/
+ @name@_as_number.nb_index = Py@NAME@ArrType_Type.tp_as_number->nb_index;
+ Py@NAME@ArrType_Type.tp_as_number = &(@name@_as_number);
+ Py@NAME@ArrType_Type.tp_richcompare = @name@_richcompare;
+ /**end repeat**/
+}
+
+static int
+get_functions(PyObject * mm)
+{
+ PyObject *obj;
+ void **funcdata;
+ char *signatures;
+ int i, j;
+ int ret = -1;
+
+ /* Get the nc_pow functions */
+ /* Get the pow functions */
+ obj = PyObject_GetAttrString(mm, "power");
+ if (obj == NULL) {
+ goto fail;
+ }
+ funcdata = ((PyUFuncObject *)obj)->data;
+ signatures = ((PyUFuncObject *)obj)->types;
+
+ i = 0;
+ j = 0;
+ while (signatures[i] != NPY_FLOAT) {
+ i += 3;
+ j++;
+ }
+ _basic_float_pow = funcdata[j];
+ _basic_double_pow = funcdata[j + 1];
+ _basic_longdouble_pow = funcdata[j + 2];
+ _basic_cfloat_pow = funcdata[j + 3];
+ _basic_cdouble_pow = funcdata[j + 4];
+ _basic_clongdouble_pow = funcdata[j + 5];
+ Py_DECREF(obj);
+
+ return ret = 0;
+
+ fail:
+ Py_DECREF(mm);
+ return ret;
+}
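+/*
+ * get_functions scans the flattened `types` table of the "power" ufunc
+ * three entries at a time (two inputs plus one output per inner loop)
+ * until it reaches the NPY_FLOAT loop, and then assumes the following
+ * five loops are DOUBLE, LONGDOUBLE, CFLOAT, CDOUBLE and CLONGDOUBLE
+ * in that order when caching their function pointers.
+ */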
+
+
+NPY_NO_EXPORT int initscalarmath(PyObject * m)
+{
+ if (get_functions(m) < 0) {
+ return -1;
+ }
+
+ add_scalarmath();
+
+ return 0;
+}
diff --git a/contrib/python/numpy/py2/numpy/core/src/umath/simd.inc.src b/contrib/python/numpy/py2/numpy/core/src/umath/simd.inc.src
new file mode 100644
index 00000000000..4bb8569beeb
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/src/umath/simd.inc.src
@@ -0,0 +1,1219 @@
+/* -*- c -*- */
+
+/*
+ * This file is for the definitions of simd vectorized operations.
+ *
+ * Currently it contains SSE2 functions that are built on amd64, x32 or
+ * non-generic builds (CFLAGS=-march=...).
+ * In the future it may contain other instruction sets, such as AVX or NEON,
+ * detected at runtime, in which case it needs to be included indirectly via
+ * a file compiled with special options (or using gcc target attributes) so
+ * the binary stays portable.
+ */
+
+
+#ifndef __NPY_SIMD_INC
+#define __NPY_SIMD_INC
+
+#include "lowlevel_strided_loops.h"
+#include "numpy/npy_common.h"
+#include "numpy/npy_math.h"
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+#include <emmintrin.h>
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+#include <immintrin.h>
+#else
+#undef __AVX2__
+#undef __AVX512F__
+#endif
+#endif
+#include <assert.h>
+#include <stdlib.h>
+#include <float.h>
+#include <string.h> /* for memcpy */
+
+#define VECTOR_SIZE_BYTES 16
+
+static NPY_INLINE npy_uintp
+abs_ptrdiff(char *a, char *b)
+{
+ return (a > b) ? (a - b) : (b - a);
+}
+
+
+/*
+ * stride is equal to element size and input and destination are equal or
+ * don't overlap within one register. The check of the steps against
+ * esize also guarantees that steps are >= 0.
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+ ((abs_ptrdiff(args[1], args[0]) == 0))))
+
+#define IS_BLOCKABLE_REDUCE(esize, vsize) \
+ (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
+ npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)))
+
+#define IS_BLOCKABLE_BINARY(esize, vsize) \
+ (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
+ npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)) && \
+ (abs_ptrdiff(args[2], args[0]) >= (vsize) || \
+ abs_ptrdiff(args[2], args[0]) == 0) && \
+ (abs_ptrdiff(args[2], args[1]) >= (vsize) || \
+ abs_ptrdiff(args[2], args[1]) == 0))
+
+#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
+ (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
+ npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
+ ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
+ (abs_ptrdiff(args[2], args[1]) == 0)) && \
+ abs_ptrdiff(args[2], args[0]) >= (esize))
+
+#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
+ (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
+ npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
+ ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
+ (abs_ptrdiff(args[2], args[0]) == 0)) && \
+ abs_ptrdiff(args[2], args[1]) >= (esize))
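+
+/*
+ * Worked example (illustrative): for npy_double (esize == 8) with 16 byte
+ * SSE2 vectors, IS_BLOCKABLE_BINARY requires that all three operands step by
+ * exactly 8 bytes, that every base pointer is 8 byte aligned, and that the
+ * output either exactly aliases an input or lies at least 16 bytes away from
+ * it, so a vector store can never clobber elements a later vector load still
+ * has to read.
+ */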
+
+#undef abs_ptrdiff
+
+#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
+ npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
+ (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
+ npy_is_aligned(args[1], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
+ (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
+ npy_is_aligned(args[0], (esize)))
+
+/* align var to alignment */
+#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
+ alignment, n);\
+ for(i = 0; i < peel; i++)
+
+#define LOOP_BLOCKED(type, vsize)\
+ for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+ i += (vsize / sizeof(type)))
+
+#define LOOP_BLOCKED_END\
+ for (; i < n; i++)
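+
+/*
+ * Illustrative usage of the three macros above (a sketch, not code from this
+ * file): LOOP_BLOCK_ALIGN_VAR declares the loop index i and a peel count and
+ * runs a scalar loop until the pointer is aligned, LOOP_BLOCKED runs the
+ * vectorized main loop, and LOOP_BLOCKED_END handles the scalar tail:
+ *
+ *     LOOP_BLOCK_ALIGN_VAR(op, npy_float, VECTOR_SIZE_BYTES) {
+ *         op[i] = scalar_kernel(ip[i]);
+ *     }
+ *     LOOP_BLOCKED(npy_float, VECTOR_SIZE_BYTES) {
+ *         // load, compute and store one 16-byte vector per iteration
+ *     }
+ *     LOOP_BLOCKED_END {
+ *         op[i] = scalar_kernel(ip[i]);
+ *     }
+ *
+ * scalar_kernel is a placeholder name, not a function defined here.
+ */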
+
+
+/*
+ * Dispatcher functions
+ * decide whether the operation can be vectorized and, if so, run it;
+ * they return true if the vectorized loop was run and false if nothing was done
+ */
+
+/*
+ *****************************************************************************
+ ** FLOAT DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #vector = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*3, unary_reduce*2#
+ */
+
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+
+/* prototypes */
+static void
+sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+ if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+ sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat1**/
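+
+/*
+ * Illustrative call pattern (a sketch, not code from this file): the ufunc
+ * inner loops typically try the dispatcher first and fall back to a plain
+ * scalar loop when it returns 0, e.g.
+ *
+ *     if (!run_unary_simd_sqrt_FLOAT(args, dimensions, steps)) {
+ *         ...scalar loop over dimensions[0] elements...
+ *     }
+ */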
+
+/**begin repeat1
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ */
+
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+
+/* prototypes */
+static void
+sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ @type@ * op = (@type@ *)args[2];
+ npy_intp n = dimensions[0];
+#if defined __AVX512F__
+ const npy_intp vector_size_bytes = 64;
+#elif defined __AVX2__
+ const npy_intp vector_size_bytes = 32;
+#else
+ const npy_intp vector_size_bytes = 32;
+#endif
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
+ sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
+ sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
+ sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
+ * logical_and, logical_or#
+ * #simd = 1, 1, 1, 1, 1, 1, 0, 0#
+ */
+
+#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
+
+/* prototypes */
+static void
+sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ npy_bool * op = (npy_bool *)args[2];
+ npy_intp n = dimensions[0];
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+ sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+ sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+ sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isfinite, isinf, signbit#
+ */
+
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+
+static void
+sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_@kind@_simd_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+ if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
+ npy_is_aligned(args[0], sizeof(@type@))) {
+ sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** BOOL DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ */
+
+#if defined NPY_HAVE_SSE2_INTRINSICS
+static void
+sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
+ npy_intp n);
+
+static void
+sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
+#endif
+
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined NPY_HAVE_SSE2_INTRINSICS
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
+ sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+ (npy_bool*)args[1], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined NPY_HAVE_SSE2_INTRINSICS
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
+ sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+ dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * # kind = absolute, logical_not#
+ */
+
+#if defined NPY_HAVE_SSE2_INTRINSICS
+static void
+sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
+#endif
+
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined NPY_HAVE_SSE2_INTRINSICS
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
+ sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat**/
+
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+
+/*
+ * Vectorized operations
+ */
+/*
+ *****************************************************************************
+ ** FLOAT LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+* horizontal reductions on a vector
+* # VOP = min, max#
+*/
+
+static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
+{
+ npy_float r;
+ __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
+ __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */
+ tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
+ _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */
+ return r;
+}
+
+static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
+{
+ npy_double r;
+ __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
+ _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
+ return r;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
+ * #vtype = __m128, __m128d#
+ * #vtype256 = __m256, __m256d#
+ * #vtype512 = __m512, __m512d#
+ * #vpre = _mm, _mm#
+ * #vpre256 = _mm256, _mm256#
+ * #vpre512 = _mm512, _mm512#
+ * #vsuf = ps, pd#
+ * #vsufs = ss, sd#
+ * #nan = NPY_NANF, NPY_NAN#
+ * #double = 0, 1#
+ * #cast = _mm_castps_si128, _mm_castpd_si128#
+ */
+
+
+/**begin repeat1
+* Arithmetic
+* # kind = add, subtract, multiply, divide#
+* # OP = +, -, *, /#
+* # VOP = add, sub, mul, div#
+*/
+
+static void
+sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+#ifdef __AVX512F__
+ const npy_intp vector_size_bytes = 64;
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[i];
+ /* lots of specializations, to squeeze out max performance */
+ if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+ else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+#elif __AVX2__
+ const npy_intp vector_size_bytes = 32;
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[i];
+ /* lots of specializations, to squeeze out max performance */
+ if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
+ npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+ else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+#else
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+ op[i] = ip1[i] @OP@ ip2[i];
+ /* lots of specializations, to squeeze out max performance */
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
+ npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+ else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ }
+#endif
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+}
+
+
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+#ifdef __AVX512F__
+ const npy_intp vector_size_bytes = 64;
+ const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[0] @OP@ ip2[i];
+ if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+
+
+#elif __AVX2__
+ const npy_intp vector_size_bytes = 32;
+ const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[0] @OP@ ip2[i];
+ if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+#else
+ const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+ op[i] = ip1[0] @OP@ ip2[i];
+ if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+#endif
+ LOOP_BLOCKED_END {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+}
+
+
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+#ifdef __AVX512F__
+ const npy_intp vector_size_bytes = 64;
+ const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[0];
+ if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
+ @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
+ @vpre512@_store_@vsuf@(&op[i], c);
+ }
+ }
+
+#elif __AVX2__
+ const npy_intp vector_size_bytes = 32;
+ const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
+ op[i] = ip1[i] @OP@ ip2[0];
+ if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, vector_size_bytes) {
+ @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
+ @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
+ @vpre256@_store_@vsuf@(&op[i], c);
+ }
+ }
+#else
+ const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+ op[i] = ip1[i] @OP@ ip2[0];
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @vpre@_store_@vsuf@(&op[i], c);
+ }
+ }
+#endif
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+}
+
+/**end repeat1**/
+
+/*
+ * compress 4 vectors to 4/8 bytes in op, filled with 0 or 1.
+ * The last vector is passed as a pointer because MSVC 2010 is unable to ignore
+ * the calling convention, leading to C2719 on 32 bit; see #4795.
+ */
+static NPY_INLINE void
+sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
+ npy_bool * op)
+{
+ const __m128i mask = @vpre@_set1_epi8(0x1);
+ __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
+ __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
+ __m128i rr = @vpre@_packs_epi16(ir1, ir2);
+#if @double@
+ rr = @vpre@_packs_epi16(rr, rr);
+ rr = @vpre@_and_si128(rr, mask);
+ @vpre@_storel_epi64((__m128i*)op, rr);
+#else
+ rr = @vpre@_and_si128(rr, mask);
+ @vpre@_storeu_si128((__m128i*)op, rr);
+#endif
+}
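+
+/*
+ * Worked example (for illustration): with float input, the four __m128
+ * compare results hold 16 lanes of all-ones/all-zeros; two packs_epi32 plus a
+ * packs_epi16 narrow them to 16 bytes, which are masked to 0/1 and stored
+ * with a 16-byte store. With double input there are only 8 lanes, so an
+ * extra packs_epi16 and an 8-byte storel_epi64 are used instead.
+ */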
+
+static void
+sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = npy_signbit(ip1[i]) != 0;
+ }
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ int r = @vpre@_movemask_@vsuf@(a);
+ if (sizeof(@type@) == 8) {
+ op[i] = r & 1;
+ op[i + 1] = (r >> 1);
+ }
+ else {
+ op[i] = r & 1;
+ op[i + 1] = (r >> 1) & 1;
+ op[i + 2] = (r >> 2) & 1;
+ op[i + 3] = (r >> 3);
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = npy_signbit(ip1[i]) != 0;
+ }
+}
+
+/**begin repeat1
+ * #kind = isnan, isfinite, isinf#
+ * #var = 0, 1, 2#
+ */
+
+static void
+sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
+{
+#if @var@ != 0 /* isinf/isfinite */
+ /* signbit mask 0x7FFFFFFF after andnot */
+ const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+ const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
+ @vpre@_setzero_@vsuf@());
+#if @double@
+ const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
+#else
+ const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
+#endif
+#endif
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = npy_@kind@(ip1[i]) != 0;
+ }
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ r1, r2, r3, r4;
+#if @var@ != 0 /* isinf/isfinite */
+ /* fabs via masking of sign bit */
+ r1 = @vpre@_andnot_@vsuf@(mask, a);
+ r2 = @vpre@_andnot_@vsuf@(mask, b);
+ r3 = @vpre@_andnot_@vsuf@(mask, c);
+ r4 = @vpre@_andnot_@vsuf@(mask, d);
+#if @var@ == 1 /* isfinite */
+ /* negative compare against max float, nan is always true */
+ r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
+ r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
+ r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
+ r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
+#else /* isinf */
+ r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
+ r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
+ r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
+ r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
+#endif
+ /* flip results to what we want (andnot as there is no sse not) */
+ r1 = @vpre@_andnot_@vsuf@(r1, ones);
+ r2 = @vpre@_andnot_@vsuf@(r2, ones);
+ r3 = @vpre@_andnot_@vsuf@(r3, ones);
+ r4 = @vpre@_andnot_@vsuf@(r4, ones);
+#endif
+#if @var@ == 0 /* isnan */
+ r1 = @vpre@_cmpneq_@vsuf@(a, a);
+ r2 = @vpre@_cmpneq_@vsuf@(b, b);
+ r3 = @vpre@_cmpneq_@vsuf@(c, c);
+ r4 = @vpre@_cmpneq_@vsuf@(d, d);
+#endif
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = npy_@kind@(ip1[i]) != 0;
+ }
+}
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+*/
+
+/* sets invalid fpu flag on QNaN for consistency with packed compare */
+static NPY_INLINE int
+sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
+{
+ @vtype@ one = @vpre@_set1_@vsuf@(1);
+ @type@ tmp;
+ @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
+ @vpre@_load_@vsufs@(&b));
+ v = @vpre@_and_@vsuf@(v, one);
+ @vpre@_store_@vsufs@(&tmp, v);
+ return tmp;
+}
+
+static void
+sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
+ }
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
+ @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
+ @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
+ @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
+ }
+}
+
+
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
+ }
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
+ @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
+ @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
+ @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
+ }
+}
+
+
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
+ }
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
+ @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
+ @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
+ @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
+ }
+}
+/**end repeat1**/
+
+static void
+sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
+{
+ /* align output to VECTOR_SIZE_BYTES bytes */
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = @scalarf@(ip[i]);
+ }
+ assert(n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
+ npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
+ if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = @scalarf@(ip[i]);
+ }
+}
+
+
+static NPY_INLINE
+@type@ scalar_abs_@type@(@type@ v)
+{
+ /* add 0 to clear -0.0 */
+ return (v > 0 ? v: -v) + 0;
+}
+
+static NPY_INLINE
+@type@ scalar_neg_@type@(@type@ v)
+{
+ return -v;
+}
+
+/**begin repeat1
+ * #kind = absolute, negative#
+ * #VOP = andnot, xor#
+ * #scalar = scalar_abs, scalar_neg#
+ **/
+static void
+sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
+{
+ /*
+ * get 0x7FFFFFFF mask (everything but signbit set)
+ * float & ~mask will remove the sign, float ^ mask flips the sign
+ * this is equivalent to how the compiler implements fabs on amd64
+ */
+ const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+ /* align output to VECTOR_SIZE_BYTES bytes */
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
+ op[i] = @scalar@_@type@(ip[i]);
+ }
+ assert(n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
+ npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
+ if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = @scalar@_@type@(ip[i]);
+ }
+}
+/**end repeat1**/
+
+
+/**begin repeat1
+ * #kind = maximum, minimum#
+ * #VOP = max, min#
+ * #OP = >=, <=#
+ **/
+/* argument order swapped: unary reduce takes (ip, op) whereas plain unary takes (op, ip) */
+static void
+sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
+{
+ const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
+ LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
+ /* Order of operations important for MSVC 2015 */
+ *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
+ }
+ assert(n < (stride) || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
+ if (i + 3 * stride <= n) {
+ /* load the first elements */
+ @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
+ @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
+ i += 2 * stride;
+
+ /* minps/minpd will set invalid flag if nan is encountered */
+ npy_clear_floatstatus_barrier((char*)&c1);
+ LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
+ @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
+ @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
+ c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
+ c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
+ }
+ c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
+
+ if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
+ *op = @nan@;
+ }
+ else {
+ @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
+ /* Order of operations important for MSVC 2015 */
+ *op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
+ }
+ }
+ LOOP_BLOCKED_END {
+ /* Order of operations important for MSVC 2015 */
+ *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
+ }
+ npy_clear_floatstatus_barrier((char*)op);
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** BOOL LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ * # and = 0, 1#
+ * # op = ||, &&#
+ * # sc = !=, ==#
+ * # vpre = _mm*2#
+ * # vsuf = si128*2#
+ * # vtype = __m128i*2#
+ * # type = npy_bool*2#
+ * # vload = _mm_load_si128*2#
+ * # vloadu = _mm_loadu_si128*2#
+ * # vstore = _mm_store_si128*2#
+ */
+
+/*
+ * convert any bit set to boolean true so vectorized and normal operations are
+ * consistent; this should not be required if bool is used correctly everywhere, but
+ * you never know
+ */
+#if !@and@
+static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
+{
+ const @vtype@ zero = @vpre@_setzero_@vsuf@();
+ const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
+ /* get 0xFF for zeros */
+ @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
+ /* filled with 0xFF/0x00, negate and mask to boolean true */
+ return @vpre@_andnot_@vsuf@(tmp, truemask);
+}
+#endif
+
+static void
+sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+ op[i] = ip1[i] @op@ ip2[i];
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
+ @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
+#if @and@
+ const @vtype@ zero = @vpre@_setzero_@vsuf@();
+ /* get 0xFF for non-zeros */
+ @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
+ /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */
+ tmp = @vpre@_andnot_@vsuf@(tmp, b);
+#else
+ @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
+#endif
+
+ @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
+ }
+ LOOP_BLOCKED_END {
+ op[i] = (ip1[i] @op@ ip2[i]);
+ }
+}
+
+
+static void
+sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
+{
+ const @vtype@ zero = @vpre@_setzero_@vsuf@();
+ LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
+ *op = *op @op@ ip[i];
+ if (*op @sc@ 0) {
+ return;
+ }
+ }
+ /* unrolled once to replace a slow movmsk with a fast pmaxb */
+ LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
+ @vtype@ v = @vload@((@vtype@*)&ip[i]);
+ @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
+ v = @vpre@_cmpeq_epi8(v, zero);
+ v2 = @vpre@_cmpeq_epi8(v2, zero);
+#if @and@
+ if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
+ *op = 0;
+#else
+ if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
+ *op = 1;
+#endif
+ return;
+ }
+ }
+ LOOP_BLOCKED_END {
+ *op = *op @op@ ip[i];
+ if (*op @sc@ 0) {
+ return;
+ }
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * # kind = absolute, logical_not#
+ * # op = !=, ==#
+ * # not = 0, 1#
+ * # vpre = _mm*2#
+ * # vsuf = si128*2#
+ * # vtype = __m128i*2#
+ * # type = npy_bool*2#
+ * # vloadu = _mm_loadu_si128*2#
+ * # vstore = _mm_store_si128*2#
+ */
+
+static void
+sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+ op[i] = (ip[i] @op@ 0);
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
+#if @not@
+ const @vtype@ zero = @vpre@_setzero_@vsuf@();
+ const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
+ /* equivalent to byte_to_true but can skip the negation */
+ a = @vpre@_cmpeq_epi8(a, zero);
+ a = @vpre@_and_@vsuf@(a, truemask);
+#else
+ /* abs is kind of pointless but maybe it's used for byte_to_true */
+ a = byte_to_true(a);
+#endif
+ @vstore@((@vtype@*)&op[i], a);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = (ip[i] @op@ 0);
+ }
+}
+
+/**end repeat**/
+
+#undef VECTOR_SIZE_BYTES
+
+#endif /* NPY_HAVE_SSE2_INTRINSICS */
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/core/tests/__init__.py b/contrib/python/numpy/py2/numpy/core/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/core/tests/_locales.py b/contrib/python/numpy/py2/numpy/core/tests/_locales.py
new file mode 100644
index 00000000000..52e4ff36d5e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/_locales.py
@@ -0,0 +1,76 @@
+"""Provide class for testing in French locale
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys
+import locale
+
+import pytest
+
+__ALL__ = ['CommaDecimalPointLocale']
+
+
+def find_comma_decimal_point_locale():
+ """See if platform has a decimal point as comma locale.
+
+ Find a locale that uses a comma instead of a period as the
+ decimal point.
+
+ Returns
+ -------
+ old_locale: str
+ Locale when the function was called.
+ new_locale: {str, None}
+ First French locale found, None if none found.
+
+ """
+ if sys.platform == 'win32':
+ locales = ['FRENCH']
+ else:
+ locales = ['fr_FR', 'fr_FR.UTF-8', 'fi_FI', 'fi_FI.UTF-8']
+
+ old_locale = locale.getlocale(locale.LC_NUMERIC)
+ new_locale = None
+ try:
+ for loc in locales:
+ try:
+ locale.setlocale(locale.LC_NUMERIC, loc)
+ new_locale = loc
+ break
+ except locale.Error:
+ pass
+ finally:
+ locale.setlocale(locale.LC_NUMERIC, locale=old_locale)
+ return old_locale, new_locale
+
+
+class CommaDecimalPointLocale(object):
+ """Sets LC_NUMERIC to a locale with comma as decimal point.
+
+ Classes derived from this class have setup and teardown methods that run
+ tests with locale.LC_NUMERIC set to a locale where commas (',') are used as
+ the decimal point instead of periods ('.'). On exit the locale is restored
+ to the initial locale. It also serves as a context manager with the same
+ effect. If no such locale is available, the test is skipped.
+
+ .. versionadded:: 1.15.0
+
+ """
+ (cur_locale, tst_locale) = find_comma_decimal_point_locale()
+
+ def setup(self):
+ if self.tst_locale is None:
+ pytest.skip("No French locale available")
+ locale.setlocale(locale.LC_NUMERIC, locale=self.tst_locale)
+
+ def teardown(self):
+ locale.setlocale(locale.LC_NUMERIC, locale=self.cur_locale)
+
+ def __enter__(self):
+ if self.tst_locale is None:
+ pytest.skip("No French locale available")
+ locale.setlocale(locale.LC_NUMERIC, locale=self.tst_locale)
+
+ def __exit__(self, type, value, traceback):
+ locale.setlocale(locale.LC_NUMERIC, locale=self.cur_locale)
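+
+
+# Illustrative usage (a sketch; the class and method names below are made up):
+# test classes can inherit from CommaDecimalPointLocale to get the skipping
+# setup/teardown behaviour, or use an instance as a context manager.
+#
+#     class TestFrenchLocale(CommaDecimalPointLocale):
+#         def test_str_uses_comma(self):
+#             assert locale.str(1.5) == '1,5'
+#
+#     with CommaDecimalPointLocale():
+#         assert locale.str(1.5) == '1,5'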
diff --git a/contrib/python/numpy/py2/numpy/core/tests/data/astype_copy.pkl b/contrib/python/numpy/py2/numpy/core/tests/data/astype_copy.pkl
new file mode 100644
index 00000000000..7397c978297
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/data/astype_copy.pkl
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/core/tests/data/recarray_from_file.fits b/contrib/python/numpy/py2/numpy/core/tests/data/recarray_from_file.fits
new file mode 100644
index 00000000000..ca48ee85153
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/data/recarray_from_file.fits
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_abc.py b/contrib/python/numpy/py2/numpy/core/tests/test_abc.py
new file mode 100644
index 00000000000..d9c61b0c611
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_abc.py
@@ -0,0 +1,56 @@
+from __future__ import division, absolute_import, print_function
+
+from numpy.testing import assert_
+
+import numbers
+
+import numpy as np
+from numpy.core.numerictypes import sctypes
+
+class TestABC(object):
+ def test_abstract(self):
+ assert_(issubclass(np.number, numbers.Number))
+
+ assert_(issubclass(np.inexact, numbers.Complex))
+ assert_(issubclass(np.complexfloating, numbers.Complex))
+ assert_(issubclass(np.floating, numbers.Real))
+
+ assert_(issubclass(np.integer, numbers.Integral))
+ assert_(issubclass(np.signedinteger, numbers.Integral))
+ assert_(issubclass(np.unsignedinteger, numbers.Integral))
+
+ def test_floats(self):
+ for t in sctypes['float']:
+ assert_(isinstance(t(), numbers.Real),
+ "{0} is not instance of Real".format(t.__name__))
+ assert_(issubclass(t, numbers.Real),
+ "{0} is not subclass of Real".format(t.__name__))
+ assert_(not isinstance(t(), numbers.Rational),
+ "{0} is instance of Rational".format(t.__name__))
+ assert_(not issubclass(t, numbers.Rational),
+ "{0} is subclass of Rational".format(t.__name__))
+
+ def test_complex(self):
+ for t in sctypes['complex']:
+ assert_(isinstance(t(), numbers.Complex),
+ "{0} is not instance of Complex".format(t.__name__))
+ assert_(issubclass(t, numbers.Complex),
+ "{0} is not subclass of Complex".format(t.__name__))
+ assert_(not isinstance(t(), numbers.Real),
+ "{0} is instance of Real".format(t.__name__))
+ assert_(not issubclass(t, numbers.Real),
+ "{0} is subclass of Real".format(t.__name__))
+
+ def test_int(self):
+ for t in sctypes['int']:
+ assert_(isinstance(t(), numbers.Integral),
+ "{0} is not instance of Integral".format(t.__name__))
+ assert_(issubclass(t, numbers.Integral),
+ "{0} is not subclass of Integral".format(t.__name__))
+
+ def test_uint(self):
+ for t in sctypes['uint']:
+ assert_(isinstance(t(), numbers.Integral),
+ "{0} is not instance of Integral".format(t.__name__))
+ assert_(issubclass(t, numbers.Integral),
+ "{0} is not subclass of Integral".format(t.__name__))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_api.py b/contrib/python/numpy/py2/numpy/core/tests/test_api.py
new file mode 100644
index 00000000000..9755e7b36df
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_api.py
@@ -0,0 +1,516 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_raises, HAS_REFCOUNT
+ )
+
+# Detect whether the new behaviour from NPY_RELAXED_STRIDES_CHECKING is in effect.
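+# (Under relaxed stride checking, a C-ordered array with a trailing dimension
+# of size 1 is also flagged F-contiguous, so this expression is True exactly
+# when the relaxed behaviour is active.)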
+NPY_RELAXED_STRIDES_CHECKING = np.ones((10, 1), order='C').flags.f_contiguous
+
+
+def test_array_array():
+ tobj = type(object)
+ ones11 = np.ones((1, 1), np.float64)
+ tndarray = type(ones11)
+ # Test is_ndarray
+ assert_equal(np.array(ones11, dtype=np.float64), ones11)
+ if HAS_REFCOUNT:
+ old_refcount = sys.getrefcount(tndarray)
+ np.array(ones11)
+ assert_equal(old_refcount, sys.getrefcount(tndarray))
+
+ # test None
+ assert_equal(np.array(None, dtype=np.float64),
+ np.array(np.nan, dtype=np.float64))
+ if HAS_REFCOUNT:
+ old_refcount = sys.getrefcount(tobj)
+ np.array(None, dtype=np.float64)
+ assert_equal(old_refcount, sys.getrefcount(tobj))
+
+ # test scalar
+ assert_equal(np.array(1.0, dtype=np.float64),
+ np.ones((), dtype=np.float64))
+ if HAS_REFCOUNT:
+ old_refcount = sys.getrefcount(np.float64)
+ np.array(np.array(1.0, dtype=np.float64), dtype=np.float64)
+ assert_equal(old_refcount, sys.getrefcount(np.float64))
+
+ # test string
+ S2 = np.dtype((str, 2))
+ S3 = np.dtype((str, 3))
+ S5 = np.dtype((str, 5))
+ assert_equal(np.array("1.0", dtype=np.float64),
+ np.ones((), dtype=np.float64))
+ assert_equal(np.array("1.0").dtype, S3)
+ assert_equal(np.array("1.0", dtype=str).dtype, S3)
+ assert_equal(np.array("1.0", dtype=S2), np.array("1."))
+ assert_equal(np.array("1", dtype=S5), np.ones((), dtype=S5))
+
+ # test unicode
+ _unicode = globals().get("unicode")
+ if _unicode:
+ U2 = np.dtype((_unicode, 2))
+ U3 = np.dtype((_unicode, 3))
+ U5 = np.dtype((_unicode, 5))
+ assert_equal(np.array(_unicode("1.0"), dtype=np.float64),
+ np.ones((), dtype=np.float64))
+ assert_equal(np.array(_unicode("1.0")).dtype, U3)
+ assert_equal(np.array(_unicode("1.0"), dtype=_unicode).dtype, U3)
+ assert_equal(np.array(_unicode("1.0"), dtype=U2),
+ np.array(_unicode("1.")))
+ assert_equal(np.array(_unicode("1"), dtype=U5),
+ np.ones((), dtype=U5))
+
+ builtins = getattr(__builtins__, '__dict__', __builtins__)
+ assert_(hasattr(builtins, 'get'))
+
+ # test buffer
+ _buffer = builtins.get("buffer")
+ if _buffer and sys.version_info[:3] >= (2, 7, 5):
+ # This test fails for earlier versions of Python.
+ # Evidently a bug got fixed in 2.7.5.
+ dat = np.array(_buffer('1.0'), dtype=np.float64)
+ assert_equal(dat, [49.0, 46.0, 48.0])
+ assert_(dat.dtype.type is np.float64)
+
+ dat = np.array(_buffer(b'1.0'))
+ assert_equal(dat, [49, 46, 48])
+ assert_(dat.dtype.type is np.uint8)
+
+ # test memoryview, new version of buffer
+ _memoryview = builtins.get("memoryview")
+ if _memoryview:
+ dat = np.array(_memoryview(b'1.0'), dtype=np.float64)
+ assert_equal(dat, [49.0, 46.0, 48.0])
+ assert_(dat.dtype.type is np.float64)
+
+ dat = np.array(_memoryview(b'1.0'))
+ assert_equal(dat, [49, 46, 48])
+ assert_(dat.dtype.type is np.uint8)
+
+ # test array interface
+ a = np.array(100.0, dtype=np.float64)
+ o = type("o", (object,),
+ dict(__array_interface__=a.__array_interface__))
+ assert_equal(np.array(o, dtype=np.float64), a)
+
+ # test array_struct interface
+ a = np.array([(1, 4.0, 'Hello'), (2, 6.0, 'World')],
+ dtype=[('f0', int), ('f1', float), ('f2', str)])
+ o = type("o", (object,),
+ dict(__array_struct__=a.__array_struct__))
+ ## wasn't what I expected... is np.array(o) supposed to equal a ?
+ ## instead we get a array([...], dtype=">V18")
+ assert_equal(bytes(np.array(o).data), bytes(a.data))
+
+ # test array
+ o = type("o", (object,),
+ dict(__array__=lambda *x: np.array(100.0, dtype=np.float64)))()
+ assert_equal(np.array(o, dtype=np.float64), np.array(100.0, np.float64))
+
+ # test recursion
+ nested = 1.5
+ for i in range(np.MAXDIMS):
+ nested = [nested]
+
+ # no error
+ np.array(nested)
+
+ # Exceeds recursion limit
+ assert_raises(ValueError, np.array, [nested], dtype=np.float64)
+
+ # Try with lists...
+ assert_equal(np.array([None] * 10, dtype=np.float64),
+ np.full((10,), np.nan, dtype=np.float64))
+ assert_equal(np.array([[None]] * 10, dtype=np.float64),
+ np.full((10, 1), np.nan, dtype=np.float64))
+ assert_equal(np.array([[None] * 10], dtype=np.float64),
+ np.full((1, 10), np.nan, dtype=np.float64))
+ assert_equal(np.array([[None] * 10] * 10, dtype=np.float64),
+ np.full((10, 10), np.nan, dtype=np.float64))
+
+ assert_equal(np.array([1.0] * 10, dtype=np.float64),
+ np.ones((10,), dtype=np.float64))
+ assert_equal(np.array([[1.0]] * 10, dtype=np.float64),
+ np.ones((10, 1), dtype=np.float64))
+ assert_equal(np.array([[1.0] * 10], dtype=np.float64),
+ np.ones((1, 10), dtype=np.float64))
+ assert_equal(np.array([[1.0] * 10] * 10, dtype=np.float64),
+ np.ones((10, 10), dtype=np.float64))
+
+ # Try with tuples
+ assert_equal(np.array((None,) * 10, dtype=np.float64),
+ np.full((10,), np.nan, dtype=np.float64))
+ assert_equal(np.array([(None,)] * 10, dtype=np.float64),
+ np.full((10, 1), np.nan, dtype=np.float64))
+ assert_equal(np.array([(None,) * 10], dtype=np.float64),
+ np.full((1, 10), np.nan, dtype=np.float64))
+ assert_equal(np.array([(None,) * 10] * 10, dtype=np.float64),
+ np.full((10, 10), np.nan, dtype=np.float64))
+
+ assert_equal(np.array((1.0,) * 10, dtype=np.float64),
+ np.ones((10,), dtype=np.float64))
+ assert_equal(np.array([(1.0,)] * 10, dtype=np.float64),
+ np.ones((10, 1), dtype=np.float64))
+ assert_equal(np.array([(1.0,) * 10], dtype=np.float64),
+ np.ones((1, 10), dtype=np.float64))
+ assert_equal(np.array([(1.0,) * 10] * 10, dtype=np.float64),
+ np.ones((10, 10), dtype=np.float64))
+
+
+def test_fastCopyAndTranspose():
+ # 0D array
+ a = np.array(2)
+ b = np.fastCopyAndTranspose(a)
+ assert_equal(b, a.T)
+ assert_(b.flags.owndata)
+
+ # 1D array
+ a = np.array([3, 2, 7, 0])
+ b = np.fastCopyAndTranspose(a)
+ assert_equal(b, a.T)
+ assert_(b.flags.owndata)
+
+ # 2D array
+ a = np.arange(6).reshape(2, 3)
+ b = np.fastCopyAndTranspose(a)
+ assert_equal(b, a.T)
+ assert_(b.flags.owndata)
+
+def test_array_astype():
+ a = np.arange(6, dtype='f4').reshape(2, 3)
+ # Default behavior: allows unsafe casts, keeps memory layout,
+ # always copies.
+ b = a.astype('i4')
+ assert_equal(a, b)
+ assert_equal(b.dtype, np.dtype('i4'))
+ assert_equal(a.strides, b.strides)
+ b = a.T.astype('i4')
+ assert_equal(a.T, b)
+ assert_equal(b.dtype, np.dtype('i4'))
+ assert_equal(a.T.strides, b.strides)
+ b = a.astype('f4')
+ assert_equal(a, b)
+ assert_(not (a is b))
+
+ # copy=False parameter can sometimes skip a copy
+ b = a.astype('f4', copy=False)
+ assert_(a is b)
+
+ # order parameter allows overriding of the memory layout,
+ # forcing a copy if the layout is wrong
+ b = a.astype('f4', order='F', copy=False)
+ assert_equal(a, b)
+ assert_(not (a is b))
+ assert_(b.flags.f_contiguous)
+
+ b = a.astype('f4', order='C', copy=False)
+ assert_equal(a, b)
+ assert_(a is b)
+ assert_(b.flags.c_contiguous)
+
+ # casting parameter allows catching bad casts
+ b = a.astype('c8', casting='safe')
+ assert_equal(a, b)
+ assert_equal(b.dtype, np.dtype('c8'))
+
+ assert_raises(TypeError, a.astype, 'i4', casting='safe')
+
+ # subok=False passes through a non-subclassed array
+ b = a.astype('f4', subok=0, copy=False)
+ assert_(a is b)
+
+ class MyNDArray(np.ndarray):
+ pass
+
+ a = np.array([[0, 1, 2], [3, 4, 5]], dtype='f4').view(MyNDArray)
+
+ # subok=True passes through a subclass
+ b = a.astype('f4', subok=True, copy=False)
+ assert_(a is b)
+
+ # subok=True is default, and creates a subtype on a cast
+ b = a.astype('i4', copy=False)
+ assert_equal(a, b)
+ assert_equal(type(b), MyNDArray)
+
+ # subok=False never returns a subclass
+ b = a.astype('f4', subok=False, copy=False)
+ assert_equal(a, b)
+ assert_(not (a is b))
+ assert_(type(b) is not MyNDArray)
+
+ # Make sure converting from string object to fixed length string
+ # does not truncate.
+ a = np.array([b'a'*100], dtype='O')
+ b = a.astype('S')
+ assert_equal(a, b)
+ assert_equal(b.dtype, np.dtype('S100'))
+ a = np.array([u'a'*100], dtype='O')
+ b = a.astype('U')
+ assert_equal(a, b)
+ assert_equal(b.dtype, np.dtype('U100'))
+
+ # Same test as above but for strings shorter than 64 characters
+ a = np.array([b'a'*10], dtype='O')
+ b = a.astype('S')
+ assert_equal(a, b)
+ assert_equal(b.dtype, np.dtype('S10'))
+ a = np.array([u'a'*10], dtype='O')
+ b = a.astype('U')
+ assert_equal(a, b)
+ assert_equal(b.dtype, np.dtype('U10'))
+
+ a = np.array(123456789012345678901234567890, dtype='O').astype('S')
+ assert_array_equal(a, np.array(b'1234567890' * 3, dtype='S30'))
+ a = np.array(123456789012345678901234567890, dtype='O').astype('U')
+ assert_array_equal(a, np.array(u'1234567890' * 3, dtype='U30'))
+
+ a = np.array([123456789012345678901234567890], dtype='O').astype('S')
+ assert_array_equal(a, np.array(b'1234567890' * 3, dtype='S30'))
+ a = np.array([123456789012345678901234567890], dtype='O').astype('U')
+ assert_array_equal(a, np.array(u'1234567890' * 3, dtype='U30'))
+
+ a = np.array(123456789012345678901234567890, dtype='S')
+ assert_array_equal(a, np.array(b'1234567890' * 3, dtype='S30'))
+ a = np.array(123456789012345678901234567890, dtype='U')
+ assert_array_equal(a, np.array(u'1234567890' * 3, dtype='U30'))
+
+ a = np.array(u'a\u0140', dtype='U')
+ b = np.ndarray(buffer=a, dtype='uint32', shape=2)
+ assert_(b.size == 2)
+
+ a = np.array([1000], dtype='i4')
+ assert_raises(TypeError, a.astype, 'S1', casting='safe')
+
+ a = np.array(1000, dtype='i4')
+ assert_raises(TypeError, a.astype, 'U1', casting='safe')
+
+def test_copyto_fromscalar():
+ a = np.arange(6, dtype='f4').reshape(2, 3)
+
+ # Simple copy
+ np.copyto(a, 1.5)
+ assert_equal(a, 1.5)
+ np.copyto(a.T, 2.5)
+ assert_equal(a, 2.5)
+
+ # Where-masked copy
+ mask = np.array([[0, 1, 0], [0, 0, 1]], dtype='?')
+ np.copyto(a, 3.5, where=mask)
+ assert_equal(a, [[2.5, 3.5, 2.5], [2.5, 2.5, 3.5]])
+ mask = np.array([[0, 1], [1, 1], [1, 0]], dtype='?')
+ np.copyto(a.T, 4.5, where=mask)
+ assert_equal(a, [[2.5, 4.5, 4.5], [4.5, 4.5, 3.5]])
+
+def test_copyto():
+ a = np.arange(6, dtype='i4').reshape(2, 3)
+
+ # Simple copy
+ np.copyto(a, [[3, 1, 5], [6, 2, 1]])
+ assert_equal(a, [[3, 1, 5], [6, 2, 1]])
+
+ # Overlapping copy should work
+ np.copyto(a[:, :2], a[::-1, 1::-1])
+ assert_equal(a, [[2, 6, 5], [1, 3, 1]])
+
+ # Defaults to 'same_kind' casting
+ assert_raises(TypeError, np.copyto, a, 1.5)
+
+ # Force a copy with 'unsafe' casting, truncating 1.5 to 1
+ np.copyto(a, 1.5, casting='unsafe')
+ assert_equal(a, 1)
+
+ # Copying with a mask
+ np.copyto(a, 3, where=[True, False, True])
+ assert_equal(a, [[3, 1, 3], [3, 1, 3]])
+
+ # Casting rule still applies with a mask
+ assert_raises(TypeError, np.copyto, a, 3.5, where=[True, False, True])
+
+ # Lists of integer 0's and 1's are ok too
+ np.copyto(a, 4.0, casting='unsafe', where=[[0, 1, 1], [1, 0, 0]])
+ assert_equal(a, [[3, 4, 4], [4, 1, 3]])
+
+ # Overlapping copy with mask should work
+ np.copyto(a[:, :2], a[::-1, 1::-1], where=[[0, 1], [1, 1]])
+ assert_equal(a, [[3, 4, 4], [4, 3, 3]])
+
+ # 'dst' must be an array
+ assert_raises(TypeError, np.copyto, [1, 2, 3], [2, 3, 4])
+
+def test_copyto_permut():
+ # test explicit overflow case
+ pad = 500
+ l = [True] * pad + [True, True, True, True]
+ r = np.zeros(len(l)-pad)
+ d = np.ones(len(l)-pad)
+ mask = np.array(l)[pad:]
+ np.copyto(r, d, where=mask[::-1])
+
+ # test all permutations of possible masks, 9 should be sufficient for
+ # current 4 byte unrolled code
+ power = 9
+ d = np.ones(power)
+ for i in range(2**power):
+ r = np.zeros(power)
+ l = [(i & x) != 0 for x in range(power)]
+ mask = np.array(l)
+ np.copyto(r, d, where=mask)
+ assert_array_equal(r == 1, l)
+ assert_equal(r.sum(), sum(l))
+
+ r = np.zeros(power)
+ np.copyto(r, d, where=mask[::-1])
+ assert_array_equal(r == 1, l[::-1])
+ assert_equal(r.sum(), sum(l))
+
+ r = np.zeros(power)
+ np.copyto(r[::2], d[::2], where=mask[::2])
+ assert_array_equal(r[::2] == 1, l[::2])
+ assert_equal(r[::2].sum(), sum(l[::2]))
+
+ r = np.zeros(power)
+ np.copyto(r[::2], d[::2], where=mask[::-2])
+ assert_array_equal(r[::2] == 1, l[::-2])
+ assert_equal(r[::2].sum(), sum(l[::-2]))
+
+ for c in [0xFF, 0x7F, 0x02, 0x10]:
+ r = np.zeros(power)
+ mask = np.array(l)
+ imask = np.array(l).view(np.uint8)
+ imask[mask != 0] = c
+ np.copyto(r, d, where=mask)
+ assert_array_equal(r == 1, l)
+ assert_equal(r.sum(), sum(l))
+
+ r = np.zeros(power)
+ np.copyto(r, d, where=True)
+ assert_equal(r.sum(), r.size)
+ r = np.ones(power)
+ d = np.zeros(power)
+ np.copyto(r, d, where=False)
+ assert_equal(r.sum(), r.size)
+
+def test_copy_order():
+ a = np.arange(24).reshape(2, 1, 3, 4)
+ b = a.copy(order='F')
+ c = np.arange(24).reshape(2, 1, 4, 3).swapaxes(2, 3)
+
+ def check_copy_result(x, y, ccontig, fcontig, strides=False):
+ assert_(not (x is y))
+ assert_equal(x, y)
+ assert_equal(res.flags.c_contiguous, ccontig)
+ assert_equal(res.flags.f_contiguous, fcontig)
+ # This check is not possible when NPY_RELAXED_STRIDES_CHECKING is
+ # enabled, because it actively changes the strides.
+ if not NPY_RELAXED_STRIDES_CHECKING:
+ if strides:
+ assert_equal(x.strides, y.strides)
+ else:
+ assert_(x.strides != y.strides)
+
+ # Validate the initial state of a, b, and c
+ assert_(a.flags.c_contiguous)
+ assert_(not a.flags.f_contiguous)
+ assert_(not b.flags.c_contiguous)
+ assert_(b.flags.f_contiguous)
+ assert_(not c.flags.c_contiguous)
+ assert_(not c.flags.f_contiguous)
+
+ # Copy with order='C'
+ res = a.copy(order='C')
+ check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+ res = b.copy(order='C')
+ check_copy_result(res, b, ccontig=True, fcontig=False, strides=False)
+ res = c.copy(order='C')
+ check_copy_result(res, c, ccontig=True, fcontig=False, strides=False)
+ res = np.copy(a, order='C')
+ check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+ res = np.copy(b, order='C')
+ check_copy_result(res, b, ccontig=True, fcontig=False, strides=False)
+ res = np.copy(c, order='C')
+ check_copy_result(res, c, ccontig=True, fcontig=False, strides=False)
+
+ # Copy with order='F'
+ res = a.copy(order='F')
+ check_copy_result(res, a, ccontig=False, fcontig=True, strides=False)
+ res = b.copy(order='F')
+ check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+ res = c.copy(order='F')
+ check_copy_result(res, c, ccontig=False, fcontig=True, strides=False)
+ res = np.copy(a, order='F')
+ check_copy_result(res, a, ccontig=False, fcontig=True, strides=False)
+ res = np.copy(b, order='F')
+ check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+ res = np.copy(c, order='F')
+ check_copy_result(res, c, ccontig=False, fcontig=True, strides=False)
+
+ # Copy with order='K'
+ res = a.copy(order='K')
+ check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+ res = b.copy(order='K')
+ check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+ res = c.copy(order='K')
+ check_copy_result(res, c, ccontig=False, fcontig=False, strides=True)
+ res = np.copy(a, order='K')
+ check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+ res = np.copy(b, order='K')
+ check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+ res = np.copy(c, order='K')
+ check_copy_result(res, c, ccontig=False, fcontig=False, strides=True)
+
+def test_contiguous_flags():
+ a = np.ones((4, 4, 1))[::2,:,:]
+ if NPY_RELAXED_STRIDES_CHECKING:
+ a.strides = a.strides[:2] + (-123,)
+ b = np.ones((2, 2, 1, 2, 2)).swapaxes(3, 4)
+
+ def check_contig(a, ccontig, fcontig):
+ assert_(a.flags.c_contiguous == ccontig)
+ assert_(a.flags.f_contiguous == fcontig)
+
+ # Check if new arrays are correct:
+ check_contig(a, False, False)
+ check_contig(b, False, False)
+ if NPY_RELAXED_STRIDES_CHECKING:
+ check_contig(np.empty((2, 2, 0, 2, 2)), True, True)
+ check_contig(np.array([[[1], [2]]], order='F'), True, True)
+ else:
+ check_contig(np.empty((2, 2, 0, 2, 2)), True, False)
+ check_contig(np.array([[[1], [2]]], order='F'), False, True)
+ check_contig(np.empty((2, 2)), True, False)
+ check_contig(np.empty((2, 2), order='F'), False, True)
+
+ # Check that np.array creates correct contiguous flags:
+ check_contig(np.array(a, copy=False), False, False)
+ check_contig(np.array(a, copy=False, order='C'), True, False)
+ check_contig(np.array(a, ndmin=4, copy=False, order='F'), False, True)
+
+ if NPY_RELAXED_STRIDES_CHECKING:
+        # Check slicing update of flags:
+ check_contig(a[0], True, True)
+ check_contig(a[None, ::4, ..., None], True, True)
+ check_contig(b[0, 0, ...], False, True)
+ check_contig(b[:,:, 0:0,:,:], True, True)
+ else:
+ # Check slicing update of flags:
+ check_contig(a[0], True, False)
+ # Would be nice if this was C-Contiguous:
+ check_contig(a[None, 0, ..., None], False, False)
+ check_contig(b[0, 0, 0, ...], False, True)
+
+ # Test ravel and squeeze.
+ check_contig(a.ravel(), True, True)
+ check_contig(np.ones((1, 3, 1)).squeeze(), True, True)
+
+def test_broadcast_arrays():
+ # Test user defined dtypes
+ a = np.array([(1, 2, 3)], dtype='u4,u4,u4')
+ b = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype='u4,u4,u4')
+ result = np.broadcast_arrays(a, b)
+ assert_equal(result[0], np.array([(1, 2, 3), (1, 2, 3), (1, 2, 3)], dtype='u4,u4,u4'))
+ assert_equal(result[1], np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype='u4,u4,u4'))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_arrayprint.py b/contrib/python/numpy/py2/numpy/core/tests/test_arrayprint.py
new file mode 100644
index 00000000000..f2b8fdca714
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_arrayprint.py
@@ -0,0 +1,893 @@
+# -*- coding: utf-8 -*-
+from __future__ import division, absolute_import, print_function
+
+import sys
+import gc
+import pytest
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_warns, HAS_REFCOUNT,
+ assert_raises_regex,
+ )
+import textwrap
+
+class TestArrayRepr(object):
+ def test_nan_inf(self):
+ x = np.array([np.nan, np.inf])
+ assert_equal(repr(x), 'array([nan, inf])')
+
+ def test_subclass(self):
+ class sub(np.ndarray): pass
+
+ # one dimensional
+ x1d = np.array([1, 2]).view(sub)
+ assert_equal(repr(x1d), 'sub([1, 2])')
+
+ # two dimensional
+ x2d = np.array([[1, 2], [3, 4]]).view(sub)
+ assert_equal(repr(x2d),
+ 'sub([[1, 2],\n'
+ ' [3, 4]])')
+
+ # two dimensional with flexible dtype
+ xstruct = np.ones((2,2), dtype=[('a', '<i4')]).view(sub)
+ assert_equal(repr(xstruct),
+ "sub([[(1,), (1,)],\n"
+ " [(1,), (1,)]], dtype=[('a', '<i4')])"
+ )
+
+ @pytest.mark.xfail(reason="See gh-10544")
+ def test_object_subclass(self):
+ class sub(np.ndarray):
+ def __new__(cls, inp):
+ obj = np.asarray(inp).view(cls)
+ return obj
+
+ def __getitem__(self, ind):
+ ret = super(sub, self).__getitem__(ind)
+ return sub(ret)
+
+ # test that object + subclass is OK:
+ x = sub([None, None])
+ assert_equal(repr(x), 'sub([None, None], dtype=object)')
+ assert_equal(str(x), '[None None]')
+
+ x = sub([None, sub([None, None])])
+ assert_equal(repr(x),
+ 'sub([None, sub([None, None], dtype=object)], dtype=object)')
+ assert_equal(str(x), '[None sub([None, None], dtype=object)]')
+
+ def test_0d_object_subclass(self):
+ # make sure that subclasses which return 0ds instead
+ # of scalars don't cause infinite recursion in str
+ class sub(np.ndarray):
+ def __new__(cls, inp):
+ obj = np.asarray(inp).view(cls)
+ return obj
+
+ def __getitem__(self, ind):
+ ret = super(sub, self).__getitem__(ind)
+ return sub(ret)
+
+ x = sub(1)
+ assert_equal(repr(x), 'sub(1)')
+ assert_equal(str(x), '1')
+
+ x = sub([1, 1])
+ assert_equal(repr(x), 'sub([1, 1])')
+ assert_equal(str(x), '[1 1]')
+
+ # check it works properly with object arrays too
+ x = sub(None)
+ assert_equal(repr(x), 'sub(None, dtype=object)')
+ assert_equal(str(x), 'None')
+
+ # plus recursive object arrays (even depth > 1)
+ y = sub(None)
+ x[()] = y
+ y[()] = x
+ assert_equal(repr(x),
+ 'sub(sub(sub(..., dtype=object), dtype=object), dtype=object)')
+ assert_equal(str(x), '...')
+ x[()] = 0 # resolve circular references for garbage collector
+
+ # nested 0d-subclass-object
+ x = sub(None)
+ x[()] = sub(None)
+ assert_equal(repr(x), 'sub(sub(None, dtype=object), dtype=object)')
+ assert_equal(str(x), 'None')
+
+ # gh-10663
+ class DuckCounter(np.ndarray):
+ def __getitem__(self, item):
+ result = super(DuckCounter, self).__getitem__(item)
+ if not isinstance(result, DuckCounter):
+ result = result[...].view(DuckCounter)
+ return result
+
+ def to_string(self):
+ return {0: 'zero', 1: 'one', 2: 'two'}.get(self.item(), 'many')
+
+ def __str__(self):
+ if self.shape == ():
+ return self.to_string()
+ else:
+ fmt = {'all': lambda x: x.to_string()}
+ return np.array2string(self, formatter=fmt)
+
+ dc = np.arange(5).view(DuckCounter)
+ assert_equal(str(dc), "[zero one two many many]")
+ assert_equal(str(dc[0]), "zero")
+
+ def test_self_containing(self):
+ arr0d = np.array(None)
+ arr0d[()] = arr0d
+ assert_equal(repr(arr0d),
+ 'array(array(..., dtype=object), dtype=object)')
+ arr0d[()] = 0 # resolve recursion for garbage collector
+
+ arr1d = np.array([None, None])
+ arr1d[1] = arr1d
+ assert_equal(repr(arr1d),
+ 'array([None, array(..., dtype=object)], dtype=object)')
+ arr1d[1] = 0 # resolve recursion for garbage collector
+
+ first = np.array(None)
+ second = np.array(None)
+ first[()] = second
+ second[()] = first
+ assert_equal(repr(first),
+ 'array(array(array(..., dtype=object), dtype=object), dtype=object)')
+ first[()] = 0 # resolve circular references for garbage collector
+
+ def test_containing_list(self):
+        # printing square brackets directly would be ambiguous
+ arr1d = np.array([None, None])
+ arr1d[0] = [1, 2]
+ arr1d[1] = [3]
+ assert_equal(repr(arr1d),
+ 'array([list([1, 2]), list([3])], dtype=object)')
+
+ def test_void_scalar_recursion(self):
+ # gh-9345
+ repr(np.void(b'test')) # RecursionError ?
+
+ def test_fieldless_structured(self):
+ # gh-10366
+ no_fields = np.dtype([])
+ arr_no_fields = np.empty(4, dtype=no_fields)
+ assert_equal(repr(arr_no_fields), 'array([(), (), (), ()], dtype=[])')
+
+
+class TestComplexArray(object):
+ def test_str(self):
+ rvals = [0, 1, -1, np.inf, -np.inf, np.nan]
+ cvals = [complex(rp, ip) for rp in rvals for ip in rvals]
+ dtypes = [np.complex64, np.cdouble, np.clongdouble]
+ actual = [str(np.array([c], dt)) for c in cvals for dt in dtypes]
+ wanted = [
+ '[0.+0.j]', '[0.+0.j]', '[0.+0.j]',
+ '[0.+1.j]', '[0.+1.j]', '[0.+1.j]',
+ '[0.-1.j]', '[0.-1.j]', '[0.-1.j]',
+ '[0.+infj]', '[0.+infj]', '[0.+infj]',
+ '[0.-infj]', '[0.-infj]', '[0.-infj]',
+ '[0.+nanj]', '[0.+nanj]', '[0.+nanj]',
+ '[1.+0.j]', '[1.+0.j]', '[1.+0.j]',
+ '[1.+1.j]', '[1.+1.j]', '[1.+1.j]',
+ '[1.-1.j]', '[1.-1.j]', '[1.-1.j]',
+ '[1.+infj]', '[1.+infj]', '[1.+infj]',
+ '[1.-infj]', '[1.-infj]', '[1.-infj]',
+ '[1.+nanj]', '[1.+nanj]', '[1.+nanj]',
+ '[-1.+0.j]', '[-1.+0.j]', '[-1.+0.j]',
+ '[-1.+1.j]', '[-1.+1.j]', '[-1.+1.j]',
+ '[-1.-1.j]', '[-1.-1.j]', '[-1.-1.j]',
+ '[-1.+infj]', '[-1.+infj]', '[-1.+infj]',
+ '[-1.-infj]', '[-1.-infj]', '[-1.-infj]',
+ '[-1.+nanj]', '[-1.+nanj]', '[-1.+nanj]',
+ '[inf+0.j]', '[inf+0.j]', '[inf+0.j]',
+ '[inf+1.j]', '[inf+1.j]', '[inf+1.j]',
+ '[inf-1.j]', '[inf-1.j]', '[inf-1.j]',
+ '[inf+infj]', '[inf+infj]', '[inf+infj]',
+ '[inf-infj]', '[inf-infj]', '[inf-infj]',
+ '[inf+nanj]', '[inf+nanj]', '[inf+nanj]',
+ '[-inf+0.j]', '[-inf+0.j]', '[-inf+0.j]',
+ '[-inf+1.j]', '[-inf+1.j]', '[-inf+1.j]',
+ '[-inf-1.j]', '[-inf-1.j]', '[-inf-1.j]',
+ '[-inf+infj]', '[-inf+infj]', '[-inf+infj]',
+ '[-inf-infj]', '[-inf-infj]', '[-inf-infj]',
+ '[-inf+nanj]', '[-inf+nanj]', '[-inf+nanj]',
+ '[nan+0.j]', '[nan+0.j]', '[nan+0.j]',
+ '[nan+1.j]', '[nan+1.j]', '[nan+1.j]',
+ '[nan-1.j]', '[nan-1.j]', '[nan-1.j]',
+ '[nan+infj]', '[nan+infj]', '[nan+infj]',
+ '[nan-infj]', '[nan-infj]', '[nan-infj]',
+ '[nan+nanj]', '[nan+nanj]', '[nan+nanj]']
+
+ for res, val in zip(actual, wanted):
+ assert_equal(res, val)
+
+class TestArray2String(object):
+ def test_basic(self):
+ """Basic test of array2string."""
+ a = np.arange(3)
+ assert_(np.array2string(a) == '[0 1 2]')
+ assert_(np.array2string(a, max_line_width=4, legacy='1.13') == '[0 1\n 2]')
+ assert_(np.array2string(a, max_line_width=4) == '[0\n 1\n 2]')
+
+ def test_unexpected_kwarg(self):
+        # ensure that an appropriate TypeError
+ # is raised when array2string receives
+ # an unexpected kwarg
+
+ with assert_raises_regex(TypeError, 'nonsense'):
+ np.array2string(np.array([1, 2, 3]),
+ nonsense=None)
+
+ def test_format_function(self):
+ """Test custom format function for each element in array."""
+ def _format_function(x):
+ if np.abs(x) < 1:
+ return '.'
+ elif np.abs(x) < 2:
+ return 'o'
+ else:
+ return 'O'
+
+ x = np.arange(3)
+ if sys.version_info[0] >= 3:
+ x_hex = "[0x0 0x1 0x2]"
+ x_oct = "[0o0 0o1 0o2]"
+ else:
+ x_hex = "[0x0L 0x1L 0x2L]"
+ x_oct = "[0L 01L 02L]"
+ assert_(np.array2string(x, formatter={'all':_format_function}) ==
+ "[. o O]")
+ assert_(np.array2string(x, formatter={'int_kind':_format_function}) ==
+ "[. o O]")
+ assert_(np.array2string(x, formatter={'all':lambda x: "%.4f" % x}) ==
+ "[0.0000 1.0000 2.0000]")
+ assert_equal(np.array2string(x, formatter={'int':lambda x: hex(x)}),
+ x_hex)
+ assert_equal(np.array2string(x, formatter={'int':lambda x: oct(x)}),
+ x_oct)
+
+ x = np.arange(3.)
+ assert_(np.array2string(x, formatter={'float_kind':lambda x: "%.2f" % x}) ==
+ "[0.00 1.00 2.00]")
+ assert_(np.array2string(x, formatter={'float':lambda x: "%.2f" % x}) ==
+ "[0.00 1.00 2.00]")
+
+ s = np.array(['abc', 'def'])
+ assert_(np.array2string(s, formatter={'numpystr':lambda s: s*2}) ==
+ '[abcabc defdef]')
+
+        # check for backwards compatibility that using FloatFormat still
+        # works and emits a DeprecationWarning
+ with assert_warns(DeprecationWarning):
+ fmt = np.core.arrayprint.FloatFormat(x, 9, 'maxprec', False)
+ assert_equal(np.array2string(x, formatter={'float_kind': fmt}),
+ '[0. 1. 2.]')
+
+ def test_structure_format(self):
+ dt = np.dtype([('name', np.str_, 16), ('grades', np.float64, (2,))])
+ x = np.array([('Sarah', (8.0, 7.0)), ('John', (6.0, 7.0))], dtype=dt)
+ assert_equal(np.array2string(x),
+ "[('Sarah', [8., 7.]) ('John', [6., 7.])]")
+
+ np.set_printoptions(legacy='1.13')
+ try:
+ # for issue #5692
+ A = np.zeros(shape=10, dtype=[("A", "M8[s]")])
+ A[5:].fill(np.datetime64('NaT'))
+ assert_equal(
+ np.array2string(A),
+ textwrap.dedent("""\
+ [('1970-01-01T00:00:00',) ('1970-01-01T00:00:00',) ('1970-01-01T00:00:00',)
+ ('1970-01-01T00:00:00',) ('1970-01-01T00:00:00',) ('NaT',) ('NaT',)
+ ('NaT',) ('NaT',) ('NaT',)]""")
+ )
+ finally:
+ np.set_printoptions(legacy=False)
+
+ # same again, but with non-legacy behavior
+ assert_equal(
+ np.array2string(A),
+ textwrap.dedent("""\
+ [('1970-01-01T00:00:00',) ('1970-01-01T00:00:00',)
+ ('1970-01-01T00:00:00',) ('1970-01-01T00:00:00',)
+ ('1970-01-01T00:00:00',) ( 'NaT',)
+ ( 'NaT',) ( 'NaT',)
+ ( 'NaT',) ( 'NaT',)]""")
+ )
+
+ # and again, with timedeltas
+ A = np.full(10, 123456, dtype=[("A", "m8[s]")])
+ A[5:].fill(np.datetime64('NaT'))
+ assert_equal(
+ np.array2string(A),
+ textwrap.dedent("""\
+ [(123456,) (123456,) (123456,) (123456,) (123456,) ( 'NaT',) ( 'NaT',)
+ ( 'NaT',) ( 'NaT',) ( 'NaT',)]""")
+ )
+
+ # See #8160
+ struct_int = np.array([([1, -1],), ([123, 1],)], dtype=[('B', 'i4', 2)])
+ assert_equal(np.array2string(struct_int),
+ "[([ 1, -1],) ([123, 1],)]")
+ struct_2dint = np.array([([[0, 1], [2, 3]],), ([[12, 0], [0, 0]],)],
+ dtype=[('B', 'i4', (2, 2))])
+ assert_equal(np.array2string(struct_2dint),
+ "[([[ 0, 1], [ 2, 3]],) ([[12, 0], [ 0, 0]],)]")
+
+ # See #8172
+ array_scalar = np.array(
+ (1., 2.1234567890123456789, 3.), dtype=('f8,f8,f8'))
+ assert_equal(np.array2string(array_scalar), "(1., 2.12345679, 3.)")
+
+ def test_unstructured_void_repr(self):
+ a = np.array([27, 91, 50, 75, 7, 65, 10, 8,
+ 27, 91, 51, 49,109, 82,101,100], dtype='u1').view('V8')
+ assert_equal(repr(a[0]), r"void(b'\x1B\x5B\x32\x4B\x07\x41\x0A\x08')")
+ assert_equal(str(a[0]), r"b'\x1B\x5B\x32\x4B\x07\x41\x0A\x08'")
+ assert_equal(repr(a),
+ r"array([b'\x1B\x5B\x32\x4B\x07\x41\x0A\x08'," "\n"
+ r" b'\x1B\x5B\x33\x31\x6D\x52\x65\x64'], dtype='|V8')")
+
+ assert_equal(eval(repr(a), vars(np)), a)
+ assert_equal(eval(repr(a[0]), vars(np)), a[0])
+
+ def test_edgeitems_kwarg(self):
+        # previously the global print options would take precedence over the kwarg
+ arr = np.zeros(3, int)
+ assert_equal(
+ np.array2string(arr, edgeitems=1, threshold=0),
+ "[0 ... 0]"
+ )
+
+ def test_summarize_1d(self):
+ A = np.arange(1001)
+ strA = '[ 0 1 2 ... 998 999 1000]'
+ assert_equal(str(A), strA)
+
+ reprA = 'array([ 0, 1, 2, ..., 998, 999, 1000])'
+ assert_equal(repr(A), reprA)
+
+ def test_summarize_2d(self):
+ A = np.arange(1002).reshape(2, 501)
+ strA = '[[ 0 1 2 ... 498 499 500]\n' \
+ ' [ 501 502 503 ... 999 1000 1001]]'
+ assert_equal(str(A), strA)
+
+ reprA = 'array([[ 0, 1, 2, ..., 498, 499, 500],\n' \
+ ' [ 501, 502, 503, ..., 999, 1000, 1001]])'
+ assert_equal(repr(A), reprA)
+
+ def test_linewidth(self):
+ a = np.full(6, 1)
+
+ def make_str(a, width, **kw):
+ return np.array2string(a, separator="", max_line_width=width, **kw)
+
+ assert_equal(make_str(a, 8, legacy='1.13'), '[111111]')
+ assert_equal(make_str(a, 7, legacy='1.13'), '[111111]')
+ assert_equal(make_str(a, 5, legacy='1.13'), '[1111\n'
+ ' 11]')
+
+ assert_equal(make_str(a, 8), '[111111]')
+ assert_equal(make_str(a, 7), '[11111\n'
+ ' 1]')
+ assert_equal(make_str(a, 5), '[111\n'
+ ' 111]')
+
+ b = a[None,None,:]
+
+ assert_equal(make_str(b, 12, legacy='1.13'), '[[[111111]]]')
+ assert_equal(make_str(b, 9, legacy='1.13'), '[[[111111]]]')
+ assert_equal(make_str(b, 8, legacy='1.13'), '[[[11111\n'
+ ' 1]]]')
+
+ assert_equal(make_str(b, 12), '[[[111111]]]')
+ assert_equal(make_str(b, 9), '[[[111\n'
+ ' 111]]]')
+ assert_equal(make_str(b, 8), '[[[11\n'
+ ' 11\n'
+ ' 11]]]')
+
+ def test_wide_element(self):
+ a = np.array(['xxxxx'])
+ assert_equal(
+ np.array2string(a, max_line_width=5),
+ "['xxxxx']"
+ )
+ assert_equal(
+ np.array2string(a, max_line_width=5, legacy='1.13'),
+ "[ 'xxxxx']"
+ )
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_refcount(self):
+ # make sure we do not hold references to the array due to a recursive
+ # closure (gh-10620)
+ gc.disable()
+ a = np.arange(2)
+ r1 = sys.getrefcount(a)
+ np.array2string(a)
+ np.array2string(a)
+ r2 = sys.getrefcount(a)
+ gc.collect()
+ gc.enable()
+ assert_(r1 == r2)
+
+class TestPrintOptions(object):
+ """Test getting and setting global print options."""
+
+ def setup(self):
+ self.oldopts = np.get_printoptions()
+
+ def teardown(self):
+ np.set_printoptions(**self.oldopts)
+
+ def test_basic(self):
+ x = np.array([1.5, 0, 1.234567890])
+ assert_equal(repr(x), "array([1.5 , 0. , 1.23456789])")
+ np.set_printoptions(precision=4)
+ assert_equal(repr(x), "array([1.5 , 0. , 1.2346])")
+
+ def test_precision_zero(self):
+ np.set_printoptions(precision=0)
+ for values, string in (
+ ([0.], "0."), ([.3], "0."), ([-.3], "-0."), ([.7], "1."),
+ ([1.5], "2."), ([-1.5], "-2."), ([-15.34], "-15."),
+ ([100.], "100."), ([.2, -1, 122.51], " 0., -1., 123."),
+ ([0], "0"), ([-12], "-12"), ([complex(.3, -.7)], "0.-1.j")):
+ x = np.array(values)
+ assert_equal(repr(x), "array([%s])" % string)
+
+ def test_formatter(self):
+ x = np.arange(3)
+ np.set_printoptions(formatter={'all':lambda x: str(x-1)})
+ assert_equal(repr(x), "array([-1, 0, 1])")
+
+ def test_formatter_reset(self):
+ x = np.arange(3)
+ np.set_printoptions(formatter={'all':lambda x: str(x-1)})
+ assert_equal(repr(x), "array([-1, 0, 1])")
+ np.set_printoptions(formatter={'int':None})
+ assert_equal(repr(x), "array([0, 1, 2])")
+
+ np.set_printoptions(formatter={'all':lambda x: str(x-1)})
+ assert_equal(repr(x), "array([-1, 0, 1])")
+ np.set_printoptions(formatter={'all':None})
+ assert_equal(repr(x), "array([0, 1, 2])")
+
+ np.set_printoptions(formatter={'int':lambda x: str(x-1)})
+ assert_equal(repr(x), "array([-1, 0, 1])")
+ np.set_printoptions(formatter={'int_kind':None})
+ assert_equal(repr(x), "array([0, 1, 2])")
+
+ x = np.arange(3.)
+ np.set_printoptions(formatter={'float':lambda x: str(x-1)})
+ assert_equal(repr(x), "array([-1.0, 0.0, 1.0])")
+ np.set_printoptions(formatter={'float_kind':None})
+ assert_equal(repr(x), "array([0., 1., 2.])")
+
+ def test_0d_arrays(self):
+ unicode = type(u'')
+
+ assert_equal(unicode(np.array(u'café', '<U4')), u'café')
+
+ if sys.version_info[0] >= 3:
+ assert_equal(repr(np.array('café', '<U4')),
+ "array('café', dtype='<U4')")
+ else:
+ assert_equal(repr(np.array(u'café', '<U4')),
+ "array(u'caf\\xe9', dtype='<U4')")
+ assert_equal(str(np.array('test', np.str_)), 'test')
+
+ a = np.zeros(1, dtype=[('a', '<i4', (3,))])
+ assert_equal(str(a[0]), '([0, 0, 0],)')
+
+ assert_equal(repr(np.datetime64('2005-02-25')[...]),
+ "array('2005-02-25', dtype='datetime64[D]')")
+
+ assert_equal(repr(np.timedelta64('10', 'Y')[...]),
+ "array(10, dtype='timedelta64[Y]')")
+
+ # repr of 0d arrays is affected by printoptions
+ x = np.array(1)
+ np.set_printoptions(formatter={'all':lambda x: "test"})
+ assert_equal(repr(x), "array(test)")
+ # str is unaffected
+ assert_equal(str(x), "1")
+
+        # check that passing the `style` arg emits a DeprecationWarning
+ assert_warns(DeprecationWarning, np.array2string,
+ np.array(1.), style=repr)
+ # but not in legacy mode
+ np.array2string(np.array(1.), style=repr, legacy='1.13')
+ # gh-10934 style was broken in legacy mode, check it works
+ np.array2string(np.array(1.), legacy='1.13')
+
+ def test_float_spacing(self):
+ x = np.array([1., 2., 3.])
+ y = np.array([1., 2., -10.])
+ z = np.array([100., 2., -1.])
+ w = np.array([-100., 2., 1.])
+
+ assert_equal(repr(x), 'array([1., 2., 3.])')
+ assert_equal(repr(y), 'array([ 1., 2., -10.])')
+ assert_equal(repr(np.array(y[0])), 'array(1.)')
+ assert_equal(repr(np.array(y[-1])), 'array(-10.)')
+ assert_equal(repr(z), 'array([100., 2., -1.])')
+ assert_equal(repr(w), 'array([-100., 2., 1.])')
+
+ assert_equal(repr(np.array([np.nan, np.inf])), 'array([nan, inf])')
+ assert_equal(repr(np.array([np.nan, -np.inf])), 'array([ nan, -inf])')
+
+ x = np.array([np.inf, 100000, 1.1234])
+ y = np.array([np.inf, 100000, -1.1234])
+ z = np.array([np.inf, 1.1234, -1e120])
+ np.set_printoptions(precision=2)
+ assert_equal(repr(x), 'array([ inf, 1.00e+05, 1.12e+00])')
+ assert_equal(repr(y), 'array([ inf, 1.00e+05, -1.12e+00])')
+ assert_equal(repr(z), 'array([ inf, 1.12e+000, -1.00e+120])')
+
+ def test_bool_spacing(self):
+ assert_equal(repr(np.array([True, True])),
+ 'array([ True, True])')
+ assert_equal(repr(np.array([True, False])),
+ 'array([ True, False])')
+ assert_equal(repr(np.array([True])),
+ 'array([ True])')
+ assert_equal(repr(np.array(True)),
+ 'array(True)')
+ assert_equal(repr(np.array(False)),
+ 'array(False)')
+
+ def test_sign_spacing(self):
+ a = np.arange(4.)
+ b = np.array([1.234e9])
+ c = np.array([1.0 + 1.0j, 1.123456789 + 1.123456789j], dtype='c16')
+
+ assert_equal(repr(a), 'array([0., 1., 2., 3.])')
+ assert_equal(repr(np.array(1.)), 'array(1.)')
+ assert_equal(repr(b), 'array([1.234e+09])')
+ assert_equal(repr(np.array([0.])), 'array([0.])')
+ assert_equal(repr(c),
+ "array([1. +1.j , 1.12345679+1.12345679j])")
+ assert_equal(repr(np.array([0., -0.])), 'array([ 0., -0.])')
+
+ np.set_printoptions(sign=' ')
+ assert_equal(repr(a), 'array([ 0., 1., 2., 3.])')
+ assert_equal(repr(np.array(1.)), 'array( 1.)')
+ assert_equal(repr(b), 'array([ 1.234e+09])')
+ assert_equal(repr(c),
+ "array([ 1. +1.j , 1.12345679+1.12345679j])")
+ assert_equal(repr(np.array([0., -0.])), 'array([ 0., -0.])')
+
+ np.set_printoptions(sign='+')
+ assert_equal(repr(a), 'array([+0., +1., +2., +3.])')
+ assert_equal(repr(np.array(1.)), 'array(+1.)')
+ assert_equal(repr(b), 'array([+1.234e+09])')
+ assert_equal(repr(c),
+ "array([+1. +1.j , +1.12345679+1.12345679j])")
+
+ np.set_printoptions(legacy='1.13')
+ assert_equal(repr(a), 'array([ 0., 1., 2., 3.])')
+ assert_equal(repr(b), 'array([ 1.23400000e+09])')
+ assert_equal(repr(-b), 'array([ -1.23400000e+09])')
+ assert_equal(repr(np.array(1.)), 'array(1.0)')
+ assert_equal(repr(np.array([0.])), 'array([ 0.])')
+ assert_equal(repr(c),
+ "array([ 1.00000000+1.j , 1.12345679+1.12345679j])")
+ # gh-10383
+ assert_equal(str(np.array([-1., 10])), "[ -1. 10.]")
+
+ assert_raises(TypeError, np.set_printoptions, wrongarg=True)
+
+ def test_float_overflow_nowarn(self):
+ # make sure internal computations in FloatingFormat don't
+ # warn about overflow
+ repr(np.array([1e4, 0.1], dtype='f2'))
+
+ def test_sign_spacing_structured(self):
+ a = np.ones(2, dtype='<f,<f')
+ assert_equal(repr(a),
+ "array([(1., 1.), (1., 1.)], dtype=[('f0', '<f4'), ('f1', '<f4')])")
+ assert_equal(repr(a[0]), "(1., 1.)")
+
+ def test_floatmode(self):
+ x = np.array([0.6104, 0.922, 0.457, 0.0906, 0.3733, 0.007244,
+ 0.5933, 0.947, 0.2383, 0.4226], dtype=np.float16)
+ y = np.array([0.2918820979355541, 0.5064172631089138,
+ 0.2848750619642916, 0.4342965294660567,
+ 0.7326538397312751, 0.3459503329096204,
+ 0.0862072768214508, 0.39112753029631175],
+ dtype=np.float64)
+ z = np.arange(6, dtype=np.float16)/10
+ c = np.array([1.0 + 1.0j, 1.123456789 + 1.123456789j], dtype='c16')
+
+        # also make sure 1e23 is right (it falls between two representable fp numbers)
+ w = np.array(['1e{}'.format(i) for i in range(25)], dtype=np.float64)
+ # note: we construct w from the strings `1eXX` instead of doing
+ # `10.**arange(24)` because it turns out the two are not equivalent in
+ # python. On some architectures `1e23 != 10.**23`.
+ wp = np.array([1.234e1, 1e2, 1e123])
+
+ # unique mode
+ np.set_printoptions(floatmode='unique')
+ assert_equal(repr(x),
+ "array([0.6104 , 0.922 , 0.457 , 0.0906 , 0.3733 , 0.007244,\n"
+ " 0.5933 , 0.947 , 0.2383 , 0.4226 ], dtype=float16)")
+ assert_equal(repr(y),
+ "array([0.2918820979355541 , 0.5064172631089138 , 0.2848750619642916 ,\n"
+ " 0.4342965294660567 , 0.7326538397312751 , 0.3459503329096204 ,\n"
+ " 0.0862072768214508 , 0.39112753029631175])")
+ assert_equal(repr(z),
+ "array([0. , 0.1, 0.2, 0.3, 0.4, 0.5], dtype=float16)")
+ assert_equal(repr(w),
+ "array([1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07,\n"
+ " 1.e+08, 1.e+09, 1.e+10, 1.e+11, 1.e+12, 1.e+13, 1.e+14, 1.e+15,\n"
+ " 1.e+16, 1.e+17, 1.e+18, 1.e+19, 1.e+20, 1.e+21, 1.e+22, 1.e+23,\n"
+ " 1.e+24])")
+ assert_equal(repr(wp), "array([1.234e+001, 1.000e+002, 1.000e+123])")
+ assert_equal(repr(c),
+ "array([1. +1.j , 1.123456789+1.123456789j])")
+
+ # maxprec mode, precision=8
+ np.set_printoptions(floatmode='maxprec', precision=8)
+ assert_equal(repr(x),
+ "array([0.6104 , 0.922 , 0.457 , 0.0906 , 0.3733 , 0.007244,\n"
+ " 0.5933 , 0.947 , 0.2383 , 0.4226 ], dtype=float16)")
+ assert_equal(repr(y),
+ "array([0.2918821 , 0.50641726, 0.28487506, 0.43429653, 0.73265384,\n"
+ " 0.34595033, 0.08620728, 0.39112753])")
+ assert_equal(repr(z),
+ "array([0. , 0.1, 0.2, 0.3, 0.4, 0.5], dtype=float16)")
+ assert_equal(repr(w[::5]),
+ "array([1.e+00, 1.e+05, 1.e+10, 1.e+15, 1.e+20])")
+ assert_equal(repr(wp), "array([1.234e+001, 1.000e+002, 1.000e+123])")
+ assert_equal(repr(c),
+ "array([1. +1.j , 1.12345679+1.12345679j])")
+
+ # fixed mode, precision=4
+ np.set_printoptions(floatmode='fixed', precision=4)
+ assert_equal(repr(x),
+ "array([0.6104, 0.9219, 0.4570, 0.0906, 0.3733, 0.0072, 0.5933, 0.9468,\n"
+ " 0.2383, 0.4226], dtype=float16)")
+ assert_equal(repr(y),
+ "array([0.2919, 0.5064, 0.2849, 0.4343, 0.7327, 0.3460, 0.0862, 0.3911])")
+ assert_equal(repr(z),
+ "array([0.0000, 0.1000, 0.2000, 0.3000, 0.3999, 0.5000], dtype=float16)")
+ assert_equal(repr(w[::5]),
+ "array([1.0000e+00, 1.0000e+05, 1.0000e+10, 1.0000e+15, 1.0000e+20])")
+ assert_equal(repr(wp), "array([1.2340e+001, 1.0000e+002, 1.0000e+123])")
+ assert_equal(repr(np.zeros(3)), "array([0.0000, 0.0000, 0.0000])")
+ assert_equal(repr(c),
+ "array([1.0000+1.0000j, 1.1235+1.1235j])")
+ # for larger precision, representation error becomes more apparent:
+ np.set_printoptions(floatmode='fixed', precision=8)
+ assert_equal(repr(z),
+ "array([0.00000000, 0.09997559, 0.19995117, 0.30004883, 0.39990234,\n"
+ " 0.50000000], dtype=float16)")
+
+ # maxprec_equal mode, precision=8
+ np.set_printoptions(floatmode='maxprec_equal', precision=8)
+ assert_equal(repr(x),
+ "array([0.610352, 0.921875, 0.457031, 0.090576, 0.373291, 0.007244,\n"
+ " 0.593262, 0.946777, 0.238281, 0.422607], dtype=float16)")
+ assert_equal(repr(y),
+ "array([0.29188210, 0.50641726, 0.28487506, 0.43429653, 0.73265384,\n"
+ " 0.34595033, 0.08620728, 0.39112753])")
+ assert_equal(repr(z),
+ "array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5], dtype=float16)")
+ assert_equal(repr(w[::5]),
+ "array([1.e+00, 1.e+05, 1.e+10, 1.e+15, 1.e+20])")
+ assert_equal(repr(wp), "array([1.234e+001, 1.000e+002, 1.000e+123])")
+ assert_equal(repr(c),
+ "array([1.00000000+1.00000000j, 1.12345679+1.12345679j])")
+
+ def test_legacy_mode_scalars(self):
+ # in legacy mode, str of floats get truncated, and complex scalars
+ # use * for non-finite imaginary part
+ np.set_printoptions(legacy='1.13')
+ assert_equal(str(np.float64(1.123456789123456789)), '1.12345678912')
+ assert_equal(str(np.complex128(complex(1, np.nan))), '(1+nan*j)')
+
+ np.set_printoptions(legacy=False)
+ assert_equal(str(np.float64(1.123456789123456789)),
+ '1.1234567891234568')
+ assert_equal(str(np.complex128(complex(1, np.nan))), '(1+nanj)')
+
+ def test_legacy_stray_comma(self):
+ np.set_printoptions(legacy='1.13')
+ assert_equal(str(np.arange(10000)), '[ 0 1 2 ..., 9997 9998 9999]')
+
+ np.set_printoptions(legacy=False)
+ assert_equal(str(np.arange(10000)), '[ 0 1 2 ... 9997 9998 9999]')
+
+ def test_dtype_linewidth_wrapping(self):
+ np.set_printoptions(linewidth=75)
+ assert_equal(repr(np.arange(10,20., dtype='f4')),
+ "array([10., 11., 12., 13., 14., 15., 16., 17., 18., 19.], dtype=float32)")
+ assert_equal(repr(np.arange(10,23., dtype='f4')), textwrap.dedent("""\
+ array([10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.],
+ dtype=float32)"""))
+
+ styp = '<U4' if sys.version_info[0] >= 3 else '|S4'
+ assert_equal(repr(np.ones(3, dtype=styp)),
+ "array(['1', '1', '1'], dtype='{}')".format(styp))
+ assert_equal(repr(np.ones(12, dtype=styp)), textwrap.dedent("""\
+ array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
+ dtype='{}')""".format(styp)))
+
+ def test_linewidth_repr(self):
+ a = np.full(7, fill_value=2)
+ np.set_printoptions(linewidth=17)
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ array([2, 2, 2,
+ 2, 2, 2,
+ 2])""")
+ )
+ np.set_printoptions(linewidth=17, legacy='1.13')
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ array([2, 2, 2,
+ 2, 2, 2, 2])""")
+ )
+
+ a = np.full(8, fill_value=2)
+
+ np.set_printoptions(linewidth=18, legacy=False)
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ array([2, 2, 2,
+ 2, 2, 2,
+ 2, 2])""")
+ )
+
+ np.set_printoptions(linewidth=18, legacy='1.13')
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ array([2, 2, 2, 2,
+ 2, 2, 2, 2])""")
+ )
+
+ def test_linewidth_str(self):
+ a = np.full(18, fill_value=2)
+ np.set_printoptions(linewidth=18)
+ assert_equal(
+ str(a),
+ textwrap.dedent("""\
+ [2 2 2 2 2 2 2 2
+ 2 2 2 2 2 2 2 2
+ 2 2]""")
+ )
+ np.set_printoptions(linewidth=18, legacy='1.13')
+ assert_equal(
+ str(a),
+ textwrap.dedent("""\
+ [2 2 2 2 2 2 2 2 2
+ 2 2 2 2 2 2 2 2 2]""")
+ )
+
+ def test_edgeitems(self):
+ np.set_printoptions(edgeitems=1, threshold=1)
+ a = np.arange(27).reshape((3, 3, 3))
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ array([[[ 0, ..., 2],
+ ...,
+ [ 6, ..., 8]],
+
+ ...,
+
+ [[18, ..., 20],
+ ...,
+ [24, ..., 26]]])""")
+ )
+
+ b = np.zeros((3, 3, 1, 1))
+ assert_equal(
+ repr(b),
+ textwrap.dedent("""\
+ array([[[[0.]],
+
+ ...,
+
+ [[0.]]],
+
+
+ ...,
+
+
+ [[[0.]],
+
+ ...,
+
+ [[0.]]]])""")
+ )
+
+ # 1.13 had extra trailing spaces, and was missing newlines
+ np.set_printoptions(legacy='1.13')
+
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ array([[[ 0, ..., 2],
+ ...,
+ [ 6, ..., 8]],
+
+ ...,
+ [[18, ..., 20],
+ ...,
+ [24, ..., 26]]])""")
+ )
+
+ assert_equal(
+ repr(b),
+ textwrap.dedent("""\
+ array([[[[ 0.]],
+
+ ...,
+ [[ 0.]]],
+
+
+ ...,
+ [[[ 0.]],
+
+ ...,
+ [[ 0.]]]])""")
+ )
+
+ def test_bad_args(self):
+ assert_raises(ValueError, np.set_printoptions, threshold='nan')
+ assert_raises(ValueError, np.set_printoptions, threshold=u'1')
+ assert_raises(ValueError, np.set_printoptions, threshold=b'1')
+
+def test_unicode_object_array():
+ import sys
+ if sys.version_info[0] >= 3:
+ expected = "array(['é'], dtype=object)"
+ else:
+ expected = "array([u'\\xe9'], dtype=object)"
+ x = np.array([u'\xe9'], dtype=object)
+ assert_equal(repr(x), expected)
+
+
+class TestContextManager(object):
+ def test_ctx_mgr(self):
+        # test that the context manager actually works
+ with np.printoptions(precision=2):
+ s = str(np.array([2.0]) / 3)
+ assert_equal(s, '[0.67]')
+
+ def test_ctx_mgr_restores(self):
+        # test that print options are actually restored
+ opts = np.get_printoptions()
+ with np.printoptions(precision=opts['precision'] - 1,
+ linewidth=opts['linewidth'] - 4):
+ pass
+ assert_equal(np.get_printoptions(), opts)
+
+ def test_ctx_mgr_exceptions(self):
+ # test that print options are restored even if an exception is raised
+ opts = np.get_printoptions()
+ try:
+ with np.printoptions(precision=2, linewidth=11):
+ raise ValueError
+ except ValueError:
+ pass
+ assert_equal(np.get_printoptions(), opts)
+
+ def test_ctx_mgr_as_smth(self):
+ opts = {"precision": 2}
+ with np.printoptions(**opts) as ctx:
+ saved_opts = ctx.copy()
+ assert_equal({k: saved_opts[k] for k in opts}, opts)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_datetime.py b/contrib/python/numpy/py2/numpy/core/tests/test_datetime.py
new file mode 100644
index 00000000000..170c52e9e3a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_datetime.py
@@ -0,0 +1,2228 @@
+from __future__ import division, absolute_import, print_function
+
+
+import numpy
+import numpy as np
+import datetime
+import pytest
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_warns, suppress_warnings,
+ assert_raises_regex,
+ )
+from numpy.core.numeric import pickle
+
+# Use pytz to test out various time zones if available
+try:
+ from pytz import timezone as tz
+ _has_pytz = True
+except ImportError:
+ _has_pytz = False
+
+try:
+ RecursionError
+except NameError:
+ RecursionError = RuntimeError # python < 3.5
+
+
+class TestDateTime(object):
+ def test_datetime_dtype_creation(self):
+ for unit in ['Y', 'M', 'W', 'D',
+ 'h', 'm', 's', 'ms', 'us',
+ 'ns', 'ps', 'fs', 'as']:
+ dt1 = np.dtype('M8[750%s]' % unit)
+ assert_(dt1 == np.dtype('datetime64[750%s]' % unit))
+ dt2 = np.dtype('m8[%s]' % unit)
+ assert_(dt2 == np.dtype('timedelta64[%s]' % unit))
+
+ # Generic units shouldn't add [] to the end
+ assert_equal(str(np.dtype("M8")), "datetime64")
+
+ # Should be possible to specify the endianness
+ assert_equal(np.dtype("=M8"), np.dtype("M8"))
+ assert_equal(np.dtype("=M8[s]"), np.dtype("M8[s]"))
+ assert_(np.dtype(">M8") == np.dtype("M8") or
+ np.dtype("<M8") == np.dtype("M8"))
+ assert_(np.dtype(">M8[D]") == np.dtype("M8[D]") or
+ np.dtype("<M8[D]") == np.dtype("M8[D]"))
+ assert_(np.dtype(">M8") != np.dtype("<M8"))
+
+ assert_equal(np.dtype("=m8"), np.dtype("m8"))
+ assert_equal(np.dtype("=m8[s]"), np.dtype("m8[s]"))
+ assert_(np.dtype(">m8") == np.dtype("m8") or
+ np.dtype("<m8") == np.dtype("m8"))
+ assert_(np.dtype(">m8[D]") == np.dtype("m8[D]") or
+ np.dtype("<m8[D]") == np.dtype("m8[D]"))
+ assert_(np.dtype(">m8") != np.dtype("<m8"))
+
+ # Check that the parser rejects bad datetime types
+ assert_raises(TypeError, np.dtype, 'M8[badunit]')
+ assert_raises(TypeError, np.dtype, 'm8[badunit]')
+ assert_raises(TypeError, np.dtype, 'M8[YY]')
+ assert_raises(TypeError, np.dtype, 'm8[YY]')
+ assert_raises(TypeError, np.dtype, 'm4')
+ assert_raises(TypeError, np.dtype, 'M7')
+ assert_raises(TypeError, np.dtype, 'm7')
+ assert_raises(TypeError, np.dtype, 'M16')
+ assert_raises(TypeError, np.dtype, 'm16')
+
+ def test_datetime_casting_rules(self):
+ # Cannot cast safely/same_kind between timedelta and datetime
+ assert_(not np.can_cast('m8', 'M8', casting='same_kind'))
+ assert_(not np.can_cast('M8', 'm8', casting='same_kind'))
+ assert_(not np.can_cast('m8', 'M8', casting='safe'))
+ assert_(not np.can_cast('M8', 'm8', casting='safe'))
+
+ # Can cast safely/same_kind from integer to timedelta
+ assert_(np.can_cast('i8', 'm8', casting='same_kind'))
+ assert_(np.can_cast('i8', 'm8', casting='safe'))
+
+ # Cannot cast safely/same_kind from float to timedelta
+ assert_(not np.can_cast('f4', 'm8', casting='same_kind'))
+ assert_(not np.can_cast('f4', 'm8', casting='safe'))
+
+ # Cannot cast safely/same_kind from integer to datetime
+ assert_(not np.can_cast('i8', 'M8', casting='same_kind'))
+ assert_(not np.can_cast('i8', 'M8', casting='safe'))
+
+ # Cannot cast safely/same_kind from bool to datetime
+ assert_(not np.can_cast('b1', 'M8', casting='same_kind'))
+ assert_(not np.can_cast('b1', 'M8', casting='safe'))
+ # Can cast safely/same_kind from bool to timedelta
+ assert_(np.can_cast('b1', 'm8', casting='same_kind'))
+ assert_(np.can_cast('b1', 'm8', casting='safe'))
+
+ # Can cast datetime safely from months/years to days
+ assert_(np.can_cast('M8[M]', 'M8[D]', casting='safe'))
+ assert_(np.can_cast('M8[Y]', 'M8[D]', casting='safe'))
+ # Cannot cast timedelta safely from months/years to days
+ assert_(not np.can_cast('m8[M]', 'm8[D]', casting='safe'))
+ assert_(not np.can_cast('m8[Y]', 'm8[D]', casting='safe'))
+ # Can cast datetime same_kind from months/years to days
+ assert_(np.can_cast('M8[M]', 'M8[D]', casting='same_kind'))
+ assert_(np.can_cast('M8[Y]', 'M8[D]', casting='same_kind'))
+ # Can't cast timedelta same_kind from months/years to days
+ assert_(not np.can_cast('m8[M]', 'm8[D]', casting='same_kind'))
+ assert_(not np.can_cast('m8[Y]', 'm8[D]', casting='same_kind'))
+ # Can cast datetime same_kind across the date/time boundary
+ assert_(np.can_cast('M8[D]', 'M8[h]', casting='same_kind'))
+ # Can cast timedelta same_kind across the date/time boundary
+ assert_(np.can_cast('m8[D]', 'm8[h]', casting='same_kind'))
+ assert_(np.can_cast('m8[h]', 'm8[D]', casting='same_kind'))
+
+ # Cannot cast safely if the integer multiplier doesn't divide
+ assert_(not np.can_cast('M8[7h]', 'M8[3h]', casting='safe'))
+ assert_(not np.can_cast('M8[3h]', 'M8[6h]', casting='safe'))
+ # But can cast same_kind
+ assert_(np.can_cast('M8[7h]', 'M8[3h]', casting='same_kind'))
+ # Can cast safely if the integer multiplier does divide
+ assert_(np.can_cast('M8[6h]', 'M8[3h]', casting='safe'))
+
+ # We can always cast types with generic units (corresponding to NaT) to
+ # more specific types
+ assert_(np.can_cast('m8', 'm8[h]', casting='same_kind'))
+ assert_(np.can_cast('m8', 'm8[h]', casting='safe'))
+ assert_(np.can_cast('M8', 'M8[h]', casting='same_kind'))
+ assert_(np.can_cast('M8', 'M8[h]', casting='safe'))
+ # but not the other way around
+ assert_(not np.can_cast('m8[h]', 'm8', casting='same_kind'))
+ assert_(not np.can_cast('m8[h]', 'm8', casting='safe'))
+ assert_(not np.can_cast('M8[h]', 'M8', casting='same_kind'))
+ assert_(not np.can_cast('M8[h]', 'M8', casting='safe'))
+
+ def test_compare_generic_nat(self):
+ # regression tests for gh-6452
+ assert_(np.datetime64('NaT') !=
+ np.datetime64('2000') + np.timedelta64('NaT'))
+ assert_(np.datetime64('NaT') != np.datetime64('NaT', 'us'))
+ assert_(np.datetime64('NaT', 'us') != np.datetime64('NaT'))
+
+ def test_datetime_scalar_construction(self):
+ # Construct with different units
+ assert_equal(np.datetime64('1950-03-12', 'D'),
+ np.datetime64('1950-03-12'))
+ assert_equal(np.datetime64('1950-03-12T13', 's'),
+ np.datetime64('1950-03-12T13', 'm'))
+
+ # Default construction means NaT
+ assert_equal(np.datetime64(), np.datetime64('NaT'))
+
+ # Some basic strings and repr
+ assert_equal(str(np.datetime64('NaT')), 'NaT')
+ assert_equal(repr(np.datetime64('NaT')),
+ "numpy.datetime64('NaT')")
+ assert_equal(str(np.datetime64('2011-02')), '2011-02')
+ assert_equal(repr(np.datetime64('2011-02')),
+ "numpy.datetime64('2011-02')")
+
+ # None gets constructed as NaT
+ assert_equal(np.datetime64(None), np.datetime64('NaT'))
+
+ # Default construction of NaT is in generic units
+ assert_equal(np.datetime64().dtype, np.dtype('M8'))
+ assert_equal(np.datetime64('NaT').dtype, np.dtype('M8'))
+
+ # Construction from integers requires a specified unit
+ assert_raises(ValueError, np.datetime64, 17)
+
+ # When constructing from a scalar or zero-dimensional array,
+ # it either keeps the units or you can override them.
+ a = np.datetime64('2000-03-18T16', 'h')
+ b = np.array('2000-03-18T16', dtype='M8[h]')
+
+ assert_equal(a.dtype, np.dtype('M8[h]'))
+ assert_equal(b.dtype, np.dtype('M8[h]'))
+
+ assert_equal(np.datetime64(a), a)
+ assert_equal(np.datetime64(a).dtype, np.dtype('M8[h]'))
+
+ assert_equal(np.datetime64(b), a)
+ assert_equal(np.datetime64(b).dtype, np.dtype('M8[h]'))
+
+ assert_equal(np.datetime64(a, 's'), a)
+ assert_equal(np.datetime64(a, 's').dtype, np.dtype('M8[s]'))
+
+ assert_equal(np.datetime64(b, 's'), a)
+ assert_equal(np.datetime64(b, 's').dtype, np.dtype('M8[s]'))
+
+ # Construction from datetime.date
+ assert_equal(np.datetime64('1945-03-25'),
+ np.datetime64(datetime.date(1945, 3, 25)))
+ assert_equal(np.datetime64('2045-03-25', 'D'),
+ np.datetime64(datetime.date(2045, 3, 25), 'D'))
+ # Construction from datetime.datetime
+ assert_equal(np.datetime64('1980-01-25T14:36:22.5'),
+ np.datetime64(datetime.datetime(1980, 1, 25,
+ 14, 36, 22, 500000)))
+
+ # Construction with time units from a date is okay
+ assert_equal(np.datetime64('1920-03-13', 'h'),
+ np.datetime64('1920-03-13T00'))
+ assert_equal(np.datetime64('1920-03', 'm'),
+ np.datetime64('1920-03-01T00:00'))
+ assert_equal(np.datetime64('1920', 's'),
+ np.datetime64('1920-01-01T00:00:00'))
+ assert_equal(np.datetime64(datetime.date(2045, 3, 25), 'ms'),
+ np.datetime64('2045-03-25T00:00:00.000'))
+
+ # Construction with date units from a datetime is also okay
+ assert_equal(np.datetime64('1920-03-13T18', 'D'),
+ np.datetime64('1920-03-13'))
+ assert_equal(np.datetime64('1920-03-13T18:33:12', 'M'),
+ np.datetime64('1920-03'))
+ assert_equal(np.datetime64('1920-03-13T18:33:12.5', 'Y'),
+ np.datetime64('1920'))
+
+ def test_datetime_scalar_construction_timezone(self):
+ # verify that supplying an explicit timezone works, but is deprecated
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.datetime64('2000-01-01T00Z'),
+ np.datetime64('2000-01-01T00'))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.datetime64('2000-01-01T00-08'),
+ np.datetime64('2000-01-01T08'))
+
+ def test_datetime_array_find_type(self):
+ dt = np.datetime64('1970-01-01', 'M')
+ arr = np.array([dt])
+ assert_equal(arr.dtype, np.dtype('M8[M]'))
+
+ # at the moment, we don't automatically convert these to datetime64
+
+ dt = datetime.date(1970, 1, 1)
+ arr = np.array([dt])
+ assert_equal(arr.dtype, np.dtype('O'))
+
+ dt = datetime.datetime(1970, 1, 1, 12, 30, 40)
+ arr = np.array([dt])
+ assert_equal(arr.dtype, np.dtype('O'))
+
+ # find "supertype" for non-dates and dates
+
+ b = np.bool_(True)
+ dm = np.datetime64('1970-01-01', 'M')
+ d = datetime.date(1970, 1, 1)
+ dt = datetime.datetime(1970, 1, 1, 12, 30, 40)
+
+ arr = np.array([b, dm])
+ assert_equal(arr.dtype, np.dtype('O'))
+
+ arr = np.array([b, d])
+ assert_equal(arr.dtype, np.dtype('O'))
+
+ arr = np.array([b, dt])
+ assert_equal(arr.dtype, np.dtype('O'))
+
+ arr = np.array([d, d]).astype('datetime64')
+ assert_equal(arr.dtype, np.dtype('M8[D]'))
+
+ arr = np.array([dt, dt]).astype('datetime64')
+ assert_equal(arr.dtype, np.dtype('M8[us]'))
+
+ @pytest.mark.parametrize("unit", [
+ # test all date / time units and use
+ # "generic" to select generic unit
+ ("Y"), ("M"), ("W"), ("D"), ("h"), ("m"),
+ ("s"), ("ms"), ("us"), ("ns"), ("ps"),
+ ("fs"), ("as"), ("generic") ])
+ def test_timedelta_np_int_construction(self, unit):
+ # regression test for gh-7617
+ if unit != "generic":
+ assert_equal(np.timedelta64(np.int64(123), unit),
+ np.timedelta64(123, unit))
+ else:
+ assert_equal(np.timedelta64(np.int64(123)),
+ np.timedelta64(123))
+
+ def test_timedelta_scalar_construction(self):
+ # Construct with different units
+ assert_equal(np.timedelta64(7, 'D'),
+ np.timedelta64(1, 'W'))
+ assert_equal(np.timedelta64(120, 's'),
+ np.timedelta64(2, 'm'))
+
+ # Default construction means 0
+ assert_equal(np.timedelta64(), np.timedelta64(0))
+
+ # None gets constructed as NaT
+ assert_equal(np.timedelta64(None), np.timedelta64('NaT'))
+
+ # Some basic strings and repr
+ assert_equal(str(np.timedelta64('NaT')), 'NaT')
+ assert_equal(repr(np.timedelta64('NaT')),
+ "numpy.timedelta64('NaT')")
+ assert_equal(str(np.timedelta64(3, 's')), '3 seconds')
+ assert_equal(repr(np.timedelta64(-3, 's')),
+ "numpy.timedelta64(-3,'s')")
+ assert_equal(repr(np.timedelta64(12)),
+ "numpy.timedelta64(12)")
+
+ # Construction from an integer produces generic units
+ assert_equal(np.timedelta64(12).dtype, np.dtype('m8'))
+
+ # When constructing from a scalar or zero-dimensional array,
+ # it either keeps the units or you can override them.
+ a = np.timedelta64(2, 'h')
+ b = np.array(2, dtype='m8[h]')
+
+ assert_equal(a.dtype, np.dtype('m8[h]'))
+ assert_equal(b.dtype, np.dtype('m8[h]'))
+
+ assert_equal(np.timedelta64(a), a)
+ assert_equal(np.timedelta64(a).dtype, np.dtype('m8[h]'))
+
+ assert_equal(np.timedelta64(b), a)
+ assert_equal(np.timedelta64(b).dtype, np.dtype('m8[h]'))
+
+ assert_equal(np.timedelta64(a, 's'), a)
+ assert_equal(np.timedelta64(a, 's').dtype, np.dtype('m8[s]'))
+
+ assert_equal(np.timedelta64(b, 's'), a)
+ assert_equal(np.timedelta64(b, 's').dtype, np.dtype('m8[s]'))
+
+ # Construction from datetime.timedelta
+ assert_equal(np.timedelta64(5, 'D'),
+ np.timedelta64(datetime.timedelta(days=5)))
+ assert_equal(np.timedelta64(102347621, 's'),
+ np.timedelta64(datetime.timedelta(seconds=102347621)))
+ assert_equal(np.timedelta64(-10234760000, 'us'),
+ np.timedelta64(datetime.timedelta(
+ microseconds=-10234760000)))
+ assert_equal(np.timedelta64(10234760000, 'us'),
+ np.timedelta64(datetime.timedelta(
+ microseconds=10234760000)))
+ assert_equal(np.timedelta64(1023476, 'ms'),
+ np.timedelta64(datetime.timedelta(milliseconds=1023476)))
+ assert_equal(np.timedelta64(10, 'm'),
+ np.timedelta64(datetime.timedelta(minutes=10)))
+ assert_equal(np.timedelta64(281, 'h'),
+ np.timedelta64(datetime.timedelta(hours=281)))
+ assert_equal(np.timedelta64(28, 'W'),
+ np.timedelta64(datetime.timedelta(weeks=28)))
+
+ # Cannot construct across nonlinear time unit boundaries
+ a = np.timedelta64(3, 's')
+ assert_raises(TypeError, np.timedelta64, a, 'M')
+ assert_raises(TypeError, np.timedelta64, a, 'Y')
+ a = np.timedelta64(6, 'M')
+ assert_raises(TypeError, np.timedelta64, a, 'D')
+ assert_raises(TypeError, np.timedelta64, a, 'h')
+ a = np.timedelta64(1, 'Y')
+ assert_raises(TypeError, np.timedelta64, a, 'D')
+ assert_raises(TypeError, np.timedelta64, a, 'm')
+ a = datetime.timedelta(seconds=3)
+ assert_raises(TypeError, np.timedelta64, a, 'M')
+ assert_raises(TypeError, np.timedelta64, a, 'Y')
+ a = datetime.timedelta(weeks=3)
+ assert_raises(TypeError, np.timedelta64, a, 'M')
+ assert_raises(TypeError, np.timedelta64, a, 'Y')
+ a = datetime.timedelta()
+ assert_raises(TypeError, np.timedelta64, a, 'M')
+ assert_raises(TypeError, np.timedelta64, a, 'Y')
+
+ def test_timedelta_object_array_conversion(self):
+ # Regression test for gh-11096
+ inputs = [datetime.timedelta(28),
+ datetime.timedelta(30),
+ datetime.timedelta(31)]
+ expected = np.array([28, 30, 31], dtype='timedelta64[D]')
+ actual = np.array(inputs, dtype='timedelta64[D]')
+ assert_equal(expected, actual)
+
+ def test_timedelta_0_dim_object_array_conversion(self):
+ # Regression test for gh-11151
+ test = np.array(datetime.timedelta(seconds=20))
+ actual = test.astype(np.timedelta64)
+        # expected value from the array constructor workaround
+        # described in the above issue
+ expected = np.array(datetime.timedelta(seconds=20),
+ np.timedelta64)
+ assert_equal(actual, expected)
+
+ def test_timedelta_scalar_construction_units(self):
+ # String construction detecting units
+ assert_equal(np.datetime64('2010').dtype,
+ np.dtype('M8[Y]'))
+ assert_equal(np.datetime64('2010-03').dtype,
+ np.dtype('M8[M]'))
+ assert_equal(np.datetime64('2010-03-12').dtype,
+ np.dtype('M8[D]'))
+ assert_equal(np.datetime64('2010-03-12T17').dtype,
+ np.dtype('M8[h]'))
+ assert_equal(np.datetime64('2010-03-12T17:15').dtype,
+ np.dtype('M8[m]'))
+ assert_equal(np.datetime64('2010-03-12T17:15:08').dtype,
+ np.dtype('M8[s]'))
+
+ assert_equal(np.datetime64('2010-03-12T17:15:08.1').dtype,
+ np.dtype('M8[ms]'))
+ assert_equal(np.datetime64('2010-03-12T17:15:08.12').dtype,
+ np.dtype('M8[ms]'))
+ assert_equal(np.datetime64('2010-03-12T17:15:08.123').dtype,
+ np.dtype('M8[ms]'))
+
+ assert_equal(np.datetime64('2010-03-12T17:15:08.1234').dtype,
+ np.dtype('M8[us]'))
+ assert_equal(np.datetime64('2010-03-12T17:15:08.12345').dtype,
+ np.dtype('M8[us]'))
+ assert_equal(np.datetime64('2010-03-12T17:15:08.123456').dtype,
+ np.dtype('M8[us]'))
+
+ assert_equal(np.datetime64('1970-01-01T00:00:02.1234567').dtype,
+ np.dtype('M8[ns]'))
+ assert_equal(np.datetime64('1970-01-01T00:00:02.12345678').dtype,
+ np.dtype('M8[ns]'))
+ assert_equal(np.datetime64('1970-01-01T00:00:02.123456789').dtype,
+ np.dtype('M8[ns]'))
+
+ assert_equal(np.datetime64('1970-01-01T00:00:02.1234567890').dtype,
+ np.dtype('M8[ps]'))
+ assert_equal(np.datetime64('1970-01-01T00:00:02.12345678901').dtype,
+ np.dtype('M8[ps]'))
+ assert_equal(np.datetime64('1970-01-01T00:00:02.123456789012').dtype,
+ np.dtype('M8[ps]'))
+
+ assert_equal(np.datetime64(
+ '1970-01-01T00:00:02.1234567890123').dtype,
+ np.dtype('M8[fs]'))
+ assert_equal(np.datetime64(
+ '1970-01-01T00:00:02.12345678901234').dtype,
+ np.dtype('M8[fs]'))
+ assert_equal(np.datetime64(
+ '1970-01-01T00:00:02.123456789012345').dtype,
+ np.dtype('M8[fs]'))
+
+ assert_equal(np.datetime64(
+ '1970-01-01T00:00:02.1234567890123456').dtype,
+ np.dtype('M8[as]'))
+ assert_equal(np.datetime64(
+ '1970-01-01T00:00:02.12345678901234567').dtype,
+ np.dtype('M8[as]'))
+ assert_equal(np.datetime64(
+ '1970-01-01T00:00:02.123456789012345678').dtype,
+ np.dtype('M8[as]'))
+
+ # Python date object
+ assert_equal(np.datetime64(datetime.date(2010, 4, 16)).dtype,
+ np.dtype('M8[D]'))
+
+ # Python datetime object
+ assert_equal(np.datetime64(
+ datetime.datetime(2010, 4, 16, 13, 45, 18)).dtype,
+ np.dtype('M8[us]'))
+
+ # 'today' special value
+ assert_equal(np.datetime64('today').dtype,
+ np.dtype('M8[D]'))
+
+ # 'now' special value
+ assert_equal(np.datetime64('now').dtype,
+ np.dtype('M8[s]'))
+
+ def test_datetime_nat_casting(self):
+ a = np.array('NaT', dtype='M8[D]')
+ b = np.datetime64('NaT', '[D]')
+
+ # Arrays
+ assert_equal(a.astype('M8[s]'), np.array('NaT', dtype='M8[s]'))
+ assert_equal(a.astype('M8[ms]'), np.array('NaT', dtype='M8[ms]'))
+ assert_equal(a.astype('M8[M]'), np.array('NaT', dtype='M8[M]'))
+ assert_equal(a.astype('M8[Y]'), np.array('NaT', dtype='M8[Y]'))
+ assert_equal(a.astype('M8[W]'), np.array('NaT', dtype='M8[W]'))
+
+ # Scalars -> Scalars
+ assert_equal(np.datetime64(b, '[s]'), np.datetime64('NaT', '[s]'))
+ assert_equal(np.datetime64(b, '[ms]'), np.datetime64('NaT', '[ms]'))
+ assert_equal(np.datetime64(b, '[M]'), np.datetime64('NaT', '[M]'))
+ assert_equal(np.datetime64(b, '[Y]'), np.datetime64('NaT', '[Y]'))
+ assert_equal(np.datetime64(b, '[W]'), np.datetime64('NaT', '[W]'))
+
+ # Arrays -> Scalars
+ assert_equal(np.datetime64(a, '[s]'), np.datetime64('NaT', '[s]'))
+ assert_equal(np.datetime64(a, '[ms]'), np.datetime64('NaT', '[ms]'))
+ assert_equal(np.datetime64(a, '[M]'), np.datetime64('NaT', '[M]'))
+ assert_equal(np.datetime64(a, '[Y]'), np.datetime64('NaT', '[Y]'))
+ assert_equal(np.datetime64(a, '[W]'), np.datetime64('NaT', '[W]'))
+
+ def test_days_creation(self):
+ assert_equal(np.array('1599', dtype='M8[D]').astype('i8'),
+ (1600-1970)*365 - (1972-1600)/4 + 3 - 365)
+ assert_equal(np.array('1600', dtype='M8[D]').astype('i8'),
+ (1600-1970)*365 - (1972-1600)/4 + 3)
+ assert_equal(np.array('1601', dtype='M8[D]').astype('i8'),
+ (1600-1970)*365 - (1972-1600)/4 + 3 + 366)
+ assert_equal(np.array('1900', dtype='M8[D]').astype('i8'),
+ (1900-1970)*365 - (1970-1900)//4)
+ assert_equal(np.array('1901', dtype='M8[D]').astype('i8'),
+ (1900-1970)*365 - (1970-1900)//4 + 365)
+ assert_equal(np.array('1967', dtype='M8[D]').astype('i8'), -3*365 - 1)
+ assert_equal(np.array('1968', dtype='M8[D]').astype('i8'), -2*365 - 1)
+ assert_equal(np.array('1969', dtype='M8[D]').astype('i8'), -1*365)
+ assert_equal(np.array('1970', dtype='M8[D]').astype('i8'), 0*365)
+ assert_equal(np.array('1971', dtype='M8[D]').astype('i8'), 1*365)
+ assert_equal(np.array('1972', dtype='M8[D]').astype('i8'), 2*365)
+ assert_equal(np.array('1973', dtype='M8[D]').astype('i8'), 3*365 + 1)
+ assert_equal(np.array('1974', dtype='M8[D]').astype('i8'), 4*365 + 1)
+ assert_equal(np.array('2000', dtype='M8[D]').astype('i8'),
+ (2000 - 1970)*365 + (2000 - 1972)//4)
+ assert_equal(np.array('2001', dtype='M8[D]').astype('i8'),
+ (2000 - 1970)*365 + (2000 - 1972)//4 + 366)
+ assert_equal(np.array('2400', dtype='M8[D]').astype('i8'),
+ (2400 - 1970)*365 + (2400 - 1972)//4 - 3)
+ assert_equal(np.array('2401', dtype='M8[D]').astype('i8'),
+ (2400 - 1970)*365 + (2400 - 1972)//4 - 3 + 366)
+
+ assert_equal(np.array('1600-02-29', dtype='M8[D]').astype('i8'),
+ (1600-1970)*365 - (1972-1600)//4 + 3 + 31 + 28)
+ assert_equal(np.array('1600-03-01', dtype='M8[D]').astype('i8'),
+ (1600-1970)*365 - (1972-1600)//4 + 3 + 31 + 29)
+ assert_equal(np.array('2000-02-29', dtype='M8[D]').astype('i8'),
+ (2000 - 1970)*365 + (2000 - 1972)//4 + 31 + 28)
+ assert_equal(np.array('2000-03-01', dtype='M8[D]').astype('i8'),
+ (2000 - 1970)*365 + (2000 - 1972)//4 + 31 + 29)
+ assert_equal(np.array('2001-03-22', dtype='M8[D]').astype('i8'),
+ (2000 - 1970)*365 + (2000 - 1972)//4 + 366 + 31 + 28 + 21)
+
+ def test_days_to_pydate(self):
+ assert_equal(np.array('1599', dtype='M8[D]').astype('O'),
+ datetime.date(1599, 1, 1))
+ assert_equal(np.array('1600', dtype='M8[D]').astype('O'),
+ datetime.date(1600, 1, 1))
+ assert_equal(np.array('1601', dtype='M8[D]').astype('O'),
+ datetime.date(1601, 1, 1))
+ assert_equal(np.array('1900', dtype='M8[D]').astype('O'),
+ datetime.date(1900, 1, 1))
+ assert_equal(np.array('1901', dtype='M8[D]').astype('O'),
+ datetime.date(1901, 1, 1))
+ assert_equal(np.array('2000', dtype='M8[D]').astype('O'),
+ datetime.date(2000, 1, 1))
+ assert_equal(np.array('2001', dtype='M8[D]').astype('O'),
+ datetime.date(2001, 1, 1))
+ assert_equal(np.array('1600-02-29', dtype='M8[D]').astype('O'),
+ datetime.date(1600, 2, 29))
+ assert_equal(np.array('1600-03-01', dtype='M8[D]').astype('O'),
+ datetime.date(1600, 3, 1))
+ assert_equal(np.array('2001-03-22', dtype='M8[D]').astype('O'),
+ datetime.date(2001, 3, 22))
+
+ def test_dtype_comparison(self):
+ assert_(not (np.dtype('M8[us]') == np.dtype('M8[ms]')))
+ assert_(np.dtype('M8[us]') != np.dtype('M8[ms]'))
+ assert_(np.dtype('M8[2D]') != np.dtype('M8[D]'))
+ assert_(np.dtype('M8[D]') != np.dtype('M8[2D]'))
+
+ def test_pydatetime_creation(self):
+ a = np.array(['1960-03-12', datetime.date(1960, 3, 12)], dtype='M8[D]')
+ assert_equal(a[0], a[1])
+ a = np.array(['1999-12-31', datetime.date(1999, 12, 31)], dtype='M8[D]')
+ assert_equal(a[0], a[1])
+ a = np.array(['2000-01-01', datetime.date(2000, 1, 1)], dtype='M8[D]')
+ assert_equal(a[0], a[1])
+        # Will fail if the date changes at exactly the wrong moment
+ a = np.array(['today', datetime.date.today()], dtype='M8[D]')
+ assert_equal(a[0], a[1])
+ # datetime.datetime.now() returns local time, not UTC
+ #a = np.array(['now', datetime.datetime.now()], dtype='M8[s]')
+ #assert_equal(a[0], a[1])
+
+ # we can give a datetime.date time units
+ assert_equal(np.array(datetime.date(1960, 3, 12), dtype='M8[s]'),
+ np.array(np.datetime64('1960-03-12T00:00:00')))
+
+ def test_datetime_string_conversion(self):
+ a = ['2011-03-16', '1920-01-01', '2013-05-19']
+ str_a = np.array(a, dtype='S')
+ uni_a = np.array(a, dtype='U')
+ dt_a = np.array(a, dtype='M')
+
+ # String to datetime
+ assert_equal(dt_a, str_a.astype('M'))
+ assert_equal(dt_a.dtype, str_a.astype('M').dtype)
+ dt_b = np.empty_like(dt_a)
+ dt_b[...] = str_a
+ assert_equal(dt_a, dt_b)
+
+ # Datetime to string
+ assert_equal(str_a, dt_a.astype('S0'))
+ str_b = np.empty_like(str_a)
+ str_b[...] = dt_a
+ assert_equal(str_a, str_b)
+
+ # Unicode to datetime
+ assert_equal(dt_a, uni_a.astype('M'))
+ assert_equal(dt_a.dtype, uni_a.astype('M').dtype)
+ dt_b = np.empty_like(dt_a)
+ dt_b[...] = uni_a
+ assert_equal(dt_a, dt_b)
+
+ # Datetime to unicode
+ assert_equal(uni_a, dt_a.astype('U'))
+ uni_b = np.empty_like(uni_a)
+ uni_b[...] = dt_a
+ assert_equal(uni_a, uni_b)
+
+ # Datetime to long string - gh-9712
+ assert_equal(str_a, dt_a.astype((np.string_, 128)))
+ str_b = np.empty(str_a.shape, dtype=(np.string_, 128))
+ str_b[...] = dt_a
+ assert_equal(str_a, str_b)
+
+ def test_datetime_array_str(self):
+ a = np.array(['2011-03-16', '1920-01-01', '2013-05-19'], dtype='M')
+ assert_equal(str(a), "['2011-03-16' '1920-01-01' '2013-05-19']")
+
+ a = np.array(['2011-03-16T13:55', '1920-01-01T03:12'], dtype='M')
+ assert_equal(np.array2string(a, separator=', ',
+ formatter={'datetime': lambda x:
+ "'%s'" % np.datetime_as_string(x, timezone='UTC')}),
+ "['2011-03-16T13:55Z', '1920-01-01T03:12Z']")
+
+ # Check that one NaT doesn't corrupt subsequent entries
+ a = np.array(['2010', 'NaT', '2030']).astype('M')
+ assert_equal(str(a), "['2010' 'NaT' '2030']")
+
+ def test_timedelta_array_str(self):
+ a = np.array([-1, 0, 100], dtype='m')
+ assert_equal(str(a), "[ -1 0 100]")
+ a = np.array(['NaT', 'NaT'], dtype='m')
+ assert_equal(str(a), "['NaT' 'NaT']")
+ # Check right-alignment with NaTs
+ a = np.array([-1, 'NaT', 0], dtype='m')
+ assert_equal(str(a), "[ -1 'NaT' 0]")
+ a = np.array([-1, 'NaT', 1234567], dtype='m')
+ assert_equal(str(a), "[ -1 'NaT' 1234567]")
+
+ # Test with other byteorder:
+ a = np.array([-1, 'NaT', 1234567], dtype='>m')
+ assert_equal(str(a), "[ -1 'NaT' 1234567]")
+ a = np.array([-1, 'NaT', 1234567], dtype='<m')
+ assert_equal(str(a), "[ -1 'NaT' 1234567]")
+
+ def test_pickle(self):
+ # Check that pickle roundtripping works
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ dt = np.dtype('M8[7D]')
+ assert_equal(pickle.loads(pickle.dumps(dt, protocol=proto)), dt)
+ dt = np.dtype('M8[W]')
+ assert_equal(pickle.loads(pickle.dumps(dt, protocol=proto)), dt)
+ scalar = np.datetime64('2016-01-01T00:00:00.000000000')
+ assert_equal(pickle.loads(pickle.dumps(scalar, protocol=proto)),
+ scalar)
+ delta = scalar - np.datetime64('2015-01-01T00:00:00.000000000')
+ assert_equal(pickle.loads(pickle.dumps(delta, protocol=proto)),
+ delta)
+
+ # Check that loading pickles from 1.6 works
+ pkl = b"cnumpy\ndtype\np0\n(S'M8'\np1\nI0\nI1\ntp2\nRp3\n" + \
+ b"(I4\nS'<'\np4\nNNNI-1\nI-1\nI0\n((dp5\n(S'D'\np6\n" + \
+ b"I7\nI1\nI1\ntp7\ntp8\ntp9\nb."
+ assert_equal(pickle.loads(pkl), np.dtype('<M8[7D]'))
+ pkl = b"cnumpy\ndtype\np0\n(S'M8'\np1\nI0\nI1\ntp2\nRp3\n" + \
+ b"(I4\nS'<'\np4\nNNNI-1\nI-1\nI0\n((dp5\n(S'W'\np6\n" + \
+ b"I1\nI1\nI1\ntp7\ntp8\ntp9\nb."
+ assert_equal(pickle.loads(pkl), np.dtype('<M8[W]'))
+ pkl = b"cnumpy\ndtype\np0\n(S'M8'\np1\nI0\nI1\ntp2\nRp3\n" + \
+ b"(I4\nS'>'\np4\nNNNI-1\nI-1\nI0\n((dp5\n(S'us'\np6\n" + \
+ b"I1\nI1\nI1\ntp7\ntp8\ntp9\nb."
+ assert_equal(pickle.loads(pkl), np.dtype('>M8[us]'))
+
+ def test_setstate(self):
+ "Verify that datetime dtype __setstate__ can handle bad arguments"
+ dt = np.dtype('>M8[us]')
+ assert_raises(ValueError, dt.__setstate__, (4, '>', None, None, None, -1, -1, 0, 1))
+ assert_(dt.__reduce__()[2] == np.dtype('>M8[us]').__reduce__()[2])
+ assert_raises(TypeError, dt.__setstate__, (4, '>', None, None, None, -1, -1, 0, ({}, 'xxx')))
+ assert_(dt.__reduce__()[2] == np.dtype('>M8[us]').__reduce__()[2])
+
+ def test_dtype_promotion(self):
+ # datetime <op> datetime computes the metadata gcd
+ # timedelta <op> timedelta computes the metadata gcd
+ for mM in ['m', 'M']:
+ assert_equal(
+ np.promote_types(np.dtype(mM+'8[2Y]'), np.dtype(mM+'8[2Y]')),
+ np.dtype(mM+'8[2Y]'))
+ assert_equal(
+ np.promote_types(np.dtype(mM+'8[12Y]'), np.dtype(mM+'8[15Y]')),
+ np.dtype(mM+'8[3Y]'))
+ assert_equal(
+ np.promote_types(np.dtype(mM+'8[62M]'), np.dtype(mM+'8[24M]')),
+ np.dtype(mM+'8[2M]'))
+ assert_equal(
+ np.promote_types(np.dtype(mM+'8[1W]'), np.dtype(mM+'8[2D]')),
+ np.dtype(mM+'8[1D]'))
+ assert_equal(
+ np.promote_types(np.dtype(mM+'8[W]'), np.dtype(mM+'8[13s]')),
+ np.dtype(mM+'8[s]'))
+ assert_equal(
+ np.promote_types(np.dtype(mM+'8[13W]'), np.dtype(mM+'8[49s]')),
+ np.dtype(mM+'8[7s]'))
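+            # The gcd is computed after expressing both units in a common
+            # base: 13 weeks == 7862400 s and gcd(7862400, 49) == 7, which
+            # is why the promoted unit above is 7s.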
+ # timedelta <op> timedelta raises when there is no reasonable gcd
+ assert_raises(TypeError, np.promote_types,
+ np.dtype('m8[Y]'), np.dtype('m8[D]'))
+ assert_raises(TypeError, np.promote_types,
+ np.dtype('m8[M]'), np.dtype('m8[W]'))
+ # timedelta <op> timedelta may overflow with big unit ranges
+ assert_raises(OverflowError, np.promote_types,
+ np.dtype('m8[W]'), np.dtype('m8[fs]'))
+ assert_raises(OverflowError, np.promote_types,
+ np.dtype('m8[s]'), np.dtype('m8[as]'))
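+        # (a single week is roughly 6.0e20 fs, already beyond the int64 range)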
+
+ def test_cast_overflow(self):
+ # gh-4486
+ def cast():
+ numpy.datetime64("1971-01-01 00:00:00.000000000000000").astype("<M8[D]")
+ assert_raises(OverflowError, cast)
+
+ def cast2():
+ numpy.datetime64("2014").astype("<M8[fs]")
+ assert_raises(OverflowError, cast2)
+
+ def test_pyobject_roundtrip(self):
+ # All datetime types should be able to roundtrip through object
+ a = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -1020040340, -2942398, -1, 0, 1, 234523453, 1199164176],
+ dtype=np.int64)
+ # With date units
+ for unit in ['M8[D]', 'M8[W]', 'M8[M]', 'M8[Y]']:
+ b = a.copy().view(dtype=unit)
+ b[0] = '-0001-01-01'
+ b[1] = '-0001-12-31'
+ b[2] = '0000-01-01'
+ b[3] = '0001-01-01'
+ b[4] = '1969-12-31'
+ b[5] = '1970-01-01'
+ b[6] = '9999-12-31'
+ b[7] = '10000-01-01'
+ b[8] = 'NaT'
+
+ assert_equal(b.astype(object).astype(unit), b,
+ "Error roundtripping unit %s" % unit)
+ # With time units
+ for unit in ['M8[as]', 'M8[16fs]', 'M8[ps]', 'M8[us]',
+ 'M8[300as]', 'M8[20us]']:
+ b = a.copy().view(dtype=unit)
+ b[0] = '-0001-01-01T00'
+ b[1] = '-0001-12-31T00'
+ b[2] = '0000-01-01T00'
+ b[3] = '0001-01-01T00'
+ b[4] = '1969-12-31T23:59:59.999999'
+ b[5] = '1970-01-01T00'
+ b[6] = '9999-12-31T23:59:59.999999'
+ b[7] = '10000-01-01T00'
+ b[8] = 'NaT'
+
+ assert_equal(b.astype(object).astype(unit), b,
+ "Error roundtripping unit %s" % unit)
+
+ def test_month_truncation(self):
+ # Make sure that months are truncating correctly
+ assert_equal(np.array('1945-03-01', dtype='M8[M]'),
+ np.array('1945-03-31', dtype='M8[M]'))
+ assert_equal(np.array('1969-11-01', dtype='M8[M]'),
+ np.array('1969-11-30T23:59:59.99999', dtype='M').astype('M8[M]'))
+ assert_equal(np.array('1969-12-01', dtype='M8[M]'),
+ np.array('1969-12-31T23:59:59.99999', dtype='M').astype('M8[M]'))
+ assert_equal(np.array('1970-01-01', dtype='M8[M]'),
+ np.array('1970-01-31T23:59:59.99999', dtype='M').astype('M8[M]'))
+ assert_equal(np.array('1980-02-01', dtype='M8[M]'),
+ np.array('1980-02-29T23:59:59.99999', dtype='M').astype('M8[M]'))
+
+ def test_different_unit_comparison(self):
+ # Check some years with date units
+ for unit1 in ['Y', 'M', 'D']:
+ dt1 = np.dtype('M8[%s]' % unit1)
+ for unit2 in ['Y', 'M', 'D']:
+ dt2 = np.dtype('M8[%s]' % unit2)
+ assert_equal(np.array('1945', dtype=dt1),
+ np.array('1945', dtype=dt2))
+ assert_equal(np.array('1970', dtype=dt1),
+ np.array('1970', dtype=dt2))
+ assert_equal(np.array('9999', dtype=dt1),
+ np.array('9999', dtype=dt2))
+ assert_equal(np.array('10000', dtype=dt1),
+ np.array('10000-01-01', dtype=dt2))
+ assert_equal(np.datetime64('1945', unit1),
+ np.datetime64('1945', unit2))
+ assert_equal(np.datetime64('1970', unit1),
+ np.datetime64('1970', unit2))
+ assert_equal(np.datetime64('9999', unit1),
+ np.datetime64('9999', unit2))
+ assert_equal(np.datetime64('10000', unit1),
+ np.datetime64('10000-01-01', unit2))
+ # Check some datetimes with time units
+ for unit1 in ['6h', 'h', 'm', 's', '10ms', 'ms', 'us']:
+ dt1 = np.dtype('M8[%s]' % unit1)
+ for unit2 in ['h', 'm', 's', 'ms', 'us']:
+ dt2 = np.dtype('M8[%s]' % unit2)
+ assert_equal(np.array('1945-03-12T18', dtype=dt1),
+ np.array('1945-03-12T18', dtype=dt2))
+ assert_equal(np.array('1970-03-12T18', dtype=dt1),
+ np.array('1970-03-12T18', dtype=dt2))
+ assert_equal(np.array('9999-03-12T18', dtype=dt1),
+ np.array('9999-03-12T18', dtype=dt2))
+ assert_equal(np.array('10000-01-01T00', dtype=dt1),
+ np.array('10000-01-01T00', dtype=dt2))
+ assert_equal(np.datetime64('1945-03-12T18', unit1),
+ np.datetime64('1945-03-12T18', unit2))
+ assert_equal(np.datetime64('1970-03-12T18', unit1),
+ np.datetime64('1970-03-12T18', unit2))
+ assert_equal(np.datetime64('9999-03-12T18', unit1),
+ np.datetime64('9999-03-12T18', unit2))
+ assert_equal(np.datetime64('10000-01-01T00', unit1),
+ np.datetime64('10000-01-01T00', unit2))
+ # Check some days with units that won't overflow
+ for unit1 in ['D', '12h', 'h', 'm', 's', '4s', 'ms', 'us']:
+ dt1 = np.dtype('M8[%s]' % unit1)
+ for unit2 in ['D', 'h', 'm', 's', 'ms', 'us']:
+ dt2 = np.dtype('M8[%s]' % unit2)
+ assert_(np.equal(np.array('1932-02-17', dtype='M').astype(dt1),
+ np.array('1932-02-17T00:00:00', dtype='M').astype(dt2),
+ casting='unsafe'))
+ assert_(np.equal(np.array('10000-04-27', dtype='M').astype(dt1),
+ np.array('10000-04-27T00:00:00', dtype='M').astype(dt2),
+ casting='unsafe'))
+
+ # Shouldn't be able to compare datetime and timedelta
+ # TODO: Changing to 'same_kind' or 'safe' casting in the ufuncs by
+ # default is needed to properly catch this kind of thing...
+ a = np.array('2012-12-21', dtype='M8[D]')
+ b = np.array(3, dtype='m8[D]')
+ #assert_raises(TypeError, np.less, a, b)
+ assert_raises(TypeError, np.less, a, b, casting='same_kind')
+
+ def test_datetime_like(self):
+ a = np.array([3], dtype='m8[4D]')
+ b = np.array(['2012-12-21'], dtype='M8[D]')
+
+ assert_equal(np.ones_like(a).dtype, a.dtype)
+ assert_equal(np.zeros_like(a).dtype, a.dtype)
+ assert_equal(np.empty_like(a).dtype, a.dtype)
+ assert_equal(np.ones_like(b).dtype, b.dtype)
+ assert_equal(np.zeros_like(b).dtype, b.dtype)
+ assert_equal(np.empty_like(b).dtype, b.dtype)
+
+ def test_datetime_unary(self):
+ for tda, tdb, tdzero, tdone, tdmone in \
+ [
+ # One-dimensional arrays
+ (np.array([3], dtype='m8[D]'),
+ np.array([-3], dtype='m8[D]'),
+ np.array([0], dtype='m8[D]'),
+ np.array([1], dtype='m8[D]'),
+ np.array([-1], dtype='m8[D]')),
+ # NumPy scalars
+ (np.timedelta64(3, '[D]'),
+ np.timedelta64(-3, '[D]'),
+ np.timedelta64(0, '[D]'),
+ np.timedelta64(1, '[D]'),
+ np.timedelta64(-1, '[D]'))]:
+ # negative ufunc
+ assert_equal(-tdb, tda)
+ assert_equal((-tdb).dtype, tda.dtype)
+ assert_equal(np.negative(tdb), tda)
+ assert_equal(np.negative(tdb).dtype, tda.dtype)
+
+ # positive ufunc
+ assert_equal(np.positive(tda), tda)
+ assert_equal(np.positive(tda).dtype, tda.dtype)
+ assert_equal(np.positive(tdb), tdb)
+ assert_equal(np.positive(tdb).dtype, tdb.dtype)
+
+ # absolute ufunc
+ assert_equal(np.absolute(tdb), tda)
+ assert_equal(np.absolute(tdb).dtype, tda.dtype)
+
+ # sign ufunc
+ assert_equal(np.sign(tda), tdone)
+ assert_equal(np.sign(tdb), tdmone)
+ assert_equal(np.sign(tdzero), tdzero)
+ assert_equal(np.sign(tda).dtype, tda.dtype)
+
+ # The ufuncs always produce native-endian results
+ assert_
+
+ def test_datetime_add(self):
+ for dta, dtb, dtc, dtnat, tda, tdb, tdc in \
+ [
+ # One-dimensional arrays
+ (np.array(['2012-12-21'], dtype='M8[D]'),
+ np.array(['2012-12-24'], dtype='M8[D]'),
+ np.array(['2012-12-21T11'], dtype='M8[h]'),
+ np.array(['NaT'], dtype='M8[D]'),
+ np.array([3], dtype='m8[D]'),
+ np.array([11], dtype='m8[h]'),
+ np.array([3*24 + 11], dtype='m8[h]')),
+ # NumPy scalars
+ (np.datetime64('2012-12-21', '[D]'),
+ np.datetime64('2012-12-24', '[D]'),
+ np.datetime64('2012-12-21T11', '[h]'),
+ np.datetime64('NaT', '[D]'),
+ np.timedelta64(3, '[D]'),
+ np.timedelta64(11, '[h]'),
+ np.timedelta64(3*24 + 11, '[h]'))]:
+ # m8 + m8
+ assert_equal(tda + tdb, tdc)
+ assert_equal((tda + tdb).dtype, np.dtype('m8[h]'))
+ # m8 + bool
+ assert_equal(tdb + True, tdb + 1)
+ assert_equal((tdb + True).dtype, np.dtype('m8[h]'))
+ # m8 + int
+ assert_equal(tdb + 3*24, tdc)
+ assert_equal((tdb + 3*24).dtype, np.dtype('m8[h]'))
+ # bool + m8
+ assert_equal(False + tdb, tdb)
+ assert_equal((False + tdb).dtype, np.dtype('m8[h]'))
+ # int + m8
+ assert_equal(3*24 + tdb, tdc)
+ assert_equal((3*24 + tdb).dtype, np.dtype('m8[h]'))
+ # M8 + bool
+ assert_equal(dta + True, dta + 1)
+ assert_equal(dtnat + True, dtnat)
+ assert_equal((dta + True).dtype, np.dtype('M8[D]'))
+ # M8 + int
+ assert_equal(dta + 3, dtb)
+ assert_equal(dtnat + 3, dtnat)
+ assert_equal((dta + 3).dtype, np.dtype('M8[D]'))
+ # bool + M8
+ assert_equal(False + dta, dta)
+ assert_equal(False + dtnat, dtnat)
+ assert_equal((False + dta).dtype, np.dtype('M8[D]'))
+ # int + M8
+ assert_equal(3 + dta, dtb)
+ assert_equal(3 + dtnat, dtnat)
+ assert_equal((3 + dta).dtype, np.dtype('M8[D]'))
+ # M8 + m8
+ assert_equal(dta + tda, dtb)
+ assert_equal(dtnat + tda, dtnat)
+ assert_equal((dta + tda).dtype, np.dtype('M8[D]'))
+ # m8 + M8
+ assert_equal(tda + dta, dtb)
+ assert_equal(tda + dtnat, dtnat)
+ assert_equal((tda + dta).dtype, np.dtype('M8[D]'))
+
+ # In M8 + m8, the result goes to higher precision
+ assert_equal(np.add(dta, tdb, casting='unsafe'), dtc)
+ assert_equal(np.add(dta, tdb, casting='unsafe').dtype,
+ np.dtype('M8[h]'))
+ assert_equal(np.add(tdb, dta, casting='unsafe'), dtc)
+ assert_equal(np.add(tdb, dta, casting='unsafe').dtype,
+ np.dtype('M8[h]'))
+
+ # M8 + M8
+ assert_raises(TypeError, np.add, dta, dtb)
+
+ def test_datetime_subtract(self):
+ for dta, dtb, dtc, dtd, dte, dtnat, tda, tdb, tdc in \
+ [
+ # One-dimensional arrays
+ (np.array(['2012-12-21'], dtype='M8[D]'),
+ np.array(['2012-12-24'], dtype='M8[D]'),
+ np.array(['1940-12-24'], dtype='M8[D]'),
+ np.array(['1940-12-24T00'], dtype='M8[h]'),
+ np.array(['1940-12-23T13'], dtype='M8[h]'),
+ np.array(['NaT'], dtype='M8[D]'),
+ np.array([3], dtype='m8[D]'),
+ np.array([11], dtype='m8[h]'),
+ np.array([3*24 - 11], dtype='m8[h]')),
+ # NumPy scalars
+ (np.datetime64('2012-12-21', '[D]'),
+ np.datetime64('2012-12-24', '[D]'),
+ np.datetime64('1940-12-24', '[D]'),
+ np.datetime64('1940-12-24T00', '[h]'),
+ np.datetime64('1940-12-23T13', '[h]'),
+ np.datetime64('NaT', '[D]'),
+ np.timedelta64(3, '[D]'),
+ np.timedelta64(11, '[h]'),
+ np.timedelta64(3*24 - 11, '[h]'))]:
+ # m8 - m8
+ assert_equal(tda - tdb, tdc)
+ assert_equal((tda - tdb).dtype, np.dtype('m8[h]'))
+ assert_equal(tdb - tda, -tdc)
+ assert_equal((tdb - tda).dtype, np.dtype('m8[h]'))
+ # m8 - bool
+ assert_equal(tdc - True, tdc - 1)
+ assert_equal((tdc - True).dtype, np.dtype('m8[h]'))
+ # m8 - int
+ assert_equal(tdc - 3*24, -tdb)
+ assert_equal((tdc - 3*24).dtype, np.dtype('m8[h]'))
+            # bool - m8
+ assert_equal(False - tdb, -tdb)
+ assert_equal((False - tdb).dtype, np.dtype('m8[h]'))
+ # int - m8
+ assert_equal(3*24 - tdb, tdc)
+ assert_equal((3*24 - tdb).dtype, np.dtype('m8[h]'))
+ # M8 - bool
+ assert_equal(dtb - True, dtb - 1)
+ assert_equal(dtnat - True, dtnat)
+ assert_equal((dtb - True).dtype, np.dtype('M8[D]'))
+ # M8 - int
+ assert_equal(dtb - 3, dta)
+ assert_equal(dtnat - 3, dtnat)
+ assert_equal((dtb - 3).dtype, np.dtype('M8[D]'))
+ # M8 - m8
+ assert_equal(dtb - tda, dta)
+ assert_equal(dtnat - tda, dtnat)
+ assert_equal((dtb - tda).dtype, np.dtype('M8[D]'))
+
+ # In M8 - m8, the result goes to higher precision
+ assert_equal(np.subtract(dtc, tdb, casting='unsafe'), dte)
+ assert_equal(np.subtract(dtc, tdb, casting='unsafe').dtype,
+ np.dtype('M8[h]'))
+
+            # M8 - M8 with different units goes to higher precision
+ assert_equal(np.subtract(dtc, dtd, casting='unsafe'),
+ np.timedelta64(0, 'h'))
+ assert_equal(np.subtract(dtc, dtd, casting='unsafe').dtype,
+ np.dtype('m8[h]'))
+ assert_equal(np.subtract(dtd, dtc, casting='unsafe'),
+ np.timedelta64(0, 'h'))
+ assert_equal(np.subtract(dtd, dtc, casting='unsafe').dtype,
+ np.dtype('m8[h]'))
+
+ # m8 - M8
+ assert_raises(TypeError, np.subtract, tda, dta)
+ # bool - M8
+ assert_raises(TypeError, np.subtract, False, dta)
+ # int - M8
+ assert_raises(TypeError, np.subtract, 3, dta)
+
+ def test_datetime_multiply(self):
+ for dta, tda, tdb, tdc in \
+ [
+ # One-dimensional arrays
+ (np.array(['2012-12-21'], dtype='M8[D]'),
+ np.array([6], dtype='m8[h]'),
+ np.array([9], dtype='m8[h]'),
+ np.array([12], dtype='m8[h]')),
+ # NumPy scalars
+ (np.datetime64('2012-12-21', '[D]'),
+ np.timedelta64(6, '[h]'),
+ np.timedelta64(9, '[h]'),
+ np.timedelta64(12, '[h]'))]:
+ # m8 * int
+ assert_equal(tda * 2, tdc)
+ assert_equal((tda * 2).dtype, np.dtype('m8[h]'))
+ # int * m8
+ assert_equal(2 * tda, tdc)
+ assert_equal((2 * tda).dtype, np.dtype('m8[h]'))
+ # m8 * float
+ assert_equal(tda * 1.5, tdb)
+ assert_equal((tda * 1.5).dtype, np.dtype('m8[h]'))
+ # float * m8
+ assert_equal(1.5 * tda, tdb)
+ assert_equal((1.5 * tda).dtype, np.dtype('m8[h]'))
+
+ # m8 * m8
+ assert_raises(TypeError, np.multiply, tda, tdb)
+ # m8 * M8
+ assert_raises(TypeError, np.multiply, dta, tda)
+ # M8 * m8
+ assert_raises(TypeError, np.multiply, tda, dta)
+ # M8 * int
+ assert_raises(TypeError, np.multiply, dta, 2)
+ # int * M8
+ assert_raises(TypeError, np.multiply, 2, dta)
+ # M8 * float
+ assert_raises(TypeError, np.multiply, dta, 1.5)
+ # float * M8
+ assert_raises(TypeError, np.multiply, 1.5, dta)
+
+ # NaTs
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in multiply")
+ nat = np.timedelta64('NaT')
+ def check(a, b, res):
+ assert_equal(a * b, res)
+ assert_equal(b * a, res)
+ for tp in (int, float):
+ check(nat, tp(2), nat)
+ check(nat, tp(0), nat)
+ for f in (float('inf'), float('nan')):
+ check(np.timedelta64(1), f, nat)
+ check(np.timedelta64(0), f, nat)
+ check(nat, f, nat)
+
+ @pytest.mark.parametrize("op1, op2, exp", [
+ # m8 same units round down
+ (np.timedelta64(7, 's'),
+ np.timedelta64(4, 's'),
+ 1),
+ # m8 same units round down with negative
+ (np.timedelta64(7, 's'),
+ np.timedelta64(-4, 's'),
+ -2),
+ # m8 same units negative no round down
+ (np.timedelta64(8, 's'),
+ np.timedelta64(-4, 's'),
+ -2),
+ # m8 different units
+ (np.timedelta64(1, 'm'),
+ np.timedelta64(31, 's'),
+ 1),
+ # m8 generic units
+ (np.timedelta64(1890),
+ np.timedelta64(31),
+ 60),
+ # Y // M works
+ (np.timedelta64(2, 'Y'),
+ np.timedelta64('13', 'M'),
+ 1),
+ # handle 1D arrays
+ (np.array([1, 2, 3], dtype='m8'),
+ np.array([2], dtype='m8'),
+ np.array([0, 1, 1], dtype=np.int64)),
+ ])
+ def test_timedelta_floor_divide(self, op1, op2, exp):
+ assert_equal(op1 // op2, exp)
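+        # Floor division follows Python's floor semantics, e.g. 7s // -4s
+        # rounds toward negative infinity to give -2, and Y // M is allowed
+        # because 2Y is exactly 24M.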
+
+ @pytest.mark.parametrize("op1, op2", [
+ # div by 0
+ (np.timedelta64(10, 'us'),
+ np.timedelta64(0, 'us')),
+ # div with NaT
+ (np.timedelta64('NaT'),
+ np.timedelta64(50, 'us')),
+ # special case for int64 min
+ # in integer floor division
+ (np.timedelta64(np.iinfo(np.int64).min),
+ np.timedelta64(-1)),
+ ])
+ def test_timedelta_floor_div_warnings(self, op1, op2):
+ with assert_warns(RuntimeWarning):
+ actual = op1 // op2
+ assert_equal(actual, 0)
+ assert_equal(actual.dtype, np.int64)
+
+ @pytest.mark.parametrize("val1, val2", [
+ # the smallest integer that can't be represented
+ # exactly in a double should be preserved if we avoid
+ # casting to double in floordiv operation
+ (9007199254740993, 1),
+ # stress the alternate floordiv code path where
+ # operand signs don't match and remainder isn't 0
+ (9007199254740999, -2),
+ ])
+ def test_timedelta_floor_div_precision(self, val1, val2):
+ op1 = np.timedelta64(val1)
+ op2 = np.timedelta64(val2)
+ actual = op1 // op2
+ # Python reference integer floor
+ expected = val1 // val2
+ assert_equal(actual, expected)
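+        # (2**53 + 1 == 9007199254740993 is the first integer a float64
+        # cannot represent exactly, so a float round-trip would alter it)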
+
+ @pytest.mark.parametrize("val1, val2", [
+ # years and months sometimes can't be unambiguously
+ # divided for floor division operation
+ (np.timedelta64(7, 'Y'),
+ np.timedelta64(3, 's')),
+ (np.timedelta64(7, 'M'),
+ np.timedelta64(1, 'D')),
+ ])
+ def test_timedelta_floor_div_error(self, val1, val2):
+ with assert_raises_regex(TypeError, "common metadata divisor"):
+ val1 // val2
+
+ @pytest.mark.parametrize("op1, op2", [
+ # reuse the test cases from floordiv
+ (np.timedelta64(7, 's'),
+ np.timedelta64(4, 's')),
+ # m8 same units round down with negative
+ (np.timedelta64(7, 's'),
+ np.timedelta64(-4, 's')),
+ # m8 same units negative no round down
+ (np.timedelta64(8, 's'),
+ np.timedelta64(-4, 's')),
+ # m8 different units
+ (np.timedelta64(1, 'm'),
+ np.timedelta64(31, 's')),
+ # m8 generic units
+ (np.timedelta64(1890),
+ np.timedelta64(31)),
+ # Y // M works
+ (np.timedelta64(2, 'Y'),
+ np.timedelta64('13', 'M')),
+ # handle 1D arrays
+ (np.array([1, 2, 3], dtype='m8'),
+ np.array([2], dtype='m8')),
+ ])
+ def test_timedelta_divmod(self, op1, op2):
+ expected = (op1 // op2, op1 % op2)
+ assert_equal(divmod(op1, op2), expected)
+
+ @pytest.mark.parametrize("op1, op2", [
+ # reuse cases from floordiv
+ # div by 0
+ (np.timedelta64(10, 'us'),
+ np.timedelta64(0, 'us')),
+ # div with NaT
+ (np.timedelta64('NaT'),
+ np.timedelta64(50, 'us')),
+ # special case for int64 min
+ # in integer floor division
+ (np.timedelta64(np.iinfo(np.int64).min),
+ np.timedelta64(-1)),
+ ])
+ def test_timedelta_divmod_warnings(self, op1, op2):
+ with assert_warns(RuntimeWarning):
+ expected = (op1 // op2, op1 % op2)
+ with assert_warns(RuntimeWarning):
+ actual = divmod(op1, op2)
+ assert_equal(actual, expected)
+
+ def test_datetime_divide(self):
+ for dta, tda, tdb, tdc, tdd in \
+ [
+ # One-dimensional arrays
+ (np.array(['2012-12-21'], dtype='M8[D]'),
+ np.array([6], dtype='m8[h]'),
+ np.array([9], dtype='m8[h]'),
+ np.array([12], dtype='m8[h]'),
+ np.array([6], dtype='m8[m]')),
+ # NumPy scalars
+ (np.datetime64('2012-12-21', '[D]'),
+ np.timedelta64(6, '[h]'),
+ np.timedelta64(9, '[h]'),
+ np.timedelta64(12, '[h]'),
+ np.timedelta64(6, '[m]'))]:
+ # m8 / int
+ assert_equal(tdc / 2, tda)
+ assert_equal((tdc / 2).dtype, np.dtype('m8[h]'))
+ # m8 / float
+ assert_equal(tda / 0.5, tdc)
+ assert_equal((tda / 0.5).dtype, np.dtype('m8[h]'))
+ # m8 / m8
+ assert_equal(tda / tdb, 6.0 / 9.0)
+ assert_equal(np.divide(tda, tdb), 6.0 / 9.0)
+ assert_equal(np.true_divide(tda, tdb), 6.0 / 9.0)
+ assert_equal(tdb / tda, 9.0 / 6.0)
+ assert_equal((tda / tdb).dtype, np.dtype('f8'))
+ assert_equal(tda / tdd, 60.0)
+ assert_equal(tdd / tda, 1.0 / 60.0)
+
+ # int / m8
+ assert_raises(TypeError, np.divide, 2, tdb)
+ # float / m8
+ assert_raises(TypeError, np.divide, 0.5, tdb)
+ # m8 / M8
+ assert_raises(TypeError, np.divide, dta, tda)
+ # M8 / m8
+ assert_raises(TypeError, np.divide, tda, dta)
+ # M8 / int
+ assert_raises(TypeError, np.divide, dta, 2)
+ # int / M8
+ assert_raises(TypeError, np.divide, 2, dta)
+ # M8 / float
+ assert_raises(TypeError, np.divide, dta, 1.5)
+ # float / M8
+ assert_raises(TypeError, np.divide, 1.5, dta)
+
+ # NaTs
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, r".*encountered in true\_divide")
+ nat = np.timedelta64('NaT')
+ for tp in (int, float):
+ assert_equal(np.timedelta64(1) / tp(0), nat)
+ assert_equal(np.timedelta64(0) / tp(0), nat)
+ assert_equal(nat / tp(0), nat)
+ assert_equal(nat / tp(2), nat)
+ # Division by inf
+ assert_equal(np.timedelta64(1) / float('inf'), np.timedelta64(0))
+ assert_equal(np.timedelta64(0) / float('inf'), np.timedelta64(0))
+ assert_equal(nat / float('inf'), nat)
+ # Division by nan
+ assert_equal(np.timedelta64(1) / float('nan'), nat)
+ assert_equal(np.timedelta64(0) / float('nan'), nat)
+ assert_equal(nat / float('nan'), nat)
+
+ def test_datetime_compare(self):
+ # Test all the comparison operators
+ a = np.datetime64('2000-03-12T18:00:00.000000')
+ b = np.array(['2000-03-12T18:00:00.000000',
+ '2000-03-12T17:59:59.999999',
+ '2000-03-12T18:00:00.000001',
+ '1970-01-11T12:00:00.909090',
+ '2016-01-11T12:00:00.909090'],
+ dtype='datetime64[us]')
+ assert_equal(np.equal(a, b), [1, 0, 0, 0, 0])
+ assert_equal(np.not_equal(a, b), [0, 1, 1, 1, 1])
+ assert_equal(np.less(a, b), [0, 0, 1, 0, 1])
+ assert_equal(np.less_equal(a, b), [1, 0, 1, 0, 1])
+ assert_equal(np.greater(a, b), [0, 1, 0, 1, 0])
+ assert_equal(np.greater_equal(a, b), [1, 1, 0, 1, 0])
+
+ def test_datetime_compare_nat(self):
+ dt_nat = np.datetime64('NaT', 'D')
+ dt_other = np.datetime64('2000-01-01')
+ td_nat = np.timedelta64('NaT', 'h')
+ td_other = np.timedelta64(1, 'h')
+
+ for op in [np.equal, np.less, np.less_equal,
+ np.greater, np.greater_equal]:
+ assert_(not op(dt_nat, dt_nat))
+ assert_(not op(dt_nat, dt_other))
+ assert_(not op(dt_other, dt_nat))
+
+ assert_(not op(td_nat, td_nat))
+ assert_(not op(td_nat, td_other))
+ assert_(not op(td_other, td_nat))
+
+ assert_(np.not_equal(dt_nat, dt_nat))
+ assert_(np.not_equal(dt_nat, dt_other))
+ assert_(np.not_equal(dt_other, dt_nat))
+
+ assert_(np.not_equal(td_nat, td_nat))
+ assert_(np.not_equal(td_nat, td_other))
+ assert_(np.not_equal(td_other, td_nat))
+
+ def test_datetime_minmax(self):
+ # The metadata of the result should become the GCD
+ # of the operand metadata
+ a = np.array('1999-03-12T13', dtype='M8[2m]')
+ b = np.array('1999-03-12T12', dtype='M8[s]')
+ assert_equal(np.minimum(a, b), b)
+ assert_equal(np.minimum(a, b).dtype, np.dtype('M8[s]'))
+ assert_equal(np.fmin(a, b), b)
+ assert_equal(np.fmin(a, b).dtype, np.dtype('M8[s]'))
+ assert_equal(np.maximum(a, b), a)
+ assert_equal(np.maximum(a, b).dtype, np.dtype('M8[s]'))
+ assert_equal(np.fmax(a, b), a)
+ assert_equal(np.fmax(a, b).dtype, np.dtype('M8[s]'))
+ # Viewed as integers, the comparison is opposite because
+ # of the units chosen
+ assert_equal(np.minimum(a.view('i8'), b.view('i8')), a.view('i8'))
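+        # (a is later in time, but counted in 2-minute units its integer
+        # value is much smaller than b's value counted in seconds)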
+
+ # Interaction with NaT
+ a = np.array('1999-03-12T13', dtype='M8[2m]')
+ dtnat = np.array('NaT', dtype='M8[h]')
+ assert_equal(np.minimum(a, dtnat), a)
+ assert_equal(np.minimum(dtnat, a), a)
+ assert_equal(np.maximum(a, dtnat), a)
+ assert_equal(np.maximum(dtnat, a), a)
+
+ # Also do timedelta
+ a = np.array(3, dtype='m8[h]')
+ b = np.array(3*3600 - 3, dtype='m8[s]')
+ assert_equal(np.minimum(a, b), b)
+ assert_equal(np.minimum(a, b).dtype, np.dtype('m8[s]'))
+ assert_equal(np.fmin(a, b), b)
+ assert_equal(np.fmin(a, b).dtype, np.dtype('m8[s]'))
+ assert_equal(np.maximum(a, b), a)
+ assert_equal(np.maximum(a, b).dtype, np.dtype('m8[s]'))
+ assert_equal(np.fmax(a, b), a)
+ assert_equal(np.fmax(a, b).dtype, np.dtype('m8[s]'))
+ # Viewed as integers, the comparison is opposite because
+ # of the units chosen
+ assert_equal(np.minimum(a.view('i8'), b.view('i8')), a.view('i8'))
+
+ # should raise between datetime and timedelta
+ #
+ # TODO: Allowing unsafe casting by
+ # default in ufuncs strikes again... :(
+ a = np.array(3, dtype='m8[h]')
+ b = np.array('1999-03-12T12', dtype='M8[s]')
+ #assert_raises(TypeError, np.minimum, a, b)
+ #assert_raises(TypeError, np.maximum, a, b)
+ #assert_raises(TypeError, np.fmin, a, b)
+ #assert_raises(TypeError, np.fmax, a, b)
+ assert_raises(TypeError, np.minimum, a, b, casting='same_kind')
+ assert_raises(TypeError, np.maximum, a, b, casting='same_kind')
+ assert_raises(TypeError, np.fmin, a, b, casting='same_kind')
+ assert_raises(TypeError, np.fmax, a, b, casting='same_kind')
+
+ def test_hours(self):
+ t = np.ones(3, dtype='M8[s]')
+ t[0] = 60*60*24 + 60*60*10
+ assert_(t[0].item().hour == 10)
+
+ def test_divisor_conversion_year(self):
+ assert_(np.dtype('M8[Y/4]') == np.dtype('M8[3M]'))
+ assert_(np.dtype('M8[Y/13]') == np.dtype('M8[4W]'))
+ assert_(np.dtype('M8[3Y/73]') == np.dtype('M8[15D]'))
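+        # Divisor conversion relies on the nominal equivalences 1Y == 12M,
+        # 1Y == 52W and 1Y == 365D, e.g. 3Y/73 == 1095D/73 == 15D.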
+
+ def test_divisor_conversion_month(self):
+ assert_(np.dtype('M8[M/2]') == np.dtype('M8[2W]'))
+ assert_(np.dtype('M8[M/15]') == np.dtype('M8[2D]'))
+ assert_(np.dtype('M8[3M/40]') == np.dtype('M8[54h]'))
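+        # Here a month is nominally 4W, 30D or 720h as needed,
+        # e.g. 3M/40 == 2160h/40 == 54h.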
+
+ def test_divisor_conversion_week(self):
+ assert_(np.dtype('m8[W/7]') == np.dtype('m8[D]'))
+ assert_(np.dtype('m8[3W/14]') == np.dtype('m8[36h]'))
+ assert_(np.dtype('m8[5W/140]') == np.dtype('m8[360m]'))
+
+ def test_divisor_conversion_day(self):
+ assert_(np.dtype('M8[D/12]') == np.dtype('M8[2h]'))
+ assert_(np.dtype('M8[D/120]') == np.dtype('M8[12m]'))
+ assert_(np.dtype('M8[3D/960]') == np.dtype('M8[270s]'))
+
+ def test_divisor_conversion_hour(self):
+ assert_(np.dtype('m8[h/30]') == np.dtype('m8[2m]'))
+ assert_(np.dtype('m8[3h/300]') == np.dtype('m8[36s]'))
+
+ def test_divisor_conversion_minute(self):
+ assert_(np.dtype('m8[m/30]') == np.dtype('m8[2s]'))
+ assert_(np.dtype('m8[3m/300]') == np.dtype('m8[600ms]'))
+
+ def test_divisor_conversion_second(self):
+ assert_(np.dtype('m8[s/100]') == np.dtype('m8[10ms]'))
+ assert_(np.dtype('m8[3s/10000]') == np.dtype('m8[300us]'))
+
+ def test_divisor_conversion_fs(self):
+ assert_(np.dtype('M8[fs/100]') == np.dtype('M8[10as]'))
+ assert_raises(ValueError, lambda: np.dtype('M8[3fs/10000]'))
+
+ def test_divisor_conversion_as(self):
+ assert_raises(ValueError, lambda: np.dtype('M8[as/10]'))
+
+ def test_string_parser_variants(self):
+ # Allow space instead of 'T' between date and time
+ assert_equal(np.array(['1980-02-29T01:02:03'], np.dtype('M8[s]')),
+ np.array(['1980-02-29 01:02:03'], np.dtype('M8[s]')))
+        # Allow an explicit '+' sign on the year
+ assert_equal(np.array(['+1980-02-29T01:02:03'], np.dtype('M8[s]')),
+ np.array(['+1980-02-29 01:02:03'], np.dtype('M8[s]')))
+ # Allow negative years
+ assert_equal(np.array(['-1980-02-29T01:02:03'], np.dtype('M8[s]')),
+ np.array(['-1980-02-29 01:02:03'], np.dtype('M8[s]')))
+ # UTC specifier
+ with assert_warns(DeprecationWarning):
+ assert_equal(
+ np.array(['+1980-02-29T01:02:03'], np.dtype('M8[s]')),
+ np.array(['+1980-02-29 01:02:03Z'], np.dtype('M8[s]')))
+ with assert_warns(DeprecationWarning):
+ assert_equal(
+ np.array(['-1980-02-29T01:02:03'], np.dtype('M8[s]')),
+ np.array(['-1980-02-29 01:02:03Z'], np.dtype('M8[s]')))
+ # Time zone offset
+ with assert_warns(DeprecationWarning):
+ assert_equal(
+ np.array(['1980-02-29T02:02:03'], np.dtype('M8[s]')),
+ np.array(['1980-02-29 00:32:03-0130'], np.dtype('M8[s]')))
+ with assert_warns(DeprecationWarning):
+ assert_equal(
+ np.array(['1980-02-28T22:32:03'], np.dtype('M8[s]')),
+ np.array(['1980-02-29 00:02:03+01:30'], np.dtype('M8[s]')))
+ with assert_warns(DeprecationWarning):
+ assert_equal(
+ np.array(['1980-02-29T02:32:03.506'], np.dtype('M8[s]')),
+ np.array(['1980-02-29 00:32:03.506-02'], np.dtype('M8[s]')))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.datetime64('1977-03-02T12:30-0230'),
+ np.datetime64('1977-03-02T15:00'))
+
+ def test_string_parser_error_check(self):
+ # Arbitrary bad string
+ assert_raises(ValueError, np.array, ['badvalue'], np.dtype('M8[us]'))
+ # Character after year must be '-'
+ assert_raises(ValueError, np.array, ['1980X'], np.dtype('M8[us]'))
+ # Cannot have trailing '-'
+ assert_raises(ValueError, np.array, ['1980-'], np.dtype('M8[us]'))
+ # Month must be in range [1,12]
+ assert_raises(ValueError, np.array, ['1980-00'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-13'], np.dtype('M8[us]'))
+ # Month must have two digits
+ assert_raises(ValueError, np.array, ['1980-1'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-1-02'], np.dtype('M8[us]'))
+ # 'Mor' is not a valid month
+ assert_raises(ValueError, np.array, ['1980-Mor'], np.dtype('M8[us]'))
+ # Cannot have trailing '-'
+ assert_raises(ValueError, np.array, ['1980-01-'], np.dtype('M8[us]'))
+ # Day must be in range [1,len(month)]
+ assert_raises(ValueError, np.array, ['1980-01-0'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-01-00'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-01-32'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1979-02-29'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-30'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-03-32'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-04-31'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-05-32'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-06-31'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-07-32'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-08-32'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-09-31'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-10-32'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-11-31'], np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-12-32'], np.dtype('M8[us]'))
+ # Cannot have trailing characters
+ assert_raises(ValueError, np.array, ['1980-02-03%'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03 q'],
+ np.dtype('M8[us]'))
+
+ # Hours must be in range [0, 23]
+ assert_raises(ValueError, np.array, ['1980-02-03 25'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03T25'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03 24:01'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03T24:01'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03 -1'],
+ np.dtype('M8[us]'))
+ # No trailing ':'
+ assert_raises(ValueError, np.array, ['1980-02-03 01:'],
+ np.dtype('M8[us]'))
+ # Minutes must be in range [0, 59]
+ assert_raises(ValueError, np.array, ['1980-02-03 01:-1'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03 01:60'],
+ np.dtype('M8[us]'))
+ # No trailing ':'
+ assert_raises(ValueError, np.array, ['1980-02-03 01:60:'],
+ np.dtype('M8[us]'))
+ # Seconds must be in range [0, 59]
+ assert_raises(ValueError, np.array, ['1980-02-03 01:10:-1'],
+ np.dtype('M8[us]'))
+ assert_raises(ValueError, np.array, ['1980-02-03 01:01:60'],
+ np.dtype('M8[us]'))
+        # Timezone offset must be within a reasonable range
+ with assert_warns(DeprecationWarning):
+ assert_raises(ValueError, np.array, ['1980-02-03 01:01:00+0661'],
+ np.dtype('M8[us]'))
+ with assert_warns(DeprecationWarning):
+ assert_raises(ValueError, np.array, ['1980-02-03 01:01:00+2500'],
+ np.dtype('M8[us]'))
+ with assert_warns(DeprecationWarning):
+ assert_raises(ValueError, np.array, ['1980-02-03 01:01:00-0070'],
+ np.dtype('M8[us]'))
+ with assert_warns(DeprecationWarning):
+ assert_raises(ValueError, np.array, ['1980-02-03 01:01:00-3000'],
+ np.dtype('M8[us]'))
+ with assert_warns(DeprecationWarning):
+ assert_raises(ValueError, np.array, ['1980-02-03 01:01:00-25:00'],
+ np.dtype('M8[us]'))
+
+ def test_creation_overflow(self):
+ date = '1980-03-23 20:00:00'
+ timesteps = np.array([date], dtype='datetime64[s]')[0].astype(np.int64)
+ for unit in ['ms', 'us', 'ns']:
+ timesteps *= 1000
+ x = np.array([date], dtype='datetime64[%s]' % unit)
+
+ assert_equal(timesteps, x[0].astype(np.int64),
+ err_msg='Datetime conversion error for unit %s' % unit)
+
+ assert_equal(x[0].astype(np.int64), 322689600000000000)
+
+ # gh-13062
+ with pytest.raises(OverflowError):
+ np.datetime64(2**64, 'D')
+ with pytest.raises(OverflowError):
+ np.timedelta64(2**64, 'D')
+
+ def test_datetime_as_string(self):
+ # Check all the units with default string conversion
+ date = '1959-10-13'
+ datetime = '1959-10-13T12:34:56.789012345678901234'
+
+ assert_equal(np.datetime_as_string(np.datetime64(date, 'Y')),
+ '1959')
+ assert_equal(np.datetime_as_string(np.datetime64(date, 'M')),
+ '1959-10')
+ assert_equal(np.datetime_as_string(np.datetime64(date, 'D')),
+ '1959-10-13')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'h')),
+ '1959-10-13T12')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'm')),
+ '1959-10-13T12:34')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 's')),
+ '1959-10-13T12:34:56')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ms')),
+ '1959-10-13T12:34:56.789')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'us')),
+ '1959-10-13T12:34:56.789012')
+
+ datetime = '1969-12-31T23:34:56.789012345678901234'
+
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ns')),
+ '1969-12-31T23:34:56.789012345')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ps')),
+ '1969-12-31T23:34:56.789012345678')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'fs')),
+ '1969-12-31T23:34:56.789012345678901')
+
+ datetime = '1969-12-31T23:59:57.789012345678901234'
+
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'as')),
+ datetime)
+ datetime = '1970-01-01T00:34:56.789012345678901234'
+
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ns')),
+ '1970-01-01T00:34:56.789012345')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ps')),
+ '1970-01-01T00:34:56.789012345678')
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'fs')),
+ '1970-01-01T00:34:56.789012345678901')
+
+ datetime = '1970-01-01T00:00:05.789012345678901234'
+
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, 'as')),
+ datetime)
+
+ # String conversion with the unit= parameter
+ a = np.datetime64('2032-07-18T12:23:34.123456', 'us')
+ assert_equal(np.datetime_as_string(a, unit='Y', casting='unsafe'),
+ '2032')
+ assert_equal(np.datetime_as_string(a, unit='M', casting='unsafe'),
+ '2032-07')
+ assert_equal(np.datetime_as_string(a, unit='W', casting='unsafe'),
+ '2032-07-18')
+ assert_equal(np.datetime_as_string(a, unit='D', casting='unsafe'),
+ '2032-07-18')
+ assert_equal(np.datetime_as_string(a, unit='h'), '2032-07-18T12')
+ assert_equal(np.datetime_as_string(a, unit='m'),
+ '2032-07-18T12:23')
+ assert_equal(np.datetime_as_string(a, unit='s'),
+ '2032-07-18T12:23:34')
+ assert_equal(np.datetime_as_string(a, unit='ms'),
+ '2032-07-18T12:23:34.123')
+ assert_equal(np.datetime_as_string(a, unit='us'),
+ '2032-07-18T12:23:34.123456')
+ assert_equal(np.datetime_as_string(a, unit='ns'),
+ '2032-07-18T12:23:34.123456000')
+ assert_equal(np.datetime_as_string(a, unit='ps'),
+ '2032-07-18T12:23:34.123456000000')
+ assert_equal(np.datetime_as_string(a, unit='fs'),
+ '2032-07-18T12:23:34.123456000000000')
+ assert_equal(np.datetime_as_string(a, unit='as'),
+ '2032-07-18T12:23:34.123456000000000000')
+
+ # unit='auto' parameter
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-18T12:23:34.123456', 'us'), unit='auto'),
+ '2032-07-18T12:23:34.123456')
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-18T12:23:34.12', 'us'), unit='auto'),
+ '2032-07-18T12:23:34.120')
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-18T12:23:34', 'us'), unit='auto'),
+ '2032-07-18T12:23:34')
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-18T12:23:00', 'us'), unit='auto'),
+ '2032-07-18T12:23')
+ # 'auto' doesn't split up hour and minute
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-18T12:00:00', 'us'), unit='auto'),
+ '2032-07-18T12:00')
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-18T00:00:00', 'us'), unit='auto'),
+ '2032-07-18')
+ # 'auto' doesn't split up the date
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-07-01T00:00:00', 'us'), unit='auto'),
+ '2032-07-01')
+ assert_equal(np.datetime_as_string(
+ np.datetime64('2032-01-01T00:00:00', 'us'), unit='auto'),
+ '2032-01-01')
+
+ @pytest.mark.skipif(not _has_pytz, reason="The pytz module is not available.")
+ def test_datetime_as_string_timezone(self):
+ # timezone='local' vs 'UTC'
+ a = np.datetime64('2010-03-15T06:30', 'm')
+ assert_equal(np.datetime_as_string(a),
+ '2010-03-15T06:30')
+ assert_equal(np.datetime_as_string(a, timezone='naive'),
+ '2010-03-15T06:30')
+ assert_equal(np.datetime_as_string(a, timezone='UTC'),
+ '2010-03-15T06:30Z')
+ assert_(np.datetime_as_string(a, timezone='local') !=
+ '2010-03-15T06:30')
+
+ b = np.datetime64('2010-02-15T06:30', 'm')
+
+ assert_equal(np.datetime_as_string(a, timezone=tz('US/Central')),
+ '2010-03-15T01:30-0500')
+ assert_equal(np.datetime_as_string(a, timezone=tz('US/Eastern')),
+ '2010-03-15T02:30-0400')
+ assert_equal(np.datetime_as_string(a, timezone=tz('US/Pacific')),
+ '2010-03-14T23:30-0700')
+
+ assert_equal(np.datetime_as_string(b, timezone=tz('US/Central')),
+ '2010-02-15T00:30-0600')
+ assert_equal(np.datetime_as_string(b, timezone=tz('US/Eastern')),
+ '2010-02-15T01:30-0500')
+ assert_equal(np.datetime_as_string(b, timezone=tz('US/Pacific')),
+ '2010-02-14T22:30-0800')
+
+        # Converting dates to strings with a timezone attached is disabled by default
+ assert_raises(TypeError, np.datetime_as_string, a, unit='D',
+ timezone=tz('US/Pacific'))
+ # Check that we can print out the date in the specified time zone
+ assert_equal(np.datetime_as_string(a, unit='D',
+ timezone=tz('US/Pacific'), casting='unsafe'),
+ '2010-03-14')
+ assert_equal(np.datetime_as_string(b, unit='D',
+ timezone=tz('US/Central'), casting='unsafe'),
+ '2010-02-15')
+
+ def test_datetime_arange(self):
+ # With two datetimes provided as strings
+ a = np.arange('2010-01-05', '2010-01-10', dtype='M8[D]')
+ assert_equal(a.dtype, np.dtype('M8[D]'))
+ assert_equal(a,
+ np.array(['2010-01-05', '2010-01-06', '2010-01-07',
+ '2010-01-08', '2010-01-09'], dtype='M8[D]'))
+
+ a = np.arange('1950-02-10', '1950-02-06', -1, dtype='M8[D]')
+ assert_equal(a.dtype, np.dtype('M8[D]'))
+ assert_equal(a,
+ np.array(['1950-02-10', '1950-02-09', '1950-02-08',
+ '1950-02-07'], dtype='M8[D]'))
+
+ # Unit should be detected as months here
+ a = np.arange('1969-05', '1970-05', 2, dtype='M8')
+ assert_equal(a.dtype, np.dtype('M8[M]'))
+ assert_equal(a,
+ np.datetime64('1969-05') + np.arange(12, step=2))
+
+        # A datetime start with an integer or timedelta stop works as well;
+        # it produces arange(start, start + stop) in this case
+ a = np.arange('1969', 18, 3, dtype='M8')
+ assert_equal(a.dtype, np.dtype('M8[Y]'))
+ assert_equal(a,
+ np.datetime64('1969') + np.arange(18, step=3))
+ a = np.arange('1969-12-19', 22, np.timedelta64(2), dtype='M8')
+ assert_equal(a.dtype, np.dtype('M8[D]'))
+ assert_equal(a,
+ np.datetime64('1969-12-19') + np.arange(22, step=2))
+
+ # Step of 0 is disallowed
+ assert_raises(ValueError, np.arange, np.datetime64('today'),
+ np.datetime64('today') + 3, 0)
+ # Promotion across nonlinear unit boundaries is disallowed
+ assert_raises(TypeError, np.arange, np.datetime64('2011-03-01', 'D'),
+ np.timedelta64(5, 'M'))
+ assert_raises(TypeError, np.arange,
+ np.datetime64('2012-02-03T14', 's'),
+ np.timedelta64(5, 'Y'))
+
+ def test_datetime_arange_no_dtype(self):
+ d = np.array('2010-01-04', dtype="M8[D]")
+ assert_equal(np.arange(d, d + 1), d)
+ assert_raises(ValueError, np.arange, d)
+
+ def test_timedelta_arange(self):
+ a = np.arange(3, 10, dtype='m8')
+ assert_equal(a.dtype, np.dtype('m8'))
+ assert_equal(a, np.timedelta64(0) + np.arange(3, 10))
+
+ a = np.arange(np.timedelta64(3, 's'), 10, 2, dtype='m8')
+ assert_equal(a.dtype, np.dtype('m8[s]'))
+ assert_equal(a, np.timedelta64(0, 's') + np.arange(3, 10, 2))
+
+ # Step of 0 is disallowed
+ assert_raises(ValueError, np.arange, np.timedelta64(0),
+ np.timedelta64(5), 0)
+ # Promotion across nonlinear unit boundaries is disallowed
+ assert_raises(TypeError, np.arange, np.timedelta64(0, 'D'),
+ np.timedelta64(5, 'M'))
+ assert_raises(TypeError, np.arange, np.timedelta64(0, 'Y'),
+ np.timedelta64(5, 'D'))
+
+ @pytest.mark.parametrize("val1, val2, expected", [
+ # case from gh-12092
+ (np.timedelta64(7, 's'),
+ np.timedelta64(3, 's'),
+ np.timedelta64(1, 's')),
+ # negative value cases
+ (np.timedelta64(3, 's'),
+ np.timedelta64(-2, 's'),
+ np.timedelta64(-1, 's')),
+ (np.timedelta64(-3, 's'),
+ np.timedelta64(2, 's'),
+ np.timedelta64(1, 's')),
+ # larger value cases
+ (np.timedelta64(17, 's'),
+ np.timedelta64(22, 's'),
+ np.timedelta64(17, 's')),
+ (np.timedelta64(22, 's'),
+ np.timedelta64(17, 's'),
+ np.timedelta64(5, 's')),
+ # different units
+ (np.timedelta64(1, 'm'),
+ np.timedelta64(57, 's'),
+ np.timedelta64(3, 's')),
+ (np.timedelta64(1, 'us'),
+ np.timedelta64(727, 'ns'),
+ np.timedelta64(273, 'ns')),
+ # NaT is propagated
+ (np.timedelta64('NaT'),
+ np.timedelta64(50, 'ns'),
+ np.timedelta64('NaT')),
+ # Y % M works
+ (np.timedelta64(2, 'Y'),
+ np.timedelta64(22, 'M'),
+ np.timedelta64(2, 'M')),
+ ])
+ def test_timedelta_modulus(self, val1, val2, expected):
+ assert_equal(val1 % val2, expected)
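+        # As with floor division, Y % M is well defined because years are an
+        # exact multiple of months: 2Y == 24M and 24 % 22 == 2, i.e. 2M.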
+
+ @pytest.mark.parametrize("val1, val2", [
+ # years and months sometimes can't be unambiguously
+ # divided for modulus operation
+ (np.timedelta64(7, 'Y'),
+ np.timedelta64(3, 's')),
+ (np.timedelta64(7, 'M'),
+ np.timedelta64(1, 'D')),
+ ])
+ def test_timedelta_modulus_error(self, val1, val2):
+ with assert_raises_regex(TypeError, "common metadata divisor"):
+ val1 % val2
+
+ def test_timedelta_modulus_div_by_zero(self):
+ with assert_warns(RuntimeWarning):
+ actual = np.timedelta64(10, 's') % np.timedelta64(0, 's')
+ assert_equal(actual, np.timedelta64('NaT'))
+
+ @pytest.mark.parametrize("val1, val2", [
+ # cases where one operand is not
+ # timedelta64
+ (np.timedelta64(7, 'Y'),
+ 15,),
+ (7.5,
+ np.timedelta64(1, 'D')),
+ ])
+ def test_timedelta_modulus_type_resolution(self, val1, val2):
+ # NOTE: some of the operations may be supported
+ # in the future
+ with assert_raises_regex(TypeError,
+ "remainder cannot use operands with types"):
+ val1 % val2
+
+ def test_timedelta_arange_no_dtype(self):
+ d = np.array(5, dtype="m8[D]")
+ assert_equal(np.arange(d, d + 1), d)
+ assert_raises(ValueError, np.arange, d)
+
+ def test_datetime_maximum_reduce(self):
+ a = np.array(['2010-01-02', '1999-03-14', '1833-03'], dtype='M8[D]')
+ assert_equal(np.maximum.reduce(a).dtype, np.dtype('M8[D]'))
+ assert_equal(np.maximum.reduce(a),
+ np.datetime64('2010-01-02'))
+
+ a = np.array([1, 4, 0, 7, 2], dtype='m8[s]')
+ assert_equal(np.maximum.reduce(a).dtype, np.dtype('m8[s]'))
+ assert_equal(np.maximum.reduce(a),
+ np.timedelta64(7, 's'))
+
+ def test_datetime_busday_offset(self):
+ # First Monday in June
+ assert_equal(
+ np.busday_offset('2011-06', 0, roll='forward', weekmask='Mon'),
+ np.datetime64('2011-06-06'))
+ # Last Monday in June
+ assert_equal(
+ np.busday_offset('2011-07', -1, roll='forward', weekmask='Mon'),
+ np.datetime64('2011-06-27'))
+ assert_equal(
+ np.busday_offset('2011-07', -1, roll='forward', weekmask='Mon'),
+ np.datetime64('2011-06-27'))
+
+ # Default M-F business days, different roll modes
+ assert_equal(np.busday_offset('2010-08', 0, roll='backward'),
+ np.datetime64('2010-07-30'))
+ assert_equal(np.busday_offset('2010-08', 0, roll='preceding'),
+ np.datetime64('2010-07-30'))
+ assert_equal(np.busday_offset('2010-08', 0, roll='modifiedpreceding'),
+ np.datetime64('2010-08-02'))
+ assert_equal(np.busday_offset('2010-08', 0, roll='modifiedfollowing'),
+ np.datetime64('2010-08-02'))
+ assert_equal(np.busday_offset('2010-08', 0, roll='forward'),
+ np.datetime64('2010-08-02'))
+ assert_equal(np.busday_offset('2010-08', 0, roll='following'),
+ np.datetime64('2010-08-02'))
+ assert_equal(np.busday_offset('2010-10-30', 0, roll='following'),
+ np.datetime64('2010-11-01'))
+ assert_equal(
+ np.busday_offset('2010-10-30', 0, roll='modifiedfollowing'),
+ np.datetime64('2010-10-29'))
+ assert_equal(
+ np.busday_offset('2010-10-30', 0, roll='modifiedpreceding'),
+ np.datetime64('2010-10-29'))
+ assert_equal(
+ np.busday_offset('2010-10-16', 0, roll='modifiedfollowing'),
+ np.datetime64('2010-10-18'))
+ assert_equal(
+ np.busday_offset('2010-10-16', 0, roll='modifiedpreceding'),
+ np.datetime64('2010-10-15'))
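+        # 'modifiedfollowing' rolls forward unless that would cross into the
+        # next month, in which case it rolls backward ('modifiedpreceding' is
+        # the mirror image), hence 2010-10-30 resolves to Friday 2010-10-29.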
+ # roll='raise' by default
+ assert_raises(ValueError, np.busday_offset, '2011-06-04', 0)
+
+ # Bigger offset values
+ assert_equal(np.busday_offset('2006-02-01', 25),
+ np.datetime64('2006-03-08'))
+ assert_equal(np.busday_offset('2006-03-08', -25),
+ np.datetime64('2006-02-01'))
+ assert_equal(np.busday_offset('2007-02-25', 11, weekmask='SatSun'),
+ np.datetime64('2007-04-07'))
+ assert_equal(np.busday_offset('2007-04-07', -11, weekmask='SatSun'),
+ np.datetime64('2007-02-25'))
+
+ # NaT values when roll is not raise
+ assert_equal(np.busday_offset(np.datetime64('NaT'), 1, roll='nat'),
+ np.datetime64('NaT'))
+ assert_equal(np.busday_offset(np.datetime64('NaT'), 1, roll='following'),
+ np.datetime64('NaT'))
+ assert_equal(np.busday_offset(np.datetime64('NaT'), 1, roll='preceding'),
+ np.datetime64('NaT'))
+
+ def test_datetime_busdaycalendar(self):
+ # Check that it removes NaT, duplicates, and weekends
+ # and sorts the result.
+ bdd = np.busdaycalendar(
+ holidays=['NaT', '2011-01-17', '2011-03-06', 'NaT',
+ '2011-12-26', '2011-05-30', '2011-01-17'])
+ assert_equal(bdd.holidays,
+ np.array(['2011-01-17', '2011-05-30', '2011-12-26'], dtype='M8'))
+ # Default M-F weekmask
+ assert_equal(bdd.weekmask, np.array([1, 1, 1, 1, 1, 0, 0], dtype='?'))
+
+ # Check string weekmask with varying whitespace.
+ bdd = np.busdaycalendar(weekmask="Sun TueWed Thu\tFri")
+ assert_equal(bdd.weekmask, np.array([0, 1, 1, 1, 1, 0, 1], dtype='?'))
+
+ # Check length 7 0/1 string
+ bdd = np.busdaycalendar(weekmask="0011001")
+ assert_equal(bdd.weekmask, np.array([0, 0, 1, 1, 0, 0, 1], dtype='?'))
+
+ # Check length 7 string weekmask.
+ bdd = np.busdaycalendar(weekmask="Mon Tue")
+ assert_equal(bdd.weekmask, np.array([1, 1, 0, 0, 0, 0, 0], dtype='?'))
+
+ # All-zeros weekmask should raise
+ assert_raises(ValueError, np.busdaycalendar, weekmask=[0, 0, 0, 0, 0, 0, 0])
+ # weekday names must be correct case
+ assert_raises(ValueError, np.busdaycalendar, weekmask="satsun")
+        # An empty weekmask should also raise
+ assert_raises(ValueError, np.busdaycalendar, weekmask="")
+ # Invalid weekday name codes should raise
+ assert_raises(ValueError, np.busdaycalendar, weekmask="Mon Tue We")
+ assert_raises(ValueError, np.busdaycalendar, weekmask="Max")
+ assert_raises(ValueError, np.busdaycalendar, weekmask="Monday Tue")
+
+ def test_datetime_busday_holidays_offset(self):
+ # With exactly one holiday
+ assert_equal(
+ np.busday_offset('2011-11-10', 1, holidays=['2011-11-11']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-04', 5, holidays=['2011-11-11']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-10', 5, holidays=['2011-11-11']),
+ np.datetime64('2011-11-18'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -1, holidays=['2011-11-11']),
+ np.datetime64('2011-11-10'))
+ assert_equal(
+ np.busday_offset('2011-11-18', -5, holidays=['2011-11-11']),
+ np.datetime64('2011-11-10'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -5, holidays=['2011-11-11']),
+ np.datetime64('2011-11-04'))
+ # With the holiday appearing twice
+ assert_equal(
+ np.busday_offset('2011-11-10', 1,
+ holidays=['2011-11-11', '2011-11-11']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -1,
+ holidays=['2011-11-11', '2011-11-11']),
+ np.datetime64('2011-11-10'))
+ # With a NaT holiday
+ assert_equal(
+ np.busday_offset('2011-11-10', 1,
+ holidays=['2011-11-11', 'NaT']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -1,
+ holidays=['NaT', '2011-11-11']),
+ np.datetime64('2011-11-10'))
+ # With another holiday after
+ assert_equal(
+ np.busday_offset('2011-11-10', 1,
+ holidays=['2011-11-11', '2011-11-24']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -1,
+ holidays=['2011-11-11', '2011-11-24']),
+ np.datetime64('2011-11-10'))
+ # With another holiday before
+ assert_equal(
+ np.busday_offset('2011-11-10', 1,
+ holidays=['2011-10-10', '2011-11-11']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -1,
+ holidays=['2011-10-10', '2011-11-11']),
+ np.datetime64('2011-11-10'))
+ # With another holiday before and after
+ assert_equal(
+ np.busday_offset('2011-11-10', 1,
+ holidays=['2011-10-10', '2011-11-11', '2011-11-24']),
+ np.datetime64('2011-11-14'))
+ assert_equal(
+ np.busday_offset('2011-11-14', -1,
+ holidays=['2011-10-10', '2011-11-11', '2011-11-24']),
+ np.datetime64('2011-11-10'))
+
+ # A bigger forward jump across more than one week/holiday
+ holidays = ['2011-10-10', '2011-11-11', '2011-11-24',
+ '2011-12-25', '2011-05-30', '2011-02-21',
+ '2011-12-26', '2012-01-02']
+ bdd = np.busdaycalendar(weekmask='1111100', holidays=holidays)
+ assert_equal(
+ np.busday_offset('2011-10-03', 4, holidays=holidays),
+ np.busday_offset('2011-10-03', 4))
+ assert_equal(
+ np.busday_offset('2011-10-03', 5, holidays=holidays),
+ np.busday_offset('2011-10-03', 5 + 1))
+ assert_equal(
+ np.busday_offset('2011-10-03', 27, holidays=holidays),
+ np.busday_offset('2011-10-03', 27 + 1))
+ assert_equal(
+ np.busday_offset('2011-10-03', 28, holidays=holidays),
+ np.busday_offset('2011-10-03', 28 + 2))
+ assert_equal(
+ np.busday_offset('2011-10-03', 35, holidays=holidays),
+ np.busday_offset('2011-10-03', 35 + 2))
+ assert_equal(
+ np.busday_offset('2011-10-03', 36, holidays=holidays),
+ np.busday_offset('2011-10-03', 36 + 3))
+ assert_equal(
+ np.busday_offset('2011-10-03', 56, holidays=holidays),
+ np.busday_offset('2011-10-03', 56 + 3))
+ assert_equal(
+ np.busday_offset('2011-10-03', 57, holidays=holidays),
+ np.busday_offset('2011-10-03', 57 + 4))
+ assert_equal(
+ np.busday_offset('2011-10-03', 60, holidays=holidays),
+ np.busday_offset('2011-10-03', 60 + 4))
+ assert_equal(
+ np.busday_offset('2011-10-03', 61, holidays=holidays),
+ np.busday_offset('2011-10-03', 61 + 5))
+ assert_equal(
+ np.busday_offset('2011-10-03', 61, busdaycal=bdd),
+ np.busday_offset('2011-10-03', 61 + 5))
+ # A bigger backward jump across more than one week/holiday
+ assert_equal(
+ np.busday_offset('2012-01-03', -1, holidays=holidays),
+ np.busday_offset('2012-01-03', -1 - 1))
+ assert_equal(
+ np.busday_offset('2012-01-03', -4, holidays=holidays),
+ np.busday_offset('2012-01-03', -4 - 1))
+ assert_equal(
+ np.busday_offset('2012-01-03', -5, holidays=holidays),
+ np.busday_offset('2012-01-03', -5 - 2))
+ assert_equal(
+ np.busday_offset('2012-01-03', -25, holidays=holidays),
+ np.busday_offset('2012-01-03', -25 - 2))
+ assert_equal(
+ np.busday_offset('2012-01-03', -26, holidays=holidays),
+ np.busday_offset('2012-01-03', -26 - 3))
+ assert_equal(
+ np.busday_offset('2012-01-03', -33, holidays=holidays),
+ np.busday_offset('2012-01-03', -33 - 3))
+ assert_equal(
+ np.busday_offset('2012-01-03', -34, holidays=holidays),
+ np.busday_offset('2012-01-03', -34 - 4))
+ assert_equal(
+ np.busday_offset('2012-01-03', -56, holidays=holidays),
+ np.busday_offset('2012-01-03', -56 - 4))
+ assert_equal(
+ np.busday_offset('2012-01-03', -57, holidays=holidays),
+ np.busday_offset('2012-01-03', -57 - 5))
+ assert_equal(
+ np.busday_offset('2012-01-03', -57, busdaycal=bdd),
+ np.busday_offset('2012-01-03', -57 - 5))
+
+ # Can't supply both a weekmask/holidays and busdaycal
+ assert_raises(ValueError, np.busday_offset, '2012-01-03', -15,
+ weekmask='1111100', busdaycal=bdd)
+ assert_raises(ValueError, np.busday_offset, '2012-01-03', -15,
+ holidays=holidays, busdaycal=bdd)
+
+ # Roll with the holidays
+ assert_equal(
+ np.busday_offset('2011-12-25', 0,
+ roll='forward', holidays=holidays),
+ np.datetime64('2011-12-27'))
+ assert_equal(
+ np.busday_offset('2011-12-26', 0,
+ roll='forward', holidays=holidays),
+ np.datetime64('2011-12-27'))
+ assert_equal(
+ np.busday_offset('2011-12-26', 0,
+ roll='backward', holidays=holidays),
+ np.datetime64('2011-12-23'))
+ assert_equal(
+ np.busday_offset('2012-02-27', 0,
+ roll='modifiedfollowing',
+ holidays=['2012-02-27', '2012-02-26', '2012-02-28',
+ '2012-03-01', '2012-02-29']),
+ np.datetime64('2012-02-24'))
+ assert_equal(
+ np.busday_offset('2012-03-06', 0,
+ roll='modifiedpreceding',
+ holidays=['2012-03-02', '2012-03-03', '2012-03-01',
+ '2012-03-05', '2012-03-07', '2012-03-06']),
+ np.datetime64('2012-03-08'))
+
+ def test_datetime_busday_holidays_count(self):
+ holidays = ['2011-01-01', '2011-10-10', '2011-11-11', '2011-11-24',
+ '2011-12-25', '2011-05-30', '2011-02-21', '2011-01-17',
+ '2011-12-26', '2012-01-02', '2011-02-21', '2011-05-30',
+ '2011-07-01', '2011-07-04', '2011-09-05', '2011-10-10']
+ bdd = np.busdaycalendar(weekmask='1111100', holidays=holidays)
+
+ # Validate against busday_offset broadcast against
+ # a range of offsets
+ dates = np.busday_offset('2011-01-01', np.arange(366),
+ roll='forward', busdaycal=bdd)
+ assert_equal(np.busday_count('2011-01-01', dates, busdaycal=bdd),
+ np.arange(366))
+ # Returns negative value when reversed
+ assert_equal(np.busday_count(dates, '2011-01-01', busdaycal=bdd),
+ -np.arange(366))
+
+ dates = np.busday_offset('2011-12-31', -np.arange(366),
+ roll='forward', busdaycal=bdd)
+ assert_equal(np.busday_count(dates, '2011-12-31', busdaycal=bdd),
+ np.arange(366))
+ # Returns negative value when reversed
+ assert_equal(np.busday_count('2011-12-31', dates, busdaycal=bdd),
+ -np.arange(366))
+
+ # Can't supply both a weekmask/holidays and busdaycal
+ assert_raises(ValueError, np.busday_offset, '2012-01-03', '2012-02-03',
+ weekmask='1111100', busdaycal=bdd)
+ assert_raises(ValueError, np.busday_offset, '2012-01-03', '2012-02-03',
+ holidays=holidays, busdaycal=bdd)
+
+ # Number of Mondays in March 2011
+ assert_equal(np.busday_count('2011-03', '2011-04', weekmask='Mon'), 4)
+ # Returns negative value when reversed
+ assert_equal(np.busday_count('2011-04', '2011-03', weekmask='Mon'), -4)
+
+ def test_datetime_is_busday(self):
+ holidays = ['2011-01-01', '2011-10-10', '2011-11-11', '2011-11-24',
+ '2011-12-25', '2011-05-30', '2011-02-21', '2011-01-17',
+ '2011-12-26', '2012-01-02', '2011-02-21', '2011-05-30',
+ '2011-07-01', '2011-07-04', '2011-09-05', '2011-10-10',
+ 'NaT']
+ bdd = np.busdaycalendar(weekmask='1111100', holidays=holidays)
+
+ # Weekend/weekday tests
+ assert_equal(np.is_busday('2011-01-01'), False)
+ assert_equal(np.is_busday('2011-01-02'), False)
+ assert_equal(np.is_busday('2011-01-03'), True)
+
+        # None of the holidays should be business days
+ assert_equal(np.is_busday(holidays, busdaycal=bdd),
+ np.zeros(len(holidays), dtype='?'))
+
+ def test_datetime_y2038(self):
+ # Test parsing on either side of the Y2038 boundary
+ a = np.datetime64('2038-01-19T03:14:07')
+ assert_equal(a.view(np.int64), 2**31 - 1)
+ a = np.datetime64('2038-01-19T03:14:08')
+ assert_equal(a.view(np.int64), 2**31)
+
+ # Test parsing on either side of the Y2038 boundary with
+ # a manually specified timezone offset
+ with assert_warns(DeprecationWarning):
+ a = np.datetime64('2038-01-19T04:14:07+0100')
+ assert_equal(a.view(np.int64), 2**31 - 1)
+ with assert_warns(DeprecationWarning):
+ a = np.datetime64('2038-01-19T04:14:08+0100')
+ assert_equal(a.view(np.int64), 2**31)
+
+ # Test parsing a date after Y2038
+ a = np.datetime64('2038-01-20T13:21:14')
+ assert_equal(str(a), '2038-01-20T13:21:14')
+
+ def test_isnat(self):
+ assert_(np.isnat(np.datetime64('NaT', 'ms')))
+ assert_(np.isnat(np.datetime64('NaT', 'ns')))
+ assert_(not np.isnat(np.datetime64('2038-01-19T03:14:07')))
+
+ assert_(np.isnat(np.timedelta64('NaT', "ms")))
+ assert_(not np.isnat(np.timedelta64(34, "ms")))
+
+ res = np.array([False, False, True])
+ for unit in ['Y', 'M', 'W', 'D',
+ 'h', 'm', 's', 'ms', 'us',
+ 'ns', 'ps', 'fs', 'as']:
+ arr = np.array([123, -321, "NaT"], dtype='<datetime64[%s]' % unit)
+ assert_equal(np.isnat(arr), res)
+ arr = np.array([123, -321, "NaT"], dtype='>datetime64[%s]' % unit)
+ assert_equal(np.isnat(arr), res)
+ arr = np.array([123, -321, "NaT"], dtype='<timedelta64[%s]' % unit)
+ assert_equal(np.isnat(arr), res)
+ arr = np.array([123, -321, "NaT"], dtype='>timedelta64[%s]' % unit)
+ assert_equal(np.isnat(arr), res)
+
+ def test_isnat_error(self):
+ # Test that only datetime dtype arrays are accepted
+ for t in np.typecodes["All"]:
+ if t in np.typecodes["Datetime"]:
+ continue
+ assert_raises(TypeError, np.isnat, np.zeros(10, t))
+
+ def test_corecursive_input(self):
+ # construct a co-recursive list
+ a, b = [], []
+ a.append(b)
+ b.append(a)
+ obj_arr = np.array([None])
+ obj_arr[0] = a
+
+ # gh-11154: This shouldn't cause a C stack overflow
+ assert_raises(RecursionError, obj_arr.astype, 'M8')
+ assert_raises(RecursionError, obj_arr.astype, 'm8')
+
+
+class TestDateTimeData(object):
+
+ def test_basic(self):
+ a = np.array(['1980-03-23'], dtype=np.datetime64)
+ assert_equal(np.datetime_data(a.dtype), ('D', 1))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_defchararray.py b/contrib/python/numpy/py2/numpy/core/tests/test_defchararray.py
new file mode 100644
index 00000000000..7b0e6f8a4be
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_defchararray.py
@@ -0,0 +1,692 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+import numpy as np
+from numpy.core.multiarray import _vec_string
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_raises,
+ assert_raises_regex, suppress_warnings,
+ )
+
+kw_unicode_true = {'unicode': True} # make 2to3 work properly
+kw_unicode_false = {'unicode': False}
+
+class TestBasic(object):
+ def test_from_object_array(self):
+ A = np.array([['abc', 2],
+ ['long ', '0123456789']], dtype='O')
+ B = np.char.array(A)
+ assert_equal(B.dtype.itemsize, 10)
+ assert_array_equal(B, [[b'abc', b'2'],
+ [b'long', b'0123456789']])
+
+ def test_from_object_array_unicode(self):
+ A = np.array([['abc', u'Sigma \u03a3'],
+ ['long ', '0123456789']], dtype='O')
+ assert_raises(ValueError, np.char.array, (A,))
+ B = np.char.array(A, **kw_unicode_true)
+ assert_equal(B.dtype.itemsize, 10 * np.array('a', 'U').dtype.itemsize)
+ assert_array_equal(B, [['abc', u'Sigma \u03a3'],
+ ['long', '0123456789']])
+
+ def test_from_string_array(self):
+ A = np.array([[b'abc', b'foo'],
+ [b'long ', b'0123456789']])
+ assert_equal(A.dtype.type, np.string_)
+ B = np.char.array(A)
+ assert_array_equal(B, A)
+ assert_equal(B.dtype, A.dtype)
+ assert_equal(B.shape, A.shape)
+ B[0, 0] = 'changed'
+ assert_(B[0, 0] != A[0, 0])
+ C = np.char.asarray(A)
+ assert_array_equal(C, A)
+ assert_equal(C.dtype, A.dtype)
+ C[0, 0] = 'changed again'
+ assert_(C[0, 0] != B[0, 0])
+ assert_(C[0, 0] == A[0, 0])
+
+ def test_from_unicode_array(self):
+ A = np.array([['abc', u'Sigma \u03a3'],
+ ['long ', '0123456789']])
+ assert_equal(A.dtype.type, np.unicode_)
+ B = np.char.array(A)
+ assert_array_equal(B, A)
+ assert_equal(B.dtype, A.dtype)
+ assert_equal(B.shape, A.shape)
+ B = np.char.array(A, **kw_unicode_true)
+ assert_array_equal(B, A)
+ assert_equal(B.dtype, A.dtype)
+ assert_equal(B.shape, A.shape)
+
+ def fail():
+ np.char.array(A, **kw_unicode_false)
+
+ assert_raises(UnicodeEncodeError, fail)
+
+ def test_unicode_upconvert(self):
+ A = np.char.array(['abc'])
+ B = np.char.array([u'\u03a3'])
+ assert_(issubclass((A + B).dtype.type, np.unicode_))
+
+ def test_from_string(self):
+ A = np.char.array(b'abc')
+ assert_equal(len(A), 1)
+ assert_equal(len(A[0]), 3)
+ assert_(issubclass(A.dtype.type, np.string_))
+
+ def test_from_unicode(self):
+ A = np.char.array(u'\u03a3')
+ assert_equal(len(A), 1)
+ assert_equal(len(A[0]), 1)
+ assert_equal(A.itemsize, 4)
+ assert_(issubclass(A.dtype.type, np.unicode_))
+
+class TestVecString(object):
+ def test_non_existent_method(self):
+
+ def fail():
+ _vec_string('a', np.string_, 'bogus')
+
+ assert_raises(AttributeError, fail)
+
+ def test_non_string_array(self):
+
+ def fail():
+ _vec_string(1, np.string_, 'strip')
+
+ assert_raises(TypeError, fail)
+
+ def test_invalid_args_tuple(self):
+
+ def fail():
+ _vec_string(['a'], np.string_, 'strip', 1)
+
+ assert_raises(TypeError, fail)
+
+ def test_invalid_type_descr(self):
+
+ def fail():
+ _vec_string(['a'], 'BOGUS', 'strip')
+
+ assert_raises(TypeError, fail)
+
+ def test_invalid_function_args(self):
+
+ def fail():
+ _vec_string(['a'], np.string_, 'strip', (1,))
+
+ assert_raises(TypeError, fail)
+
+ def test_invalid_result_type(self):
+
+ def fail():
+ _vec_string(['a'], np.integer, 'strip')
+
+ assert_raises(TypeError, fail)
+
+ def test_broadcast_error(self):
+
+ def fail():
+ _vec_string([['abc', 'def']], np.integer, 'find', (['a', 'd', 'j'],))
+
+ assert_raises(ValueError, fail)
+
+
+class TestWhitespace(object):
+ def setup(self):
+ self.A = np.array([['abc ', '123 '],
+ ['789 ', 'xyz ']]).view(np.chararray)
+ self.B = np.array([['abc', '123'],
+ ['789', 'xyz']]).view(np.chararray)
+
+ def test1(self):
+ assert_(np.all(self.A == self.B))
+ assert_(np.all(self.A >= self.B))
+ assert_(np.all(self.A <= self.B))
+ assert_(not np.any(self.A > self.B))
+ assert_(not np.any(self.A < self.B))
+ assert_(not np.any(self.A != self.B))
+
+class TestChar(object):
+ def setup(self):
+ self.A = np.array('abc1', dtype='c').view(np.chararray)
+
+ def test_it(self):
+ assert_equal(self.A.shape, (4,))
+ assert_equal(self.A.upper()[:2].tobytes(), b'AB')
+
+class TestComparisons(object):
+ def setup(self):
+ self.A = np.array([['abc', '123'],
+ ['789', 'xyz']]).view(np.chararray)
+ self.B = np.array([['efg', '123 '],
+ ['051', 'tuv']]).view(np.chararray)
+
+ def test_not_equal(self):
+ assert_array_equal((self.A != self.B), [[True, False], [True, True]])
+
+ def test_equal(self):
+ assert_array_equal((self.A == self.B), [[False, True], [False, False]])
+
+ def test_greater_equal(self):
+ assert_array_equal((self.A >= self.B), [[False, True], [True, True]])
+
+ def test_less_equal(self):
+ assert_array_equal((self.A <= self.B), [[True, True], [False, False]])
+
+ def test_greater(self):
+ assert_array_equal((self.A > self.B), [[False, False], [True, True]])
+
+ def test_less(self):
+ assert_array_equal((self.A < self.B), [[True, False], [False, False]])
+
+class TestComparisonsMixed1(TestComparisons):
+ """Ticket #1276"""
+
+ def setup(self):
+ TestComparisons.setup(self)
+ self.B = np.array([['efg', '123 '],
+ ['051', 'tuv']], np.unicode_).view(np.chararray)
+
+class TestComparisonsMixed2(TestComparisons):
+ """Ticket #1276"""
+
+ def setup(self):
+ TestComparisons.setup(self)
+ self.A = np.array([['abc', '123'],
+ ['789', 'xyz']], np.unicode_).view(np.chararray)
+
+class TestInformation(object):
+ def setup(self):
+ self.A = np.array([[' abc ', ''],
+ ['12345', 'MixedCase'],
+ ['123 \t 345 \0 ', 'UPPER']]).view(np.chararray)
+ self.B = np.array([[u' \u03a3 ', u''],
+ [u'12345', u'MixedCase'],
+ [u'123 \t 345 \0 ', u'UPPER']]).view(np.chararray)
+
+ def test_len(self):
+ assert_(issubclass(np.char.str_len(self.A).dtype.type, np.integer))
+ assert_array_equal(np.char.str_len(self.A), [[5, 0], [5, 9], [12, 5]])
+ assert_array_equal(np.char.str_len(self.B), [[3, 0], [5, 9], [12, 5]])
+
+ def test_count(self):
+ assert_(issubclass(self.A.count('').dtype.type, np.integer))
+ assert_array_equal(self.A.count('a'), [[1, 0], [0, 1], [0, 0]])
+ assert_array_equal(self.A.count('123'), [[0, 0], [1, 0], [1, 0]])
+ # Python doesn't seem to like counting NULL characters
+ # assert_array_equal(self.A.count('\0'), [[0, 0], [0, 0], [1, 0]])
+ assert_array_equal(self.A.count('a', 0, 2), [[1, 0], [0, 0], [0, 0]])
+ assert_array_equal(self.B.count('a'), [[0, 0], [0, 1], [0, 0]])
+ assert_array_equal(self.B.count('123'), [[0, 0], [1, 0], [1, 0]])
+ # assert_array_equal(self.B.count('\0'), [[0, 0], [0, 0], [1, 0]])
+
+ def test_endswith(self):
+ assert_(issubclass(self.A.endswith('').dtype.type, np.bool_))
+ assert_array_equal(self.A.endswith(' '), [[1, 0], [0, 0], [1, 0]])
+ assert_array_equal(self.A.endswith('3', 0, 3), [[0, 0], [1, 0], [1, 0]])
+
+ def fail():
+ self.A.endswith('3', 'fdjk')
+
+ assert_raises(TypeError, fail)
+
+ def test_find(self):
+ assert_(issubclass(self.A.find('a').dtype.type, np.integer))
+ assert_array_equal(self.A.find('a'), [[1, -1], [-1, 6], [-1, -1]])
+ assert_array_equal(self.A.find('3'), [[-1, -1], [2, -1], [2, -1]])
+ assert_array_equal(self.A.find('a', 0, 2), [[1, -1], [-1, -1], [-1, -1]])
+ assert_array_equal(self.A.find(['1', 'P']), [[-1, -1], [0, -1], [0, 1]])
+
+ def test_index(self):
+
+ def fail():
+ self.A.index('a')
+
+ assert_raises(ValueError, fail)
+ assert_(np.char.index('abcba', 'b') == 1)
+ assert_(issubclass(np.char.index('abcba', 'b').dtype.type, np.integer))
+
+ def test_isalnum(self):
+ assert_(issubclass(self.A.isalnum().dtype.type, np.bool_))
+ assert_array_equal(self.A.isalnum(), [[False, False], [True, True], [False, True]])
+
+ def test_isalpha(self):
+ assert_(issubclass(self.A.isalpha().dtype.type, np.bool_))
+ assert_array_equal(self.A.isalpha(), [[False, False], [False, True], [False, True]])
+
+ def test_isdigit(self):
+ assert_(issubclass(self.A.isdigit().dtype.type, np.bool_))
+ assert_array_equal(self.A.isdigit(), [[False, False], [True, False], [False, False]])
+
+ def test_islower(self):
+ assert_(issubclass(self.A.islower().dtype.type, np.bool_))
+ assert_array_equal(self.A.islower(), [[True, False], [False, False], [False, False]])
+
+ def test_isspace(self):
+ assert_(issubclass(self.A.isspace().dtype.type, np.bool_))
+ assert_array_equal(self.A.isspace(), [[False, False], [False, False], [False, False]])
+
+ def test_istitle(self):
+ assert_(issubclass(self.A.istitle().dtype.type, np.bool_))
+ assert_array_equal(self.A.istitle(), [[False, False], [False, False], [False, False]])
+
+ def test_isupper(self):
+ assert_(issubclass(self.A.isupper().dtype.type, np.bool_))
+ assert_array_equal(self.A.isupper(), [[False, False], [False, False], [False, True]])
+
+ def test_rfind(self):
+ assert_(issubclass(self.A.rfind('a').dtype.type, np.integer))
+ assert_array_equal(self.A.rfind('a'), [[1, -1], [-1, 6], [-1, -1]])
+ assert_array_equal(self.A.rfind('3'), [[-1, -1], [2, -1], [6, -1]])
+ assert_array_equal(self.A.rfind('a', 0, 2), [[1, -1], [-1, -1], [-1, -1]])
+ assert_array_equal(self.A.rfind(['1', 'P']), [[-1, -1], [0, -1], [0, 2]])
+
+ def test_rindex(self):
+
+ def fail():
+ self.A.rindex('a')
+
+ assert_raises(ValueError, fail)
+ assert_(np.char.rindex('abcba', 'b') == 3)
+ assert_(issubclass(np.char.rindex('abcba', 'b').dtype.type, np.integer))
+
+ def test_startswith(self):
+ assert_(issubclass(self.A.startswith('').dtype.type, np.bool_))
+ assert_array_equal(self.A.startswith(' '), [[1, 0], [0, 0], [0, 0]])
+ assert_array_equal(self.A.startswith('1', 0, 3), [[0, 0], [1, 0], [1, 0]])
+
+ def fail():
+ self.A.startswith('3', 'fdjk')
+
+ assert_raises(TypeError, fail)
+
+
+class TestMethods(object):
+ def setup(self):
+ self.A = np.array([[' abc ', ''],
+ ['12345', 'MixedCase'],
+ ['123 \t 345 \0 ', 'UPPER']],
+ dtype='S').view(np.chararray)
+ self.B = np.array([[u' \u03a3 ', u''],
+ [u'12345', u'MixedCase'],
+ [u'123 \t 345 \0 ', u'UPPER']]).view(np.chararray)
+
+ def test_capitalize(self):
+ tgt = [[b' abc ', b''],
+ [b'12345', b'Mixedcase'],
+ [b'123 \t 345 \0 ', b'Upper']]
+ assert_(issubclass(self.A.capitalize().dtype.type, np.string_))
+ assert_array_equal(self.A.capitalize(), tgt)
+
+ tgt = [[u' \u03c3 ', ''],
+ ['12345', 'Mixedcase'],
+ ['123 \t 345 \0 ', 'Upper']]
+ assert_(issubclass(self.B.capitalize().dtype.type, np.unicode_))
+ assert_array_equal(self.B.capitalize(), tgt)
+
+ def test_center(self):
+ assert_(issubclass(self.A.center(10).dtype.type, np.string_))
+ C = self.A.center([10, 20])
+ assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
+
+ C = self.A.center(20, b'#')
+ assert_(np.all(C.startswith(b'#')))
+ assert_(np.all(C.endswith(b'#')))
+
+ C = np.char.center(b'FOO', [[10, 20], [15, 8]])
+ tgt = [[b' FOO ', b' FOO '],
+ [b' FOO ', b' FOO ']]
+ assert_(issubclass(C.dtype.type, np.string_))
+ assert_array_equal(C, tgt)
+
+ def test_decode(self):
+ if sys.version_info[0] >= 3:
+ A = np.char.array([b'\\u03a3'])
+ assert_(A.decode('unicode-escape')[0] == '\u03a3')
+ else:
+ with suppress_warnings() as sup:
+ if sys.py3kwarning:
+ sup.filter(DeprecationWarning, "'hex_codec'")
+ A = np.char.array(['736563726574206d657373616765'])
+ assert_(A.decode('hex_codec')[0] == 'secret message')
+
+ def test_encode(self):
+ B = self.B.encode('unicode_escape')
+ assert_(B[0][0] == str(' \\u03a3 ').encode('latin1'))
+
+ def test_expandtabs(self):
+ T = self.A.expandtabs()
+ assert_(T[2, 0] == b'123 345 \0')
+
+ def test_join(self):
+ if sys.version_info[0] >= 3:
+ # NOTE: list(b'123') == [49, 50, 51]
+            # so that b','.join(b'123') results in an error on Py3
+ A0 = self.A.decode('ascii')
+ else:
+ A0 = self.A
+
+ A = np.char.join([',', '#'], A0)
+ if sys.version_info[0] >= 3:
+ assert_(issubclass(A.dtype.type, np.unicode_))
+ else:
+ assert_(issubclass(A.dtype.type, np.string_))
+ tgt = np.array([[' ,a,b,c, ', ''],
+ ['1,2,3,4,5', 'M#i#x#e#d#C#a#s#e'],
+ ['1,2,3, ,\t, ,3,4,5, ,\x00, ', 'U#P#P#E#R']])
+ assert_array_equal(np.char.join([',', '#'], A0), tgt)
+
+ def test_ljust(self):
+ assert_(issubclass(self.A.ljust(10).dtype.type, np.string_))
+
+ C = self.A.ljust([10, 20])
+ assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
+
+ C = self.A.ljust(20, b'#')
+ assert_array_equal(C.startswith(b'#'), [
+ [False, True], [False, False], [False, False]])
+ assert_(np.all(C.endswith(b'#')))
+
+ C = np.char.ljust(b'FOO', [[10, 20], [15, 8]])
+ tgt = [[b'FOO ', b'FOO '],
+ [b'FOO ', b'FOO ']]
+ assert_(issubclass(C.dtype.type, np.string_))
+ assert_array_equal(C, tgt)
+
+ def test_lower(self):
+ tgt = [[b' abc ', b''],
+ [b'12345', b'mixedcase'],
+ [b'123 \t 345 \0 ', b'upper']]
+ assert_(issubclass(self.A.lower().dtype.type, np.string_))
+ assert_array_equal(self.A.lower(), tgt)
+
+ tgt = [[u' \u03c3 ', u''],
+ [u'12345', u'mixedcase'],
+ [u'123 \t 345 \0 ', u'upper']]
+ assert_(issubclass(self.B.lower().dtype.type, np.unicode_))
+ assert_array_equal(self.B.lower(), tgt)
+
+ def test_lstrip(self):
+ tgt = [[b'abc ', b''],
+ [b'12345', b'MixedCase'],
+ [b'123 \t 345 \0 ', b'UPPER']]
+ assert_(issubclass(self.A.lstrip().dtype.type, np.string_))
+ assert_array_equal(self.A.lstrip(), tgt)
+
+ tgt = [[b' abc', b''],
+ [b'2345', b'ixedCase'],
+ [b'23 \t 345 \x00', b'UPPER']]
+ assert_array_equal(self.A.lstrip([b'1', b'M']), tgt)
+
+ tgt = [[u'\u03a3 ', ''],
+ ['12345', 'MixedCase'],
+ ['123 \t 345 \0 ', 'UPPER']]
+ assert_(issubclass(self.B.lstrip().dtype.type, np.unicode_))
+ assert_array_equal(self.B.lstrip(), tgt)
+
+ def test_partition(self):
+ P = self.A.partition([b'3', b'M'])
+ tgt = [[(b' abc ', b'', b''), (b'', b'', b'')],
+ [(b'12', b'3', b'45'), (b'', b'M', b'ixedCase')],
+ [(b'12', b'3', b' \t 345 \0 '), (b'UPPER', b'', b'')]]
+ assert_(issubclass(P.dtype.type, np.string_))
+ assert_array_equal(P, tgt)
+
+ def test_replace(self):
+ R = self.A.replace([b'3', b'a'],
+ [b'##########', b'@'])
+ tgt = [[b' abc ', b''],
+ [b'12##########45', b'MixedC@se'],
+ [b'12########## \t ##########45 \x00', b'UPPER']]
+ assert_(issubclass(R.dtype.type, np.string_))
+ assert_array_equal(R, tgt)
+
+ if sys.version_info[0] < 3:
+ # NOTE: b'abc'.replace(b'a', 'b') is not allowed on Py3
+ R = self.A.replace(b'a', u'\u03a3')
+ tgt = [[u' \u03a3bc ', ''],
+ ['12345', u'MixedC\u03a3se'],
+ ['123 \t 345 \x00', 'UPPER']]
+ assert_(issubclass(R.dtype.type, np.unicode_))
+ assert_array_equal(R, tgt)
+
+ def test_rjust(self):
+ assert_(issubclass(self.A.rjust(10).dtype.type, np.string_))
+
+ C = self.A.rjust([10, 20])
+ assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
+
+ C = self.A.rjust(20, b'#')
+ assert_(np.all(C.startswith(b'#')))
+ assert_array_equal(C.endswith(b'#'),
+ [[False, True], [False, False], [False, False]])
+
+ C = np.char.rjust(b'FOO', [[10, 20], [15, 8]])
+ tgt = [[b' FOO', b' FOO'],
+ [b' FOO', b' FOO']]
+ assert_(issubclass(C.dtype.type, np.string_))
+ assert_array_equal(C, tgt)
+
+ def test_rpartition(self):
+ P = self.A.rpartition([b'3', b'M'])
+ tgt = [[(b'', b'', b' abc '), (b'', b'', b'')],
+ [(b'12', b'3', b'45'), (b'', b'M', b'ixedCase')],
+ [(b'123 \t ', b'3', b'45 \0 '), (b'', b'', b'UPPER')]]
+ assert_(issubclass(P.dtype.type, np.string_))
+ assert_array_equal(P, tgt)
+
+ def test_rsplit(self):
+ A = self.A.rsplit(b'3')
+ tgt = [[[b' abc '], [b'']],
+ [[b'12', b'45'], [b'MixedCase']],
+ [[b'12', b' \t ', b'45 \x00 '], [b'UPPER']]]
+ assert_(issubclass(A.dtype.type, np.object_))
+ assert_equal(A.tolist(), tgt)
+
+ def test_rstrip(self):
+ assert_(issubclass(self.A.rstrip().dtype.type, np.string_))
+
+ tgt = [[b' abc', b''],
+ [b'12345', b'MixedCase'],
+ [b'123 \t 345', b'UPPER']]
+ assert_array_equal(self.A.rstrip(), tgt)
+
+ tgt = [[b' abc ', b''],
+ [b'1234', b'MixedCase'],
+ [b'123 \t 345 \x00', b'UPP']
+ ]
+ assert_array_equal(self.A.rstrip([b'5', b'ER']), tgt)
+
+ tgt = [[u' \u03a3', ''],
+ ['12345', 'MixedCase'],
+ ['123 \t 345', 'UPPER']]
+ assert_(issubclass(self.B.rstrip().dtype.type, np.unicode_))
+ assert_array_equal(self.B.rstrip(), tgt)
+
+ def test_strip(self):
+ tgt = [[b'abc', b''],
+ [b'12345', b'MixedCase'],
+ [b'123 \t 345', b'UPPER']]
+ assert_(issubclass(self.A.strip().dtype.type, np.string_))
+ assert_array_equal(self.A.strip(), tgt)
+
+ tgt = [[b' abc ', b''],
+ [b'234', b'ixedCas'],
+ [b'23 \t 345 \x00', b'UPP']]
+ assert_array_equal(self.A.strip([b'15', b'EReM']), tgt)
+
+ tgt = [[u'\u03a3', ''],
+ ['12345', 'MixedCase'],
+ ['123 \t 345', 'UPPER']]
+ assert_(issubclass(self.B.strip().dtype.type, np.unicode_))
+ assert_array_equal(self.B.strip(), tgt)
+
+ def test_split(self):
+ A = self.A.split(b'3')
+ tgt = [
+ [[b' abc '], [b'']],
+ [[b'12', b'45'], [b'MixedCase']],
+ [[b'12', b' \t ', b'45 \x00 '], [b'UPPER']]]
+ assert_(issubclass(A.dtype.type, np.object_))
+ assert_equal(A.tolist(), tgt)
+
+ def test_splitlines(self):
+ A = np.char.array(['abc\nfds\nwer']).splitlines()
+ assert_(issubclass(A.dtype.type, np.object_))
+ assert_(A.shape == (1,))
+ assert_(len(A[0]) == 3)
+
+ def test_swapcase(self):
+ tgt = [[b' ABC ', b''],
+ [b'12345', b'mIXEDcASE'],
+ [b'123 \t 345 \0 ', b'upper']]
+ assert_(issubclass(self.A.swapcase().dtype.type, np.string_))
+ assert_array_equal(self.A.swapcase(), tgt)
+
+ tgt = [[u' \u03c3 ', u''],
+ [u'12345', u'mIXEDcASE'],
+ [u'123 \t 345 \0 ', u'upper']]
+ assert_(issubclass(self.B.swapcase().dtype.type, np.unicode_))
+ assert_array_equal(self.B.swapcase(), tgt)
+
+ def test_title(self):
+ tgt = [[b' Abc ', b''],
+ [b'12345', b'Mixedcase'],
+ [b'123 \t 345 \0 ', b'Upper']]
+ assert_(issubclass(self.A.title().dtype.type, np.string_))
+ assert_array_equal(self.A.title(), tgt)
+
+ tgt = [[u' \u03a3 ', u''],
+ [u'12345', u'Mixedcase'],
+ [u'123 \t 345 \0 ', u'Upper']]
+ assert_(issubclass(self.B.title().dtype.type, np.unicode_))
+ assert_array_equal(self.B.title(), tgt)
+
+ def test_upper(self):
+ tgt = [[b' ABC ', b''],
+ [b'12345', b'MIXEDCASE'],
+ [b'123 \t 345 \0 ', b'UPPER']]
+ assert_(issubclass(self.A.upper().dtype.type, np.string_))
+ assert_array_equal(self.A.upper(), tgt)
+
+ tgt = [[u' \u03a3 ', u''],
+ [u'12345', u'MIXEDCASE'],
+ [u'123 \t 345 \0 ', u'UPPER']]
+ assert_(issubclass(self.B.upper().dtype.type, np.unicode_))
+ assert_array_equal(self.B.upper(), tgt)
+
+ def test_isnumeric(self):
+
+ def fail():
+ self.A.isnumeric()
+
+ assert_raises(TypeError, fail)
+ assert_(issubclass(self.B.isnumeric().dtype.type, np.bool_))
+ assert_array_equal(self.B.isnumeric(), [
+ [False, False], [True, False], [False, False]])
+
+ def test_isdecimal(self):
+
+ def fail():
+ self.A.isdecimal()
+
+ assert_raises(TypeError, fail)
+ assert_(issubclass(self.B.isdecimal().dtype.type, np.bool_))
+ assert_array_equal(self.B.isdecimal(), [
+ [False, False], [True, False], [False, False]])
+
+
+class TestOperations(object):
+ def setup(self):
+ self.A = np.array([['abc', '123'],
+ ['789', 'xyz']]).view(np.chararray)
+ self.B = np.array([['efg', '456'],
+ ['051', 'tuv']]).view(np.chararray)
+
+ def test_add(self):
+ AB = np.array([['abcefg', '123456'],
+ ['789051', 'xyztuv']]).view(np.chararray)
+ assert_array_equal(AB, (self.A + self.B))
+ assert_(len((self.A + self.B)[0][0]) == 6)
+
+ def test_radd(self):
+ QA = np.array([['qabc', 'q123'],
+ ['q789', 'qxyz']]).view(np.chararray)
+ assert_array_equal(QA, ('q' + self.A))
+
+ def test_mul(self):
+ A = self.A
+ for r in (2, 3, 5, 7, 197):
+ Ar = np.array([[A[0, 0]*r, A[0, 1]*r],
+ [A[1, 0]*r, A[1, 1]*r]]).view(np.chararray)
+
+ assert_array_equal(Ar, (self.A * r))
+
+ for ob in [object(), 'qrs']:
+ with assert_raises_regex(ValueError,
+ 'Can only multiply by integers'):
+ A*ob
+
+ def test_rmul(self):
+ A = self.A
+ for r in (2, 3, 5, 7, 197):
+ Ar = np.array([[A[0, 0]*r, A[0, 1]*r],
+ [A[1, 0]*r, A[1, 1]*r]]).view(np.chararray)
+ assert_array_equal(Ar, (r * self.A))
+
+ for ob in [object(), 'qrs']:
+ with assert_raises_regex(ValueError,
+ 'Can only multiply by integers'):
+ ob * A
+
+ def test_mod(self):
+ """Ticket #856"""
+ F = np.array([['%d', '%f'], ['%s', '%r']]).view(np.chararray)
+ C = np.array([[3, 7], [19, 1]])
+ FC = np.array([['3', '7.000000'],
+ ['19', '1']]).view(np.chararray)
+ assert_array_equal(FC, F % C)
+
+ A = np.array([['%.3f', '%d'], ['%s', '%r']]).view(np.chararray)
+ A1 = np.array([['1.000', '1'], ['1', '1']]).view(np.chararray)
+ assert_array_equal(A1, (A % 1))
+
+ A2 = np.array([['1.000', '2'], ['3', '4']]).view(np.chararray)
+ assert_array_equal(A2, (A % [[1, 2], [3, 4]]))
+
+ def test_rmod(self):
+ assert_(("%s" % self.A) == str(self.A))
+ assert_(("%r" % self.A) == repr(self.A))
+
+ for ob in [42, object()]:
+ with assert_raises_regex(
+ TypeError, "unsupported operand type.* and 'chararray'"):
+ ob % self.A
+
+ def test_slice(self):
+ """Regression test for https://github.com/numpy/numpy/issues/5982"""
+
+ arr = np.array([['abc ', 'def '], ['geh ', 'ijk ']],
+ dtype='S4').view(np.chararray)
+ sl1 = arr[:]
+ assert_array_equal(sl1, arr)
+ assert_(sl1.base is arr)
+ assert_(sl1.base.base is arr.base)
+
+ sl2 = arr[:, :]
+ assert_array_equal(sl2, arr)
+ assert_(sl2.base is arr)
+ assert_(sl2.base.base is arr.base)
+
+ assert_(arr[0, 0] == b'abc')
+
+
+def test_empty_indexing():
+ """Regression test for ticket 1948."""
+ # Check that indexing a chararray with an empty list/array returns an
+ # empty chararray instead of a chararray with a single empty string in it.
+ s = np.chararray((4,))
+ assert_(s[[]].size == 0)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_deprecations.py b/contrib/python/numpy/py2/numpy/core/tests/test_deprecations.py
new file mode 100644
index 00000000000..edb5d5e4607
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_deprecations.py
@@ -0,0 +1,535 @@
+"""
+Tests related to deprecation warnings. Also a convenient place
+to document how deprecations should eventually be turned into errors.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import datetime
+import sys
+import operator
+import warnings
+import pytest
+
+import numpy as np
+from numpy.testing import (
+ assert_raises, assert_warns, assert_
+ )
+
+try:
+ import pytz
+ _has_pytz = True
+except ImportError:
+ _has_pytz = False
+
+
+class _DeprecationTestCase(object):
+    # Just as a warning: warnings uses re.match, so the start of this message
+ # must match.
+ message = ''
+ warning_cls = DeprecationWarning
+
+ def setup(self):
+ self.warn_ctx = warnings.catch_warnings(record=True)
+ self.log = self.warn_ctx.__enter__()
+
+ # Do *not* ignore other DeprecationWarnings. Ignoring warnings
+ # can give very confusing results because of
+ # https://bugs.python.org/issue4180 and it is probably simplest to
+ # try to keep the tests cleanly giving only the right warning type.
+ # (While checking them set to "error" those are ignored anyway)
+ # We still have them show up, because otherwise they would be raised
+ warnings.filterwarnings("always", category=self.warning_cls)
+ warnings.filterwarnings("always", message=self.message,
+ category=self.warning_cls)
+
+ def teardown(self):
+ self.warn_ctx.__exit__()
+
+ def assert_deprecated(self, function, num=1, ignore_others=False,
+ function_fails=False,
+ exceptions=np._NoValue,
+ args=(), kwargs={}):
+ """Test if DeprecationWarnings are given and raised.
+
+ This first checks if the function when called gives `num`
+ DeprecationWarnings, after that it tries to raise these
+ DeprecationWarnings and compares them with `exceptions`.
+ The exceptions can be different for cases where this code path
+ is simply not anticipated and the exception is replaced.
+
+ Parameters
+ ----------
+ function : callable
+ The function to test
+ num : int
+ Number of DeprecationWarnings to expect. This should normally be 1.
+ ignore_others : bool
+ Whether warnings of the wrong type should be ignored (note that
+ the message is not checked)
+ function_fails : bool
+ If the function would normally fail, setting this will check for
+ warnings inside a try/except block.
+ exceptions : Exception or tuple of Exceptions
+ Exception to expect when turning the warnings into an error.
+ The default checks for DeprecationWarnings. If exceptions is
+ empty the function is expected to run successfully.
+ args : tuple
+ Arguments for `function`
+ kwargs : dict
+ Keyword arguments for `function`
+ """
+ # reset the log
+ self.log[:] = []
+
+ if exceptions is np._NoValue:
+ exceptions = (self.warning_cls,)
+
+ try:
+ function(*args, **kwargs)
+ except (Exception if function_fails else tuple()):
+ pass
+
+        # count the warnings of the expected class
+ num_found = 0
+ for warning in self.log:
+ if warning.category is self.warning_cls:
+ num_found += 1
+ elif not ignore_others:
+ raise AssertionError(
+ "expected %s but got: %s" %
+ (self.warning_cls.__name__, warning.category))
+ if num is not None and num_found != num:
+ msg = "%i warnings found but %i expected." % (len(self.log), num)
+ lst = [str(w.category) for w in self.log]
+ raise AssertionError("\n".join([msg] + lst))
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings("error", message=self.message,
+ category=self.warning_cls)
+ try:
+ function(*args, **kwargs)
+ if exceptions != tuple():
+ raise AssertionError(
+ "No error raised during function call")
+ except exceptions:
+ if exceptions == tuple():
+ raise AssertionError(
+ "Error raised during function call")
+
+ def assert_not_deprecated(self, function, args=(), kwargs={}):
+ """Test that warnings are not raised.
+
+ This is just a shorthand for:
+
+ self.assert_deprecated(function, num=0, ignore_others=True,
+ exceptions=tuple(), args=args, kwargs=kwargs)
+ """
+ self.assert_deprecated(function, num=0, ignore_others=True,
+ exceptions=tuple(), args=args, kwargs=kwargs)
+
+
+class _VisibleDeprecationTestCase(_DeprecationTestCase):
+ warning_cls = np.VisibleDeprecationWarning
+
+
+class TestNonTupleNDIndexDeprecation(object):
+ def test_basic(self):
+ a = np.zeros((5, 5))
+ with warnings.catch_warnings():
+ warnings.filterwarnings('always')
+ assert_warns(FutureWarning, a.__getitem__, [[0, 1], [0, 1]])
+ assert_warns(FutureWarning, a.__getitem__, [slice(None)])
+
+ warnings.filterwarnings('error')
+ assert_raises(FutureWarning, a.__getitem__, [[0, 1], [0, 1]])
+ assert_raises(FutureWarning, a.__getitem__, [slice(None)])
+
+        # a[[0, 1]] was always advanced indexing, so no error/warning
+ a[[0, 1]]
+
+
+class TestRankDeprecation(_DeprecationTestCase):
+ """Test that np.rank is deprecated. The function should simply be
+ removed. The VisibleDeprecationWarning may become unnecessary.
+ """
+
+ def test(self):
+ a = np.arange(10)
+ assert_warns(np.VisibleDeprecationWarning, np.rank, a)
+
+
+class TestComparisonDeprecations(_DeprecationTestCase):
+ """This tests the deprecation, for non-element-wise comparison logic.
+ This used to mean that when an error occurred during element-wise comparison
+ (i.e. broadcasting) NotImplemented was returned, but also in the comparison
+ itself, False was given instead of the error.
+
+ Also test FutureWarning for the None comparison.
+ """
+
+ message = "elementwise.* comparison failed; .*"
+
+ def test_normal_types(self):
+ for op in (operator.eq, operator.ne):
+ # Broadcasting errors:
+ self.assert_deprecated(op, args=(np.zeros(3), []))
+ a = np.zeros(3, dtype='i,i')
+ # (warning is issued a couple of times here)
+ self.assert_deprecated(op, args=(a, a[:-1]), num=None)
+
+ # Element comparison error (numpy array can't be compared).
+ a = np.array([1, np.array([1,2,3])], dtype=object)
+ b = np.array([1, np.array([1,2,3])], dtype=object)
+ self.assert_deprecated(op, args=(a, b), num=None)
+
+ def test_string(self):
+ # For two string arrays, strings always raised the broadcasting error:
+ a = np.array(['a', 'b'])
+ b = np.array(['a', 'b', 'c'])
+ assert_raises(ValueError, lambda x, y: x == y, a, b)
+
+ # The empty list is not cast to string, and this used to pass due
+ # to dtype mismatch; now (2018-06-21) it correctly leads to a
+ # FutureWarning.
+ assert_warns(FutureWarning, lambda: a == [])
+
+ def test_void_dtype_equality_failures(self):
+ class NotArray(object):
+ def __array__(self):
+ raise TypeError
+
+ # Needed so Python 3 does not raise DeprecationWarning twice.
+ def __ne__(self, other):
+ return NotImplemented
+
+ self.assert_deprecated(lambda: np.arange(2) == NotArray())
+ self.assert_deprecated(lambda: np.arange(2) != NotArray())
+
+ struct1 = np.zeros(2, dtype="i4,i4")
+ struct2 = np.zeros(2, dtype="i4,i4,i4")
+
+ assert_warns(FutureWarning, lambda: struct1 == 1)
+ assert_warns(FutureWarning, lambda: struct1 == struct2)
+ assert_warns(FutureWarning, lambda: struct1 != 1)
+ assert_warns(FutureWarning, lambda: struct1 != struct2)
+
+ def test_array_richcompare_legacy_weirdness(self):
+ # It doesn't really work to use assert_deprecated here, b/c part of
+ # the point of assert_deprecated is to check that when warnings are
+ # set to "error" mode then the error is propagated -- which is good!
+ # But here we are testing a bunch of code that is deprecated *because*
+ # it has the habit of swallowing up errors and converting them into
+ # different warnings. So assert_warns will have to be sufficient.
+ assert_warns(FutureWarning, lambda: np.arange(2) == "a")
+ assert_warns(FutureWarning, lambda: np.arange(2) != "a")
+ # No warning for scalar comparisons
+ with warnings.catch_warnings():
+ warnings.filterwarnings("error")
+ assert_(not (np.array(0) == "a"))
+ assert_(np.array(0) != "a")
+ assert_(not (np.int16(0) == "a"))
+ assert_(np.int16(0) != "a")
+
+ for arg1 in [np.asarray(0), np.int16(0)]:
+ struct = np.zeros(2, dtype="i4,i4")
+ for arg2 in [struct, "a"]:
+ for f in [operator.lt, operator.le, operator.gt, operator.ge]:
+ if sys.version_info[0] >= 3:
+ # py3
+ with warnings.catch_warnings() as l:
+ warnings.filterwarnings("always")
+ assert_raises(TypeError, f, arg1, arg2)
+ assert_(not l)
+ else:
+ # py2
+ assert_warns(DeprecationWarning, f, arg1, arg2)
+
+
+class TestDatetime64Timezone(_DeprecationTestCase):
+ """Parsing of datetime64 with timezones deprecated in 1.11.0, because
+ datetime64 is now timezone naive rather than UTC only.
+
+ It will be quite a while before we can remove this, because, at the very
+ least, a lot of existing code uses the 'Z' modifier to avoid conversion
+ from local time to UTC, even if otherwise it handles time in a timezone
+ naive fashion.
+ """
+ def test_string(self):
+ self.assert_deprecated(np.datetime64, args=('2000-01-01T00+01',))
+ self.assert_deprecated(np.datetime64, args=('2000-01-01T00Z',))
+
+ @pytest.mark.skipif(not _has_pytz,
+ reason="The pytz module is not available.")
+ def test_datetime(self):
+ tz = pytz.timezone('US/Eastern')
+ dt = datetime.datetime(2000, 1, 1, 0, 0, tzinfo=tz)
+ self.assert_deprecated(np.datetime64, args=(dt,))
+
+
+class TestNonCContiguousViewDeprecation(_DeprecationTestCase):
+ """View of non-C-contiguous arrays deprecated in 1.11.0.
+
+ The deprecation will not be raised for arrays that are both C and F
+ contiguous, as C contiguous is dominant. There are more such arrays
+    with relaxed stride checking than without, so the deprecation is not
+ as visible with relaxed stride checking in force.
+ """
+
+ def test_fortran_contiguous(self):
+ self.assert_deprecated(np.ones((2,2)).T.view, args=(complex,))
+ self.assert_deprecated(np.ones((2,2)).T.view, args=(np.int8,))
+
+
+class TestInvalidOrderParameterInputForFlattenArrayDeprecation(_DeprecationTestCase):
+ """Invalid arguments to the ORDER parameter in array.flatten() should not be
+ allowed and should raise an error. However, in the interests of not breaking
+ code that may inadvertently pass invalid arguments to this parameter, a
+ DeprecationWarning will be issued instead for the time being to give developers
+ time to refactor relevant code.
+ """
+
+ def test_flatten_array_non_string_arg(self):
+ x = np.zeros((3, 5))
+ self.message = ("Non-string object detected for "
+ "the array ordering. Please pass "
+ "in 'C', 'F', 'A', or 'K' instead")
+ self.assert_deprecated(x.flatten, args=(np.pi,))
+
+ def test_flatten_array_invalid_string_arg(self):
+ # Tests that a DeprecationWarning is raised
+ # when a string of length greater than one
+ # starting with "C", "F", "A", or "K" (case-
+ # and unicode-insensitive) is passed in for
+ # the ORDER parameter. Otherwise, a TypeError
+ # will be raised!
+
+ x = np.zeros((3, 5))
+ self.message = ("Non length-one string passed "
+ "in for the array ordering. Please "
+ "pass in 'C', 'F', 'A', or 'K' instead")
+ self.assert_deprecated(x.flatten, args=("FACK",))
+
+
+class TestArrayDataAttributeAssignmentDeprecation(_DeprecationTestCase):
+ """Assigning the 'data' attribute of an ndarray is unsafe as pointed
+ out in gh-7093. Eventually, such assignment should NOT be allowed, but
+ in the interests of maintaining backwards compatibility, only a Deprecation-
+ Warning will be raised instead for the time being to give developers time to
+ refactor relevant code.
+ """
+
+ def test_data_attr_assignment(self):
+ a = np.arange(10)
+ b = np.linspace(0, 1, 10)
+
+ self.message = ("Assigning the 'data' attribute is an "
+ "inherently unsafe operation and will "
+ "be removed in the future.")
+ self.assert_deprecated(a.__setattr__, args=('data', b.data))
+
+
+class TestLinspaceInvalidNumParameter(_DeprecationTestCase):
+ """Argument to the num parameter in linspace that cannot be
+ safely interpreted as an integer is deprecated in 1.12.0.
+
+ Argument to the num parameter in linspace that cannot be
+ safely interpreted as an integer should not be allowed.
+ In the interest of not breaking code that passes
+ an argument that could still be interpreted as an integer, a
+ DeprecationWarning will be issued for the time being to give
+ developers time to refactor relevant code.
+ """
+ def test_float_arg(self):
+ # 2016-02-25, PR#7328
+ self.assert_deprecated(np.linspace, args=(0, 10, 2.5))
+
+
+class TestBinaryReprInsufficientWidthParameterForRepresentation(_DeprecationTestCase):
+ """
+ If a 'width' parameter is passed into ``binary_repr`` that is insufficient to
+ represent the number in base 2 (positive) or 2's complement (negative) form,
+ the function used to silently ignore the parameter and return a representation
+ using the minimal number of bits needed for the form in question. Such behavior
+ is now considered unsafe from a user perspective and will raise an error in the future.
+ """
+
+ def test_insufficient_width_positive(self):
+ args = (10,)
+ kwargs = {'width': 2}
+
+ self.message = ("Insufficient bit width provided. This behavior "
+ "will raise an error in the future.")
+ self.assert_deprecated(np.binary_repr, args=args, kwargs=kwargs)
+
+ def test_insufficient_width_negative(self):
+ args = (-5,)
+ kwargs = {'width': 2}
+
+ self.message = ("Insufficient bit width provided. This behavior "
+ "will raise an error in the future.")
+ self.assert_deprecated(np.binary_repr, args=args, kwargs=kwargs)
+
+
+class TestNumericStyleTypecodes(_DeprecationTestCase):
+ """
+ Deprecate the old numeric-style dtypes, which are especially
+ confusing for complex types, e.g. Complex32 -> complex64. When the
+ deprecation cycle is complete, the check for the strings should be
+ removed from PyArray_DescrConverter in descriptor.c, and the
+ deprecated keys should not be added as capitalized aliases in
+ _add_aliases in numerictypes.py.
+ """
+ def test_all_dtypes(self):
+ deprecated_types = [
+ 'Bool', 'Complex32', 'Complex64', 'Float16', 'Float32', 'Float64',
+ 'Int8', 'Int16', 'Int32', 'Int64', 'Object0', 'Timedelta64',
+ 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Void0'
+ ]
+ if sys.version_info[0] < 3:
+ deprecated_types.extend(['Unicode0', 'String0'])
+
+ for dt in deprecated_types:
+ self.assert_deprecated(np.dtype, exceptions=(TypeError,),
+ args=(dt,))
+
+
+class TestTestDeprecated(object):
+ def test_assert_deprecated(self):
+ test_case_instance = _DeprecationTestCase()
+ test_case_instance.setup()
+ assert_raises(AssertionError,
+ test_case_instance.assert_deprecated,
+ lambda: None)
+
+ def foo():
+ warnings.warn("foo", category=DeprecationWarning, stacklevel=2)
+
+ test_case_instance.assert_deprecated(foo)
+ test_case_instance.teardown()
+
+
+class TestClassicIntDivision(_DeprecationTestCase):
+ """
+    See #7949. Deprecate the numeric-style dtypes with the -3 flag in python 2
+    if used for division.
+    List of data types: https://docs.scipy.org/doc/numpy/user/basics.types.html
+ """
+ def test_int_dtypes(self):
+        # scramble types and do some mix and match testing
+ deprecated_types = [
+ 'bool_', 'int_', 'intc', 'uint8', 'int8', 'uint64', 'int32', 'uint16',
+ 'intp', 'int64', 'uint32', 'int16'
+ ]
+ if sys.version_info[0] < 3 and sys.py3kwarning:
+ import operator as op
+ dt2 = 'bool_'
+ for dt1 in deprecated_types:
+ a = np.array([1,2,3], dtype=dt1)
+ b = np.array([1,2,3], dtype=dt2)
+ self.assert_deprecated(op.div, args=(a,b))
+ dt2 = dt1
+
+
+class TestNonNumericConjugate(_DeprecationTestCase):
+ """
+ Deprecate no-op behavior of ndarray.conjugate on non-numeric dtypes,
+ which conflicts with the error behavior of np.conjugate.
+ """
+ def test_conjugate(self):
+ for a in np.array(5), np.array(5j):
+ self.assert_not_deprecated(a.conjugate)
+ for a in (np.array('s'), np.array('2016', 'M'),
+ np.array((1, 2), [('a', int), ('b', int)])):
+ self.assert_deprecated(a.conjugate)
+
+
+class TestNPY_CHAR(_DeprecationTestCase):
+ # 2017-05-03, 1.13.0
+ def test_npy_char_deprecation(self):
+ from numpy.core._multiarray_tests import npy_char_deprecation
+ self.assert_deprecated(npy_char_deprecation)
+ assert_(npy_char_deprecation() == 'S1')
+
+
+class Test_UPDATEIFCOPY(_DeprecationTestCase):
+ """
+    v1.14 deprecates creating an array with the UPDATEIFCOPY flag; use
+    WRITEBACKIFCOPY instead.
+ """
+ def test_npy_updateifcopy_deprecation(self):
+ from numpy.core._multiarray_tests import npy_updateifcopy_deprecation
+ arr = np.arange(9).reshape(3, 3)
+ v = arr.T
+ self.assert_deprecated(npy_updateifcopy_deprecation, args=(v,))
+
+
+class TestDatetimeEvent(_DeprecationTestCase):
+ # 2017-08-11, 1.14.0
+ def test_3_tuple(self):
+ for cls in (np.datetime64, np.timedelta64):
+ # two valid uses - (unit, num) and (unit, num, den, None)
+ self.assert_not_deprecated(cls, args=(1, ('ms', 2)))
+ self.assert_not_deprecated(cls, args=(1, ('ms', 2, 1, None)))
+
+ # trying to use the event argument, removed in 1.7.0, is deprecated
+ # it used to be a uint8
+ self.assert_deprecated(cls, args=(1, ('ms', 2, 'event')))
+ self.assert_deprecated(cls, args=(1, ('ms', 2, 63)))
+ self.assert_deprecated(cls, args=(1, ('ms', 2, 1, 'event')))
+ self.assert_deprecated(cls, args=(1, ('ms', 2, 1, 63)))
+
+
+class TestTruthTestingEmptyArrays(_DeprecationTestCase):
+ # 2017-09-25, 1.14.0
+ message = '.*truth value of an empty array is ambiguous.*'
+
+ def test_1d(self):
+ self.assert_deprecated(bool, args=(np.array([]),))
+
+ def test_2d(self):
+ self.assert_deprecated(bool, args=(np.zeros((1, 0)),))
+ self.assert_deprecated(bool, args=(np.zeros((0, 1)),))
+ self.assert_deprecated(bool, args=(np.zeros((0, 0)),))
+
+
+class TestBincount(_DeprecationTestCase):
+ # 2017-06-01, 1.14.0
+ def test_bincount_minlength(self):
+ self.assert_deprecated(lambda: np.bincount([1, 2, 3], minlength=None))
+
+
+class TestGeneratorSum(_DeprecationTestCase):
+ # 2018-02-25, 1.15.0
+ def test_generator_sum(self):
+ self.assert_deprecated(np.sum, args=((i for i in range(5)),))
+
+
+class TestSctypeNA(_VisibleDeprecationTestCase):
+ # 2018-06-24, 1.16
+ def test_sctypeNA(self):
+ self.assert_deprecated(lambda: np.sctypeNA['?'])
+ self.assert_deprecated(lambda: np.typeNA['?'])
+ self.assert_deprecated(lambda: np.typeNA.get('?'))
+
+
+class TestPositiveOnNonNumerical(_DeprecationTestCase):
+ # 2018-06-28, 1.16.0
+ def test_positive_on_non_number(self):
+ self.assert_deprecated(operator.pos, args=(np.array('foo'),))
+
+class TestFromstring(_DeprecationTestCase):
+ # 2017-10-19, 1.14
+ def test_fromstring(self):
+ self.assert_deprecated(np.fromstring, args=('\x00'*80,))
+
+class Test_GetSet_NumericOps(_DeprecationTestCase):
+ # 2018-09-20, 1.16.0
+ def test_get_numeric_ops(self):
+ from numpy.core._multiarray_tests import getset_numericops
+ self.assert_deprecated(getset_numericops, num=2)
+
+ # empty kwargs prevents any state actually changing which would break
+ # other tests.
+ self.assert_deprecated(np.set_numeric_ops, kwargs={})
+ assert_raises(ValueError, np.set_numeric_ops, add='abc')
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_dtype.py b/contrib/python/numpy/py2/numpy/core/tests/test_dtype.py
new file mode 100644
index 00000000000..ff0fb9eff5c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_dtype.py
@@ -0,0 +1,1138 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import operator
+import pytest
+import ctypes
+import gc
+
+import numpy as np
+from numpy.core._rational_tests import rational
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_raises, HAS_REFCOUNT)
+from numpy.core.numeric import pickle
+
+def assert_dtype_equal(a, b):
+ assert_equal(a, b)
+ assert_equal(hash(a), hash(b),
+ "two equivalent types do not hash to the same value !")
+
+def assert_dtype_not_equal(a, b):
+ assert_(a != b)
+ assert_(hash(a) != hash(b),
+ "two different types hash to the same value !")
+
+class TestBuiltin(object):
+ @pytest.mark.parametrize('t', [int, float, complex, np.int32, str, object,
+ np.unicode])
+ def test_run(self, t):
+ """Only test hash runs at all."""
+ dt = np.dtype(t)
+ hash(dt)
+
+ @pytest.mark.parametrize('t', [int, float])
+ def test_dtype(self, t):
+ # Make sure equivalent byte order char hash the same (e.g. < and = on
+ # little endian)
+ dt = np.dtype(t)
+ dt2 = dt.newbyteorder("<")
+ dt3 = dt.newbyteorder(">")
+ if dt == dt2:
+ assert_(dt.byteorder != dt2.byteorder, "bogus test")
+ assert_dtype_equal(dt, dt2)
+ else:
+ assert_(dt.byteorder != dt3.byteorder, "bogus test")
+ assert_dtype_equal(dt, dt3)
+
+ def test_equivalent_dtype_hashing(self):
+ # Make sure equivalent dtypes with different type num hash equal
+ uintp = np.dtype(np.uintp)
+ if uintp.itemsize == 4:
+ left = uintp
+ right = np.dtype(np.uint32)
+ else:
+ left = uintp
+ right = np.dtype(np.ulonglong)
+ assert_(left == right)
+ assert_(hash(left) == hash(right))
+
+ def test_invalid_types(self):
+ # Make sure invalid type strings raise an error
+
+ assert_raises(TypeError, np.dtype, 'O3')
+ assert_raises(TypeError, np.dtype, 'O5')
+ assert_raises(TypeError, np.dtype, 'O7')
+ assert_raises(TypeError, np.dtype, 'b3')
+ assert_raises(TypeError, np.dtype, 'h4')
+ assert_raises(TypeError, np.dtype, 'I5')
+ assert_raises(TypeError, np.dtype, 'e3')
+ assert_raises(TypeError, np.dtype, 'f5')
+
+ if np.dtype('g').itemsize == 8 or np.dtype('g').itemsize == 16:
+ assert_raises(TypeError, np.dtype, 'g12')
+ elif np.dtype('g').itemsize == 12:
+ assert_raises(TypeError, np.dtype, 'g16')
+
+ if np.dtype('l').itemsize == 8:
+ assert_raises(TypeError, np.dtype, 'l4')
+ assert_raises(TypeError, np.dtype, 'L4')
+ else:
+ assert_raises(TypeError, np.dtype, 'l8')
+ assert_raises(TypeError, np.dtype, 'L8')
+
+ if np.dtype('q').itemsize == 8:
+ assert_raises(TypeError, np.dtype, 'q4')
+ assert_raises(TypeError, np.dtype, 'Q4')
+ else:
+ assert_raises(TypeError, np.dtype, 'q8')
+ assert_raises(TypeError, np.dtype, 'Q8')
+
+ def test_bad_param(self):
+ # Can't give a size that's too small
+ assert_raises(ValueError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':['i4', 'i1'],
+ 'offsets':[0, 4],
+ 'itemsize':4})
+ # If alignment is enabled, the alignment (4) must divide the itemsize
+ assert_raises(ValueError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':['i4', 'i1'],
+ 'offsets':[0, 4],
+ 'itemsize':9}, align=True)
+ # If alignment is enabled, the individual fields must be aligned
+ assert_raises(ValueError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':['i1', 'f4'],
+ 'offsets':[0, 2]}, align=True)
+
+ def test_field_order_equality(self):
+ x = np.dtype({'names': ['A', 'B'],
+ 'formats': ['i4', 'f4'],
+ 'offsets': [0, 4]})
+ y = np.dtype({'names': ['B', 'A'],
+ 'formats': ['f4', 'i4'],
+ 'offsets': [4, 0]})
+ assert_equal(x == y, False)
+
+class TestRecord(object):
+ def test_equivalent_record(self):
+ """Test whether equivalent record dtypes hash the same."""
+ a = np.dtype([('yo', int)])
+ b = np.dtype([('yo', int)])
+ assert_dtype_equal(a, b)
+
+ def test_different_names(self):
+ # In theory, they may hash the same (collision) ?
+ a = np.dtype([('yo', int)])
+ b = np.dtype([('ye', int)])
+ assert_dtype_not_equal(a, b)
+
+ def test_different_titles(self):
+ # In theory, they may hash the same (collision) ?
+ a = np.dtype({'names': ['r', 'b'],
+ 'formats': ['u1', 'u1'],
+ 'titles': ['Red pixel', 'Blue pixel']})
+ b = np.dtype({'names': ['r', 'b'],
+ 'formats': ['u1', 'u1'],
+ 'titles': ['RRed pixel', 'Blue pixel']})
+ assert_dtype_not_equal(a, b)
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_refcount_dictionary_setting(self):
+ names = ["name1"]
+ formats = ["f8"]
+ titles = ["t1"]
+ offsets = [0]
+ d = dict(names=names, formats=formats, titles=titles, offsets=offsets)
+ refcounts = {k: sys.getrefcount(i) for k, i in d.items()}
+ np.dtype(d)
+ refcounts_new = {k: sys.getrefcount(i) for k, i in d.items()}
+ assert refcounts == refcounts_new
+
+ def test_mutate(self):
+ # Mutating a dtype should reset the cached hash value
+ a = np.dtype([('yo', int)])
+ b = np.dtype([('yo', int)])
+ c = np.dtype([('ye', int)])
+ assert_dtype_equal(a, b)
+ assert_dtype_not_equal(a, c)
+ a.names = ['ye']
+ assert_dtype_equal(a, c)
+ assert_dtype_not_equal(a, b)
+ state = b.__reduce__()[2]
+ a.__setstate__(state)
+ assert_dtype_equal(a, b)
+ assert_dtype_not_equal(a, c)
+
+ def test_not_lists(self):
+ """Test if an appropriate exception is raised when passing bad values to
+ the dtype constructor.
+ """
+ assert_raises(TypeError, np.dtype,
+ dict(names={'A', 'B'}, formats=['f8', 'i4']))
+ assert_raises(TypeError, np.dtype,
+ dict(names=['A', 'B'], formats={'f8', 'i4'}))
+
+ def test_aligned_size(self):
+ # Check that structured dtypes get padded to an aligned size
+ dt = np.dtype('i4, i1', align=True)
+ assert_equal(dt.itemsize, 8)
+ dt = np.dtype([('f0', 'i4'), ('f1', 'i1')], align=True)
+ assert_equal(dt.itemsize, 8)
+ dt = np.dtype({'names':['f0', 'f1'],
+ 'formats':['i4', 'u1'],
+ 'offsets':[0, 4]}, align=True)
+ assert_equal(dt.itemsize, 8)
+ dt = np.dtype({'f0': ('i4', 0), 'f1':('u1', 4)}, align=True)
+ assert_equal(dt.itemsize, 8)
+ # Nesting should preserve that alignment
+ dt1 = np.dtype([('f0', 'i4'),
+ ('f1', [('f1', 'i1'), ('f2', 'i4'), ('f3', 'i1')]),
+ ('f2', 'i1')], align=True)
+ assert_equal(dt1.itemsize, 20)
+ dt2 = np.dtype({'names':['f0', 'f1', 'f2'],
+ 'formats':['i4',
+ [('f1', 'i1'), ('f2', 'i4'), ('f3', 'i1')],
+ 'i1'],
+ 'offsets':[0, 4, 16]}, align=True)
+ assert_equal(dt2.itemsize, 20)
+ dt3 = np.dtype({'f0': ('i4', 0),
+ 'f1': ([('f1', 'i1'), ('f2', 'i4'), ('f3', 'i1')], 4),
+ 'f2': ('i1', 16)}, align=True)
+ assert_equal(dt3.itemsize, 20)
+ assert_equal(dt1, dt2)
+ assert_equal(dt2, dt3)
+ # Nesting should preserve packing
+ dt1 = np.dtype([('f0', 'i4'),
+ ('f1', [('f1', 'i1'), ('f2', 'i4'), ('f3', 'i1')]),
+ ('f2', 'i1')], align=False)
+ assert_equal(dt1.itemsize, 11)
+ dt2 = np.dtype({'names':['f0', 'f1', 'f2'],
+ 'formats':['i4',
+ [('f1', 'i1'), ('f2', 'i4'), ('f3', 'i1')],
+ 'i1'],
+ 'offsets':[0, 4, 10]}, align=False)
+ assert_equal(dt2.itemsize, 11)
+ dt3 = np.dtype({'f0': ('i4', 0),
+ 'f1': ([('f1', 'i1'), ('f2', 'i4'), ('f3', 'i1')], 4),
+ 'f2': ('i1', 10)}, align=False)
+ assert_equal(dt3.itemsize, 11)
+ assert_equal(dt1, dt2)
+ assert_equal(dt2, dt3)
+ # Array of subtype should preserve alignment
+ dt1 = np.dtype([('a', '|i1'),
+ ('b', [('f0', '<i2'),
+ ('f1', '<f4')], 2)], align=True)
+ assert_equal(dt1.descr, [('a', '|i1'), ('', '|V3'),
+ ('b', [('f0', '<i2'), ('', '|V2'),
+ ('f1', '<f4')], (2,))])
+
+ def test_union_struct(self):
+ # Should be able to create union dtypes
+ dt = np.dtype({'names':['f0', 'f1', 'f2'], 'formats':['<u4', '<u2', '<u2'],
+ 'offsets':[0, 0, 2]}, align=True)
+ assert_equal(dt.itemsize, 4)
+ a = np.array([3], dtype='<u4').view(dt)
+ a['f1'] = 10
+ a['f2'] = 36
+ assert_equal(a['f0'], 10 + 36*256*256)
+ # Should be able to specify fields out of order
+ dt = np.dtype({'names':['f0', 'f1', 'f2'], 'formats':['<u4', '<u2', '<u2'],
+ 'offsets':[4, 0, 2]}, align=True)
+ assert_equal(dt.itemsize, 8)
+ # field name should not matter: assignment is by position
+ dt2 = np.dtype({'names':['f2', 'f0', 'f1'],
+ 'formats':['<u4', '<u2', '<u2'],
+ 'offsets':[4, 0, 2]}, align=True)
+ vals = [(0, 1, 2), (3, -1, 4)]
+ vals2 = [(0, 1, 2), (3, -1, 4)]
+ a = np.array(vals, dt)
+ b = np.array(vals2, dt2)
+ assert_equal(a.astype(dt2), b)
+ assert_equal(b.astype(dt), a)
+ assert_equal(a.view(dt2), b)
+ assert_equal(b.view(dt), a)
+ # Should not be able to overlap objects with other types
+ assert_raises(TypeError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':['O', 'i1'],
+ 'offsets':[0, 2]})
+ assert_raises(TypeError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':['i4', 'O'],
+ 'offsets':[0, 3]})
+ assert_raises(TypeError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':[[('a', 'O')], 'i1'],
+ 'offsets':[0, 2]})
+ assert_raises(TypeError, np.dtype,
+ {'names':['f0', 'f1'],
+ 'formats':['i4', [('a', 'O')]],
+ 'offsets':[0, 3]})
+ # Out of order should still be ok, however
+ dt = np.dtype({'names':['f0', 'f1'],
+ 'formats':['i1', 'O'],
+ 'offsets':[np.dtype('intp').itemsize, 0]})
+
+ def test_comma_datetime(self):
+ dt = np.dtype('M8[D],datetime64[Y],i8')
+ assert_equal(dt, np.dtype([('f0', 'M8[D]'),
+ ('f1', 'datetime64[Y]'),
+ ('f2', 'i8')]))
+
+ def test_from_dictproxy(self):
+ # Tests for PR #5920
+ dt = np.dtype({'names': ['a', 'b'], 'formats': ['i4', 'f4']})
+ assert_dtype_equal(dt, np.dtype(dt.fields))
+ dt2 = np.dtype((np.void, dt.fields))
+ assert_equal(dt2.fields, dt.fields)
+
+ def test_from_dict_with_zero_width_field(self):
+ # Regression test for #6430 / #2196
+ dt = np.dtype([('val1', np.float32, (0,)), ('val2', int)])
+ dt2 = np.dtype({'names': ['val1', 'val2'],
+ 'formats': [(np.float32, (0,)), int]})
+
+ assert_dtype_equal(dt, dt2)
+ assert_equal(dt.fields['val1'][0].itemsize, 0)
+ assert_equal(dt.itemsize, dt.fields['val2'][0].itemsize)
+
+ def test_bool_commastring(self):
+ d = np.dtype('?,?,?') # raises?
+ assert_equal(len(d.names), 3)
+ for n in d.names:
+ assert_equal(d.fields[n][0], np.dtype('?'))
+
+ def test_nonint_offsets(self):
+ # gh-8059
+ def make_dtype(off):
+ return np.dtype({'names': ['A'], 'formats': ['i4'],
+ 'offsets': [off]})
+
+ assert_raises(TypeError, make_dtype, 'ASD')
+ assert_raises(OverflowError, make_dtype, 2**70)
+ assert_raises(TypeError, make_dtype, 2.3)
+ assert_raises(ValueError, make_dtype, -10)
+
+ # no errors here:
+ dt = make_dtype(np.uint32(0))
+ np.zeros(1, dtype=dt)[0].item()
+
+ def test_fields_by_index(self):
+ dt = np.dtype([('a', np.int8), ('b', np.float32, 3)])
+ assert_dtype_equal(dt[0], np.dtype(np.int8))
+ assert_dtype_equal(dt[1], np.dtype((np.float32, 3)))
+ assert_dtype_equal(dt[-1], dt[1])
+ assert_dtype_equal(dt[-2], dt[0])
+ assert_raises(IndexError, lambda: dt[-3])
+
+ assert_raises(TypeError, operator.getitem, dt, 3.0)
+ assert_raises(TypeError, operator.getitem, dt, [])
+
+ assert_equal(dt[1], dt[np.int8(1)])
+
+ def test_partial_dict(self):
+ # 'names' is missing
+ assert_raises(ValueError, np.dtype,
+ {'formats': ['i4', 'i4'], 'f0': ('i4', 0), 'f1':('i4', 4)})
+
+
+class TestSubarray(object):
+ def test_single_subarray(self):
+ a = np.dtype((int, (2)))
+ b = np.dtype((int, (2,)))
+ assert_dtype_equal(a, b)
+
+ assert_equal(type(a.subdtype[1]), tuple)
+ assert_equal(type(b.subdtype[1]), tuple)
+
+ def test_equivalent_record(self):
+ """Test whether equivalent subarray dtypes hash the same."""
+ a = np.dtype((int, (2, 3)))
+ b = np.dtype((int, (2, 3)))
+ assert_dtype_equal(a, b)
+
+ def test_nonequivalent_record(self):
+ """Test whether different subarray dtypes hash differently."""
+ a = np.dtype((int, (2, 3)))
+ b = np.dtype((int, (3, 2)))
+ assert_dtype_not_equal(a, b)
+
+ a = np.dtype((int, (2, 3)))
+ b = np.dtype((int, (2, 2)))
+ assert_dtype_not_equal(a, b)
+
+ a = np.dtype((int, (1, 2, 3)))
+ b = np.dtype((int, (1, 2)))
+ assert_dtype_not_equal(a, b)
+
+ def test_shape_equal(self):
+ """Test some data types that are equal"""
+ assert_dtype_equal(np.dtype('f8'), np.dtype(('f8', tuple())))
+ assert_dtype_equal(np.dtype('f8'), np.dtype(('f8', 1)))
+ assert_dtype_equal(np.dtype((int, 2)), np.dtype((int, (2,))))
+ assert_dtype_equal(np.dtype(('<f4', (3, 2))), np.dtype(('<f4', (3, 2))))
+ d = ([('a', 'f4', (1, 2)), ('b', 'f8', (3, 1))], (3, 2))
+ assert_dtype_equal(np.dtype(d), np.dtype(d))
+
+ def test_shape_simple(self):
+ """Test some simple cases that shouldn't be equal"""
+ assert_dtype_not_equal(np.dtype('f8'), np.dtype(('f8', (1,))))
+ assert_dtype_not_equal(np.dtype(('f8', (1,))), np.dtype(('f8', (1, 1))))
+ assert_dtype_not_equal(np.dtype(('f4', (3, 2))), np.dtype(('f4', (2, 3))))
+
+ def test_shape_monster(self):
+ """Test some more complicated cases that shouldn't be equal"""
+ assert_dtype_not_equal(
+ np.dtype(([('a', 'f4', (2, 1)), ('b', 'f8', (1, 3))], (2, 2))),
+ np.dtype(([('a', 'f4', (1, 2)), ('b', 'f8', (1, 3))], (2, 2))))
+ assert_dtype_not_equal(
+ np.dtype(([('a', 'f4', (2, 1)), ('b', 'f8', (1, 3))], (2, 2))),
+ np.dtype(([('a', 'f4', (2, 1)), ('b', 'i8', (1, 3))], (2, 2))))
+ assert_dtype_not_equal(
+ np.dtype(([('a', 'f4', (2, 1)), ('b', 'f8', (1, 3))], (2, 2))),
+ np.dtype(([('e', 'f8', (1, 3)), ('d', 'f4', (2, 1))], (2, 2))))
+ assert_dtype_not_equal(
+ np.dtype(([('a', [('a', 'i4', 6)], (2, 1)), ('b', 'f8', (1, 3))], (2, 2))),
+ np.dtype(([('a', [('a', 'u4', 6)], (2, 1)), ('b', 'f8', (1, 3))], (2, 2))))
+
+ def test_shape_sequence(self):
+ # Any sequence of integers should work as shape, but the result
+ # should be a tuple (immutable) of base type integers.
+ a = np.array([1, 2, 3], dtype=np.int16)
+ l = [1, 2, 3]
+ # Array gets converted
+ dt = np.dtype([('a', 'f4', a)])
+ assert_(isinstance(dt['a'].shape, tuple))
+ assert_(isinstance(dt['a'].shape[0], int))
+ # List gets converted
+ dt = np.dtype([('a', 'f4', l)])
+ assert_(isinstance(dt['a'].shape, tuple))
+ #
+
+ class IntLike(object):
+ def __index__(self):
+ return 3
+
+ def __int__(self):
+ # (a PyNumber_Check fails without __int__)
+ return 3
+
+ dt = np.dtype([('a', 'f4', IntLike())])
+ assert_(isinstance(dt['a'].shape, tuple))
+ assert_(isinstance(dt['a'].shape[0], int))
+ dt = np.dtype([('a', 'f4', (IntLike(),))])
+ assert_(isinstance(dt['a'].shape, tuple))
+ assert_(isinstance(dt['a'].shape[0], int))
+
+ def test_shape_matches_ndim(self):
+ dt = np.dtype([('a', 'f4', ())])
+ assert_equal(dt['a'].shape, ())
+ assert_equal(dt['a'].ndim, 0)
+
+ dt = np.dtype([('a', 'f4')])
+ assert_equal(dt['a'].shape, ())
+ assert_equal(dt['a'].ndim, 0)
+
+ dt = np.dtype([('a', 'f4', 4)])
+ assert_equal(dt['a'].shape, (4,))
+ assert_equal(dt['a'].ndim, 1)
+
+ dt = np.dtype([('a', 'f4', (1, 2, 3))])
+ assert_equal(dt['a'].shape, (1, 2, 3))
+ assert_equal(dt['a'].ndim, 3)
+
+ def test_shape_invalid(self):
+ # Check that the shape is valid.
+ max_int = np.iinfo(np.intc).max
+ max_intp = np.iinfo(np.intp).max
+ # Too large values (the datatype is part of this)
+ assert_raises(ValueError, np.dtype, [('a', 'f4', max_int // 4 + 1)])
+ assert_raises(ValueError, np.dtype, [('a', 'f4', max_int + 1)])
+ assert_raises(ValueError, np.dtype, [('a', 'f4', (max_int, 2))])
+        # Takes a different code path (fails earlier):
+ assert_raises(ValueError, np.dtype, [('a', 'f4', max_intp + 1)])
+ # Negative values
+ assert_raises(ValueError, np.dtype, [('a', 'f4', -1)])
+ assert_raises(ValueError, np.dtype, [('a', 'f4', (-1, -1))])
+
+ def test_alignment(self):
+        # Check that subarrays are aligned
+ t1 = np.dtype('1i4', align=True)
+ t2 = np.dtype('2i4', align=True)
+ assert_equal(t1.alignment, t2.alignment)
+
+
+def iter_struct_object_dtypes():
+ """
+    Iterates over a few complex dtypes and object patterns which
+    fill the array with a given object (defaults to a singleton).
+
+ Yields
+ ------
+ dtype : dtype
+ pattern : tuple
+ Structured tuple for use with `np.array`.
+ count : int
+ Number of objects stored in the dtype.
+ singleton : object
+ A singleton object. The returned pattern is constructed so that
+ all objects inside the datatype are set to the singleton.
+ """
+ obj = object()
+
+ dt = np.dtype([('b', 'O', (2, 3))])
+ p = ([[obj] * 3] * 2,)
+ yield pytest.param(dt, p, 6, obj, id="<subarray>")
+
+ dt = np.dtype([('a', 'i4'), ('b', 'O', (2, 3))])
+ p = (0, [[obj] * 3] * 2)
+ yield pytest.param(dt, p, 6, obj, id="<subarray in field>")
+
+ dt = np.dtype([('a', 'i4'),
+ ('b', [('ba', 'O'), ('bb', 'i1')], (2, 3))])
+ p = (0, [[(obj, 0)] * 3] * 2)
+ yield pytest.param(dt, p, 6, obj, id="<structured subarray 1>")
+
+ dt = np.dtype([('a', 'i4'),
+ ('b', [('ba', 'O'), ('bb', 'O')], (2, 3))])
+ p = (0, [[(obj, obj)] * 3] * 2)
+ yield pytest.param(dt, p, 12, obj, id="<structured subarray 2>")
+
+
[email protected](not HAS_REFCOUNT, reason="Python lacks refcounts")
+class TestStructuredObjectRefcounting:
+ """These tests cover various uses of complicated structured types which
+ include objects and thus require reference counting.
+ """
+ @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+ iter_struct_object_dtypes())
+ @pytest.mark.parametrize(["creation_func", "creation_obj"], [
+ pytest.param(np.empty, None,
+ # None is probably used for too many things
+ marks=pytest.mark.skip("unreliable due to python's behaviour")),
+ (np.ones, 1),
+ (np.zeros, 0)])
+ def test_structured_object_create_delete(self, dt, pat, count, singleton,
+ creation_func, creation_obj):
+ """Structured object reference counting in creation and deletion"""
+ # The test assumes that 0, 1, and None are singletons.
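+ # Each of the 3 array elements stores `count` references to the fill
+ # object, so creation should add count * 3 references and deletion
+ # should release them all again.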
+ gc.collect()
+ before = sys.getrefcount(creation_obj)
+ arr = creation_func(3, dt)
+
+ now = sys.getrefcount(creation_obj)
+ assert now - before == count * 3
+ del arr
+ now = sys.getrefcount(creation_obj)
+ assert now == before
+
+ @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+ iter_struct_object_dtypes())
+ def test_structured_object_item_setting(self, dt, pat, count, singleton):
+ """Structured object reference counting for simple item setting"""
+ one = 1
+
+ gc.collect()
+ before = sys.getrefcount(singleton)
+ arr = np.array([pat] * 3, dt)
+ assert sys.getrefcount(singleton) - before == count * 3
+ # Fill with `1` and check that it was replaced correctly:
+ before2 = sys.getrefcount(one)
+ arr[...] = one
+ after2 = sys.getrefcount(one)
+ assert after2 - before2 == count * 3
+ del arr
+ gc.collect()
+ assert sys.getrefcount(one) == before2
+ assert sys.getrefcount(singleton) == before
+
+ @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+ iter_struct_object_dtypes())
+ @pytest.mark.parametrize(
+ ['shape', 'index', 'items_changed'],
+ [((3,), ([0, 2],), 2),
+ ((3, 2), ([0, 2], slice(None)), 4),
+ ((3, 2), ([0, 2], [1]), 2),
+ ((3,), ([True, False, True]), 2)])
+ def test_structured_object_indexing(self, shape, index, items_changed,
+ dt, pat, count, singleton):
+ """Structured object reference counting for advanced indexing."""
+ zero = 0
+ one = 1
+
+ arr = np.zeros(shape, dt)
+
+ gc.collect()
+ before_zero = sys.getrefcount(zero)
+ before_one = sys.getrefcount(one)
+ # Test item getting:
+ part = arr[index]
+ after_zero = sys.getrefcount(zero)
+ assert after_zero - before_zero == count * items_changed
+ del part
+ # Test item setting:
+ arr[index] = one
+ gc.collect()
+ after_zero = sys.getrefcount(zero)
+ after_one = sys.getrefcount(one)
+ assert before_zero - after_zero == count * items_changed
+ assert after_one - before_one == count * items_changed
+
+ @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+ iter_struct_object_dtypes())
+ def test_structured_object_take_and_repeat(self, dt, pat, count, singleton):
+ """Structured object reference counting for specialized functions.
+ The older functions such as take and repeat use different code paths
+ than item setting (at the time of writing).
+ """
+ indices = [0, 1]
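+ # take([0, 1]) copies 2 elements (count * 2 new references); repeating
+ # the result 10 times should then add count * 2 * 10 more.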
+
+ arr = np.array([pat] * 3, dt)
+ gc.collect()
+ before = sys.getrefcount(singleton)
+ res = arr.take(indices)
+ after = sys.getrefcount(singleton)
+ assert after - before == count * 2
+ new = res.repeat(10)
+ gc.collect()
+ after_repeat = sys.getrefcount(singleton)
+ assert after_repeat - after == count * 2 * 10
+
+
+class TestStructuredDtypeSparseFields(object):
+ """Tests subarray fields which contain sparse dtypes so that
+ not all memory is used by the dtype work. Such dtype's should
+ leave the underlying memory unchanged.
+ """
+ dtype = np.dtype([('a', {'names':['aa', 'ab'], 'formats':['f', 'f'],
+ 'offsets':[0, 4]}, (2, 3))])
+ sparse_dtype = np.dtype([('a', {'names':['ab'], 'formats':['f'],
+ 'offsets':[4]}, (2, 3))])
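+ # In the sparse view only the float at offset 4 of each 8-byte element
+ # ('ab') is visible; the bytes holding 'aa' must stay untouched.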
+
+ @pytest.mark.xfail(reason="inaccessible data is changed see gh-12686.")
+ @pytest.mark.valgrind_error(reason="reads from uninitialized buffers.")
+ def test_sparse_field_assignment(self):
+ arr = np.zeros(3, self.dtype)
+ sparse_arr = arr.view(self.sparse_dtype)
+
+ sparse_arr[...] = np.finfo(np.float32).max
+ # dtype is reduced when accessing the field, so shape is (3, 2, 3):
+ assert_array_equal(arr["a"]["aa"], np.zeros((3, 2, 3)))
+
+ def test_sparse_field_assignment_fancy(self):
+ # Fancy assignment goes to the copyswap function for complex types:
+ arr = np.zeros(3, self.dtype)
+ sparse_arr = arr.view(self.sparse_dtype)
+
+ sparse_arr[[0, 1, 2]] = np.finfo(np.float32).max
+ # dtype is reduced when accessing the field, so shape is (3, 2, 3):
+ assert_array_equal(arr["a"]["aa"], np.zeros((3, 2, 3)))
+
+
+class TestMonsterType(object):
+ """Test deeply nested subtypes."""
+
+ def test1(self):
+ simple1 = np.dtype({'names': ['r', 'b'], 'formats': ['u1', 'u1'],
+ 'titles': ['Red pixel', 'Blue pixel']})
+ a = np.dtype([('yo', int), ('ye', simple1),
+ ('yi', np.dtype((int, (3, 2))))])
+ b = np.dtype([('yo', int), ('ye', simple1),
+ ('yi', np.dtype((int, (3, 2))))])
+ assert_dtype_equal(a, b)
+
+ c = np.dtype([('yo', int), ('ye', simple1),
+ ('yi', np.dtype((a, (3, 2))))])
+ d = np.dtype([('yo', int), ('ye', simple1),
+ ('yi', np.dtype((a, (3, 2))))])
+ assert_dtype_equal(c, d)
+
+class TestMetadata(object):
+ def test_no_metadata(self):
+ d = np.dtype(int)
+ assert_(d.metadata is None)
+
+ def test_metadata_takes_dict(self):
+ d = np.dtype(int, metadata={'datum': 1})
+ assert_(d.metadata == {'datum': 1})
+
+ def test_metadata_rejects_nondict(self):
+ assert_raises(TypeError, np.dtype, int, metadata='datum')
+ assert_raises(TypeError, np.dtype, int, metadata=1)
+ assert_raises(TypeError, np.dtype, int, metadata=None)
+
+ def test_nested_metadata(self):
+ d = np.dtype([('a', np.dtype(int, metadata={'datum': 1}))])
+ assert_(d['a'].metadata == {'datum': 1})
+
+ def test_base_metadata_copied(self):
+ d = np.dtype((np.void, np.dtype('i4,i4', metadata={'datum': 1})))
+ assert_(d.metadata == {'datum': 1})
+
+class TestString(object):
+ def test_complex_dtype_str(self):
+ dt = np.dtype([('top', [('tiles', ('>f4', (64, 64)), (1,)),
+ ('rtile', '>f4', (64, 36))], (3,)),
+ ('bottom', [('bleft', ('>f4', (8, 64)), (1,)),
+ ('bright', '>f4', (8, 36))])])
+ assert_equal(str(dt),
+ "[('top', [('tiles', ('>f4', (64, 64)), (1,)), "
+ "('rtile', '>f4', (64, 36))], (3,)), "
+ "('bottom', [('bleft', ('>f4', (8, 64)), (1,)), "
+ "('bright', '>f4', (8, 36))])]")
+
+ # If the sticky aligned flag is set to True, it makes the
+ # str() function use a dict representation with an 'aligned' flag
+ dt = np.dtype([('top', [('tiles', ('>f4', (64, 64)), (1,)),
+ ('rtile', '>f4', (64, 36))],
+ (3,)),
+ ('bottom', [('bleft', ('>f4', (8, 64)), (1,)),
+ ('bright', '>f4', (8, 36))])],
+ align=True)
+ assert_equal(str(dt),
+ "{'names':['top','bottom'], "
+ "'formats':[([('tiles', ('>f4', (64, 64)), (1,)), "
+ "('rtile', '>f4', (64, 36))], (3,)),"
+ "[('bleft', ('>f4', (8, 64)), (1,)), "
+ "('bright', '>f4', (8, 36))]], "
+ "'offsets':[0,76800], "
+ "'itemsize':80000, "
+ "'aligned':True}")
+ assert_equal(np.dtype(eval(str(dt))), dt)
+
+ dt = np.dtype({'names': ['r', 'g', 'b'], 'formats': ['u1', 'u1', 'u1'],
+ 'offsets': [0, 1, 2],
+ 'titles': ['Red pixel', 'Green pixel', 'Blue pixel']})
+ assert_equal(str(dt),
+ "[(('Red pixel', 'r'), 'u1'), "
+ "(('Green pixel', 'g'), 'u1'), "
+ "(('Blue pixel', 'b'), 'u1')]")
+
+ dt = np.dtype({'names': ['rgba', 'r', 'g', 'b'],
+ 'formats': ['<u4', 'u1', 'u1', 'u1'],
+ 'offsets': [0, 0, 1, 2],
+ 'titles': ['Color', 'Red pixel',
+ 'Green pixel', 'Blue pixel']})
+ assert_equal(str(dt),
+ "{'names':['rgba','r','g','b'],"
+ " 'formats':['<u4','u1','u1','u1'],"
+ " 'offsets':[0,0,1,2],"
+ " 'titles':['Color','Red pixel',"
+ "'Green pixel','Blue pixel'],"
+ " 'itemsize':4}")
+
+ dt = np.dtype({'names': ['r', 'b'], 'formats': ['u1', 'u1'],
+ 'offsets': [0, 2],
+ 'titles': ['Red pixel', 'Blue pixel']})
+ assert_equal(str(dt),
+ "{'names':['r','b'],"
+ " 'formats':['u1','u1'],"
+ " 'offsets':[0,2],"
+ " 'titles':['Red pixel','Blue pixel'],"
+ " 'itemsize':3}")
+
+ dt = np.dtype([('a', '<m8[D]'), ('b', '<M8[us]')])
+ assert_equal(str(dt),
+ "[('a', '<m8[D]'), ('b', '<M8[us]')]")
+
+ def test_repr_structured(self):
+ dt = np.dtype([('top', [('tiles', ('>f4', (64, 64)), (1,)),
+ ('rtile', '>f4', (64, 36))], (3,)),
+ ('bottom', [('bleft', ('>f4', (8, 64)), (1,)),
+ ('bright', '>f4', (8, 36))])])
+ assert_equal(repr(dt),
+ "dtype([('top', [('tiles', ('>f4', (64, 64)), (1,)), "
+ "('rtile', '>f4', (64, 36))], (3,)), "
+ "('bottom', [('bleft', ('>f4', (8, 64)), (1,)), "
+ "('bright', '>f4', (8, 36))])])")
+
+ dt = np.dtype({'names': ['r', 'g', 'b'], 'formats': ['u1', 'u1', 'u1'],
+ 'offsets': [0, 1, 2],
+ 'titles': ['Red pixel', 'Green pixel', 'Blue pixel']},
+ align=True)
+ assert_equal(repr(dt),
+ "dtype([(('Red pixel', 'r'), 'u1'), "
+ "(('Green pixel', 'g'), 'u1'), "
+ "(('Blue pixel', 'b'), 'u1')], align=True)")
+
+ def test_repr_structured_not_packed(self):
+ dt = np.dtype({'names': ['rgba', 'r', 'g', 'b'],
+ 'formats': ['<u4', 'u1', 'u1', 'u1'],
+ 'offsets': [0, 0, 1, 2],
+ 'titles': ['Color', 'Red pixel',
+ 'Green pixel', 'Blue pixel']}, align=True)
+ assert_equal(repr(dt),
+ "dtype({'names':['rgba','r','g','b'],"
+ " 'formats':['<u4','u1','u1','u1'],"
+ " 'offsets':[0,0,1,2],"
+ " 'titles':['Color','Red pixel',"
+ "'Green pixel','Blue pixel'],"
+ " 'itemsize':4}, align=True)")
+
+ dt = np.dtype({'names': ['r', 'b'], 'formats': ['u1', 'u1'],
+ 'offsets': [0, 2],
+ 'titles': ['Red pixel', 'Blue pixel'],
+ 'itemsize': 4})
+ assert_equal(repr(dt),
+ "dtype({'names':['r','b'], "
+ "'formats':['u1','u1'], "
+ "'offsets':[0,2], "
+ "'titles':['Red pixel','Blue pixel'], "
+ "'itemsize':4})")
+
+ def test_repr_structured_datetime(self):
+ dt = np.dtype([('a', '<M8[D]'), ('b', '<m8[us]')])
+ assert_equal(repr(dt),
+ "dtype([('a', '<M8[D]'), ('b', '<m8[us]')])")
+
+ def test_repr_str_subarray(self):
+ dt = np.dtype(('<i2', (1,)))
+ assert_equal(repr(dt), "dtype(('<i2', (1,)))")
+ assert_equal(str(dt), "('<i2', (1,))")
+
+ @pytest.mark.skipif(sys.version_info[0] >= 3, reason="Python 2 only")
+ def test_dtype_str_with_long_in_shape(self):
+ # Pull request #376, should not error
+ np.dtype('(1L,)i4')
+
+ def test_base_dtype_with_object_type(self):
+ # Issue gh-2798, should not error.
+ np.array(['a'], dtype="O").astype(("O", [("name", "O")]))
+
+ def test_empty_string_to_object(self):
+ # Pull request #4722
+ np.array(["", ""]).astype(object)
+
+ def test_void_subclass_unsized(self):
+ dt = np.dtype(np.record)
+ assert_equal(repr(dt), "dtype('V')")
+ assert_equal(str(dt), '|V0')
+ assert_equal(dt.name, 'record')
+
+ def test_void_subclass_sized(self):
+ dt = np.dtype((np.record, 2))
+ assert_equal(repr(dt), "dtype('V2')")
+ assert_equal(str(dt), '|V2')
+ assert_equal(dt.name, 'record16')
+
+ def test_void_subclass_fields(self):
+ dt = np.dtype((np.record, [('a', '<u2')]))
+ assert_equal(repr(dt), "dtype((numpy.record, [('a', '<u2')]))")
+ assert_equal(str(dt), "(numpy.record, [('a', '<u2')])")
+ assert_equal(dt.name, 'record16')
+
+
+class TestDtypeAttributeDeletion(object):
+
+ def test_dtype_non_writable_attributes_deletion(self):
+ dt = np.dtype(np.double)
+ attr = ["subdtype", "descr", "str", "name", "base", "shape",
+ "isbuiltin", "isnative", "isalignedstruct", "fields",
+ "metadata", "hasobject"]
+
+ for s in attr:
+ assert_raises(AttributeError, delattr, dt, s)
+
+ def test_dtype_writable_attributes_deletion(self):
+ dt = np.dtype(np.double)
+ attr = ["names"]
+ for s in attr:
+ assert_raises(AttributeError, delattr, dt, s)
+
+
+class TestDtypeAttributes(object):
+ def test_descr_has_trailing_void(self):
+ # see gh-6359
+ dtype = np.dtype({
+ 'names': ['A', 'B'],
+ 'formats': ['f4', 'f4'],
+ 'offsets': [0, 8],
+ 'itemsize': 16})
+ new_dtype = np.dtype(dtype.descr)
+ assert_equal(new_dtype.itemsize, 16)
+
+ @pytest.mark.parametrize('t', np.typeDict.values())
+ def test_name_builtin(self, t):
+ name = t.__name__
+ if name.endswith('_'):
+ name = name[:-1]
+ assert_equal(np.dtype(t).name, name)
+
+ def test_name_dtype_subclass(self):
+ # Ticket #4357
+ class user_def_subcls(np.void):
+ pass
+ assert_equal(np.dtype(user_def_subcls).name, 'user_def_subcls')
+
+
+class TestPickling(object):
+
+ def check_pickling(self, dtype):
+ for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+ pickled = pickle.loads(pickle.dumps(dtype, proto))
+ assert_equal(pickled, dtype)
+ assert_equal(pickled.descr, dtype.descr)
+ if dtype.metadata is not None:
+ assert_equal(pickled.metadata, dtype.metadata)
+ # Check the reconstructed dtype is functional
+ x = np.zeros(3, dtype=dtype)
+ y = np.zeros(3, dtype=pickled)
+ assert_equal(x, y)
+ assert_equal(x[0], y[0])
+
+ @pytest.mark.parametrize('t', [int, float, complex, np.int32, str, object,
+ np.unicode, bool])
+ def test_builtin(self, t):
+ self.check_pickling(np.dtype(t))
+
+ def test_structured(self):
+ dt = np.dtype(([('a', '>f4', (2, 1)), ('b', '<f8', (1, 3))], (2, 2)))
+ self.check_pickling(dt)
+
+ def test_structured_aligned(self):
+ dt = np.dtype('i4, i1', align=True)
+ self.check_pickling(dt)
+
+ def test_structured_unaligned(self):
+ dt = np.dtype('i4, i1', align=False)
+ self.check_pickling(dt)
+
+ def test_structured_padded(self):
+ dt = np.dtype({
+ 'names': ['A', 'B'],
+ 'formats': ['f4', 'f4'],
+ 'offsets': [0, 8],
+ 'itemsize': 16})
+ self.check_pickling(dt)
+
+ def test_structured_titles(self):
+ dt = np.dtype({'names': ['r', 'b'],
+ 'formats': ['u1', 'u1'],
+ 'titles': ['Red pixel', 'Blue pixel']})
+ self.check_pickling(dt)
+
+ @pytest.mark.parametrize('base', ['m8', 'M8'])
+ @pytest.mark.parametrize('unit', ['', 'Y', 'M', 'W', 'D', 'h', 'm', 's',
+ 'ms', 'us', 'ns', 'ps', 'fs', 'as'])
+ def test_datetime(self, base, unit):
+ dt = np.dtype('%s[%s]' % (base, unit) if unit else base)
+ self.check_pickling(dt)
+ if unit:
+ dt = np.dtype('%s[7%s]' % (base, unit))
+ self.check_pickling(dt)
+
+ def test_metadata(self):
+ dt = np.dtype(int, metadata={'datum': 1})
+ self.check_pickling(dt)
+
+
+def test_rational_dtype():
+ # test for bug gh-5719
+ a = np.array([1111], dtype=rational).astype
+ assert_raises(OverflowError, a, 'int8')
+
+ # test that dtype detection finds user-defined types
+ x = rational(1)
+ assert_equal(np.array([x,x]).dtype, np.dtype(rational))
+
+
+def test_dtypes_are_true():
+ # test for gh-6294
+ assert bool(np.dtype('f8'))
+ assert bool(np.dtype('i8'))
+ assert bool(np.dtype([('a', 'i8'), ('b', 'f4')]))
+
+
+def test_invalid_dtype_string():
+ # test for gh-10440
+ assert_raises(TypeError, np.dtype, 'f8,i8,[f8,i8]')
+ assert_raises(TypeError, np.dtype, u'Fl\xfcgel')
+
+
+class TestFromCTypes(object):
+
+ @staticmethod
+ def check(ctype, dtype):
+ dtype = np.dtype(dtype)
+ assert_equal(np.dtype(ctype), dtype)
+ assert_equal(np.dtype(ctype()), dtype)
+
+ def test_array(self):
+ c8 = ctypes.c_uint8
+ self.check( 3 * c8, (np.uint8, (3,)))
+ self.check( 1 * c8, (np.uint8, (1,)))
+ self.check( 0 * c8, (np.uint8, (0,)))
+ self.check(1 * (3 * c8), ((np.uint8, (3,)), (1,)))
+ self.check(3 * (1 * c8), ((np.uint8, (1,)), (3,)))
+
+ def test_padded_structure(self):
+ class PaddedStruct(ctypes.Structure):
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16)
+ ]
+ expected = np.dtype([
+ ('a', np.uint8),
+ ('b', np.uint16)
+ ], align=True)
+ self.check(PaddedStruct, expected)
+
+ def test_bit_fields(self):
+ class BitfieldStruct(ctypes.Structure):
+ _fields_ = [
+ ('a', ctypes.c_uint8, 7),
+ ('b', ctypes.c_uint8, 1)
+ ]
+ assert_raises(TypeError, np.dtype, BitfieldStruct)
+ assert_raises(TypeError, np.dtype, BitfieldStruct())
+
+ def test_pointer(self):
+ p_uint8 = ctypes.POINTER(ctypes.c_uint8)
+ assert_raises(TypeError, np.dtype, p_uint8)
+
+ def test_void_pointer(self):
+ self.check(ctypes.c_void_p, np.uintp)
+
+ def test_union(self):
+ class Union(ctypes.Union):
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16),
+ ]
+ expected = np.dtype(dict(
+ names=['a', 'b'],
+ formats=[np.uint8, np.uint16],
+ offsets=[0, 0],
+ itemsize=2
+ ))
+ self.check(Union, expected)
+
+ def test_union_with_struct_packed(self):
+ class Struct(ctypes.Structure):
+ _pack_ = 1
+ _fields_ = [
+ ('one', ctypes.c_uint8),
+ ('two', ctypes.c_uint32)
+ ]
+
+ class Union(ctypes.Union):
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16),
+ ('c', ctypes.c_uint32),
+ ('d', Struct),
+ ]
+ expected = np.dtype(dict(
+ names=['a', 'b', 'c', 'd'],
+ formats=['u1', np.uint16, np.uint32, [('one', 'u1'), ('two', np.uint32)]],
+ offsets=[0, 0, 0, 0],
+ itemsize=ctypes.sizeof(Union)
+ ))
+ self.check(Union, expected)
+
+ def test_union_packed(self):
+ class Struct(ctypes.Structure):
+ _fields_ = [
+ ('one', ctypes.c_uint8),
+ ('two', ctypes.c_uint32)
+ ]
+ _pack_ = 1
+ class Union(ctypes.Union):
+ _pack_ = 1
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16),
+ ('c', ctypes.c_uint32),
+ ('d', Struct),
+ ]
+ expected = np.dtype(dict(
+ names=['a', 'b', 'c', 'd'],
+ formats=['u1', np.uint16, np.uint32, [('one', 'u1'), ('two', np.uint32)]],
+ offsets=[0, 0, 0, 0],
+ itemsize=ctypes.sizeof(Union)
+ ))
+ self.check(Union, expected)
+
+ def test_packed_structure(self):
+ class PackedStructure(ctypes.Structure):
+ _pack_ = 1
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16)
+ ]
+ expected = np.dtype([
+ ('a', np.uint8),
+ ('b', np.uint16)
+ ])
+ self.check(PackedStructure, expected)
+
+ def test_large_packed_structure(self):
+ class PackedStructure(ctypes.Structure):
+ _pack_ = 2
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16),
+ ('c', ctypes.c_uint8),
+ ('d', ctypes.c_uint16),
+ ('e', ctypes.c_uint32),
+ ('f', ctypes.c_uint32),
+ ('g', ctypes.c_uint8)
+ ]
+ expected = np.dtype(dict(
+ formats=[np.uint8, np.uint16, np.uint8, np.uint16, np.uint32, np.uint32, np.uint8 ],
+ offsets=[0, 2, 4, 6, 8, 12, 16],
+ names=['a', 'b', 'c', 'd', 'e', 'f', 'g'],
+ itemsize=18))
+ self.check(PackedStructure, expected)
+
+ def test_big_endian_structure_packed(self):
+ class BigEndStruct(ctypes.BigEndianStructure):
+ _fields_ = [
+ ('one', ctypes.c_uint8),
+ ('two', ctypes.c_uint32)
+ ]
+ _pack_ = 1
+ expected = np.dtype([('one', 'u1'), ('two', '>u4')])
+ self.check(BigEndStruct, expected)
+
+ def test_little_endian_structure_packed(self):
+ class LittleEndStruct(ctypes.LittleEndianStructure):
+ _fields_ = [
+ ('one', ctypes.c_uint8),
+ ('two', ctypes.c_uint32)
+ ]
+ _pack_ = 1
+ expected = np.dtype([('one', 'u1'), ('two', '<u4')])
+ self.check(LittleEndStruct, expected)
+
+ def test_little_endian_structure(self):
+ class PaddedStruct(ctypes.LittleEndianStructure):
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16)
+ ]
+ expected = np.dtype([
+ ('a', '<B'),
+ ('b', '<H')
+ ], align=True)
+ self.check(PaddedStruct, expected)
+
+ def test_big_endian_structure(self):
+ class PaddedStruct(ctypes.BigEndianStructure):
+ _fields_ = [
+ ('a', ctypes.c_uint8),
+ ('b', ctypes.c_uint16)
+ ]
+ expected = np.dtype([
+ ('a', '>B'),
+ ('b', '>H')
+ ], align=True)
+ self.check(PaddedStruct, expected)
+
+ def test_simple_endian_types(self):
+ self.check(ctypes.c_uint16.__ctype_le__, np.dtype('<u2'))
+ self.check(ctypes.c_uint16.__ctype_be__, np.dtype('>u2'))
+ self.check(ctypes.c_uint8.__ctype_le__, np.dtype('u1'))
+ self.check(ctypes.c_uint8.__ctype_be__, np.dtype('u1'))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_einsum.py b/contrib/python/numpy/py2/numpy/core/tests/test_einsum.py
new file mode 100644
index 00000000000..1b5b4cb2624
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_einsum.py
@@ -0,0 +1,1014 @@
+from __future__ import division, absolute_import, print_function
+
+import itertools
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_almost_equal,
+ assert_raises, suppress_warnings, assert_raises_regex, assert_allclose
+ )
+
+# Setup for optimize einsum
+chars = 'abcdefghij'
+sizes = np.array([2, 3, 4, 5, 4, 3, 2, 6, 5, 4, 3])
+global_size_dict = dict(zip(chars, sizes))
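+# (The dict maps subscript letters to dimension sizes; optimize_compare
+#  and build_operands below use it to create random operands.)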
+
+
+class TestEinsum(object):
+ def test_einsum_errors(self):
+ for do_opt in [True, False]:
+ # Need enough arguments
+ assert_raises(ValueError, np.einsum, optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "", optimize=do_opt)
+
+ # subscripts must be a string
+ assert_raises(TypeError, np.einsum, 0, 0, optimize=do_opt)
+
+ # out parameter must be an array
+ assert_raises(TypeError, np.einsum, "", 0, out='test',
+ optimize=do_opt)
+
+ # order parameter must be a valid order
+ assert_raises(TypeError, np.einsum, "", 0, order='W',
+ optimize=do_opt)
+
+ # casting parameter must be a valid casting
+ assert_raises(ValueError, np.einsum, "", 0, casting='blah',
+ optimize=do_opt)
+
+ # dtype parameter must be a valid dtype
+ assert_raises(TypeError, np.einsum, "", 0, dtype='bad_data_type',
+ optimize=do_opt)
+
+ # other keyword arguments are rejected
+ assert_raises(TypeError, np.einsum, "", 0, bad_arg=0,
+ optimize=do_opt)
+
+ # issue 4528 revealed a segfault with this call
+ assert_raises(TypeError, np.einsum, *(None,)*63, optimize=do_opt)
+
+ # number of operands must match count in subscripts string
+ assert_raises(ValueError, np.einsum, "", 0, 0, optimize=do_opt)
+ assert_raises(ValueError, np.einsum, ",", 0, [0], [0],
+ optimize=do_opt)
+ assert_raises(ValueError, np.einsum, ",", [0], optimize=do_opt)
+
+ # can't have more subscripts than dimensions in the operand
+ assert_raises(ValueError, np.einsum, "i", 0, optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "ij", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "...i", 0, optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "i...j", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "i...", 0, optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "ij...", [0, 0], optimize=do_opt)
+
+ # invalid ellipsis
+ assert_raises(ValueError, np.einsum, "i..", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, ".i...", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "j->..j", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "j->.j...", [0, 0], optimize=do_opt)
+
+ # invalid subscript character
+ assert_raises(ValueError, np.einsum, "i%...", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "...j$", [0, 0], optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "i->&", [0, 0], optimize=do_opt)
+
+ # output subscripts must appear in input
+ assert_raises(ValueError, np.einsum, "i->ij", [0, 0], optimize=do_opt)
+
+ # output subscripts may only be specified once
+ assert_raises(ValueError, np.einsum, "ij->jij", [[0, 0], [0, 0]],
+ optimize=do_opt)
+
+ # dimensions must match when being collapsed
+ assert_raises(ValueError, np.einsum, "ii",
+ np.arange(6).reshape(2, 3), optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "ii->i",
+ np.arange(6).reshape(2, 3), optimize=do_opt)
+
+ # broadcasting to new dimensions must be enabled explicitly
+ assert_raises(ValueError, np.einsum, "i", np.arange(6).reshape(2, 3),
+ optimize=do_opt)
+ assert_raises(ValueError, np.einsum, "i->i", [[0, 1], [0, 1]],
+ out=np.arange(4).reshape(2, 2), optimize=do_opt)
+ with assert_raises_regex(ValueError, "'b'"):
+ # gh-11221 - 'c' erroneously appeared in the error message
+ a = np.ones((3, 3, 4, 5, 6))
+ b = np.ones((3, 4, 5))
+ np.einsum('aabcb,abc', a, b)
+
+ def test_einsum_views(self):
+ # pass-through
+ for do_opt in [True, False]:
+ a = np.arange(6)
+ a.shape = (2, 3)
+
+ b = np.einsum("...", a, optimize=do_opt)
+ assert_(b.base is a)
+
+ b = np.einsum(a, [Ellipsis], optimize=do_opt)
+ assert_(b.base is a)
+
+ b = np.einsum("ij", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, a)
+
+ b = np.einsum(a, [0, 1], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, a)
+
+ # output is writeable whenever input is writeable
+ b = np.einsum("...", a, optimize=do_opt)
+ assert_(b.flags['WRITEABLE'])
+ a.flags['WRITEABLE'] = False
+ b = np.einsum("...", a, optimize=do_opt)
+ assert_(not b.flags['WRITEABLE'])
+
+ # transpose
+ a = np.arange(6)
+ a.shape = (2, 3)
+
+ b = np.einsum("ji", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, a.T)
+
+ b = np.einsum(a, [1, 0], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, a.T)
+
+ # diagonal
+ a = np.arange(9)
+ a.shape = (3, 3)
+
+ b = np.einsum("ii->i", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[i, i] for i in range(3)])
+
+ b = np.einsum(a, [0, 0], [0], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[i, i] for i in range(3)])
+
+ # diagonal with various ways of broadcasting an additional dimension
+ a = np.arange(27)
+ a.shape = (3, 3, 3)
+
+ b = np.einsum("...ii->...i", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i, i] for i in range(3)] for x in a])
+
+ b = np.einsum(a, [Ellipsis, 0, 0], [Ellipsis, 0], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i, i] for i in range(3)] for x in a])
+
+ b = np.einsum("ii...->...i", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i, i] for i in range(3)]
+ for x in a.transpose(2, 0, 1)])
+
+ b = np.einsum(a, [0, 0, Ellipsis], [Ellipsis, 0], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i, i] for i in range(3)]
+ for x in a.transpose(2, 0, 1)])
+
+ b = np.einsum("...ii->i...", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[:, i, i] for i in range(3)])
+
+ b = np.einsum(a, [Ellipsis, 0, 0], [0, Ellipsis], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[:, i, i] for i in range(3)])
+
+ b = np.einsum("jii->ij", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[:, i, i] for i in range(3)])
+
+ b = np.einsum(a, [1, 0, 0], [0, 1], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[:, i, i] for i in range(3)])
+
+ b = np.einsum("ii...->i...", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a.transpose(2, 0, 1)[:, i, i] for i in range(3)])
+
+ b = np.einsum(a, [0, 0, Ellipsis], [0, Ellipsis], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a.transpose(2, 0, 1)[:, i, i] for i in range(3)])
+
+ b = np.einsum("i...i->i...", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a.transpose(1, 0, 2)[:, i, i] for i in range(3)])
+
+ b = np.einsum(a, [0, Ellipsis, 0], [0, Ellipsis], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a.transpose(1, 0, 2)[:, i, i] for i in range(3)])
+
+ b = np.einsum("i...i->...i", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i, i] for i in range(3)]
+ for x in a.transpose(1, 0, 2)])
+
+ b = np.einsum(a, [0, Ellipsis, 0], [Ellipsis, 0], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i, i] for i in range(3)]
+ for x in a.transpose(1, 0, 2)])
+
+ # triple diagonal
+ a = np.arange(27)
+ a.shape = (3, 3, 3)
+
+ b = np.einsum("iii->i", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[i, i, i] for i in range(3)])
+
+ b = np.einsum(a, [0, 0, 0], [0], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, [a[i, i, i] for i in range(3)])
+
+ # swap axes
+ a = np.arange(24)
+ a.shape = (2, 3, 4)
+
+ b = np.einsum("ijk->jik", a, optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, a.swapaxes(0, 1))
+
+ b = np.einsum(a, [0, 1, 2], [1, 0, 2], optimize=do_opt)
+ assert_(b.base is a)
+ assert_equal(b, a.swapaxes(0, 1))
+
+ def check_einsum_sums(self, dtype, do_opt=False):
+ # Check various sums. Does many sizes to exercise unrolled loops.
+
+ # sum(a, axis=-1)
+ for n in range(1, 17):
+ a = np.arange(n, dtype=dtype)
+ assert_equal(np.einsum("i->", a, optimize=do_opt),
+ np.sum(a, axis=-1).astype(dtype))
+ assert_equal(np.einsum(a, [0], [], optimize=do_opt),
+ np.sum(a, axis=-1).astype(dtype))
+
+ for n in range(1, 17):
+ a = np.arange(2*3*n, dtype=dtype).reshape(2, 3, n)
+ assert_equal(np.einsum("...i->...", a, optimize=do_opt),
+ np.sum(a, axis=-1).astype(dtype))
+ assert_equal(np.einsum(a, [Ellipsis, 0], [Ellipsis], optimize=do_opt),
+ np.sum(a, axis=-1).astype(dtype))
+
+ # sum(a, axis=0)
+ for n in range(1, 17):
+ a = np.arange(2*n, dtype=dtype).reshape(2, n)
+ assert_equal(np.einsum("i...->...", a, optimize=do_opt),
+ np.sum(a, axis=0).astype(dtype))
+ assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt),
+ np.sum(a, axis=0).astype(dtype))
+
+ for n in range(1, 17):
+ a = np.arange(2*3*n, dtype=dtype).reshape(2, 3, n)
+ assert_equal(np.einsum("i...->...", a, optimize=do_opt),
+ np.sum(a, axis=0).astype(dtype))
+ assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt),
+ np.sum(a, axis=0).astype(dtype))
+
+ # trace(a)
+ for n in range(1, 17):
+ a = np.arange(n*n, dtype=dtype).reshape(n, n)
+ assert_equal(np.einsum("ii", a, optimize=do_opt),
+ np.trace(a).astype(dtype))
+ assert_equal(np.einsum(a, [0, 0], optimize=do_opt),
+ np.trace(a).astype(dtype))
+
+ # multiply(a, b)
+ assert_equal(np.einsum("..., ...", 3, 4), 12) # scalar case
+ for n in range(1, 17):
+ a = np.arange(3 * n, dtype=dtype).reshape(3, n)
+ b = np.arange(2 * 3 * n, dtype=dtype).reshape(2, 3, n)
+ assert_equal(np.einsum("..., ...", a, b, optimize=do_opt),
+ np.multiply(a, b))
+ assert_equal(np.einsum(a, [Ellipsis], b, [Ellipsis], optimize=do_opt),
+ np.multiply(a, b))
+
+ # inner(a,b)
+ for n in range(1, 17):
+ a = np.arange(2 * 3 * n, dtype=dtype).reshape(2, 3, n)
+ b = np.arange(n, dtype=dtype)
+ assert_equal(np.einsum("...i, ...i", a, b, optimize=do_opt), np.inner(a, b))
+ assert_equal(np.einsum(a, [Ellipsis, 0], b, [Ellipsis, 0], optimize=do_opt),
+ np.inner(a, b))
+
+ for n in range(1, 11):
+ a = np.arange(n * 3 * 2, dtype=dtype).reshape(n, 3, 2)
+ b = np.arange(n, dtype=dtype)
+ assert_equal(np.einsum("i..., i...", a, b, optimize=do_opt),
+ np.inner(a.T, b.T).T)
+ assert_equal(np.einsum(a, [0, Ellipsis], b, [0, Ellipsis], optimize=do_opt),
+ np.inner(a.T, b.T).T)
+
+ # outer(a,b)
+ for n in range(1, 17):
+ a = np.arange(3, dtype=dtype)+1
+ b = np.arange(n, dtype=dtype)+1
+ assert_equal(np.einsum("i,j", a, b, optimize=do_opt),
+ np.outer(a, b))
+ assert_equal(np.einsum(a, [0], b, [1], optimize=do_opt),
+ np.outer(a, b))
+
+ # Suppress the complex warnings for the 'as f8' tests
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning)
+
+ # matvec(a,b) / a.dot(b) where a is matrix, b is vector
+ for n in range(1, 17):
+ a = np.arange(4*n, dtype=dtype).reshape(4, n)
+ b = np.arange(n, dtype=dtype)
+ assert_equal(np.einsum("ij, j", a, b, optimize=do_opt),
+ np.dot(a, b))
+ assert_equal(np.einsum(a, [0, 1], b, [1], optimize=do_opt),
+ np.dot(a, b))
+
+ c = np.arange(4, dtype=dtype)
+ np.einsum("ij,j", a, b, out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c,
+ np.dot(a.astype('f8'),
+ b.astype('f8')).astype(dtype))
+ c[...] = 0
+ np.einsum(a, [0, 1], b, [1], out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c,
+ np.dot(a.astype('f8'),
+ b.astype('f8')).astype(dtype))
+
+ for n in range(1, 17):
+ a = np.arange(4*n, dtype=dtype).reshape(4, n)
+ b = np.arange(n, dtype=dtype)
+ assert_equal(np.einsum("ji,j", a.T, b.T, optimize=do_opt),
+ np.dot(b.T, a.T))
+ assert_equal(np.einsum(a.T, [1, 0], b.T, [1], optimize=do_opt),
+ np.dot(b.T, a.T))
+
+ c = np.arange(4, dtype=dtype)
+ np.einsum("ji,j", a.T, b.T, out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c,
+ np.dot(b.T.astype('f8'),
+ a.T.astype('f8')).astype(dtype))
+ c[...] = 0
+ np.einsum(a.T, [1, 0], b.T, [1], out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c,
+ np.dot(b.T.astype('f8'),
+ a.T.astype('f8')).astype(dtype))
+
+ # matmat(a,b) / a.dot(b) where a is matrix, b is matrix
+ for n in range(1, 17):
+ if n < 8 or dtype != 'f2':
+ a = np.arange(4*n, dtype=dtype).reshape(4, n)
+ b = np.arange(n*6, dtype=dtype).reshape(n, 6)
+ assert_equal(np.einsum("ij,jk", a, b, optimize=do_opt),
+ np.dot(a, b))
+ assert_equal(np.einsum(a, [0, 1], b, [1, 2], optimize=do_opt),
+ np.dot(a, b))
+
+ for n in range(1, 17):
+ a = np.arange(4*n, dtype=dtype).reshape(4, n)
+ b = np.arange(n*6, dtype=dtype).reshape(n, 6)
+ c = np.arange(24, dtype=dtype).reshape(4, 6)
+ np.einsum("ij,jk", a, b, out=c, dtype='f8', casting='unsafe',
+ optimize=do_opt)
+ assert_equal(c,
+ np.dot(a.astype('f8'),
+ b.astype('f8')).astype(dtype))
+ c[...] = 0
+ np.einsum(a, [0, 1], b, [1, 2], out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c,
+ np.dot(a.astype('f8'),
+ b.astype('f8')).astype(dtype))
+
+ # matrix triple product (note this is not currently an efficient
+ # way to multiply 3 matrices)
+ a = np.arange(12, dtype=dtype).reshape(3, 4)
+ b = np.arange(20, dtype=dtype).reshape(4, 5)
+ c = np.arange(30, dtype=dtype).reshape(5, 6)
+ if dtype != 'f2':
+ assert_equal(np.einsum("ij,jk,kl", a, b, c, optimize=do_opt),
+ a.dot(b).dot(c))
+ assert_equal(np.einsum(a, [0, 1], b, [1, 2], c, [2, 3],
+ optimize=do_opt), a.dot(b).dot(c))
+
+ d = np.arange(18, dtype=dtype).reshape(3, 6)
+ np.einsum("ij,jk,kl", a, b, c, out=d,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ tgt = a.astype('f8').dot(b.astype('f8'))
+ tgt = tgt.dot(c.astype('f8')).astype(dtype)
+ assert_equal(d, tgt)
+
+ d[...] = 0
+ np.einsum(a, [0, 1], b, [1, 2], c, [2, 3], out=d,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ tgt = a.astype('f8').dot(b.astype('f8'))
+ tgt = tgt.dot(c.astype('f8')).astype(dtype)
+ assert_equal(d, tgt)
+
+ # tensordot(a, b)
+ if np.dtype(dtype) != np.dtype('f2'):
+ a = np.arange(60, dtype=dtype).reshape(3, 4, 5)
+ b = np.arange(24, dtype=dtype).reshape(4, 3, 2)
+ assert_equal(np.einsum("ijk, jil -> kl", a, b),
+ np.tensordot(a, b, axes=([1, 0], [0, 1])))
+ assert_equal(np.einsum(a, [0, 1, 2], b, [1, 0, 3], [2, 3]),
+ np.tensordot(a, b, axes=([1, 0], [0, 1])))
+
+ c = np.arange(10, dtype=dtype).reshape(5, 2)
+ np.einsum("ijk,jil->kl", a, b, out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c, np.tensordot(a.astype('f8'), b.astype('f8'),
+ axes=([1, 0], [0, 1])).astype(dtype))
+ c[...] = 0
+ np.einsum(a, [0, 1, 2], b, [1, 0, 3], [2, 3], out=c,
+ dtype='f8', casting='unsafe', optimize=do_opt)
+ assert_equal(c, np.tensordot(a.astype('f8'), b.astype('f8'),
+ axes=([1, 0], [0, 1])).astype(dtype))
+
+ # logical_and(logical_and(a!=0, b!=0), c!=0)
+ a = np.array([1, 3, -2, 0, 12, 13, 0, 1], dtype=dtype)
+ b = np.array([0, 3.5, 0., -2, 0, 1, 3, 12], dtype=dtype)
+ c = np.array([True, True, False, True, True, False, True, True])
+ assert_equal(np.einsum("i,i,i->i", a, b, c,
+ dtype='?', casting='unsafe', optimize=do_opt),
+ np.logical_and(np.logical_and(a != 0, b != 0), c != 0))
+ assert_equal(np.einsum(a, [0], b, [0], c, [0], [0],
+ dtype='?', casting='unsafe'),
+ np.logical_and(np.logical_and(a != 0, b != 0), c != 0))
+
+ a = np.arange(9, dtype=dtype)
+ assert_equal(np.einsum(",i->", 3, a), 3*np.sum(a))
+ assert_equal(np.einsum(3, [], a, [0], []), 3*np.sum(a))
+ assert_equal(np.einsum("i,->", a, 3), 3*np.sum(a))
+ assert_equal(np.einsum(a, [0], 3, [], []), 3*np.sum(a))
+
+ # Various stride0, contiguous, and SSE aligned variants
+ for n in range(1, 25):
+ a = np.arange(n, dtype=dtype)
+ if np.dtype(dtype).itemsize > 1:
+ assert_equal(np.einsum("...,...", a, a, optimize=do_opt),
+ np.multiply(a, a))
+ assert_equal(np.einsum("i,i", a, a, optimize=do_opt), np.dot(a, a))
+ assert_equal(np.einsum("i,->i", a, 2, optimize=do_opt), 2*a)
+ assert_equal(np.einsum(",i->i", 2, a, optimize=do_opt), 2*a)
+ assert_equal(np.einsum("i,->", a, 2, optimize=do_opt), 2*np.sum(a))
+ assert_equal(np.einsum(",i->", 2, a, optimize=do_opt), 2*np.sum(a))
+
+ assert_equal(np.einsum("...,...", a[1:], a[:-1], optimize=do_opt),
+ np.multiply(a[1:], a[:-1]))
+ assert_equal(np.einsum("i,i", a[1:], a[:-1], optimize=do_opt),
+ np.dot(a[1:], a[:-1]))
+ assert_equal(np.einsum("i,->i", a[1:], 2, optimize=do_opt), 2*a[1:])
+ assert_equal(np.einsum(",i->i", 2, a[1:], optimize=do_opt), 2*a[1:])
+ assert_equal(np.einsum("i,->", a[1:], 2, optimize=do_opt),
+ 2*np.sum(a[1:]))
+ assert_equal(np.einsum(",i->", 2, a[1:], optimize=do_opt),
+ 2*np.sum(a[1:]))
+
+ # An object array, summed as the data type
+ a = np.arange(9, dtype=object)
+
+ b = np.einsum("i->", a, dtype=dtype, casting='unsafe')
+ assert_equal(b, np.sum(a))
+ assert_equal(b.dtype, np.dtype(dtype))
+
+ b = np.einsum(a, [0], [], dtype=dtype, casting='unsafe')
+ assert_equal(b, np.sum(a))
+ assert_equal(b.dtype, np.dtype(dtype))
+
+ # A case which was failing (ticket #1885)
+ p = np.arange(2) + 1
+ q = np.arange(4).reshape(2, 2) + 3
+ r = np.arange(4).reshape(2, 2) + 7
+ assert_equal(np.einsum('z,mz,zm->', p, q, r), 253)
+
+ # singleton dimensions broadcast (gh-10343)
+ p = np.ones((10,2))
+ q = np.ones((1,2))
+ assert_array_equal(np.einsum('ij,ij->j', p, q, optimize=True),
+ np.einsum('ij,ij->j', p, q, optimize=False))
+ assert_array_equal(np.einsum('ij,ij->j', p, q, optimize=True),
+ [10.] * 2)
+
+ # a blas-compatible contraction broadcasting case which was failing
+ # for optimize=True (ticket #10930)
+ x = np.array([2., 3.])
+ y = np.array([4.])
+ assert_array_equal(np.einsum("i, i", x, y, optimize=False), 20.)
+ assert_array_equal(np.einsum("i, i", x, y, optimize=True), 20.)
+
+ # an all-ones array was bypassing the bug (ticket #10930)
+ p = np.ones((1, 5)) / 2
+ q = np.ones((5, 5)) / 2
+ for optimize in (True, False):
+ assert_array_equal(np.einsum("...ij,...jk->...ik", p, p,
+ optimize=optimize),
+ np.einsum("...ij,...jk->...ik", p, q,
+ optimize=optimize))
+ assert_array_equal(np.einsum("...ij,...jk->...ik", p, q,
+ optimize=optimize),
+ np.full((1, 5), 1.25))
+
+ # Cases which were failing (gh-10899)
+ x = np.eye(2, dtype=dtype)
+ y = np.ones(2, dtype=dtype)
+ assert_array_equal(np.einsum("ji,i->", x, y, optimize=optimize),
+ [2.]) # contig_contig_outstride0_two
+ assert_array_equal(np.einsum("i,ij->", y, x, optimize=optimize),
+ [2.]) # stride0_contig_outstride0_two
+ assert_array_equal(np.einsum("ij,i->", x, y, optimize=optimize),
+ [2.]) # contig_stride0_outstride0_two
+
+ def test_einsum_sums_int8(self):
+ self.check_einsum_sums('i1')
+
+ def test_einsum_sums_uint8(self):
+ self.check_einsum_sums('u1')
+
+ def test_einsum_sums_int16(self):
+ self.check_einsum_sums('i2')
+
+ def test_einsum_sums_uint16(self):
+ self.check_einsum_sums('u2')
+
+ def test_einsum_sums_int32(self):
+ self.check_einsum_sums('i4')
+ self.check_einsum_sums('i4', True)
+
+ def test_einsum_sums_uint32(self):
+ self.check_einsum_sums('u4')
+ self.check_einsum_sums('u4', True)
+
+ def test_einsum_sums_int64(self):
+ self.check_einsum_sums('i8')
+
+ def test_einsum_sums_uint64(self):
+ self.check_einsum_sums('u8')
+
+ def test_einsum_sums_float16(self):
+ self.check_einsum_sums('f2')
+
+ def test_einsum_sums_float32(self):
+ self.check_einsum_sums('f4')
+
+ def test_einsum_sums_float64(self):
+ self.check_einsum_sums('f8')
+ self.check_einsum_sums('f8', True)
+
+ def test_einsum_sums_longdouble(self):
+ self.check_einsum_sums(np.longdouble)
+
+ def test_einsum_sums_cfloat64(self):
+ self.check_einsum_sums('c8')
+ self.check_einsum_sums('c8', True)
+
+ def test_einsum_sums_cfloat128(self):
+ self.check_einsum_sums('c16')
+
+ def test_einsum_sums_clongdouble(self):
+ self.check_einsum_sums(np.clongdouble)
+
+ def test_einsum_misc(self):
+ # This call used to crash because of a bug in
+ # PyArray_AssignZero
+ a = np.ones((1, 2))
+ b = np.ones((2, 2, 1))
+ assert_equal(np.einsum('ij...,j...->i...', a, b), [[[2], [2]]])
+ assert_equal(np.einsum('ij...,j...->i...', a, b, optimize=True), [[[2], [2]]])
+
+ # Regression test for issue #10369 (test unicode inputs with Python 2)
+ assert_equal(np.einsum(u'ij...,j...->i...', a, b), [[[2], [2]]])
+ assert_equal(np.einsum('...i,...i', [1, 2, 3], [2, 3, 4]), 20)
+ assert_equal(np.einsum(u'...i,...i', [1, 2, 3], [2, 3, 4]), 20)
+ assert_equal(np.einsum('...i,...i', [1, 2, 3], [2, 3, 4],
+ optimize=u'greedy'), 20)
+
+ # The iterator had an issue with buffering this reduction
+ a = np.ones((5, 12, 4, 2, 3), np.int64)
+ b = np.ones((5, 12, 11), np.int64)
+ assert_equal(np.einsum('ijklm,ijn,ijn->', a, b, b),
+ np.einsum('ijklm,ijn->', a, b))
+ assert_equal(np.einsum('ijklm,ijn,ijn->', a, b, b, optimize=True),
+ np.einsum('ijklm,ijn->', a, b, optimize=True))
+
+ # Issue #2027, was a problem in the contiguous 3-argument
+ # inner loop implementation
+ a = np.arange(1, 3)
+ b = np.arange(1, 5).reshape(2, 2)
+ c = np.arange(1, 9).reshape(4, 2)
+ assert_equal(np.einsum('x,yx,zx->xzy', a, b, c),
+ [[[1, 3], [3, 9], [5, 15], [7, 21]],
+ [[8, 16], [16, 32], [24, 48], [32, 64]]])
+ assert_equal(np.einsum('x,yx,zx->xzy', a, b, c, optimize=True),
+ [[[1, 3], [3, 9], [5, 15], [7, 21]],
+ [[8, 16], [16, 32], [24, 48], [32, 64]]])
+
+ def test_subscript_range(self):
+ # Issue #7741: make sure that all letters of the Latin alphabet (both uppercase & lowercase) can be used
+ # when creating a subscript from arrays
+ a = np.ones((2, 3))
+ b = np.ones((3, 4))
+ np.einsum(a, [0, 20], b, [20, 2], [0, 2], optimize=False)
+ np.einsum(a, [0, 27], b, [27, 2], [0, 2], optimize=False)
+ np.einsum(a, [0, 51], b, [51, 2], [0, 2], optimize=False)
+ assert_raises(ValueError, lambda: np.einsum(a, [0, 52], b, [52, 2], [0, 2], optimize=False))
+ assert_raises(ValueError, lambda: np.einsum(a, [-1, 5], b, [5, 2], [-1, 2], optimize=False))
+
+ def test_einsum_broadcast(self):
+ # Issue #2455 change in handling ellipsis
+ # remove the 'middle broadcast' error
+ # only use the 'RIGHT' iteration in prepare_op_axes
+ # adds auto broadcast on left where it belongs
+ # broadcast on right has to be explicit
+ # We need to test the optimized parsing as well
+
+ A = np.arange(2 * 3 * 4).reshape(2, 3, 4)
+ B = np.arange(3)
+ ref = np.einsum('ijk,j->ijk', A, B, optimize=False)
+ for opt in [True, False]:
+ assert_equal(np.einsum('ij...,j...->ij...', A, B, optimize=opt), ref)
+ assert_equal(np.einsum('ij...,...j->ij...', A, B, optimize=opt), ref)
+ assert_equal(np.einsum('ij...,j->ij...', A, B, optimize=opt), ref) # used to raise error
+
+ A = np.arange(12).reshape((4, 3))
+ B = np.arange(6).reshape((3, 2))
+ ref = np.einsum('ik,kj->ij', A, B, optimize=False)
+ for opt in [True, False]:
+ assert_equal(np.einsum('ik...,k...->i...', A, B, optimize=opt), ref)
+ assert_equal(np.einsum('ik...,...kj->i...j', A, B, optimize=opt), ref)
+ assert_equal(np.einsum('...k,kj', A, B, optimize=opt), ref) # used to raise error
+ assert_equal(np.einsum('ik,k...->i...', A, B, optimize=opt), ref) # used to raise error
+
+ dims = [2, 3, 4, 5]
+ a = np.arange(np.prod(dims)).reshape(dims)
+ v = np.arange(dims[2])
+ ref = np.einsum('ijkl,k->ijl', a, v, optimize=False)
+ for opt in [True, False]:
+ assert_equal(np.einsum('ijkl,k', a, v, optimize=opt), ref)
+ assert_equal(np.einsum('...kl,k', a, v, optimize=opt), ref) # used to raise error
+ assert_equal(np.einsum('...kl,k...', a, v, optimize=opt), ref)
+
+ J, K, M = 160, 160, 120
+ A = np.arange(J * K * M).reshape(1, 1, 1, J, K, M)
+ B = np.arange(J * K * M * 3).reshape(J, K, M, 3)
+ ref = np.einsum('...lmn,...lmno->...o', A, B, optimize=False)
+ for opt in [True, False]:
+ assert_equal(np.einsum('...lmn,lmno->...o', A, B,
+ optimize=opt), ref) # used to raise error
+
+ def test_einsum_fixedstridebug(self):
+ # Issue #4485 obscure einsum bug
+ # This case revealed a bug in nditer where it reported a stride
+ # as 'fixed' (0) when it was in fact not fixed during processing
+ # (0 or 4). The reason for the bug was that the check for a fixed
+ # stride was using the information from the 2D inner loop reuse
+ # to restrict the iteration dimensions it had to validate to be
+ # the same, but that 2D inner loop reuse logic is only triggered
+ # during the buffer copying step, and hence it was invalid to
+ # rely on those values. The fix is to check all the dimensions
+ # of the stride in question, which in the test case reveals that
+ # the stride is not fixed.
+ #
+ # NOTE: This test is triggered by the fact that the default buffersize
+ # used by einsum is 8192, and 3*2731 = 8193 is larger than that,
+ # which results in a mismatch between the buffering and the
+ # striding for operand A.
+ A = np.arange(2 * 3).reshape(2, 3).astype(np.float32)
+ B = np.arange(2 * 3 * 2731).reshape(2, 3, 2731).astype(np.int16)
+ es = np.einsum('cl, cpx->lpx', A, B)
+ tp = np.tensordot(A, B, axes=(0, 0))
+ assert_equal(es, tp)
+ # The following is the original test case from the bug report,
+ # made repeatable by changing random arrays to aranges.
+ A = np.arange(3 * 3).reshape(3, 3).astype(np.float64)
+ B = np.arange(3 * 3 * 64 * 64).reshape(3, 3, 64, 64).astype(np.float32)
+ es = np.einsum('cl, cpxy->lpxy', A, B)
+ tp = np.tensordot(A, B, axes=(0, 0))
+ assert_equal(es, tp)
+
+ def test_einsum_fixed_collapsingbug(self):
+ # Issue #5147.
+ # The bug only occurred when the output argument of einsum was used.
+ x = np.random.normal(0, 1, (5, 5, 5, 5))
+ y1 = np.zeros((5, 5))
+ np.einsum('aabb->ab', x, out=y1)
+ idx = np.arange(5)
+ y2 = x[idx[:, None], idx[:, None], idx, idx]
+ assert_equal(y1, y2)
+
+ def test_einsum_failed_on_p9_and_s390x(self):
+ # Issues gh-14692 and gh-12689
+ # Bug with signed vs unsigned char errored on power9 and s390x Linux
+ tensor = np.random.random_sample((10, 10, 10, 10))
+ x = np.einsum('ijij->', tensor)
+ y = tensor.trace(axis1=0, axis2=2).trace()
+ assert_allclose(x, y)
+
+ def test_einsum_all_contig_non_contig_output(self):
+ # Issue gh-5907, tests that the all contiguous special case
+ # actually checks the contiguity of the output
+ x = np.ones((5, 5))
+ out = np.ones(10)[::2]
+ correct_base = np.ones(10)
+ correct_base[::2] = 5
+ # Always worked (inner iteration is done with 0-stride):
+ np.einsum('mi,mi,mi->m', x, x, x, out=out)
+ assert_array_equal(out.base, correct_base)
+ # Example 1:
+ out = np.ones(10)[::2]
+ np.einsum('im,im,im->m', x, x, x, out=out)
+ assert_array_equal(out.base, correct_base)
+ # Example 2: buffering causes x to be contiguous, but the
+ # special cases do not catch the operation beforehand:
+ out = np.ones((2, 2, 2))[..., 0]
+ correct_base = np.ones((2, 2, 2))
+ correct_base[..., 0] = 2
+ x = np.ones((2, 2), np.float32)
+ np.einsum('ij,jk->ik', x, x, out=out)
+ assert_array_equal(out.base, correct_base)
+
+ def test_small_boolean_arrays(self):
+ # See gh-5946.
+ # Use array of True embedded in False.
+ a = np.zeros((16, 1, 1), dtype=np.bool_)[:2]
+ a[...] = True
+ out = np.zeros((16, 1, 1), dtype=np.bool_)[:2]
+ tgt = np.ones((2, 1, 1), dtype=np.bool_)
+ res = np.einsum('...ij,...jk->...ik', a, a, out=out)
+ assert_equal(res, tgt)
+
+ def test_out_is_res(self):
+ a = np.arange(9).reshape(3, 3)
+ res = np.einsum('...ij,...jk->...ik', a, a, out=a)
+ assert res is a
+
+ def optimize_compare(self, subscripts, operands=None):
+ # Tests all paths of the optimization function against
+ # conventional einsum
+ if operands is None:
+ args = [subscripts]
+ terms = subscripts.split('->')[0].split(',')
+ for term in terms:
+ dims = [global_size_dict[x] for x in term]
+ args.append(np.random.rand(*dims))
+ else:
+ args = [subscripts] + operands
+
+ noopt = np.einsum(*args, optimize=False)
+ opt = np.einsum(*args, optimize='greedy')
+ assert_almost_equal(opt, noopt)
+ opt = np.einsum(*args, optimize='optimal')
+ assert_almost_equal(opt, noopt)
+
+ def test_hadamard_like_products(self):
+ # Hadamard outer products
+ self.optimize_compare('a,ab,abc->abc')
+ self.optimize_compare('a,b,ab->ab')
+
+ def test_index_transformations(self):
+ # Simple index transformation cases
+ self.optimize_compare('ea,fb,gc,hd,abcd->efgh')
+ self.optimize_compare('ea,fb,abcd,gc,hd->efgh')
+ self.optimize_compare('abcd,ea,fb,gc,hd->efgh')
+
+ def test_complex(self):
+ # Long test cases
+ self.optimize_compare('acdf,jbje,gihb,hfac,gfac,gifabc,hfac')
+ self.optimize_compare('acdf,jbje,gihb,hfac,gfac,gifabc,hfac')
+ self.optimize_compare('cd,bdhe,aidb,hgca,gc,hgibcd,hgac')
+ self.optimize_compare('abhe,hidj,jgba,hiab,gab')
+ self.optimize_compare('bde,cdh,agdb,hica,ibd,hgicd,hiac')
+ self.optimize_compare('chd,bde,agbc,hiad,hgc,hgi,hiad')
+ self.optimize_compare('chd,bde,agbc,hiad,bdi,cgh,agdb')
+ self.optimize_compare('bdhe,acad,hiab,agac,hibd')
+
+ def test_collapse(self):
+ # Inner products
+ self.optimize_compare('ab,ab,c->')
+ self.optimize_compare('ab,ab,c->c')
+ self.optimize_compare('ab,ab,cd,cd->')
+ self.optimize_compare('ab,ab,cd,cd->ac')
+ self.optimize_compare('ab,ab,cd,cd->cd')
+ self.optimize_compare('ab,ab,cd,cd,ef,ef->')
+
+ def test_expand(self):
+ # Outer products
+ self.optimize_compare('ab,cd,ef->abcdef')
+ self.optimize_compare('ab,cd,ef->acdf')
+ self.optimize_compare('ab,cd,de->abcde')
+ self.optimize_compare('ab,cd,de->be')
+ self.optimize_compare('ab,bcd,cd->abcd')
+ self.optimize_compare('ab,bcd,cd->abd')
+
+ def test_edge_cases(self):
+ # Difficult edge cases for optimization
+ self.optimize_compare('eb,cb,fb->cef')
+ self.optimize_compare('dd,fb,be,cdb->cef')
+ self.optimize_compare('bca,cdb,dbf,afc->')
+ self.optimize_compare('dcc,fce,ea,dbf->ab')
+ self.optimize_compare('fdf,cdd,ccd,afe->ae')
+ self.optimize_compare('abcd,ad')
+ self.optimize_compare('ed,fcd,ff,bcf->be')
+ self.optimize_compare('baa,dcf,af,cde->be')
+ self.optimize_compare('bd,db,eac->ace')
+ self.optimize_compare('fff,fae,bef,def->abd')
+ self.optimize_compare('efc,dbc,acf,fd->abe')
+ self.optimize_compare('ba,ac,da->bcd')
+
+ def test_inner_product(self):
+ # Inner products
+ self.optimize_compare('ab,ab')
+ self.optimize_compare('ab,ba')
+ self.optimize_compare('abc,abc')
+ self.optimize_compare('abc,bac')
+ self.optimize_compare('abc,cba')
+
+ def test_random_cases(self):
+ # Randomly built test cases
+ self.optimize_compare('aab,fa,df,ecc->bde')
+ self.optimize_compare('ecb,fef,bad,ed->ac')
+ self.optimize_compare('bcf,bbb,fbf,fc->')
+ self.optimize_compare('bb,ff,be->e')
+ self.optimize_compare('bcb,bb,fc,fff->')
+ self.optimize_compare('fbb,dfd,fc,fc->')
+ self.optimize_compare('afd,ba,cc,dc->bf')
+ self.optimize_compare('adb,bc,fa,cfc->d')
+ self.optimize_compare('bbd,bda,fc,db->acf')
+ self.optimize_compare('dba,ead,cad->bce')
+ self.optimize_compare('aef,fbc,dca->bde')
+
+ def test_combined_views_mapping(self):
+ # gh-10792
+ a = np.arange(9).reshape(1, 1, 3, 1, 3)
+ b = np.einsum('bbcdc->d', a)
+ assert_equal(b, [12])
+
+ def test_broadcasting_dot_cases(self):
+ # Ensures broadcasting cases are not mistaken for GEMM
+
+ a = np.random.rand(1, 5, 4)
+ b = np.random.rand(4, 6)
+ c = np.random.rand(5, 6)
+ d = np.random.rand(10)
+
+ self.optimize_compare('ijk,kl,jl', operands=[a, b, c])
+ self.optimize_compare('ijk,kl,jl,i->i', operands=[a, b, c, d])
+
+ e = np.random.rand(1, 1, 5, 4)
+ f = np.random.rand(7, 7)
+ self.optimize_compare('abjk,kl,jl', operands=[e, b, c])
+ self.optimize_compare('abjk,kl,jl,ab->ab', operands=[e, b, c, f])
+
+ # Edge case found in gh-11308
+ g = np.arange(64).reshape(2, 4, 8)
+ self.optimize_compare('obk,ijk->ioj', operands=[g, g])
+
+
+class TestEinsumPath(object):
+ def build_operands(self, string, size_dict=global_size_dict):
+
+ # Builds random operands based on the subscript string
+ operands = [string]
+ terms = string.split('->')[0].split(',')
+ for term in terms:
+ dims = [size_dict[x] for x in term]
+ operands.append(np.random.rand(*dims))
+
+ return operands
+
+ def assert_path_equal(self, comp, benchmark):
+ # Checks that two lists of contraction tuples are equivalent
+ ret = (len(comp) == len(benchmark))
+ assert_(ret)
+ for pos in range(len(comp) - 1):
+ ret &= isinstance(comp[pos + 1], tuple)
+ ret &= (comp[pos + 1] == benchmark[pos + 1])
+ assert_(ret)
+
+ def test_memory_constraints(self):
+ # Ensure memory constraints are satisfied
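+ # A memory limit of 0 forbids intermediates, so the expected path
+ # performs the whole contraction in a single step.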
+
+ outer_test = self.build_operands('a,b,c->abc')
+
+ path, path_str = np.einsum_path(*outer_test, optimize=('greedy', 0))
+ self.assert_path_equal(path, ['einsum_path', (0, 1, 2)])
+
+ path, path_str = np.einsum_path(*outer_test, optimize=('optimal', 0))
+ self.assert_path_equal(path, ['einsum_path', (0, 1, 2)])
+
+ long_test = self.build_operands('acdf,jbje,gihb,hfac')
+ path, path_str = np.einsum_path(*long_test, optimize=('greedy', 0))
+ self.assert_path_equal(path, ['einsum_path', (0, 1, 2, 3)])
+
+ path, path_str = np.einsum_path(*long_test, optimize=('optimal', 0))
+ self.assert_path_equal(path, ['einsum_path', (0, 1, 2, 3)])
+
+ def test_long_paths(self):
+ # Long complex cases
+
+ # Long test 1
+ long_test1 = self.build_operands('acdf,jbje,gihb,hfac,gfac,gifabc,hfac')
+ path, path_str = np.einsum_path(*long_test1, optimize='greedy')
+ self.assert_path_equal(path, ['einsum_path',
+ (3, 6), (3, 4), (2, 4), (2, 3), (0, 2), (0, 1)])
+
+ path, path_str = np.einsum_path(*long_test1, optimize='optimal')
+ self.assert_path_equal(path, ['einsum_path',
+ (3, 6), (3, 4), (2, 4), (2, 3), (0, 2), (0, 1)])
+
+ # Long test 2
+ long_test2 = self.build_operands('chd,bde,agbc,hiad,bdi,cgh,agdb')
+ path, path_str = np.einsum_path(*long_test2, optimize='greedy')
+ print(path)
+ self.assert_path_equal(path, ['einsum_path',
+ (3, 4), (0, 3), (3, 4), (1, 3), (1, 2), (0, 1)])
+
+ path, path_str = np.einsum_path(*long_test2, optimize='optimal')
+ print(path)
+ self.assert_path_equal(path, ['einsum_path',
+ (0, 5), (1, 4), (3, 4), (1, 3), (1, 2), (0, 1)])
+
+ def test_edge_paths(self):
+ # Difficult edge cases
+
+ # Edge test1
+ edge_test1 = self.build_operands('eb,cb,fb->cef')
+ path, path_str = np.einsum_path(*edge_test1, optimize='greedy')
+ self.assert_path_equal(path, ['einsum_path', (0, 2), (0, 1)])
+
+ path, path_str = np.einsum_path(*edge_test1, optimize='optimal')
+ self.assert_path_equal(path, ['einsum_path', (0, 2), (0, 1)])
+
+ # Edge test2
+ edge_test2 = self.build_operands('dd,fb,be,cdb->cef')
+ path, path_str = np.einsum_path(*edge_test2, optimize='greedy')
+ self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 1), (0, 1)])
+
+ path, path_str = np.einsum_path(*edge_test2, optimize='optimal')
+ self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 1), (0, 1)])
+
+ # Edge test3
+ edge_test3 = self.build_operands('bca,cdb,dbf,afc->')
+ path, path_str = np.einsum_path(*edge_test3, optimize='greedy')
+ self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 2), (0, 1)])
+
+ path, path_str = np.einsum_path(*edge_test3, optimize='optimal')
+ self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 2), (0, 1)])
+
+ # Edge test4
+ edge_test4 = self.build_operands('dcc,fce,ea,dbf->ab')
+ path, path_str = np.einsum_path(*edge_test4, optimize='greedy')
+ self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)])
+
+ path, path_str = np.einsum_path(*edge_test4, optimize='optimal')
+ self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 2), (0, 1)])
+
+ # Edge test5
+ edge_test4 = self.build_operands('a,ac,ab,ad,cd,bd,bc->',
+ size_dict={"a": 20, "b": 20, "c": 20, "d": 20})
+ path, path_str = np.einsum_path(*edge_test4, optimize='greedy')
+ self.assert_path_equal(path, ['einsum_path', (0, 1), (0, 1, 2, 3, 4, 5)])
+
+ path, path_str = np.einsum_path(*edge_test4, optimize='optimal')
+ self.assert_path_equal(path, ['einsum_path', (0, 1), (0, 1, 2, 3, 4, 5)])
+
+ def test_path_type_input(self):
+ # Test explicit path handling
+ path_test = self.build_operands('dcc,fce,ea,dbf->ab')
+
+ path, path_str = np.einsum_path(*path_test, optimize=False)
+ self.assert_path_equal(path, ['einsum_path', (0, 1, 2, 3)])
+
+ path, path_str = np.einsum_path(*path_test, optimize=True)
+ self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)])
+
+ exp_path = ['einsum_path', (0, 2), (0, 2), (0, 1)]
+ path, path_str = np.einsum_path(*path_test, optimize=exp_path)
+ self.assert_path_equal(path, exp_path)
+
+ # Double check einsum works on the input path
+ noopt = np.einsum(*path_test, optimize=False)
+ opt = np.einsum(*path_test, optimize=exp_path)
+ assert_almost_equal(noopt, opt)
+
+ def test_spaces(self):
+ # gh-10794
+ arr = np.array([[1]])
+ for sp in itertools.product(['', ' '], repeat=4):
+ # no error for any spacing
+ np.einsum('{}...a{}->{}...a{}'.format(*sp), arr)
+
+def test_overlap():
+ a = np.arange(9, dtype=int).reshape(3, 3)
+ b = np.arange(9, dtype=int).reshape(3, 3)
+ d = np.dot(a, b)
+ # sanity check
+ c = np.einsum('ij,jk->ik', a, b)
+ assert_equal(c, d)
+ # gh-10080, out overlaps one of the operands
+ c = np.einsum('ij,jk->ik', a, b, out=b)
+ assert_equal(c, d)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_errstate.py b/contrib/python/numpy/py2/numpy/core/tests/test_errstate.py
new file mode 100644
index 00000000000..670d485c1d7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_errstate.py
@@ -0,0 +1,41 @@
+from __future__ import division, absolute_import, print_function
+
+import platform
+import pytest
+
+import numpy as np
+from numpy.testing import assert_, assert_raises
+
+
+class TestErrstate(object):
+ @pytest.mark.skipif(platform.machine() == "armv5tel", reason="See gh-413.")
+ def test_invalid(self):
+ with np.errstate(all='raise', under='ignore'):
+ a = -np.arange(3)
+ # This should work
+ with np.errstate(invalid='ignore'):
+ np.sqrt(a)
+ # While this should fail!
+ with assert_raises(FloatingPointError):
+ np.sqrt(a)
+
+ def test_divide(self):
+ with np.errstate(all='raise', under='ignore'):
+ a = -np.arange(3)
+ # This should work
+ with np.errstate(divide='ignore'):
+ a // 0
+ # While this should fail!
+ with assert_raises(FloatingPointError):
+ a // 0
+
+ def test_errcall(self):
+ def foo(*args):
+ print(args)
+
+ olderrcall = np.geterrcall()
+ with np.errstate(call=foo):
+ assert_(np.geterrcall() is foo, 'call is not foo')
+ with np.errstate(call=None):
+ assert_(np.geterrcall() is None, 'call is not None')
+ assert_(np.geterrcall() is olderrcall, 'call is not olderrcall')
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_extint128.py b/contrib/python/numpy/py2/numpy/core/tests/test_extint128.py
new file mode 100644
index 00000000000..7c454a603b6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_extint128.py
@@ -0,0 +1,221 @@
+from __future__ import division, absolute_import, print_function
+
+import itertools
+import contextlib
+import operator
+import pytest
+
+import numpy as np
+import numpy.core._multiarray_tests as mt
+
+from numpy.testing import assert_raises, assert_equal
+
+
+INT64_MAX = np.iinfo(np.int64).max
+INT64_MIN = np.iinfo(np.int64).min
+INT64_MID = 2**32
+
+# int128 is not two's complement; the sign bit is stored separately
+INT128_MAX = 2**128 - 1
+INT128_MIN = -INT128_MAX
+INT128_MID = 2**64
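+# For illustration: with a separate sign bit the representable range is
+# symmetric, so both +INT128_MAX and -INT128_MAX can be represented and there
+# is no extra most-negative value as in a two's-complement encoding.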
+
+INT64_VALUES = (
+ [INT64_MIN + j for j in range(20)] +
+ [INT64_MAX - j for j in range(20)] +
+ [INT64_MID + j for j in range(-20, 20)] +
+ [2*INT64_MID + j for j in range(-20, 20)] +
+ [INT64_MID//2 + j for j in range(-20, 20)] +
+ list(range(-70, 70))
+)
+
+INT128_VALUES = (
+ [INT128_MIN + j for j in range(20)] +
+ [INT128_MAX - j for j in range(20)] +
+ [INT128_MID + j for j in range(-20, 20)] +
+ [2*INT128_MID + j for j in range(-20, 20)] +
+ [INT128_MID//2 + j for j in range(-20, 20)] +
+ list(range(-70, 70)) +
+ [False] # negative zero
+)
+
+INT64_POS_VALUES = [x for x in INT64_VALUES if x > 0]
+
+
[email protected]
+def exc_iter(*args):
+ """
+ Iterate over Cartesian product of *args, and if an exception is raised,
+ add information of the current iterate.
+ """
+
+ value = [None]
+
+ def iterate():
+ for v in itertools.product(*args):
+ value[0] = v
+ yield v
+
+ try:
+ yield iterate()
+ except Exception:
+ import traceback
+        msg = "At: %r\n%s" % (value[0],
+                              traceback.format_exc())
+ raise AssertionError(msg)
+
+
+def test_safe_binop():
+ # Test checked arithmetic routines
+
+ ops = [
+ (operator.add, 1),
+ (operator.sub, 2),
+ (operator.mul, 3)
+ ]
+
+ with exc_iter(ops, INT64_VALUES, INT64_VALUES) as it:
+ for xop, a, b in it:
+ pyop, op = xop
+ c = pyop(a, b)
+
+ if not (INT64_MIN <= c <= INT64_MAX):
+ assert_raises(OverflowError, mt.extint_safe_binop, a, b, op)
+ else:
+ d = mt.extint_safe_binop(a, b, op)
+ if c != d:
+ # assert_equal is slow
+ assert_equal(d, c)
+
+
+def test_to_128():
+ with exc_iter(INT64_VALUES) as it:
+ for a, in it:
+ b = mt.extint_to_128(a)
+ if a != b:
+ assert_equal(b, a)
+
+
+def test_to_64():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ if not (INT64_MIN <= a <= INT64_MAX):
+ assert_raises(OverflowError, mt.extint_to_64, a)
+ else:
+ b = mt.extint_to_64(a)
+ if a != b:
+ assert_equal(b, a)
+
+
+def test_mul_64_64():
+ with exc_iter(INT64_VALUES, INT64_VALUES) as it:
+ for a, b in it:
+ c = a * b
+ d = mt.extint_mul_64_64(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_add_128():
+ with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+ for a, b in it:
+ c = a + b
+ if not (INT128_MIN <= c <= INT128_MAX):
+ assert_raises(OverflowError, mt.extint_add_128, a, b)
+ else:
+ d = mt.extint_add_128(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_sub_128():
+ with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+ for a, b in it:
+ c = a - b
+ if not (INT128_MIN <= c <= INT128_MAX):
+ assert_raises(OverflowError, mt.extint_sub_128, a, b)
+ else:
+ d = mt.extint_sub_128(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_neg_128():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ b = -a
+ c = mt.extint_neg_128(a)
+ if b != c:
+ assert_equal(c, b)
+
+
+def test_shl_128():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ if a < 0:
+ b = -(((-a) << 1) & (2**128-1))
+ else:
+ b = (a << 1) & (2**128-1)
+ c = mt.extint_shl_128(a)
+ if b != c:
+ assert_equal(c, b)
+
+
+def test_shr_128():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ if a < 0:
+ b = -((-a) >> 1)
+ else:
+ b = a >> 1
+ c = mt.extint_shr_128(a)
+ if b != c:
+ assert_equal(c, b)
+
+
+def test_gt_128():
+ with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+ for a, b in it:
+ c = a > b
+ d = mt.extint_gt_128(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_divmod_128_64():
+ with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+ for a, b in it:
+ if a >= 0:
+ c, cr = divmod(a, b)
+ else:
+ c, cr = divmod(-a, b)
+ c = -c
+ cr = -cr
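+            # Example of the sign handling above: Python's divmod(-7, 2) is
+            # (-4, 1) (floor division), whereas the expected quotient and
+            # remainder here are formed by truncating toward zero, i.e. (-3, -1).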
+
+ d, dr = mt.extint_divmod_128_64(a, b)
+
+ if c != d or d != dr or b*d + dr != a:
+ assert_equal(d, c)
+ assert_equal(dr, cr)
+ assert_equal(b*d + dr, a)
+
+
+def test_floordiv_128_64():
+ with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+ for a, b in it:
+ c = a // b
+ d = mt.extint_floordiv_128_64(a, b)
+
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_ceildiv_128_64():
+ with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+ for a, b in it:
+ c = (a + b - 1) // b
+ d = mt.extint_ceildiv_128_64(a, b)
+
+ if c != d:
+ assert_equal(d, c)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_function_base.py b/contrib/python/numpy/py2/numpy/core/tests/test_function_base.py
new file mode 100644
index 00000000000..8b820bd75c3
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_function_base.py
@@ -0,0 +1,370 @@
+from __future__ import division, absolute_import, print_function
+
+from numpy import (
+ logspace, linspace, geomspace, dtype, array, sctypes, arange, isnan,
+ ndarray, sqrt, nextafter, stack
+ )
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_array_equal, assert_allclose,
+ suppress_warnings
+ )
+
+
+class PhysicalQuantity(float):
+ def __new__(cls, value):
+ return float.__new__(cls, value)
+
+ def __add__(self, x):
+ assert_(isinstance(x, PhysicalQuantity))
+ return PhysicalQuantity(float(x) + float(self))
+ __radd__ = __add__
+
+ def __sub__(self, x):
+ assert_(isinstance(x, PhysicalQuantity))
+ return PhysicalQuantity(float(self) - float(x))
+
+ def __rsub__(self, x):
+ assert_(isinstance(x, PhysicalQuantity))
+ return PhysicalQuantity(float(x) - float(self))
+
+ def __mul__(self, x):
+ return PhysicalQuantity(float(x) * float(self))
+ __rmul__ = __mul__
+
+ def __div__(self, x):
+ return PhysicalQuantity(float(self) / float(x))
+
+ def __rdiv__(self, x):
+ return PhysicalQuantity(float(x) / float(self))
+
+
+class PhysicalQuantity2(ndarray):
+ __array_priority__ = 10
+
+
+class TestLogspace(object):
+
+ def test_basic(self):
+ y = logspace(0, 6)
+ assert_(len(y) == 50)
+ y = logspace(0, 6, num=100)
+ assert_(y[-1] == 10 ** 6)
+ y = logspace(0, 6, endpoint=0)
+ assert_(y[-1] < 10 ** 6)
+ y = logspace(0, 6, num=7)
+ assert_array_equal(y, [1, 10, 100, 1e3, 1e4, 1e5, 1e6])
+
+ def test_start_stop_array(self):
+ start = array([0., 1.])
+ stop = array([6., 7.])
+ t1 = logspace(start, stop, 6)
+ t2 = stack([logspace(_start, _stop, 6)
+ for _start, _stop in zip(start, stop)], axis=1)
+ assert_equal(t1, t2)
+ t3 = logspace(start, stop[0], 6)
+ t4 = stack([logspace(_start, stop[0], 6)
+ for _start in start], axis=1)
+ assert_equal(t3, t4)
+ t5 = logspace(start, stop, 6, axis=-1)
+ assert_equal(t5, t2.T)
+
+ def test_dtype(self):
+ y = logspace(0, 6, dtype='float32')
+ assert_equal(y.dtype, dtype('float32'))
+ y = logspace(0, 6, dtype='float64')
+ assert_equal(y.dtype, dtype('float64'))
+ y = logspace(0, 6, dtype='int32')
+ assert_equal(y.dtype, dtype('int32'))
+
+ def test_physical_quantities(self):
+ a = PhysicalQuantity(1.0)
+ b = PhysicalQuantity(5.0)
+ assert_equal(logspace(a, b), logspace(1.0, 5.0))
+
+ def test_subclass(self):
+ a = array(1).view(PhysicalQuantity2)
+ b = array(7).view(PhysicalQuantity2)
+ ls = logspace(a, b)
+ assert type(ls) is PhysicalQuantity2
+ assert_equal(ls, logspace(1.0, 7.0))
+ ls = logspace(a, b, 1)
+ assert type(ls) is PhysicalQuantity2
+ assert_equal(ls, logspace(1.0, 7.0, 1))
+
+
+class TestGeomspace(object):
+
+ def test_basic(self):
+ y = geomspace(1, 1e6)
+ assert_(len(y) == 50)
+ y = geomspace(1, 1e6, num=100)
+ assert_(y[-1] == 10 ** 6)
+ y = geomspace(1, 1e6, endpoint=False)
+ assert_(y[-1] < 10 ** 6)
+ y = geomspace(1, 1e6, num=7)
+ assert_array_equal(y, [1, 10, 100, 1e3, 1e4, 1e5, 1e6])
+
+ y = geomspace(8, 2, num=3)
+ assert_allclose(y, [8, 4, 2])
+ assert_array_equal(y.imag, 0)
+
+ y = geomspace(-1, -100, num=3)
+ assert_array_equal(y, [-1, -10, -100])
+ assert_array_equal(y.imag, 0)
+
+ y = geomspace(-100, -1, num=3)
+ assert_array_equal(y, [-100, -10, -1])
+ assert_array_equal(y.imag, 0)
+
+ def test_complex(self):
+ # Purely imaginary
+ y = geomspace(1j, 16j, num=5)
+ assert_allclose(y, [1j, 2j, 4j, 8j, 16j])
+ assert_array_equal(y.real, 0)
+
+ y = geomspace(-4j, -324j, num=5)
+ assert_allclose(y, [-4j, -12j, -36j, -108j, -324j])
+ assert_array_equal(y.real, 0)
+
+ y = geomspace(1+1j, 1000+1000j, num=4)
+ assert_allclose(y, [1+1j, 10+10j, 100+100j, 1000+1000j])
+
+ y = geomspace(-1+1j, -1000+1000j, num=4)
+ assert_allclose(y, [-1+1j, -10+10j, -100+100j, -1000+1000j])
+
+ # Logarithmic spirals
+ y = geomspace(-1, 1, num=3, dtype=complex)
+ assert_allclose(y, [-1, 1j, +1])
+
+ y = geomspace(0+3j, -3+0j, 3)
+ assert_allclose(y, [0+3j, -3/sqrt(2)+3j/sqrt(2), -3+0j])
+ y = geomspace(0+3j, 3+0j, 3)
+ assert_allclose(y, [0+3j, 3/sqrt(2)+3j/sqrt(2), 3+0j])
+ y = geomspace(-3+0j, 0-3j, 3)
+ assert_allclose(y, [-3+0j, -3/sqrt(2)-3j/sqrt(2), 0-3j])
+ y = geomspace(0+3j, -3+0j, 3)
+ assert_allclose(y, [0+3j, -3/sqrt(2)+3j/sqrt(2), -3+0j])
+ y = geomspace(-2-3j, 5+7j, 7)
+ assert_allclose(y, [-2-3j, -0.29058977-4.15771027j,
+ 2.08885354-4.34146838j, 4.58345529-3.16355218j,
+ 6.41401745-0.55233457j, 6.75707386+3.11795092j,
+ 5+7j])
+
+ # Type promotion should prevent the -5 from becoming a NaN
+ y = geomspace(3j, -5, 2)
+ assert_allclose(y, [3j, -5])
+ y = geomspace(-5, 3j, 2)
+ assert_allclose(y, [-5, 3j])
+
+ def test_dtype(self):
+ y = geomspace(1, 1e6, dtype='float32')
+ assert_equal(y.dtype, dtype('float32'))
+ y = geomspace(1, 1e6, dtype='float64')
+ assert_equal(y.dtype, dtype('float64'))
+ y = geomspace(1, 1e6, dtype='int32')
+ assert_equal(y.dtype, dtype('int32'))
+
+ # Native types
+ y = geomspace(1, 1e6, dtype=float)
+ assert_equal(y.dtype, dtype('float_'))
+ y = geomspace(1, 1e6, dtype=complex)
+ assert_equal(y.dtype, dtype('complex'))
+
+ def test_start_stop_array_scalar(self):
+ lim1 = array([120, 100], dtype="int8")
+ lim2 = array([-120, -100], dtype="int8")
+ lim3 = array([1200, 1000], dtype="uint16")
+ t1 = geomspace(lim1[0], lim1[1], 5)
+ t2 = geomspace(lim2[0], lim2[1], 5)
+ t3 = geomspace(lim3[0], lim3[1], 5)
+ t4 = geomspace(120.0, 100.0, 5)
+ t5 = geomspace(-120.0, -100.0, 5)
+ t6 = geomspace(1200.0, 1000.0, 5)
+
+ # t3 uses float32, t6 uses float64
+ assert_allclose(t1, t4, rtol=1e-2)
+ assert_allclose(t2, t5, rtol=1e-2)
+ assert_allclose(t3, t6, rtol=1e-5)
+
+ def test_start_stop_array(self):
+ # Try to use all special cases.
+ start = array([1.e0, 32., 1j, -4j, 1+1j, -1])
+ stop = array([1.e4, 2., 16j, -324j, 10000+10000j, 1])
+ t1 = geomspace(start, stop, 5)
+ t2 = stack([geomspace(_start, _stop, 5)
+ for _start, _stop in zip(start, stop)], axis=1)
+ assert_equal(t1, t2)
+ t3 = geomspace(start, stop[0], 5)
+ t4 = stack([geomspace(_start, stop[0], 5)
+ for _start in start], axis=1)
+ assert_equal(t3, t4)
+ t5 = geomspace(start, stop, 5, axis=-1)
+ assert_equal(t5, t2.T)
+
+ def test_physical_quantities(self):
+ a = PhysicalQuantity(1.0)
+ b = PhysicalQuantity(5.0)
+ assert_equal(geomspace(a, b), geomspace(1.0, 5.0))
+
+ def test_subclass(self):
+ a = array(1).view(PhysicalQuantity2)
+ b = array(7).view(PhysicalQuantity2)
+ gs = geomspace(a, b)
+ assert type(gs) is PhysicalQuantity2
+ assert_equal(gs, geomspace(1.0, 7.0))
+ gs = geomspace(a, b, 1)
+ assert type(gs) is PhysicalQuantity2
+ assert_equal(gs, geomspace(1.0, 7.0, 1))
+
+ def test_bounds(self):
+ assert_raises(ValueError, geomspace, 0, 10)
+ assert_raises(ValueError, geomspace, 10, 0)
+ assert_raises(ValueError, geomspace, 0, 0)
+
+
+class TestLinspace(object):
+
+ def test_basic(self):
+ y = linspace(0, 10)
+ assert_(len(y) == 50)
+ y = linspace(2, 10, num=100)
+ assert_(y[-1] == 10)
+ y = linspace(2, 10, endpoint=0)
+ assert_(y[-1] < 10)
+ assert_raises(ValueError, linspace, 0, 10, num=-1)
+
+ def test_corner(self):
+ y = list(linspace(0, 1, 1))
+ assert_(y == [0.0], y)
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, ".*safely interpreted as an integer")
+ y = list(linspace(0, 1, 2.5))
+ assert_(y == [0.0, 1.0])
+
+ def test_type(self):
+ t1 = linspace(0, 1, 0).dtype
+ t2 = linspace(0, 1, 1).dtype
+ t3 = linspace(0, 1, 2).dtype
+ assert_equal(t1, t2)
+ assert_equal(t2, t3)
+
+ def test_dtype(self):
+ y = linspace(0, 6, dtype='float32')
+ assert_equal(y.dtype, dtype('float32'))
+ y = linspace(0, 6, dtype='float64')
+ assert_equal(y.dtype, dtype('float64'))
+ y = linspace(0, 6, dtype='int32')
+ assert_equal(y.dtype, dtype('int32'))
+
+ def test_start_stop_array_scalar(self):
+ lim1 = array([-120, 100], dtype="int8")
+ lim2 = array([120, -100], dtype="int8")
+ lim3 = array([1200, 1000], dtype="uint16")
+ t1 = linspace(lim1[0], lim1[1], 5)
+ t2 = linspace(lim2[0], lim2[1], 5)
+ t3 = linspace(lim3[0], lim3[1], 5)
+ t4 = linspace(-120.0, 100.0, 5)
+ t5 = linspace(120.0, -100.0, 5)
+ t6 = linspace(1200.0, 1000.0, 5)
+ assert_equal(t1, t4)
+ assert_equal(t2, t5)
+ assert_equal(t3, t6)
+
+ def test_start_stop_array(self):
+ start = array([-120, 120], dtype="int8")
+ stop = array([100, -100], dtype="int8")
+ t1 = linspace(start, stop, 5)
+ t2 = stack([linspace(_start, _stop, 5)
+ for _start, _stop in zip(start, stop)], axis=1)
+ assert_equal(t1, t2)
+ t3 = linspace(start, stop[0], 5)
+ t4 = stack([linspace(_start, stop[0], 5)
+ for _start in start], axis=1)
+ assert_equal(t3, t4)
+ t5 = linspace(start, stop, 5, axis=-1)
+ assert_equal(t5, t2.T)
+
+ def test_complex(self):
+ lim1 = linspace(1 + 2j, 3 + 4j, 5)
+ t1 = array([1.0+2.j, 1.5+2.5j, 2.0+3j, 2.5+3.5j, 3.0+4j])
+ lim2 = linspace(1j, 10, 5)
+ t2 = array([0.0+1.j, 2.5+0.75j, 5.0+0.5j, 7.5+0.25j, 10.0+0j])
+ assert_equal(lim1, t1)
+ assert_equal(lim2, t2)
+
+ def test_physical_quantities(self):
+ a = PhysicalQuantity(0.0)
+ b = PhysicalQuantity(1.0)
+ assert_equal(linspace(a, b), linspace(0.0, 1.0))
+
+ def test_subclass(self):
+ a = array(0).view(PhysicalQuantity2)
+ b = array(1).view(PhysicalQuantity2)
+ ls = linspace(a, b)
+ assert type(ls) is PhysicalQuantity2
+ assert_equal(ls, linspace(0.0, 1.0))
+ ls = linspace(a, b, 1)
+ assert type(ls) is PhysicalQuantity2
+ assert_equal(ls, linspace(0.0, 1.0, 1))
+
+ def test_array_interface(self):
+ # Regression test for https://github.com/numpy/numpy/pull/6659
+ # Ensure that start/stop can be objects that implement
+ # __array_interface__ and are convertible to numeric scalars
+
+ class Arrayish(object):
+ """
+ A generic object that supports the __array_interface__ and hence
+ can in principle be converted to a numeric scalar, but is not
+ otherwise recognized as numeric, but also happens to support
+ multiplication by floats.
+
+ Data should be an object that implements the buffer interface,
+ and contains at least 4 bytes.
+ """
+
+ def __init__(self, data):
+ self._data = data
+
+ @property
+ def __array_interface__(self):
+ return {'shape': (), 'typestr': '<i4', 'data': self._data,
+ 'version': 3}
+
+ def __mul__(self, other):
+ # For the purposes of this test any multiplication is an
+ # identity operation :)
+ return self
+
+ one = Arrayish(array(1, dtype='<i4'))
+ five = Arrayish(array(5, dtype='<i4'))
+
+ assert_equal(linspace(one, five), linspace(1, 5))
+
+ def test_denormal_numbers(self):
+ # Regression test for gh-5437. Will probably fail when compiled
+ # with ICC, which flushes denormals to zero
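+        # For example, for float64 nextafter(0, 1) is the smallest positive
+        # subnormal, roughly 5e-324; if it were flushed to zero, every element
+        # of the linspace below would be zero and the assertion would fail.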
+ for ftype in sctypes['float']:
+ stop = nextafter(ftype(0), ftype(1)) * 5 # A denormal number
+ assert_(any(linspace(0, stop, 10, endpoint=False, dtype=ftype)))
+
+ def test_equivalent_to_arange(self):
+ for j in range(1000):
+ assert_equal(linspace(0, j, j+1, dtype=int),
+ arange(j+1, dtype=int))
+
+ def test_retstep(self):
+ y = linspace(0, 1, 2, retstep=True)
+ assert_(isinstance(y, tuple) and len(y) == 2)
+ for num in (0, 1):
+ for ept in (False, True):
+ y = linspace(0, 1, num, endpoint=ept, retstep=True)
+ assert_(isinstance(y, tuple) and len(y) == 2 and
+ len(y[0]) == num and isnan(y[1]),
+ 'num={0}, endpoint={1}'.format(num, ept))
+
+ def test_object(self):
+ start = array(1, dtype='O')
+ stop = array(2, dtype='O')
+ y = linspace(start, stop, 3)
+ assert_array_equal(y, array([1., 1.5, 2.]))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_getlimits.py b/contrib/python/numpy/py2/numpy/core/tests/test_getlimits.py
new file mode 100644
index 00000000000..2f664818365
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_getlimits.py
@@ -0,0 +1,123 @@
+""" Test functions for limits module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.core import finfo, iinfo
+from numpy import half, single, double, longdouble
+from numpy.testing import assert_equal, assert_, assert_raises
+from numpy.core.getlimits import _discovered_machar, _float_ma
+
+##################################################
+
+class TestPythonFloat(object):
+ def test_singleton(self):
+ ftype = finfo(float)
+ ftype2 = finfo(float)
+ assert_equal(id(ftype), id(ftype2))
+
+class TestHalf(object):
+ def test_singleton(self):
+ ftype = finfo(half)
+ ftype2 = finfo(half)
+ assert_equal(id(ftype), id(ftype2))
+
+class TestSingle(object):
+ def test_singleton(self):
+ ftype = finfo(single)
+ ftype2 = finfo(single)
+ assert_equal(id(ftype), id(ftype2))
+
+class TestDouble(object):
+ def test_singleton(self):
+ ftype = finfo(double)
+ ftype2 = finfo(double)
+ assert_equal(id(ftype), id(ftype2))
+
+class TestLongdouble(object):
+ def test_singleton(self):
+ ftype = finfo(longdouble)
+ ftype2 = finfo(longdouble)
+ assert_equal(id(ftype), id(ftype2))
+
+class TestFinfo(object):
+ def test_basic(self):
+ dts = list(zip(['f2', 'f4', 'f8', 'c8', 'c16'],
+ [np.float16, np.float32, np.float64, np.complex64,
+ np.complex128]))
+ for dt1, dt2 in dts:
+ for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machar', 'machep',
+ 'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp',
+ 'nmant', 'precision', 'resolution', 'tiny'):
+ assert_equal(getattr(finfo(dt1), attr),
+ getattr(finfo(dt2), attr), attr)
+ assert_raises(ValueError, finfo, 'i4')
+
+class TestIinfo(object):
+ def test_basic(self):
+ dts = list(zip(['i1', 'i2', 'i4', 'i8',
+ 'u1', 'u2', 'u4', 'u8'],
+ [np.int8, np.int16, np.int32, np.int64,
+ np.uint8, np.uint16, np.uint32, np.uint64]))
+ for dt1, dt2 in dts:
+ for attr in ('bits', 'min', 'max'):
+ assert_equal(getattr(iinfo(dt1), attr),
+ getattr(iinfo(dt2), attr), attr)
+ assert_raises(ValueError, iinfo, 'f4')
+
+ def test_unsigned_max(self):
+ types = np.sctypes['uint']
+ for T in types:
+ assert_equal(iinfo(T).max, T(-1))
+
+class TestRepr(object):
+ def test_iinfo_repr(self):
+ expected = "iinfo(min=-32768, max=32767, dtype=int16)"
+ assert_equal(repr(np.iinfo(np.int16)), expected)
+
+ def test_finfo_repr(self):
+ expected = "finfo(resolution=1e-06, min=-3.4028235e+38," + \
+ " max=3.4028235e+38, dtype=float32)"
+ assert_equal(repr(np.finfo(np.float32)), expected)
+
+
+def test_instances():
+ iinfo(10)
+ finfo(3.0)
+
+
+def assert_ma_equal(discovered, ma_like):
+    # Check that MachAr-like objects match the calculated MachAr instances
+ for key, value in discovered.__dict__.items():
+ assert_equal(value, getattr(ma_like, key))
+ if hasattr(value, 'shape'):
+ assert_equal(value.shape, getattr(ma_like, key).shape)
+ assert_equal(value.dtype, getattr(ma_like, key).dtype)
+
+
+def test_known_types():
+ # Test we are correctly compiling parameters for known types
+ for ftype, ma_like in ((np.float16, _float_ma[16]),
+ (np.float32, _float_ma[32]),
+ (np.float64, _float_ma[64])):
+ assert_ma_equal(_discovered_machar(ftype), ma_like)
+ # Suppress warning for broken discovery of double double on PPC
+ with np.errstate(all='ignore'):
+ ld_ma = _discovered_machar(np.longdouble)
+ bytes = np.dtype(np.longdouble).itemsize
+ if (ld_ma.it, ld_ma.maxexp) == (63, 16384) and bytes in (12, 16):
+ # 80-bit extended precision
+ assert_ma_equal(ld_ma, _float_ma[80])
+ elif (ld_ma.it, ld_ma.maxexp) == (112, 16384) and bytes == 16:
+        # IEEE 754 128-bit
+ assert_ma_equal(ld_ma, _float_ma[128])
+
+
+def test_plausible_finfo():
+ # Assert that finfo returns reasonable results for all types
+ for ftype in np.sctypes['float'] + np.sctypes['complex']:
+ info = np.finfo(ftype)
+ assert_(info.nmant > 1)
+ assert_(info.minexp < -1)
+ assert_(info.maxexp > 1)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_half.py b/contrib/python/numpy/py2/numpy/core/tests/test_half.py
new file mode 100644
index 00000000000..7707125014c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_half.py
@@ -0,0 +1,518 @@
+from __future__ import division, absolute_import, print_function
+
+import platform
+import pytest
+
+import numpy as np
+from numpy import uint16, float16, float32, float64
+from numpy.testing import assert_, assert_equal
+
+
+def assert_raises_fpe(strmatch, callable, *args, **kwargs):
+ try:
+ callable(*args, **kwargs)
+ except FloatingPointError as exc:
+ assert_(str(exc).find(strmatch) >= 0,
+ "Did not raise floating point %s error" % strmatch)
+ else:
+ assert_(False,
+ "Did not raise floating point %s error" % strmatch)
+
+class TestHalf(object):
+ def setup(self):
+ # An array of all possible float16 values
+ self.all_f16 = np.arange(0x10000, dtype=uint16)
+ self.all_f16.dtype = float16
+ self.all_f32 = np.array(self.all_f16, dtype=float32)
+ self.all_f64 = np.array(self.all_f16, dtype=float64)
+
+ # An array of all non-NaN float16 values, in sorted order
+ self.nonan_f16 = np.concatenate(
+ (np.arange(0xfc00, 0x7fff, -1, dtype=uint16),
+ np.arange(0x0000, 0x7c01, 1, dtype=uint16)))
+ self.nonan_f16.dtype = float16
+ self.nonan_f32 = np.array(self.nonan_f16, dtype=float32)
+ self.nonan_f64 = np.array(self.nonan_f16, dtype=float64)
+
+ # An array of all finite float16 values, in sorted order
+ self.finite_f16 = self.nonan_f16[1:-1]
+ self.finite_f32 = self.nonan_f32[1:-1]
+ self.finite_f64 = self.nonan_f64[1:-1]
+
+ def test_half_conversions(self):
+ """Checks that all 16-bit values survive conversion
+ to/from 32-bit and 64-bit float"""
+ # Because the underlying routines preserve the NaN bits, every
+ # value is preserved when converting to/from other floats.
+
+ # Convert from float32 back to float16
+ b = np.array(self.all_f32, dtype=float16)
+ assert_equal(self.all_f16.view(dtype=uint16),
+ b.view(dtype=uint16))
+
+ # Convert from float64 back to float16
+ b = np.array(self.all_f64, dtype=float16)
+ assert_equal(self.all_f16.view(dtype=uint16),
+ b.view(dtype=uint16))
+
+ # Convert float16 to longdouble and back
+ # This doesn't necessarily preserve the extra NaN bits,
+ # so exclude NaNs.
+ a_ld = np.array(self.nonan_f16, dtype=np.longdouble)
+ b = np.array(a_ld, dtype=float16)
+ assert_equal(self.nonan_f16.view(dtype=uint16),
+ b.view(dtype=uint16))
+
+ # Check the range for which all integers can be represented
+ i_int = np.arange(-2048, 2049)
+ i_f16 = np.array(i_int, dtype=float16)
+ j = np.array(i_f16, dtype=int)
+ assert_equal(i_int, j)
+
+ @pytest.mark.parametrize("offset", [None, "up", "down"])
+ @pytest.mark.parametrize("shift", [None, "up", "down"])
+ @pytest.mark.parametrize("float_t", [np.float32, np.float64])
+ def test_half_conversion_rounding(self, float_t, shift, offset):
+ # Assumes that round to even is used during casting.
+ max_pattern = np.float16(np.finfo(np.float16).max).view(np.uint16)
+
+        # Test all (positive) finite numbers; the denormals are the most
+        # interesting cases, however:
+ f16s_patterns = np.arange(0, max_pattern+1, dtype=np.uint16)
+ f16s_float = f16s_patterns.view(np.float16).astype(float_t)
+
+        # Shift the values by half a bit up or down (or do not shift):
+ if shift == "up":
+ f16s_float = 0.5 * (f16s_float[:-1] + f16s_float[1:])[1:]
+ elif shift == "down":
+ f16s_float = 0.5 * (f16s_float[:-1] + f16s_float[1:])[:-1]
+ else:
+ f16s_float = f16s_float[1:-1]
+
+ # Increase the float by a minimal value:
+ if offset == "up":
+ f16s_float = np.nextafter(f16s_float, float_t(1e50))
+ elif offset == "down":
+ f16s_float = np.nextafter(f16s_float, float_t(-1e50))
+
+ # Convert back to float16 and its bit pattern:
+ res_patterns = f16s_float.astype(np.float16).view(np.uint16)
+
+        # The above calculation tries the original values, or the exact
+        # midpoints between the float16 values. It then further offsets them
+        # by as little as possible. If no offset occurs, "round to even"
+        # logic will be necessary; an arbitrarily small offset should always
+        # cause normal up/down rounding.
+
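+        # Worked example (assuming IEEE binary16 round-to-nearest-even): the
+        # float_t value exactly midway between float16 1.0 (pattern 0x3c00) and
+        # the next value (pattern 0x3c01) rounds back to 0x3c00, since ties go
+        # to the pattern whose last significand bit is zero (even).
+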
+        # Calculate the expected pattern:
+ cmp_patterns = f16s_patterns[1:-1].copy()
+
+ if shift == "down" and offset != "up":
+ shift_pattern = -1
+ elif shift == "up" and offset != "down":
+ shift_pattern = 1
+ else:
+            # There cannot be a net shift in the pattern: either shift is None,
+            # so rounding returns to the original value, or the shift is
+            # cancelled out by the offset.
+
+ # If rounding occurs, is it normal rounding or round to even?
+ if offset is None:
+ # Round to even occurs, modify only non-even, cast to allow + (-1)
+ cmp_patterns[0::2].view(np.int16)[...] += shift_pattern
+ else:
+ cmp_patterns.view(np.int16)[...] += shift_pattern
+
+ assert_equal(res_patterns, cmp_patterns)
+
+ @pytest.mark.parametrize(["float_t", "uint_t", "bits"],
+ [(np.float32, np.uint32, 23),
+ (np.float64, np.uint64, 52)])
+ def test_half_conversion_denormal_round_even(self, float_t, uint_t, bits):
+        # Test specifically that all bits are considered when deciding
+        # whether round to even should occur (i.e. no bits are lost at the
+        # end). Compare also gh-12721. The most bits can be lost for the
+        # smallest denormal:
+ smallest_value = np.uint16(1).view(np.float16).astype(float_t)
+ assert smallest_value == 2**-24
+
+ # Will be rounded to zero based on round to even rule:
+ rounded_to_zero = smallest_value / float_t(2)
+ assert rounded_to_zero.astype(np.float16) == 0
+
+ # The significand will be all 0 for the float_t, test that we do not
+ # lose the lower ones of these:
+ for i in range(bits):
+ # slightly increasing the value should make it round up:
+ larger_pattern = rounded_to_zero.view(uint_t) | uint_t(1 << i)
+ larger_value = larger_pattern.view(float_t)
+ assert larger_value.astype(np.float16) == smallest_value
+
+ def test_nans_infs(self):
+ with np.errstate(all='ignore'):
+ # Check some of the ufuncs
+ assert_equal(np.isnan(self.all_f16), np.isnan(self.all_f32))
+ assert_equal(np.isinf(self.all_f16), np.isinf(self.all_f32))
+ assert_equal(np.isfinite(self.all_f16), np.isfinite(self.all_f32))
+ assert_equal(np.signbit(self.all_f16), np.signbit(self.all_f32))
+ assert_equal(np.spacing(float16(65504)), np.inf)
+
+ # Check comparisons of all values with NaN
+ nan = float16(np.nan)
+
+ assert_(not (self.all_f16 == nan).any())
+ assert_(not (nan == self.all_f16).any())
+
+ assert_((self.all_f16 != nan).all())
+ assert_((nan != self.all_f16).all())
+
+ assert_(not (self.all_f16 < nan).any())
+ assert_(not (nan < self.all_f16).any())
+
+ assert_(not (self.all_f16 <= nan).any())
+ assert_(not (nan <= self.all_f16).any())
+
+ assert_(not (self.all_f16 > nan).any())
+ assert_(not (nan > self.all_f16).any())
+
+ assert_(not (self.all_f16 >= nan).any())
+ assert_(not (nan >= self.all_f16).any())
+
+ def test_half_values(self):
+ """Confirms a small number of known half values"""
+ a = np.array([1.0, -1.0,
+ 2.0, -2.0,
+ 0.0999755859375, 0.333251953125, # 1/10, 1/3
+ 65504, -65504, # Maximum magnitude
+ 2.0**(-14), -2.0**(-14), # Minimum normal
+ 2.0**(-24), -2.0**(-24), # Minimum subnormal
+ 0, -1/1e1000, # Signed zeros
+ np.inf, -np.inf])
+ b = np.array([0x3c00, 0xbc00,
+ 0x4000, 0xc000,
+ 0x2e66, 0x3555,
+ 0x7bff, 0xfbff,
+ 0x0400, 0x8400,
+ 0x0001, 0x8001,
+ 0x0000, 0x8000,
+ 0x7c00, 0xfc00], dtype=uint16)
+ b.dtype = float16
+ assert_equal(a, b)
+
+ def test_half_rounding(self):
+ """Checks that rounding when converting to half is correct"""
+ a = np.array([2.0**-25 + 2.0**-35, # Rounds to minimum subnormal
+ 2.0**-25, # Underflows to zero (nearest even mode)
+ 2.0**-26, # Underflows to zero
+ 1.0+2.0**-11 + 2.0**-16, # rounds to 1.0+2**(-10)
+ 1.0+2.0**-11, # rounds to 1.0 (nearest even mode)
+ 1.0+2.0**-12, # rounds to 1.0
+ 65519, # rounds to 65504
+ 65520], # rounds to inf
+ dtype=float64)
+ rounded = [2.0**-24,
+ 0.0,
+ 0.0,
+ 1.0+2.0**(-10),
+ 1.0,
+ 1.0,
+ 65504,
+ np.inf]
+
+ # Check float64->float16 rounding
+ b = np.array(a, dtype=float16)
+ assert_equal(b, rounded)
+
+ # Check float32->float16 rounding
+ a = np.array(a, dtype=float32)
+ b = np.array(a, dtype=float16)
+ assert_equal(b, rounded)
+
+ def test_half_correctness(self):
+ """Take every finite float16, and check the casting functions with
+ a manual conversion."""
+
+ # Create an array of all finite float16s
+ a_bits = self.finite_f16.view(dtype=uint16)
+
+ # Convert to 64-bit float manually
+ a_sgn = (-1.0)**((a_bits & 0x8000) >> 15)
+ a_exp = np.array((a_bits & 0x7c00) >> 10, dtype=np.int32) - 15
+ a_man = (a_bits & 0x03ff) * 2.0**(-10)
+ # Implicit bit of normalized floats
+ a_man[a_exp != -15] += 1
+ # Denormalized exponent is -14
+ a_exp[a_exp == -15] = -14
+
+ a_manual = a_sgn * a_man * 2.0**a_exp
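+        # Worked example of the decoding above (cf. test_half_values): the
+        # pattern 0x3555 has sign 0, exponent field 13 (unbiased -2) and
+        # mantissa 0x155 = 341, giving (1 + 341/1024) * 2.0**-2
+        # = 0.333251953125, the closest half-precision value to 1/3.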
+
+ a32_fail = np.nonzero(self.finite_f32 != a_manual)[0]
+ if len(a32_fail) != 0:
+ bad_index = a32_fail[0]
+ assert_equal(self.finite_f32, a_manual,
+ "First non-equal is half value %x -> %g != %g" %
+ (self.finite_f16[bad_index],
+ self.finite_f32[bad_index],
+ a_manual[bad_index]))
+
+ a64_fail = np.nonzero(self.finite_f64 != a_manual)[0]
+ if len(a64_fail) != 0:
+ bad_index = a64_fail[0]
+ assert_equal(self.finite_f64, a_manual,
+ "First non-equal is half value %x -> %g != %g" %
+ (self.finite_f16[bad_index],
+ self.finite_f64[bad_index],
+ a_manual[bad_index]))
+
+ def test_half_ordering(self):
+ """Make sure comparisons are working right"""
+
+ # All non-NaN float16 values in reverse order
+ a = self.nonan_f16[::-1].copy()
+
+ # 32-bit float copy
+ b = np.array(a, dtype=float32)
+
+ # Should sort the same
+ a.sort()
+ b.sort()
+ assert_equal(a, b)
+
+ # Comparisons should work
+ assert_((a[:-1] <= a[1:]).all())
+ assert_(not (a[:-1] > a[1:]).any())
+ assert_((a[1:] >= a[:-1]).all())
+ assert_(not (a[1:] < a[:-1]).any())
+ # All != except for +/-0
+ assert_equal(np.nonzero(a[:-1] < a[1:])[0].size, a.size-2)
+ assert_equal(np.nonzero(a[1:] > a[:-1])[0].size, a.size-2)
+
+ def test_half_funcs(self):
+ """Test the various ArrFuncs"""
+
+ # fill
+ assert_equal(np.arange(10, dtype=float16),
+ np.arange(10, dtype=float32))
+
+ # fillwithscalar
+ a = np.zeros((5,), dtype=float16)
+ a.fill(1)
+ assert_equal(a, np.ones((5,), dtype=float16))
+
+ # nonzero and copyswap
+ a = np.array([0, 0, -1, -1/1e20, 0, 2.0**-24, 7.629e-6], dtype=float16)
+ assert_equal(a.nonzero()[0],
+ [2, 5, 6])
+ a = a.byteswap().newbyteorder()
+ assert_equal(a.nonzero()[0],
+ [2, 5, 6])
+
+ # dot
+ a = np.arange(0, 10, 0.5, dtype=float16)
+ b = np.ones((20,), dtype=float16)
+ assert_equal(np.dot(a, b),
+ 95)
+
+ # argmax
+ a = np.array([0, -np.inf, -2, 0.5, 12.55, 7.3, 2.1, 12.4], dtype=float16)
+ assert_equal(a.argmax(),
+ 4)
+ a = np.array([0, -np.inf, -2, np.inf, 12.55, np.nan, 2.1, 12.4], dtype=float16)
+ assert_equal(a.argmax(),
+ 5)
+
+ # getitem
+ a = np.arange(10, dtype=float16)
+ for i in range(10):
+ assert_equal(a.item(i), i)
+
+ def test_spacing_nextafter(self):
+ """Test np.spacing and np.nextafter"""
+ # All non-negative finite #'s
+ a = np.arange(0x7c00, dtype=uint16)
+ hinf = np.array((np.inf,), dtype=float16)
+ a_f16 = a.view(dtype=float16)
+
+ assert_equal(np.spacing(a_f16[:-1]), a_f16[1:]-a_f16[:-1])
+
+ assert_equal(np.nextafter(a_f16[:-1], hinf), a_f16[1:])
+ assert_equal(np.nextafter(a_f16[0], -hinf), -a_f16[1])
+ assert_equal(np.nextafter(a_f16[1:], -hinf), a_f16[:-1])
+
+ # switch to negatives
+ a |= 0x8000
+
+ assert_equal(np.spacing(a_f16[0]), np.spacing(a_f16[1]))
+ assert_equal(np.spacing(a_f16[1:]), a_f16[:-1]-a_f16[1:])
+
+ assert_equal(np.nextafter(a_f16[0], hinf), -a_f16[1])
+ assert_equal(np.nextafter(a_f16[1:], hinf), a_f16[:-1])
+ assert_equal(np.nextafter(a_f16[:-1], -hinf), a_f16[1:])
+
+ def test_half_ufuncs(self):
+ """Test the various ufuncs"""
+
+ a = np.array([0, 1, 2, 4, 2], dtype=float16)
+ b = np.array([-2, 5, 1, 4, 3], dtype=float16)
+ c = np.array([0, -1, -np.inf, np.nan, 6], dtype=float16)
+
+ assert_equal(np.add(a, b), [-2, 6, 3, 8, 5])
+ assert_equal(np.subtract(a, b), [2, -4, 1, 0, -1])
+ assert_equal(np.multiply(a, b), [0, 5, 2, 16, 6])
+ assert_equal(np.divide(a, b), [0, 0.199951171875, 2, 1, 0.66650390625])
+
+ assert_equal(np.equal(a, b), [False, False, False, True, False])
+ assert_equal(np.not_equal(a, b), [True, True, True, False, True])
+ assert_equal(np.less(a, b), [False, True, False, False, True])
+ assert_equal(np.less_equal(a, b), [False, True, False, True, True])
+ assert_equal(np.greater(a, b), [True, False, True, False, False])
+ assert_equal(np.greater_equal(a, b), [True, False, True, True, False])
+ assert_equal(np.logical_and(a, b), [False, True, True, True, True])
+ assert_equal(np.logical_or(a, b), [True, True, True, True, True])
+ assert_equal(np.logical_xor(a, b), [True, False, False, False, False])
+ assert_equal(np.logical_not(a), [True, False, False, False, False])
+
+ assert_equal(np.isnan(c), [False, False, False, True, False])
+ assert_equal(np.isinf(c), [False, False, True, False, False])
+ assert_equal(np.isfinite(c), [True, True, False, False, True])
+ assert_equal(np.signbit(b), [True, False, False, False, False])
+
+ assert_equal(np.copysign(b, a), [2, 5, 1, 4, 3])
+
+ assert_equal(np.maximum(a, b), [0, 5, 2, 4, 3])
+
+ x = np.maximum(b, c)
+ assert_(np.isnan(x[3]))
+ x[3] = 0
+ assert_equal(x, [0, 5, 1, 0, 6])
+
+ assert_equal(np.minimum(a, b), [-2, 1, 1, 4, 2])
+
+ x = np.minimum(b, c)
+ assert_(np.isnan(x[3]))
+ x[3] = 0
+ assert_equal(x, [-2, -1, -np.inf, 0, 3])
+
+ assert_equal(np.fmax(a, b), [0, 5, 2, 4, 3])
+ assert_equal(np.fmax(b, c), [0, 5, 1, 4, 6])
+ assert_equal(np.fmin(a, b), [-2, 1, 1, 4, 2])
+ assert_equal(np.fmin(b, c), [-2, -1, -np.inf, 4, 3])
+
+ assert_equal(np.floor_divide(a, b), [0, 0, 2, 1, 0])
+ assert_equal(np.remainder(a, b), [0, 1, 0, 0, 2])
+ assert_equal(np.divmod(a, b), ([0, 0, 2, 1, 0], [0, 1, 0, 0, 2]))
+ assert_equal(np.square(b), [4, 25, 1, 16, 9])
+ assert_equal(np.reciprocal(b), [-0.5, 0.199951171875, 1, 0.25, 0.333251953125])
+ assert_equal(np.ones_like(b), [1, 1, 1, 1, 1])
+ assert_equal(np.conjugate(b), b)
+ assert_equal(np.absolute(b), [2, 5, 1, 4, 3])
+ assert_equal(np.negative(b), [2, -5, -1, -4, -3])
+ assert_equal(np.positive(b), b)
+ assert_equal(np.sign(b), [-1, 1, 1, 1, 1])
+ assert_equal(np.modf(b), ([0, 0, 0, 0, 0], b))
+ assert_equal(np.frexp(b), ([-0.5, 0.625, 0.5, 0.5, 0.75], [2, 3, 1, 3, 2]))
+ assert_equal(np.ldexp(b, [0, 1, 2, 4, 2]), [-2, 10, 4, 64, 12])
+
+ def test_half_coercion(self):
+ """Test that half gets coerced properly with the other types"""
+ a16 = np.array((1,), dtype=float16)
+ a32 = np.array((1,), dtype=float32)
+ b16 = float16(1)
+ b32 = float32(1)
+
+ assert_equal(np.power(a16, 2).dtype, float16)
+ assert_equal(np.power(a16, 2.0).dtype, float16)
+ assert_equal(np.power(a16, b16).dtype, float16)
+ assert_equal(np.power(a16, b32).dtype, float16)
+ assert_equal(np.power(a16, a16).dtype, float16)
+ assert_equal(np.power(a16, a32).dtype, float32)
+
+ assert_equal(np.power(b16, 2).dtype, float64)
+ assert_equal(np.power(b16, 2.0).dtype, float64)
+ assert_equal(np.power(b16, b16).dtype, float16)
+ assert_equal(np.power(b16, b32).dtype, float32)
+ assert_equal(np.power(b16, a16).dtype, float16)
+ assert_equal(np.power(b16, a32).dtype, float32)
+
+ assert_equal(np.power(a32, a16).dtype, float32)
+ assert_equal(np.power(a32, b16).dtype, float32)
+ assert_equal(np.power(b32, a16).dtype, float16)
+ assert_equal(np.power(b32, b16).dtype, float32)
+
+ @pytest.mark.skipif(platform.machine() == "armv5tel",
+ reason="See gh-413.")
+ def test_half_fpe(self):
+ with np.errstate(all='raise'):
+ sx16 = np.array((1e-4,), dtype=float16)
+ bx16 = np.array((1e4,), dtype=float16)
+ sy16 = float16(1e-4)
+ by16 = float16(1e4)
+
+ # Underflow errors
+ assert_raises_fpe('underflow', lambda a, b:a*b, sx16, sx16)
+ assert_raises_fpe('underflow', lambda a, b:a*b, sx16, sy16)
+ assert_raises_fpe('underflow', lambda a, b:a*b, sy16, sx16)
+ assert_raises_fpe('underflow', lambda a, b:a*b, sy16, sy16)
+ assert_raises_fpe('underflow', lambda a, b:a/b, sx16, bx16)
+ assert_raises_fpe('underflow', lambda a, b:a/b, sx16, by16)
+ assert_raises_fpe('underflow', lambda a, b:a/b, sy16, bx16)
+ assert_raises_fpe('underflow', lambda a, b:a/b, sy16, by16)
+ assert_raises_fpe('underflow', lambda a, b:a/b,
+ float16(2.**-14), float16(2**11))
+ assert_raises_fpe('underflow', lambda a, b:a/b,
+ float16(-2.**-14), float16(2**11))
+ assert_raises_fpe('underflow', lambda a, b:a/b,
+ float16(2.**-14+2**-24), float16(2))
+ assert_raises_fpe('underflow', lambda a, b:a/b,
+ float16(-2.**-14-2**-24), float16(2))
+ assert_raises_fpe('underflow', lambda a, b:a/b,
+ float16(2.**-14+2**-23), float16(4))
+
+ # Overflow errors
+ assert_raises_fpe('overflow', lambda a, b:a*b, bx16, bx16)
+ assert_raises_fpe('overflow', lambda a, b:a*b, bx16, by16)
+ assert_raises_fpe('overflow', lambda a, b:a*b, by16, bx16)
+ assert_raises_fpe('overflow', lambda a, b:a*b, by16, by16)
+ assert_raises_fpe('overflow', lambda a, b:a/b, bx16, sx16)
+ assert_raises_fpe('overflow', lambda a, b:a/b, bx16, sy16)
+ assert_raises_fpe('overflow', lambda a, b:a/b, by16, sx16)
+ assert_raises_fpe('overflow', lambda a, b:a/b, by16, sy16)
+ assert_raises_fpe('overflow', lambda a, b:a+b,
+ float16(65504), float16(17))
+ assert_raises_fpe('overflow', lambda a, b:a-b,
+ float16(-65504), float16(17))
+ assert_raises_fpe('overflow', np.nextafter, float16(65504), float16(np.inf))
+ assert_raises_fpe('overflow', np.nextafter, float16(-65504), float16(-np.inf))
+ assert_raises_fpe('overflow', np.spacing, float16(65504))
+
+ # Invalid value errors
+ assert_raises_fpe('invalid', np.divide, float16(np.inf), float16(np.inf))
+ assert_raises_fpe('invalid', np.spacing, float16(np.inf))
+ assert_raises_fpe('invalid', np.spacing, float16(np.nan))
+ assert_raises_fpe('invalid', np.nextafter, float16(np.inf), float16(0))
+ assert_raises_fpe('invalid', np.nextafter, float16(-np.inf), float16(0))
+ assert_raises_fpe('invalid', np.nextafter, float16(0), float16(np.nan))
+
+ # These should not raise
+ float16(65472)+float16(32)
+ float16(2**-13)/float16(2)
+ float16(2**-14)/float16(2**10)
+ np.spacing(float16(-65504))
+ np.nextafter(float16(65504), float16(-np.inf))
+ np.nextafter(float16(-65504), float16(np.inf))
+ float16(2**-14)/float16(2**10)
+ float16(-2**-14)/float16(2**10)
+ float16(2**-14+2**-23)/float16(2)
+ float16(-2**-14-2**-23)/float16(2)
+
+ def test_half_array_interface(self):
+ """Test that half is compatible with __array_interface__"""
+ class Dummy:
+ pass
+
+ a = np.ones((1,), dtype=float16)
+ b = Dummy()
+ b.__array_interface__ = a.__array_interface__
+ c = np.array(b)
+ assert_(c.dtype == float16)
+ assert_equal(a, c)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_indexerrors.py b/contrib/python/numpy/py2/numpy/core/tests/test_indexerrors.py
new file mode 100644
index 00000000000..63b43c473c8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_indexerrors.py
@@ -0,0 +1,123 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_raises
+
+class TestIndexErrors(object):
+ '''Tests to exercise indexerrors not covered by other tests.'''
+
+ def test_arraytypes_fasttake(self):
+ 'take from a 0-length dimension'
+ x = np.empty((2, 3, 0, 4))
+ assert_raises(IndexError, x.take, [0], axis=2)
+ assert_raises(IndexError, x.take, [1], axis=2)
+ assert_raises(IndexError, x.take, [0], axis=2, mode='wrap')
+ assert_raises(IndexError, x.take, [0], axis=2, mode='clip')
+
+ def test_take_from_object(self):
+ # Check exception taking from object array
+ d = np.zeros(5, dtype=object)
+ assert_raises(IndexError, d.take, [6])
+
+ # Check exception taking from 0-d array
+ d = np.zeros((5, 0), dtype=object)
+ assert_raises(IndexError, d.take, [1], axis=1)
+ assert_raises(IndexError, d.take, [0], axis=1)
+ assert_raises(IndexError, d.take, [0])
+ assert_raises(IndexError, d.take, [0], mode='wrap')
+ assert_raises(IndexError, d.take, [0], mode='clip')
+
+ def test_multiindex_exceptions(self):
+ a = np.empty(5, dtype=object)
+ assert_raises(IndexError, a.item, 20)
+ a = np.empty((5, 0), dtype=object)
+ assert_raises(IndexError, a.item, (0, 0))
+
+ a = np.empty(5, dtype=object)
+ assert_raises(IndexError, a.itemset, 20, 0)
+ a = np.empty((5, 0), dtype=object)
+ assert_raises(IndexError, a.itemset, (0, 0), 0)
+
+ def test_put_exceptions(self):
+ a = np.zeros((5, 5))
+ assert_raises(IndexError, a.put, 100, 0)
+ a = np.zeros((5, 5), dtype=object)
+ assert_raises(IndexError, a.put, 100, 0)
+ a = np.zeros((5, 5, 0))
+ assert_raises(IndexError, a.put, 100, 0)
+ a = np.zeros((5, 5, 0), dtype=object)
+ assert_raises(IndexError, a.put, 100, 0)
+
+ def test_iterators_exceptions(self):
+ "cases in iterators.c"
+ def assign(obj, ind, val):
+ obj[ind] = val
+
+ a = np.zeros([1, 2, 3])
+ assert_raises(IndexError, lambda: a[0, 5, None, 2])
+ assert_raises(IndexError, lambda: a[0, 5, 0, 2])
+ assert_raises(IndexError, lambda: assign(a, (0, 5, None, 2), 1))
+ assert_raises(IndexError, lambda: assign(a, (0, 5, 0, 2), 1))
+
+ a = np.zeros([1, 0, 3])
+ assert_raises(IndexError, lambda: a[0, 0, None, 2])
+ assert_raises(IndexError, lambda: assign(a, (0, 0, None, 2), 1))
+
+ a = np.zeros([1, 2, 3])
+ assert_raises(IndexError, lambda: a.flat[10])
+ assert_raises(IndexError, lambda: assign(a.flat, 10, 5))
+ a = np.zeros([1, 0, 3])
+ assert_raises(IndexError, lambda: a.flat[10])
+ assert_raises(IndexError, lambda: assign(a.flat, 10, 5))
+
+ a = np.zeros([1, 2, 3])
+ assert_raises(IndexError, lambda: a.flat[np.array(10)])
+ assert_raises(IndexError, lambda: assign(a.flat, np.array(10), 5))
+ a = np.zeros([1, 0, 3])
+ assert_raises(IndexError, lambda: a.flat[np.array(10)])
+ assert_raises(IndexError, lambda: assign(a.flat, np.array(10), 5))
+
+ a = np.zeros([1, 2, 3])
+ assert_raises(IndexError, lambda: a.flat[np.array([10])])
+ assert_raises(IndexError, lambda: assign(a.flat, np.array([10]), 5))
+ a = np.zeros([1, 0, 3])
+ assert_raises(IndexError, lambda: a.flat[np.array([10])])
+ assert_raises(IndexError, lambda: assign(a.flat, np.array([10]), 5))
+
+ def test_mapping(self):
+ "cases from mapping.c"
+
+ def assign(obj, ind, val):
+ obj[ind] = val
+
+ a = np.zeros((0, 10))
+ assert_raises(IndexError, lambda: a[12])
+
+ a = np.zeros((3, 5))
+ assert_raises(IndexError, lambda: a[(10, 20)])
+ assert_raises(IndexError, lambda: assign(a, (10, 20), 1))
+ a = np.zeros((3, 0))
+ assert_raises(IndexError, lambda: a[(1, 0)])
+ assert_raises(IndexError, lambda: assign(a, (1, 0), 1))
+
+ a = np.zeros((10,))
+ assert_raises(IndexError, lambda: assign(a, 10, 1))
+ a = np.zeros((0,))
+ assert_raises(IndexError, lambda: assign(a, 10, 1))
+
+ a = np.zeros((3, 5))
+ assert_raises(IndexError, lambda: a[(1, [1, 20])])
+ assert_raises(IndexError, lambda: assign(a, (1, [1, 20]), 1))
+ a = np.zeros((3, 0))
+ assert_raises(IndexError, lambda: a[(1, [0, 1])])
+ assert_raises(IndexError, lambda: assign(a, (1, [0, 1]), 1))
+
+ def test_methods(self):
+ "cases from methods.c"
+
+ a = np.zeros((3, 3))
+ assert_raises(IndexError, lambda: a.item(100))
+ assert_raises(IndexError, lambda: a.itemset(100, 1))
+ a = np.zeros((0, 3))
+ assert_raises(IndexError, lambda: a.item(100))
+ assert_raises(IndexError, lambda: a.itemset(100, 1))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_indexing.py b/contrib/python/numpy/py2/numpy/core/tests/test_indexing.py
new file mode 100644
index 00000000000..f7485c3f7c9
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_indexing.py
@@ -0,0 +1,1334 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import warnings
+import functools
+import operator
+import pytest
+
+import numpy as np
+from numpy.core._multiarray_tests import array_indexing
+from itertools import product
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_array_equal, assert_warns,
+ HAS_REFCOUNT, suppress_warnings,
+ )
+
+
+class TestIndexing(object):
+ def test_index_no_floats(self):
+ a = np.array([[[5]]])
+
+ assert_raises(IndexError, lambda: a[0.0])
+ assert_raises(IndexError, lambda: a[0, 0.0])
+ assert_raises(IndexError, lambda: a[0.0, 0])
+ assert_raises(IndexError, lambda: a[0.0,:])
+ assert_raises(IndexError, lambda: a[:, 0.0])
+ assert_raises(IndexError, lambda: a[:, 0.0,:])
+ assert_raises(IndexError, lambda: a[0.0,:,:])
+ assert_raises(IndexError, lambda: a[0, 0, 0.0])
+ assert_raises(IndexError, lambda: a[0.0, 0, 0])
+ assert_raises(IndexError, lambda: a[0, 0.0, 0])
+ assert_raises(IndexError, lambda: a[-1.4])
+ assert_raises(IndexError, lambda: a[0, -1.4])
+ assert_raises(IndexError, lambda: a[-1.4, 0])
+ assert_raises(IndexError, lambda: a[-1.4,:])
+ assert_raises(IndexError, lambda: a[:, -1.4])
+ assert_raises(IndexError, lambda: a[:, -1.4,:])
+ assert_raises(IndexError, lambda: a[-1.4,:,:])
+ assert_raises(IndexError, lambda: a[0, 0, -1.4])
+ assert_raises(IndexError, lambda: a[-1.4, 0, 0])
+ assert_raises(IndexError, lambda: a[0, -1.4, 0])
+ assert_raises(IndexError, lambda: a[0.0:, 0.0])
+ assert_raises(IndexError, lambda: a[0.0:, 0.0,:])
+
+ def test_slicing_no_floats(self):
+ a = np.array([[5]])
+
+ # start as float.
+ assert_raises(TypeError, lambda: a[0.0:])
+ assert_raises(TypeError, lambda: a[0:, 0.0:2])
+ assert_raises(TypeError, lambda: a[0.0::2, :0])
+ assert_raises(TypeError, lambda: a[0.0:1:2,:])
+ assert_raises(TypeError, lambda: a[:, 0.0:])
+ # stop as float.
+ assert_raises(TypeError, lambda: a[:0.0])
+ assert_raises(TypeError, lambda: a[:0, 1:2.0])
+ assert_raises(TypeError, lambda: a[:0.0:2, :0])
+ assert_raises(TypeError, lambda: a[:0.0,:])
+ assert_raises(TypeError, lambda: a[:, 0:4.0:2])
+ # step as float.
+ assert_raises(TypeError, lambda: a[::1.0])
+ assert_raises(TypeError, lambda: a[0:, :2:2.0])
+ assert_raises(TypeError, lambda: a[1::4.0, :0])
+ assert_raises(TypeError, lambda: a[::5.0,:])
+ assert_raises(TypeError, lambda: a[:, 0:4:2.0])
+ # mixed.
+ assert_raises(TypeError, lambda: a[1.0:2:2.0])
+ assert_raises(TypeError, lambda: a[1.0::2.0])
+ assert_raises(TypeError, lambda: a[0:, :2.0:2.0])
+ assert_raises(TypeError, lambda: a[1.0:1:4.0, :0])
+ assert_raises(TypeError, lambda: a[1.0:5.0:5.0,:])
+ assert_raises(TypeError, lambda: a[:, 0.4:4.0:2.0])
+ # should still get the DeprecationWarning if step = 0.
+ assert_raises(TypeError, lambda: a[::0.0])
+
+ def test_index_no_array_to_index(self):
+ # No non-scalar arrays.
+ a = np.array([[[1]]])
+
+ assert_raises(TypeError, lambda: a[a:a:a])
+
+ def test_none_index(self):
+ # `None` index adds newaxis
+ a = np.array([1, 2, 3])
+ assert_equal(a[None], a[np.newaxis])
+ assert_equal(a[None].ndim, a.ndim + 1)
+
+ def test_empty_tuple_index(self):
+ # Empty tuple index creates a view
+ a = np.array([1, 2, 3])
+ assert_equal(a[()], a)
+ assert_(a[()].base is a)
+ a = np.array(0)
+ assert_(isinstance(a[()], np.int_))
+
+ def test_void_scalar_empty_tuple(self):
+ s = np.zeros((), dtype='V4')
+ assert_equal(s[()].dtype, s.dtype)
+ assert_equal(s[()], s)
+ assert_equal(type(s[...]), np.ndarray)
+
+ def test_same_kind_index_casting(self):
+        # Indexes should be cast with same-kind and not safe, even if that
+        # is somewhat unsafe. So test various code paths.
+ index = np.arange(5)
+ u_index = index.astype(np.uintp)
+ arr = np.arange(10)
+
+ assert_array_equal(arr[index], arr[u_index])
+ arr[u_index] = np.arange(5)
+ assert_array_equal(arr, np.arange(10))
+
+ arr = np.arange(10).reshape(5, 2)
+ assert_array_equal(arr[index], arr[u_index])
+
+ arr[u_index] = np.arange(5)[:,None]
+ assert_array_equal(arr, np.arange(5)[:,None].repeat(2, axis=1))
+
+ arr = np.arange(25).reshape(5, 5)
+ assert_array_equal(arr[u_index, u_index], arr[index, index])
+
+ def test_empty_fancy_index(self):
+ # Empty list index creates an empty array
+ # with the same dtype (but with weird shape)
+ a = np.array([1, 2, 3])
+ assert_equal(a[[]], [])
+ assert_equal(a[[]].dtype, a.dtype)
+
+ b = np.array([], dtype=np.intp)
+ assert_equal(a[[]], [])
+ assert_equal(a[[]].dtype, a.dtype)
+
+ b = np.array([])
+ assert_raises(IndexError, a.__getitem__, b)
+
+ def test_ellipsis_index(self):
+ a = np.array([[1, 2, 3],
+ [4, 5, 6],
+ [7, 8, 9]])
+ assert_(a[...] is not a)
+ assert_equal(a[...], a)
+ # `a[...]` was `a` in numpy <1.9.
+ assert_(a[...].base is a)
+
+ # Slicing with ellipsis can skip an
+ # arbitrary number of dimensions
+ assert_equal(a[0, ...], a[0])
+ assert_equal(a[0, ...], a[0,:])
+ assert_equal(a[..., 0], a[:, 0])
+
+ # Slicing with ellipsis always results
+ # in an array, not a scalar
+ assert_equal(a[0, ..., 1], np.array(2))
+
+ # Assignment with `(Ellipsis,)` on 0-d arrays
+ b = np.array(1)
+ b[(Ellipsis,)] = 2
+ assert_equal(b, 2)
+
+ def test_single_int_index(self):
+ # Single integer index selects one row
+ a = np.array([[1, 2, 3],
+ [4, 5, 6],
+ [7, 8, 9]])
+
+ assert_equal(a[0], [1, 2, 3])
+ assert_equal(a[-1], [7, 8, 9])
+
+ # Index out of bounds produces IndexError
+ assert_raises(IndexError, a.__getitem__, 1 << 30)
+ # Index overflow produces IndexError
+ assert_raises(IndexError, a.__getitem__, 1 << 64)
+
+ def test_single_bool_index(self):
+ # Single boolean index
+ a = np.array([[1, 2, 3],
+ [4, 5, 6],
+ [7, 8, 9]])
+
+ assert_equal(a[np.array(True)], a[None])
+ assert_equal(a[np.array(False)], a[None][0:0])
+
+ def test_boolean_shape_mismatch(self):
+ arr = np.ones((5, 4, 3))
+
+ index = np.array([True])
+ assert_raises(IndexError, arr.__getitem__, index)
+
+ index = np.array([False] * 6)
+ assert_raises(IndexError, arr.__getitem__, index)
+
+ index = np.zeros((4, 4), dtype=bool)
+ assert_raises(IndexError, arr.__getitem__, index)
+
+ assert_raises(IndexError, arr.__getitem__, (slice(None), index))
+
+ def test_boolean_indexing_onedim(self):
+ # Indexing a 2-dimensional array with
+ # boolean array of length one
+ a = np.array([[ 0., 0., 0.]])
+ b = np.array([ True], dtype=bool)
+ assert_equal(a[b], a)
+ # boolean assignment
+ a[b] = 1.
+ assert_equal(a, [[1., 1., 1.]])
+
+ def test_boolean_assignment_value_mismatch(self):
+ # A boolean assignment should fail when the shape of the values
+ # cannot be broadcast to the subscription. (see also gh-3458)
+ a = np.arange(4)
+
+ def f(a, v):
+ a[a > -1] = v
+
+ assert_raises(ValueError, f, a, [])
+ assert_raises(ValueError, f, a, [1, 2, 3])
+ assert_raises(ValueError, f, a[:1], [1, 2, 3])
+
+ def test_boolean_assignment_needs_api(self):
+ # See also gh-7666
+ # This caused a segfault on Python 2 due to the GIL not being
+ # held when the iterator does not need it, but the transfer function
+ # does
+ arr = np.zeros(1000)
+ indx = np.zeros(1000, dtype=bool)
+ indx[:100] = True
+ arr[indx] = np.ones(100, dtype=object)
+
+ expected = np.zeros(1000)
+ expected[:100] = 1
+ assert_array_equal(arr, expected)
+
+ def test_boolean_indexing_twodim(self):
+ # Indexing a 2-dimensional array with
+ # 2-dimensional boolean array
+ a = np.array([[1, 2, 3],
+ [4, 5, 6],
+ [7, 8, 9]])
+ b = np.array([[ True, False, True],
+ [False, True, False],
+ [ True, False, True]])
+ assert_equal(a[b], [1, 3, 5, 7, 9])
+ assert_equal(a[b[1]], [[4, 5, 6]])
+ assert_equal(a[b[0]], a[b[2]])
+
+ # boolean assignment
+ a[b] = 0
+ assert_equal(a, [[0, 2, 0],
+ [4, 0, 6],
+ [0, 8, 0]])
+
+ def test_boolean_indexing_list(self):
+ # Regression test for #13715. It's a use-after-free bug which the
+ # test won't directly catch, but it will show up in valgrind.
+ a = np.array([1, 2, 3])
+ b = [True, False, True]
+ # Two variants of the test because the first takes a fast path
+ assert_equal(a[b], [1, 3])
+ assert_equal(a[None, b], [[1, 3]])
+
+ def test_reverse_strides_and_subspace_bufferinit(self):
+ # This tests that the strides are not reversed for simple and
+ # subspace fancy indexing.
+ a = np.ones(5)
+ b = np.zeros(5, dtype=np.intp)[::-1]
+ c = np.arange(5)[::-1]
+
+ a[b] = c
+ # If the strides are not reversed, the 0 in the arange comes last.
+ assert_equal(a[0], 0)
+
+ # This also tests that the subspace buffer is initialized:
+ a = np.ones((5, 2))
+ c = np.arange(10).reshape(5, 2)[::-1]
+ a[b, :] = c
+ assert_equal(a[0], [0, 1])
+
+ def test_reversed_strides_result_allocation(self):
+ # Test a bug when calculating the output strides for a result array
+ # when the subspace size was 1 (and test other cases as well)
+ a = np.arange(10)[:, None]
+ i = np.arange(10)[::-1]
+ assert_array_equal(a[i], a[i.copy('C')])
+
+ a = np.arange(20).reshape(-1, 2)
+
+ def test_uncontiguous_subspace_assignment(self):
+ # During development there was a bug activating a skip logic
+ # based on ndim instead of size.
+ a = np.full((3, 4, 2), -1)
+ b = np.full((3, 4, 2), -1)
+
+ a[[0, 1]] = np.arange(2 * 4 * 2).reshape(2, 4, 2).T
+ b[[0, 1]] = np.arange(2 * 4 * 2).reshape(2, 4, 2).T.copy()
+
+ assert_equal(a, b)
+
+ def test_too_many_fancy_indices_special_case(self):
+        # Just documents behaviour; this is a small limitation.
+ a = np.ones((1,) * 32) # 32 is NPY_MAXDIMS
+ assert_raises(IndexError, a.__getitem__, (np.array([0]),) * 32)
+
+ def test_scalar_array_bool(self):
+ # NumPy bools can be used as boolean index (python ones as of yet not)
+ a = np.array(1)
+ assert_equal(a[np.bool_(True)], a[np.array(True)])
+ assert_equal(a[np.bool_(False)], a[np.array(False)])
+
+ # After deprecating bools as integers:
+ #a = np.array([0,1,2])
+ #assert_equal(a[True, :], a[None, :])
+ #assert_equal(a[:, True], a[:, None])
+ #
+ #assert_(not np.may_share_memory(a, a[True, :]))
+
+ def test_everything_returns_views(self):
+ # Before `...` would return a itself.
+ a = np.arange(5)
+
+ assert_(a is not a[()])
+ assert_(a is not a[...])
+ assert_(a is not a[:])
+
+ def test_broaderrors_indexing(self):
+ a = np.zeros((5, 5))
+ assert_raises(IndexError, a.__getitem__, ([0, 1], [0, 1, 2]))
+ assert_raises(IndexError, a.__setitem__, ([0, 1], [0, 1, 2]), 0)
+
+ def test_trivial_fancy_out_of_bounds(self):
+ a = np.zeros(5)
+ ind = np.ones(20, dtype=np.intp)
+ ind[-1] = 10
+ assert_raises(IndexError, a.__getitem__, ind)
+ assert_raises(IndexError, a.__setitem__, ind, 0)
+ ind = np.ones(20, dtype=np.intp)
+ ind[0] = 11
+ assert_raises(IndexError, a.__getitem__, ind)
+ assert_raises(IndexError, a.__setitem__, ind, 0)
+
+ def test_trivial_fancy_not_possible(self):
+ # Test that the fast path for trivial assignment is not incorrectly
+ # used when the index is not contiguous or 1D, see also gh-11467.
+ a = np.arange(6)
+ idx = np.arange(6, dtype=np.intp).reshape(2, 1, 3)[:, :, 0]
+ assert_array_equal(a[idx], idx)
+
+        # This case must not go into the fast path; note that idx is
+        # a non-contiguous, non-1D array here.
+ a[idx] = -1
+ res = np.arange(6)
+ res[0] = -1
+ res[3] = -1
+ assert_array_equal(a, res)
+
+ def test_nonbaseclass_values(self):
+ class SubClass(np.ndarray):
+ def __array_finalize__(self, old):
+ # Have array finalize do funny things
+ self.fill(99)
+
+ a = np.zeros((5, 5))
+ s = a.copy().view(type=SubClass)
+ s.fill(1)
+
+ a[[0, 1, 2, 3, 4], :] = s
+ assert_((a == 1).all())
+
+ # Subspace is last, so transposing might want to finalize
+ a[:, [0, 1, 2, 3, 4]] = s
+ assert_((a == 1).all())
+
+ a.fill(0)
+ a[...] = s
+ assert_((a == 1).all())
+
+ def test_subclass_writeable(self):
+ d = np.rec.array([('NGC1001', 11), ('NGC1002', 1.), ('NGC1003', 1.)],
+ dtype=[('target', 'S20'), ('V_mag', '>f4')])
+ ind = np.array([False, True, True], dtype=bool)
+ assert_(d[ind].flags.writeable)
+ ind = np.array([0, 1])
+ assert_(d[ind].flags.writeable)
+ assert_(d[...].flags.writeable)
+ assert_(d[0].flags.writeable)
+
+ def test_memory_order(self):
+ # This is not necessary to preserve. Memory layouts for
+ # more complex indices are not as simple.
+ a = np.arange(10)
+ b = np.arange(10).reshape(5,2).T
+ assert_(a[b].flags.f_contiguous)
+
+ # Takes a different implementation branch:
+ a = a.reshape(-1, 1)
+ assert_(a[b, 0].flags.f_contiguous)
+
+ def test_scalar_return_type(self):
+ # Full scalar indices should return scalars and object
+ # arrays should not call PyArray_Return on their items
+ class Zero(object):
+ # The most basic valid indexing
+ def __index__(self):
+ return 0
+
+ z = Zero()
+
+ class ArrayLike(object):
+ # Simple array, should behave like the array
+ def __array__(self):
+ return np.array(0)
+
+ a = np.zeros(())
+ assert_(isinstance(a[()], np.float_))
+ a = np.zeros(1)
+ assert_(isinstance(a[z], np.float_))
+ a = np.zeros((1, 1))
+ assert_(isinstance(a[z, np.array(0)], np.float_))
+ assert_(isinstance(a[z, ArrayLike()], np.float_))
+
+ # And object arrays do not call it too often:
+ b = np.array(0)
+ a = np.array(0, dtype=object)
+ a[()] = b
+ assert_(isinstance(a[()], np.ndarray))
+ a = np.array([b, None])
+ assert_(isinstance(a[z], np.ndarray))
+ a = np.array([[b, None]])
+ assert_(isinstance(a[z, np.array(0)], np.ndarray))
+ assert_(isinstance(a[z, ArrayLike()], np.ndarray))
+
+ def test_small_regressions(self):
+ # Reference count of intp for index checks
+ a = np.array([0])
+ if HAS_REFCOUNT:
+ refcount = sys.getrefcount(np.dtype(np.intp))
+ # item setting always checks indices in separate function:
+ a[np.array([0], dtype=np.intp)] = 1
+ a[np.array([0], dtype=np.uint8)] = 1
+ assert_raises(IndexError, a.__setitem__,
+ np.array([1], dtype=np.intp), 1)
+ assert_raises(IndexError, a.__setitem__,
+ np.array([1], dtype=np.uint8), 1)
+
+ if HAS_REFCOUNT:
+ assert_equal(sys.getrefcount(np.dtype(np.intp)), refcount)
+
+ def test_unaligned(self):
+ v = (np.zeros(64, dtype=np.int8) + ord('a'))[1:-7]
+ d = v.view(np.dtype("S8"))
+ # unaligned source
+ x = (np.zeros(16, dtype=np.int8) + ord('a'))[1:-7]
+ x = x.view(np.dtype("S8"))
+ x[...] = np.array("b" * 8, dtype="S")
+ b = np.arange(d.size)
+        # trivial
+ assert_equal(d[b], d)
+ d[b] = x
+ # nontrivial
+ # unaligned index array
+ b = np.zeros(d.size + 1).view(np.int8)[1:-(np.intp(0).itemsize - 1)]
+ b = b.view(np.intp)[:d.size]
+ b[...] = np.arange(d.size)
+ assert_equal(d[b.astype(np.int16)], d)
+ d[b.astype(np.int16)] = x
+ # boolean
+ d[b % 2 == 0]
+ d[b % 2 == 0] = x[::2]
+
+ def test_tuple_subclass(self):
+ arr = np.ones((5, 5))
+
+ # A tuple subclass should also be an nd-index
+ class TupleSubclass(tuple):
+ pass
+ index = ([1], [1])
+ index = TupleSubclass(index)
+ assert_(arr[index].shape == (1,))
+        # Unlike the non-nd-index:
+ assert_(arr[index,].shape != (1,))
+
+ def test_broken_sequence_not_nd_index(self):
+ # See gh-5063:
+        # If we have an object which claims to be a sequence but fails
+        # on item getting, it should not be converted to an nd-index (tuple).
+        # If the object happens to be a valid index otherwise, it should work.
+        # The object used here is very dubious and probably bad though:
+ class SequenceLike(object):
+ def __index__(self):
+ return 0
+
+ def __len__(self):
+ return 1
+
+ def __getitem__(self, item):
+ raise IndexError('Not possible')
+
+ arr = np.arange(10)
+ assert_array_equal(arr[SequenceLike()], arr[SequenceLike(),])
+
+ # also test that field indexing does not segfault
+ # for a similar reason, by indexing a structured array
+ arr = np.zeros((1,), dtype=[('f1', 'i8'), ('f2', 'i8')])
+ assert_array_equal(arr[SequenceLike()], arr[SequenceLike(),])
+
+ def test_indexing_array_weird_strides(self):
+ # See also gh-6221
+ # the shapes used here come from the issue and create the correct
+ # size for the iterator buffering size.
+ x = np.ones(10)
+ x2 = np.ones((10, 2))
+ ind = np.arange(10)[:, None, None, None]
+ ind = np.broadcast_to(ind, (10, 55, 4, 4))
+
+ # single advanced index case
+ assert_array_equal(x[ind], x[ind.copy()])
+ # higher dimensional advanced index
+ zind = np.zeros(4, dtype=np.intp)
+ assert_array_equal(x2[ind, zind], x2[ind.copy(), zind])
+
+ def test_indexing_array_negative_strides(self):
+ # From gh-8264,
+ # core dumps if negative strides are used in iteration
+ arro = np.zeros((4, 4))
+ arr = arro[::-1, ::-1]
+
+ slices = (slice(None), [0, 1, 2, 3])
+ arr[slices] = 10
+ assert_array_equal(arr, 10.)
+
+class TestFieldIndexing(object):
+ def test_scalar_return_type(self):
+ # Field access on an array should return an array, even if it
+ # is 0-d.
+ a = np.zeros((), [('a','f8')])
+ assert_(isinstance(a['a'], np.ndarray))
+ assert_(isinstance(a[['a']], np.ndarray))
+
+
+class TestBroadcastedAssignments(object):
+ def assign(self, a, ind, val):
+ a[ind] = val
+ return a
+
+ def test_prepending_ones(self):
+ a = np.zeros((3, 2))
+
+ a[...] = np.ones((1, 3, 2))
+ # Fancy with subspace with and without transpose
+ a[[0, 1, 2], :] = np.ones((1, 3, 2))
+ a[:, [0, 1]] = np.ones((1, 3, 2))
+ # Fancy without subspace (with broadcasting)
+ a[[[0], [1], [2]], [0, 1]] = np.ones((1, 3, 2))
+
+ def test_prepend_not_one(self):
+ assign = self.assign
+ s_ = np.s_
+ a = np.zeros(5)
+
+ # Too large and not only ones.
+ assert_raises(ValueError, assign, a, s_[...], np.ones((2, 1)))
+ assert_raises(ValueError, assign, a, s_[[1, 2, 3],], np.ones((2, 1)))
+ assert_raises(ValueError, assign, a, s_[[[1], [2]],], np.ones((2,2,1)))
+
+ def test_simple_broadcasting_errors(self):
+ assign = self.assign
+ s_ = np.s_
+ a = np.zeros((5, 1))
+
+ assert_raises(ValueError, assign, a, s_[...], np.zeros((5, 2)))
+ assert_raises(ValueError, assign, a, s_[...], np.zeros((5, 0)))
+ assert_raises(ValueError, assign, a, s_[:, [0]], np.zeros((5, 2)))
+ assert_raises(ValueError, assign, a, s_[:, [0]], np.zeros((5, 0)))
+ assert_raises(ValueError, assign, a, s_[[0], :], np.zeros((2, 1)))
+
+ def test_index_is_larger(self):
+ # Simple case of fancy index broadcasting of the index.
+ a = np.zeros((5, 5))
+ a[[[0], [1], [2]], [0, 1, 2]] = [2, 3, 4]
+
+ assert_((a[:3, :3] == [2, 3, 4]).all())
+
+ def test_broadcast_subspace(self):
+ a = np.zeros((100, 100))
+ v = np.arange(100)[:,None]
+ b = np.arange(100)[::-1]
+ a[b] = v
+ assert_((a[::-1] == v).all())
+
+
+class TestSubclasses(object):
+ def test_basic(self):
+ # Test that indexing in various ways produces SubClass instances,
+ # and that the base is set up correctly: the original subclass
+ # instance for views, and a new ndarray for advanced/boolean indexing
+ # where a copy was made (latter a regression test for gh-11983).
+ class SubClass(np.ndarray):
+ pass
+
+ a = np.arange(5)
+ s = a.view(SubClass)
+ s_slice = s[:3]
+ assert_(type(s_slice) is SubClass)
+ assert_(s_slice.base is s)
+ assert_array_equal(s_slice, a[:3])
+
+ s_fancy = s[[0, 1, 2]]
+ assert_(type(s_fancy) is SubClass)
+ assert_(s_fancy.base is not s)
+ assert_(type(s_fancy.base) is np.ndarray)
+ assert_array_equal(s_fancy, a[[0, 1, 2]])
+ assert_array_equal(s_fancy.base, a[[0, 1, 2]])
+
+ s_bool = s[s > 0]
+ assert_(type(s_bool) is SubClass)
+ assert_(s_bool.base is not s)
+ assert_(type(s_bool.base) is np.ndarray)
+ assert_array_equal(s_bool, a[a > 0])
+ assert_array_equal(s_bool.base, a[a > 0])
+
+ def test_finalize_gets_full_info(self):
+ # Array finalize should be called on the filled array.
+ class SubClass(np.ndarray):
+ def __array_finalize__(self, old):
+ self.finalize_status = np.array(self)
+ self.old = old
+
+ s = np.arange(10).view(SubClass)
+ new_s = s[:3]
+ assert_array_equal(new_s.finalize_status, new_s)
+ assert_array_equal(new_s.old, s)
+
+ new_s = s[[0,1,2,3]]
+ assert_array_equal(new_s.finalize_status, new_s)
+ assert_array_equal(new_s.old, s)
+
+ new_s = s[s > 0]
+ assert_array_equal(new_s.finalize_status, new_s)
+ assert_array_equal(new_s.old, s)
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_slice_decref_getsetslice(self):
+        # See gh-10066, a temporary slice object should be discarded.
+ # This test is only really interesting on Python 2 since
+ # it goes through `__set/getslice__` here and can probably be
+ # removed. Use 0:7 to make sure it is never None:7.
+ class KeepIndexObject(np.ndarray):
+ def __getitem__(self, indx):
+ self.indx = indx
+ if indx == slice(0, 7):
+ raise ValueError
+
+ def __setitem__(self, indx, val):
+ self.indx = indx
+ if indx == slice(0, 4):
+ raise ValueError
+
+ k = np.array([1]).view(KeepIndexObject)
+ k[0:5]
+ assert_equal(k.indx, slice(0, 5))
+ assert_equal(sys.getrefcount(k.indx), 2)
+ try:
+ k[0:7]
+ raise AssertionError
+ except ValueError:
+ # The exception holds a reference to the slice so clear on Py2
+ if hasattr(sys, 'exc_clear'):
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning)
+ sys.exc_clear()
+ assert_equal(k.indx, slice(0, 7))
+ assert_equal(sys.getrefcount(k.indx), 2)
+
+ k[0:3] = 6
+ assert_equal(k.indx, slice(0, 3))
+ assert_equal(sys.getrefcount(k.indx), 2)
+ try:
+ k[0:4] = 2
+ raise AssertionError
+ except ValueError:
+ # The exception holds a reference to the slice so clear on Py2
+ if hasattr(sys, 'exc_clear'):
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning)
+ sys.exc_clear()
+ assert_equal(k.indx, slice(0, 4))
+ assert_equal(sys.getrefcount(k.indx), 2)
+
+
+class TestFancyIndexingCast(object):
+ def test_boolean_index_cast_assign(self):
+ # Setup the boolean index and float arrays.
+ shape = (8, 63)
+ bool_index = np.zeros(shape).astype(bool)
+ bool_index[0, 1] = True
+ zero_array = np.zeros(shape)
+
+ # Assigning float is fine.
+ zero_array[bool_index] = np.array([1])
+ assert_equal(zero_array[0, 1], 1)
+
+ # Fancy indexing works, although we get a cast warning.
+ assert_warns(np.ComplexWarning,
+ zero_array.__setitem__, ([0], [1]), np.array([2 + 1j]))
+ assert_equal(zero_array[0, 1], 2) # No complex part
+
+ # Cast complex to float, throwing away the imaginary portion.
+ assert_warns(np.ComplexWarning,
+ zero_array.__setitem__, bool_index, np.array([1j]))
+ assert_equal(zero_array[0, 1], 0)
+
+class TestFancyIndexingEquivalence(object):
+ def test_object_assign(self):
+ # Check that the field and object special case using copyto is active.
+ # The right hand side cannot be converted to an array here.
+ a = np.arange(5, dtype=object)
+ b = a.copy()
+ a[:3] = [1, (1,2), 3]
+ b[[0, 1, 2]] = [1, (1,2), 3]
+ assert_array_equal(a, b)
+
+ # test same for subspace fancy indexing
+ b = np.arange(5, dtype=object)[None, :]
+ b[[0], :3] = [[1, (1,2), 3]]
+ assert_array_equal(a, b[0])
+
+ # Check that swapping of axes works.
+        # There was a bug that made the assignment below throw a ValueError
+        # due to an incorrectly transposed temporary right-hand side (gh-5714)
+ b = b.T
+ b[:3, [0]] = [[1], [(1,2)], [3]]
+ assert_array_equal(a, b[:, 0])
+
+ # Another test for the memory order of the subspace
+ arr = np.ones((3, 4, 5), dtype=object)
+ # Equivalent slicing assignment for comparison
+ cmp_arr = arr.copy()
+ cmp_arr[:1, ...] = [[[1], [2], [3], [4]]]
+ arr[[0], ...] = [[[1], [2], [3], [4]]]
+ assert_array_equal(arr, cmp_arr)
+ arr = arr.copy('F')
+ arr[[0], ...] = [[[1], [2], [3], [4]]]
+ assert_array_equal(arr, cmp_arr)
+
+ def test_cast_equivalence(self):
+ # Yes, normal slicing uses unsafe casting.
+ a = np.arange(5)
+ b = a.copy()
+
+ a[:3] = np.array(['2', '-3', '-1'])
+ b[[0, 2, 1]] = np.array(['2', '-1', '-3'])
+ assert_array_equal(a, b)
+
+ # test the same for subspace fancy indexing
+ b = np.arange(5)[None, :]
+ b[[0], :3] = np.array([['2', '-3', '-1']])
+ assert_array_equal(a, b[0])
+
+
+class TestMultiIndexingAutomated(object):
+ """
+    These tests use code to mimic the C code indexing for selection.
+
+ NOTE:
+
+ * This still lacks tests for complex item setting.
+ * If you change behavior of indexing, you might want to modify
+ these tests to try more combinations.
+ * Behavior was written to match numpy version 1.8. (though a
+ first version matched 1.7.)
+ * Only tuple indices are supported by the mimicking code.
+ (and tested as of writing this)
+ * Error types should match most of the time as long as there
+ is only one error. For multiple errors, what gets raised
+ will usually not be the same one. They are *not* tested.
+
+ Update 2016-11-30: It is probably not worth maintaining this test
+ indefinitely and it can be dropped if maintenance becomes a burden.
+
+ """
+
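+    # Illustrative sketch (added for clarity, not part of the original tests):
+    # the combinations exercised below mix slices, newaxis, booleans and fancy
+    # arrays.  A typical checked index mixes e.g. an integer, an Ellipsis and
+    # a fancy array, as in ``arr[0, ..., [0, 2]]``; the mimicking code in
+    # ``_get_multi_index`` must reproduce what real indexing returns.
+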
+ def setup(self):
+ self.a = np.arange(np.prod([3, 1, 5, 6])).reshape(3, 1, 5, 6)
+ self.b = np.empty((3, 0, 5, 6))
+ self.complex_indices = ['skip', Ellipsis,
+ 0,
+ # Boolean indices, up to 3-d for some special cases of eating up
+ # dimensions, also need to test all False
+ np.array([True, False, False]),
+ np.array([[True, False], [False, True]]),
+ np.array([[[False, False], [False, False]]]),
+ # Some slices:
+ slice(-5, 5, 2),
+ slice(1, 1, 100),
+ slice(4, -1, -2),
+ slice(None, None, -3),
+ # Some Fancy indexes:
+ np.empty((0, 1, 1), dtype=np.intp), # empty and can be broadcast
+ np.array([0, 1, -2]),
+ np.array([[2], [0], [1]]),
+ np.array([[0, -1], [0, 1]], dtype=np.dtype('intp').newbyteorder()),
+ np.array([2, -1], dtype=np.int8),
+ np.zeros([1]*31, dtype=int), # trigger too large array.
+ np.array([0., 1.])] # invalid datatype
+ # Some simpler indices that still cover a bit more
+ self.simple_indices = [Ellipsis, None, -1, [1], np.array([True]),
+ 'skip']
+ # Very simple ones to fill the rest:
+ self.fill_indices = [slice(None, None), 0]
+
+ def _get_multi_index(self, arr, indices):
+ """Mimic multi dimensional indexing.
+
+ Parameters
+ ----------
+ arr : ndarray
+ Array to be indexed.
+ indices : tuple of index objects
+
+ Returns
+ -------
+ out : ndarray
+ An array equivalent to the indexing operation (but always a copy).
+ `arr[indices]` should be identical.
+ no_copy : bool
+ Whether the indexing operation requires a copy. If this is `True`,
+ `np.may_share_memory(arr, arr[indices])` should be `True` (with
+ some exceptions for scalars and possibly 0-d arrays).
+
+ Notes
+ -----
+        This function tries to raise the same errors as normal indexing, but
+        an exact match is not guaranteed.
+ """
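+        # Illustrative usage (not part of the original code): for example,
+        #   mimic_get, no_copy = self._get_multi_index(self.a, (Ellipsis, [0, 2]))
+        # is expected to equal ``self.a[..., [0, 2]]`` element-wise, with
+        # ``no_copy`` False because the fancy index forces a copy.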
+ in_indices = list(indices)
+ indices = []
+ # if False, this is a fancy or boolean index
+ no_copy = True
+ # number of fancy/scalar indexes that are not consecutive
+ num_fancy = 0
+ # number of dimensions indexed by a "fancy" index
+ fancy_dim = 0
+ # NOTE: This is a funny twist (and probably OK to change).
+ # The boolean array has illegal indexes, but this is
+ # allowed if the broadcast fancy-indices are 0-sized.
+ # This variable is to catch that case.
+ error_unless_broadcast_to_empty = False
+
+ # We need to handle Ellipsis and make arrays from indices, also
+ # check if this is fancy indexing (set no_copy).
+ ndim = 0
+ ellipsis_pos = None # define here mostly to replace all but first.
+ for i, indx in enumerate(in_indices):
+ if indx is None:
+ continue
+ if isinstance(indx, np.ndarray) and indx.dtype == bool:
+ no_copy = False
+ if indx.ndim == 0:
+ raise IndexError
+ # boolean indices can have higher dimensions
+ ndim += indx.ndim
+ fancy_dim += indx.ndim
+ continue
+ if indx is Ellipsis:
+ if ellipsis_pos is None:
+ ellipsis_pos = i
+ continue # do not increment ndim counter
+ raise IndexError
+ if isinstance(indx, slice):
+ ndim += 1
+ continue
+ if not isinstance(indx, np.ndarray):
+ # This could be open for changes in numpy.
+ # numpy should maybe raise an error if casting to intp
+ # is not safe. It rejects np.array([1., 2.]) but not
+                # [1., 2.] as index (same for e.g. np.take).
+ # (Note the importance of empty lists if changing this here)
+ try:
+ indx = np.array(indx, dtype=np.intp)
+ except ValueError:
+ raise IndexError
+ in_indices[i] = indx
+ elif indx.dtype.kind != 'b' and indx.dtype.kind != 'i':
+ raise IndexError('arrays used as indices must be of '
+ 'integer (or boolean) type')
+ if indx.ndim != 0:
+ no_copy = False
+ ndim += 1
+ fancy_dim += 1
+
+ if arr.ndim - ndim < 0:
+            # We can't use more dimensions than we have, not even for 0-d
+            # arrays, since a[()] makes sense but a[(),] does not. We will
+            # raise an error later on, unless a broadcasting error occurs
+            # first.
+ raise IndexError
+
+ if ndim == 0 and None not in in_indices:
+            # We have no indices, or only an Ellipsis; this is legal.
+ return arr.copy(), no_copy
+
+ if ellipsis_pos is not None:
+ in_indices[ellipsis_pos:ellipsis_pos+1] = ([slice(None, None)] *
+ (arr.ndim - ndim))
+
+ for ax, indx in enumerate(in_indices):
+ if isinstance(indx, slice):
+ # convert to an index array
+ indx = np.arange(*indx.indices(arr.shape[ax]))
+ indices.append(['s', indx])
+ continue
+ elif indx is None:
+ # this is like taking a slice with one element from a new axis:
+ indices.append(['n', np.array([0], dtype=np.intp)])
+ arr = arr.reshape((arr.shape[:ax] + (1,) + arr.shape[ax:]))
+ continue
+ if isinstance(indx, np.ndarray) and indx.dtype == bool:
+ if indx.shape != arr.shape[ax:ax+indx.ndim]:
+ raise IndexError
+
+ try:
+ flat_indx = np.ravel_multi_index(np.nonzero(indx),
+ arr.shape[ax:ax+indx.ndim], mode='raise')
+ except Exception:
+ error_unless_broadcast_to_empty = True
+ # fill with 0s instead, and raise error later
+ flat_indx = np.array([0]*indx.sum(), dtype=np.intp)
+ # concatenate axis into a single one:
+ if indx.ndim != 0:
+ arr = arr.reshape((arr.shape[:ax]
+ + (np.prod(arr.shape[ax:ax+indx.ndim]),)
+ + arr.shape[ax+indx.ndim:]))
+ indx = flat_indx
+ else:
+ # This could be changed, a 0-d boolean index can
+ # make sense (even outside the 0-d indexed array case)
+                # Note that originally this could be interpreted as an
+                # integer in the full-integer special case.
+ raise IndexError
+ else:
+ # If the index is a singleton, the bounds check is done
+ # before the broadcasting. This used to be different in <1.9
+ if indx.ndim == 0:
+ if indx >= arr.shape[ax] or indx < -arr.shape[ax]:
+ raise IndexError
+ if indx.ndim == 0:
+                    # The index is a scalar. This check used to be twofold: if
+                    # fancy indexing was active, it was done later, possibly
+                    # after the index was broadcast away (1.7 or earlier).
+                    # Now it is always done here.
+ if indx >= arr.shape[ax] or indx < - arr.shape[ax]:
+ raise IndexError
+ if (len(indices) > 0 and
+ indices[-1][0] == 'f' and
+ ax != ellipsis_pos):
+ # NOTE: There could still have been a 0-sized Ellipsis
+ # between them. Checked that with ellipsis_pos.
+ indices[-1].append(indx)
+ else:
+ # We have a fancy index that is not after an existing one.
+ # NOTE: A 0-d array triggers this as well, while one may
+ # expect it to not trigger it, since a scalar would not be
+ # considered fancy indexing.
+ num_fancy += 1
+ indices.append(['f', indx])
+
+ if num_fancy > 1 and not no_copy:
+            # We have to move all fancy indices to the front (left)
+ new_indices = indices[:]
+ axes = list(range(arr.ndim))
+ fancy_axes = []
+ new_indices.insert(0, ['f'])
+ ni = 0
+ ai = 0
+ for indx in indices:
+ ni += 1
+ if indx[0] == 'f':
+ new_indices[0].extend(indx[1:])
+ del new_indices[ni]
+ ni -= 1
+ for ax in range(ai, ai + len(indx[1:])):
+ fancy_axes.append(ax)
+ axes.remove(ax)
+ ai += len(indx) - 1 # axis we are at
+ indices = new_indices
+ # and now we need to transpose arr:
+ arr = arr.transpose(*(fancy_axes + axes))
+
+ # We only have one 'f' index now and arr is transposed accordingly.
+ # Now handle newaxis by reshaping...
+ ax = 0
+ for indx in indices:
+ if indx[0] == 'f':
+ if len(indx) == 1:
+ continue
+ # First of all, reshape arr to combine fancy axes into one:
+ orig_shape = arr.shape
+ orig_slice = orig_shape[ax:ax + len(indx[1:])]
+ arr = arr.reshape((arr.shape[:ax]
+ + (np.prod(orig_slice).astype(int),)
+ + arr.shape[ax + len(indx[1:]):]))
+
+ # Check if broadcasting works
+ res = np.broadcast(*indx[1:])
+ # unfortunately the indices might be out of bounds. So check
+ # that first, and use mode='wrap' then. However only if
+ # there are any indices...
+ if res.size != 0:
+ if error_unless_broadcast_to_empty:
+ raise IndexError
+ for _indx, _size in zip(indx[1:], orig_slice):
+ if _indx.size == 0:
+ continue
+ if np.any(_indx >= _size) or np.any(_indx < -_size):
+ raise IndexError
+ if len(indx[1:]) == len(orig_slice):
+ if np.product(orig_slice) == 0:
+ # Work around for a crash or IndexError with 'wrap'
+ # in some 0-sized cases.
+ try:
+ mi = np.ravel_multi_index(indx[1:], orig_slice,
+ mode='raise')
+ except Exception:
+ # This happens with 0-sized orig_slice (sometimes?)
+ # here it is a ValueError, but indexing gives a:
+ raise IndexError('invalid index into 0-sized')
+ else:
+ mi = np.ravel_multi_index(indx[1:], orig_slice,
+ mode='wrap')
+ else:
+ # Maybe never happens...
+ raise ValueError
+ arr = arr.take(mi.ravel(), axis=ax)
+ try:
+ arr = arr.reshape((arr.shape[:ax]
+ + mi.shape
+ + arr.shape[ax+1:]))
+ except ValueError:
+ # too many dimensions, probably
+ raise IndexError
+ ax += mi.ndim
+ continue
+
+ # If we are here, we have a 1D array for take:
+ arr = arr.take(indx[1], axis=ax)
+ ax += 1
+
+ return arr, no_copy
+
+ def _check_multi_index(self, arr, index):
+ """Check a multi index item getting and simple setting.
+
+ Parameters
+ ----------
+ arr : ndarray
+ Array to be indexed, must be a reshaped arange.
+ index : tuple of indexing objects
+ Index being tested.
+ """
+ # Test item getting
+ try:
+ mimic_get, no_copy = self._get_multi_index(arr, index)
+ except Exception as e:
+ if HAS_REFCOUNT:
+ prev_refcount = sys.getrefcount(arr)
+ assert_raises(type(e), arr.__getitem__, index)
+ assert_raises(type(e), arr.__setitem__, index, 0)
+ if HAS_REFCOUNT:
+ assert_equal(prev_refcount, sys.getrefcount(arr))
+ return
+
+ self._compare_index_result(arr, index, mimic_get, no_copy)
+
+ def _check_single_index(self, arr, index):
+ """Check a single index item getting and simple setting.
+
+ Parameters
+ ----------
+ arr : ndarray
+ Array to be indexed, must be an arange.
+ index : indexing object
+ Index being tested. Must be a single index and not a tuple
+ of indexing objects (see also `_check_multi_index`).
+ """
+ try:
+ mimic_get, no_copy = self._get_multi_index(arr, (index,))
+ except Exception as e:
+ if HAS_REFCOUNT:
+ prev_refcount = sys.getrefcount(arr)
+ assert_raises(type(e), arr.__getitem__, index)
+ assert_raises(type(e), arr.__setitem__, index, 0)
+ if HAS_REFCOUNT:
+ assert_equal(prev_refcount, sys.getrefcount(arr))
+ return
+
+ self._compare_index_result(arr, index, mimic_get, no_copy)
+
+ def _compare_index_result(self, arr, index, mimic_get, no_copy):
+ """Compare mimicked result to indexing result.
+ """
+ arr = arr.copy()
+ indexed_arr = arr[index]
+ assert_array_equal(indexed_arr, mimic_get)
+        # Check if we got a view, unless it's a 0-sized or 0-d array
+        # (then it's not a view, and that does not matter).
+ if indexed_arr.size != 0 and indexed_arr.ndim != 0:
+ assert_(np.may_share_memory(indexed_arr, arr) == no_copy)
+ # Check reference count of the original array
+ if HAS_REFCOUNT:
+ if no_copy:
+ # refcount increases by one:
+ assert_equal(sys.getrefcount(arr), 3)
+ else:
+ assert_equal(sys.getrefcount(arr), 2)
+
+ # Test non-broadcast setitem:
+ b = arr.copy()
+ b[index] = mimic_get + 1000
+ if b.size == 0:
+ return # nothing to compare here...
+ if no_copy and indexed_arr.ndim != 0:
+ # change indexed_arr in-place to manipulate original:
+ indexed_arr += 1000
+ assert_array_equal(arr, b)
+ return
+ # Use the fact that the array is originally an arange:
+ arr.flat[indexed_arr.ravel()] += 1000
+ assert_array_equal(arr, b)
+
+ def test_boolean(self):
+ a = np.array(5)
+ assert_equal(a[np.array(True)], 5)
+ a[np.array(True)] = 1
+ assert_equal(a, 1)
+        # NOTE: This is different from normal broadcasting: arr[boolean_array]
+        # works like a multi-index, i.e. the boolean array is aligned to the
+        # left. This is probably correct for consistency with
+        # arr[boolean_array,]; also, no broadcasting is done at all.
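+        # Illustration (not part of the original comment): for ``self.a`` of
+        # shape (3, 1, 5, 6), a boolean index of shape (3, 1) must match and
+        # consume the first two axes exactly, just as in ``self.a[idx,]``;
+        # its shape is never broadcast against the array.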
+ self._check_multi_index(
+ self.a, (np.zeros_like(self.a, dtype=bool),))
+ self._check_multi_index(
+ self.a, (np.zeros_like(self.a, dtype=bool)[..., 0],))
+ self._check_multi_index(
+ self.a, (np.zeros_like(self.a, dtype=bool)[None, ...],))
+
+ def test_multidim(self):
+ # Automatically test combinations with complex indexes on 2nd (or 1st)
+ # spot and the simple ones in one other spot.
+ with warnings.catch_warnings():
+ # This is so that np.array(True) is not accepted in a full integer
+ # index, when running the file separately.
+ warnings.filterwarnings('error', '', DeprecationWarning)
+ warnings.filterwarnings('error', '', np.VisibleDeprecationWarning)
+
+ def isskip(idx):
+ return isinstance(idx, str) and idx == "skip"
+
+ for simple_pos in [0, 2, 3]:
+ tocheck = [self.fill_indices, self.complex_indices,
+ self.fill_indices, self.fill_indices]
+ tocheck[simple_pos] = self.simple_indices
+ for index in product(*tocheck):
+ index = tuple(i for i in index if not isskip(i))
+ self._check_multi_index(self.a, index)
+ self._check_multi_index(self.b, index)
+
+ # Check very simple item getting:
+ self._check_multi_index(self.a, (0, 0, 0, 0))
+ self._check_multi_index(self.b, (0, 0, 0, 0))
+ # Also check (simple cases of) too many indices:
+ assert_raises(IndexError, self.a.__getitem__, (0, 0, 0, 0, 0))
+ assert_raises(IndexError, self.a.__setitem__, (0, 0, 0, 0, 0), 0)
+ assert_raises(IndexError, self.a.__getitem__, (0, 0, [1], 0, 0))
+ assert_raises(IndexError, self.a.__setitem__, (0, 0, [1], 0, 0), 0)
+
+ def test_1d(self):
+ a = np.arange(10)
+ for index in self.complex_indices:
+ self._check_single_index(a, index)
+
+class TestFloatNonIntegerArgument(object):
+ """
+    These test that ``TypeError`` is raised when you try to use
+    non-integers as arguments for indexing and slicing, e.g. ``a[0.0:5]``
+    and ``a[0.5]``, or in other functions like ``array.reshape(1., -1)``.
+
+ """
+ def test_valid_indexing(self):
+ # These should raise no errors.
+ a = np.array([[[5]]])
+
+ a[np.array([0])]
+ a[[0, 0]]
+ a[:, [0, 0]]
+ a[:, 0,:]
+ a[:,:,:]
+
+ def test_valid_slicing(self):
+ # These should raise no errors.
+ a = np.array([[[5]]])
+
+ a[::]
+ a[0:]
+ a[:2]
+ a[0:2]
+ a[::2]
+ a[1::2]
+ a[:2:2]
+ a[1:2:2]
+
+ def test_non_integer_argument_errors(self):
+ a = np.array([[5]])
+
+ assert_raises(TypeError, np.reshape, a, (1., 1., -1))
+ assert_raises(TypeError, np.reshape, a, (np.array(1.), -1))
+ assert_raises(TypeError, np.take, a, [0], 1.)
+ assert_raises(TypeError, np.take, a, [0], np.float64(1.))
+
+ def test_non_integer_sequence_multiplication(self):
+ # NumPy scalar sequence multiply should not work with non-integers
+ def mult(a, b):
+ return a * b
+
+ assert_raises(TypeError, mult, [1], np.float_(3))
+ # following should be OK
+ mult([1], np.int_(3))
+
+ def test_reduce_axis_float_index(self):
+ d = np.zeros((3,3,3))
+ assert_raises(TypeError, np.min, d, 0.5)
+ assert_raises(TypeError, np.min, d, (0.5, 1))
+ assert_raises(TypeError, np.min, d, (1, 2.2))
+ assert_raises(TypeError, np.min, d, (.2, 1.2))
+
+
+class TestBooleanIndexing(object):
+    # Using a boolean as an integer argument or as an index is an error.
+ def test_bool_as_int_argument_errors(self):
+ a = np.array([[[1]]])
+
+ assert_raises(TypeError, np.reshape, a, (True, -1))
+ assert_raises(TypeError, np.reshape, a, (np.bool_(True), -1))
+ # Note that operator.index(np.array(True)) does not work, a boolean
+ # array is thus also deprecated, but not with the same message:
+ assert_raises(TypeError, operator.index, np.array(True))
+ assert_warns(DeprecationWarning, operator.index, np.True_)
+ assert_raises(TypeError, np.take, args=(a, [0], False))
+
+ def test_boolean_indexing_weirdness(self):
+ # Weird boolean indexing things
+ a = np.ones((2, 3, 4))
+        assert_equal(a[False, True, ...].shape, (0, 2, 3, 4))
+        assert_equal(a[True, [0, 1], True, True, [1], [[2]]].shape, (1, 2))
+ assert_raises(IndexError, lambda: a[False, [0, 1], ...])
+
+
+class TestArrayToIndexDeprecation(object):
+    """Creating an index from an array that is not 0-d is an error.
+
+ """
+ def test_array_to_index_error(self):
+ # so no exception is expected. The raising is effectively tested above.
+ a = np.array([[[1]]])
+
+ assert_raises(TypeError, operator.index, np.array([1]))
+ assert_raises(TypeError, np.reshape, a, (a, -1))
+ assert_raises(TypeError, np.take, a, [0], a)
+
+
+class TestNonIntegerArrayLike(object):
+    """Tests that array-likes are only valid indices if they can be safely
+    cast to an integer.
+
+ For instance, lists give IndexError when they cannot be safely cast to
+ an integer.
+
+ """
+ def test_basic(self):
+ a = np.arange(10)
+
+ assert_raises(IndexError, a.__getitem__, [0.5, 1.5])
+ assert_raises(IndexError, a.__getitem__, (['1', '2'],))
+
+ # The following is valid
+ a.__getitem__([])
+
+
+class TestMultipleEllipsisError(object):
+ """An index can only have a single ellipsis.
+
+ """
+ def test_basic(self):
+ a = np.arange(10)
+ assert_raises(IndexError, lambda: a[..., ...])
+ assert_raises(IndexError, a.__getitem__, ((Ellipsis,) * 2,))
+ assert_raises(IndexError, a.__getitem__, ((Ellipsis,) * 3,))
+
+
+class TestCApiAccess(object):
+ def test_getitem(self):
+ subscript = functools.partial(array_indexing, 0)
+
+ # 0-d arrays don't work:
+ assert_raises(IndexError, subscript, np.ones(()), 0)
+ # Out of bound values:
+ assert_raises(IndexError, subscript, np.ones(10), 11)
+ assert_raises(IndexError, subscript, np.ones(10), -11)
+ assert_raises(IndexError, subscript, np.ones((10, 10)), 11)
+ assert_raises(IndexError, subscript, np.ones((10, 10)), -11)
+
+ a = np.arange(10)
+ assert_array_equal(a[4], subscript(a, 4))
+ a = a.reshape(5, 2)
+ assert_array_equal(a[-4], subscript(a, -4))
+
+ def test_setitem(self):
+ assign = functools.partial(array_indexing, 1)
+
+ # Deletion is impossible:
+ assert_raises(ValueError, assign, np.ones(10), 0)
+ # 0-d arrays don't work:
+ assert_raises(IndexError, assign, np.ones(()), 0, 0)
+ # Out of bound values:
+ assert_raises(IndexError, assign, np.ones(10), 11, 0)
+ assert_raises(IndexError, assign, np.ones(10), -11, 0)
+ assert_raises(IndexError, assign, np.ones((10, 10)), 11, 0)
+ assert_raises(IndexError, assign, np.ones((10, 10)), -11, 0)
+
+ a = np.arange(10)
+ assign(a, 4, 10)
+ assert_(a[4] == 10)
+
+ a = a.reshape(5, 2)
+ assign(a, 4, 10)
+ assert_array_equal(a[-1], [10, 10])
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_item_selection.py b/contrib/python/numpy/py2/numpy/core/tests/test_item_selection.py
new file mode 100644
index 00000000000..3bc24fc9561
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_item_selection.py
@@ -0,0 +1,87 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_raises, assert_array_equal, HAS_REFCOUNT
+ )
+
+
+class TestTake(object):
+ def test_simple(self):
+ a = [[1, 2], [3, 4]]
+ a_str = [[b'1', b'2'], [b'3', b'4']]
+ modes = ['raise', 'wrap', 'clip']
+ indices = [-1, 4]
+ index_arrays = [np.empty(0, dtype=np.intp),
+ np.empty(tuple(), dtype=np.intp),
+ np.empty((1, 1), dtype=np.intp)]
+ real_indices = {'raise': {-1: 1, 4: IndexError},
+ 'wrap': {-1: 1, 4: 0},
+ 'clip': {-1: 0, 4: 1}}
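+        # Note on the table above (added for clarity): taking along axis 1 of
+        # a 2-column array, mode='wrap' maps -1 -> 1 and 4 -> 0, mode='clip'
+        # maps -1 -> 0 and 4 -> 1, and mode='raise' only accepts -1 (-> 1).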
+        # Currently all types except object use the same generated functions,
+        # so it should not be necessary to test them all. However, also test
+        # a non-refcounted struct in addition to object.
+ types = int, object, np.dtype([('', 'i', 2)])
+ for t in types:
+ # ta works, even if the array may be odd if buffer interface is used
+ ta = np.array(a if np.issubdtype(t, np.number) else a_str, dtype=t)
+ tresult = list(ta.T.copy())
+ for index_array in index_arrays:
+ if index_array.size != 0:
+ tresult[0].shape = (2,) + index_array.shape
+ tresult[1].shape = (2,) + index_array.shape
+ for mode in modes:
+ for index in indices:
+ real_index = real_indices[mode][index]
+ if real_index is IndexError and index_array.size != 0:
+ index_array.put(0, index)
+ assert_raises(IndexError, ta.take, index_array,
+ mode=mode, axis=1)
+ elif index_array.size != 0:
+ index_array.put(0, index)
+ res = ta.take(index_array, mode=mode, axis=1)
+ assert_array_equal(res, tresult[real_index])
+ else:
+ res = ta.take(index_array, mode=mode, axis=1)
+ assert_(res.shape == (2,) + index_array.shape)
+
+ def test_refcounting(self):
+ objects = [object() for i in range(10)]
+ for mode in ('raise', 'clip', 'wrap'):
+ a = np.array(objects)
+ b = np.array([2, 2, 4, 5, 3, 5])
+ a.take(b, out=a[:6], mode=mode)
+ del a
+ if HAS_REFCOUNT:
+ assert_(all(sys.getrefcount(o) == 3 for o in objects))
+ # not contiguous, example:
+ a = np.array(objects * 2)[::2]
+ a.take(b, out=a[:6], mode=mode)
+ del a
+ if HAS_REFCOUNT:
+ assert_(all(sys.getrefcount(o) == 3 for o in objects))
+
+ def test_unicode_mode(self):
+ d = np.arange(10)
+ k = b'\xc3\xa4'.decode("UTF8")
+ assert_raises(ValueError, d.take, 5, mode=k)
+
+ def test_empty_partition(self):
+ # In reference to github issue #6530
+ a_original = np.array([0, 2, 4, 6, 8, 10])
+ a = a_original.copy()
+
+ # An empty partition should be a successful no-op
+ a.partition(np.array([], dtype=np.int16))
+
+ assert_array_equal(a, a_original)
+
+ def test_empty_argpartition(self):
+ # In reference to github issue #6530
+ a = np.array([0, 2, 4, 6, 8, 10])
+ a = a.argpartition(np.array([], dtype=np.int16))
+
+ b = np.array([0, 1, 2, 3, 4, 5])
+ assert_array_equal(a, b)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_longdouble.py b/contrib/python/numpy/py2/numpy/core/tests/test_longdouble.py
new file mode 100644
index 00000000000..ee4197f8f7a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_longdouble.py
@@ -0,0 +1,233 @@
+from __future__ import division, absolute_import, print_function
+
+import warnings
+import pytest
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_array_equal, temppath,
+ )
+from numpy.core.tests._locales import CommaDecimalPointLocale
+
+LD_INFO = np.finfo(np.longdouble)
+longdouble_longer_than_double = (LD_INFO.eps < np.finfo(np.double).eps)
+
+
+_o = 1 + LD_INFO.eps
+string_to_longdouble_inaccurate = (_o != np.longdouble(repr(_o)))
+del _o
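+# If repr(1 + eps) does not round-trip above, string -> long double conversion
+# on this platform effectively goes through double precision (no usable
+# strtold_l); the "Need strtold_l" skips below rely on this probe.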
+
+
+def test_scalar_extraction():
+ """Confirm that extracting a value doesn't convert to python float"""
+ o = 1 + LD_INFO.eps
+ a = np.array([o, o, o])
+ assert_equal(a[1], o)
+
+
+# Conversions string -> long double
+
+# 0.1 not exactly representable in base 2 floating point.
+repr_precision = len(repr(np.longdouble(0.1)))
+# +2 from macro block starting around line 842 in scalartypes.c.src.
[email protected](LD_INFO.precision + 2 >= repr_precision,
+ reason="repr precision not enough to show eps")
+def test_repr_roundtrip():
+ # We will only see eps in repr if within printing precision.
+ o = 1 + LD_INFO.eps
+ assert_equal(np.longdouble(repr(o)), o, "repr was %s" % repr(o))
+
+
+def test_unicode():
+ np.longdouble(u"1.2")
+
+
+def test_string():
+ np.longdouble("1.2")
+
+
+def test_bytes():
+ np.longdouble(b"1.2")
+
+
[email protected](string_to_longdouble_inaccurate, reason="Need strtold_l")
+def test_repr_roundtrip_bytes():
+ o = 1 + LD_INFO.eps
+ assert_equal(np.longdouble(repr(o).encode("ascii")), o)
+
+
+def test_bogus_string():
+ assert_raises(ValueError, np.longdouble, "spam")
+ assert_raises(ValueError, np.longdouble, "1.0 flub")
+
+
[email protected](string_to_longdouble_inaccurate, reason="Need strtold_l")
+def test_fromstring():
+ o = 1 + LD_INFO.eps
+ s = (" " + repr(o))*5
+ a = np.array([o]*5)
+ assert_equal(np.fromstring(s, sep=" ", dtype=np.longdouble), a,
+ err_msg="reading '%s'" % s)
+
+
+def test_fromstring_bogus():
+ assert_equal(np.fromstring("1. 2. 3. flop 4.", dtype=float, sep=" "),
+ np.array([1., 2., 3.]))
+
+
+def test_fromstring_empty():
+ assert_equal(np.fromstring("xxxxx", sep="x"),
+ np.array([]))
+
+
+def test_fromstring_missing():
+ assert_equal(np.fromstring("1xx3x4x5x6", sep="x"),
+ np.array([1]))
+
+
+class TestFileBased(object):
+
+ ldbl = 1 + LD_INFO.eps
+ tgt = np.array([ldbl]*5)
+ out = ''.join([repr(t) + '\n' for t in tgt])
+
+ def test_fromfile_bogus(self):
+ with temppath() as path:
+ with open(path, 'wt') as f:
+ f.write("1. 2. 3. flop 4.\n")
+ res = np.fromfile(path, dtype=float, sep=" ")
+ assert_equal(res, np.array([1., 2., 3.]))
+
+ @pytest.mark.skipif(string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+ def test_fromfile(self):
+ with temppath() as path:
+ with open(path, 'wt') as f:
+ f.write(self.out)
+ res = np.fromfile(path, dtype=np.longdouble, sep="\n")
+ assert_equal(res, self.tgt)
+
+ @pytest.mark.skipif(string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+ def test_genfromtxt(self):
+ with temppath() as path:
+ with open(path, 'wt') as f:
+ f.write(self.out)
+ res = np.genfromtxt(path, dtype=np.longdouble)
+ assert_equal(res, self.tgt)
+
+ @pytest.mark.skipif(string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+ def test_loadtxt(self):
+ with temppath() as path:
+ with open(path, 'wt') as f:
+ f.write(self.out)
+ res = np.loadtxt(path, dtype=np.longdouble)
+ assert_equal(res, self.tgt)
+
+ @pytest.mark.skipif(string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+ def test_tofile_roundtrip(self):
+ with temppath() as path:
+ self.tgt.tofile(path, sep=" ")
+ res = np.fromfile(path, dtype=np.longdouble, sep=" ")
+ assert_equal(res, self.tgt)
+
+
+# Conversions long double -> string
+
+
+def test_repr_exact():
+ o = 1 + LD_INFO.eps
+ assert_(repr(o) != '1')
+
+
[email protected](longdouble_longer_than_double, reason="BUG #2376")
[email protected](string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+def test_format():
+ o = 1 + LD_INFO.eps
+ assert_("{0:.40g}".format(o) != '1')
+
+
[email protected](longdouble_longer_than_double, reason="BUG #2376")
[email protected](string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+def test_percent():
+ o = 1 + LD_INFO.eps
+ assert_("%.40g" % o != '1')
+
+
[email protected](longdouble_longer_than_double,
+ reason="array repr problem")
[email protected](string_to_longdouble_inaccurate,
+ reason="Need strtold_l")
+def test_array_repr():
+ o = 1 + LD_INFO.eps
+ a = np.array([o])
+ b = np.array([1], dtype=np.longdouble)
+ if not np.all(a != b):
+ raise ValueError("precision loss creating arrays")
+ assert_(repr(a) != repr(b))
+
+#
+# Locale tests: scalar types formatting should be independent of the locale
+#
+
+class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
+
+ def test_repr_roundtrip_foreign(self):
+ o = 1.5
+ assert_equal(o, np.longdouble(repr(o)))
+
+ def test_fromstring_foreign_repr(self):
+ f = 1.234
+ a = np.fromstring(repr(f), dtype=float, sep=" ")
+ assert_equal(a[0], f)
+
+ def test_fromstring_best_effort_float(self):
+ assert_equal(np.fromstring("1,234", dtype=float, sep=" "),
+ np.array([1.]))
+
+ def test_fromstring_best_effort(self):
+ assert_equal(np.fromstring("1,234", dtype=np.longdouble, sep=" "),
+ np.array([1.]))
+
+ def test_fromstring_foreign(self):
+ s = "1.234"
+ a = np.fromstring(s, dtype=np.longdouble, sep=" ")
+ assert_equal(a[0], np.longdouble(s))
+
+ def test_fromstring_foreign_sep(self):
+ a = np.array([1, 2, 3, 4])
+ b = np.fromstring("1,2,3,4,", dtype=np.longdouble, sep=",")
+ assert_array_equal(a, b)
+
+ def test_fromstring_foreign_value(self):
+ b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
+ assert_array_equal(b[0], 1)
+
[email protected]("int_val", [
+ # cases discussed in gh-10723
+ # and gh-9968
+ 2 ** 1024, 0])
+def test_longdouble_from_int(int_val):
+ # for issue gh-9968
+ str_val = str(int_val)
+ # we'll expect a RuntimeWarning on platforms
+ # with np.longdouble equivalent to np.double
+ # for large integer input
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ # can be inf==inf on some platforms
+ assert np.longdouble(int_val) == np.longdouble(str_val)
+ # we can't directly compare the int and
+ # max longdouble value on all platforms
+ if np.allclose(np.finfo(np.longdouble).max,
+ np.finfo(np.double).max) and w:
+ assert w[0].category is RuntimeWarning
+
[email protected]("bool_val", [
+ True, False])
+def test_longdouble_from_bool(bool_val):
+ assert np.longdouble(bool_val) == np.longdouble(int(bool_val))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_machar.py b/contrib/python/numpy/py2/numpy/core/tests/test_machar.py
new file mode 100644
index 00000000000..ab8800c09d3
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_machar.py
@@ -0,0 +1,32 @@
+"""
+Test machar. Given recent changes to hardcode type data, we might want to get
+rid of both MachAr and this test at some point.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from numpy.core.machar import MachAr
+import numpy.core.numerictypes as ntypes
+from numpy import errstate, array
+
+
+class TestMachAr(object):
+ def _run_machar_highprec(self):
+ # Instantiate MachAr instance with high enough precision to cause
+ # underflow
+ try:
+ hiprec = ntypes.float96
+ MachAr(lambda v:array([v], hiprec))
+ except AttributeError:
+ # Fixme, this needs to raise a 'skip' exception.
+ "Skipping test: no ntypes.float96 available on this platform."
+
+ def test_underlow(self):
+ # Regression test for #759:
+ # instantiating MachAr for dtype = np.float96 raises spurious warning.
+ with errstate(all='raise'):
+ try:
+ self._run_machar_highprec()
+ except FloatingPointError as e:
+ msg = "Caught %s exception, should not have been raised." % e
+ raise AssertionError(msg)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_mem_overlap.py b/contrib/python/numpy/py2/numpy/core/tests/test_mem_overlap.py
new file mode 100644
index 00000000000..3c8e0e7220d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_mem_overlap.py
@@ -0,0 +1,950 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import itertools
+import pytest
+
+import numpy as np
+from numpy.core._multiarray_tests import solve_diophantine, internal_overlap
+from numpy.core import _umath_tests
+from numpy.lib.stride_tricks import as_strided
+from numpy.compat import long
+from numpy.testing import (
+ assert_, assert_raises, assert_equal, assert_array_equal
+ )
+
+if sys.version_info[0] >= 3:
+ xrange = range
+
+
+ndims = 2
+size = 10
+shape = tuple([size] * ndims)
+
+MAY_SHARE_BOUNDS = 0
+MAY_SHARE_EXACT = -1
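+# These values are passed below as ``max_work`` to ``np.may_share_memory`` /
+# ``np.shares_memory``: 0 requests only the cheap bounds check, while -1
+# requests the exact overlap computation (mirroring np.MAY_SHARE_BOUNDS and
+# np.MAY_SHARE_EXACT).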
+
+
+def _indices_for_nelems(nelems):
+    """Return slices selecting nelems elements, in both directions and with
+    steps 1 and 2 (or a single int index when nelems == 0)."""
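+    # For illustration (assuming the module-level size == 10),
+    # _indices_for_nelems(2) should yield
+    # [slice(6, 4, -1), slice(4, 6, 1), slice(7, 3, -2), slice(3, 7, 2)],
+    # i.e. two-element selections in both directions with steps 1 and 2.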
+
+ if nelems == 0:
+ return [size // 2] # int index
+
+ res = []
+ for step in (1, 2):
+ for sign in (-1, 1):
+ start = size // 2 - nelems * step * sign // 2
+ stop = start + nelems * step * sign
+ res.append(slice(start, stop, step * sign))
+
+ return res
+
+
+def _indices_for_axis():
+ """Returns (src, dst) pairs of indices."""
+
+ res = []
+ for nelems in (0, 2, 3):
+ ind = _indices_for_nelems(nelems)
+
+ # no itertools.product available in Py2.4
+ res.extend([(a, b) for a in ind for b in ind]) # all assignments of size "nelems"
+
+ return res
+
+
+def _indices(ndims):
+ """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... ) index pairs."""
+
+ ind = _indices_for_axis()
+
+ # no itertools.product available in Py2.4
+
+ res = [[]]
+ for i in range(ndims):
+ newres = []
+ for elem in ind:
+ for others in res:
+ newres.append([elem] + others)
+ res = newres
+
+ return res
+
+
+def _check_assignment(srcidx, dstidx):
+ """Check assignment arr[dstidx] = arr[srcidx] works."""
+
+ arr = np.arange(np.product(shape)).reshape(shape)
+
+ cpy = arr.copy()
+
+ cpy[dstidx] = arr[srcidx]
+ arr[dstidx] = arr[srcidx]
+
+ assert_(np.all(arr == cpy),
+ 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx))
+
+
+def test_overlapping_assignments():
+ # Test automatically generated assignments which overlap in memory.
+
+ inds = _indices(ndims)
+
+ for ind in inds:
+ srcidx = tuple([a[0] for a in ind])
+ dstidx = tuple([a[1] for a in ind])
+
+ _check_assignment(srcidx, dstidx)
+
+
+def test_diophantine_fuzz():
+ # Fuzz test the diophantine solver
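+    # solve_diophantine(A, U, b) asks whether integers 0 <= x_i <= U_i exist
+    # with sum(A_i * x_i) == b; the feasibility/validity checks below verify
+    # exactly that property.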
+ rng = np.random.RandomState(1234)
+
+ max_int = np.iinfo(np.intp).max
+
+ for ndim in range(10):
+ feasible_count = 0
+ infeasible_count = 0
+
+ min_count = 500//(ndim + 1)
+
+ while min(feasible_count, infeasible_count) < min_count:
+ # Ensure big and small integer problems
+ A_max = 1 + rng.randint(0, 11, dtype=np.intp)**6
+ U_max = rng.randint(0, 11, dtype=np.intp)**6
+
+ A_max = min(max_int, A_max)
+ U_max = min(max_int-1, U_max)
+
+ A = tuple(int(rng.randint(1, A_max+1, dtype=np.intp))
+ for j in range(ndim))
+ U = tuple(int(rng.randint(0, U_max+2, dtype=np.intp))
+ for j in range(ndim))
+
+ b_ub = min(max_int-2, sum(a*ub for a, ub in zip(A, U)))
+ b = rng.randint(-1, b_ub+2, dtype=np.intp)
+
+ if ndim == 0 and feasible_count < min_count:
+ b = 0
+
+ X = solve_diophantine(A, U, b)
+
+ if X is None:
+ # Check the simplified decision problem agrees
+ X_simplified = solve_diophantine(A, U, b, simplify=1)
+ assert_(X_simplified is None, (A, U, b, X_simplified))
+
+ # Check no solution exists (provided the problem is
+ # small enough so that brute force checking doesn't
+ # take too long)
+ try:
+ ranges = tuple(xrange(0, a*ub+1, a) for a, ub in zip(A, U))
+ except OverflowError:
+ # xrange on 32-bit Python 2 may overflow
+ continue
+
+ size = 1
+ for r in ranges:
+ size *= len(r)
+ if size < 100000:
+ assert_(not any(sum(w) == b for w in itertools.product(*ranges)))
+ infeasible_count += 1
+ else:
+ # Check the simplified decision problem agrees
+ X_simplified = solve_diophantine(A, U, b, simplify=1)
+ assert_(X_simplified is not None, (A, U, b, X_simplified))
+
+ # Check validity
+ assert_(sum(a*x for a, x in zip(A, X)) == b)
+ assert_(all(0 <= x <= ub for x, ub in zip(X, U)))
+ feasible_count += 1
+
+
+def test_diophantine_overflow():
+ # Smoke test integer overflow detection
+ max_intp = np.iinfo(np.intp).max
+ max_int64 = np.iinfo(np.int64).max
+
+ if max_int64 <= max_intp:
+ # Check that the algorithm works internally in 128-bit;
+ # solving this problem requires large intermediate numbers
+ A = (max_int64//2, max_int64//2 - 10)
+ U = (max_int64//2, max_int64//2 - 10)
+ b = 2*(max_int64//2) - 10
+
+ assert_equal(solve_diophantine(A, U, b), (1, 1))
+
+
+def check_may_share_memory_exact(a, b):
+ got = np.may_share_memory(a, b, max_work=MAY_SHARE_EXACT)
+
+ assert_equal(np.may_share_memory(a, b),
+ np.may_share_memory(a, b, max_work=MAY_SHARE_BOUNDS))
+
+ a.fill(0)
+ b.fill(0)
+ a.fill(1)
+ exact = b.any()
+
+ err_msg = ""
+ if got != exact:
+ err_msg = " " + "\n ".join([
+ "base_a - base_b = %r" % (a.__array_interface__['data'][0] - b.__array_interface__['data'][0],),
+ "shape_a = %r" % (a.shape,),
+ "shape_b = %r" % (b.shape,),
+ "strides_a = %r" % (a.strides,),
+ "strides_b = %r" % (b.strides,),
+ "size_a = %r" % (a.size,),
+ "size_b = %r" % (b.size,)
+ ])
+
+ assert_equal(got, exact, err_msg=err_msg)
+
+
+def test_may_share_memory_manual():
+ # Manual test cases for may_share_memory
+
+ # Base arrays
+ xs0 = [
+ np.zeros([13, 21, 23, 22], dtype=np.int8),
+ np.zeros([13, 21, 23*2, 22], dtype=np.int8)[:,:,::2,:]
+ ]
+
+ # Generate all negative stride combinations
+ xs = []
+ for x in xs0:
+ for ss in itertools.product(*(([slice(None), slice(None, None, -1)],)*4)):
+ xp = x[ss]
+ xs.append(xp)
+
+ for x in xs:
+ # The default is a simple extent check
+ assert_(np.may_share_memory(x[:,0,:], x[:,1,:]))
+ assert_(np.may_share_memory(x[:,0,:], x[:,1,:], max_work=None))
+
+ # Exact checks
+ check_may_share_memory_exact(x[:,0,:], x[:,1,:])
+ check_may_share_memory_exact(x[:,::7], x[:,3::3])
+
+ try:
+ xp = x.ravel()
+ if xp.flags.owndata:
+ continue
+ xp = xp.view(np.int16)
+ except ValueError:
+ continue
+
+ # 0-size arrays cannot overlap
+ check_may_share_memory_exact(x.ravel()[6:6],
+ xp.reshape(13, 21, 23, 11)[:,::7])
+
+ # Test itemsize is dealt with
+ check_may_share_memory_exact(x[:,::7],
+ xp.reshape(13, 21, 23, 11))
+ check_may_share_memory_exact(x[:,::7],
+ xp.reshape(13, 21, 23, 11)[:,3::3])
+ check_may_share_memory_exact(x.ravel()[6:7],
+ xp.reshape(13, 21, 23, 11)[:,::7])
+
+ # Check unit size
+ x = np.zeros([1], dtype=np.int8)
+ check_may_share_memory_exact(x, x)
+ check_may_share_memory_exact(x, x.copy())
+
+
+def iter_random_view_pairs(x, same_steps=True, equal_size=False):
+ rng = np.random.RandomState(1234)
+
+ if equal_size and same_steps:
+ raise ValueError()
+
+ def random_slice(n, step):
+ start = rng.randint(0, n+1, dtype=np.intp)
+ stop = rng.randint(start, n+1, dtype=np.intp)
+ if rng.randint(0, 2, dtype=np.intp) == 0:
+ stop, start = start, stop
+ step *= -1
+ return slice(start, stop, step)
+
+ def random_slice_fixed_size(n, step, size):
+ start = rng.randint(0, n+1 - size*step)
+ stop = start + (size-1)*step + 1
+ if rng.randint(0, 2) == 0:
+ stop, start = start-1, stop-1
+ if stop < 0:
+ stop = None
+ step *= -1
+ return slice(start, stop, step)
+
+ # First a few regular views
+ yield x, x
+ for j in range(1, 7, 3):
+ yield x[j:], x[:-j]
+ yield x[...,j:], x[...,:-j]
+
+ # An array with zero stride internal overlap
+ strides = list(x.strides)
+ strides[0] = 0
+ xp = as_strided(x, shape=x.shape, strides=strides)
+ yield x, xp
+ yield xp, xp
+
+ # An array with non-zero stride internal overlap
+ strides = list(x.strides)
+ if strides[0] > 1:
+ strides[0] = 1
+ xp = as_strided(x, shape=x.shape, strides=strides)
+ yield x, xp
+ yield xp, xp
+
+ # Then discontiguous views
+ while True:
+ steps = tuple(rng.randint(1, 11, dtype=np.intp)
+ if rng.randint(0, 5, dtype=np.intp) == 0 else 1
+ for j in range(x.ndim))
+ s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+
+ t1 = np.arange(x.ndim)
+ rng.shuffle(t1)
+
+ if equal_size:
+ t2 = t1
+ else:
+ t2 = np.arange(x.ndim)
+ rng.shuffle(t2)
+
+ a = x[s1]
+
+ if equal_size:
+ if a.size == 0:
+ continue
+
+ steps2 = tuple(rng.randint(1, max(2, p//(1+pa)))
+ if rng.randint(0, 5) == 0 else 1
+ for p, s, pa in zip(x.shape, s1, a.shape))
+ s2 = tuple(random_slice_fixed_size(p, s, pa)
+ for p, s, pa in zip(x.shape, steps2, a.shape))
+ elif same_steps:
+ steps2 = steps
+ else:
+ steps2 = tuple(rng.randint(1, 11, dtype=np.intp)
+ if rng.randint(0, 5, dtype=np.intp) == 0 else 1
+ for j in range(x.ndim))
+
+ if not equal_size:
+ s2 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps2))
+
+ a = a.transpose(t1)
+ b = x[s2].transpose(t2)
+
+ yield a, b
+
+
+def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
+ # Check that overlap problems with common strides are solved with
+ # little work.
+ x = np.zeros([17,34,71,97], dtype=np.int16)
+
+ feasible = 0
+ infeasible = 0
+
+ pair_iter = iter_random_view_pairs(x, same_steps)
+
+ while min(feasible, infeasible) < min_count:
+ a, b = next(pair_iter)
+
+ bounds_overlap = np.may_share_memory(a, b)
+ may_share_answer = np.may_share_memory(a, b)
+ easy_answer = np.may_share_memory(a, b, max_work=get_max_work(a, b))
+ exact_answer = np.may_share_memory(a, b, max_work=MAY_SHARE_EXACT)
+
+ if easy_answer != exact_answer:
+ # assert_equal is slow...
+ assert_equal(easy_answer, exact_answer)
+
+ if may_share_answer != bounds_overlap:
+ assert_equal(may_share_answer, bounds_overlap)
+
+ if bounds_overlap:
+ if exact_answer:
+ feasible += 1
+ else:
+ infeasible += 1
+
+
+def test_may_share_memory_easy_fuzz():
+ # Check that overlap problems with common strides are always
+ # solved with little work.
+
+ check_may_share_memory_easy_fuzz(get_max_work=lambda a, b: 1,
+ same_steps=True,
+ min_count=2000)
+
+
+def test_may_share_memory_harder_fuzz():
+ # Overlap problems with not necessarily common strides take more
+ # work.
+ #
+ # The work bound below can't be reduced much. Harder problems can
+ # also exist but not be detected here, as the set of problems
+ # comes from RNG.
+
+ check_may_share_memory_easy_fuzz(get_max_work=lambda a, b: max(a.size, b.size)//2,
+ same_steps=False,
+ min_count=2000)
+
+
+def test_shares_memory_api():
+ x = np.zeros([4, 5, 6], dtype=np.int8)
+
+ assert_equal(np.shares_memory(x, x), True)
+ assert_equal(np.shares_memory(x, x.copy()), False)
+
+ a = x[:,::2,::3]
+ b = x[:,::3,::2]
+ assert_equal(np.shares_memory(a, b), True)
+ assert_equal(np.shares_memory(a, b, max_work=None), True)
+ assert_raises(np.TooHardError, np.shares_memory, a, b, max_work=1)
+ assert_raises(np.TooHardError, np.shares_memory, a, b, max_work=long(1))
+
+
+def test_may_share_memory_bad_max_work():
+ x = np.zeros([1])
+ assert_raises(OverflowError, np.may_share_memory, x, x, max_work=10**100)
+ assert_raises(OverflowError, np.shares_memory, x, x, max_work=10**100)
+
+
+def test_internal_overlap_diophantine():
+ def check(A, U, exists=None):
+ X = solve_diophantine(A, U, 0, require_ub_nontrivial=1)
+
+ if exists is None:
+ exists = (X is not None)
+
+ if X is not None:
+ assert_(sum(a*x for a, x in zip(A, X)) == sum(a*u//2 for a, u in zip(A, U)))
+ assert_(all(0 <= x <= u for x, u in zip(X, U)))
+ assert_(any(x != u//2 for x, u in zip(X, U)))
+
+ if exists:
+ assert_(X is not None, repr(X))
+ else:
+ assert_(X is None, repr(X))
+
+ # Smoke tests
+ check((3, 2), (2*2, 3*2), exists=True)
+ check((3*2, 2), (15*2, (3-1)*2), exists=False)
+
+
+def test_internal_overlap_slices():
+ # Slicing an array never generates internal overlap
+
+ x = np.zeros([17,34,71,97], dtype=np.int16)
+
+ rng = np.random.RandomState(1234)
+
+ def random_slice(n, step):
+ start = rng.randint(0, n+1, dtype=np.intp)
+ stop = rng.randint(start, n+1, dtype=np.intp)
+ if rng.randint(0, 2, dtype=np.intp) == 0:
+ stop, start = start, stop
+ step *= -1
+ return slice(start, stop, step)
+
+ cases = 0
+ min_count = 5000
+
+ while cases < min_count:
+ steps = tuple(rng.randint(1, 11, dtype=np.intp)
+ if rng.randint(0, 5, dtype=np.intp) == 0 else 1
+ for j in range(x.ndim))
+ t1 = np.arange(x.ndim)
+ rng.shuffle(t1)
+ s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+ a = x[s1].transpose(t1)
+
+ assert_(not internal_overlap(a))
+ cases += 1
+
+
+def check_internal_overlap(a, manual_expected=None):
+ got = internal_overlap(a)
+
+ # Brute-force check
+ m = set()
+ ranges = tuple(xrange(n) for n in a.shape)
+ for v in itertools.product(*ranges):
+ offset = sum(s*w for s, w in zip(a.strides, v))
+ if offset in m:
+ expected = True
+ break
+ else:
+ m.add(offset)
+ else:
+ expected = False
+
+ # Compare
+ if got != expected:
+ assert_equal(got, expected, err_msg=repr((a.strides, a.shape)))
+ if manual_expected is not None and expected != manual_expected:
+ assert_equal(expected, manual_expected)
+ return got
+
+
+def test_internal_overlap_manual():
+ # Stride tricks can construct arrays with internal overlap
+
+    # We don't care about memory bounds; the array is never actually
+    # read or written.
+ x = np.arange(1).astype(np.int8)
+
+ # Check low-dimensional special cases
+
+ check_internal_overlap(x, False) # 1-dim
+ check_internal_overlap(x.reshape([]), False) # 0-dim
+
+ a = as_strided(x, strides=(3, 4), shape=(4, 4))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(3, 4), shape=(5, 4))
+ check_internal_overlap(a, True)
+
+ a = as_strided(x, strides=(0,), shape=(0,))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(0,), shape=(1,))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(0,), shape=(2,))
+ check_internal_overlap(a, True)
+
+ a = as_strided(x, strides=(0, -9993), shape=(87, 22))
+ check_internal_overlap(a, True)
+
+ a = as_strided(x, strides=(0, -9993), shape=(1, 22))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(0, -9993), shape=(0, 22))
+ check_internal_overlap(a, False)
+
+
+def test_internal_overlap_fuzz():
+ # Fuzz check; the brute-force check is fairly slow
+
+ x = np.arange(1).astype(np.int8)
+
+ overlap = 0
+ no_overlap = 0
+ min_count = 100
+
+ rng = np.random.RandomState(1234)
+
+ while min(overlap, no_overlap) < min_count:
+ ndim = rng.randint(1, 4, dtype=np.intp)
+
+ strides = tuple(rng.randint(-1000, 1000, dtype=np.intp)
+ for j in range(ndim))
+ shape = tuple(rng.randint(1, 30, dtype=np.intp)
+ for j in range(ndim))
+
+ a = as_strided(x, strides=strides, shape=shape)
+ result = check_internal_overlap(a)
+
+ if result:
+ overlap += 1
+ else:
+ no_overlap += 1
+
+
+def test_non_ndarray_inputs():
+ # Regression check for gh-5604
+
+ class MyArray(object):
+ def __init__(self, data):
+ self.data = data
+
+ @property
+ def __array_interface__(self):
+ return self.data.__array_interface__
+
+ class MyArray2(object):
+ def __init__(self, data):
+ self.data = data
+
+ def __array__(self):
+ return self.data
+
+ for cls in [MyArray, MyArray2]:
+ x = np.arange(5)
+
+ assert_(np.may_share_memory(cls(x[::2]), x[1::2]))
+ assert_(not np.shares_memory(cls(x[::2]), x[1::2]))
+
+ assert_(np.shares_memory(cls(x[1::3]), x[::2]))
+ assert_(np.may_share_memory(cls(x[1::3]), x[::2]))
+
+
+def view_element_first_byte(x):
+ """Construct an array viewing the first byte of each element of `x`"""
+ from numpy.lib.stride_tricks import DummyArray
+ interface = dict(x.__array_interface__)
+ interface['typestr'] = '|b1'
+ interface['descr'] = [('', '|b1')]
+ return np.asarray(DummyArray(interface, x))
+
+
+def assert_copy_equivalent(operation, args, out, **kwargs):
+ """
+ Check that operation(*args, out=out) produces results
+ equivalent to out[...] = operation(*args, out=out.copy())
+ """
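+    # Example usage (as in TestUFunc below):
+    #   assert_copy_equivalent(np.add.reduce, [a], out=b, axis=0)
+    # checks that reducing into a possibly overlapping ``out`` gives the same
+    # result as reducing into a fresh copy of it.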
+
+ kwargs['out'] = out
+ kwargs2 = dict(kwargs)
+ kwargs2['out'] = out.copy()
+
+ out_orig = out.copy()
+ out[...] = operation(*args, **kwargs2)
+ expected = out.copy()
+ out[...] = out_orig
+
+ got = operation(*args, **kwargs).copy()
+
+ if (got != expected).any():
+ assert_equal(got, expected)
+
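+# For instance, the fuzz tests below call this helper as
+#
+#     assert_copy_equivalent(np.add.reduce, [a], out=b_out, axis=axis)
+#
+# i.e. reducing into an output that may overlap `a` must match reducing
+# into a freshly copied output buffer.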
+
+class TestUFunc(object):
+ """
+ Test ufunc call memory overlap handling
+ """
+
+ def check_unary_fuzz(self, operation, get_out_axis_size, dtype=np.int16,
+ count=5000):
+ shapes = [7, 13, 8, 21, 29, 32]
+
+ rng = np.random.RandomState(1234)
+
+ for ndim in range(1, 6):
+ x = rng.randint(0, 2**16, size=shapes[:ndim]).astype(dtype)
+
+ it = iter_random_view_pairs(x, same_steps=False, equal_size=True)
+
+ min_count = count // (ndim + 1)**2
+
+ overlapping = 0
+ while overlapping < min_count:
+ a, b = next(it)
+
+ a_orig = a.copy()
+ b_orig = b.copy()
+
+ if get_out_axis_size is None:
+ assert_copy_equivalent(operation, [a], out=b)
+
+ if np.shares_memory(a, b):
+ overlapping += 1
+ else:
+ for axis in itertools.chain(range(ndim), [None]):
+ a[...] = a_orig
+ b[...] = b_orig
+
+ # Determine size for reduction axis (None if scalar)
+ outsize, scalarize = get_out_axis_size(a, b, axis)
+ if outsize == 'skip':
+ continue
+
+ # Slice b to get an output array of the correct size
+ sl = [slice(None)] * ndim
+ if axis is None:
+ if outsize is None:
+ sl = [slice(0, 1)] + [0]*(ndim - 1)
+ else:
+ sl = [slice(0, outsize)] + [0]*(ndim - 1)
+ else:
+ if outsize is None:
+ k = b.shape[axis]//2
+ if ndim == 1:
+ sl[axis] = slice(k, k + 1)
+ else:
+ sl[axis] = k
+ else:
+ assert b.shape[axis] >= outsize
+ sl[axis] = slice(0, outsize)
+ b_out = b[tuple(sl)]
+
+ if scalarize:
+ b_out = b_out.reshape([])
+
+ if np.shares_memory(a, b_out):
+ overlapping += 1
+
+ # Check result
+ assert_copy_equivalent(operation, [a], out=b_out, axis=axis)
+
+ @pytest.mark.slow
+ def test_unary_ufunc_call_fuzz(self):
+ self.check_unary_fuzz(np.invert, None, np.int16)
+
+ def test_binary_ufunc_accumulate_fuzz(self):
+ def get_out_axis_size(a, b, axis):
+ if axis is None:
+ if a.ndim == 1:
+ return a.size, False
+ else:
+ return 'skip', False # accumulate doesn't support this
+ else:
+ return a.shape[axis], False
+
+ self.check_unary_fuzz(np.add.accumulate, get_out_axis_size,
+ dtype=np.int16, count=500)
+
+ def test_binary_ufunc_reduce_fuzz(self):
+ def get_out_axis_size(a, b, axis):
+ return None, (axis is None or a.ndim == 1)
+
+ self.check_unary_fuzz(np.add.reduce, get_out_axis_size,
+ dtype=np.int16, count=500)
+
+ def test_binary_ufunc_reduceat_fuzz(self):
+ def get_out_axis_size(a, b, axis):
+ if axis is None:
+ if a.ndim == 1:
+ return a.size, False
+ else:
+ return 'skip', False # reduceat doesn't support this
+ else:
+ return a.shape[axis], False
+
+ def do_reduceat(a, out, axis):
+ if axis is None:
+ size = len(a)
+ step = size//len(out)
+ else:
+ size = a.shape[axis]
+ step = a.shape[axis] // out.shape[axis]
+ idx = np.arange(0, size, step)
+ return np.add.reduceat(a, idx, out=out, axis=axis)
+
+ self.check_unary_fuzz(do_reduceat, get_out_axis_size,
+ dtype=np.int16, count=500)
+
+ def test_binary_ufunc_reduceat_manual(self):
+ def check(ufunc, a, ind, out):
+ c1 = ufunc.reduceat(a.copy(), ind.copy(), out=out.copy())
+ c2 = ufunc.reduceat(a, ind, out=out)
+ assert_array_equal(c1, c2)
+
+ # Exactly same input/output arrays
+ a = np.arange(10000, dtype=np.int16)
+ check(np.add, a, a[::-1].copy(), a)
+
+ # Overlap with index
+ a = np.arange(10000, dtype=np.int16)
+ check(np.add, a, a[::-1], a)
+
+ def test_unary_gufunc_fuzz(self):
+ shapes = [7, 13, 8, 21, 29, 32]
+ gufunc = _umath_tests.euclidean_pdist
+
+ rng = np.random.RandomState(1234)
+
+ for ndim in range(2, 6):
+ x = rng.rand(*shapes[:ndim])
+
+ it = iter_random_view_pairs(x, same_steps=False, equal_size=True)
+
+ min_count = 500 // (ndim + 1)**2
+
+ overlapping = 0
+ while overlapping < min_count:
+ a, b = next(it)
+
+ if min(a.shape[-2:]) < 2 or min(b.shape[-2:]) < 2 or a.shape[-1] < 2:
+ continue
+
+                # Ensure the shapes are such that euclidean_pdist is happy
+ if b.shape[-1] > b.shape[-2]:
+ b = b[...,0,:]
+ else:
+ b = b[...,:,0]
+
+ n = a.shape[-2]
+ p = n * (n - 1) // 2
+ if p <= b.shape[-1] and p > 0:
+ b = b[...,:p]
+ else:
+ n = max(2, int(np.sqrt(b.shape[-1]))//2)
+ p = n * (n - 1) // 2
+ a = a[...,:n,:]
+ b = b[...,:p]
+
+ # Call
+ if np.shares_memory(a, b):
+ overlapping += 1
+
+ with np.errstate(over='ignore', invalid='ignore'):
+ assert_copy_equivalent(gufunc, [a], out=b)
+
+ def test_ufunc_at_manual(self):
+ def check(ufunc, a, ind, b=None):
+ a0 = a.copy()
+ if b is None:
+ ufunc.at(a0, ind.copy())
+ c1 = a0.copy()
+ ufunc.at(a, ind)
+ c2 = a.copy()
+ else:
+ ufunc.at(a0, ind.copy(), b.copy())
+ c1 = a0.copy()
+ ufunc.at(a, ind, b)
+ c2 = a.copy()
+ assert_array_equal(c1, c2)
+
+ # Overlap with index
+ a = np.arange(10000, dtype=np.int16)
+ check(np.invert, a[::-1], a)
+
+ # Overlap with second data array
+ a = np.arange(100, dtype=np.int16)
+ ind = np.arange(0, 100, 2, dtype=np.int16)
+ check(np.add, a, ind, a[25:75])
+
+ def test_unary_ufunc_1d_manual(self):
+ # Exercise branches in PyArray_EQUIVALENTLY_ITERABLE
+
+ def check(a, b):
+ a_orig = a.copy()
+ b_orig = b.copy()
+
+ b0 = b.copy()
+ c1 = ufunc(a, out=b0)
+ c2 = ufunc(a, out=b)
+ assert_array_equal(c1, c2)
+
+ # Trigger "fancy ufunc loop" code path
+ mask = view_element_first_byte(b).view(np.bool_)
+
+ a[...] = a_orig
+ b[...] = b_orig
+ c1 = ufunc(a, out=b.copy(), where=mask.copy()).copy()
+
+ a[...] = a_orig
+ b[...] = b_orig
+ c2 = ufunc(a, out=b, where=mask.copy()).copy()
+
+ # Also, mask overlapping with output
+ a[...] = a_orig
+ b[...] = b_orig
+ c3 = ufunc(a, out=b, where=mask).copy()
+
+ assert_array_equal(c1, c2)
+ assert_array_equal(c1, c3)
+
+ dtypes = [np.int8, np.int16, np.int32, np.int64, np.float32,
+ np.float64, np.complex64, np.complex128]
+ dtypes = [np.dtype(x) for x in dtypes]
+
+ for dtype in dtypes:
+ if np.issubdtype(dtype, np.integer):
+ ufunc = np.invert
+ else:
+ ufunc = np.reciprocal
+
+ n = 1000
+ k = 10
+ indices = [
+ np.index_exp[:n],
+ np.index_exp[k:k+n],
+ np.index_exp[n-1::-1],
+ np.index_exp[k+n-1:k-1:-1],
+ np.index_exp[:2*n:2],
+ np.index_exp[k:k+2*n:2],
+ np.index_exp[2*n-1::-2],
+ np.index_exp[k+2*n-1:k-1:-2],
+ ]
+
+ for xi, yi in itertools.product(indices, indices):
+ v = np.arange(1, 1 + n*2 + k, dtype=dtype)
+ x = v[xi]
+ y = v[yi]
+
+ with np.errstate(all='ignore'):
+ check(x, y)
+
+ # Scalar cases
+ check(x[:1], y)
+ check(x[-1:], y)
+ check(x[:1].reshape([]), y)
+ check(x[-1:].reshape([]), y)
+
+ def test_unary_ufunc_where_same(self):
+ # Check behavior at wheremask overlap
+ ufunc = np.invert
+
+ def check(a, out, mask):
+ c1 = ufunc(a, out=out.copy(), where=mask.copy())
+ c2 = ufunc(a, out=out, where=mask)
+ assert_array_equal(c1, c2)
+
+ # Check behavior with same input and output arrays
+ x = np.arange(100).astype(np.bool_)
+ check(x, x, x)
+ check(x, x.copy(), x)
+ check(x, x, x.copy())
+
+ @pytest.mark.slow
+ def test_binary_ufunc_1d_manual(self):
+ ufunc = np.add
+
+ def check(a, b, c):
+ c0 = c.copy()
+ c1 = ufunc(a, b, out=c0)
+ c2 = ufunc(a, b, out=c)
+ assert_array_equal(c1, c2)
+
+ for dtype in [np.int8, np.int16, np.int32, np.int64,
+ np.float32, np.float64, np.complex64, np.complex128]:
+ # Check different data dependency orders
+
+ n = 1000
+ k = 10
+
+ indices = []
+ for p in [1, 2]:
+ indices.extend([
+ np.index_exp[:p*n:p],
+ np.index_exp[k:k+p*n:p],
+ np.index_exp[p*n-1::-p],
+ np.index_exp[k+p*n-1:k-1:-p],
+ ])
+
+ for x, y, z in itertools.product(indices, indices, indices):
+ v = np.arange(6*n).astype(dtype)
+ x = v[x]
+ y = v[y]
+ z = v[z]
+
+ check(x, y, z)
+
+ # Scalar cases
+ check(x[:1], y, z)
+ check(x[-1:], y, z)
+ check(x[:1].reshape([]), y, z)
+ check(x[-1:].reshape([]), y, z)
+ check(x, y[:1], z)
+ check(x, y[-1:], z)
+ check(x, y[:1].reshape([]), z)
+ check(x, y[-1:].reshape([]), z)
+
+ def test_inplace_op_simple_manual(self):
+ rng = np.random.RandomState(1234)
+ x = rng.rand(200, 200) # bigger than bufsize
+
+ x += x.T
+ assert_array_equal(x - x.T, 0)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_memmap.py b/contrib/python/numpy/py2/numpy/core/tests/test_memmap.py
new file mode 100644
index 00000000000..990d0ae265f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_memmap.py
@@ -0,0 +1,206 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import os
+import shutil
+import mmap
+import pytest
+from tempfile import NamedTemporaryFile, TemporaryFile, mktemp, mkdtemp
+
+from numpy import (
+ memmap, sum, average, product, ndarray, isscalar, add, subtract, multiply)
+from numpy.compat import Path
+
+from numpy import arange, allclose, asarray
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, suppress_warnings
+ )
+
+class TestMemmap(object):
+ def setup(self):
+ self.tmpfp = NamedTemporaryFile(prefix='mmap')
+ self.tempdir = mkdtemp()
+ self.shape = (3, 4)
+ self.dtype = 'float32'
+ self.data = arange(12, dtype=self.dtype)
+ self.data.resize(self.shape)
+
+ def teardown(self):
+ self.tmpfp.close()
+ shutil.rmtree(self.tempdir)
+
+ def test_roundtrip(self):
+ # Write data to file
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ fp[:] = self.data[:]
+ del fp # Test __del__ machinery, which handles cleanup
+
+ # Read data back from file
+ newfp = memmap(self.tmpfp, dtype=self.dtype, mode='r',
+ shape=self.shape)
+ assert_(allclose(self.data, newfp))
+ assert_array_equal(self.data, newfp)
+ assert_equal(newfp.flags.writeable, False)
+
+ def test_open_with_filename(self):
+ tmpname = mktemp('', 'mmap', dir=self.tempdir)
+ fp = memmap(tmpname, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ fp[:] = self.data[:]
+ del fp
+
+ def test_unnamed_file(self):
+ with TemporaryFile() as f:
+ fp = memmap(f, dtype=self.dtype, shape=self.shape)
+ del fp
+
+ def test_attributes(self):
+ offset = 1
+ mode = "w+"
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode=mode,
+ shape=self.shape, offset=offset)
+ assert_equal(offset, fp.offset)
+ assert_equal(mode, fp.mode)
+ del fp
+
+ def test_filename(self):
+ tmpname = mktemp('', 'mmap', dir=self.tempdir)
+ fp = memmap(tmpname, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ abspath = os.path.abspath(tmpname)
+ fp[:] = self.data[:]
+ assert_equal(abspath, fp.filename)
+ b = fp[:1]
+ assert_equal(abspath, b.filename)
+ del b
+ del fp
+
+ @pytest.mark.skipif(Path is None, reason="No pathlib.Path")
+ def test_path(self):
+ tmpname = mktemp('', 'mmap', dir=self.tempdir)
+ fp = memmap(Path(tmpname), dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ # os.path.realpath does not resolve symlinks on Windows
+ # see: https://bugs.python.org/issue9949
+ # use Path.resolve, just as memmap class does internally
+ abspath = str(Path(tmpname).resolve())
+ fp[:] = self.data[:]
+ assert_equal(abspath, str(fp.filename.resolve()))
+ b = fp[:1]
+ assert_equal(abspath, str(b.filename.resolve()))
+ del b
+ del fp
+
+ def test_filename_fileobj(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode="w+",
+ shape=self.shape)
+ assert_equal(fp.filename, self.tmpfp.name)
+
+ @pytest.mark.skipif(sys.platform == 'gnu0',
+ reason="Known to fail on hurd")
+ def test_flush(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ fp[:] = self.data[:]
+ assert_equal(fp[0], self.data[0])
+ fp.flush()
+
+ def test_del(self):
+ # Make sure a view does not delete the underlying mmap
+ fp_base = memmap(self.tmpfp, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ fp_base[0] = 5
+ fp_view = fp_base[0:1]
+ assert_equal(fp_view[0], 5)
+ del fp_view
+ # Should still be able to access and assign values after
+ # deleting the view
+ assert_equal(fp_base[0], 5)
+ fp_base[0] = 6
+ assert_equal(fp_base[0], 6)
+
+ def test_arithmetic_drops_references(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ tmp = (fp + 10)
+ if isinstance(tmp, memmap):
+ assert_(tmp._mmap is not fp._mmap)
+
+ def test_indexing_drops_references(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ tmp = fp[(1, 2), (2, 3)]
+ if isinstance(tmp, memmap):
+ assert_(tmp._mmap is not fp._mmap)
+
+ def test_slicing_keeps_references(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, mode='w+',
+ shape=self.shape)
+ assert_(fp[:2, :2]._mmap is fp._mmap)
+
+ def test_view(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, shape=self.shape)
+ new1 = fp.view()
+ new2 = new1.view()
+ assert_(new1.base is fp)
+ assert_(new2.base is fp)
+ new_array = asarray(fp)
+ assert_(new_array.base is fp)
+
+ def test_ufunc_return_ndarray(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, shape=self.shape)
+ fp[:] = self.data
+
+ with suppress_warnings() as sup:
+ sup.filter(FutureWarning, "np.average currently does not preserve")
+ for unary_op in [sum, average, product]:
+ result = unary_op(fp)
+ assert_(isscalar(result))
+ assert_(result.__class__ is self.data[0, 0].__class__)
+
+ assert_(unary_op(fp, axis=0).__class__ is ndarray)
+ assert_(unary_op(fp, axis=1).__class__ is ndarray)
+
+ for binary_op in [add, subtract, multiply]:
+ assert_(binary_op(fp, self.data).__class__ is ndarray)
+ assert_(binary_op(self.data, fp).__class__ is ndarray)
+ assert_(binary_op(fp, fp).__class__ is ndarray)
+
+ fp += 1
+ assert(fp.__class__ is memmap)
+ add(fp, 1, out=fp)
+ assert(fp.__class__ is memmap)
+
+ def test_getitem(self):
+ fp = memmap(self.tmpfp, dtype=self.dtype, shape=self.shape)
+ fp[:] = self.data
+
+ assert_(fp[1:, :-1].__class__ is memmap)
+ # Fancy indexing returns a copy that is not memmapped
+ assert_(fp[[0, 1]].__class__ is ndarray)
+
+ def test_memmap_subclass(self):
+ class MemmapSubClass(memmap):
+ pass
+
+ fp = MemmapSubClass(self.tmpfp, dtype=self.dtype, shape=self.shape)
+ fp[:] = self.data
+
+        # We keep previous behavior for subclasses of memmap, i.e. the
+        # ufunc and __getitem__ output is never turned into an ndarray
+ assert_(sum(fp, axis=0).__class__ is MemmapSubClass)
+ assert_(sum(fp).__class__ is MemmapSubClass)
+ assert_(fp[1:, :-1].__class__ is MemmapSubClass)
+ assert(fp[[0, 1]].__class__ is MemmapSubClass)
+
+ def test_mmap_offset_greater_than_allocation_granularity(self):
+ size = 5 * mmap.ALLOCATIONGRANULARITY
+ offset = mmap.ALLOCATIONGRANULARITY + 1
+ fp = memmap(self.tmpfp, shape=size, mode='w+', offset=offset)
+ assert_(fp.offset == offset)
+
+ def test_no_shape(self):
+ self.tmpfp.write(b'a'*16)
+ mm = memmap(self.tmpfp, dtype='float64')
+ assert_equal(mm.shape, (2,))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_multiarray.py b/contrib/python/numpy/py2/numpy/core/tests/test_multiarray.py
new file mode 100644
index 00000000000..c55556535c9
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_multiarray.py
@@ -0,0 +1,8325 @@
+from __future__ import division, absolute_import, print_function
+
+try:
+ # Accessing collections abstract classes from collections
+ # has been deprecated since Python 3.3
+ import collections.abc as collections_abc
+except ImportError:
+ import collections as collections_abc
+import tempfile
+import sys
+import shutil
+import warnings
+import operator
+import io
+import itertools
+import functools
+import ctypes
+import os
+import gc
+import weakref
+import pytest
+from contextlib import contextmanager
+
+from numpy.core.numeric import pickle
+
+if sys.version_info[0] >= 3:
+ import builtins
+else:
+ import __builtin__ as builtins
+from decimal import Decimal
+
+import numpy as np
+from numpy.compat import strchar, unicode
+import numpy.core._multiarray_tests as _multiarray_tests
+from numpy.testing import (
+ assert_, assert_raises, assert_warns, assert_equal, assert_almost_equal,
+ assert_array_equal, assert_raises_regex, assert_array_almost_equal,
+ assert_allclose, IS_PYPY, HAS_REFCOUNT, assert_array_less, runstring,
+ temppath, suppress_warnings, break_cycles,
+ )
+from numpy.core.tests._locales import CommaDecimalPointLocale
+
+# Need to test an object that does not fully implement math interface
+from datetime import timedelta, datetime
+
+
+if sys.version_info[:2] > (3, 2):
+ # In Python 3.3 the representation of empty shape, strides and sub-offsets
+ # is an empty tuple instead of None.
+ # https://docs.python.org/dev/whatsnew/3.3.html#api-changes
+ EMPTY = ()
+else:
+ EMPTY = None
+
+
+def _aligned_zeros(shape, dtype=float, order="C", align=None):
+ """
+ Allocate a new ndarray with aligned memory.
+
+    The ndarray is guaranteed *not* to be aligned to twice the requested
+    alignment. E.g., if align=4, the result is guaranteed not to be aligned
+    to 8. If align=None, dtype.alignment is used."""
+ dtype = np.dtype(dtype)
+ if dtype == np.dtype(object):
+ # Can't do this, fall back to standard allocation (which
+ # should always be sufficiently aligned)
+ if align is not None:
+ raise ValueError("object array alignment not supported")
+ return np.zeros(shape, dtype=dtype, order=order)
+ if align is None:
+ align = dtype.alignment
+ if not hasattr(shape, '__len__'):
+ shape = (shape,)
+ size = functools.reduce(operator.mul, shape) * dtype.itemsize
+ buf = np.empty(size + 2*align + 1, np.uint8)
+
+ ptr = buf.__array_interface__['data'][0]
+ offset = ptr % align
+ if offset != 0:
+ offset = align - offset
+ if (ptr % (2*align)) == 0:
+ offset += align
+
+    # Note: slices producing 0-size arrays do not necessarily change the
+    # data pointer --- so we allocate size+1 and slice off the extra byte
+ buf = buf[offset:offset+size+1][:-1]
+ data = np.ndarray(shape, dtype, buf, order=order)
+ data.fill(0)
+ return data
+
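+# Sketch of typical use (hypothetical values):
+#
+#     a = _aligned_zeros((3, 5), dtype=np.float64, align=16)
+#     ptr = a.__array_interface__['data'][0]
+#     # ptr % 16 == 0 by construction; per the docstring it is also
+#     # intended not to be a multiple of 32
+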
+
+class TestFlags(object):
+ def setup(self):
+ self.a = np.arange(10)
+
+ def test_writeable(self):
+ mydict = locals()
+ self.a.flags.writeable = False
+ assert_raises(ValueError, runstring, 'self.a[0] = 3', mydict)
+ assert_raises(ValueError, runstring, 'self.a[0:1].itemset(3)', mydict)
+ self.a.flags.writeable = True
+ self.a[0] = 5
+ self.a[0] = 0
+
+ def test_writeable_from_readonly(self):
+        # gh-9440 - make sure fromstring and frombuffer on readonly buffers
+        # set writeable to False
+ data = b'\x00' * 100
+ vals = np.frombuffer(data, 'B')
+ assert_raises(ValueError, vals.setflags, write=True)
+ types = np.dtype( [('vals', 'u1'), ('res3', 'S4')] )
+ values = np.core.records.fromstring(data, types)
+ vals = values['vals']
+ assert_raises(ValueError, vals.setflags, write=True)
+
+ def test_writeable_from_buffer(self):
+ data = bytearray(b'\x00' * 100)
+ vals = np.frombuffer(data, 'B')
+ assert_(vals.flags.writeable)
+ vals.setflags(write=False)
+ assert_(vals.flags.writeable is False)
+ vals.setflags(write=True)
+ assert_(vals.flags.writeable)
+ types = np.dtype( [('vals', 'u1'), ('res3', 'S4')] )
+ values = np.core.records.fromstring(data, types)
+ vals = values['vals']
+ assert_(vals.flags.writeable)
+ vals.setflags(write=False)
+ assert_(vals.flags.writeable is False)
+ vals.setflags(write=True)
+ assert_(vals.flags.writeable)
+
+ @pytest.mark.skipif(sys.version_info[0] < 3, reason="Python 2 always copies")
+ @pytest.mark.skipif(IS_PYPY, reason="PyPy always copies")
+ def test_writeable_pickle(self):
+ import pickle
+ # Small arrays will be copied without setting base.
+ # See condition for using PyArray_SetBaseObject in
+ # array_setstate.
+ a = np.arange(1000)
+ for v in range(pickle.HIGHEST_PROTOCOL):
+ vals = pickle.loads(pickle.dumps(a, v))
+ assert_(vals.flags.writeable)
+ assert_(isinstance(vals.base, bytes))
+
+ def test_writeable_from_c_data(self):
+ # Test that the writeable flag can be changed for an array wrapping
+ # low level C-data, but not owning its data.
+ # Also see that this is deprecated to change from python.
+ from numpy.core._multiarray_tests import get_c_wrapping_array
+
+ arr_writeable = get_c_wrapping_array(True)
+ assert not arr_writeable.flags.owndata
+ assert arr_writeable.flags.writeable
+ view = arr_writeable[...]
+
+ # Toggling the writeable flag works on the view:
+ view.flags.writeable = False
+ assert not view.flags.writeable
+ view.flags.writeable = True
+ assert view.flags.writeable
+ # Flag can be unset on the arr_writeable:
+ arr_writeable.flags.writeable = False
+
+ arr_readonly = get_c_wrapping_array(False)
+ assert not arr_readonly.flags.owndata
+ assert not arr_readonly.flags.writeable
+
+ for arr in [arr_writeable, arr_readonly]:
+ view = arr[...]
+ view.flags.writeable = False # make sure it is readonly
+ arr.flags.writeable = False
+ assert not arr.flags.writeable
+
+ with assert_raises(ValueError):
+ view.flags.writeable = True
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", DeprecationWarning)
+ with assert_raises(DeprecationWarning):
+ arr.flags.writeable = True
+
+ with assert_warns(DeprecationWarning):
+ arr.flags.writeable = True
+
+
+ def test_otherflags(self):
+ assert_equal(self.a.flags.carray, True)
+ assert_equal(self.a.flags['C'], True)
+ assert_equal(self.a.flags.farray, False)
+ assert_equal(self.a.flags.behaved, True)
+ assert_equal(self.a.flags.fnc, False)
+ assert_equal(self.a.flags.forc, True)
+ assert_equal(self.a.flags.owndata, True)
+ assert_equal(self.a.flags.writeable, True)
+ assert_equal(self.a.flags.aligned, True)
+ with assert_warns(DeprecationWarning):
+ assert_equal(self.a.flags.updateifcopy, False)
+ with assert_warns(DeprecationWarning):
+ assert_equal(self.a.flags['U'], False)
+ assert_equal(self.a.flags['UPDATEIFCOPY'], False)
+ assert_equal(self.a.flags.writebackifcopy, False)
+ assert_equal(self.a.flags['X'], False)
+ assert_equal(self.a.flags['WRITEBACKIFCOPY'], False)
+
+ def test_string_align(self):
+ a = np.zeros(4, dtype=np.dtype('|S4'))
+ assert_(a.flags.aligned)
+        # non-power-of-two sizes are accessed byte-wise and thus considered aligned
+ a = np.zeros(5, dtype=np.dtype('|S4'))
+ assert_(a.flags.aligned)
+
+ def test_void_align(self):
+ a = np.zeros(4, dtype=np.dtype([("a", "i4"), ("b", "i4")]))
+ assert_(a.flags.aligned)
+
+
+class TestHash(object):
+ # see #3793
+ def test_int(self):
+ for st, ut, s in [(np.int8, np.uint8, 8),
+ (np.int16, np.uint16, 16),
+ (np.int32, np.uint32, 32),
+ (np.int64, np.uint64, 64)]:
+ for i in range(1, s):
+ assert_equal(hash(st(-2**i)), hash(-2**i),
+ err_msg="%r: -2**%d" % (st, i))
+ assert_equal(hash(st(2**(i - 1))), hash(2**(i - 1)),
+ err_msg="%r: 2**%d" % (st, i - 1))
+ assert_equal(hash(st(2**i - 1)), hash(2**i - 1),
+ err_msg="%r: 2**%d - 1" % (st, i))
+
+ i = max(i - 1, 1)
+ assert_equal(hash(ut(2**(i - 1))), hash(2**(i - 1)),
+ err_msg="%r: 2**%d" % (ut, i - 1))
+ assert_equal(hash(ut(2**i - 1)), hash(2**i - 1),
+ err_msg="%r: 2**%d - 1" % (ut, i))
+
+
+class TestAttributes(object):
+ def setup(self):
+ self.one = np.arange(10)
+ self.two = np.arange(20).reshape(4, 5)
+ self.three = np.arange(60, dtype=np.float64).reshape(2, 5, 6)
+
+ def test_attributes(self):
+ assert_equal(self.one.shape, (10,))
+ assert_equal(self.two.shape, (4, 5))
+ assert_equal(self.three.shape, (2, 5, 6))
+ self.three.shape = (10, 3, 2)
+ assert_equal(self.three.shape, (10, 3, 2))
+ self.three.shape = (2, 5, 6)
+ assert_equal(self.one.strides, (self.one.itemsize,))
+ num = self.two.itemsize
+ assert_equal(self.two.strides, (5*num, num))
+ num = self.three.itemsize
+ assert_equal(self.three.strides, (30*num, 6*num, num))
+ assert_equal(self.one.ndim, 1)
+ assert_equal(self.two.ndim, 2)
+ assert_equal(self.three.ndim, 3)
+ num = self.two.itemsize
+ assert_equal(self.two.size, 20)
+ assert_equal(self.two.nbytes, 20*num)
+ assert_equal(self.two.itemsize, self.two.dtype.itemsize)
+ assert_equal(self.two.base, np.arange(20))
+
+ def test_dtypeattr(self):
+ assert_equal(self.one.dtype, np.dtype(np.int_))
+ assert_equal(self.three.dtype, np.dtype(np.float_))
+ assert_equal(self.one.dtype.char, 'l')
+ assert_equal(self.three.dtype.char, 'd')
+ assert_(self.three.dtype.str[0] in '<>')
+ assert_equal(self.one.dtype.str[1], 'i')
+ assert_equal(self.three.dtype.str[1], 'f')
+
+ def test_int_subclassing(self):
+ # Regression test for https://github.com/numpy/numpy/pull/3526
+
+ numpy_int = np.int_(0)
+
+ if sys.version_info[0] >= 3:
+ # On Py3k int_ should not inherit from int, because it's not
+ # fixed-width anymore
+ assert_equal(isinstance(numpy_int, int), False)
+ else:
+ # Otherwise, it should inherit from int...
+ assert_equal(isinstance(numpy_int, int), True)
+
+ # ... and fast-path checks on C-API level should also work
+ from numpy.core._multiarray_tests import test_int_subclass
+ assert_equal(test_int_subclass(numpy_int), True)
+
+ def test_stridesattr(self):
+ x = self.one
+
+ def make_array(size, offset, strides):
+ return np.ndarray(size, buffer=x, dtype=int,
+ offset=offset*x.itemsize,
+ strides=strides*x.itemsize)
+
+ assert_equal(make_array(4, 4, -1), np.array([4, 3, 2, 1]))
+ assert_raises(ValueError, make_array, 4, 4, -2)
+ assert_raises(ValueError, make_array, 4, 2, -1)
+ assert_raises(ValueError, make_array, 8, 3, 1)
+ assert_equal(make_array(8, 3, 0), np.array([3]*8))
+ # Check behavior reported in gh-2503:
+ assert_raises(ValueError, make_array, (2, 3), 5, np.array([-2, -3]))
+ make_array(0, 0, 10)
+
+ def test_set_stridesattr(self):
+ x = self.one
+
+ def make_array(size, offset, strides):
+ try:
+ r = np.ndarray([size], dtype=int, buffer=x,
+ offset=offset*x.itemsize)
+ except Exception as e:
+ raise RuntimeError(e)
+ r.strides = strides = strides*x.itemsize
+ return r
+
+ assert_equal(make_array(4, 4, -1), np.array([4, 3, 2, 1]))
+ assert_equal(make_array(7, 3, 1), np.array([3, 4, 5, 6, 7, 8, 9]))
+ assert_raises(ValueError, make_array, 4, 4, -2)
+ assert_raises(ValueError, make_array, 4, 2, -1)
+ assert_raises(RuntimeError, make_array, 8, 3, 1)
+ # Check that the true extent of the array is used.
+ # Test relies on as_strided base not exposing a buffer.
+ x = np.lib.stride_tricks.as_strided(np.arange(1), (10, 10), (0, 0))
+
+ def set_strides(arr, strides):
+ arr.strides = strides
+
+ assert_raises(ValueError, set_strides, x, (10*x.itemsize, x.itemsize))
+
+ # Test for offset calculations:
+ x = np.lib.stride_tricks.as_strided(np.arange(10, dtype=np.int8)[-1],
+ shape=(10,), strides=(-1,))
+ assert_raises(ValueError, set_strides, x[::-1], -1)
+ a = x[::-1]
+ a.strides = 1
+ a[::2].strides = 2
+
+ def test_fill(self):
+ for t in "?bhilqpBHILQPfdgFDGO":
+ x = np.empty((3, 2, 1), t)
+ y = np.empty((3, 2, 1), t)
+ x.fill(1)
+ y[...] = 1
+ assert_equal(x, y)
+
+ def test_fill_max_uint64(self):
+ x = np.empty((3, 2, 1), dtype=np.uint64)
+ y = np.empty((3, 2, 1), dtype=np.uint64)
+ value = 2**64 - 1
+ y[...] = value
+ x.fill(value)
+ assert_array_equal(x, y)
+
+ def test_fill_struct_array(self):
+ # Filling from a scalar
+ x = np.array([(0, 0.0), (1, 1.0)], dtype='i4,f8')
+ x.fill(x[0])
+ assert_equal(x['f1'][1], x['f1'][0])
+ # Filling from a tuple that can be converted
+ # to a scalar
+ x = np.zeros(2, dtype=[('a', 'f8'), ('b', 'i4')])
+ x.fill((3.5, -2))
+ assert_array_equal(x['a'], [3.5, 3.5])
+ assert_array_equal(x['b'], [-2, -2])
+
+
+class TestArrayConstruction(object):
+ def test_array(self):
+ d = np.ones(6)
+ r = np.array([d, d])
+ assert_equal(r, np.ones((2, 6)))
+
+ d = np.ones(6)
+ tgt = np.ones((2, 6))
+ r = np.array([d, d])
+ assert_equal(r, tgt)
+ tgt[1] = 2
+ r = np.array([d, d + 1])
+ assert_equal(r, tgt)
+
+ d = np.ones(6)
+ r = np.array([[d, d]])
+ assert_equal(r, np.ones((1, 2, 6)))
+
+ d = np.ones(6)
+ r = np.array([[d, d], [d, d]])
+ assert_equal(r, np.ones((2, 2, 6)))
+
+ d = np.ones((6, 6))
+ r = np.array([d, d])
+ assert_equal(r, np.ones((2, 6, 6)))
+
+ d = np.ones((6, ))
+ r = np.array([[d, d + 1], d + 2])
+ assert_equal(len(r), 2)
+ assert_equal(r[0], [d, d + 1])
+ assert_equal(r[1], d + 2)
+
+ tgt = np.ones((2, 3), dtype=bool)
+ tgt[0, 2] = False
+ tgt[1, 0:2] = False
+ r = np.array([[True, True, False], [False, False, True]])
+ assert_equal(r, tgt)
+ r = np.array([[True, False], [True, False], [False, True]])
+ assert_equal(r, tgt.T)
+
+ def test_array_empty(self):
+ assert_raises(TypeError, np.array)
+
+ def test_array_copy_false(self):
+ d = np.array([1, 2, 3])
+ e = np.array(d, copy=False)
+ d[1] = 3
+ assert_array_equal(e, [1, 3, 3])
+ e = np.array(d, copy=False, order='F')
+ d[1] = 4
+ assert_array_equal(e, [1, 4, 3])
+ e[2] = 7
+ assert_array_equal(d, [1, 4, 7])
+
+ def test_array_copy_true(self):
+ d = np.array([[1,2,3], [1, 2, 3]])
+ e = np.array(d, copy=True)
+ d[0, 1] = 3
+ e[0, 2] = -7
+ assert_array_equal(e, [[1, 2, -7], [1, 2, 3]])
+ assert_array_equal(d, [[1, 3, 3], [1, 2, 3]])
+ e = np.array(d, copy=True, order='F')
+ d[0, 1] = 5
+ e[0, 2] = 7
+ assert_array_equal(e, [[1, 3, 7], [1, 2, 3]])
+ assert_array_equal(d, [[1, 5, 3], [1,2,3]])
+
+ def test_array_cont(self):
+ d = np.ones(10)[::2]
+ assert_(np.ascontiguousarray(d).flags.c_contiguous)
+ assert_(np.ascontiguousarray(d).flags.f_contiguous)
+ assert_(np.asfortranarray(d).flags.c_contiguous)
+ assert_(np.asfortranarray(d).flags.f_contiguous)
+ d = np.ones((10, 10))[::2,::2]
+ assert_(np.ascontiguousarray(d).flags.c_contiguous)
+ assert_(np.asfortranarray(d).flags.f_contiguous)
+
+
+class TestAssignment(object):
+ def test_assignment_broadcasting(self):
+ a = np.arange(6).reshape(2, 3)
+
+ # Broadcasting the input to the output
+ a[...] = np.arange(3)
+ assert_equal(a, [[0, 1, 2], [0, 1, 2]])
+ a[...] = np.arange(2).reshape(2, 1)
+ assert_equal(a, [[0, 0, 0], [1, 1, 1]])
+
+ # For compatibility with <= 1.5, a limited version of broadcasting
+ # the output to the input.
+ #
+ # This behavior is inconsistent with NumPy broadcasting
+ # in general, because it only uses one of the two broadcasting
+ # rules (adding a new "1" dimension to the left of the shape),
+ # applied to the output instead of an input. In NumPy 2.0, this kind
+ # of broadcasting assignment will likely be disallowed.
+ a[...] = np.arange(6)[::-1].reshape(1, 2, 3)
+ assert_equal(a, [[5, 4, 3], [2, 1, 0]])
+ # The other type of broadcasting would require a reduction operation.
+
+ def assign(a, b):
+ a[...] = b
+
+ assert_raises(ValueError, assign, a, np.arange(12).reshape(2, 2, 3))
+
+ def test_assignment_errors(self):
+ # Address issue #2276
+ class C:
+ pass
+ a = np.zeros(1)
+
+ def assign(v):
+ a[0] = v
+
+ assert_raises((AttributeError, TypeError), assign, C())
+ assert_raises(ValueError, assign, [1])
+
+ def test_unicode_assignment(self):
+ # gh-5049
+ from numpy.core.numeric import set_string_function
+
+ @contextmanager
+ def inject_str(s):
+ """ replace ndarray.__str__ temporarily """
+ set_string_function(lambda x: s, repr=False)
+ try:
+ yield
+ finally:
+ set_string_function(None, repr=False)
+
+ a1d = np.array([u'test'])
+ a0d = np.array(u'done')
+ with inject_str(u'bad'):
+ a1d[0] = a0d # previously this would invoke __str__
+ assert_equal(a1d[0], u'done')
+
+ # this would crash for the same reason
+ np.array([np.array(u'\xe5\xe4\xf6')])
+
+ def test_stringlike_empty_list(self):
+ # gh-8902
+ u = np.array([u'done'])
+ b = np.array([b'done'])
+
+ class bad_sequence(object):
+ def __getitem__(self): pass
+ def __len__(self): raise RuntimeError
+
+ assert_raises(ValueError, operator.setitem, u, 0, [])
+ assert_raises(ValueError, operator.setitem, b, 0, [])
+
+ assert_raises(ValueError, operator.setitem, u, 0, bad_sequence())
+ assert_raises(ValueError, operator.setitem, b, 0, bad_sequence())
+
+ def test_longdouble_assignment(self):
+ # only relevant if longdouble is larger than float
+ # we're looking for loss of precision
+
+ for dtype in (np.longdouble, np.longcomplex):
+ # gh-8902
+ tinyb = np.nextafter(np.longdouble(0), 1).astype(dtype)
+ tinya = np.nextafter(np.longdouble(0), -1).astype(dtype)
+
+ # construction
+ tiny1d = np.array([tinya])
+ assert_equal(tiny1d[0], tinya)
+
+ # scalar = scalar
+ tiny1d[0] = tinyb
+ assert_equal(tiny1d[0], tinyb)
+
+ # 0d = scalar
+ tiny1d[0, ...] = tinya
+ assert_equal(tiny1d[0], tinya)
+
+ # 0d = 0d
+ tiny1d[0, ...] = tinyb[...]
+ assert_equal(tiny1d[0], tinyb)
+
+ # scalar = 0d
+ tiny1d[0] = tinyb[...]
+ assert_equal(tiny1d[0], tinyb)
+
+ arr = np.array([np.array(tinya)])
+ assert_equal(arr[0], tinya)
+
+ def test_cast_to_string(self):
+ # cast to str should do "str(scalar)", not "str(scalar.item())"
+ # Example: In python2, str(float) is truncated, so we want to avoid
+ # str(np.float64(...).item()) as this would incorrectly truncate.
+ a = np.zeros(1, dtype='S20')
+ a[:] = np.array(['1.12345678901234567890'], dtype='f8')
+ assert_equal(a[0], b"1.1234567890123457")
+
+
+class TestDtypedescr(object):
+ def test_construction(self):
+ d1 = np.dtype('i4')
+ assert_equal(d1, np.dtype(np.int32))
+ d2 = np.dtype('f8')
+ assert_equal(d2, np.dtype(np.float64))
+
+ def test_byteorders(self):
+ assert_(np.dtype('<i4') != np.dtype('>i4'))
+ assert_(np.dtype([('a', '<i4')]) != np.dtype([('a', '>i4')]))
+
+ def test_structured_non_void(self):
+ fields = [('a', '<i2'), ('b', '<i2')]
+ dt_int = np.dtype(('i4', fields))
+ assert_equal(str(dt_int), "(numpy.int32, [('a', '<i2'), ('b', '<i2')])")
+
+ # gh-9821
+ arr_int = np.zeros(4, dt_int)
+ assert_equal(repr(arr_int),
+ "array([0, 0, 0, 0], dtype=(numpy.int32, [('a', '<i2'), ('b', '<i2')]))")
+
+
+class TestZeroRank(object):
+ def setup(self):
+ self.d = np.array(0), np.array('x', object)
+
+ def test_ellipsis_subscript(self):
+ a, b = self.d
+ assert_equal(a[...], 0)
+ assert_equal(b[...], 'x')
+ assert_(a[...].base is a) # `a[...] is a` in numpy <1.9.
+ assert_(b[...].base is b) # `b[...] is b` in numpy <1.9.
+
+ def test_empty_subscript(self):
+ a, b = self.d
+ assert_equal(a[()], 0)
+ assert_equal(b[()], 'x')
+ assert_(type(a[()]) is a.dtype.type)
+ assert_(type(b[()]) is str)
+
+ def test_invalid_subscript(self):
+ a, b = self.d
+ assert_raises(IndexError, lambda x: x[0], a)
+ assert_raises(IndexError, lambda x: x[0], b)
+ assert_raises(IndexError, lambda x: x[np.array([], int)], a)
+ assert_raises(IndexError, lambda x: x[np.array([], int)], b)
+
+ def test_ellipsis_subscript_assignment(self):
+ a, b = self.d
+ a[...] = 42
+ assert_equal(a, 42)
+ b[...] = ''
+ assert_equal(b.item(), '')
+
+ def test_empty_subscript_assignment(self):
+ a, b = self.d
+ a[()] = 42
+ assert_equal(a, 42)
+ b[()] = ''
+ assert_equal(b.item(), '')
+
+ def test_invalid_subscript_assignment(self):
+ a, b = self.d
+
+ def assign(x, i, v):
+ x[i] = v
+
+ assert_raises(IndexError, assign, a, 0, 42)
+ assert_raises(IndexError, assign, b, 0, '')
+ assert_raises(ValueError, assign, a, (), '')
+
+ def test_newaxis(self):
+ a, b = self.d
+ assert_equal(a[np.newaxis].shape, (1,))
+ assert_equal(a[..., np.newaxis].shape, (1,))
+ assert_equal(a[np.newaxis, ...].shape, (1,))
+ assert_equal(a[..., np.newaxis].shape, (1,))
+ assert_equal(a[np.newaxis, ..., np.newaxis].shape, (1, 1))
+ assert_equal(a[..., np.newaxis, np.newaxis].shape, (1, 1))
+ assert_equal(a[np.newaxis, np.newaxis, ...].shape, (1, 1))
+ assert_equal(a[(np.newaxis,)*10].shape, (1,)*10)
+
+ def test_invalid_newaxis(self):
+ a, b = self.d
+
+ def subscript(x, i):
+ x[i]
+
+ assert_raises(IndexError, subscript, a, (np.newaxis, 0))
+ assert_raises(IndexError, subscript, a, (np.newaxis,)*50)
+
+ def test_constructor(self):
+ x = np.ndarray(())
+ x[()] = 5
+ assert_equal(x[()], 5)
+ y = np.ndarray((), buffer=x)
+ y[()] = 6
+ assert_equal(x[()], 6)
+
+ def test_output(self):
+ x = np.array(2)
+ assert_raises(ValueError, np.add, x, [1], x)
+
+ def test_real_imag(self):
+ # contiguity checks are for gh-11245
+ x = np.array(1j)
+ xr = x.real
+ xi = x.imag
+
+ assert_equal(xr, np.array(0))
+ assert_(type(xr) is np.ndarray)
+ assert_equal(xr.flags.contiguous, True)
+ assert_equal(xr.flags.f_contiguous, True)
+
+ assert_equal(xi, np.array(1))
+ assert_(type(xi) is np.ndarray)
+ assert_equal(xi.flags.contiguous, True)
+ assert_equal(xi.flags.f_contiguous, True)
+
+
+class TestScalarIndexing(object):
+ def setup(self):
+ self.d = np.array([0, 1])[0]
+
+ def test_ellipsis_subscript(self):
+ a = self.d
+ assert_equal(a[...], 0)
+ assert_equal(a[...].shape, ())
+
+ def test_empty_subscript(self):
+ a = self.d
+ assert_equal(a[()], 0)
+ assert_equal(a[()].shape, ())
+
+ def test_invalid_subscript(self):
+ a = self.d
+ assert_raises(IndexError, lambda x: x[0], a)
+ assert_raises(IndexError, lambda x: x[np.array([], int)], a)
+
+ def test_invalid_subscript_assignment(self):
+ a = self.d
+
+ def assign(x, i, v):
+ x[i] = v
+
+ assert_raises(TypeError, assign, a, 0, 42)
+
+ def test_newaxis(self):
+ a = self.d
+ assert_equal(a[np.newaxis].shape, (1,))
+ assert_equal(a[..., np.newaxis].shape, (1,))
+ assert_equal(a[np.newaxis, ...].shape, (1,))
+ assert_equal(a[..., np.newaxis].shape, (1,))
+ assert_equal(a[np.newaxis, ..., np.newaxis].shape, (1, 1))
+ assert_equal(a[..., np.newaxis, np.newaxis].shape, (1, 1))
+ assert_equal(a[np.newaxis, np.newaxis, ...].shape, (1, 1))
+ assert_equal(a[(np.newaxis,)*10].shape, (1,)*10)
+
+ def test_invalid_newaxis(self):
+ a = self.d
+
+ def subscript(x, i):
+ x[i]
+
+ assert_raises(IndexError, subscript, a, (np.newaxis, 0))
+ assert_raises(IndexError, subscript, a, (np.newaxis,)*50)
+
+ def test_overlapping_assignment(self):
+ # With positive strides
+ a = np.arange(4)
+ a[:-1] = a[1:]
+ assert_equal(a, [1, 2, 3, 3])
+
+ a = np.arange(4)
+ a[1:] = a[:-1]
+ assert_equal(a, [0, 0, 1, 2])
+
+ # With positive and negative strides
+ a = np.arange(4)
+ a[:] = a[::-1]
+ assert_equal(a, [3, 2, 1, 0])
+
+ a = np.arange(6).reshape(2, 3)
+ a[::-1,:] = a[:, ::-1]
+ assert_equal(a, [[5, 4, 3], [2, 1, 0]])
+
+ a = np.arange(6).reshape(2, 3)
+ a[::-1, ::-1] = a[:, ::-1]
+ assert_equal(a, [[3, 4, 5], [0, 1, 2]])
+
+ # With just one element overlapping
+ a = np.arange(5)
+ a[:3] = a[2:]
+ assert_equal(a, [2, 3, 4, 3, 4])
+
+ a = np.arange(5)
+ a[2:] = a[:3]
+ assert_equal(a, [0, 1, 0, 1, 2])
+
+ a = np.arange(5)
+ a[2::-1] = a[2:]
+ assert_equal(a, [4, 3, 2, 3, 4])
+
+ a = np.arange(5)
+ a[2:] = a[2::-1]
+ assert_equal(a, [0, 1, 2, 1, 0])
+
+ a = np.arange(5)
+ a[2::-1] = a[:1:-1]
+ assert_equal(a, [2, 3, 4, 3, 4])
+
+ a = np.arange(5)
+ a[:1:-1] = a[2::-1]
+ assert_equal(a, [0, 1, 0, 1, 2])
+
+
+class TestCreation(object):
+ """
+ Test the np.array constructor
+ """
+ def test_from_attribute(self):
+ class x(object):
+ def __array__(self, dtype=None):
+ pass
+
+ assert_raises(ValueError, np.array, x())
+
+ def test_from_string(self):
+ types = np.typecodes['AllInteger'] + np.typecodes['Float']
+ nstr = ['123', '123']
+ result = np.array([123, 123], dtype=int)
+ for type in types:
+ msg = 'String conversion for %s' % type
+ assert_equal(np.array(nstr, dtype=type), result, err_msg=msg)
+
+ def test_void(self):
+ arr = np.array([], dtype='V')
+ assert_equal(arr.dtype.kind, 'V')
+
+ def test_too_big_error(self):
+        # 46341 is the smallest integer greater than sqrt(2**31 - 1).
+ # 3037000500 is the smallest integer greater than sqrt(2**63 - 1).
+ # We want to make sure that the square byte array with those dimensions
+ # is too big on 32 or 64 bit systems respectively.
+ if np.iinfo('intp').max == 2**31 - 1:
+ shape = (46341, 46341)
+ elif np.iinfo('intp').max == 2**63 - 1:
+ shape = (3037000500, 3037000500)
+ else:
+ return
+ assert_raises(ValueError, np.empty, shape, dtype=np.int8)
+ assert_raises(ValueError, np.zeros, shape, dtype=np.int8)
+ assert_raises(ValueError, np.ones, shape, dtype=np.int8)
+
+ def test_zeros(self):
+ types = np.typecodes['AllInteger'] + np.typecodes['AllFloat']
+ for dt in types:
+ d = np.zeros((13,), dtype=dt)
+ assert_equal(np.count_nonzero(d), 0)
+ # true for ieee floats
+ assert_equal(d.sum(), 0)
+ assert_(not d.any())
+
+ d = np.zeros(2, dtype='(2,4)i4')
+ assert_equal(np.count_nonzero(d), 0)
+ assert_equal(d.sum(), 0)
+ assert_(not d.any())
+
+ d = np.zeros(2, dtype='4i4')
+ assert_equal(np.count_nonzero(d), 0)
+ assert_equal(d.sum(), 0)
+ assert_(not d.any())
+
+ d = np.zeros(2, dtype='(2,4)i4, (2,4)i4')
+ assert_equal(np.count_nonzero(d), 0)
+
+ @pytest.mark.slow
+ def test_zeros_big(self):
+        # test big arrays as they might be allocated differently by the system
+ types = np.typecodes['AllInteger'] + np.typecodes['AllFloat']
+ for dt in types:
+ d = np.zeros((30 * 1024**2,), dtype=dt)
+ assert_(not d.any())
+ # This test can fail on 32-bit systems due to insufficient
+ # contiguous memory. Deallocating the previous array increases the
+ # chance of success.
+ del(d)
+
+ def test_zeros_obj(self):
+ # test initialization from PyLong(0)
+ d = np.zeros((13,), dtype=object)
+ assert_array_equal(d, [0] * 13)
+ assert_equal(np.count_nonzero(d), 0)
+
+ def test_zeros_obj_obj(self):
+ d = np.zeros(10, dtype=[('k', object, 2)])
+ assert_array_equal(d['k'], 0)
+
+ def test_zeros_like_like_zeros(self):
+ # test zeros_like returns the same as zeros
+ for c in np.typecodes['All']:
+ if c == 'V':
+ continue
+ d = np.zeros((3,3), dtype=c)
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+ # explicitly check some special cases
+ d = np.zeros((3,3), dtype='S5')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+ d = np.zeros((3,3), dtype='U5')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+
+ d = np.zeros((3,3), dtype='<i4')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+ d = np.zeros((3,3), dtype='>i4')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+
+ d = np.zeros((3,3), dtype='<M8[s]')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+ d = np.zeros((3,3), dtype='>M8[s]')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+
+ d = np.zeros((3,3), dtype='f4,f4')
+ assert_array_equal(np.zeros_like(d), d)
+ assert_equal(np.zeros_like(d).dtype, d.dtype)
+
+ def test_empty_unicode(self):
+ # don't throw decode errors on garbage memory
+ for i in range(5, 100, 5):
+ d = np.empty(i, dtype='U')
+ str(d)
+
+ def test_sequence_non_homogenous(self):
+ assert_equal(np.array([4, 2**80]).dtype, object)
+ assert_equal(np.array([4, 2**80, 4]).dtype, object)
+ assert_equal(np.array([2**80, 4]).dtype, object)
+ assert_equal(np.array([2**80] * 3).dtype, object)
+ assert_equal(np.array([[1, 1],[1j, 1j]]).dtype, complex)
+ assert_equal(np.array([[1j, 1j],[1, 1]]).dtype, complex)
+ assert_equal(np.array([[1, 1, 1],[1, 1j, 1.], [1, 1, 1]]).dtype, complex)
+
+ @pytest.mark.skipif(sys.version_info[0] >= 3, reason="Not Python 2")
+ def test_sequence_long(self):
+ assert_equal(np.array([long(4), long(4)]).dtype, np.long)
+ assert_equal(np.array([long(4), 2**80]).dtype, object)
+ assert_equal(np.array([long(4), 2**80, long(4)]).dtype, object)
+ assert_equal(np.array([2**80, long(4)]).dtype, object)
+
+ def test_non_sequence_sequence(self):
+ """Should not segfault.
+
+        Class Fail breaks the sequence protocol for new style classes, i.e.,
+        those derived from object. Class Map is a mapping type, indicated by
+        raising a KeyError. At some point we may raise a warning instead
+ of an error in the Fail case.
+
+ """
+ class Fail(object):
+ def __len__(self):
+ return 1
+
+ def __getitem__(self, index):
+ raise ValueError()
+
+ class Map(object):
+ def __len__(self):
+ return 1
+
+ def __getitem__(self, index):
+ raise KeyError()
+
+ a = np.array([Map()])
+ assert_(a.shape == (1,))
+ assert_(a.dtype == np.dtype(object))
+ assert_raises(ValueError, np.array, [Fail()])
+
+ def test_no_len_object_type(self):
+ # gh-5100, want object array from iterable object without len()
+ class Point2:
+ def __init__(self):
+ pass
+
+ def __getitem__(self, ind):
+ if ind in [0, 1]:
+ return ind
+ else:
+ raise IndexError()
+ d = np.array([Point2(), Point2(), Point2()])
+ assert_equal(d.dtype, np.dtype(object))
+
+ def test_false_len_sequence(self):
+ # gh-7264, segfault for this example
+ class C:
+ def __getitem__(self, i):
+ raise IndexError
+ def __len__(self):
+ return 42
+
+ assert_raises(ValueError, np.array, C()) # segfault?
+
+ def test_failed_len_sequence(self):
+ # gh-7393
+ class A(object):
+ def __init__(self, data):
+ self._data = data
+ def __getitem__(self, item):
+ return type(self)(self._data[item])
+ def __len__(self):
+ return len(self._data)
+
+ # len(d) should give 3, but len(d[0]) will fail
+ d = A([1,2,3])
+ assert_equal(len(np.array(d)), 3)
+
+ def test_array_too_big(self):
+ # Test that array creation succeeds for arrays addressable by intp
+ # on the byte level and fails for too large arrays.
+ buf = np.zeros(100)
+
+ max_bytes = np.iinfo(np.intp).max
+ for dtype in ["intp", "S20", "b"]:
+ dtype = np.dtype(dtype)
+ itemsize = dtype.itemsize
+
+ np.ndarray(buffer=buf, strides=(0,),
+ shape=(max_bytes//itemsize,), dtype=dtype)
+ assert_raises(ValueError, np.ndarray, buffer=buf, strides=(0,),
+ shape=(max_bytes//itemsize + 1,), dtype=dtype)
+
+ def test_jagged_ndim_object(self):
+ # Lists of mismatching depths are treated as object arrays
+ a = np.array([[1], 2, 3])
+ assert_equal(a.shape, (3,))
+ assert_equal(a.dtype, object)
+
+ a = np.array([1, [2], 3])
+ assert_equal(a.shape, (3,))
+ assert_equal(a.dtype, object)
+
+ a = np.array([1, 2, [3]])
+ assert_equal(a.shape, (3,))
+ assert_equal(a.dtype, object)
+
+ def test_jagged_shape_object(self):
+ # The jagged dimension of a list is turned into an object array
+ a = np.array([[1, 1], [2], [3]])
+ assert_equal(a.shape, (3,))
+ assert_equal(a.dtype, object)
+
+ a = np.array([[1], [2, 2], [3]])
+ assert_equal(a.shape, (3,))
+ assert_equal(a.dtype, object)
+
+ a = np.array([[1], [2], [3, 3]])
+ assert_equal(a.shape, (3,))
+ assert_equal(a.dtype, object)
+
+
+class TestStructured(object):
+ def test_subarray_field_access(self):
+ a = np.zeros((3, 5), dtype=[('a', ('i4', (2, 2)))])
+ a['a'] = np.arange(60).reshape(3, 5, 2, 2)
+
+ # Since the subarray is always in C-order, a transpose
+ # does not swap the subarray:
+ assert_array_equal(a.T['a'], a['a'].transpose(1, 0, 2, 3))
+
+ # In Fortran order, the subarray gets appended
+ # like in all other cases, not prepended as a special case
+ b = a.copy(order='F')
+ assert_equal(a['a'].shape, b['a'].shape)
+ assert_equal(a.T['a'].shape, a.T.copy()['a'].shape)
+
+ def test_subarray_comparison(self):
+ # Check that comparisons between record arrays with
+ # multi-dimensional field types work properly
+ a = np.rec.fromrecords(
+ [([1, 2, 3], 'a', [[1, 2], [3, 4]]), ([3, 3, 3], 'b', [[0, 0], [0, 0]])],
+ dtype=[('a', ('f4', 3)), ('b', object), ('c', ('i4', (2, 2)))])
+ b = a.copy()
+ assert_equal(a == b, [True, True])
+ assert_equal(a != b, [False, False])
+ b[1].b = 'c'
+ assert_equal(a == b, [True, False])
+ assert_equal(a != b, [False, True])
+ for i in range(3):
+ b[0].a = a[0].a
+ b[0].a[i] = 5
+ assert_equal(a == b, [False, False])
+ assert_equal(a != b, [True, True])
+ for i in range(2):
+ for j in range(2):
+ b = a.copy()
+ b[0].c[i, j] = 10
+ assert_equal(a == b, [False, True])
+ assert_equal(a != b, [True, False])
+
+ # Check that broadcasting with a subarray works
+ a = np.array([[(0,)], [(1,)]], dtype=[('a', 'f8')])
+ b = np.array([(0,), (0,), (1,)], dtype=[('a', 'f8')])
+ assert_equal(a == b, [[True, True, False], [False, False, True]])
+ assert_equal(b == a, [[True, True, False], [False, False, True]])
+ a = np.array([[(0,)], [(1,)]], dtype=[('a', 'f8', (1,))])
+ b = np.array([(0,), (0,), (1,)], dtype=[('a', 'f8', (1,))])
+ assert_equal(a == b, [[True, True, False], [False, False, True]])
+ assert_equal(b == a, [[True, True, False], [False, False, True]])
+ a = np.array([[([0, 0],)], [([1, 1],)]], dtype=[('a', 'f8', (2,))])
+ b = np.array([([0, 0],), ([0, 1],), ([1, 1],)], dtype=[('a', 'f8', (2,))])
+ assert_equal(a == b, [[True, False, False], [False, False, True]])
+ assert_equal(b == a, [[True, False, False], [False, False, True]])
+
+ # Check that broadcasting Fortran-style arrays with a subarray work
+ a = np.array([[([0, 0],)], [([1, 1],)]], dtype=[('a', 'f8', (2,))], order='F')
+ b = np.array([([0, 0],), ([0, 1],), ([1, 1],)], dtype=[('a', 'f8', (2,))])
+ assert_equal(a == b, [[True, False, False], [False, False, True]])
+ assert_equal(b == a, [[True, False, False], [False, False, True]])
+
+        # Check that incompatible sub-array shapes don't result in broadcasting
+ x = np.zeros((1,), dtype=[('a', ('f4', (1, 2))), ('b', 'i1')])
+ y = np.zeros((1,), dtype=[('a', ('f4', (2,))), ('b', 'i1')])
+ # This comparison invokes deprecated behaviour, and will probably
+ # start raising an error eventually. What we really care about in this
+ # test is just that it doesn't return True.
+ with suppress_warnings() as sup:
+ sup.filter(FutureWarning, "elementwise == comparison failed")
+ assert_equal(x == y, False)
+
+ x = np.zeros((1,), dtype=[('a', ('f4', (2, 1))), ('b', 'i1')])
+ y = np.zeros((1,), dtype=[('a', ('f4', (2,))), ('b', 'i1')])
+ # This comparison invokes deprecated behaviour, and will probably
+ # start raising an error eventually. What we really care about in this
+ # test is just that it doesn't return True.
+ with suppress_warnings() as sup:
+ sup.filter(FutureWarning, "elementwise == comparison failed")
+ assert_equal(x == y, False)
+
+ # Check that structured arrays that are different only in
+ # byte-order work
+ a = np.array([(5, 42), (10, 1)], dtype=[('a', '>i8'), ('b', '<f8')])
+ b = np.array([(5, 43), (10, 1)], dtype=[('a', '<i8'), ('b', '>f8')])
+ assert_equal(a == b, [False, True])
+
+ def test_casting(self):
+ # Check that casting a structured array to change its byte order
+ # works
+ a = np.array([(1,)], dtype=[('a', '<i4')])
+ assert_(np.can_cast(a.dtype, [('a', '>i4')], casting='unsafe'))
+ b = a.astype([('a', '>i4')])
+ assert_equal(b, a.byteswap().newbyteorder())
+ assert_equal(a['a'][0], b['a'][0])
+
+ # Check that equality comparison works on structured arrays if
+ # they are 'equiv'-castable
+ a = np.array([(5, 42), (10, 1)], dtype=[('a', '>i4'), ('b', '<f8')])
+ b = np.array([(5, 42), (10, 1)], dtype=[('a', '<i4'), ('b', '>f8')])
+ assert_(np.can_cast(a.dtype, b.dtype, casting='equiv'))
+ assert_equal(a == b, [True, True])
+
+ # Check that 'equiv' casting can change byte order
+ assert_(np.can_cast(a.dtype, b.dtype, casting='equiv'))
+ c = a.astype(b.dtype, casting='equiv')
+ assert_equal(a == c, [True, True])
+
+ # Check that 'safe' casting can change byte order and up-cast
+ # fields
+ t = [('a', '<i8'), ('b', '>f8')]
+ assert_(np.can_cast(a.dtype, t, casting='safe'))
+ c = a.astype(t, casting='safe')
+ assert_equal((c == np.array([(5, 42), (10, 1)], dtype=t)),
+ [True, True])
+
+ # Check that 'same_kind' casting can change byte order and
+ # change field widths within a "kind"
+ t = [('a', '<i4'), ('b', '>f4')]
+ assert_(np.can_cast(a.dtype, t, casting='same_kind'))
+ c = a.astype(t, casting='same_kind')
+ assert_equal((c == np.array([(5, 42), (10, 1)], dtype=t)),
+ [True, True])
+
+ # Check that casting fails if the casting rule should fail on
+ # any of the fields
+ t = [('a', '>i8'), ('b', '<f4')]
+ assert_(not np.can_cast(a.dtype, t, casting='safe'))
+ assert_raises(TypeError, a.astype, t, casting='safe')
+ t = [('a', '>i2'), ('b', '<f8')]
+ assert_(not np.can_cast(a.dtype, t, casting='equiv'))
+ assert_raises(TypeError, a.astype, t, casting='equiv')
+ t = [('a', '>i8'), ('b', '<i2')]
+ assert_(not np.can_cast(a.dtype, t, casting='same_kind'))
+ assert_raises(TypeError, a.astype, t, casting='same_kind')
+ assert_(not np.can_cast(a.dtype, b.dtype, casting='no'))
+ assert_raises(TypeError, a.astype, b.dtype, casting='no')
+
+ # Check that non-'unsafe' casting can't change the set of field names
+ for casting in ['no', 'safe', 'equiv', 'same_kind']:
+ t = [('a', '>i4')]
+ assert_(not np.can_cast(a.dtype, t, casting=casting))
+ t = [('a', '>i4'), ('b', '<f8'), ('c', 'i4')]
+ assert_(not np.can_cast(a.dtype, t, casting=casting))
+
+ def test_objview(self):
+ # https://github.com/numpy/numpy/issues/3286
+ a = np.array([], dtype=[('a', 'f'), ('b', 'f'), ('c', 'O')])
+ a[['a', 'b']] # TypeError?
+
+ # https://github.com/numpy/numpy/issues/3253
+ dat2 = np.zeros(3, [('A', 'i'), ('B', '|O')])
+ dat2[['B', 'A']] # TypeError?
+
+ def test_setfield(self):
+ # https://github.com/numpy/numpy/issues/3126
+ struct_dt = np.dtype([('elem', 'i4', 5),])
+ dt = np.dtype([('field', 'i4', 10),('struct', struct_dt)])
+ x = np.zeros(1, dt)
+ x[0]['field'] = np.ones(10, dtype='i4')
+ x[0]['struct'] = np.ones(1, dtype=struct_dt)
+ assert_equal(x[0]['field'], np.ones(10, dtype='i4'))
+
+ def test_setfield_object(self):
+ # make sure object field assignment with ndarray value
+ # on void scalar mimics setitem behavior
+ b = np.zeros(1, dtype=[('x', 'O')])
+ # next line should work identically to b['x'][0] = np.arange(3)
+ b[0]['x'] = np.arange(3)
+ assert_equal(b[0]['x'], np.arange(3))
+
+ # check that broadcasting check still works
+ c = np.zeros(1, dtype=[('x', 'O', 5)])
+
+ def testassign():
+ c[0]['x'] = np.arange(3)
+
+ assert_raises(ValueError, testassign)
+
+ def test_zero_width_string(self):
+ # Test for PR #6430 / issues #473, #4955, #2585
+
+ dt = np.dtype([('I', int), ('S', 'S0')])
+
+ x = np.zeros(4, dtype=dt)
+
+ assert_equal(x['S'], [b'', b'', b'', b''])
+ assert_equal(x['S'].itemsize, 0)
+
+ x['S'] = ['a', 'b', 'c', 'd']
+ assert_equal(x['S'], [b'', b'', b'', b''])
+ assert_equal(x['I'], [0, 0, 0, 0])
+
+ # Variation on test case from #4955
+ x['S'][x['I'] == 0] = 'hello'
+ assert_equal(x['S'], [b'', b'', b'', b''])
+ assert_equal(x['I'], [0, 0, 0, 0])
+
+ # Variation on test case from #2585
+ x['S'] = 'A'
+ assert_equal(x['S'], [b'', b'', b'', b''])
+ assert_equal(x['I'], [0, 0, 0, 0])
+
+ # Allow zero-width dtypes in ndarray constructor
+ y = np.ndarray(4, dtype=x['S'].dtype)
+ assert_equal(y.itemsize, 0)
+ assert_equal(x['S'], y)
+
+ # More tests for indexing an array with zero-width fields
+ assert_equal(np.zeros(4, dtype=[('a', 'S0,S0'),
+ ('b', 'u1')])['a'].itemsize, 0)
+ assert_equal(np.empty(3, dtype='S0,S0').itemsize, 0)
+ assert_equal(np.zeros(4, dtype='S0,u1')['f0'].itemsize, 0)
+
+ xx = x['S'].reshape((2, 2))
+ assert_equal(xx.itemsize, 0)
+ assert_equal(xx, [[b'', b''], [b'', b'']])
+ # check for no uninitialized memory due to viewing S0 array
+ assert_equal(xx[:].dtype, xx.dtype)
+ assert_array_equal(eval(repr(xx), dict(array=np.array)), xx)
+
+ b = io.BytesIO()
+ np.save(b, xx)
+
+ b.seek(0)
+ yy = np.load(b)
+ assert_equal(yy.itemsize, 0)
+ assert_equal(xx, yy)
+
+ with temppath(suffix='.npy') as tmp:
+ np.save(tmp, xx)
+ yy = np.load(tmp)
+ assert_equal(yy.itemsize, 0)
+ assert_equal(xx, yy)
+
+ def test_base_attr(self):
+ a = np.zeros(3, dtype='i4,f4')
+ b = a[0]
+ assert_(b.base is a)
+
+ def test_assignment(self):
+ def testassign(arr, v):
+ c = arr.copy()
+ c[0] = v # assign using setitem
+ c[1:] = v # assign using "dtype_transfer" code paths
+ return c
+
+ dt = np.dtype([('foo', 'i8'), ('bar', 'i8')])
+ arr = np.ones(2, dt)
+ v1 = np.array([(2,3)], dtype=[('foo', 'i8'), ('bar', 'i8')])
+ v2 = np.array([(2,3)], dtype=[('bar', 'i8'), ('foo', 'i8')])
+ v3 = np.array([(2,3)], dtype=[('bar', 'i8'), ('baz', 'i8')])
+ v4 = np.array([(2,)], dtype=[('bar', 'i8')])
+ v5 = np.array([(2,3)], dtype=[('foo', 'f8'), ('bar', 'f8')])
+ w = arr.view({'names': ['bar'], 'formats': ['i8'], 'offsets': [8]})
+
+ ans = np.array([(2,3),(2,3)], dtype=dt)
+ assert_equal(testassign(arr, v1), ans)
+ assert_equal(testassign(arr, v2), ans)
+ assert_equal(testassign(arr, v3), ans)
+ assert_raises(ValueError, lambda: testassign(arr, v4))
+ assert_equal(testassign(arr, v5), ans)
+ w[:] = 4
+ assert_equal(arr, np.array([(1,4),(1,4)], dtype=dt))
+
+ # test field-reordering, assignment by position, and self-assignment
+ a = np.array([(1,2,3)],
+ dtype=[('foo', 'i8'), ('bar', 'i8'), ('baz', 'f4')])
+ a[['foo', 'bar']] = a[['bar', 'foo']]
+ assert_equal(a[0].item(), (2,1,3))
+
+ # test that this works even for 'simple_unaligned' structs
+ # (ie, that PyArray_EquivTypes cares about field order too)
+ a = np.array([(1,2)], dtype=[('a', 'i4'), ('b', 'i4')])
+ a[['a', 'b']] = a[['b', 'a']]
+ assert_equal(a[0].item(), (2,1))
+
+ def test_structuredscalar_indexing(self):
+ # test gh-7262
+ x = np.empty(shape=1, dtype="(2)3S,(2)3U")
+ assert_equal(x[["f0","f1"]][0], x[0][["f0","f1"]])
+ assert_equal(x[0], x[0][()])
+
+ def test_multiindex_titles(self):
+ a = np.zeros(4, dtype=[(('a', 'b'), 'i'), ('c', 'i'), ('d', 'i')])
+ assert_raises(KeyError, lambda : a[['a','c']])
+ assert_raises(KeyError, lambda : a[['a','a']])
+ assert_raises(ValueError, lambda : a[['b','b']]) # field exists, but repeated
+ a[['b','c']] # no exception
+
+
+class TestBool(object):
+ def test_test_interning(self):
+ a0 = np.bool_(0)
+ b0 = np.bool_(False)
+ assert_(a0 is b0)
+ a1 = np.bool_(1)
+ b1 = np.bool_(True)
+ assert_(a1 is b1)
+ assert_(np.array([True])[0] is a1)
+ assert_(np.array(True)[()] is a1)
+
+ def test_sum(self):
+ d = np.ones(101, dtype=bool)
+ assert_equal(d.sum(), d.size)
+ assert_equal(d[::2].sum(), d[::2].size)
+ assert_equal(d[::-2].sum(), d[::-2].size)
+
+ d = np.frombuffer(b'\xff\xff' * 100, dtype=bool)
+ assert_equal(d.sum(), d.size)
+ assert_equal(d[::2].sum(), d[::2].size)
+ assert_equal(d[::-2].sum(), d[::-2].size)
+
+ def check_count_nonzero(self, power, length):
+ powers = [2 ** i for i in range(length)]
+ for i in range(2**power):
+ l = [(i & x) != 0 for x in powers]
+ a = np.array(l, dtype=bool)
+ c = builtins.sum(l)
+ assert_equal(np.count_nonzero(a), c)
+ av = a.view(np.uint8)
+ av *= 3
+ assert_equal(np.count_nonzero(a), c)
+ av *= 4
+ assert_equal(np.count_nonzero(a), c)
+ av[av != 0] = 0xFF
+ assert_equal(np.count_nonzero(a), c)
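+        # Explanatory note (not part of the original test): the uint8 view is
+        # rewritten in place so the same boolean buffer is re-counted with byte
+        # values other than 0/1 (3, then 12, then 0xFF); count_nonzero must
+        # treat any nonzero byte as True, so the expected count never changes.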
+
+ def test_count_nonzero(self):
+ # check all 12 bit combinations in a length 17 array
+ # covers most cases of the 16 byte unrolled code
+ self.check_count_nonzero(12, 17)
+
+ @pytest.mark.slow
+ def test_count_nonzero_all(self):
+ # check all combinations in a length 17 array
+ # covers all cases of the 16 byte unrolled code
+ self.check_count_nonzero(17, 17)
+
+ def test_count_nonzero_unaligned(self):
+ # prevent mistakes as e.g. gh-4060
+ for o in range(7):
+ a = np.zeros((18,), dtype=bool)[o+1:]
+ a[:o] = True
+ assert_equal(np.count_nonzero(a), builtins.sum(a.tolist()))
+ a = np.ones((18,), dtype=bool)[o+1:]
+ a[:o] = False
+ assert_equal(np.count_nonzero(a), builtins.sum(a.tolist()))
+
+ def _test_cast_from_flexible(self, dtype):
+ # empty string -> false
+ for n in range(3):
+ v = np.array(b'', (dtype, n))
+ assert_equal(bool(v), False)
+ assert_equal(bool(v[()]), False)
+ assert_equal(v.astype(bool), False)
+ assert_(isinstance(v.astype(bool), np.ndarray))
+ assert_(v[()].astype(bool) is np.False_)
+
+ # anything else -> true
+ for n in range(1, 4):
+ for val in [b'a', b'0', b' ']:
+ v = np.array(val, (dtype, n))
+ assert_equal(bool(v), True)
+ assert_equal(bool(v[()]), True)
+ assert_equal(v.astype(bool), True)
+ assert_(isinstance(v.astype(bool), np.ndarray))
+ assert_(v[()].astype(bool) is np.True_)
+
+ def test_cast_from_void(self):
+ self._test_cast_from_flexible(np.void)
+
+ @pytest.mark.xfail(reason="See gh-9847")
+ def test_cast_from_unicode(self):
+ self._test_cast_from_flexible(np.unicode_)
+
+ @pytest.mark.xfail(reason="See gh-9847")
+ def test_cast_from_bytes(self):
+ self._test_cast_from_flexible(np.bytes_)
+
+
+class TestZeroSizeFlexible(object):
+ @staticmethod
+ def _zeros(shape, dtype=str):
+ dtype = np.dtype(dtype)
+ if dtype == np.void:
+ return np.zeros(shape, dtype=(dtype, 0))
+
+ # not constructable directly
+ dtype = np.dtype([('x', dtype, 0)])
+ return np.zeros(shape, dtype=dtype)['x']
+
+ def test_create(self):
+ zs = self._zeros(10, bytes)
+ assert_equal(zs.itemsize, 0)
+ zs = self._zeros(10, np.void)
+ assert_equal(zs.itemsize, 0)
+ zs = self._zeros(10, unicode)
+ assert_equal(zs.itemsize, 0)
+
+ def _test_sort_partition(self, name, kinds, **kwargs):
+ # Previously, these would all hang
+ for dt in [bytes, np.void, unicode]:
+ zs = self._zeros(10, dt)
+ sort_method = getattr(zs, name)
+ sort_func = getattr(np, name)
+ for kind in kinds:
+ sort_method(kind=kind, **kwargs)
+ sort_func(zs, kind=kind, **kwargs)
+
+ def test_sort(self):
+ self._test_sort_partition('sort', kinds='qhm')
+
+ def test_argsort(self):
+ self._test_sort_partition('argsort', kinds='qhm')
+
+ def test_partition(self):
+ self._test_sort_partition('partition', kinds=['introselect'], kth=2)
+
+ def test_argpartition(self):
+ self._test_sort_partition('argpartition', kinds=['introselect'], kth=2)
+
+ def test_resize(self):
+ # previously an error
+ for dt in [bytes, np.void, unicode]:
+ zs = self._zeros(10, dt)
+ zs.resize(25)
+ zs.resize((10, 10))
+
+ def test_view(self):
+ for dt in [bytes, np.void, unicode]:
+ zs = self._zeros(10, dt)
+
+ # viewing as itself should be allowed
+ assert_equal(zs.view(dt).dtype, np.dtype(dt))
+
+ # viewing as any non-empty type gives an empty result
+ assert_equal(zs.view((dt, 1)).shape, (0,))
+
+ def test_pickle(self):
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ for dt in [bytes, np.void, unicode]:
+ zs = self._zeros(10, dt)
+ p = pickle.dumps(zs, protocol=proto)
+ zs2 = pickle.loads(p)
+
+ assert_equal(zs.dtype, zs2.dtype)
+
+ @pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
+ reason="requires pickle protocol 5")
+ def test_pickle_with_buffercallback(self):
+ array = np.arange(10)
+ buffers = []
+ bytes_string = pickle.dumps(array, buffer_callback=buffers.append,
+ protocol=5)
+ array_from_buffer = pickle.loads(bytes_string, buffers=buffers)
+ # when using pickle protocol 5 with buffer callbacks,
+ # array_from_buffer is reconstructed from a buffer holding a view
+ # to the initial array's data, so modifying an element in array
+ # should modify it in array_from_buffer too.
+ array[0] = -1
+ assert array_from_buffer[0] == -1, array_from_buffer[0]
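+        # Illustrative sketch (not part of the original test; assumes pickle
+        # protocol 5 is available, as guarded above): each entry collected by
+        # buffer_callback is a pickle.PickleBuffer exposing the array memory
+        # out-of-band, e.g.
+        #     mv = buffers[0].raw()            # memoryview over the array data
+        #     assert mv.nbytes == array.nbytes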
+
+
+class TestMethods(object):
+ def test_compress(self):
+ tgt = [[5, 6, 7, 8, 9]]
+ arr = np.arange(10).reshape(2, 5)
+ out = arr.compress([0, 1], axis=0)
+ assert_equal(out, tgt)
+
+ tgt = [[1, 3], [6, 8]]
+ out = arr.compress([0, 1, 0, 1, 0], axis=1)
+ assert_equal(out, tgt)
+
+ tgt = [[1], [6]]
+ arr = np.arange(10).reshape(2, 5)
+ out = arr.compress([0, 1], axis=1)
+ assert_equal(out, tgt)
+
+ arr = np.arange(10).reshape(2, 5)
+ out = arr.compress([0, 1])
+ assert_equal(out, 1)
+
+ def test_choose(self):
+ x = 2*np.ones((3,), dtype=int)
+ y = 3*np.ones((3,), dtype=int)
+ x2 = 2*np.ones((2, 3), dtype=int)
+ y2 = 3*np.ones((2, 3), dtype=int)
+ ind = np.array([0, 0, 1])
+
+ A = ind.choose((x, y))
+ assert_equal(A, [2, 2, 3])
+
+ A = ind.choose((x2, y2))
+ assert_equal(A, [[2, 2, 3], [2, 2, 3]])
+
+ A = ind.choose((x, y2))
+ assert_equal(A, [[2, 2, 3], [2, 2, 3]])
+
+ oned = np.ones(1)
+ # gh-12031, caused SEGFAULT
+        assert_raises(TypeError, oned.choose, np.void(0), [oned])
+
+ def test_prod(self):
+ ba = [1, 2, 10, 11, 6, 5, 4]
+ ba2 = [[1, 2, 3, 4], [5, 6, 7, 9], [10, 3, 4, 5]]
+
+ for ctype in [np.int16, np.uint16, np.int32, np.uint32,
+ np.float32, np.float64, np.complex64, np.complex128]:
+ a = np.array(ba, ctype)
+ a2 = np.array(ba2, ctype)
+ if ctype in ['1', 'b']:
+ assert_raises(ArithmeticError, a.prod)
+ assert_raises(ArithmeticError, a2.prod, axis=1)
+ else:
+ assert_equal(a.prod(axis=0), 26400)
+ assert_array_equal(a2.prod(axis=0),
+ np.array([50, 36, 84, 180], ctype))
+ assert_array_equal(a2.prod(axis=-1),
+ np.array([24, 1890, 600], ctype))
+
+ def test_repeat(self):
+ m = np.array([1, 2, 3, 4, 5, 6])
+ m_rect = m.reshape((2, 3))
+
+ A = m.repeat([1, 3, 2, 1, 1, 2])
+ assert_equal(A, [1, 2, 2, 2, 3,
+ 3, 4, 5, 6, 6])
+
+ A = m.repeat(2)
+ assert_equal(A, [1, 1, 2, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6])
+
+ A = m_rect.repeat([2, 1], axis=0)
+ assert_equal(A, [[1, 2, 3],
+ [1, 2, 3],
+ [4, 5, 6]])
+
+ A = m_rect.repeat([1, 3, 2], axis=1)
+ assert_equal(A, [[1, 2, 2, 2, 3, 3],
+ [4, 5, 5, 5, 6, 6]])
+
+ A = m_rect.repeat(2, axis=0)
+ assert_equal(A, [[1, 2, 3],
+ [1, 2, 3],
+ [4, 5, 6],
+ [4, 5, 6]])
+
+ A = m_rect.repeat(2, axis=1)
+ assert_equal(A, [[1, 1, 2, 2, 3, 3],
+ [4, 4, 5, 5, 6, 6]])
+
+ def test_reshape(self):
+ arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
+
+ tgt = [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]
+ assert_equal(arr.reshape(2, 6), tgt)
+
+ tgt = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
+ assert_equal(arr.reshape(3, 4), tgt)
+
+ tgt = [[1, 10, 8, 6], [4, 2, 11, 9], [7, 5, 3, 12]]
+ assert_equal(arr.reshape((3, 4), order='F'), tgt)
+
+ tgt = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]]
+ assert_equal(arr.T.reshape((3, 4), order='C'), tgt)
+
+ def test_round(self):
+ def check_round(arr, expected, *round_args):
+ assert_equal(arr.round(*round_args), expected)
+ # With output array
+ out = np.zeros_like(arr)
+ res = arr.round(*round_args, out=out)
+ assert_equal(out, expected)
+ assert_equal(out, res)
+
+ check_round(np.array([1.2, 1.5]), [1, 2])
+ check_round(np.array(1.5), 2)
+ check_round(np.array([12.2, 15.5]), [10, 20], -1)
+ check_round(np.array([12.15, 15.51]), [12.2, 15.5], 1)
+ # Complex rounding
+ check_round(np.array([4.5 + 1.5j]), [4 + 2j])
+ check_round(np.array([12.5 + 15.5j]), [10 + 20j], -1)
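+        # Explanatory note (not part of the original test): ndarray.round uses
+        # round-half-to-even, so 1.5 -> 2 but 4.5 -> 4, and complex values are
+        # rounded component-wise, which is what the expectations above encode.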
+
+ def test_squeeze(self):
+ a = np.array([[[1], [2], [3]]])
+ assert_equal(a.squeeze(), [1, 2, 3])
+ assert_equal(a.squeeze(axis=(0,)), [[1], [2], [3]])
+ assert_raises(ValueError, a.squeeze, axis=(1,))
+ assert_equal(a.squeeze(axis=(2,)), [[1, 2, 3]])
+
+ def test_transpose(self):
+ a = np.array([[1, 2], [3, 4]])
+ assert_equal(a.transpose(), [[1, 3], [2, 4]])
+ assert_raises(ValueError, lambda: a.transpose(0))
+ assert_raises(ValueError, lambda: a.transpose(0, 0))
+ assert_raises(ValueError, lambda: a.transpose(0, 1, 2))
+
+ def test_sort(self):
+ # test ordering for floats and complex containing nans. It is only
+ # necessary to check the less-than comparison, so sorts that
+ # only follow the insertion sort path are sufficient. We only
+ # test doubles and complex doubles as the logic is the same.
+
+ # check doubles
+ msg = "Test real sort order with nans"
+ a = np.array([np.nan, 1, 0])
+ b = np.sort(a)
+ assert_equal(b, a[::-1], msg)
+ # check complex
+ msg = "Test complex sort order with nans"
+ a = np.zeros(9, dtype=np.complex128)
+ a.real += [np.nan, np.nan, np.nan, 1, 0, 1, 1, 0, 0]
+ a.imag += [np.nan, 1, 0, np.nan, np.nan, 1, 0, 1, 0]
+ b = np.sort(a)
+ assert_equal(b, a[::-1], msg)
+
+ # all c scalar sorts use the same code with different types
+ # so it suffices to run a quick check with one type. The number
+ # of sorted items must be greater than ~50 to check the actual
+ # algorithm because quick and merge sort fall over to insertion
+ # sort for small arrays.
+ a = np.arange(101)
+ b = a[::-1].copy()
+ for kind in ['q', 'm', 'h']:
+ msg = "scalar sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # test complex sorts. These use the same code as the scalars
+ # but the compare function differs.
+ ai = a*1j + 1
+ bi = b*1j + 1
+ for kind in ['q', 'm', 'h']:
+ msg = "complex sort, real part == 1, kind=%s" % kind
+ c = ai.copy()
+ c.sort(kind=kind)
+ assert_equal(c, ai, msg)
+ c = bi.copy()
+ c.sort(kind=kind)
+ assert_equal(c, ai, msg)
+ ai = a + 1j
+ bi = b + 1j
+ for kind in ['q', 'm', 'h']:
+ msg = "complex sort, imag part == 1, kind=%s" % kind
+ c = ai.copy()
+ c.sort(kind=kind)
+ assert_equal(c, ai, msg)
+ c = bi.copy()
+ c.sort(kind=kind)
+ assert_equal(c, ai, msg)
+
+ # test sorting of complex arrays requiring byte-swapping, gh-5441
+ for endianness in '<>':
+ for dt in np.typecodes['Complex']:
+ arr = np.array([1+3.j, 2+2.j, 3+1.j], dtype=endianness + dt)
+ c = arr.copy()
+ c.sort()
+ msg = 'byte-swapped complex sort, dtype={0}'.format(dt)
+ assert_equal(c, arr, msg)
+
+ # test string sorts.
+ s = 'aaaaaaaa'
+ a = np.array([s + chr(i) for i in range(101)])
+ b = a[::-1].copy()
+ for kind in ['q', 'm', 'h']:
+ msg = "string sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # test unicode sorts.
+ s = 'aaaaaaaa'
+ a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode)
+ b = a[::-1].copy()
+ for kind in ['q', 'm', 'h']:
+ msg = "unicode sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # test object array sorts.
+ a = np.empty((101,), dtype=object)
+ a[:] = list(range(101))
+ b = a[::-1]
+ for kind in ['q', 'h', 'm']:
+ msg = "object sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # test record array sorts.
+ dt = np.dtype([('f', float), ('i', int)])
+ a = np.array([(i, i) for i in range(101)], dtype=dt)
+ b = a[::-1]
+ for kind in ['q', 'h', 'm']:
+            msg = "record array sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # test datetime64 sorts.
+ a = np.arange(0, 101, dtype='datetime64[D]')
+ b = a[::-1]
+ for kind in ['q', 'h', 'm']:
+ msg = "datetime64 sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # test timedelta64 sorts.
+ a = np.arange(0, 101, dtype='timedelta64[D]')
+ b = a[::-1]
+ for kind in ['q', 'h', 'm']:
+ msg = "timedelta64 sort, kind=%s" % kind
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # check axis handling. This should be the same for all type
+ # specific sorts, so we only check it for one type and one kind
+ a = np.array([[3, 2], [1, 0]])
+ b = np.array([[1, 0], [3, 2]])
+ c = np.array([[2, 3], [0, 1]])
+ d = a.copy()
+ d.sort(axis=0)
+ assert_equal(d, b, "test sort with axis=0")
+ d = a.copy()
+ d.sort(axis=1)
+ assert_equal(d, c, "test sort with axis=1")
+ d = a.copy()
+ d.sort()
+ assert_equal(d, c, "test sort with default axis")
+
+ # check axis handling for multidimensional empty arrays
+ a = np.array([])
+ a.shape = (3, 2, 1, 0)
+ for axis in range(-a.ndim, a.ndim):
+ msg = 'test empty array sort with axis={0}'.format(axis)
+ assert_equal(np.sort(a, axis=axis), a, msg)
+ msg = 'test empty array sort with axis=None'
+ assert_equal(np.sort(a, axis=None), a.ravel(), msg)
+
+ # test generic class with bogus ordering,
+ # should not segfault.
+ class Boom(object):
+ def __lt__(self, other):
+ return True
+
+ a = np.array([Boom()]*100, dtype=object)
+ for kind in ['q', 'm', 'h']:
+            msg = "bogus comparison object sort, kind=%s" % kind
+            c = a.copy()
+            c.sort(kind=kind)
+
+ def test_void_sort(self):
+ # gh-8210 - previously segfaulted
+ for i in range(4):
+ rand = np.random.randint(256, size=4000, dtype=np.uint8)
+ arr = rand.view('V4')
+ arr[::-1].sort()
+
+ dt = np.dtype([('val', 'i4', (1,))])
+ for i in range(4):
+ rand = np.random.randint(256, size=4000, dtype=np.uint8)
+ arr = rand.view(dt)
+ arr[::-1].sort()
+
+ def test_sort_raises(self):
+ #gh-9404
+ arr = np.array([0, datetime.now(), 1], dtype=object)
+ for kind in ['q', 'm', 'h']:
+ assert_raises(TypeError, arr.sort, kind=kind)
+ #gh-3879
+ class Raiser(object):
+ def raises_anything(*args, **kwargs):
+ raise TypeError("SOMETHING ERRORED")
+ __eq__ = __ne__ = __lt__ = __gt__ = __ge__ = __le__ = raises_anything
+ arr = np.array([[Raiser(), n] for n in range(10)]).reshape(-1)
+ np.random.shuffle(arr)
+ for kind in ['q', 'm', 'h']:
+ assert_raises(TypeError, arr.sort, kind=kind)
+
+ def test_sort_degraded(self):
+ # test degraded dataset would take minutes to run with normal qsort
+ d = np.arange(1000000)
+ do = d.copy()
+ x = d
+ # create a median of 3 killer where each median is the sorted second
+ # last element of the quicksort partition
+ while x.size > 3:
+ mid = x.size // 2
+ x[mid], x[-2] = x[-2], x[mid]
+ x = x[:-2]
+
+ assert_equal(np.sort(d), do)
+ assert_equal(d[np.argsort(d)], do)
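+        # Explanatory note (not part of the original test): the loop above
+        # builds a classic median-of-3 "killer" sequence, which drives a plain
+        # median-of-3 quicksort to O(n^2); np.sort's quicksort is an introsort
+        # that falls back to heapsort once recursion gets too deep, which is
+        # why this still finishes quickly.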
+
+ def test_copy(self):
+ def assert_fortran(arr):
+ assert_(arr.flags.fortran)
+ assert_(arr.flags.f_contiguous)
+ assert_(not arr.flags.c_contiguous)
+
+ def assert_c(arr):
+ assert_(not arr.flags.fortran)
+ assert_(not arr.flags.f_contiguous)
+ assert_(arr.flags.c_contiguous)
+
+ a = np.empty((2, 2), order='F')
+ # Test copying a Fortran array
+ assert_c(a.copy())
+ assert_c(a.copy('C'))
+ assert_fortran(a.copy('F'))
+ assert_fortran(a.copy('A'))
+
+ # Now test starting with a C array.
+ a = np.empty((2, 2), order='C')
+ assert_c(a.copy())
+ assert_c(a.copy('C'))
+ assert_fortran(a.copy('F'))
+ assert_c(a.copy('A'))
+
+ def test_sort_order(self):
+ # Test sorting an array with fields
+ x1 = np.array([21, 32, 14])
+ x2 = np.array(['my', 'first', 'name'])
+ x3 = np.array([3.1, 4.5, 6.2])
+ r = np.rec.fromarrays([x1, x2, x3], names='id,word,number')
+
+ r.sort(order=['id'])
+ assert_equal(r.id, np.array([14, 21, 32]))
+ assert_equal(r.word, np.array(['name', 'my', 'first']))
+ assert_equal(r.number, np.array([6.2, 3.1, 4.5]))
+
+ r.sort(order=['word'])
+ assert_equal(r.id, np.array([32, 21, 14]))
+ assert_equal(r.word, np.array(['first', 'my', 'name']))
+ assert_equal(r.number, np.array([4.5, 3.1, 6.2]))
+
+ r.sort(order=['number'])
+ assert_equal(r.id, np.array([21, 32, 14]))
+ assert_equal(r.word, np.array(['my', 'first', 'name']))
+ assert_equal(r.number, np.array([3.1, 4.5, 6.2]))
+
+ assert_raises_regex(ValueError, 'duplicate',
+ lambda: r.sort(order=['id', 'id']))
+
+ if sys.byteorder == 'little':
+ strtype = '>i2'
+ else:
+ strtype = '<i2'
+ mydtype = [('name', strchar + '5'), ('col2', strtype)]
+ r = np.array([('a', 1), ('b', 255), ('c', 3), ('d', 258)],
+ dtype=mydtype)
+ r.sort(order='col2')
+ assert_equal(r['col2'], [1, 3, 255, 258])
+ assert_equal(r, np.array([('a', 1), ('c', 3), ('b', 255), ('d', 258)],
+ dtype=mydtype))
+
+ def test_argsort(self):
+ # all c scalar argsorts use the same code with different types
+ # so it suffices to run a quick check with one type. The number
+ # of sorted items must be greater than ~50 to check the actual
+ # algorithm because quick and merge sort fall over to insertion
+ # sort for small arrays.
+ a = np.arange(101)
+ b = a[::-1].copy()
+ for kind in ['q', 'm', 'h']:
+ msg = "scalar argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), a, msg)
+ assert_equal(b.copy().argsort(kind=kind), b, msg)
+
+ # test complex argsorts. These use the same code as the scalars
+ # but the compare function differs.
+ ai = a*1j + 1
+ bi = b*1j + 1
+ for kind in ['q', 'm', 'h']:
+ msg = "complex argsort, kind=%s" % kind
+ assert_equal(ai.copy().argsort(kind=kind), a, msg)
+ assert_equal(bi.copy().argsort(kind=kind), b, msg)
+ ai = a + 1j
+ bi = b + 1j
+ for kind in ['q', 'm', 'h']:
+ msg = "complex argsort, kind=%s" % kind
+ assert_equal(ai.copy().argsort(kind=kind), a, msg)
+ assert_equal(bi.copy().argsort(kind=kind), b, msg)
+
+ # test argsort of complex arrays requiring byte-swapping, gh-5441
+ for endianness in '<>':
+ for dt in np.typecodes['Complex']:
+ arr = np.array([1+3.j, 2+2.j, 3+1.j], dtype=endianness + dt)
+ msg = 'byte-swapped complex argsort, dtype={0}'.format(dt)
+ assert_equal(arr.argsort(),
+ np.arange(len(arr), dtype=np.intp), msg)
+
+ # test string argsorts.
+ s = 'aaaaaaaa'
+ a = np.array([s + chr(i) for i in range(101)])
+ b = a[::-1].copy()
+ r = np.arange(101)
+ rr = r[::-1]
+ for kind in ['q', 'm', 'h']:
+ msg = "string argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), r, msg)
+ assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+ # test unicode argsorts.
+ s = 'aaaaaaaa'
+ a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode)
+ b = a[::-1]
+ r = np.arange(101)
+ rr = r[::-1]
+ for kind in ['q', 'm', 'h']:
+ msg = "unicode argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), r, msg)
+ assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+ # test object array argsorts.
+ a = np.empty((101,), dtype=object)
+ a[:] = list(range(101))
+ b = a[::-1]
+ r = np.arange(101)
+ rr = r[::-1]
+ for kind in ['q', 'm', 'h']:
+ msg = "object argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), r, msg)
+ assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+ # test structured array argsorts.
+ dt = np.dtype([('f', float), ('i', int)])
+ a = np.array([(i, i) for i in range(101)], dtype=dt)
+ b = a[::-1]
+ r = np.arange(101)
+ rr = r[::-1]
+ for kind in ['q', 'm', 'h']:
+ msg = "structured array argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), r, msg)
+ assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+ # test datetime64 argsorts.
+ a = np.arange(0, 101, dtype='datetime64[D]')
+ b = a[::-1]
+ r = np.arange(101)
+ rr = r[::-1]
+ for kind in ['q', 'h', 'm']:
+ msg = "datetime64 argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), r, msg)
+ assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+ # test timedelta64 argsorts.
+ a = np.arange(0, 101, dtype='timedelta64[D]')
+ b = a[::-1]
+ r = np.arange(101)
+ rr = r[::-1]
+ for kind in ['q', 'h', 'm']:
+ msg = "timedelta64 argsort, kind=%s" % kind
+ assert_equal(a.copy().argsort(kind=kind), r, msg)
+ assert_equal(b.copy().argsort(kind=kind), rr, msg)
+
+ # check axis handling. This should be the same for all type
+ # specific argsorts, so we only check it for one type and one kind
+ a = np.array([[3, 2], [1, 0]])
+ b = np.array([[1, 1], [0, 0]])
+ c = np.array([[1, 0], [1, 0]])
+ assert_equal(a.copy().argsort(axis=0), b)
+ assert_equal(a.copy().argsort(axis=1), c)
+ assert_equal(a.copy().argsort(), c)
+
+ # check axis handling for multidimensional empty arrays
+ a = np.array([])
+ a.shape = (3, 2, 1, 0)
+ for axis in range(-a.ndim, a.ndim):
+ msg = 'test empty array argsort with axis={0}'.format(axis)
+ assert_equal(np.argsort(a, axis=axis),
+ np.zeros_like(a, dtype=np.intp), msg)
+ msg = 'test empty array argsort with axis=None'
+ assert_equal(np.argsort(a, axis=None),
+ np.zeros_like(a.ravel(), dtype=np.intp), msg)
+
+ # check that stable argsorts are stable
+ r = np.arange(100)
+ # scalars
+ a = np.zeros(100)
+ assert_equal(a.argsort(kind='m'), r)
+ # complex
+ a = np.zeros(100, dtype=complex)
+ assert_equal(a.argsort(kind='m'), r)
+ # string
+ a = np.array(['aaaaaaaaa' for i in range(100)])
+ assert_equal(a.argsort(kind='m'), r)
+ # unicode
+ a = np.array(['aaaaaaaaa' for i in range(100)], dtype=np.unicode)
+ assert_equal(a.argsort(kind='m'), r)
+
+ def test_sort_unicode_kind(self):
+ d = np.arange(10)
+ k = b'\xc3\xa4'.decode("UTF8")
+ assert_raises(ValueError, d.sort, kind=k)
+ assert_raises(ValueError, d.argsort, kind=k)
+
+ def test_searchsorted(self):
+ # test for floats and complex containing nans. The logic is the
+ # same for all float types so only test double types for now.
+ # The search sorted routines use the compare functions for the
+ # array type, so this checks if that is consistent with the sort
+ # order.
+
+ # check double
+ a = np.array([0, 1, np.nan])
+ msg = "Test real searchsorted with nans, side='l'"
+ b = a.searchsorted(a, side='l')
+ assert_equal(b, np.arange(3), msg)
+ msg = "Test real searchsorted with nans, side='r'"
+ b = a.searchsorted(a, side='r')
+ assert_equal(b, np.arange(1, 4), msg)
+ # check double complex
+ a = np.zeros(9, dtype=np.complex128)
+ a.real += [0, 0, 1, 1, 0, 1, np.nan, np.nan, np.nan]
+ a.imag += [0, 1, 0, 1, np.nan, np.nan, 0, 1, np.nan]
+ msg = "Test complex searchsorted with nans, side='l'"
+ b = a.searchsorted(a, side='l')
+ assert_equal(b, np.arange(9), msg)
+ msg = "Test complex searchsorted with nans, side='r'"
+ b = a.searchsorted(a, side='r')
+ assert_equal(b, np.arange(1, 10), msg)
+ msg = "Test searchsorted with little endian, side='l'"
+ a = np.array([0, 128], dtype='<i4')
+ b = a.searchsorted(np.array(128, dtype='<i4'))
+ assert_equal(b, 1, msg)
+ msg = "Test searchsorted with big endian, side='l'"
+ a = np.array([0, 128], dtype='>i4')
+ b = a.searchsorted(np.array(128, dtype='>i4'))
+ assert_equal(b, 1, msg)
+
+ # Check 0 elements
+ a = np.ones(0)
+ b = a.searchsorted([0, 1, 2], 'l')
+ assert_equal(b, [0, 0, 0])
+ b = a.searchsorted([0, 1, 2], 'r')
+ assert_equal(b, [0, 0, 0])
+        # Check 1 element
+        a = np.ones(1)
+ b = a.searchsorted([0, 1, 2], 'l')
+ assert_equal(b, [0, 0, 1])
+ b = a.searchsorted([0, 1, 2], 'r')
+ assert_equal(b, [0, 1, 1])
+ # Check all elements equal
+ a = np.ones(2)
+ b = a.searchsorted([0, 1, 2], 'l')
+ assert_equal(b, [0, 0, 2])
+ b = a.searchsorted([0, 1, 2], 'r')
+ assert_equal(b, [0, 2, 2])
+
+ # Test searching unaligned array
+ a = np.arange(10)
+ aligned = np.empty(a.itemsize * a.size + 1, 'uint8')
+ unaligned = aligned[1:].view(a.dtype)
+ unaligned[:] = a
+ # Test searching unaligned array
+ b = unaligned.searchsorted(a, 'l')
+ assert_equal(b, a)
+ b = unaligned.searchsorted(a, 'r')
+ assert_equal(b, a + 1)
+ # Test searching for unaligned keys
+ b = a.searchsorted(unaligned, 'l')
+ assert_equal(b, a)
+ b = a.searchsorted(unaligned, 'r')
+ assert_equal(b, a + 1)
+
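+        # Explanatory note (not part of the original test): when several keys
+        # are searched, the C binary search reuses the previous position as a
+        # lower bound while the keys increase and resets it otherwise; the
+        # descending keys below exercise that reset path.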
+ # Test smart resetting of binsearch indices
+ a = np.arange(5)
+ b = a.searchsorted([6, 5, 4], 'l')
+ assert_equal(b, [5, 5, 4])
+ b = a.searchsorted([6, 5, 4], 'r')
+ assert_equal(b, [5, 5, 5])
+
+ # Test all type specific binary search functions
+ types = ''.join((np.typecodes['AllInteger'], np.typecodes['AllFloat'],
+ np.typecodes['Datetime'], '?O'))
+ for dt in types:
+ if dt == 'M':
+ dt = 'M8[D]'
+ if dt == '?':
+ a = np.arange(2, dtype=dt)
+ out = np.arange(2)
+ else:
+ a = np.arange(0, 5, dtype=dt)
+ out = np.arange(5)
+ b = a.searchsorted(a, 'l')
+ assert_equal(b, out)
+ b = a.searchsorted(a, 'r')
+ assert_equal(b, out + 1)
+ # Test empty array, use a fresh array to get warnings in
+ # valgrind if access happens.
+ e = np.ndarray(shape=0, buffer=b'', dtype=dt)
+ b = e.searchsorted(a, 'l')
+ assert_array_equal(b, np.zeros(len(a), dtype=np.intp))
+ b = a.searchsorted(e, 'l')
+ assert_array_equal(b, np.zeros(0, dtype=np.intp))
+
+ def test_searchsorted_unicode(self):
+ # Test searchsorted on unicode strings.
+
+ # 1.6.1 contained a string length miscalculation in
+ # arraytypes.c.src:UNICODE_compare() which manifested as
+ # incorrect/inconsistent results from searchsorted.
+ a = np.array(['P:\\20x_dapi_cy3\\20x_dapi_cy3_20100185_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100186_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100187_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100189_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100190_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100191_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100192_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100193_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100194_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100195_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100196_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100197_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100198_1',
+ 'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100199_1'],
+ dtype=np.unicode)
+ ind = np.arange(len(a))
+ assert_equal([a.searchsorted(v, 'left') for v in a], ind)
+ assert_equal([a.searchsorted(v, 'right') for v in a], ind + 1)
+ assert_equal([a.searchsorted(a[i], 'left') for i in ind], ind)
+ assert_equal([a.searchsorted(a[i], 'right') for i in ind], ind + 1)
+
+ def test_searchsorted_with_sorter(self):
+ a = np.array([5, 2, 1, 3, 4])
+ s = np.argsort(a)
+ assert_raises(TypeError, np.searchsorted, a, 0, sorter=(1, (2, 3)))
+ assert_raises(TypeError, np.searchsorted, a, 0, sorter=[1.1])
+ assert_raises(ValueError, np.searchsorted, a, 0, sorter=[1, 2, 3, 4])
+ assert_raises(ValueError, np.searchsorted, a, 0, sorter=[1, 2, 3, 4, 5, 6])
+
+ # bounds check
+ assert_raises(ValueError, np.searchsorted, a, 4, sorter=[0, 1, 2, 3, 5])
+ assert_raises(ValueError, np.searchsorted, a, 0, sorter=[-1, 0, 1, 2, 3])
+ assert_raises(ValueError, np.searchsorted, a, 0, sorter=[4, 0, -1, 2, 3])
+
+ a = np.random.rand(300)
+ s = a.argsort()
+ b = np.sort(a)
+ k = np.linspace(0, 1, 20)
+ assert_equal(b.searchsorted(k), a.searchsorted(k, sorter=s))
+
+ a = np.array([0, 1, 2, 3, 5]*20)
+ s = a.argsort()
+ k = [0, 1, 2, 3, 5]
+ expected = [0, 20, 40, 60, 80]
+ assert_equal(a.searchsorted(k, side='l', sorter=s), expected)
+ expected = [20, 40, 60, 80, 100]
+ assert_equal(a.searchsorted(k, side='r', sorter=s), expected)
+
+ # Test searching unaligned array
+ keys = np.arange(10)
+ a = keys.copy()
+        np.random.shuffle(a)
+        s = a.argsort()
+ aligned = np.empty(a.itemsize * a.size + 1, 'uint8')
+ unaligned = aligned[1:].view(a.dtype)
+ # Test searching unaligned array
+ unaligned[:] = a
+ b = unaligned.searchsorted(keys, 'l', s)
+ assert_equal(b, keys)
+ b = unaligned.searchsorted(keys, 'r', s)
+ assert_equal(b, keys + 1)
+ # Test searching for unaligned keys
+ unaligned[:] = keys
+ b = a.searchsorted(unaligned, 'l', s)
+ assert_equal(b, keys)
+ b = a.searchsorted(unaligned, 'r', s)
+ assert_equal(b, keys + 1)
+
+ # Test all type specific indirect binary search functions
+ types = ''.join((np.typecodes['AllInteger'], np.typecodes['AllFloat'],
+ np.typecodes['Datetime'], '?O'))
+ for dt in types:
+ if dt == 'M':
+ dt = 'M8[D]'
+ if dt == '?':
+ a = np.array([1, 0], dtype=dt)
+ # We want the sorter array to be of a type that is different
+ # from np.intp in all platforms, to check for #4698
+ s = np.array([1, 0], dtype=np.int16)
+ out = np.array([1, 0])
+ else:
+ a = np.array([3, 4, 1, 2, 0], dtype=dt)
+ # We want the sorter array to be of a type that is different
+ # from np.intp in all platforms, to check for #4698
+ s = np.array([4, 2, 3, 0, 1], dtype=np.int16)
+ out = np.array([3, 4, 1, 2, 0], dtype=np.intp)
+ b = a.searchsorted(a, 'l', s)
+ assert_equal(b, out)
+ b = a.searchsorted(a, 'r', s)
+ assert_equal(b, out + 1)
+ # Test empty array, use a fresh array to get warnings in
+ # valgrind if access happens.
+ e = np.ndarray(shape=0, buffer=b'', dtype=dt)
+ b = e.searchsorted(a, 'l', s[:0])
+ assert_array_equal(b, np.zeros(len(a), dtype=np.intp))
+ b = a.searchsorted(e, 'l', s)
+ assert_array_equal(b, np.zeros(0, dtype=np.intp))
+
+ # Test non-contiguous sorter array
+ a = np.array([3, 4, 1, 2, 0])
+ srt = np.empty((10,), dtype=np.intp)
+ srt[1::2] = -1
+ srt[::2] = [4, 2, 3, 0, 1]
+ s = srt[::2]
+ out = np.array([3, 4, 1, 2, 0], dtype=np.intp)
+ b = a.searchsorted(a, 'l', s)
+ assert_equal(b, out)
+ b = a.searchsorted(a, 'r', s)
+ assert_equal(b, out + 1)
+
+ def test_searchsorted_return_type(self):
+ # Functions returning indices should always return base ndarrays
+ class A(np.ndarray):
+ pass
+ a = np.arange(5).view(A)
+ b = np.arange(1, 3).view(A)
+ s = np.arange(5).view(A)
+ assert_(not isinstance(a.searchsorted(b, 'l'), A))
+ assert_(not isinstance(a.searchsorted(b, 'r'), A))
+ assert_(not isinstance(a.searchsorted(b, 'l', s), A))
+ assert_(not isinstance(a.searchsorted(b, 'r', s), A))
+
+ def test_argpartition_out_of_range(self):
+ # Test out of range values in kth raise an error, gh-5469
+ d = np.arange(10)
+ assert_raises(ValueError, d.argpartition, 10)
+ assert_raises(ValueError, d.argpartition, -11)
+ # Test also for generic type argpartition, which uses sorting
+ # and used to not bound check kth
+ d_obj = np.arange(10, dtype=object)
+ assert_raises(ValueError, d_obj.argpartition, 10)
+ assert_raises(ValueError, d_obj.argpartition, -11)
+
+ def test_partition_out_of_range(self):
+ # Test out of range values in kth raise an error, gh-5469
+ d = np.arange(10)
+ assert_raises(ValueError, d.partition, 10)
+ assert_raises(ValueError, d.partition, -11)
+ # Test also for generic type partition, which uses sorting
+ # and used to not bound check kth
+ d_obj = np.arange(10, dtype=object)
+ assert_raises(ValueError, d_obj.partition, 10)
+ assert_raises(ValueError, d_obj.partition, -11)
+
+ def test_argpartition_integer(self):
+        # Test that non-integer values in kth raise an error
+ d = np.arange(10)
+ assert_raises(TypeError, d.argpartition, 9.)
+ # Test also for generic type argpartition, which uses sorting
+ # and used to not bound check kth
+ d_obj = np.arange(10, dtype=object)
+ assert_raises(TypeError, d_obj.argpartition, 9.)
+
+ def test_partition_integer(self):
+        # Test that non-integer values in kth raise an error
+ d = np.arange(10)
+ assert_raises(TypeError, d.partition, 9.)
+ # Test also for generic type partition, which uses sorting
+ # and used to not bound check kth
+ d_obj = np.arange(10, dtype=object)
+ assert_raises(TypeError, d_obj.partition, 9.)
+
+ def test_partition_empty_array(self):
+ # check axis handling for multidimensional empty arrays
+ a = np.array([])
+ a.shape = (3, 2, 1, 0)
+ for axis in range(-a.ndim, a.ndim):
+ msg = 'test empty array partition with axis={0}'.format(axis)
+ assert_equal(np.partition(a, 0, axis=axis), a, msg)
+ msg = 'test empty array partition with axis=None'
+ assert_equal(np.partition(a, 0, axis=None), a.ravel(), msg)
+
+ def test_argpartition_empty_array(self):
+ # check axis handling for multidimensional empty arrays
+ a = np.array([])
+ a.shape = (3, 2, 1, 0)
+ for axis in range(-a.ndim, a.ndim):
+ msg = 'test empty array argpartition with axis={0}'.format(axis)
+            assert_equal(np.argpartition(a, 0, axis=axis),
+ np.zeros_like(a, dtype=np.intp), msg)
+ msg = 'test empty array argpartition with axis=None'
+        assert_equal(np.argpartition(a, 0, axis=None),
+ np.zeros_like(a.ravel(), dtype=np.intp), msg)
+
+ def test_partition(self):
+ d = np.arange(10)
+ assert_raises(TypeError, np.partition, d, 2, kind=1)
+ assert_raises(ValueError, np.partition, d, 2, kind="nonsense")
+ assert_raises(ValueError, np.argpartition, d, 2, kind="nonsense")
+ assert_raises(ValueError, d.partition, 2, axis=0, kind="nonsense")
+ assert_raises(ValueError, d.argpartition, 2, axis=0, kind="nonsense")
+ for k in ("introselect",):
+ d = np.array([])
+ assert_array_equal(np.partition(d, 0, kind=k), d)
+ assert_array_equal(np.argpartition(d, 0, kind=k), d)
+ d = np.ones(1)
+ assert_array_equal(np.partition(d, 0, kind=k)[0], d)
+ assert_array_equal(d[np.argpartition(d, 0, kind=k)],
+ np.partition(d, 0, kind=k))
+
+ # kth not modified
+ kth = np.array([30, 15, 5])
+ okth = kth.copy()
+ np.partition(np.arange(40), kth)
+ assert_array_equal(kth, okth)
+
+ for r in ([2, 1], [1, 2], [1, 1]):
+ d = np.array(r)
+ tgt = np.sort(d)
+ assert_array_equal(np.partition(d, 0, kind=k)[0], tgt[0])
+ assert_array_equal(np.partition(d, 1, kind=k)[1], tgt[1])
+ assert_array_equal(d[np.argpartition(d, 0, kind=k)],
+ np.partition(d, 0, kind=k))
+ assert_array_equal(d[np.argpartition(d, 1, kind=k)],
+ np.partition(d, 1, kind=k))
+ for i in range(d.size):
+ d[i:].partition(0, kind=k)
+ assert_array_equal(d, tgt)
+
+ for r in ([3, 2, 1], [1, 2, 3], [2, 1, 3], [2, 3, 1],
+ [1, 1, 1], [1, 2, 2], [2, 2, 1], [1, 2, 1]):
+ d = np.array(r)
+ tgt = np.sort(d)
+ assert_array_equal(np.partition(d, 0, kind=k)[0], tgt[0])
+ assert_array_equal(np.partition(d, 1, kind=k)[1], tgt[1])
+ assert_array_equal(np.partition(d, 2, kind=k)[2], tgt[2])
+ assert_array_equal(d[np.argpartition(d, 0, kind=k)],
+ np.partition(d, 0, kind=k))
+ assert_array_equal(d[np.argpartition(d, 1, kind=k)],
+ np.partition(d, 1, kind=k))
+ assert_array_equal(d[np.argpartition(d, 2, kind=k)],
+ np.partition(d, 2, kind=k))
+ for i in range(d.size):
+ d[i:].partition(0, kind=k)
+ assert_array_equal(d, tgt)
+
+ d = np.ones(50)
+ assert_array_equal(np.partition(d, 0, kind=k), d)
+ assert_array_equal(d[np.argpartition(d, 0, kind=k)],
+ np.partition(d, 0, kind=k))
+
+ # sorted
+ d = np.arange(49)
+ assert_equal(np.partition(d, 5, kind=k)[5], 5)
+ assert_equal(np.partition(d, 15, kind=k)[15], 15)
+ assert_array_equal(d[np.argpartition(d, 5, kind=k)],
+ np.partition(d, 5, kind=k))
+ assert_array_equal(d[np.argpartition(d, 15, kind=k)],
+ np.partition(d, 15, kind=k))
+
+ # rsorted
+ d = np.arange(47)[::-1]
+ assert_equal(np.partition(d, 6, kind=k)[6], 6)
+ assert_equal(np.partition(d, 16, kind=k)[16], 16)
+ assert_array_equal(d[np.argpartition(d, 6, kind=k)],
+ np.partition(d, 6, kind=k))
+ assert_array_equal(d[np.argpartition(d, 16, kind=k)],
+ np.partition(d, 16, kind=k))
+
+ assert_array_equal(np.partition(d, -6, kind=k),
+ np.partition(d, 41, kind=k))
+ assert_array_equal(np.partition(d, -16, kind=k),
+ np.partition(d, 31, kind=k))
+ assert_array_equal(d[np.argpartition(d, -6, kind=k)],
+ np.partition(d, 41, kind=k))
+
+ # median of 3 killer, O(n^2) on pure median 3 pivot quickselect
+ # exercises the median of median of 5 code used to keep O(n)
+ d = np.arange(1000000)
+ x = np.roll(d, d.size // 2)
+ mid = x.size // 2 + 1
+ assert_equal(np.partition(x, mid)[mid], mid)
+ d = np.arange(1000001)
+ x = np.roll(d, d.size // 2 + 1)
+ mid = x.size // 2 + 1
+ assert_equal(np.partition(x, mid)[mid], mid)
+
+ # max
+ d = np.ones(10)
+ d[1] = 4
+ assert_equal(np.partition(d, (2, -1))[-1], 4)
+ assert_equal(np.partition(d, (2, -1))[2], 1)
+ assert_equal(d[np.argpartition(d, (2, -1))][-1], 4)
+ assert_equal(d[np.argpartition(d, (2, -1))][2], 1)
+ d[1] = np.nan
+ assert_(np.isnan(d[np.argpartition(d, (2, -1))][-1]))
+ assert_(np.isnan(np.partition(d, (2, -1))[-1]))
+
+ # equal elements
+ d = np.arange(47) % 7
+ tgt = np.sort(np.arange(47) % 7)
+ np.random.shuffle(d)
+ for i in range(d.size):
+ assert_equal(np.partition(d, i, kind=k)[i], tgt[i])
+ assert_array_equal(d[np.argpartition(d, 6, kind=k)],
+ np.partition(d, 6, kind=k))
+ assert_array_equal(d[np.argpartition(d, 16, kind=k)],
+ np.partition(d, 16, kind=k))
+ for i in range(d.size):
+ d[i:].partition(0, kind=k)
+ assert_array_equal(d, tgt)
+
+ d = np.array([0, 1, 2, 3, 4, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 9])
+ kth = [0, 3, 19, 20]
+ assert_equal(np.partition(d, kth, kind=k)[kth], (0, 3, 7, 7))
+ assert_equal(d[np.argpartition(d, kth, kind=k)][kth], (0, 3, 7, 7))
+
+ d = np.array([2, 1])
+ d.partition(0, kind=k)
+ assert_raises(ValueError, d.partition, 2)
+ assert_raises(np.AxisError, d.partition, 3, axis=1)
+ assert_raises(ValueError, np.partition, d, 2)
+ assert_raises(np.AxisError, np.partition, d, 2, axis=1)
+ assert_raises(ValueError, d.argpartition, 2)
+ assert_raises(np.AxisError, d.argpartition, 3, axis=1)
+ assert_raises(ValueError, np.argpartition, d, 2)
+ assert_raises(np.AxisError, np.argpartition, d, 2, axis=1)
+ d = np.arange(10).reshape((2, 5))
+ d.partition(1, axis=0, kind=k)
+ d.partition(4, axis=1, kind=k)
+ np.partition(d, 1, axis=0, kind=k)
+ np.partition(d, 4, axis=1, kind=k)
+ np.partition(d, 1, axis=None, kind=k)
+ np.partition(d, 9, axis=None, kind=k)
+ d.argpartition(1, axis=0, kind=k)
+ d.argpartition(4, axis=1, kind=k)
+ np.argpartition(d, 1, axis=0, kind=k)
+ np.argpartition(d, 4, axis=1, kind=k)
+ np.argpartition(d, 1, axis=None, kind=k)
+ np.argpartition(d, 9, axis=None, kind=k)
+ assert_raises(ValueError, d.partition, 2, axis=0)
+ assert_raises(ValueError, d.partition, 11, axis=1)
+ assert_raises(TypeError, d.partition, 2, axis=None)
+ assert_raises(ValueError, np.partition, d, 9, axis=1)
+ assert_raises(ValueError, np.partition, d, 11, axis=None)
+ assert_raises(ValueError, d.argpartition, 2, axis=0)
+ assert_raises(ValueError, d.argpartition, 11, axis=1)
+ assert_raises(ValueError, np.argpartition, d, 9, axis=1)
+ assert_raises(ValueError, np.argpartition, d, 11, axis=None)
+
+ td = [(dt, s) for dt in [np.int32, np.float32, np.complex64]
+ for s in (9, 16)]
+ for dt, s in td:
+ aae = assert_array_equal
+ at = assert_
+
+ d = np.arange(s, dtype=dt)
+ np.random.shuffle(d)
+ d1 = np.tile(np.arange(s, dtype=dt), (4, 1))
+ map(np.random.shuffle, d1)
+ d0 = np.transpose(d1)
+ for i in range(d.size):
+ p = np.partition(d, i, kind=k)
+ assert_equal(p[i], i)
+ # all before are smaller
+ assert_array_less(p[:i], p[i])
+ # all after are larger
+ assert_array_less(p[i], p[i + 1:])
+ aae(p, d[np.argpartition(d, i, kind=k)])
+
+ p = np.partition(d1, i, axis=1, kind=k)
+ aae(p[:, i], np.array([i] * d1.shape[0], dtype=dt))
+ # array_less does not seem to work right
+ at((p[:, :i].T <= p[:, i]).all(),
+ msg="%d: %r <= %r" % (i, p[:, i], p[:, :i].T))
+ at((p[:, i + 1:].T > p[:, i]).all(),
+ msg="%d: %r < %r" % (i, p[:, i], p[:, i + 1:].T))
+ aae(p, d1[np.arange(d1.shape[0])[:, None],
+ np.argpartition(d1, i, axis=1, kind=k)])
+
+ p = np.partition(d0, i, axis=0, kind=k)
+ aae(p[i, :], np.array([i] * d1.shape[0], dtype=dt))
+ # array_less does not seem to work right
+ at((p[:i, :] <= p[i, :]).all(),
+ msg="%d: %r <= %r" % (i, p[i, :], p[:i, :]))
+ at((p[i + 1:, :] > p[i, :]).all(),
+                   msg="%d: %r < %r" % (i, p[i, :], p[i + 1:, :]))
+ aae(p, d0[np.argpartition(d0, i, axis=0, kind=k),
+ np.arange(d0.shape[1])[None, :]])
+
+ # check inplace
+ dc = d.copy()
+ dc.partition(i, kind=k)
+ assert_equal(dc, np.partition(d, i, kind=k))
+ dc = d0.copy()
+ dc.partition(i, axis=0, kind=k)
+ assert_equal(dc, np.partition(d0, i, axis=0, kind=k))
+ dc = d1.copy()
+ dc.partition(i, axis=1, kind=k)
+ assert_equal(dc, np.partition(d1, i, axis=1, kind=k))
+
+ def assert_partitioned(self, d, kth):
+ prev = 0
+ for k in np.sort(kth):
+ assert_array_less(d[prev:k], d[k], err_msg='kth %d' % k)
+ assert_((d[k:] >= d[k]).all(),
+ msg="kth %d, %r not greater equal %d" % (k, d[k:], d[k]))
+ prev = k + 1
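+        # Illustrative example (not part of the original test): for
+        # d = [3, 4, 2, 1] and kth = (0, 3), a valid partition is [1, 2, 3, 4]:
+        # index 0 holds the minimum, index 3 the maximum, and everything in
+        # between is bounded by them, which is exactly what this helper checks.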
+
+ def test_partition_iterative(self):
+ d = np.arange(17)
+ kth = (0, 1, 2, 429, 231)
+ assert_raises(ValueError, d.partition, kth)
+ assert_raises(ValueError, d.argpartition, kth)
+ d = np.arange(10).reshape((2, 5))
+ assert_raises(ValueError, d.partition, kth, axis=0)
+ assert_raises(ValueError, d.partition, kth, axis=1)
+ assert_raises(ValueError, np.partition, d, kth, axis=1)
+ assert_raises(ValueError, np.partition, d, kth, axis=None)
+
+ d = np.array([3, 4, 2, 1])
+ p = np.partition(d, (0, 3))
+ self.assert_partitioned(p, (0, 3))
+ self.assert_partitioned(d[np.argpartition(d, (0, 3))], (0, 3))
+
+ assert_array_equal(p, np.partition(d, (-3, -1)))
+ assert_array_equal(p, d[np.argpartition(d, (-3, -1))])
+
+ d = np.arange(17)
+ np.random.shuffle(d)
+ d.partition(range(d.size))
+ assert_array_equal(np.arange(17), d)
+ np.random.shuffle(d)
+ assert_array_equal(np.arange(17), d[d.argpartition(range(d.size))])
+
+ # test unsorted kth
+ d = np.arange(17)
+ np.random.shuffle(d)
+ keys = np.array([1, 3, 8, -2])
+ np.random.shuffle(d)
+ p = np.partition(d, keys)
+ self.assert_partitioned(p, keys)
+ p = d[np.argpartition(d, keys)]
+ self.assert_partitioned(p, keys)
+ np.random.shuffle(keys)
+ assert_array_equal(np.partition(d, keys), p)
+ assert_array_equal(d[np.argpartition(d, keys)], p)
+
+ # equal kth
+ d = np.arange(20)[::-1]
+ self.assert_partitioned(np.partition(d, [5]*4), [5])
+ self.assert_partitioned(np.partition(d, [5]*4 + [6, 13]),
+ [5]*4 + [6, 13])
+ self.assert_partitioned(d[np.argpartition(d, [5]*4)], [5])
+ self.assert_partitioned(d[np.argpartition(d, [5]*4 + [6, 13])],
+ [5]*4 + [6, 13])
+
+ d = np.arange(12)
+ np.random.shuffle(d)
+ d1 = np.tile(np.arange(12), (4, 1))
+ map(np.random.shuffle, d1)
+ d0 = np.transpose(d1)
+
+ kth = (1, 6, 7, -1)
+ p = np.partition(d1, kth, axis=1)
+ pa = d1[np.arange(d1.shape[0])[:, None],
+ d1.argpartition(kth, axis=1)]
+ assert_array_equal(p, pa)
+ for i in range(d1.shape[0]):
+ self.assert_partitioned(p[i,:], kth)
+ p = np.partition(d0, kth, axis=0)
+ pa = d0[np.argpartition(d0, kth, axis=0),
+ np.arange(d0.shape[1])[None,:]]
+ assert_array_equal(p, pa)
+ for i in range(d0.shape[1]):
+ self.assert_partitioned(p[:, i], kth)
+
+ def test_partition_cdtype(self):
+ d = np.array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41),
+ ('Lancelot', 1.9, 38)],
+ dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')])
+
+ tgt = np.sort(d, order=['age', 'height'])
+ assert_array_equal(np.partition(d, range(d.size),
+ order=['age', 'height']),
+ tgt)
+ assert_array_equal(d[np.argpartition(d, range(d.size),
+ order=['age', 'height'])],
+ tgt)
+ for k in range(d.size):
+ assert_equal(np.partition(d, k, order=['age', 'height'])[k],
+ tgt[k])
+ assert_equal(d[np.argpartition(d, k, order=['age', 'height'])][k],
+ tgt[k])
+
+ d = np.array(['Galahad', 'Arthur', 'zebra', 'Lancelot'])
+ tgt = np.sort(d)
+ assert_array_equal(np.partition(d, range(d.size)), tgt)
+ for k in range(d.size):
+ assert_equal(np.partition(d, k)[k], tgt[k])
+ assert_equal(d[np.argpartition(d, k)][k], tgt[k])
+
+ def test_partition_unicode_kind(self):
+ d = np.arange(10)
+ k = b'\xc3\xa4'.decode("UTF8")
+ assert_raises(ValueError, d.partition, 2, kind=k)
+ assert_raises(ValueError, d.argpartition, 2, kind=k)
+
+ def test_partition_fuzz(self):
+ # a few rounds of random data testing
+ for j in range(10, 30):
+ for i in range(1, j - 2):
+ d = np.arange(j)
+ np.random.shuffle(d)
+ d = d % np.random.randint(2, 30)
+ idx = np.random.randint(d.size)
+ kth = [0, idx, i, i + 1]
+ tgt = np.sort(d)[kth]
+ assert_array_equal(np.partition(d, kth)[kth], tgt,
+ err_msg="data: %r\n kth: %r" % (d, kth))
+
+ def test_argpartition_gh5524(self):
+ # A test for functionality of argpartition on lists.
+ d = [6,7,3,2,9,0]
+ p = np.argpartition(d,1)
+ self.assert_partitioned(np.array(d)[p],[1])
+
+ def test_flatten(self):
+ x0 = np.array([[1, 2, 3], [4, 5, 6]], np.int32)
+ x1 = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], np.int32)
+ y0 = np.array([1, 2, 3, 4, 5, 6], np.int32)
+ y0f = np.array([1, 4, 2, 5, 3, 6], np.int32)
+ y1 = np.array([1, 2, 3, 4, 5, 6, 7, 8], np.int32)
+ y1f = np.array([1, 5, 3, 7, 2, 6, 4, 8], np.int32)
+ assert_equal(x0.flatten(), y0)
+ assert_equal(x0.flatten('F'), y0f)
+ assert_equal(x0.flatten('F'), x0.T.flatten())
+ assert_equal(x1.flatten(), y1)
+ assert_equal(x1.flatten('F'), y1f)
+ assert_equal(x1.flatten('F'), x1.T.flatten())
+
+ @pytest.mark.parametrize('func', (np.dot, np.matmul))
+ def test_arr_mult(self, func):
+ a = np.array([[1, 0], [0, 1]])
+ b = np.array([[0, 1], [1, 0]])
+ c = np.array([[9, 1], [1, -9]])
+ d = np.arange(24).reshape(4, 6)
+ ddt = np.array(
+ [[ 55, 145, 235, 325],
+ [ 145, 451, 757, 1063],
+ [ 235, 757, 1279, 1801],
+ [ 325, 1063, 1801, 2539]]
+ )
+ dtd = np.array(
+ [[504, 540, 576, 612, 648, 684],
+ [540, 580, 620, 660, 700, 740],
+ [576, 620, 664, 708, 752, 796],
+ [612, 660, 708, 756, 804, 852],
+ [648, 700, 752, 804, 856, 908],
+ [684, 740, 796, 852, 908, 964]]
+ )
+
+ # gemm vs syrk optimizations
+ for et in [np.float32, np.float64, np.complex64, np.complex128]:
+ eaf = a.astype(et)
+ assert_equal(func(eaf, eaf), eaf)
+ assert_equal(func(eaf.T, eaf), eaf)
+ assert_equal(func(eaf, eaf.T), eaf)
+ assert_equal(func(eaf.T, eaf.T), eaf)
+ assert_equal(func(eaf.T.copy(), eaf), eaf)
+ assert_equal(func(eaf, eaf.T.copy()), eaf)
+ assert_equal(func(eaf.T.copy(), eaf.T.copy()), eaf)
+
+ # syrk validations
+ for et in [np.float32, np.float64, np.complex64, np.complex128]:
+ eaf = a.astype(et)
+ ebf = b.astype(et)
+ assert_equal(func(ebf, ebf), eaf)
+ assert_equal(func(ebf.T, ebf), eaf)
+ assert_equal(func(ebf, ebf.T), eaf)
+ assert_equal(func(ebf.T, ebf.T), eaf)
+
+ # syrk - different shape, stride, and view validations
+ for et in [np.float32, np.float64, np.complex64, np.complex128]:
+ edf = d.astype(et)
+ assert_equal(
+ func(edf[::-1, :], edf.T),
+ func(edf[::-1, :].copy(), edf.T.copy())
+ )
+ assert_equal(
+ func(edf[:, ::-1], edf.T),
+ func(edf[:, ::-1].copy(), edf.T.copy())
+ )
+ assert_equal(
+ func(edf, edf[::-1, :].T),
+ func(edf, edf[::-1, :].T.copy())
+ )
+ assert_equal(
+ func(edf, edf[:, ::-1].T),
+ func(edf, edf[:, ::-1].T.copy())
+ )
+ assert_equal(
+ func(edf[:edf.shape[0] // 2, :], edf[::2, :].T),
+ func(edf[:edf.shape[0] // 2, :].copy(), edf[::2, :].T.copy())
+ )
+ assert_equal(
+ func(edf[::2, :], edf[:edf.shape[0] // 2, :].T),
+ func(edf[::2, :].copy(), edf[:edf.shape[0] // 2, :].T.copy())
+ )
+
+ # syrk - different shape
+ for et in [np.float32, np.float64, np.complex64, np.complex128]:
+ edf = d.astype(et)
+ eddtf = ddt.astype(et)
+ edtdf = dtd.astype(et)
+ assert_equal(func(edf, edf.T), eddtf)
+ assert_equal(func(edf.T, edf), edtdf)
+
+ @pytest.mark.parametrize('func', (np.dot, np.matmul))
+ @pytest.mark.parametrize('dtype', 'ifdFD')
+ def test_no_dgemv(self, func, dtype):
+ # check vector arg for contiguous before gemv
+ # gh-12156
+ a = np.arange(8.0, dtype=dtype).reshape(2, 4)
+ b = np.broadcast_to(1., (4, 1))
+ ret1 = func(a, b)
+ ret2 = func(a, b.copy())
+ assert_equal(ret1, ret2)
+
+ ret1 = func(b.T, a.T)
+ ret2 = func(b.T.copy(), a.T)
+ assert_equal(ret1, ret2)
+
+ # check for unaligned data
+ dt = np.dtype(dtype)
+ a = np.zeros(8 * dt.itemsize // 2 + 1, dtype='int16')[1:].view(dtype)
+ a = a.reshape(2, 4)
+ b = a[0]
+ # make sure it is not aligned
+ assert_(a.__array_interface__['data'][0] % dt.itemsize != 0)
+ ret1 = func(a, b)
+ ret2 = func(a.copy(), b.copy())
+ assert_equal(ret1, ret2)
+
+ ret1 = func(b.T, a.T)
+ ret2 = func(b.T.copy(), a.T.copy())
+ assert_equal(ret1, ret2)
+
+ def test_dot(self):
+ a = np.array([[1, 0], [0, 1]])
+ b = np.array([[0, 1], [1, 0]])
+ c = np.array([[9, 1], [1, -9]])
+ # function versus methods
+ assert_equal(np.dot(a, b), a.dot(b))
+ assert_equal(np.dot(np.dot(a, b), c), a.dot(b).dot(c))
+
+ # test passing in an output array
+ c = np.zeros_like(a)
+ a.dot(b, c)
+ assert_equal(c, np.dot(a, b))
+
+ # test keyword args
+ c = np.zeros_like(a)
+ a.dot(b=b, out=c)
+ assert_equal(c, np.dot(a, b))
+
+ def test_dot_type_mismatch(self):
+ c = 1.
+ A = np.array((1,1), dtype='i,i')
+
+ assert_raises(TypeError, np.dot, c, A)
+ assert_raises(TypeError, np.dot, A, c)
+
+ def test_dot_out_mem_overlap(self):
+ np.random.seed(1)
+
+ # Test BLAS and non-BLAS code paths, including all dtypes
+ # that dot() supports
+ dtypes = [np.dtype(code) for code in np.typecodes['All']
+ if code not in 'USVM']
+ for dtype in dtypes:
+ a = np.random.rand(3, 3).astype(dtype)
+
+ # Valid dot() output arrays must be aligned
+ b = _aligned_zeros((3, 3), dtype=dtype)
+ b[...] = np.random.rand(3, 3)
+
+ y = np.dot(a, b)
+ x = np.dot(a, b, out=b)
+ assert_equal(x, y, err_msg=repr(dtype))
+
+ # Check invalid output array
+ assert_raises(ValueError, np.dot, a, b, out=b[::2])
+ assert_raises(ValueError, np.dot, a, b, out=b.T)
+
+ def test_dot_matmul_out(self):
+ # gh-9641
+ class Sub(np.ndarray):
+ pass
+ a = np.ones((2, 2)).view(Sub)
+ b = np.ones((2, 2)).view(Sub)
+ out = np.ones((2, 2))
+
+ # make sure out can be any ndarray (not only subclass of inputs)
+ np.dot(a, b, out=out)
+ np.matmul(a, b, out=out)
+
+ def test_dot_matmul_inner_array_casting_fails(self):
+
+ class A(object):
+ def __array__(self, *args, **kwargs):
+ raise NotImplementedError
+
+ # Don't override the error from calling __array__()
+ assert_raises(NotImplementedError, np.dot, A(), A())
+ assert_raises(NotImplementedError, np.matmul, A(), A())
+ assert_raises(NotImplementedError, np.inner, A(), A())
+
+ def test_matmul_out(self):
+ # overlapping memory
+ a = np.arange(18).reshape(2, 3, 3)
+ b = np.matmul(a, a)
+ c = np.matmul(a, a, out=a)
+ assert_(c is a)
+ assert_equal(c, b)
+ a = np.arange(18).reshape(2, 3, 3)
+ c = np.matmul(a, a, out=a[::-1, ...])
+ assert_(c.base is a.base)
+ assert_equal(c, b)
+
+ def test_diagonal(self):
+ a = np.arange(12).reshape((3, 4))
+ assert_equal(a.diagonal(), [0, 5, 10])
+ assert_equal(a.diagonal(0), [0, 5, 10])
+ assert_equal(a.diagonal(1), [1, 6, 11])
+ assert_equal(a.diagonal(-1), [4, 9])
+ assert_raises(np.AxisError, a.diagonal, axis1=0, axis2=5)
+ assert_raises(np.AxisError, a.diagonal, axis1=5, axis2=0)
+ assert_raises(np.AxisError, a.diagonal, axis1=5, axis2=5)
+ assert_raises(ValueError, a.diagonal, axis1=1, axis2=1)
+
+ b = np.arange(8).reshape((2, 2, 2))
+ assert_equal(b.diagonal(), [[0, 6], [1, 7]])
+ assert_equal(b.diagonal(0), [[0, 6], [1, 7]])
+ assert_equal(b.diagonal(1), [[2], [3]])
+ assert_equal(b.diagonal(-1), [[4], [5]])
+ assert_raises(ValueError, b.diagonal, axis1=0, axis2=0)
+ assert_equal(b.diagonal(0, 1, 2), [[0, 3], [4, 7]])
+ assert_equal(b.diagonal(0, 0, 1), [[0, 6], [1, 7]])
+ assert_equal(b.diagonal(offset=1, axis1=0, axis2=2), [[1], [3]])
+ # Order of axis argument doesn't matter:
+ assert_equal(b.diagonal(0, 2, 1), [[0, 3], [4, 7]])
+
+ def test_diagonal_view_notwriteable(self):
+        # np.diagonal has returned a read-only view since 1.9; the planned
+        # switch back to a writeable view has not happened, so these checks
+        # still hold.
+ a = np.eye(3).diagonal()
+ assert_(not a.flags.writeable)
+ assert_(not a.flags.owndata)
+
+ a = np.diagonal(np.eye(3))
+ assert_(not a.flags.writeable)
+ assert_(not a.flags.owndata)
+
+ a = np.diag(np.eye(3))
+ assert_(not a.flags.writeable)
+ assert_(not a.flags.owndata)
+
+ def test_diagonal_memleak(self):
+ # Regression test for a bug that crept in at one point
+ a = np.zeros((100, 100))
+ if HAS_REFCOUNT:
+ assert_(sys.getrefcount(a) < 50)
+ for i in range(100):
+ a.diagonal()
+ if HAS_REFCOUNT:
+ assert_(sys.getrefcount(a) < 50)
+
+ def test_size_zero_memleak(self):
+ # Regression test for issue 9615
+ # Exercises a special-case code path for dot products of length
+        # zero in cblasfuncs (which makes it specific to floating dtypes).
+ a = np.array([], dtype=np.float64)
+ x = np.array(2.0)
+ for _ in range(100):
+ np.dot(a, a, out=x)
+ if HAS_REFCOUNT:
+ assert_(sys.getrefcount(x) < 50)
+
+ def test_trace(self):
+ a = np.arange(12).reshape((3, 4))
+ assert_equal(a.trace(), 15)
+ assert_equal(a.trace(0), 15)
+ assert_equal(a.trace(1), 18)
+ assert_equal(a.trace(-1), 13)
+
+ b = np.arange(8).reshape((2, 2, 2))
+ assert_equal(b.trace(), [6, 8])
+ assert_equal(b.trace(0), [6, 8])
+ assert_equal(b.trace(1), [2, 3])
+ assert_equal(b.trace(-1), [4, 5])
+ assert_equal(b.trace(0, 0, 1), [6, 8])
+ assert_equal(b.trace(0, 0, 2), [5, 9])
+ assert_equal(b.trace(0, 1, 2), [3, 11])
+ assert_equal(b.trace(offset=1, axis1=0, axis2=2), [1, 3])
+
+ def test_trace_subclass(self):
+ # The class would need to overwrite trace to ensure single-element
+ # output also has the right subclass.
+ class MyArray(np.ndarray):
+ pass
+
+ b = np.arange(8).reshape((2, 2, 2)).view(MyArray)
+ t = b.trace()
+ assert_(isinstance(t, MyArray))
+
+ def test_put(self):
+ icodes = np.typecodes['AllInteger']
+ fcodes = np.typecodes['AllFloat']
+ for dt in icodes + fcodes + 'O':
+ tgt = np.array([0, 1, 0, 3, 0, 5], dtype=dt)
+
+ # test 1-d
+ a = np.zeros(6, dtype=dt)
+ a.put([1, 3, 5], [1, 3, 5])
+ assert_equal(a, tgt)
+
+ # test 2-d
+ a = np.zeros((2, 3), dtype=dt)
+ a.put([1, 3, 5], [1, 3, 5])
+ assert_equal(a, tgt.reshape(2, 3))
+
+ for dt in '?':
+ tgt = np.array([False, True, False, True, False, True], dtype=dt)
+
+ # test 1-d
+ a = np.zeros(6, dtype=dt)
+ a.put([1, 3, 5], [True]*3)
+ assert_equal(a, tgt)
+
+ # test 2-d
+ a = np.zeros((2, 3), dtype=dt)
+ a.put([1, 3, 5], [True]*3)
+ assert_equal(a, tgt.reshape(2, 3))
+
+ # check must be writeable
+ a = np.zeros(6)
+ a.flags.writeable = False
+ assert_raises(ValueError, a.put, [1, 3, 5], [1, 3, 5])
+
+ # when calling np.put, make sure a
+ # TypeError is raised if the object
+ # isn't an ndarray
+ bad_array = [1, 2, 3]
+ assert_raises(TypeError, np.put, bad_array, [0, 2], 5)
+
+ def test_ravel(self):
+ a = np.array([[0, 1], [2, 3]])
+ assert_equal(a.ravel(), [0, 1, 2, 3])
+ assert_(not a.ravel().flags.owndata)
+ assert_equal(a.ravel('F'), [0, 2, 1, 3])
+ assert_equal(a.ravel(order='C'), [0, 1, 2, 3])
+ assert_equal(a.ravel(order='F'), [0, 2, 1, 3])
+ assert_equal(a.ravel(order='A'), [0, 1, 2, 3])
+ assert_(not a.ravel(order='A').flags.owndata)
+ assert_equal(a.ravel(order='K'), [0, 1, 2, 3])
+ assert_(not a.ravel(order='K').flags.owndata)
+ assert_equal(a.ravel(), a.reshape(-1))
+
+ a = np.array([[0, 1], [2, 3]], order='F')
+ assert_equal(a.ravel(), [0, 1, 2, 3])
+ assert_equal(a.ravel(order='A'), [0, 2, 1, 3])
+ assert_equal(a.ravel(order='K'), [0, 2, 1, 3])
+ assert_(not a.ravel(order='A').flags.owndata)
+ assert_(not a.ravel(order='K').flags.owndata)
+ assert_equal(a.ravel(), a.reshape(-1))
+ assert_equal(a.ravel(order='A'), a.reshape(-1, order='A'))
+
+ a = np.array([[0, 1], [2, 3]])[::-1, :]
+ assert_equal(a.ravel(), [2, 3, 0, 1])
+ assert_equal(a.ravel(order='C'), [2, 3, 0, 1])
+ assert_equal(a.ravel(order='F'), [2, 0, 3, 1])
+ assert_equal(a.ravel(order='A'), [2, 3, 0, 1])
+ # 'K' doesn't reverse the axes of negative strides
+ assert_equal(a.ravel(order='K'), [2, 3, 0, 1])
+ assert_(a.ravel(order='K').flags.owndata)
+
+ # Test simple 1-d copy behaviour:
+ a = np.arange(10)[::2]
+ assert_(a.ravel('K').flags.owndata)
+ assert_(a.ravel('C').flags.owndata)
+ assert_(a.ravel('F').flags.owndata)
+
+        # Not contiguous and 1-sized axis with non-matching stride
+ a = np.arange(2**3 * 2)[::2]
+ a = a.reshape(2, 1, 2, 2).swapaxes(-1, -2)
+ strides = list(a.strides)
+ strides[1] = 123
+ a.strides = strides
+ assert_(a.ravel(order='K').flags.owndata)
+ assert_equal(a.ravel('K'), np.arange(0, 15, 2))
+
+        # contiguous and 1-sized axis with non-matching stride works:
+ a = np.arange(2**3)
+ a = a.reshape(2, 1, 2, 2).swapaxes(-1, -2)
+ strides = list(a.strides)
+ strides[1] = 123
+ a.strides = strides
+ assert_(np.may_share_memory(a.ravel(order='K'), a))
+ assert_equal(a.ravel(order='K'), np.arange(2**3))
+
+ # Test negative strides (not very interesting since non-contiguous):
+ a = np.arange(4)[::-1].reshape(2, 2)
+ assert_(a.ravel(order='C').flags.owndata)
+ assert_(a.ravel(order='K').flags.owndata)
+ assert_equal(a.ravel('C'), [3, 2, 1, 0])
+ assert_equal(a.ravel('K'), [3, 2, 1, 0])
+
+ # 1-element tidy strides test (NPY_RELAXED_STRIDES_CHECKING):
+ a = np.array([[1]])
+ a.strides = (123, 432)
+ # If the stride is not 8, NPY_RELAXED_STRIDES_CHECKING is messing
+ # them up on purpose:
+ if np.ones(1).strides == (8,):
+ assert_(np.may_share_memory(a.ravel('K'), a))
+ assert_equal(a.ravel('K').strides, (a.dtype.itemsize,))
+
+ for order in ('C', 'F', 'A', 'K'):
+ # 0-d corner case:
+ a = np.array(0)
+ assert_equal(a.ravel(order), [0])
+ assert_(np.may_share_memory(a.ravel(order), a))
+
+ # Test that certain non-inplace ravels work right (mostly) for 'K':
+ b = np.arange(2**4 * 2)[::2].reshape(2, 2, 2, 2)
+ a = b[..., ::2]
+ assert_equal(a.ravel('K'), [0, 4, 8, 12, 16, 20, 24, 28])
+ assert_equal(a.ravel('C'), [0, 4, 8, 12, 16, 20, 24, 28])
+ assert_equal(a.ravel('A'), [0, 4, 8, 12, 16, 20, 24, 28])
+ assert_equal(a.ravel('F'), [0, 16, 8, 24, 4, 20, 12, 28])
+
+ a = b[::2, ...]
+ assert_equal(a.ravel('K'), [0, 2, 4, 6, 8, 10, 12, 14])
+ assert_equal(a.ravel('C'), [0, 2, 4, 6, 8, 10, 12, 14])
+ assert_equal(a.ravel('A'), [0, 2, 4, 6, 8, 10, 12, 14])
+ assert_equal(a.ravel('F'), [0, 8, 4, 12, 2, 10, 6, 14])
+
+ def test_ravel_subclass(self):
+ class ArraySubclass(np.ndarray):
+ pass
+
+ a = np.arange(10).view(ArraySubclass)
+ assert_(isinstance(a.ravel('C'), ArraySubclass))
+ assert_(isinstance(a.ravel('F'), ArraySubclass))
+ assert_(isinstance(a.ravel('A'), ArraySubclass))
+ assert_(isinstance(a.ravel('K'), ArraySubclass))
+
+ a = np.arange(10)[::2].view(ArraySubclass)
+ assert_(isinstance(a.ravel('C'), ArraySubclass))
+ assert_(isinstance(a.ravel('F'), ArraySubclass))
+ assert_(isinstance(a.ravel('A'), ArraySubclass))
+ assert_(isinstance(a.ravel('K'), ArraySubclass))
+
+ def test_swapaxes(self):
+ a = np.arange(1*2*3*4).reshape(1, 2, 3, 4).copy()
+ idx = np.indices(a.shape)
+ assert_(a.flags['OWNDATA'])
+ b = a.copy()
+ # check exceptions
+ assert_raises(np.AxisError, a.swapaxes, -5, 0)
+ assert_raises(np.AxisError, a.swapaxes, 4, 0)
+ assert_raises(np.AxisError, a.swapaxes, 0, -5)
+ assert_raises(np.AxisError, a.swapaxes, 0, 4)
+
+ for i in range(-4, 4):
+ for j in range(-4, 4):
+ for k, src in enumerate((a, b)):
+ c = src.swapaxes(i, j)
+ # check shape
+ shape = list(src.shape)
+ shape[i] = src.shape[j]
+ shape[j] = src.shape[i]
+ assert_equal(c.shape, shape, str((i, j, k)))
+ # check array contents
+ i0, i1, i2, i3 = [dim-1 for dim in c.shape]
+ j0, j1, j2, j3 = [dim-1 for dim in src.shape]
+ assert_equal(src[idx[j0], idx[j1], idx[j2], idx[j3]],
+ c[idx[i0], idx[i1], idx[i2], idx[i3]],
+ str((i, j, k)))
+ # check a view is always returned, gh-5260
+ assert_(not c.flags['OWNDATA'], str((i, j, k)))
+ # check on non-contiguous input array
+ if k == 1:
+ b = c
+
+ def test_conjugate(self):
+ a = np.array([1-1j, 1+1j, 23+23.0j])
+ ac = a.conj()
+ assert_equal(a.real, ac.real)
+ assert_equal(a.imag, -ac.imag)
+ assert_equal(ac, a.conjugate())
+ assert_equal(ac, np.conjugate(a))
+
+ a = np.array([1-1j, 1+1j, 23+23.0j], 'F')
+ ac = a.conj()
+ assert_equal(a.real, ac.real)
+ assert_equal(a.imag, -ac.imag)
+ assert_equal(ac, a.conjugate())
+ assert_equal(ac, np.conjugate(a))
+
+ a = np.array([1, 2, 3])
+ ac = a.conj()
+ assert_equal(a, ac)
+ assert_equal(ac, a.conjugate())
+ assert_equal(ac, np.conjugate(a))
+
+ a = np.array([1.0, 2.0, 3.0])
+ ac = a.conj()
+ assert_equal(a, ac)
+ assert_equal(ac, a.conjugate())
+ assert_equal(ac, np.conjugate(a))
+
+ a = np.array([1-1j, 1+1j, 1, 2.0], object)
+ ac = a.conj()
+ assert_equal(ac, [k.conjugate() for k in a])
+ assert_equal(ac, a.conjugate())
+ assert_equal(ac, np.conjugate(a))
+
+ a = np.array([1-1j, 1, 2.0, 'f'], object)
+ assert_raises(AttributeError, lambda: a.conj())
+ assert_raises(AttributeError, lambda: a.conjugate())
+
+ def test__complex__(self):
+ dtypes = ['i1', 'i2', 'i4', 'i8',
+ 'u1', 'u2', 'u4', 'u8',
+ 'f', 'd', 'g', 'F', 'D', 'G',
+ '?', 'O']
+ for dt in dtypes:
+ a = np.array(7, dtype=dt)
+ b = np.array([7], dtype=dt)
+ c = np.array([[[[[7]]]]], dtype=dt)
+
+ msg = 'dtype: {0}'.format(dt)
+ ap = complex(a)
+ assert_equal(ap, a, msg)
+ bp = complex(b)
+ assert_equal(bp, b, msg)
+ cp = complex(c)
+ assert_equal(cp, c, msg)
+
+ def test__complex__should_not_work(self):
+ dtypes = ['i1', 'i2', 'i4', 'i8',
+ 'u1', 'u2', 'u4', 'u8',
+ 'f', 'd', 'g', 'F', 'D', 'G',
+ '?', 'O']
+ for dt in dtypes:
+ a = np.array([1, 2, 3], dtype=dt)
+ assert_raises(TypeError, complex, a)
+
+ dt = np.dtype([('a', 'f8'), ('b', 'i1')])
+ b = np.array((1.0, 3), dtype=dt)
+ assert_raises(TypeError, complex, b)
+
+ c = np.array([(1.0, 3), (2e-3, 7)], dtype=dt)
+ assert_raises(TypeError, complex, c)
+
+ d = np.array('1+1j')
+ assert_raises(TypeError, complex, d)
+
+ e = np.array(['1+1j'], 'U')
+ assert_raises(TypeError, complex, e)
+
+class TestSequenceMethods(object):
+ def test_array_contains(self):
+ assert_(4.0 in np.arange(16.).reshape(4,4))
+ assert_(20.0 not in np.arange(16.).reshape(4,4))
+
+class TestBinop(object):
+ def test_inplace(self):
+ # test refcount 1 inplace conversion
+ assert_array_almost_equal(np.array([0.5]) * np.array([1.0, 2.0]),
+ [0.5, 1.0])
+
+ d = np.array([0.5, 0.5])[::2]
+ assert_array_almost_equal(d * (d * np.array([1.0, 2.0])),
+ [0.25, 0.5])
+
+ a = np.array([0.5])
+ b = np.array([0.5])
+ c = a + b
+ c = a - b
+ c = a * b
+ c = a / b
+ assert_equal(a, b)
+ assert_almost_equal(c, 1.)
+
+ c = a + b * 2. / b * a - a / b
+ assert_equal(a, b)
+ assert_equal(c, 0.5)
+
+ # true divide
+ a = np.array([5])
+ b = np.array([3])
+ c = (a * a) / b
+
+ assert_almost_equal(c, 25 / 3)
+ assert_equal(a, 5)
+ assert_equal(b, 3)
+
+ # ndarray.__rop__ always calls ufunc
+ # ndarray.__iop__ always calls ufunc
+ # ndarray.__op__, __rop__:
+ # - defer if other has __array_ufunc__ and it is None
+ # or other is not a subclass and has higher array priority
+ # - else, call ufunc
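+    #
+    # A minimal sketch of the deferral rule above (illustrative only, not
+    # exercised by the test below; the class name is made up):
+    #
+    #     class Deferred(object):
+    #         __array_ufunc__ = None
+    #         def __radd__(self, other):
+    #             return "reverse"
+    #
+    #     np.arange(3) + Deferred()  # ndarray returns NotImplemented,
+    #                                # so Python calls __radd__ -> "reverse"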
+ def test_ufunc_binop_interaction(self):
+ # Python method name (without underscores)
+ # -> (numpy ufunc, has_in_place_version, preferred_dtype)
+ ops = {
+ 'add': (np.add, True, float),
+ 'sub': (np.subtract, True, float),
+ 'mul': (np.multiply, True, float),
+ 'truediv': (np.true_divide, True, float),
+ 'floordiv': (np.floor_divide, True, float),
+ 'mod': (np.remainder, True, float),
+ 'divmod': (np.divmod, False, float),
+ 'pow': (np.power, True, int),
+ 'lshift': (np.left_shift, True, int),
+ 'rshift': (np.right_shift, True, int),
+ 'and': (np.bitwise_and, True, int),
+ 'xor': (np.bitwise_xor, True, int),
+ 'or': (np.bitwise_or, True, int),
+ # 'ge': (np.less_equal, False),
+ # 'gt': (np.less, False),
+ # 'le': (np.greater_equal, False),
+ # 'lt': (np.greater, False),
+ # 'eq': (np.equal, False),
+ # 'ne': (np.not_equal, False),
+ }
+ if sys.version_info >= (3, 5):
+ ops['matmul'] = (np.matmul, False, float)
+
+ class Coerced(Exception):
+ pass
+
+ def array_impl(self):
+ raise Coerced
+
+ def op_impl(self, other):
+ return "forward"
+
+ def rop_impl(self, other):
+ return "reverse"
+
+ def iop_impl(self, other):
+ return "in-place"
+
+ def array_ufunc_impl(self, ufunc, method, *args, **kwargs):
+ return ("__array_ufunc__", ufunc, method, args, kwargs)
+
+ # Create an object with the given base, in the given module, with a
+ # bunch of placeholder __op__ methods, and optionally a
+ # __array_ufunc__ and __array_priority__.
+ def make_obj(base, array_priority=False, array_ufunc=False,
+ alleged_module="__main__"):
+ class_namespace = {"__array__": array_impl}
+ if array_priority is not False:
+ class_namespace["__array_priority__"] = array_priority
+ for op in ops:
+ class_namespace["__{0}__".format(op)] = op_impl
+ class_namespace["__r{0}__".format(op)] = rop_impl
+ class_namespace["__i{0}__".format(op)] = iop_impl
+ if array_ufunc is not False:
+ class_namespace["__array_ufunc__"] = array_ufunc
+ eval_namespace = {"base": base,
+ "class_namespace": class_namespace,
+ "__name__": alleged_module,
+ }
+ MyType = eval("type('MyType', (base,), class_namespace)",
+ eval_namespace)
+ if issubclass(MyType, np.ndarray):
+ # Use this range to avoid special case weirdnesses around
+ # divide-by-0, pow(x, 2), overflow due to pow(big, big), etc.
+ return np.arange(3, 7).reshape(2, 2).view(MyType)
+ else:
+ return MyType()
+
+ def check(obj, binop_override_expected, ufunc_override_expected,
+ inplace_override_expected, check_scalar=True):
+ for op, (ufunc, has_inplace, dtype) in ops.items():
+ err_msg = ('op: %s, ufunc: %s, has_inplace: %s, dtype: %s'
+ % (op, ufunc, has_inplace, dtype))
+ check_objs = [np.arange(3, 7, dtype=dtype).reshape(2, 2)]
+ if check_scalar:
+ check_objs.append(check_objs[0][0])
+ for arr in check_objs:
+ arr_method = getattr(arr, "__{0}__".format(op))
+
+ def first_out_arg(result):
+ if op == "divmod":
+ assert_(isinstance(result, tuple))
+ return result[0]
+ else:
+ return result
+
+ # arr __op__ obj
+ if binop_override_expected:
+ assert_equal(arr_method(obj), NotImplemented, err_msg)
+ elif ufunc_override_expected:
+ assert_equal(arr_method(obj)[0], "__array_ufunc__",
+ err_msg)
+ else:
+ if (isinstance(obj, np.ndarray) and
+ (type(obj).__array_ufunc__ is
+ np.ndarray.__array_ufunc__)):
+ # __array__ gets ignored
+ res = first_out_arg(arr_method(obj))
+ assert_(res.__class__ is obj.__class__, err_msg)
+ else:
+ assert_raises((TypeError, Coerced),
+ arr_method, obj, err_msg=err_msg)
+ # obj __op__ arr
+ arr_rmethod = getattr(arr, "__r{0}__".format(op))
+ if ufunc_override_expected:
+ res = arr_rmethod(obj)
+ assert_equal(res[0], "__array_ufunc__",
+ err_msg=err_msg)
+ assert_equal(res[1], ufunc, err_msg=err_msg)
+ else:
+ if (isinstance(obj, np.ndarray) and
+ (type(obj).__array_ufunc__ is
+ np.ndarray.__array_ufunc__)):
+ # __array__ gets ignored
+ res = first_out_arg(arr_rmethod(obj))
+ assert_(res.__class__ is obj.__class__, err_msg)
+ else:
+ # __array_ufunc__ = "asdf" creates a TypeError
+ assert_raises((TypeError, Coerced),
+ arr_rmethod, obj, err_msg=err_msg)
+
+ # arr __iop__ obj
+ # array scalars don't have in-place operators
+ if has_inplace and isinstance(arr, np.ndarray):
+ arr_imethod = getattr(arr, "__i{0}__".format(op))
+ if inplace_override_expected:
+ assert_equal(arr_method(obj), NotImplemented,
+ err_msg=err_msg)
+ elif ufunc_override_expected:
+ res = arr_imethod(obj)
+ assert_equal(res[0], "__array_ufunc__", err_msg)
+ assert_equal(res[1], ufunc, err_msg)
+ assert_(type(res[-1]["out"]) is tuple, err_msg)
+ assert_(res[-1]["out"][0] is arr, err_msg)
+ else:
+ if (isinstance(obj, np.ndarray) and
+ (type(obj).__array_ufunc__ is
+ np.ndarray.__array_ufunc__)):
+ # __array__ gets ignored
+ assert_(arr_imethod(obj) is arr, err_msg)
+ else:
+ assert_raises((TypeError, Coerced),
+ arr_imethod, obj,
+ err_msg=err_msg)
+
+ op_fn = getattr(operator, op, None)
+ if op_fn is None:
+ op_fn = getattr(operator, op + "_", None)
+ if op_fn is None:
+ op_fn = getattr(builtins, op)
+ assert_equal(op_fn(obj, arr), "forward", err_msg)
+ if not isinstance(obj, np.ndarray):
+ if binop_override_expected:
+ assert_equal(op_fn(arr, obj), "reverse", err_msg)
+ elif ufunc_override_expected:
+ assert_equal(op_fn(arr, obj)[0], "__array_ufunc__",
+ err_msg)
+ if ufunc_override_expected:
+ assert_equal(ufunc(obj, arr)[0], "__array_ufunc__",
+ err_msg)
+
+ # No array priority, no array_ufunc -> nothing called
+ check(make_obj(object), False, False, False)
+ # Negative array priority, no array_ufunc -> nothing called
+ # (has to be very negative, because scalar priority is -1000000.0)
+ check(make_obj(object, array_priority=-2**30), False, False, False)
+ # Positive array priority, no array_ufunc -> binops and iops only
+ check(make_obj(object, array_priority=1), True, False, True)
+ # ndarray ignores array_priority for ndarray subclasses
+ check(make_obj(np.ndarray, array_priority=1), False, False, False,
+ check_scalar=False)
+ # Positive array_priority and array_ufunc -> array_ufunc only
+ check(make_obj(object, array_priority=1,
+ array_ufunc=array_ufunc_impl), False, True, False)
+ check(make_obj(np.ndarray, array_priority=1,
+ array_ufunc=array_ufunc_impl), False, True, False)
+ # array_ufunc set to None -> defer binops only
+ check(make_obj(object, array_ufunc=None), True, False, False)
+ check(make_obj(np.ndarray, array_ufunc=None), True, False, False,
+ check_scalar=False)
+
+ def test_ufunc_override_normalize_signature(self):
+ # gh-5674
+ class SomeClass(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kw):
+ return kw
+
+ a = SomeClass()
+ kw = np.add(a, [1])
+ assert_('sig' not in kw and 'signature' not in kw)
+ kw = np.add(a, [1], sig='ii->i')
+ assert_('sig' not in kw and 'signature' in kw)
+ assert_equal(kw['signature'], 'ii->i')
+ kw = np.add(a, [1], signature='ii->i')
+ assert_('sig' not in kw and 'signature' in kw)
+ assert_equal(kw['signature'], 'ii->i')
+
+ def test_array_ufunc_index(self):
+        # Check that the index is set appropriately, also if only an output
+        # is passed on (the latter is another regression test for github bug
+        # 4753). This also implicitly checks that 'out' is always a tuple.
+ class CheckIndex(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kw):
+ for i, a in enumerate(inputs):
+ if a is self:
+ return i
+                # not among the inputs, so we must be in an output.
+ for j, a in enumerate(kw['out']):
+ if a is self:
+ return (j,)
+
+ a = CheckIndex()
+ dummy = np.arange(2.)
+ # 1 input, 1 output
+ assert_equal(np.sin(a), 0)
+ assert_equal(np.sin(dummy, a), (0,))
+ assert_equal(np.sin(dummy, out=a), (0,))
+ assert_equal(np.sin(dummy, out=(a,)), (0,))
+ assert_equal(np.sin(a, a), 0)
+ assert_equal(np.sin(a, out=a), 0)
+ assert_equal(np.sin(a, out=(a,)), 0)
+ # 1 input, 2 outputs
+ assert_equal(np.modf(dummy, a), (0,))
+ assert_equal(np.modf(dummy, None, a), (1,))
+ assert_equal(np.modf(dummy, dummy, a), (1,))
+ assert_equal(np.modf(dummy, out=(a, None)), (0,))
+ assert_equal(np.modf(dummy, out=(a, dummy)), (0,))
+ assert_equal(np.modf(dummy, out=(None, a)), (1,))
+ assert_equal(np.modf(dummy, out=(dummy, a)), (1,))
+ assert_equal(np.modf(a, out=(dummy, a)), 0)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', DeprecationWarning)
+ assert_equal(np.modf(dummy, out=a), (0,))
+ assert_(w[0].category is DeprecationWarning)
+ assert_raises(ValueError, np.modf, dummy, out=(a,))
+
+ # 2 inputs, 1 output
+ assert_equal(np.add(a, dummy), 0)
+ assert_equal(np.add(dummy, a), 1)
+ assert_equal(np.add(dummy, dummy, a), (0,))
+ assert_equal(np.add(dummy, a, a), 1)
+ assert_equal(np.add(dummy, dummy, out=a), (0,))
+ assert_equal(np.add(dummy, dummy, out=(a,)), (0,))
+ assert_equal(np.add(a, dummy, out=a), 0)
+
+ def test_out_override(self):
+ # regression test for github bug 4753
+ class OutClass(np.ndarray):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kw):
+ if 'out' in kw:
+ tmp_kw = kw.copy()
+ tmp_kw.pop('out')
+ func = getattr(ufunc, method)
+ kw['out'][0][...] = func(*inputs, **tmp_kw)
+
+ A = np.array([0]).view(OutClass)
+ B = np.array([5])
+ C = np.array([6])
+ np.multiply(C, B, A)
+ assert_equal(A[0], 30)
+ assert_(isinstance(A, OutClass))
+ A[0] = 0
+ np.multiply(C, B, out=A)
+ assert_equal(A[0], 30)
+ assert_(isinstance(A, OutClass))
+
+ def test_pow_override_with_errors(self):
+ # regression test for gh-9112
+ class PowerOnly(np.ndarray):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kw):
+ if ufunc is not np.power:
+ raise NotImplementedError
+ return "POWER!"
+ # explicit cast to float, to ensure the fast power path is taken.
+ a = np.array(5., dtype=np.float64).view(PowerOnly)
+ assert_equal(a ** 2.5, "POWER!")
+ with assert_raises(NotImplementedError):
+ a ** 0.5
+ with assert_raises(NotImplementedError):
+ a ** 0
+ with assert_raises(NotImplementedError):
+ a ** 1
+ with assert_raises(NotImplementedError):
+ a ** -1
+ with assert_raises(NotImplementedError):
+ a ** 2
+
+ def test_pow_array_object_dtype(self):
+ # test pow on arrays of object dtype
+ class SomeClass(object):
+ def __init__(self, num=None):
+ self.num = num
+
+ # want to ensure a fast pow path is not taken
+ def __mul__(self, other):
+ raise AssertionError('__mul__ should not be called')
+
+ def __div__(self, other):
+ raise AssertionError('__div__ should not be called')
+
+ def __pow__(self, exp):
+ return SomeClass(num=self.num ** exp)
+
+ def __eq__(self, other):
+ if isinstance(other, SomeClass):
+ return self.num == other.num
+
+ __rpow__ = __pow__
+
+ def pow_for(exp, arr):
+ return np.array([x ** exp for x in arr])
+
+ obj_arr = np.array([SomeClass(1), SomeClass(2), SomeClass(3)])
+
+ assert_equal(obj_arr ** 0.5, pow_for(0.5, obj_arr))
+ assert_equal(obj_arr ** 0, pow_for(0, obj_arr))
+ assert_equal(obj_arr ** 1, pow_for(1, obj_arr))
+ assert_equal(obj_arr ** -1, pow_for(-1, obj_arr))
+ assert_equal(obj_arr ** 2, pow_for(2, obj_arr))
+
+ def test_pos_array_ufunc_override(self):
+ class A(np.ndarray):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ return getattr(ufunc, method)(*[i.view(np.ndarray) for
+ i in inputs], **kwargs)
+ tst = np.array('foo').view(A)
+ with assert_raises(TypeError):
+ +tst
+
+
+class TestTemporaryElide(object):
+ # elision is only triggered on relatively large arrays
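+    #
+    # Rough idea of what elision means here (illustrative note): in an
+    # expression such as ``(a + b) + c`` the intermediate ``a + b`` has only
+    # one reference, so its buffer may be reused for the second addition
+    # instead of allocating yet another temporary. This only happens for
+    # sufficiently large arrays and when no cast would be needed.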
+
+ def test_extension_incref_elide(self):
+ # test extension (e.g. cython) calling PyNumber_* slots without
+ # increasing the reference counts
+ #
+ # def incref_elide(a):
+ # d = input.copy() # refcount 1
+ # return d, d + d # PyNumber_Add without increasing refcount
+ from numpy.core._multiarray_tests import incref_elide
+ d = np.ones(100000)
+ orig, res = incref_elide(d)
+ d + d
+        # the returned original should not be changed by an inplace operation
+ assert_array_equal(orig, d)
+ assert_array_equal(res, d + d)
+
+ def test_extension_incref_elide_stack(self):
+        # scanning whether the refcount == 1 object is on the python stack (to
+        # check that we are called directly from python) is flawed, as the
+        # object may still be above the stack pointer and we have no access to
+        # the top of it
+ #
+ # def incref_elide_l(d):
+ # return l[4] + l[4] # PyNumber_Add without increasing refcount
+ from numpy.core._multiarray_tests import incref_elide_l
+ # padding with 1 makes sure the object on the stack is not overwritten
+ l = [1, 1, 1, 1, np.ones(100000)]
+ res = incref_elide_l(l)
+        # the returned original should not be changed by an inplace operation
+ assert_array_equal(l[4], np.ones(100000))
+ assert_array_equal(res, l[4] + l[4])
+
+ def test_temporary_with_cast(self):
+ # check that we don't elide into a temporary which would need casting
+ d = np.ones(200000, dtype=np.int64)
+ assert_equal(((d + d) + 2**222).dtype, np.dtype('O'))
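+        # (2**222 does not fit into an int64, so the sum is upcast to object
+        # dtype; reusing the int64 temporary would require a cast, hence no
+        # elision.)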
+
+ r = ((d + d) / 2)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = np.true_divide((d + d), 2)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = ((d + d) / 2.)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = ((d + d) // 2)
+ assert_equal(r.dtype, np.dtype(np.int64))
+
+ # commutative elision into the astype result
+ f = np.ones(100000, dtype=np.float32)
+ assert_equal(((f + f) + f.astype(np.float64)).dtype, np.dtype('f8'))
+
+ # no elision into lower type
+ d = f.astype(np.float64)
+ assert_equal(((f + f) + d).dtype, d.dtype)
+ l = np.ones(100000, dtype=np.longdouble)
+ assert_equal(((d + d) + l).dtype, l.dtype)
+
+ # test unary abs with different output dtype
+ for dt in (np.complex64, np.complex128, np.clongdouble):
+ c = np.ones(100000, dtype=dt)
+ r = abs(c * 2.0)
+ assert_equal(r.dtype, np.dtype('f%d' % (c.itemsize // 2)))
+
+ def test_elide_broadcast(self):
+        # test that there is no elision on broadcast to a higher dimension;
+        # this only triggers the elision code path in debug mode, as triggering
+        # it in normal mode would need a 256kb large matching dimension, i.e. a
+        # lot of memory
+ d = np.ones((2000, 1), dtype=int)
+ b = np.ones((2000), dtype=bool)
+ r = (1 - d) + b
+ assert_equal(r, 1)
+ assert_equal(r.shape, (2000, 2000))
+
+ def test_elide_scalar(self):
+ # check inplace op does not create ndarray from scalars
+ a = np.bool_()
+ assert_(type(~(a & a)) is np.bool_)
+
+ def test_elide_scalar_readonly(self):
+ # The imaginary part of a real array is readonly. This needs to go
+ # through fast_scalar_power which is only called for powers of
+ # +1, -1, 0, 0.5, and 2, so use 2. Also need valid refcount for
+ # elision which can be gotten for the imaginary part of a real
+ # array. Should not error.
+ a = np.empty(100000, dtype=np.float64)
+ a.imag ** 2
+
+ def test_elide_readonly(self):
+ # don't try to elide readonly temporaries
+ r = np.asarray(np.broadcast_to(np.zeros(1), 100000).flat) * 0.0
+ assert_equal(r, 0)
+
+ def test_elide_updateifcopy(self):
+ a = np.ones(2**20)[::2]
+ b = a.flat.__array__() + 1
+ del b
+ assert_equal(a, 1)
+
+
+class TestCAPI(object):
+ def test_IsPythonScalar(self):
+ from numpy.core._multiarray_tests import IsPythonScalar
+ assert_(IsPythonScalar(b'foobar'))
+ assert_(IsPythonScalar(1))
+ assert_(IsPythonScalar(2**80))
+ assert_(IsPythonScalar(2.))
+ assert_(IsPythonScalar("a"))
+
+
+class TestSubscripting(object):
+ def test_test_zero_rank(self):
+ x = np.array([1, 2, 3])
+ assert_(isinstance(x[0], np.int_))
+ if sys.version_info[0] < 3:
+ assert_(isinstance(x[0], int))
+ assert_(type(x[0, ...]) is np.ndarray)
+
+
+class TestPickling(object):
+ def test_highest_available_pickle_protocol(self):
+ try:
+ import pickle5
+ except ImportError:
+ pickle5 = None
+
+ if sys.version_info[:2] >= (3, 8) or pickle5 is not None:
+ assert pickle.HIGHEST_PROTOCOL >= 5
+ else:
+ assert pickle.HIGHEST_PROTOCOL < 5
+
+ @pytest.mark.skipif(pickle.HIGHEST_PROTOCOL >= 5,
+                        reason=('this tests the error messages when trying to '
+                                'use protocol 5 although it is not available'))
+ def test_correct_protocol5_error_message(self):
+ array = np.arange(10)
+
+ if sys.version_info[:2] in ((3, 6), (3, 7)):
+ # For the specific case of python3.6 and 3.7, raise a clear import
+ # error about the pickle5 backport when trying to use protocol=5
+ # without the pickle5 package
+ with pytest.raises(ImportError):
+ array.__reduce_ex__(5)
+
+ elif sys.version_info[:2] < (3, 6):
+            # when calling __reduce_ex__ explicitly with protocol=5 on these
+            # older python versions, a ValueError is raised saying that
+            # protocol 5 is not available for this python version
+ with pytest.raises(ValueError):
+ array.__reduce_ex__(5)
+
+ def test_record_array_with_object_dtype(self):
+ my_object = object()
+
+ arr_with_object = np.array(
+ [(my_object, 1, 2.0)],
+ dtype=[('a', object), ('b', int), ('c', float)])
+ arr_without_object = np.array(
+ [('xxx', 1, 2.0)],
+ dtype=[('a', str), ('b', int), ('c', float)])
+
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ depickled_arr_with_object = pickle.loads(
+ pickle.dumps(arr_with_object, protocol=proto))
+ depickled_arr_without_object = pickle.loads(
+ pickle.dumps(arr_without_object, protocol=proto))
+
+ assert_equal(arr_with_object.dtype,
+ depickled_arr_with_object.dtype)
+ assert_equal(arr_without_object.dtype,
+ depickled_arr_without_object.dtype)
+
+ @pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
+ reason="requires pickle protocol 5")
+ def test_f_contiguous_array(self):
+ f_contiguous_array = np.array([[1, 2, 3], [4, 5, 6]], order='F')
+ buffers = []
+
+ # When using pickle protocol 5, Fortran-contiguous arrays can be
+ # serialized using out-of-band buffers
+ bytes_string = pickle.dumps(f_contiguous_array, protocol=5,
+ buffer_callback=buffers.append)
+
+ assert len(buffers) > 0
+
+ depickled_f_contiguous_array = pickle.loads(bytes_string,
+ buffers=buffers)
+
+ assert_equal(f_contiguous_array, depickled_f_contiguous_array)
+
+ def test_non_contiguous_array(self):
+ non_contiguous_array = np.arange(12).reshape(3, 4)[:, :2]
+ assert not non_contiguous_array.flags.c_contiguous
+ assert not non_contiguous_array.flags.f_contiguous
+
+ # make sure non-contiguous arrays can be pickled-depickled
+ # using any protocol
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ depickled_non_contiguous_array = pickle.loads(
+ pickle.dumps(non_contiguous_array, protocol=proto))
+
+ assert_equal(non_contiguous_array, depickled_non_contiguous_array)
+
+ def test_roundtrip(self):
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ carray = np.array([[2, 9], [7, 0], [3, 8]])
+ DATA = [
+ carray,
+ np.transpose(carray),
+ np.array([('xxx', 1, 2.0)], dtype=[('a', (str, 3)), ('b', int),
+ ('c', float)])
+ ]
+
+ refs = [weakref.ref(a) for a in DATA]
+ for a in DATA:
+ assert_equal(
+ a, pickle.loads(pickle.dumps(a, protocol=proto)),
+ err_msg="%r" % a)
+ del a, DATA, carray
+ break_cycles()
+ # check for reference leaks (gh-12793)
+ for ref in refs:
+ assert ref() is None
+
+ def _loads(self, obj):
+ if sys.version_info[0] >= 3:
+ return pickle.loads(obj, encoding='latin1')
+ else:
+ return pickle.loads(obj)
+
+ # version 0 pickles, using protocol=2 to pickle
+ # version 0 doesn't have a version field
+ def test_version0_int8(self):
+ s = b'\x80\x02cnumpy.core._internal\n_reconstruct\nq\x01cnumpy\nndarray\nq\x02K\x00\x85U\x01b\x87Rq\x03(K\x04\x85cnumpy\ndtype\nq\x04U\x02i1K\x00K\x01\x87Rq\x05(U\x01|NNJ\xff\xff\xff\xffJ\xff\xff\xff\xfftb\x89U\x04\x01\x02\x03\x04tb.'
+ a = np.array([1, 2, 3, 4], dtype=np.int8)
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ def test_version0_float32(self):
+ s = b'\x80\x02cnumpy.core._internal\n_reconstruct\nq\x01cnumpy\nndarray\nq\x02K\x00\x85U\x01b\x87Rq\x03(K\x04\x85cnumpy\ndtype\nq\x04U\x02f4K\x00K\x01\x87Rq\x05(U\x01<NNJ\xff\xff\xff\xffJ\xff\xff\xff\xfftb\x89U\x10\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x80@tb.'
+ a = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ def test_version0_object(self):
+ s = b'\x80\x02cnumpy.core._internal\n_reconstruct\nq\x01cnumpy\nndarray\nq\x02K\x00\x85U\x01b\x87Rq\x03(K\x02\x85cnumpy\ndtype\nq\x04U\x02O8K\x00K\x01\x87Rq\x05(U\x01|NNJ\xff\xff\xff\xffJ\xff\xff\xff\xfftb\x89]q\x06(}q\x07U\x01aK\x01s}q\x08U\x01bK\x02setb.'
+ a = np.array([{'a': 1}, {'b': 2}])
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ # version 1 pickles, using protocol=2 to pickle
+ def test_version1_int8(self):
+ s = b'\x80\x02cnumpy.core._internal\n_reconstruct\nq\x01cnumpy\nndarray\nq\x02K\x00\x85U\x01b\x87Rq\x03(K\x01K\x04\x85cnumpy\ndtype\nq\x04U\x02i1K\x00K\x01\x87Rq\x05(K\x01U\x01|NNJ\xff\xff\xff\xffJ\xff\xff\xff\xfftb\x89U\x04\x01\x02\x03\x04tb.'
+ a = np.array([1, 2, 3, 4], dtype=np.int8)
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ def test_version1_float32(self):
+ s = b'\x80\x02cnumpy.core._internal\n_reconstruct\nq\x01cnumpy\nndarray\nq\x02K\x00\x85U\x01b\x87Rq\x03(K\x01K\x04\x85cnumpy\ndtype\nq\x04U\x02f4K\x00K\x01\x87Rq\x05(K\x01U\x01<NNJ\xff\xff\xff\xffJ\xff\xff\xff\xfftb\x89U\x10\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x80@tb.'
+ a = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ def test_version1_object(self):
+ s = b'\x80\x02cnumpy.core._internal\n_reconstruct\nq\x01cnumpy\nndarray\nq\x02K\x00\x85U\x01b\x87Rq\x03(K\x01K\x02\x85cnumpy\ndtype\nq\x04U\x02O8K\x00K\x01\x87Rq\x05(K\x01U\x01|NNJ\xff\xff\xff\xffJ\xff\xff\xff\xfftb\x89]q\x06(}q\x07U\x01aK\x01s}q\x08U\x01bK\x02setb.'
+ a = np.array([{'a': 1}, {'b': 2}])
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ def test_subarray_int_shape(self):
+ s = b"cnumpy.core.multiarray\n_reconstruct\np0\n(cnumpy\nndarray\np1\n(I0\ntp2\nS'b'\np3\ntp4\nRp5\n(I1\n(I1\ntp6\ncnumpy\ndtype\np7\n(S'V6'\np8\nI0\nI1\ntp9\nRp10\n(I3\nS'|'\np11\nN(S'a'\np12\ng3\ntp13\n(dp14\ng12\n(g7\n(S'V4'\np15\nI0\nI1\ntp16\nRp17\n(I3\nS'|'\np18\n(g7\n(S'i1'\np19\nI0\nI1\ntp20\nRp21\n(I3\nS'|'\np22\nNNNI-1\nI-1\nI0\ntp23\nb(I2\nI2\ntp24\ntp25\nNNI4\nI1\nI0\ntp26\nbI0\ntp27\nsg3\n(g7\n(S'V2'\np28\nI0\nI1\ntp29\nRp30\n(I3\nS'|'\np31\n(g21\nI2\ntp32\nNNI2\nI1\nI0\ntp33\nbI4\ntp34\nsI6\nI1\nI0\ntp35\nbI00\nS'\\x01\\x01\\x01\\x01\\x01\\x02'\np36\ntp37\nb."
+ a = np.array([(1, (1, 2))], dtype=[('a', 'i1', (2, 2)), ('b', 'i1', 2)])
+ p = self._loads(s)
+ assert_equal(a, p)
+
+ def test_datetime64_byteorder(self):
+ original = np.array([['2015-02-24T00:00:00.000000000']], dtype='datetime64[ns]')
+
+ original_byte_reversed = original.copy(order='K')
+ original_byte_reversed.dtype = original_byte_reversed.dtype.newbyteorder('S')
+ original_byte_reversed.byteswap(inplace=True)
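+        # `original_byte_reversed` holds the same datetime values stored with
+        # the opposite byte order; it should still survive a pickle round trip.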
+
+ new = pickle.loads(pickle.dumps(original_byte_reversed))
+
+ assert_equal(original.dtype, new.dtype)
+
+ def test_py2_can_load_py3_pickle_with_dtype_field_names(self):
+ # gh-2407 and PR #14275
+ # Py2 should be able to load a pickle that was created in PY3
+ # when the pickle contains a structured dtype with field names
+ import numpy as np
+
+ expected_dtype = np.dtype([('SPOT', np.float64)])
+ expected_data = np.array([(6.0)], dtype=expected_dtype)
+ # Pickled under Python 3.6.5 with protocol=2 by the code below:
+ # pickle.dumps(expected_data, protocol=2)
+ saved_pickle_from_py3 = b'''\
+\x80\x02cnumpy.core.multiarray\n_reconstruct\nq\x00cnumpy\nndarray\n\
+q\x01K\x00\x85q\x02c_codecs\nencode\nq\x03X\x01\x00\x00\x00bq\x04X\
+\x06\x00\x00\x00latin1q\x05\x86q\x06Rq\x07\x87q\x08Rq\t(K\x01K\x01\
+\x85q\ncnumpy\ndtype\nq\x0bX\x02\x00\x00\x00V8q\x0cK\x00K\x01\x87q\
+\rRq\x0e(K\x03X\x01\x00\x00\x00|q\x0fNX\x04\x00\x00\x00SPOTq\x10\
+\x85q\x11}q\x12h\x10h\x0bX\x02\x00\x00\x00f8q\x13K\x00K\x01\x87\
+q\x14Rq\x15(K\x03X\x01\x00\x00\x00<q\x16NNNJ\xff\xff\xff\xffJ\xff\
+\xff\xff\xffK\x00tq\x17bK\x00\x86q\x18sK\x08K\x01K\x10tq\x19b\x89h\
+\x03X\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18@q\x1ah\x05\x86q\
+\x1bRq\x1ctq\x1db.\
+'''
+
+ if sys.version_info[0] < 3: # PY2
+ assert pickle.loads(saved_pickle_from_py3) == expected_data
+ else:
+ # check that the string above is what we claim on PY3
+ py3_pickle_dump = pickle.dumps(expected_data, protocol=2)
+ assert py3_pickle_dump == saved_pickle_from_py3
+
+ def test_py3_can_load_py2_pickle_with_dtype_field_names(self):
+ # gh-2407 and PR #14275
+ # Roundtrip: Py3 should load a pickle that was created in PY2
+ # after loading the saved_pickle (from PY3) in the test named
+ # 'test_py2_can_load_py3_pickle_with_dtype_field_names'
+ import numpy as np
+
+ expected_dtype = np.dtype([('SPOT', np.float64)])
+ expected = np.array([(6.0)], dtype=expected_dtype)
+ # Pickled under Python 2.7.16 with protocol=2 after it was loaded
+ # by test 'test_py2_can_load_py3_pickle_with_dtype_field_names'
+ pickle_from_py2 = b'''\
+\x80\x02cnumpy.core.multiarray\n_reconstruct\nq\x01cnumpy\nndarray\n\
+q\x02K\x00\x85U\x01b\x87Rq\x03(K\x01K\x01\x85cnumpy\ndtype\nq\x04U\x02\
+V8K\x00K\x01\x87Rq\x05(K\x03U\x01|NU\x04SPOTq\x06\x85q\x07}q\x08h\x06h\
+\x04U\x02f8K\x00K\x01\x87Rq\t(K\x03U\x01<NNNJ\xff\xff\xff\xffJ\xff\xff\
+\xff\xffK\x00tbK\x00\x86sK\x08K\x01K\x10tb\x89U\x08\x00\x00\x00\x00\x00\
+\x00\x18@tb.\
+'''
+
+ if sys.version_info[0] >= 3: # PY3
+ assert pickle.loads(pickle_from_py2) == expected
+ else:
+ # check that the string above is what we claim on PY2
+ if sys.platform.startswith('linux') and not IS_PYPY:
+ assert pickle.dumps(expected, protocol=2) == pickle_from_py2
+
+
+class TestFancyIndexing(object):
+ def test_list(self):
+ x = np.ones((1, 1))
+ x[:, [0]] = 2.0
+ assert_array_equal(x, np.array([[2.0]]))
+
+ x = np.ones((1, 1, 1))
+ x[:, :, [0]] = 2.0
+ assert_array_equal(x, np.array([[[2.0]]]))
+
+ def test_tuple(self):
+ x = np.ones((1, 1))
+ x[:, (0,)] = 2.0
+ assert_array_equal(x, np.array([[2.0]]))
+ x = np.ones((1, 1, 1))
+ x[:, :, (0,)] = 2.0
+ assert_array_equal(x, np.array([[[2.0]]]))
+
+ def test_mask(self):
+ x = np.array([1, 2, 3, 4])
+ m = np.array([0, 1, 0, 0], bool)
+ assert_array_equal(x[m], np.array([2]))
+
+ def test_mask2(self):
+ x = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+ m = np.array([0, 1], bool)
+ m2 = np.array([[0, 1, 0, 0], [1, 0, 0, 0]], bool)
+ m3 = np.array([[0, 1, 0, 0], [0, 0, 0, 0]], bool)
+ assert_array_equal(x[m], np.array([[5, 6, 7, 8]]))
+ assert_array_equal(x[m2], np.array([2, 5]))
+ assert_array_equal(x[m3], np.array([2]))
+
+ def test_assign_mask(self):
+ x = np.array([1, 2, 3, 4])
+ m = np.array([0, 1, 0, 0], bool)
+ x[m] = 5
+ assert_array_equal(x, np.array([1, 5, 3, 4]))
+
+ def test_assign_mask2(self):
+ xorig = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+ m = np.array([0, 1], bool)
+ m2 = np.array([[0, 1, 0, 0], [1, 0, 0, 0]], bool)
+ m3 = np.array([[0, 1, 0, 0], [0, 0, 0, 0]], bool)
+ x = xorig.copy()
+ x[m] = 10
+ assert_array_equal(x, np.array([[1, 2, 3, 4], [10, 10, 10, 10]]))
+ x = xorig.copy()
+ x[m2] = 10
+ assert_array_equal(x, np.array([[1, 10, 3, 4], [10, 6, 7, 8]]))
+ x = xorig.copy()
+ x[m3] = 10
+ assert_array_equal(x, np.array([[1, 10, 3, 4], [5, 6, 7, 8]]))
+
+
+class TestStringCompare(object):
+ def test_string(self):
+ g1 = np.array(["This", "is", "example"])
+ g2 = np.array(["This", "was", "example"])
+ assert_array_equal(g1 == g2, [g1[i] == g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 != g2, [g1[i] != g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 <= g2, [g1[i] <= g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 >= g2, [g1[i] >= g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 < g2, [g1[i] < g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 > g2, [g1[i] > g2[i] for i in [0, 1, 2]])
+
+ def test_mixed(self):
+ g1 = np.array(["spam", "spa", "spammer", "and eggs"])
+ g2 = "spam"
+ assert_array_equal(g1 == g2, [x == g2 for x in g1])
+ assert_array_equal(g1 != g2, [x != g2 for x in g1])
+ assert_array_equal(g1 < g2, [x < g2 for x in g1])
+ assert_array_equal(g1 > g2, [x > g2 for x in g1])
+ assert_array_equal(g1 <= g2, [x <= g2 for x in g1])
+ assert_array_equal(g1 >= g2, [x >= g2 for x in g1])
+
+ def test_unicode(self):
+ g1 = np.array([u"This", u"is", u"example"])
+ g2 = np.array([u"This", u"was", u"example"])
+ assert_array_equal(g1 == g2, [g1[i] == g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 != g2, [g1[i] != g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 <= g2, [g1[i] <= g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 >= g2, [g1[i] >= g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 < g2, [g1[i] < g2[i] for i in [0, 1, 2]])
+ assert_array_equal(g1 > g2, [g1[i] > g2[i] for i in [0, 1, 2]])
+
+
+class TestArgmax(object):
+
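+    # Pairs of (array-like, expected argmax index). The float/complex cases
+    # expect the first NaN to win, while the datetime/timedelta cases expect
+    # NaT entries to be ignored.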
+ nan_arr = [
+ ([0, 1, 2, 3, np.nan], 4),
+ ([0, 1, 2, np.nan, 3], 3),
+ ([np.nan, 0, 1, 2, 3], 0),
+ ([np.nan, 0, np.nan, 2, 3], 0),
+ ([0, 1, 2, 3, complex(0, np.nan)], 4),
+ ([0, 1, 2, 3, complex(np.nan, 0)], 4),
+ ([0, 1, 2, complex(np.nan, 0), 3], 3),
+ ([0, 1, 2, complex(0, np.nan), 3], 3),
+ ([complex(0, np.nan), 0, 1, 2, 3], 0),
+ ([complex(np.nan, np.nan), 0, 1, 2, 3], 0),
+ ([complex(np.nan, 0), complex(np.nan, 2), complex(np.nan, 1)], 0),
+ ([complex(np.nan, np.nan), complex(np.nan, 2), complex(np.nan, 1)], 0),
+ ([complex(np.nan, 0), complex(np.nan, 2), complex(np.nan, np.nan)], 0),
+
+ ([complex(0, 0), complex(0, 2), complex(0, 1)], 1),
+ ([complex(1, 0), complex(0, 2), complex(0, 1)], 0),
+ ([complex(1, 0), complex(0, 2), complex(1, 1)], 2),
+
+ ([np.datetime64('1923-04-14T12:43:12'),
+ np.datetime64('1994-06-21T14:43:15'),
+ np.datetime64('2001-10-15T04:10:32'),
+ np.datetime64('1995-11-25T16:02:16'),
+ np.datetime64('2005-01-04T03:14:12'),
+ np.datetime64('2041-12-03T14:05:03')], 5),
+ ([np.datetime64('1935-09-14T04:40:11'),
+ np.datetime64('1949-10-12T12:32:11'),
+ np.datetime64('2010-01-03T05:14:12'),
+ np.datetime64('2015-11-20T12:20:59'),
+ np.datetime64('1932-09-23T10:10:13'),
+ np.datetime64('2014-10-10T03:50:30')], 3),
+ # Assorted tests with NaTs
+ ([np.datetime64('NaT'),
+ np.datetime64('NaT'),
+ np.datetime64('2010-01-03T05:14:12'),
+ np.datetime64('NaT'),
+ np.datetime64('2015-09-23T10:10:13'),
+ np.datetime64('1932-10-10T03:50:30')], 4),
+ ([np.datetime64('2059-03-14T12:43:12'),
+ np.datetime64('1996-09-21T14:43:15'),
+ np.datetime64('NaT'),
+ np.datetime64('2022-12-25T16:02:16'),
+ np.datetime64('1963-10-04T03:14:12'),
+ np.datetime64('2013-05-08T18:15:23')], 0),
+ ([np.timedelta64(2, 's'),
+ np.timedelta64(1, 's'),
+ np.timedelta64('NaT', 's'),
+ np.timedelta64(3, 's')], 3),
+ ([np.timedelta64('NaT', 's')] * 3, 0),
+
+ ([timedelta(days=5, seconds=14), timedelta(days=2, seconds=35),
+ timedelta(days=-1, seconds=23)], 0),
+ ([timedelta(days=1, seconds=43), timedelta(days=10, seconds=5),
+ timedelta(days=5, seconds=14)], 1),
+ ([timedelta(days=10, seconds=24), timedelta(days=10, seconds=5),
+ timedelta(days=10, seconds=43)], 2),
+
+ ([False, False, False, False, True], 4),
+ ([False, False, False, True, False], 3),
+ ([True, False, False, False, False], 0),
+ ([True, False, True, False, False], 0),
+ ]
+
+ def test_all(self):
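+        # For each axis, combining argmax with `choose` on the transposed
+        # array should reproduce the values returned by max along that axis.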
+ a = np.random.normal(0, 1, (4, 5, 6, 7, 8))
+ for i in range(a.ndim):
+ amax = a.max(i)
+ aargmax = a.argmax(i)
+ axes = list(range(a.ndim))
+ axes.remove(i)
+ assert_(np.all(amax == aargmax.choose(*a.transpose(i,*axes))))
+
+ def test_combinations(self):
+ for arr, pos in self.nan_arr:
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning,
+ "invalid value encountered in reduce")
+ max_val = np.max(arr)
+
+ assert_equal(np.argmax(arr), pos, err_msg="%r" % arr)
+ assert_equal(arr[np.argmax(arr)], max_val, err_msg="%r" % arr)
+
+ def test_output_shape(self):
+ # see also gh-616
+ a = np.ones((10, 5))
+ # Check some simple shape mismatches
+ out = np.ones(11, dtype=np.int_)
+ assert_raises(ValueError, a.argmax, -1, out)
+
+ out = np.ones((2, 5), dtype=np.int_)
+ assert_raises(ValueError, a.argmax, -1, out)
+
+        # these could possibly be relaxed (the previous cases used to be allowed)
+ out = np.ones((1, 10), dtype=np.int_)
+ assert_raises(ValueError, a.argmax, -1, out)
+
+ out = np.ones(10, dtype=np.int_)
+ a.argmax(-1, out=out)
+ assert_equal(out, a.argmax(-1))
+
+ def test_argmax_unicode(self):
+ d = np.zeros(6031, dtype='<U9')
+ d[5942] = "as"
+ assert_equal(d.argmax(), 5942)
+
+ def test_np_vs_ndarray(self):
+ # make sure both ndarray.argmax and numpy.argmax support out/axis args
+ a = np.random.normal(size=(2,3))
+
+ # check positional args
+ out1 = np.zeros(2, dtype=int)
+ out2 = np.zeros(2, dtype=int)
+ assert_equal(a.argmax(1, out1), np.argmax(a, 1, out2))
+ assert_equal(out1, out2)
+
+ # check keyword args
+ out1 = np.zeros(3, dtype=int)
+ out2 = np.zeros(3, dtype=int)
+ assert_equal(a.argmax(out=out1, axis=0), np.argmax(a, out=out2, axis=0))
+ assert_equal(out1, out2)
+
+ def test_object_argmax_with_NULLs(self):
+ # See gh-6032
+ a = np.empty(4, dtype='O')
+ ctypes.memset(a.ctypes.data, 0, a.nbytes)
+ assert_equal(a.argmax(), 0)
+ a[3] = 10
+ assert_equal(a.argmax(), 3)
+ a[1] = 30
+ assert_equal(a.argmax(), 1)
+
+
+class TestArgmin(object):
+
+ nan_arr = [
+ ([0, 1, 2, 3, np.nan], 4),
+ ([0, 1, 2, np.nan, 3], 3),
+ ([np.nan, 0, 1, 2, 3], 0),
+ ([np.nan, 0, np.nan, 2, 3], 0),
+ ([0, 1, 2, 3, complex(0, np.nan)], 4),
+ ([0, 1, 2, 3, complex(np.nan, 0)], 4),
+ ([0, 1, 2, complex(np.nan, 0), 3], 3),
+ ([0, 1, 2, complex(0, np.nan), 3], 3),
+ ([complex(0, np.nan), 0, 1, 2, 3], 0),
+ ([complex(np.nan, np.nan), 0, 1, 2, 3], 0),
+ ([complex(np.nan, 0), complex(np.nan, 2), complex(np.nan, 1)], 0),
+ ([complex(np.nan, np.nan), complex(np.nan, 2), complex(np.nan, 1)], 0),
+ ([complex(np.nan, 0), complex(np.nan, 2), complex(np.nan, np.nan)], 0),
+
+ ([complex(0, 0), complex(0, 2), complex(0, 1)], 0),
+ ([complex(1, 0), complex(0, 2), complex(0, 1)], 2),
+ ([complex(1, 0), complex(0, 2), complex(1, 1)], 1),
+
+ ([np.datetime64('1923-04-14T12:43:12'),
+ np.datetime64('1994-06-21T14:43:15'),
+ np.datetime64('2001-10-15T04:10:32'),
+ np.datetime64('1995-11-25T16:02:16'),
+ np.datetime64('2005-01-04T03:14:12'),
+ np.datetime64('2041-12-03T14:05:03')], 0),
+ ([np.datetime64('1935-09-14T04:40:11'),
+ np.datetime64('1949-10-12T12:32:11'),
+ np.datetime64('2010-01-03T05:14:12'),
+ np.datetime64('2014-11-20T12:20:59'),
+ np.datetime64('2015-09-23T10:10:13'),
+ np.datetime64('1932-10-10T03:50:30')], 5),
+ # Assorted tests with NaTs
+ ([np.datetime64('NaT'),
+ np.datetime64('NaT'),
+ np.datetime64('2010-01-03T05:14:12'),
+ np.datetime64('NaT'),
+ np.datetime64('2015-09-23T10:10:13'),
+ np.datetime64('1932-10-10T03:50:30')], 5),
+ ([np.datetime64('2059-03-14T12:43:12'),
+ np.datetime64('1996-09-21T14:43:15'),
+ np.datetime64('NaT'),
+ np.datetime64('2022-12-25T16:02:16'),
+ np.datetime64('1963-10-04T03:14:12'),
+ np.datetime64('2013-05-08T18:15:23')], 4),
+ ([np.timedelta64(2, 's'),
+ np.timedelta64(1, 's'),
+ np.timedelta64('NaT', 's'),
+ np.timedelta64(3, 's')], 1),
+ ([np.timedelta64('NaT', 's')] * 3, 0),
+
+ ([timedelta(days=5, seconds=14), timedelta(days=2, seconds=35),
+ timedelta(days=-1, seconds=23)], 2),
+ ([timedelta(days=1, seconds=43), timedelta(days=10, seconds=5),
+ timedelta(days=5, seconds=14)], 0),
+ ([timedelta(days=10, seconds=24), timedelta(days=10, seconds=5),
+ timedelta(days=10, seconds=43)], 1),
+
+ ([True, True, True, True, False], 4),
+ ([True, True, True, False, True], 3),
+ ([False, True, True, True, True], 0),
+ ([False, True, False, True, True], 0),
+ ]
+
+ def test_all(self):
+ a = np.random.normal(0, 1, (4, 5, 6, 7, 8))
+ for i in range(a.ndim):
+ amin = a.min(i)
+ aargmin = a.argmin(i)
+ axes = list(range(a.ndim))
+ axes.remove(i)
+ assert_(np.all(amin == aargmin.choose(*a.transpose(i,*axes))))
+
+ def test_combinations(self):
+ for arr, pos in self.nan_arr:
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning,
+ "invalid value encountered in reduce")
+ min_val = np.min(arr)
+
+ assert_equal(np.argmin(arr), pos, err_msg="%r" % arr)
+ assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr)
+
+ def test_minimum_signed_integers(self):
+
+ a = np.array([1, -2**7, -2**7 + 1], dtype=np.int8)
+ assert_equal(np.argmin(a), 1)
+
+ a = np.array([1, -2**15, -2**15 + 1], dtype=np.int16)
+ assert_equal(np.argmin(a), 1)
+
+ a = np.array([1, -2**31, -2**31 + 1], dtype=np.int32)
+ assert_equal(np.argmin(a), 1)
+
+ a = np.array([1, -2**63, -2**63 + 1], dtype=np.int64)
+ assert_equal(np.argmin(a), 1)
+
+ def test_output_shape(self):
+ # see also gh-616
+ a = np.ones((10, 5))
+ # Check some simple shape mismatches
+ out = np.ones(11, dtype=np.int_)
+ assert_raises(ValueError, a.argmin, -1, out)
+
+ out = np.ones((2, 5), dtype=np.int_)
+ assert_raises(ValueError, a.argmin, -1, out)
+
+        # these could possibly be relaxed (the previous cases used to be allowed)
+ out = np.ones((1, 10), dtype=np.int_)
+ assert_raises(ValueError, a.argmin, -1, out)
+
+ out = np.ones(10, dtype=np.int_)
+ a.argmin(-1, out=out)
+ assert_equal(out, a.argmin(-1))
+
+ def test_argmin_unicode(self):
+ d = np.ones(6031, dtype='<U9')
+ d[6001] = "0"
+ assert_equal(d.argmin(), 6001)
+
+ def test_np_vs_ndarray(self):
+ # make sure both ndarray.argmin and numpy.argmin support out/axis args
+ a = np.random.normal(size=(2, 3))
+
+ # check positional args
+ out1 = np.zeros(2, dtype=int)
+ out2 = np.ones(2, dtype=int)
+ assert_equal(a.argmin(1, out1), np.argmin(a, 1, out2))
+ assert_equal(out1, out2)
+
+ # check keyword args
+ out1 = np.zeros(3, dtype=int)
+ out2 = np.ones(3, dtype=int)
+ assert_equal(a.argmin(out=out1, axis=0), np.argmin(a, out=out2, axis=0))
+ assert_equal(out1, out2)
+
+ def test_object_argmin_with_NULLs(self):
+ # See gh-6032
+ a = np.empty(4, dtype='O')
+ ctypes.memset(a.ctypes.data, 0, a.nbytes)
+ assert_equal(a.argmin(), 0)
+ a[3] = 30
+ assert_equal(a.argmin(), 3)
+ a[1] = 10
+ assert_equal(a.argmin(), 1)
+
+
+class TestMinMax(object):
+
+ def test_scalar(self):
+ assert_raises(np.AxisError, np.amax, 1, 1)
+ assert_raises(np.AxisError, np.amin, 1, 1)
+
+ assert_equal(np.amax(1, axis=0), 1)
+ assert_equal(np.amin(1, axis=0), 1)
+ assert_equal(np.amax(1, axis=None), 1)
+ assert_equal(np.amin(1, axis=None), 1)
+
+ def test_axis(self):
+ assert_raises(np.AxisError, np.amax, [1, 2, 3], 1000)
+ assert_equal(np.amax([[1, 2, 3]], axis=1), 3)
+
+ def test_datetime(self):
+ # NaTs are ignored
+ for dtype in ('m8[s]', 'm8[Y]'):
+ a = np.arange(10).astype(dtype)
+ a[3] = 'NaT'
+ assert_equal(np.amin(a), a[0])
+ assert_equal(np.amax(a), a[9])
+ a[0] = 'NaT'
+ assert_equal(np.amin(a), a[1])
+ assert_equal(np.amax(a), a[9])
+ a.fill('NaT')
+ assert_equal(np.amin(a), a[0])
+ assert_equal(np.amax(a), a[0])
+
+
+class TestNewaxis(object):
+ def test_basic(self):
+ sk = np.array([0, -0.1, 0.1])
+ res = 250*sk[:, np.newaxis]
+ assert_almost_equal(res.ravel(), 250*sk)
+
+
+class TestClip(object):
+ def _check_range(self, x, cmin, cmax):
+ assert_(np.all(x >= cmin))
+ assert_(np.all(x <= cmax))
+
+ def _clip_type(self, type_group, array_max,
+ clip_min, clip_max, inplace=False,
+ expected_min=None, expected_max=None):
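+        # Helper: clip random data for every dtype in `type_group` (in both
+        # native and swapped byte order) and check that the result stays
+        # within [expected_min, expected_max].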
+ if expected_min is None:
+ expected_min = clip_min
+ if expected_max is None:
+ expected_max = clip_max
+
+ for T in np.sctypes[type_group]:
+ if sys.byteorder == 'little':
+ byte_orders = ['=', '>']
+ else:
+ byte_orders = ['<', '=']
+
+ for byteorder in byte_orders:
+ dtype = np.dtype(T).newbyteorder(byteorder)
+
+ x = (np.random.random(1000) * array_max).astype(dtype)
+ if inplace:
+ x.clip(clip_min, clip_max, x)
+ else:
+ x = x.clip(clip_min, clip_max)
+ byteorder = '='
+
+ if x.dtype.byteorder == '|':
+ byteorder = '|'
+ assert_equal(x.dtype.byteorder, byteorder)
+ self._check_range(x, expected_min, expected_max)
+ return x
+
+ def test_basic(self):
+ for inplace in [False, True]:
+ self._clip_type(
+ 'float', 1024, -12.8, 100.2, inplace=inplace)
+ self._clip_type(
+ 'float', 1024, 0, 0, inplace=inplace)
+
+ self._clip_type(
+ 'int', 1024, -120, 100.5, inplace=inplace)
+ self._clip_type(
+ 'int', 1024, 0, 0, inplace=inplace)
+
+ self._clip_type(
+ 'uint', 1024, 0, 0, inplace=inplace)
+ self._clip_type(
+ 'uint', 1024, -120, 100, inplace=inplace, expected_min=0)
+
+ def test_record_array(self):
+ rec = np.array([(-5, 2.0, 3.0), (5.0, 4.0, 3.0)],
+ dtype=[('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
+ y = rec['x'].clip(-0.3, 0.5)
+ self._check_range(y, -0.3, 0.5)
+
+ def test_max_or_min(self):
+ val = np.array([0, 1, 2, 3, 4, 5, 6, 7])
+ x = val.clip(3)
+ assert_(np.all(x >= 3))
+ x = val.clip(min=3)
+ assert_(np.all(x >= 3))
+ x = val.clip(max=4)
+ assert_(np.all(x <= 4))
+
+ def test_nan(self):
+ input_arr = np.array([-2., np.nan, 0.5, 3., 0.25, np.nan])
+ result = input_arr.clip(-1, 1)
+ expected = np.array([-1., np.nan, 0.5, 1., 0.25, np.nan])
+ assert_array_equal(result, expected)
+
+
+class TestCompress(object):
+ def test_axis(self):
+ tgt = [[5, 6, 7, 8, 9]]
+ arr = np.arange(10).reshape(2, 5)
+ out = np.compress([0, 1], arr, axis=0)
+ assert_equal(out, tgt)
+
+ tgt = [[1, 3], [6, 8]]
+ out = np.compress([0, 1, 0, 1, 0], arr, axis=1)
+ assert_equal(out, tgt)
+
+ def test_truncate(self):
+ tgt = [[1], [6]]
+ arr = np.arange(10).reshape(2, 5)
+ out = np.compress([0, 1], arr, axis=1)
+ assert_equal(out, tgt)
+
+ def test_flatten(self):
+ arr = np.arange(10).reshape(2, 5)
+ out = np.compress([0, 1], arr)
+ assert_equal(out, 1)
+
+
+class TestPutmask(object):
+ def tst_basic(self, x, T, mask, val):
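+        # Helper: np.putmask should replace exactly the masked elements with
+        # `val` converted to dtype T, leaving the dtype itself unchanged.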
+ np.putmask(x, mask, val)
+ assert_equal(x[mask], T(val))
+ assert_equal(x.dtype, T)
+
+ def test_ip_types(self):
+ unchecked_types = [bytes, unicode, np.void, object]
+
+ x = np.random.random(1000)*100
+ mask = x < 40
+
+ for val in [-100, 0, 15]:
+ for types in np.sctypes.values():
+ for T in types:
+ if T not in unchecked_types:
+ self.tst_basic(x.copy().astype(T), T, mask, val)
+
+ def test_mask_size(self):
+ assert_raises(ValueError, np.putmask, np.array([1, 2, 3]), [True], 5)
+
+ @pytest.mark.parametrize('dtype', ('>i4', '<i4'))
+ def test_byteorder(self, dtype):
+ x = np.array([1, 2, 3], dtype)
+ np.putmask(x, [True, False, True], -1)
+ assert_array_equal(x, [-1, 2, -1])
+
+ def test_record_array(self):
+ # Note mixed byteorder.
+ rec = np.array([(-5, 2.0, 3.0), (5.0, 4.0, 3.0)],
+ dtype=[('x', '<f8'), ('y', '>f8'), ('z', '<f8')])
+ np.putmask(rec['x'], [True, False], 10)
+ assert_array_equal(rec['x'], [10, 5])
+ assert_array_equal(rec['y'], [2, 4])
+ assert_array_equal(rec['z'], [3, 3])
+ np.putmask(rec['y'], [True, False], 11)
+ assert_array_equal(rec['x'], [10, 5])
+ assert_array_equal(rec['y'], [11, 4])
+ assert_array_equal(rec['z'], [3, 3])
+
+
+class TestTake(object):
+ def tst_basic(self, x):
+ ind = list(range(x.shape[0]))
+ assert_array_equal(x.take(ind, axis=0), x)
+
+ def test_ip_types(self):
+ unchecked_types = [bytes, unicode, np.void, object]
+
+ x = np.random.random(24)*100
+ x.shape = 2, 3, 4
+ for types in np.sctypes.values():
+ for T in types:
+ if T not in unchecked_types:
+ self.tst_basic(x.copy().astype(T))
+
+ def test_raise(self):
+ x = np.random.random(24)*100
+ x.shape = 2, 3, 4
+ assert_raises(IndexError, x.take, [0, 1, 2], axis=0)
+ assert_raises(IndexError, x.take, [-3], axis=0)
+ assert_array_equal(x.take([-1], axis=0)[0], x[1])
+
+ def test_clip(self):
+ x = np.random.random(24)*100
+ x.shape = 2, 3, 4
+ assert_array_equal(x.take([-1], axis=0, mode='clip')[0], x[0])
+ assert_array_equal(x.take([2], axis=0, mode='clip')[0], x[1])
+
+ def test_wrap(self):
+ x = np.random.random(24)*100
+ x.shape = 2, 3, 4
+ assert_array_equal(x.take([-1], axis=0, mode='wrap')[0], x[1])
+ assert_array_equal(x.take([2], axis=0, mode='wrap')[0], x[0])
+ assert_array_equal(x.take([3], axis=0, mode='wrap')[0], x[1])
+
+ @pytest.mark.parametrize('dtype', ('>i4', '<i4'))
+ def test_byteorder(self, dtype):
+ x = np.array([1, 2, 3], dtype)
+ assert_array_equal(x.take([0, 2, 1]), [1, 3, 2])
+
+ def test_record_array(self):
+ # Note mixed byteorder.
+ rec = np.array([(-5, 2.0, 3.0), (5.0, 4.0, 3.0)],
+ dtype=[('x', '<f8'), ('y', '>f8'), ('z', '<f8')])
+ rec1 = rec.take([1])
+ assert_(rec1['x'] == 5.0 and rec1['y'] == 4.0)
+
+
+class TestLexsort(object):
+ def test_basic(self):
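+        # np.lexsort sorts by the last key first: `a` is the primary key
+        # here and `b` only breaks ties.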
+ a = [1, 2, 1, 3, 1, 5]
+ b = [0, 4, 5, 6, 2, 3]
+ idx = np.lexsort((b, a))
+ expected_idx = np.array([0, 4, 2, 1, 3, 5])
+ assert_array_equal(idx, expected_idx)
+
+ x = np.vstack((b, a))
+ idx = np.lexsort(x)
+ assert_array_equal(idx, expected_idx)
+
+ assert_array_equal(x[1][idx], np.sort(x[1]))
+
+ def test_datetime(self):
+ a = np.array([0,0,0], dtype='datetime64[D]')
+ b = np.array([2,1,0], dtype='datetime64[D]')
+ idx = np.lexsort((b, a))
+ expected_idx = np.array([2, 1, 0])
+ assert_array_equal(idx, expected_idx)
+
+ a = np.array([0,0,0], dtype='timedelta64[D]')
+ b = np.array([2,1,0], dtype='timedelta64[D]')
+ idx = np.lexsort((b, a))
+ expected_idx = np.array([2, 1, 0])
+ assert_array_equal(idx, expected_idx)
+
+ def test_object(self): # gh-6312
+ a = np.random.choice(10, 1000)
+ b = np.random.choice(['abc', 'xy', 'wz', 'efghi', 'qwst', 'x'], 1000)
+
+ for u in a, b:
+ left = np.lexsort((u.astype('O'),))
+ right = np.argsort(u, kind='mergesort')
+ assert_array_equal(left, right)
+
+ for u, v in (a, b), (b, a):
+ idx = np.lexsort((u, v))
+ assert_array_equal(idx, np.lexsort((u.astype('O'), v)))
+ assert_array_equal(idx, np.lexsort((u, v.astype('O'))))
+ u, v = np.array(u, dtype='object'), np.array(v, dtype='object')
+ assert_array_equal(idx, np.lexsort((u, v)))
+
+ def test_invalid_axis(self): # gh-7528
+ x = np.linspace(0., 1., 42*3).reshape(42, 3)
+ assert_raises(np.AxisError, np.lexsort, x, axis=2)
+
+class TestIO(object):
+ """Test tofile, fromfile, tobytes, and fromstring"""
+
+ def setup(self):
+ shape = (2, 4, 3)
+ rand = np.random.random
+ self.x = rand(shape) + rand(shape).astype(complex)*1j
+ self.x[0,:, 1] = [np.nan, np.inf, -np.inf, np.nan]
+ self.dtype = self.x.dtype
+ self.tempdir = tempfile.mkdtemp()
+ self.filename = tempfile.mktemp(dir=self.tempdir)
+
+ def teardown(self):
+ shutil.rmtree(self.tempdir)
+
+ def test_nofile(self):
+ # this should probably be supported as a file
+ # but for now test for proper errors
+ b = io.BytesIO()
+ assert_raises(IOError, np.fromfile, b, np.uint8, 80)
+ d = np.ones(7)
+ assert_raises(IOError, lambda x: x.tofile(b), d)
+
+ def test_bool_fromstring(self):
+ v = np.array([True, False, True, False], dtype=np.bool_)
+ y = np.fromstring('1 0 -2.3 0.0', sep=' ', dtype=np.bool_)
+ assert_array_equal(v, y)
+
+ def test_uint64_fromstring(self):
+ d = np.fromstring("9923372036854775807 104783749223640",
+ dtype=np.uint64, sep=' ')
+ e = np.array([9923372036854775807, 104783749223640], dtype=np.uint64)
+ assert_array_equal(d, e)
+
+ def test_int64_fromstring(self):
+ d = np.fromstring("-25041670086757 104783749223640",
+ dtype=np.int64, sep=' ')
+ e = np.array([-25041670086757, 104783749223640], dtype=np.int64)
+ assert_array_equal(d, e)
+
+ def test_empty_files_binary(self):
+ f = open(self.filename, 'w')
+ f.close()
+ y = np.fromfile(self.filename)
+ assert_(y.size == 0, "Array not empty")
+
+ def test_empty_files_text(self):
+ f = open(self.filename, 'w')
+ f.close()
+ y = np.fromfile(self.filename, sep=" ")
+ assert_(y.size == 0, "Array not empty")
+
+ def test_roundtrip_file(self):
+ f = open(self.filename, 'wb')
+ self.x.tofile(f)
+ f.close()
+ # NB. doesn't work with flush+seek, due to use of C stdio
+ f = open(self.filename, 'rb')
+ y = np.fromfile(f, dtype=self.dtype)
+ f.close()
+ assert_array_equal(y, self.x.flat)
+
+ def test_roundtrip_filename(self):
+ self.x.tofile(self.filename)
+ y = np.fromfile(self.filename, dtype=self.dtype)
+ assert_array_equal(y, self.x.flat)
+
+ def test_roundtrip_binary_str(self):
+ s = self.x.tobytes()
+ y = np.frombuffer(s, dtype=self.dtype)
+ assert_array_equal(y, self.x.flat)
+
+ s = self.x.tobytes('F')
+ y = np.frombuffer(s, dtype=self.dtype)
+ assert_array_equal(y, self.x.flatten('F'))
+
+ def test_roundtrip_str(self):
+ x = self.x.real.ravel()
+ s = "@".join(map(str, x))
+ y = np.fromstring(s, sep="@")
+        # NB. str gives less precision
+ nan_mask = ~np.isfinite(x)
+ assert_array_equal(x[nan_mask], y[nan_mask])
+ assert_array_almost_equal(x[~nan_mask], y[~nan_mask], decimal=5)
+
+ def test_roundtrip_repr(self):
+ x = self.x.real.ravel()
+ s = "@".join(map(repr, x))
+ y = np.fromstring(s, sep="@")
+ assert_array_equal(x, y)
+
+ def test_unseekable_fromfile(self):
+ # gh-6246
+ self.x.tofile(self.filename)
+
+ def fail(*args, **kwargs):
+ raise IOError('Can not tell or seek')
+
+ with io.open(self.filename, 'rb', buffering=0) as f:
+ f.seek = fail
+ f.tell = fail
+ assert_raises(IOError, np.fromfile, f, dtype=self.dtype)
+
+ def test_io_open_unbuffered_fromfile(self):
+ # gh-6632
+ self.x.tofile(self.filename)
+ with io.open(self.filename, 'rb', buffering=0) as f:
+ y = np.fromfile(f, dtype=self.dtype)
+ assert_array_equal(y, self.x.flat)
+
+ def test_largish_file(self):
+ # check the fallocate path on files > 16MB
+ d = np.zeros(4 * 1024 ** 2)
+ d.tofile(self.filename)
+ assert_equal(os.path.getsize(self.filename), d.nbytes)
+ assert_array_equal(d, np.fromfile(self.filename))
+ # check offset
+ with open(self.filename, "r+b") as f:
+ f.seek(d.nbytes)
+ d.tofile(f)
+ assert_equal(os.path.getsize(self.filename), d.nbytes * 2)
+ # check append mode (gh-8329)
+ open(self.filename, "w").close() # delete file contents
+ with open(self.filename, "ab") as f:
+ d.tofile(f)
+ assert_array_equal(d, np.fromfile(self.filename))
+ with open(self.filename, "ab") as f:
+ d.tofile(f)
+ assert_equal(os.path.getsize(self.filename), d.nbytes * 2)
+
+ def test_io_open_buffered_fromfile(self):
+ # gh-6632
+ self.x.tofile(self.filename)
+ with io.open(self.filename, 'rb', buffering=-1) as f:
+ y = np.fromfile(f, dtype=self.dtype)
+ assert_array_equal(y, self.x.flat)
+
+ def test_file_position_after_fromfile(self):
+ # gh-4118
+ sizes = [io.DEFAULT_BUFFER_SIZE//8,
+ io.DEFAULT_BUFFER_SIZE,
+ io.DEFAULT_BUFFER_SIZE*8]
+
+ for size in sizes:
+ f = open(self.filename, 'wb')
+ f.seek(size-1)
+ f.write(b'\0')
+ f.close()
+
+ for mode in ['rb', 'r+b']:
+ err_msg = "%d %s" % (size, mode)
+
+ f = open(self.filename, mode)
+ f.read(2)
+ np.fromfile(f, dtype=np.float64, count=1)
+ pos = f.tell()
+ f.close()
+ assert_equal(pos, 10, err_msg=err_msg)
+
+ def test_file_position_after_tofile(self):
+ # gh-4118
+ sizes = [io.DEFAULT_BUFFER_SIZE//8,
+ io.DEFAULT_BUFFER_SIZE,
+ io.DEFAULT_BUFFER_SIZE*8]
+
+ for size in sizes:
+ err_msg = "%d" % (size,)
+
+ f = open(self.filename, 'wb')
+ f.seek(size-1)
+ f.write(b'\0')
+ f.seek(10)
+ f.write(b'12')
+ np.array([0], dtype=np.float64).tofile(f)
+ pos = f.tell()
+ f.close()
+ assert_equal(pos, 10 + 2 + 8, err_msg=err_msg)
+
+ f = open(self.filename, 'r+b')
+ f.read(2)
+ f.seek(0, 1) # seek between read&write required by ANSI C
+ np.array([0], dtype=np.float64).tofile(f)
+ pos = f.tell()
+ f.close()
+ assert_equal(pos, 10, err_msg=err_msg)
+
+ def test_load_object_array_fromfile(self):
+ # gh-12300
+ with open(self.filename, 'w') as f:
+ # Ensure we have a file with consistent contents
+ pass
+
+ with open(self.filename, 'rb') as f:
+ assert_raises_regex(ValueError, "Cannot read into object array",
+ np.fromfile, f, dtype=object)
+
+ assert_raises_regex(ValueError, "Cannot read into object array",
+ np.fromfile, self.filename, dtype=object)
+
+ def _check_from(self, s, value, **kw):
+ if 'sep' not in kw:
+ y = np.frombuffer(s, **kw)
+ else:
+ y = np.fromstring(s, **kw)
+ assert_array_equal(y, value)
+
+ f = open(self.filename, 'wb')
+ f.write(s)
+ f.close()
+ y = np.fromfile(self.filename, **kw)
+ assert_array_equal(y, value)
+
+ def test_nan(self):
+ self._check_from(
+ b"nan +nan -nan NaN nan(foo) +NaN(BAR) -NAN(q_u_u_x_)",
+ [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+ sep=' ')
+
+ def test_inf(self):
+ self._check_from(
+ b"inf +inf -inf infinity -Infinity iNfInItY -inF",
+ [np.inf, np.inf, -np.inf, np.inf, -np.inf, np.inf, -np.inf],
+ sep=' ')
+
+ def test_numbers(self):
+ self._check_from(b"1.234 -1.234 .3 .3e55 -123133.1231e+133",
+ [1.234, -1.234, .3, .3e55, -123133.1231e+133], sep=' ')
+
+ def test_binary(self):
+ self._check_from(b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x80@',
+ np.array([1, 2, 3, 4]),
+ dtype='<f4')
+
+ @pytest.mark.slow # takes > 1 minute on mechanical hard drive
+ def test_big_binary(self):
+ """Test workarounds for 32-bit limited fwrite, fseek, and ftell
+ calls in windows. These normally would hang doing something like this.
+ See http://projects.scipy.org/numpy/ticket/1660"""
+ if sys.platform != 'win32':
+ return
+ try:
+ # before workarounds, only up to 2**32-1 worked
+ fourgbplus = 2**32 + 2**16
+ testbytes = np.arange(8, dtype=np.int8)
+ n = len(testbytes)
+ flike = tempfile.NamedTemporaryFile()
+ f = flike.file
+ np.tile(testbytes, fourgbplus // testbytes.nbytes).tofile(f)
+ flike.seek(0)
+ a = np.fromfile(f, dtype=np.int8)
+ flike.close()
+ assert_(len(a) == fourgbplus)
+ # check only start and end for speed:
+ assert_((a[:n] == testbytes).all())
+ assert_((a[-n:] == testbytes).all())
+ except (MemoryError, ValueError):
+ pass
+
+ def test_string(self):
+ self._check_from(b'1,2,3,4', [1., 2., 3., 4.], sep=',')
+
+ def test_counted_string(self):
+ self._check_from(b'1,2,3,4', [1., 2., 3., 4.], count=4, sep=',')
+ self._check_from(b'1,2,3,4', [1., 2., 3.], count=3, sep=',')
+ self._check_from(b'1,2,3,4', [1., 2., 3., 4.], count=-1, sep=',')
+
+ def test_string_with_ws(self):
+ self._check_from(b'1 2 3 4 ', [1, 2, 3, 4], dtype=int, sep=' ')
+
+ def test_counted_string_with_ws(self):
+ self._check_from(b'1 2 3 4 ', [1, 2, 3], count=3, dtype=int,
+ sep=' ')
+
+ def test_ascii(self):
+ self._check_from(b'1 , 2 , 3 , 4', [1., 2., 3., 4.], sep=',')
+ self._check_from(b'1,2,3,4', [1., 2., 3., 4.], dtype=float, sep=',')
+
+ def test_malformed(self):
+ self._check_from(b'1.234 1,234', [1.234, 1.], sep=' ')
+
+ def test_long_sep(self):
+ self._check_from(b'1_x_3_x_4_x_5', [1, 3, 4, 5], sep='_x_')
+
+ def test_dtype(self):
+ v = np.array([1, 2, 3, 4], dtype=np.int_)
+ self._check_from(b'1,2,3,4', v, sep=',', dtype=np.int_)
+
+ def test_dtype_bool(self):
+ # can't use _check_from because fromstring can't handle True/False
+ v = np.array([True, False, True, False], dtype=np.bool_)
+ s = b'1,0,-2.3,0'
+ f = open(self.filename, 'wb')
+ f.write(s)
+ f.close()
+ y = np.fromfile(self.filename, sep=',', dtype=np.bool_)
+ assert_(y.dtype == '?')
+ assert_array_equal(y, v)
+
+ def test_tofile_sep(self):
+ x = np.array([1.51, 2, 3.51, 4], dtype=float)
+ f = open(self.filename, 'w')
+ x.tofile(f, sep=',')
+ f.close()
+ f = open(self.filename, 'r')
+ s = f.read()
+ f.close()
+ #assert_equal(s, '1.51,2.0,3.51,4.0')
+ y = np.array([float(p) for p in s.split(',')])
+ assert_array_equal(x,y)
+
+ def test_tofile_format(self):
+ x = np.array([1.51, 2, 3.51, 4], dtype=float)
+ f = open(self.filename, 'w')
+ x.tofile(f, sep=',', format='%.2f')
+ f.close()
+ f = open(self.filename, 'r')
+ s = f.read()
+ f.close()
+ assert_equal(s, '1.51,2.00,3.51,4.00')
+
+ def test_locale(self):
+ with CommaDecimalPointLocale():
+ self.test_numbers()
+ self.test_nan()
+ self.test_inf()
+ self.test_counted_string()
+ self.test_ascii()
+ self.test_malformed()
+ self.test_tofile_sep()
+ self.test_tofile_format()
+
+
+class TestFromBuffer(object):
+ @pytest.mark.parametrize('byteorder', ['<', '>'])
+ @pytest.mark.parametrize('dtype', [float, int, complex])
+ def test_basic(self, byteorder, dtype):
+ dt = np.dtype(dtype).newbyteorder(byteorder)
+ x = (np.random.random((4, 7)) * 5).astype(dt)
+ buf = x.tobytes()
+ assert_array_equal(np.frombuffer(buf, dtype=dt), x.flat)
+
+ def test_empty(self):
+ assert_array_equal(np.frombuffer(b''), np.array([]))
+
+
+class TestFlat(object):
+ def setup(self):
+ a0 = np.arange(20.0)
+ a = a0.reshape(4, 5)
+ a0.shape = (4, 5)
+ a.flags.writeable = False
+ self.a = a
+ self.b = a[::2, ::2]
+ self.a0 = a0
+ self.b0 = a0[::2, ::2]
+
+ def test_contiguous(self):
+ testpassed = False
+ try:
+ self.a.flat[12] = 100.0
+ except ValueError:
+ testpassed = True
+ assert_(testpassed)
+ assert_(self.a.flat[12] == 12.0)
+
+ def test_discontiguous(self):
+ testpassed = False
+ try:
+ self.b.flat[4] = 100.0
+ except ValueError:
+ testpassed = True
+ assert_(testpassed)
+ assert_(self.b.flat[4] == 12.0)
+
+ def test___array__(self):
+ c = self.a.flat.__array__()
+ d = self.b.flat.__array__()
+ e = self.a0.flat.__array__()
+ f = self.b0.flat.__array__()
+
+ assert_(c.flags.writeable is False)
+ assert_(d.flags.writeable is False)
+ # for 1.14 all are set to non-writeable on the way to replacing the
+ # UPDATEIFCOPY array returned for non-contiguous arrays.
+ assert_(e.flags.writeable is True)
+ assert_(f.flags.writeable is False)
+ with assert_warns(DeprecationWarning):
+ assert_(c.flags.updateifcopy is False)
+ with assert_warns(DeprecationWarning):
+ assert_(d.flags.updateifcopy is False)
+ with assert_warns(DeprecationWarning):
+ assert_(e.flags.updateifcopy is False)
+ with assert_warns(DeprecationWarning):
+ # UPDATEIFCOPY is removed.
+ assert_(f.flags.updateifcopy is False)
+ assert_(c.flags.writebackifcopy is False)
+ assert_(d.flags.writebackifcopy is False)
+ assert_(e.flags.writebackifcopy is False)
+ assert_(f.flags.writebackifcopy is False)
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_refcount(self):
+ # includes regression test for reference count error gh-13165
+ inds = [np.intp(0), np.array([True]*self.a.size), np.array([0]), None]
+ indtype = np.dtype(np.intp)
+ rc_indtype = sys.getrefcount(indtype)
+ for ind in inds:
+ rc_ind = sys.getrefcount(ind)
+ for _ in range(100):
+ try:
+ self.a.flat[ind]
+ except IndexError:
+ pass
+ assert_(abs(sys.getrefcount(ind) - rc_ind) < 50)
+ assert_(abs(sys.getrefcount(indtype) - rc_indtype) < 50)
+
+
+class TestResize(object):
+ def test_basic(self):
+ x = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+ if IS_PYPY:
+ x.resize((5, 5), refcheck=False)
+ else:
+ x.resize((5, 5))
+ assert_array_equal(x.flat[:9],
+ np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]).flat)
+ assert_array_equal(x[9:].flat, 0)
+
+ def test_check_reference(self):
+ x = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+ y = x
+ assert_raises(ValueError, x.resize, (5, 1))
+ del y # avoid pyflakes unused variable warning.
+
+ def test_int_shape(self):
+ x = np.eye(3)
+ if IS_PYPY:
+ x.resize(3, refcheck=False)
+ else:
+ x.resize(3)
+ assert_array_equal(x, np.eye(3)[0,:])
+
+ def test_none_shape(self):
+ x = np.eye(3)
+ x.resize(None)
+ assert_array_equal(x, np.eye(3))
+ x.resize()
+ assert_array_equal(x, np.eye(3))
+
+ def test_0d_shape(self):
+ # do it multiple times to check it does not break the alloc cache (gh-9216)
+ for i in range(10):
+ x = np.empty((1,))
+ x.resize(())
+ assert_equal(x.shape, ())
+ assert_equal(x.size, 1)
+ x = np.empty(())
+ x.resize((1,))
+ assert_equal(x.shape, (1,))
+ assert_equal(x.size, 1)
+
+ def test_invalid_arguments(self):
+ assert_raises(TypeError, np.eye(3).resize, 'hi')
+ assert_raises(ValueError, np.eye(3).resize, -1)
+ assert_raises(TypeError, np.eye(3).resize, order=1)
+ assert_raises(TypeError, np.eye(3).resize, refcheck='hi')
+
+ def test_freeform_shape(self):
+ x = np.eye(3)
+ if IS_PYPY:
+ x.resize(3, 2, 1, refcheck=False)
+ else:
+ x.resize(3, 2, 1)
+ assert_(x.shape == (3, 2, 1))
+
+ def test_zeros_appended(self):
+ x = np.eye(3)
+ if IS_PYPY:
+ x.resize(2, 3, 3, refcheck=False)
+ else:
+ x.resize(2, 3, 3)
+ assert_array_equal(x[0], np.eye(3))
+ assert_array_equal(x[1], np.zeros((3, 3)))
+
+ def test_obj_obj(self):
+ # check memory is initialized on resize, gh-4857
+ a = np.ones(10, dtype=[('k', object, 2)])
+ if IS_PYPY:
+ a.resize(15, refcheck=False)
+ else:
+ a.resize(15,)
+ assert_equal(a.shape, (15,))
+ assert_array_equal(a['k'][-5:], 0)
+ assert_array_equal(a['k'][:-5], 1)
+
+ def test_empty_view(self):
+ # check that sizes containing a zero don't trigger a reallocation for
+ # already empty arrays
+ x = np.zeros((10, 0), int)
+ x_view = x[...]
+ x_view.resize((0, 10))
+ x_view.resize((0, 100))
+
+ def test_check_weakref(self):
+ x = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+ xref = weakref.ref(x)
+ assert_raises(ValueError, x.resize, (5, 1))
+ del xref # avoid pyflakes unused variable warning.
+
+
+class TestRecord(object):
+ def test_field_rename(self):
+ dt = np.dtype([('f', float), ('i', int)])
+ dt.names = ['p', 'q']
+ assert_equal(dt.names, ['p', 'q'])
+
+ def test_multiple_field_name_occurrence(self):
+ def test_dtype_init():
+ np.dtype([("A", "f8"), ("B", "f8"), ("A", "f8")])
+
+ # Error raised when multiple fields have the same name
+ assert_raises(ValueError, test_dtype_init)
+
+ @pytest.mark.skipif(sys.version_info[0] < 3, reason="Not Python 3")
+ def test_bytes_fields(self):
+ # Bytes are not allowed in field names and not recognized in titles
+ # on Py3
+ assert_raises(TypeError, np.dtype, [(b'a', int)])
+ assert_raises(TypeError, np.dtype, [(('b', b'a'), int)])
+
+ dt = np.dtype([((b'a', 'b'), int)])
+ assert_raises(TypeError, dt.__getitem__, b'a')
+
+ x = np.array([(1,), (2,), (3,)], dtype=dt)
+ assert_raises(IndexError, x.__getitem__, b'a')
+
+ y = x[0]
+ assert_raises(IndexError, y.__getitem__, b'a')
+
+ @pytest.mark.skipif(sys.version_info[0] < 3, reason="Not Python 3")
+ def test_multiple_field_name_unicode(self):
+ def test_dtype_unicode():
+ np.dtype([("\u20B9", "f8"), ("B", "f8"), ("\u20B9", "f8")])
+
+ # Error raised when multiple fields have the same name (unicode included)
+ assert_raises(ValueError, test_dtype_unicode)
+
+ @pytest.mark.skipif(sys.version_info[0] >= 3, reason="Not Python 2")
+ def test_unicode_field_titles(self):
+ # Unicode field titles are added to field dict on Py2
+ title = u'b'
+ dt = np.dtype([((title, 'a'), int)])
+ dt[title]
+ dt['a']
+ x = np.array([(1,), (2,), (3,)], dtype=dt)
+ x[title]
+ x['a']
+ y = x[0]
+ y[title]
+ y['a']
+
+ @pytest.mark.skipif(sys.version_info[0] >= 3, reason="Not Python 2")
+ def test_unicode_field_names(self):
+ # Unicode field names are converted to ascii on Python 2:
+ encodable_name = u'b'
+ assert_equal(np.dtype([(encodable_name, int)]).names[0], b'b')
+ assert_equal(np.dtype([(('a', encodable_name), int)]).names[0], b'b')
+
+ # But raises UnicodeEncodeError if it can't be encoded:
+ nonencodable_name = u'\uc3bc'
+ assert_raises(UnicodeEncodeError, np.dtype, [(nonencodable_name, int)])
+ assert_raises(UnicodeEncodeError, np.dtype, [(('a', nonencodable_name), int)])
+
+ def test_fromarrays_unicode(self):
+ # A single name string provided to fromarrays() is allowed to be unicode
+ # on both Python 2 and 3:
+ x = np.core.records.fromarrays([[0], [1]], names=u'a,b', formats=u'i4,i4')
+ assert_equal(x['a'][0], 0)
+ assert_equal(x['b'][0], 1)
+
+ def test_unicode_order(self):
+ # Test that we can sort with order as a unicode field name in both Python 2 and
+ # 3:
+ name = u'b'
+ x = np.array([1, 3, 2], dtype=[(name, int)])
+ x.sort(order=name)
+ assert_equal(x[u'b'], np.array([1, 2, 3]))
+
+ def test_field_names(self):
+ # Test unicode and 8-bit / byte strings can be used
+ a = np.zeros((1,), dtype=[('f1', 'i4'),
+ ('f2', 'i4'),
+ ('f3', [('sf1', 'i4')])])
+ is_py3 = sys.version_info[0] >= 3
+ if is_py3:
+ funcs = (str,)
+ # byte string indexing fails gracefully
+ assert_raises(IndexError, a.__setitem__, b'f1', 1)
+ assert_raises(IndexError, a.__getitem__, b'f1')
+ assert_raises(IndexError, a['f1'].__setitem__, b'sf1', 1)
+ assert_raises(IndexError, a['f1'].__getitem__, b'sf1')
+ else:
+ funcs = (str, unicode)
+ for func in funcs:
+ b = a.copy()
+ fn1 = func('f1')
+ b[fn1] = 1
+ assert_equal(b[fn1], 1)
+ fnn = func('not at all')
+ assert_raises(ValueError, b.__setitem__, fnn, 1)
+ assert_raises(ValueError, b.__getitem__, fnn)
+ b[0][fn1] = 2
+ assert_equal(b[fn1], 2)
+ # Subfield
+ assert_raises(ValueError, b[0].__setitem__, fnn, 1)
+ assert_raises(ValueError, b[0].__getitem__, fnn)
+ # Subfield
+ fn3 = func('f3')
+ sfn1 = func('sf1')
+ b[fn3][sfn1] = 1
+ assert_equal(b[fn3][sfn1], 1)
+ assert_raises(ValueError, b[fn3].__setitem__, fnn, 1)
+ assert_raises(ValueError, b[fn3].__getitem__, fnn)
+ # multiple subfields
+ fn2 = func('f2')
+ b[fn2] = 3
+
+ assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
+ assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
+ assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+
+ # non-ascii unicode field indexing is well behaved
+ if not is_py3:
+ pytest.skip('non ascii unicode field indexing skipped; '
+ 'raises segfault on python 2.x')
+ else:
+ assert_raises(ValueError, a.__setitem__, u'\u03e0', 1)
+ assert_raises(ValueError, a.__getitem__, u'\u03e0')
+
+ def test_record_hash(self):
+ a = np.array([(1, 2), (1, 2)], dtype='i1,i2')
+ a.flags.writeable = False
+ b = np.array([(1, 2), (3, 4)], dtype=[('num1', 'i1'), ('num2', 'i2')])
+ b.flags.writeable = False
+ c = np.array([(1, 2), (3, 4)], dtype='i1,i2')
+ c.flags.writeable = False
+ assert_(hash(a[0]) == hash(a[1]))
+ assert_(hash(a[0]) == hash(b[0]))
+ assert_(hash(a[0]) != hash(b[1]))
+ assert_(hash(c[0]) == hash(a[0]) and c[0] == a[0])
+
+ def test_record_no_hash(self):
+ a = np.array([(1, 2), (1, 2)], dtype='i1,i2')
+ assert_raises(TypeError, hash, a[0])
+
+ def test_empty_structure_creation(self):
+ # make sure these do not raise errors (gh-5631)
+ np.array([()], dtype={'names': [], 'formats': [],
+ 'offsets': [], 'itemsize': 12})
+ np.array([(), (), (), (), ()], dtype={'names': [], 'formats': [],
+ 'offsets': [], 'itemsize': 12})
+
+ def test_multifield_indexing_view(self):
+ a = np.ones(3, dtype=[('a', 'i4'), ('b', 'f4'), ('c', 'u4')])
+ v = a[['a', 'c']]
+ assert_(v.base is a)
+ assert_(v.dtype == np.dtype({'names': ['a', 'c'],
+ 'formats': ['i4', 'u4'],
+ 'offsets': [0, 8]}))
+ v[:] = (4,5)
+ assert_equal(a[0].item(), (4, 1, 5))
+
+class TestView(object):
+ def test_basic(self):
+ x = np.array([(1, 2, 3, 4), (5, 6, 7, 8)],
+ dtype=[('r', np.int8), ('g', np.int8),
+ ('b', np.int8), ('a', np.int8)])
+ # We must be specific about the endianness here:
+ y = x.view(dtype='<i4')
+ # ... and again without the keyword.
+ z = x.view('<i4')
+ assert_array_equal(y, z)
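+ # the int8 fields (1, 2, 3, 4) and (5, 6, 7, 8) pack into the
+ # little-endian int32 values 0x04030201 == 67305985 and
+ # 0x08070605 == 134678021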
+ assert_array_equal(y, [67305985, 134678021])
+
+
+def _mean(a, **args):
+ return a.mean(**args)
+
+
+def _var(a, **args):
+ return a.var(**args)
+
+
+def _std(a, **args):
+ return a.std(**args)
+
+
+class TestStats(object):
+
+ funcs = [_mean, _var, _std]
+
+ def setup(self):
+ np.random.seed(range(3))
+ self.rmat = np.random.random((4, 5))
+ self.cmat = self.rmat + 1j * self.rmat
+ self.omat = np.array([Decimal(repr(r)) for r in self.rmat.flat])
+ self.omat = self.omat.reshape(4, 5)
+
+ def test_python_type(self):
+ for x in (np.float16(1.), 1, 1., 1+0j):
+ assert_equal(np.mean([x]), 1.)
+ assert_equal(np.std([x]), 0.)
+ assert_equal(np.var([x]), 0.)
+
+ def test_keepdims(self):
+ mat = np.eye(3)
+ for f in self.funcs:
+ for axis in [0, 1]:
+ res = f(mat, axis=axis, keepdims=True)
+ assert_(res.ndim == mat.ndim)
+ assert_(res.shape[axis] == 1)
+ for axis in [None]:
+ res = f(mat, axis=axis, keepdims=True)
+ assert_(res.shape == (1, 1))
+
+ def test_out(self):
+ mat = np.eye(3)
+ for f in self.funcs:
+ out = np.zeros(3)
+ tgt = f(mat, axis=1)
+ res = f(mat, axis=1, out=out)
+ assert_almost_equal(res, out)
+ assert_almost_equal(res, tgt)
+ out = np.empty(2)
+ assert_raises(ValueError, f, mat, axis=1, out=out)
+ out = np.empty((2, 2))
+ assert_raises(ValueError, f, mat, axis=1, out=out)
+
+ def test_dtype_from_input(self):
+
+ icodes = np.typecodes['AllInteger']
+ fcodes = np.typecodes['AllFloat']
+
+ # object type
+ for f in self.funcs:
+ mat = np.array([[Decimal(1)]*3]*3)
+ tgt = mat.dtype.type
+ res = f(mat, axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ res = type(f(mat, axis=None))
+ assert_(res is Decimal)
+
+ # integer types
+ for f in self.funcs:
+ for c in icodes:
+ mat = np.eye(3, dtype=c)
+ tgt = np.float64
+ res = f(mat, axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ res = f(mat, axis=None).dtype.type
+ assert_(res is tgt)
+
+ # mean for float types
+ for f in [_mean]:
+ for c in fcodes:
+ mat = np.eye(3, dtype=c)
+ tgt = mat.dtype.type
+ res = f(mat, axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ res = f(mat, axis=None).dtype.type
+ assert_(res is tgt)
+
+ # var, std for float types
+ for f in [_var, _std]:
+ for c in fcodes:
+ mat = np.eye(3, dtype=c)
+ # deal with complex types
+ tgt = mat.real.dtype.type
+ res = f(mat, axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ res = f(mat, axis=None).dtype.type
+ assert_(res is tgt)
+
+ def test_dtype_from_dtype(self):
+ mat = np.eye(3)
+
+ # stats for integer types
+ # FIXME:
+ # this needs definition as there are lots of places along the line
+ # where type casting may take place.
+
+ # for f in self.funcs:
+ # for c in np.typecodes['AllInteger']:
+ # tgt = np.dtype(c).type
+ # res = f(mat, axis=1, dtype=c).dtype.type
+ # assert_(res is tgt)
+ # # scalar case
+ # res = f(mat, axis=None, dtype=c).dtype.type
+ # assert_(res is tgt)
+
+ # stats for float types
+ for f in self.funcs:
+ for c in np.typecodes['AllFloat']:
+ tgt = np.dtype(c).type
+ res = f(mat, axis=1, dtype=c).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ res = f(mat, axis=None, dtype=c).dtype.type
+ assert_(res is tgt)
+
+ def test_ddof(self):
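+ # var with ddof divides by (n - ddof), so the ddof result scaled by
+ # (n - ddof) should match the ddof=0 result scaled by n (and the same
+ # relation holds with square roots for std).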
+ for f in [_var]:
+ for ddof in range(3):
+ dim = self.rmat.shape[1]
+ tgt = f(self.rmat, axis=1) * dim
+ res = f(self.rmat, axis=1, ddof=ddof) * (dim - ddof)
+ assert_almost_equal(res, tgt)
+ for f in [_std]:
+ for ddof in range(3):
+ dim = self.rmat.shape[1]
+ tgt = f(self.rmat, axis=1) * np.sqrt(dim)
+ res = f(self.rmat, axis=1, ddof=ddof) * np.sqrt(dim - ddof)
+ assert_almost_equal(res, tgt)
+
+ def test_ddof_too_big(self):
+ dim = self.rmat.shape[1]
+ for f in [_var, _std]:
+ for ddof in range(dim, dim + 2):
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(self.rmat, axis=1, ddof=ddof)
+ assert_(not (res < 0).any())
+ assert_(len(w) > 0)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+ def test_empty(self):
+ A = np.zeros((0, 3))
+ for f in self.funcs:
+ for axis in [0, None]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(f(A, axis=axis)).all())
+ assert_(len(w) > 0)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ for axis in [1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_equal(f(A, axis=axis), np.zeros([]))
+
+ def test_mean_values(self):
+ for mat in [self.rmat, self.cmat, self.omat]:
+ for axis in [0, 1]:
+ tgt = mat.sum(axis=axis)
+ res = _mean(mat, axis=axis) * mat.shape[axis]
+ assert_almost_equal(res, tgt)
+ for axis in [None]:
+ tgt = mat.sum(axis=axis)
+ res = _mean(mat, axis=axis) * np.prod(mat.shape)
+ assert_almost_equal(res, tgt)
+
+ def test_mean_float16(self):
+ # This fails if the sum inside mean is done in float16 instead
+ # of float32.
+ assert_(_mean(np.ones(100000, dtype='float16')) == 1)
+
+ def test_var_values(self):
+ for mat in [self.rmat, self.cmat, self.omat]:
+ for axis in [0, 1, None]:
+ msqr = _mean(mat * mat.conj(), axis=axis)
+ mean = _mean(mat, axis=axis)
+ tgt = msqr - mean * mean.conjugate()
+ res = _var(mat, axis=axis)
+ assert_almost_equal(res, tgt)
+
+ def test_std_values(self):
+ for mat in [self.rmat, self.cmat, self.omat]:
+ for axis in [0, 1, None]:
+ tgt = np.sqrt(_var(mat, axis=axis))
+ res = _std(mat, axis=axis)
+ assert_almost_equal(res, tgt)
+
+ def test_subclass(self):
+ class TestArray(np.ndarray):
+ def __new__(cls, data, info):
+ result = np.array(data)
+ result = result.view(cls)
+ result.info = info
+ return result
+
+ def __array_finalize__(self, obj):
+ self.info = getattr(obj, "info", '')
+
+ dat = TestArray([[1, 2, 3, 4], [5, 6, 7, 8]], 'jubba')
+ res = dat.mean(1)
+ assert_(res.info == dat.info)
+ res = dat.std(1)
+ assert_(res.info == dat.info)
+ res = dat.var(1)
+ assert_(res.info == dat.info)
+
+class TestVdot(object):
+ def test_basic(self):
+ dt_numeric = np.typecodes['AllFloat'] + np.typecodes['AllInteger']
+ dt_complex = np.typecodes['Complex']
+
+ # test real
+ a = np.eye(3)
+ for dt in dt_numeric + 'O':
+ b = a.astype(dt)
+ res = np.vdot(b, b)
+ assert_(np.isscalar(res))
+ assert_equal(np.vdot(b, b), 3)
+
+ # test complex
+ a = np.eye(3) * 1j
+ for dt in dt_complex + 'O':
+ b = a.astype(dt)
+ res = np.vdot(b, b)
+ assert_(np.isscalar(res))
+ assert_equal(np.vdot(b, b), 3)
+
+ # test boolean
+ b = np.eye(3, dtype=bool)
+ res = np.vdot(b, b)
+ assert_(np.isscalar(res))
+ assert_equal(np.vdot(b, b), True)
+
+ def test_vdot_array_order(self):
+ a = np.array([[1, 2], [3, 4]], order='C')
+ b = np.array([[1, 2], [3, 4]], order='F')
+ res = np.vdot(a, a)
+
+ # integer arrays are exact
+ assert_equal(np.vdot(a, b), res)
+ assert_equal(np.vdot(b, a), res)
+ assert_equal(np.vdot(b, b), res)
+
+ def test_vdot_uncontiguous(self):
+ for size in [2, 1000]:
+ # Different sizes match different branches in vdot.
+ a = np.zeros((size, 2, 2))
+ b = np.zeros((size, 2, 2))
+ a[:, 0, 0] = np.arange(size)
+ b[:, 0, 0] = np.arange(size) + 1
+ # Make a and b uncontiguous:
+ a = a[..., 0]
+ b = b[..., 0]
+
+ assert_equal(np.vdot(a, b),
+ np.vdot(a.flatten(), b.flatten()))
+ assert_equal(np.vdot(a, b.copy()),
+ np.vdot(a.flatten(), b.flatten()))
+ assert_equal(np.vdot(a.copy(), b),
+ np.vdot(a.flatten(), b.flatten()))
+ assert_equal(np.vdot(a.copy('F'), b),
+ np.vdot(a.flatten(), b.flatten()))
+ assert_equal(np.vdot(a, b.copy('F')),
+ np.vdot(a.flatten(), b.flatten()))
+
+
+class TestDot(object):
+ def setup(self):
+ np.random.seed(128)
+ self.A = np.random.rand(4, 2)
+ self.b1 = np.random.rand(2, 1)
+ self.b2 = np.random.rand(2)
+ self.b3 = np.random.rand(1, 2)
+ self.b4 = np.random.rand(4)
+ self.N = 7
+
+ def test_dotmatmat(self):
+ A = self.A
+ res = np.dot(A.transpose(), A)
+ tgt = np.array([[1.45046013, 0.86323640],
+ [0.86323640, 0.84934569]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotmatvec(self):
+ A, b1 = self.A, self.b1
+ res = np.dot(A, b1)
+ tgt = np.array([[0.32114320], [0.04889721],
+ [0.15696029], [0.33612621]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotmatvec2(self):
+ A, b2 = self.A, self.b2
+ res = np.dot(A, b2)
+ tgt = np.array([0.29677940, 0.04518649, 0.14468333, 0.31039293])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecmat(self):
+ A, b4 = self.A, self.b4
+ res = np.dot(b4, A)
+ tgt = np.array([1.23495091, 1.12222648])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecmat2(self):
+ b3, A = self.b3, self.A
+ res = np.dot(b3, A.transpose())
+ tgt = np.array([[0.58793804, 0.08957460, 0.30605758, 0.62716383]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecmat3(self):
+ A, b4 = self.A, self.b4
+ res = np.dot(A.transpose(), b4)
+ tgt = np.array([1.23495091, 1.12222648])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecvecouter(self):
+ b1, b3 = self.b1, self.b3
+ res = np.dot(b1, b3)
+ tgt = np.array([[0.20128610, 0.08400440], [0.07190947, 0.03001058]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecvecinner(self):
+ b1, b3 = self.b1, self.b3
+ res = np.dot(b3, b1)
+ tgt = np.array([[ 0.23129668]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotcolumnvect1(self):
+ b1 = np.ones((3, 1))
+ b2 = [5.3]
+ res = np.dot(b1, b2)
+ tgt = np.array([5.3, 5.3, 5.3])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotcolumnvect2(self):
+ b1 = np.ones((3, 1)).transpose()
+ b2 = [6.2]
+ res = np.dot(b2, b1)
+ tgt = np.array([6.2, 6.2, 6.2])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecscalar(self):
+ np.random.seed(100)
+ b1 = np.random.rand(1, 1)
+ b2 = np.random.rand(1, 4)
+ res = np.dot(b1, b2)
+ tgt = np.array([[0.15126730, 0.23068496, 0.45905553, 0.00256425]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_dotvecscalar2(self):
+ np.random.seed(100)
+ b1 = np.random.rand(4, 1)
+ b2 = np.random.rand(1, 1)
+ res = np.dot(b1, b2)
+ tgt = np.array([[0.00256425],[0.00131359],[0.00200324],[ 0.00398638]])
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_all(self):
+ dims = [(), (1,), (1, 1)]
+ dout = [(), (1,), (1, 1), (1,), (), (1,), (1, 1), (1,), (1, 1)]
+ for dim, (dim1, dim2) in zip(dout, itertools.product(dims, dims)):
+ b1 = np.zeros(dim1)
+ b2 = np.zeros(dim2)
+ res = np.dot(b1, b2)
+ tgt = np.zeros(dim)
+ assert_(res.shape == tgt.shape)
+ assert_almost_equal(res, tgt, decimal=self.N)
+
+ def test_vecobject(self):
+ class Vec(object):
+ def __init__(self, sequence=None):
+ if sequence is None:
+ sequence = []
+ self.array = np.array(sequence)
+
+ def __add__(self, other):
+ out = Vec()
+ out.array = self.array + other.array
+ return out
+
+ def __sub__(self, other):
+ out = Vec()
+ out.array = self.array - other.array
+ return out
+
+ def __mul__(self, other): # with scalar
+ out = Vec(self.array.copy())
+ out.array *= other
+ return out
+
+ def __rmul__(self, other):
+ return self*other
+
+ U_non_cont = np.transpose([[1., 1.], [1., 2.]])
+ U_cont = np.ascontiguousarray(U_non_cont)
+ x = np.array([Vec([1., 0.]), Vec([0., 1.])])
+ zeros = np.array([Vec([0., 0.]), Vec([0., 0.])])
+ zeros_test = np.dot(U_cont, x) - np.dot(U_non_cont, x)
+ assert_equal(zeros[0].array, zeros_test[0].array)
+ assert_equal(zeros[1].array, zeros_test[1].array)
+
+ def test_dot_2args(self):
+ from numpy.core.multiarray import dot
+
+ a = np.array([[1, 2], [3, 4]], dtype=float)
+ b = np.array([[1, 0], [1, 1]], dtype=float)
+ c = np.array([[3, 2], [7, 4]], dtype=float)
+
+ d = dot(a, b)
+ assert_allclose(c, d)
+
+ def test_dot_3args(self):
+ from numpy.core.multiarray import dot
+
+ np.random.seed(22)
+ f = np.random.random_sample((1024, 16))
+ v = np.random.random_sample((16, 32))
+
+ r = np.empty((1024, 32))
+ for i in range(12):
+ dot(f, v, r)
+ if HAS_REFCOUNT:
+ assert_equal(sys.getrefcount(r), 2)
+ r2 = dot(f, v, out=None)
+ assert_array_equal(r2, r)
+ assert_(r is dot(f, v, out=r))
+
+ v = v[:, 0].copy() # v.shape == (16,)
+ r = r[:, 0].copy() # r.shape == (1024,)
+ r2 = dot(f, v)
+ assert_(r is dot(f, v, r))
+ assert_array_equal(r2, r)
+
+ def test_dot_3args_errors(self):
+ from numpy.core.multiarray import dot
+
+ np.random.seed(22)
+ f = np.random.random_sample((1024, 16))
+ v = np.random.random_sample((16, 32))
+
+ r = np.empty((1024, 31))
+ assert_raises(ValueError, dot, f, v, r)
+
+ r = np.empty((1024,))
+ assert_raises(ValueError, dot, f, v, r)
+
+ r = np.empty((32,))
+ assert_raises(ValueError, dot, f, v, r)
+
+ r = np.empty((32, 1024))
+ assert_raises(ValueError, dot, f, v, r)
+ assert_raises(ValueError, dot, f, v, r.T)
+
+ r = np.empty((1024, 64))
+ assert_raises(ValueError, dot, f, v, r[:, ::2])
+ assert_raises(ValueError, dot, f, v, r[:, :32])
+
+ r = np.empty((1024, 32), dtype=np.float32)
+ assert_raises(ValueError, dot, f, v, r)
+
+ r = np.empty((1024, 32), dtype=int)
+ assert_raises(ValueError, dot, f, v, r)
+
+ def test_dot_array_order(self):
+ a = np.array([[1, 2], [3, 4]], order='C')
+ b = np.array([[1, 2], [3, 4]], order='F')
+ res = np.dot(a, a)
+
+ # integer arrays are exact
+ assert_equal(np.dot(a, b), res)
+ assert_equal(np.dot(b, a), res)
+ assert_equal(np.dot(b, b), res)
+
+ def test_accelerate_framework_sgemv_fix(self):
+
+ def aligned_array(shape, align, dtype, order='C'):
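+ # Over-allocate a byte buffer, locate the offset at which the data
+ # pointer meets the requested alignment, then view that slice as the
+ # target dtype.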
+ d = dtype(0)
+ N = np.prod(shape)
+ tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+ address = tmp.__array_interface__["data"][0]
+ for offset in range(align):
+ if (address + offset) % align == 0:
+ break
+ tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+ return tmp.reshape(shape, order=order)
+
+ def as_aligned(arr, align, dtype, order='C'):
+ aligned = aligned_array(arr.shape, align, dtype, order)
+ aligned[:] = arr[:]
+ return aligned
+
+ def assert_dot_close(A, X, desired):
+ assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)
+
+ m = aligned_array(100, 15, np.float32)
+ s = aligned_array((100, 100), 15, np.float32)
+ np.dot(s, m) # this will always segfault if the bug is present
+
+ testdata = itertools.product((15,32), (10000,), (200,89), ('C','F'))
+ for align, m, n, a_order in testdata:
+ # Calculation in double precision
+ A_d = np.random.rand(m, n)
+ X_d = np.random.rand(n)
+ desired = np.dot(A_d, X_d)
+ # Calculation with aligned single precision
+ A_f = as_aligned(A_d, align, np.float32, order=a_order)
+ X_f = as_aligned(X_d, align, np.float32)
+ assert_dot_close(A_f, X_f, desired)
+ # Strided A rows
+ A_d_2 = A_d[::2]
+ desired = np.dot(A_d_2, X_d)
+ A_f_2 = A_f[::2]
+ assert_dot_close(A_f_2, X_f, desired)
+ # Strided A columns, strided X vector
+ A_d_22 = A_d_2[:, ::2]
+ X_d_2 = X_d[::2]
+ desired = np.dot(A_d_22, X_d_2)
+ A_f_22 = A_f_2[:, ::2]
+ X_f_2 = X_f[::2]
+ assert_dot_close(A_f_22, X_f_2, desired)
+ # Check the strides are as expected
+ if a_order == 'F':
+ assert_equal(A_f_22.strides, (8, 8 * m))
+ else:
+ assert_equal(A_f_22.strides, (8 * n, 8))
+ assert_equal(X_f_2.strides, (8,))
+ # Strides in A rows + cols only
+ X_f_2c = as_aligned(X_f_2, align, np.float32)
+ assert_dot_close(A_f_22, X_f_2c, desired)
+ # Strides just in A cols
+ A_d_12 = A_d[:, ::2]
+ desired = np.dot(A_d_12, X_d_2)
+ A_f_12 = A_f[:, ::2]
+ assert_dot_close(A_f_12, X_f_2c, desired)
+ # Strides in A cols and X
+ assert_dot_close(A_f_12, X_f_2, desired)
+
+
+class MatmulCommon(object):
+ """Common tests for '@' operator and numpy.matmul.
+
+ """
+ # Should work with these types. Will want to add
+ # "O" at some point
+ types = "?bhilqBHILQefdgFDG"
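+ # i.e. bool, the signed and unsigned integer codes, the float codes
+ # e/f/d/g (half through long double) and the complex codes F/D/G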
+
+ def test_exceptions(self):
+ dims = [
+ ((1,), (2,)), # mismatched vector vector
+ ((2, 1,), (2,)), # mismatched matrix vector
+ ((2,), (1, 2)), # mismatched vector matrix
+ ((1, 2), (3, 1)), # mismatched matrix matrix
+ ((1,), ()), # vector scalar
+ ((), (1)), # scalar vector
+ ((1, 1), ()), # matrix scalar
+ ((), (1, 1)), # scalar matrix
+ ((2, 2, 1), (3, 1, 2)), # cannot broadcast
+ ]
+
+ for dt, (dm1, dm2) in itertools.product(self.types, dims):
+ a = np.ones(dm1, dtype=dt)
+ b = np.ones(dm2, dtype=dt)
+ assert_raises(ValueError, self.matmul, a, b)
+
+ def test_shapes(self):
+ dims = [
+ ((1, 1), (2, 1, 1)), # broadcast first argument
+ ((2, 1, 1), (1, 1)), # broadcast second argument
+ ((2, 1, 1), (2, 1, 1)), # matrix stack sizes match
+ ]
+
+ for dt, (dm1, dm2) in itertools.product(self.types, dims):
+ a = np.ones(dm1, dtype=dt)
+ b = np.ones(dm2, dtype=dt)
+ res = self.matmul(a, b)
+ assert_(res.shape == (2, 1, 1))
+
+ # vector vector returns scalars.
+ for dt in self.types:
+ a = np.ones((2,), dtype=dt)
+ b = np.ones((2,), dtype=dt)
+ c = self.matmul(a, b)
+ assert_(np.array(c).shape == ())
+
+ def test_result_types(self):
+ mat = np.ones((1,1))
+ vec = np.ones((1,))
+ for dt in self.types:
+ m = mat.astype(dt)
+ v = vec.astype(dt)
+ for arg in [(m, v), (v, m), (m, m)]:
+ res = self.matmul(*arg)
+ assert_(res.dtype == dt)
+
+ # vector vector returns scalars
+ res = self.matmul(v, v)
+ assert_(type(res) is np.dtype(dt).type)
+
+ def test_scalar_output(self):
+ vec1 = np.array([2])
+ vec2 = np.array([3, 4]).reshape(1, -1)
+ tgt = np.array([6, 8])
+ for dt in self.types[1:]:
+ v1 = vec1.astype(dt)
+ v2 = vec2.astype(dt)
+ res = self.matmul(v1, v2)
+ assert_equal(res, tgt)
+ res = self.matmul(v2.T, v1)
+ assert_equal(res, tgt)
+
+ # boolean type
+ vec = np.array([True, True], dtype='?').reshape(1, -1)
+ res = self.matmul(vec[:, 0], vec)
+ assert_equal(res, True)
+
+ def test_vector_vector_values(self):
+ vec1 = np.array([1, 2])
+ vec2 = np.array([3, 4]).reshape(-1, 1)
+ tgt1 = np.array([11])
+ tgt2 = np.array([[3, 6], [4, 8]])
+ for dt in self.types[1:]:
+ v1 = vec1.astype(dt)
+ v2 = vec2.astype(dt)
+ res = self.matmul(v1, v2)
+ assert_equal(res, tgt1)
+ # no broadcast, we must make v1 into a 2d ndarray
+ res = self.matmul(v2, v1.reshape(1, -1))
+ assert_equal(res, tgt2)
+
+ # boolean type
+ vec = np.array([True, True], dtype='?')
+ res = self.matmul(vec, vec)
+ assert_equal(res, True)
+
+ def test_vector_matrix_values(self):
+ vec = np.array([1, 2])
+ mat1 = np.array([[1, 2], [3, 4]])
+ mat2 = np.stack([mat1]*2, axis=0)
+ tgt1 = np.array([7, 10])
+ tgt2 = np.stack([tgt1]*2, axis=0)
+ for dt in self.types[1:]:
+ v = vec.astype(dt)
+ m1 = mat1.astype(dt)
+ m2 = mat2.astype(dt)
+ res = self.matmul(v, m1)
+ assert_equal(res, tgt1)
+ res = self.matmul(v, m2)
+ assert_equal(res, tgt2)
+
+ # boolean type
+ vec = np.array([True, False])
+ mat1 = np.array([[True, False], [False, True]])
+ mat2 = np.stack([mat1]*2, axis=0)
+ tgt1 = np.array([True, False])
+ tgt2 = np.stack([tgt1]*2, axis=0)
+
+ res = self.matmul(vec, mat1)
+ assert_equal(res, tgt1)
+ res = self.matmul(vec, mat2)
+ assert_equal(res, tgt2)
+
+ def test_matrix_vector_values(self):
+ vec = np.array([1, 2])
+ mat1 = np.array([[1, 2], [3, 4]])
+ mat2 = np.stack([mat1]*2, axis=0)
+ tgt1 = np.array([5, 11])
+ tgt2 = np.stack([tgt1]*2, axis=0)
+ for dt in self.types[1:]:
+ v = vec.astype(dt)
+ m1 = mat1.astype(dt)
+ m2 = mat2.astype(dt)
+ res = self.matmul(m1, v)
+ assert_equal(res, tgt1)
+ res = self.matmul(m2, v)
+ assert_equal(res, tgt2)
+
+ # boolean type
+ vec = np.array([True, False])
+ mat1 = np.array([[True, False], [False, True]])
+ mat2 = np.stack([mat1]*2, axis=0)
+ tgt1 = np.array([True, False])
+ tgt2 = np.stack([tgt1]*2, axis=0)
+
+ res = self.matmul(vec, mat1)
+ assert_equal(res, tgt1)
+ res = self.matmul(vec, mat2)
+ assert_equal(res, tgt2)
+
+ def test_matrix_matrix_values(self):
+ mat1 = np.array([[1, 2], [3, 4]])
+ mat2 = np.array([[1, 0], [1, 1]])
+ mat12 = np.stack([mat1, mat2], axis=0)
+ mat21 = np.stack([mat2, mat1], axis=0)
+ tgt11 = np.array([[7, 10], [15, 22]])
+ tgt12 = np.array([[3, 2], [7, 4]])
+ tgt21 = np.array([[1, 2], [4, 6]])
+ tgt12_21 = np.stack([tgt12, tgt21], axis=0)
+ tgt11_12 = np.stack((tgt11, tgt12), axis=0)
+ tgt11_21 = np.stack((tgt11, tgt21), axis=0)
+ for dt in self.types[1:]:
+ m1 = mat1.astype(dt)
+ m2 = mat2.astype(dt)
+ m12 = mat12.astype(dt)
+ m21 = mat21.astype(dt)
+
+ # matrix @ matrix
+ res = self.matmul(m1, m2)
+ assert_equal(res, tgt12)
+ res = self.matmul(m2, m1)
+ assert_equal(res, tgt21)
+
+ # stacked @ matrix
+ res = self.matmul(m12, m1)
+ assert_equal(res, tgt11_21)
+
+ # matrix @ stacked
+ res = self.matmul(m1, m12)
+ assert_equal(res, tgt11_12)
+
+ # stacked @ stacked
+ res = self.matmul(m12, m21)
+ assert_equal(res, tgt12_21)
+
+ # boolean type
+ m1 = np.array([[1, 1], [0, 0]], dtype=np.bool_)
+ m2 = np.array([[1, 0], [1, 1]], dtype=np.bool_)
+ m12 = np.stack([m1, m2], axis=0)
+ m21 = np.stack([m2, m1], axis=0)
+ tgt11 = m1
+ tgt12 = m1
+ tgt21 = np.array([[1, 1], [1, 1]], dtype=np.bool_)
+ tgt12_21 = np.stack([tgt12, tgt21], axis=0)
+ tgt11_12 = np.stack((tgt11, tgt12), axis=0)
+ tgt11_21 = np.stack((tgt11, tgt21), axis=0)
+
+ # matrix @ matrix
+ res = self.matmul(m1, m2)
+ assert_equal(res, tgt12)
+ res = self.matmul(m2, m1)
+ assert_equal(res, tgt21)
+
+ # stacked @ matrix
+ res = self.matmul(m12, m1)
+ assert_equal(res, tgt11_21)
+
+ # matrix @ stacked
+ res = self.matmul(m1, m12)
+ assert_equal(res, tgt11_12)
+
+ # stacked @ stacked
+ res = self.matmul(m12, m21)
+ assert_equal(res, tgt12_21)
+
+
+class TestMatmul(MatmulCommon):
+ matmul = np.matmul
+
+ def test_out_arg(self):
+ a = np.ones((5, 2), dtype=float)
+ b = np.array([[1, 3], [5, 7]], dtype=float)
+ tgt = np.dot(a, b)
+
+ # test as positional argument
+ msg = "out positional argument"
+ out = np.zeros((5, 2), dtype=float)
+ self.matmul(a, b, out)
+ assert_array_equal(out, tgt, err_msg=msg)
+
+ # test as keyword argument
+ msg = "out keyword argument"
+ out = np.zeros((5, 2), dtype=float)
+ self.matmul(a, b, out=out)
+ assert_array_equal(out, tgt, err_msg=msg)
+
+ # test out with not allowed type cast (safe casting)
+ msg = "Cannot cast ufunc matmul output"
+ out = np.zeros((5, 2), dtype=np.int32)
+ assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out)
+
+ # test out with type upcast to complex
+ out = np.zeros((5, 2), dtype=np.complex128)
+ c = self.matmul(a, b, out=out)
+ assert_(c is out)
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning, '')
+ c = c.astype(tgt.dtype)
+ assert_array_equal(c, tgt)
+
+ def test_out_contiguous(self):
+ a = np.ones((5, 2), dtype=float)
+ b = np.array([[1, 3], [5, 7]], dtype=float)
+ v = np.array([1, 3], dtype=float)
+ tgt = np.dot(a, b)
+ tgt_mv = np.dot(a, v)
+
+ # test out non-contiguous
+ out = np.ones((5, 2, 2), dtype=float)
+ c = self.matmul(a, b, out=out[..., 0])
+ assert c.base is out
+ assert_array_equal(c, tgt)
+ c = self.matmul(a, v, out=out[:, 0, 0])
+ assert_array_equal(c, tgt_mv)
+ c = self.matmul(v, a.T, out=out[:, 0, 0])
+ assert_array_equal(c, tgt_mv)
+
+ # test out contiguous in only last dim
+ out = np.ones((10, 2), dtype=float)
+ c = self.matmul(a, b, out=out[::2, :])
+ assert_array_equal(c, tgt)
+
+ # test transposes of out, args
+ out = np.ones((5, 2), dtype=float)
+ c = self.matmul(b.T, a.T, out=out.T)
+ assert_array_equal(out, tgt)
+
+ m1 = np.arange(15.).reshape(5, 3)
+ m2 = np.arange(21.).reshape(3, 7)
+ m3 = np.arange(30.).reshape(5, 6)[:, ::2] # non-contiguous
+ vc = np.arange(10.)
+ vr = np.arange(6.)
+ m0 = np.zeros((3, 0))
+ @pytest.mark.parametrize('args', (
+ # matrix-matrix
+ (m1, m2), (m2.T, m1.T), (m2.T.copy(), m1.T), (m2.T, m1.T.copy()),
+ # matrix-matrix-transpose, contiguous and non
+ (m1, m1.T), (m1.T, m1), (m1, m3.T), (m3, m1.T),
+ (m3, m3.T), (m3.T, m3),
+ # matrix-matrix non-contiguous
+ (m3, m2), (m2.T, m3.T), (m2.T.copy(), m3.T),
+ # vector-matrix, matrix-vector, contiguous
+ (m1, vr[:3]), (vc[:5], m1), (m1.T, vc[:5]), (vr[:3], m1.T),
+ # vector-matrix, matrix-vector, vector non-contiguous
+ (m1, vr[::2]), (vc[::2], m1), (m1.T, vc[::2]), (vr[::2], m1.T),
+ # vector-matrix, matrix-vector, matrix non-contiguous
+ (m3, vr[:3]), (vc[:5], m3), (m3.T, vc[:5]), (vr[:3], m3.T),
+ # vector-matrix, matrix-vector, both non-contiguous
+ (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T),
+ # size == 0
+ (m0, m0.T), (m0.T, m0), (m1, m0), (m0.T, m1.T),
+ ))
+ def test_dot_equivalent(self, args):
+ r1 = np.matmul(*args)
+ r2 = np.dot(*args)
+ assert_equal(r1, r2)
+
+ r3 = np.matmul(args[0].copy(), args[1].copy())
+ assert_equal(r1, r3)
+
+ def test_matmul_object(self):
+ import fractions
+
+ f = np.vectorize(fractions.Fraction)
+ def random_ints():
+ return np.random.randint(1, 1000, size=(10, 3, 3))
+ M1 = f(random_ints(), random_ints())
+ M2 = f(random_ints(), random_ints())
+
+ M3 = self.matmul(M1, M2)
+
+ [N1, N2, N3] = [a.astype(float) for a in [M1, M2, M3]]
+
+ assert_allclose(N3, self.matmul(N1, N2))
+
+ def test_matmul_object_type_scalar(self):
+ from fractions import Fraction as F
+ v = np.array([F(2,3), F(5,7)])
+ res = self.matmul(v, v)
+ assert_(type(res) is F)
+
+ def test_matmul_empty(self):
+ a = np.empty((3, 0), dtype=object)
+ b = np.empty((0, 3), dtype=object)
+ c = np.zeros((3, 3))
+ assert_array_equal(np.matmul(a, b), c)
+
+ def test_matmul_exception_multiply(self):
+ # test that matmul fails if `__mul__` is missing
+ class add_not_multiply():
+ def __add__(self, other):
+ return self
+ a = np.full((3,3), add_not_multiply())
+ with assert_raises(TypeError):
+ b = np.matmul(a, a)
+
+ def test_matmul_exception_add(self):
+ # test that matmul fails if `__add__` is missing
+ class multiply_not_add():
+ def __mul__(self, other):
+ return self
+ a = np.full((3,3), multiply_not_add())
+ with assert_raises(TypeError):
+ b = np.matmul(a, a)
+
+ def test_matmul_bool(self):
+ # gh-14439
+ a = np.array([[1, 0],[1, 1]], dtype=bool)
+ assert np.max(a.view(np.uint8)) == 1
+ b = np.matmul(a, a)
+ # matmul with boolean output should always be 0, 1
+ assert np.max(b.view(np.uint8)) == 1
+
+ np.random.seed(42)
+ d = np.random.randint(2, size=4*5, dtype=np.int8)
+ d = d.reshape(4, 5) > 0
+ out1 = np.matmul(d, d.reshape(5, 4))
+ out2 = np.dot(d, d.reshape(5, 4))
+ assert_equal(out1, out2)
+
+ c = np.matmul(np.zeros((2, 0), dtype=bool), np.zeros(0, dtype=bool))
+ assert not np.any(c)
+
+
+if sys.version_info[:2] >= (3, 5):
+ class TestMatmulOperator(MatmulCommon):
+ import operator
+ matmul = operator.matmul
+
+ def test_array_priority_override(self):
+
+ class A(object):
+ __array_priority__ = 1000
+
+ def __matmul__(self, other):
+ return "A"
+
+ def __rmatmul__(self, other):
+ return "A"
+
+ a = A()
+ b = np.ones(2)
+ assert_equal(self.matmul(a, b), "A")
+ assert_equal(self.matmul(b, a), "A")
+
+ def test_matmul_raises(self):
+ assert_raises(TypeError, self.matmul, np.int8(5), np.int8(5))
+ assert_raises(TypeError, self.matmul, np.void(b'abc'), np.void(b'abc'))
+ assert_raises(ValueError, self.matmul, np.arange(10), np.void(b'abc'))
+
+ def test_matmul_inplace():
+ # It would be nice to support in-place matmul eventually, but for now
+ # we don't have a working implementation, so better just to error out
+ # and nudge people to writing "a = a @ b".
+ a = np.eye(3)
+ b = np.eye(3)
+ assert_raises(TypeError, a.__imatmul__, b)
+ import operator
+ assert_raises(TypeError, operator.imatmul, a, b)
+ # we avoid writing the token `exec` so as not to crash python 2's
+ # parser
+ exec_ = getattr(builtins, "exec")
+ assert_raises(TypeError, exec_, "a @= b", globals(), locals())
+
+ def test_matmul_axes():
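+ # The axes argument names, for each operand and for the output, the two
+ # axes that act as the matrix dimensions; all remaining axes broadcast.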
+ a = np.arange(3*4*5).reshape(3, 4, 5)
+ c = np.matmul(a, a, axes=[(-2, -1), (-1, -2), (1, 2)])
+ assert c.shape == (3, 4, 4)
+ d = np.matmul(a, a, axes=[(-2, -1), (-1, -2), (0, 1)])
+ assert d.shape == (4, 4, 3)
+ e = np.swapaxes(d, 0, 2)
+ assert_array_equal(e, c)
+ f = np.matmul(a, np.arange(3), axes=[(1, 0), (0), (0)])
+ assert f.shape == (4, 5)
+
+
+class TestInner(object):
+
+ def test_inner_type_mismatch(self):
+ c = 1.
+ A = np.array((1,1), dtype='i,i')
+
+ assert_raises(TypeError, np.inner, c, A)
+ assert_raises(TypeError, np.inner, A, c)
+
+ def test_inner_scalar_and_vector(self):
+ for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?':
+ sca = np.array(3, dtype=dt)[()]
+ vec = np.array([1, 2], dtype=dt)
+ desired = np.array([3, 6], dtype=dt)
+ assert_equal(np.inner(vec, sca), desired)
+ assert_equal(np.inner(sca, vec), desired)
+
+ def test_vecself(self):
+ # Ticket 844.
+ # Inner product of a vector with itself segfaults or gives a
+ # meaningless result
+ a = np.zeros(shape=(1, 80), dtype=np.float64)
+ p = np.inner(a, a)
+ assert_almost_equal(p, 0, decimal=14)
+
+ def test_inner_product_with_various_contiguities(self):
+ # github issue 6532
+ for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?':
+ # check an inner product involving a matrix transpose
+ A = np.array([[1, 2], [3, 4]], dtype=dt)
+ B = np.array([[1, 3], [2, 4]], dtype=dt)
+ C = np.array([1, 1], dtype=dt)
+ desired = np.array([4, 6], dtype=dt)
+ assert_equal(np.inner(A.T, C), desired)
+ assert_equal(np.inner(C, A.T), desired)
+ assert_equal(np.inner(B, C), desired)
+ assert_equal(np.inner(C, B), desired)
+ # check a matrix product
+ desired = np.array([[7, 10], [15, 22]], dtype=dt)
+ assert_equal(np.inner(A, B), desired)
+ # check the syrk vs. gemm paths
+ desired = np.array([[5, 11], [11, 25]], dtype=dt)
+ assert_equal(np.inner(A, A), desired)
+ assert_equal(np.inner(A, A.copy()), desired)
+ # check an inner product involving an aliased and reversed view
+ a = np.arange(5).astype(dt)
+ b = a[::-1]
+ desired = np.array(10, dtype=dt).item()
+ assert_equal(np.inner(b, a), desired)
+
+ def test_3d_tensor(self):
+ for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?':
+ a = np.arange(24).reshape(2,3,4).astype(dt)
+ b = np.arange(24, 48).reshape(2,3,4).astype(dt)
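+ # np.inner contracts over the last axis of each operand, so two
+ # (2, 3, 4) arrays give a (2, 3, 2, 3) result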
+ desired = np.array(
+ [[[[ 158, 182, 206],
+ [ 230, 254, 278]],
+
+ [[ 566, 654, 742],
+ [ 830, 918, 1006]],
+
+ [[ 974, 1126, 1278],
+ [1430, 1582, 1734]]],
+
+ [[[1382, 1598, 1814],
+ [2030, 2246, 2462]],
+
+ [[1790, 2070, 2350],
+ [2630, 2910, 3190]],
+
+ [[2198, 2542, 2886],
+ [3230, 3574, 3918]]]],
+ dtype=dt
+ )
+ assert_equal(np.inner(a, b), desired)
+ assert_equal(np.inner(b, a).transpose(2,3,0,1), desired)
+
+
+class TestAlen(object):
+ def test_basic(self):
+ m = np.array([1, 2, 3])
+ assert_equal(np.alen(m), 3)
+
+ m = np.array([[1, 2, 3], [4, 5, 7]])
+ assert_equal(np.alen(m), 2)
+
+ m = [1, 2, 3]
+ assert_equal(np.alen(m), 3)
+
+ m = [[1, 2, 3], [4, 5, 7]]
+ assert_equal(np.alen(m), 2)
+
+ def test_singleton(self):
+ assert_equal(np.alen(5), 1)
+
+
+class TestChoose(object):
+ def setup(self):
+ self.x = 2*np.ones((3,), dtype=int)
+ self.y = 3*np.ones((3,), dtype=int)
+ self.x2 = 2*np.ones((2, 3), dtype=int)
+ self.y2 = 3*np.ones((2, 3), dtype=int)
+ self.ind = [0, 0, 1]
+
+ def test_basic(self):
+ A = np.choose(self.ind, (self.x, self.y))
+ assert_equal(A, [2, 2, 3])
+
+ def test_broadcast1(self):
+ A = np.choose(self.ind, (self.x2, self.y2))
+ assert_equal(A, [[2, 2, 3], [2, 2, 3]])
+
+ def test_broadcast2(self):
+ A = np.choose(self.ind, (self.x, self.y2))
+ assert_equal(A, [[2, 2, 3], [2, 2, 3]])
+
+
+class TestRepeat(object):
+ def setup(self):
+ self.m = np.array([1, 2, 3, 4, 5, 6])
+ self.m_rect = self.m.reshape((2, 3))
+
+ def test_basic(self):
+ A = np.repeat(self.m, [1, 3, 2, 1, 1, 2])
+ assert_equal(A, [1, 2, 2, 2, 3,
+ 3, 4, 5, 6, 6])
+
+ def test_broadcast1(self):
+ A = np.repeat(self.m, 2)
+ assert_equal(A, [1, 1, 2, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6])
+
+ def test_axis_spec(self):
+ A = np.repeat(self.m_rect, [2, 1], axis=0)
+ assert_equal(A, [[1, 2, 3],
+ [1, 2, 3],
+ [4, 5, 6]])
+
+ A = np.repeat(self.m_rect, [1, 3, 2], axis=1)
+ assert_equal(A, [[1, 2, 2, 2, 3, 3],
+ [4, 5, 5, 5, 6, 6]])
+
+ def test_broadcast2(self):
+ A = np.repeat(self.m_rect, 2, axis=0)
+ assert_equal(A, [[1, 2, 3],
+ [1, 2, 3],
+ [4, 5, 6],
+ [4, 5, 6]])
+
+ A = np.repeat(self.m_rect, 2, axis=1)
+ assert_equal(A, [[1, 1, 2, 2, 3, 3],
+ [4, 4, 5, 5, 6, 6]])
+
+
+# TODO: test for multidimensional
+NEIGH_MODE = {'zero': 0, 'one': 1, 'constant': 2, 'circular': 3, 'mirror': 4}
+
+
[email protected]('dt', [float, Decimal], ids=['float', 'object'])
+class TestNeighborhoodIter(object):
+ # Simple, 2d tests
+ def test_simple2d(self, dt):
+ # Test zero and one padding for simple data type
+ x = np.array([[0, 1], [2, 3]], dtype=dt)
+ r = [np.array([[0, 0, 0], [0, 0, 1]], dtype=dt),
+ np.array([[0, 0, 0], [0, 1, 0]], dtype=dt),
+ np.array([[0, 0, 1], [0, 2, 3]], dtype=dt),
+ np.array([[0, 1, 0], [2, 3, 0]], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 0, -1, 1], x[0], NEIGH_MODE['zero'])
+ assert_array_equal(l, r)
+
+ r = [np.array([[1, 1, 1], [1, 0, 1]], dtype=dt),
+ np.array([[1, 1, 1], [0, 1, 1]], dtype=dt),
+ np.array([[1, 0, 1], [1, 2, 3]], dtype=dt),
+ np.array([[0, 1, 1], [2, 3, 1]], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 0, -1, 1], x[0], NEIGH_MODE['one'])
+ assert_array_equal(l, r)
+
+ r = [np.array([[4, 4, 4], [4, 0, 1]], dtype=dt),
+ np.array([[4, 4, 4], [0, 1, 4]], dtype=dt),
+ np.array([[4, 0, 1], [4, 2, 3]], dtype=dt),
+ np.array([[0, 1, 4], [2, 3, 4]], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 0, -1, 1], 4, NEIGH_MODE['constant'])
+ assert_array_equal(l, r)
+
+ def test_mirror2d(self, dt):
+ x = np.array([[0, 1], [2, 3]], dtype=dt)
+ r = [np.array([[0, 0, 1], [0, 0, 1]], dtype=dt),
+ np.array([[0, 1, 1], [0, 1, 1]], dtype=dt),
+ np.array([[0, 0, 1], [2, 2, 3]], dtype=dt),
+ np.array([[0, 1, 1], [2, 3, 3]], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 0, -1, 1], x[0], NEIGH_MODE['mirror'])
+ assert_array_equal(l, r)
+
+ # Simple, 1d tests
+ def test_simple(self, dt):
+ # Test padding with constant values
+ x = np.linspace(1, 5, 5).astype(dt)
+ r = [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 0]]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 1], x[0], NEIGH_MODE['zero'])
+ assert_array_equal(l, r)
+
+ r = [[1, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 1]]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 1], x[0], NEIGH_MODE['one'])
+ assert_array_equal(l, r)
+
+ r = [[x[4], 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, x[4]]]
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-1, 1], x[4], NEIGH_MODE['constant'])
+ assert_array_equal(l, r)
+
+ # Test mirror modes
+ def test_mirror(self, dt):
+ x = np.linspace(1, 5, 5).astype(dt)
+ r = np.array([[2, 1, 1, 2, 3], [1, 1, 2, 3, 4], [1, 2, 3, 4, 5],
+ [2, 3, 4, 5, 5], [3, 4, 5, 5, 4]], dtype=dt)
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-2, 2], x[1], NEIGH_MODE['mirror'])
+ assert_([i.dtype == dt for i in l])
+ assert_array_equal(l, r)
+
+ # Circular mode
+ def test_circular(self, dt):
+ x = np.linspace(1, 5, 5).astype(dt)
+ r = np.array([[4, 5, 1, 2, 3], [5, 1, 2, 3, 4], [1, 2, 3, 4, 5],
+ [2, 3, 4, 5, 1], [3, 4, 5, 1, 2]], dtype=dt)
+ l = _multiarray_tests.test_neighborhood_iterator(
+ x, [-2, 2], x[0], NEIGH_MODE['circular'])
+ assert_array_equal(l, r)
+
+
+# Test stacking neighborhood iterators
+class TestStackedNeighborhoodIter(object):
+ # Simple, 1d test: stacking 2 constant-padded neigh iterators
+ def test_simple_const(self):
+ dt = np.float64
+ # Test zero and one padding for simple data type
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([0], dtype=dt),
+ np.array([0], dtype=dt),
+ np.array([1], dtype=dt),
+ np.array([2], dtype=dt),
+ np.array([3], dtype=dt),
+ np.array([0], dtype=dt),
+ np.array([0], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-2, 4], NEIGH_MODE['zero'], [0, 0], NEIGH_MODE['zero'])
+ assert_array_equal(l, r)
+
+ r = [np.array([1, 0, 1], dtype=dt),
+ np.array([0, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 0], dtype=dt),
+ np.array([3, 0, 1], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [-1, 1], NEIGH_MODE['one'])
+ assert_array_equal(l, r)
+
+ # 2nd simple, 1d test: stacking 2 neigh iterators, mixing const padding and
+ # mirror padding
+ def test_simple_mirror(self):
+ dt = np.float64
+ # Stacking zero on top of mirror
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([0, 1, 1], dtype=dt),
+ np.array([1, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 3], dtype=dt),
+ np.array([3, 3, 0], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['mirror'], [-1, 1], NEIGH_MODE['zero'])
+ assert_array_equal(l, r)
+
+ # Stacking mirror on top of zero
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([1, 0, 0], dtype=dt),
+ np.array([0, 0, 1], dtype=dt),
+ np.array([0, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 0], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [-2, 0], NEIGH_MODE['mirror'])
+ assert_array_equal(l, r)
+
+ # Stacking mirror on top of zero: 2nd
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([0, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 0], dtype=dt),
+ np.array([3, 0, 0], dtype=dt),
+ np.array([0, 0, 3], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [0, 2], NEIGH_MODE['mirror'])
+ assert_array_equal(l, r)
+
+ # Stacking mirror on top of zero: 3rd
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([1, 0, 0, 1, 2], dtype=dt),
+ np.array([0, 0, 1, 2, 3], dtype=dt),
+ np.array([0, 1, 2, 3, 0], dtype=dt),
+ np.array([1, 2, 3, 0, 0], dtype=dt),
+ np.array([2, 3, 0, 0, 3], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [-2, 2], NEIGH_MODE['mirror'])
+ assert_array_equal(l, r)
+
+ # 3rd simple, 1d test: stacking 2 neigh iterators, mixing const padding and
+ # circular padding
+ def test_simple_circular(self):
+ dt = np.float64
+ # Stacking zero on top of circular
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([0, 3, 1], dtype=dt),
+ np.array([3, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 1], dtype=dt),
+ np.array([3, 1, 0], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['circular'], [-1, 1], NEIGH_MODE['zero'])
+ assert_array_equal(l, r)
+
+ # Stacking circular on top of zero
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([3, 0, 0], dtype=dt),
+ np.array([0, 0, 1], dtype=dt),
+ np.array([0, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 0], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [-2, 0], NEIGH_MODE['circular'])
+ assert_array_equal(l, r)
+
+ # Stacking circular on top of zero: 2nd
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([0, 1, 2], dtype=dt),
+ np.array([1, 2, 3], dtype=dt),
+ np.array([2, 3, 0], dtype=dt),
+ np.array([3, 0, 0], dtype=dt),
+ np.array([0, 0, 1], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [0, 2], NEIGH_MODE['circular'])
+ assert_array_equal(l, r)
+
+ # Stacking circular on top of zero: 3rd
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([3, 0, 0, 1, 2], dtype=dt),
+ np.array([0, 0, 1, 2, 3], dtype=dt),
+ np.array([0, 1, 2, 3, 0], dtype=dt),
+ np.array([1, 2, 3, 0, 0], dtype=dt),
+ np.array([2, 3, 0, 0, 1], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [-1, 3], NEIGH_MODE['zero'], [-2, 2], NEIGH_MODE['circular'])
+ assert_array_equal(l, r)
+
+ # 4th simple, 1d test: stacking 2 neigh iterators, but with lower iterator
+ # being strictly within the array
+ def test_simple_strict_within(self):
+ dt = np.float64
+ # Stacking zero on top of zero, first neighborhood strictly inside the
+ # array
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([1, 2, 3, 0], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [1, 1], NEIGH_MODE['zero'], [-1, 2], NEIGH_MODE['zero'])
+ assert_array_equal(l, r)
+
+ # Stacking mirror on top of zero, first neighborhood strictly inside the
+ # array
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([1, 2, 3, 3], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [1, 1], NEIGH_MODE['zero'], [-1, 2], NEIGH_MODE['mirror'])
+ assert_array_equal(l, r)
+
+ # Stacking mirror on top of zero, first neighborhood strictly inside the
+ # array
+ x = np.array([1, 2, 3], dtype=dt)
+ r = [np.array([1, 2, 3, 1], dtype=dt)]
+ l = _multiarray_tests.test_neighborhood_iterator_oob(
+ x, [1, 1], NEIGH_MODE['zero'], [-1, 2], NEIGH_MODE['circular'])
+ assert_array_equal(l, r)
+
+class TestWarnings(object):
+
+ def test_complex_warning(self):
+ x = np.array([1, 2])
+ y = np.array([1-2j, 1+2j])
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", np.ComplexWarning)
+ assert_raises(np.ComplexWarning, x.__setitem__, slice(None), y)
+ assert_equal(x, [1, 2])
+
+
+class TestMinScalarType(object):
+
+ def test_usigned_shortshort(self):
+ dt = np.min_scalar_type(2**8-1)
+ wanted = np.dtype('uint8')
+ assert_equal(wanted, dt)
+
+ def test_usigned_short(self):
+ dt = np.min_scalar_type(2**16-1)
+ wanted = np.dtype('uint16')
+ assert_equal(wanted, dt)
+
+ def test_usigned_int(self):
+ dt = np.min_scalar_type(2**32-1)
+ wanted = np.dtype('uint32')
+ assert_equal(wanted, dt)
+
+ def test_usigned_longlong(self):
+ dt = np.min_scalar_type(2**63-1)
+ wanted = np.dtype('uint64')
+ assert_equal(wanted, dt)
+
+ def test_object(self):
+ dt = np.min_scalar_type(2**64)
+ wanted = np.dtype('O')
+ assert_equal(wanted, dt)
+
+
+from numpy.core._internal import _dtype_from_pep3118
+
+
+class TestPEP3118Dtype(object):
+ def _check(self, spec, wanted):
+ dt = np.dtype(wanted)
+ actual = _dtype_from_pep3118(spec)
+ assert_equal(actual, dt,
+ err_msg="spec %r != dtype %r" % (spec, wanted))
+
+ def test_native_padding(self):
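+ # '@' requests native alignment (the int field is padded out to its
+ # natural alignment), while '=' requests the standard, packed layout.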
+ align = np.dtype('i').alignment
+ for j in range(8):
+ if j == 0:
+ s = 'bi'
+ else:
+ s = 'b%dxi' % j
+ self._check('@'+s, {'f0': ('i1', 0),
+ 'f1': ('i', align*(1 + j//align))})
+ self._check('='+s, {'f0': ('i1', 0),
+ 'f1': ('i', 1+j)})
+
+ def test_native_padding_2(self):
+ # Native padding should work also for structs and sub-arrays
+ self._check('x3T{xi}', {'f0': (({'f0': ('i', 4)}, (3,)), 4)})
+ self._check('^x3T{xi}', {'f0': (({'f0': ('i', 1)}, (3,)), 1)})
+
+ def test_trailing_padding(self):
+ # Trailing padding should be included, *and*, the item size
+ # should match the alignment if in aligned mode
+ align = np.dtype('i').alignment
+ size = np.dtype('i').itemsize
+
+ def aligned(n):
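+ # round n up to a multiple of the int alignment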
+ return align*(1 + (n-1)//align)
+
+ base = dict(formats=['i'], names=['f0'])
+
+ self._check('ix', dict(itemsize=aligned(size + 1), **base))
+ self._check('ixx', dict(itemsize=aligned(size + 2), **base))
+ self._check('ixxx', dict(itemsize=aligned(size + 3), **base))
+ self._check('ixxxx', dict(itemsize=aligned(size + 4), **base))
+ self._check('i7x', dict(itemsize=aligned(size + 7), **base))
+
+ self._check('^ix', dict(itemsize=size + 1, **base))
+ self._check('^ixx', dict(itemsize=size + 2, **base))
+ self._check('^ixxx', dict(itemsize=size + 3, **base))
+ self._check('^ixxxx', dict(itemsize=size + 4, **base))
+ self._check('^i7x', dict(itemsize=size + 7, **base))
+
+ def test_native_padding_3(self):
+ dt = np.dtype(
+ [('a', 'b'), ('b', 'i'),
+ ('sub', np.dtype('b,i')), ('c', 'i')],
+ align=True)
+ self._check("T{b:a:xxxi:b:T{b:f0:=i:f1:}:sub:xxxi:c:}", dt)
+
+ dt = np.dtype(
+ [('a', 'b'), ('b', 'i'), ('c', 'b'), ('d', 'b'),
+ ('e', 'b'), ('sub', np.dtype('b,i', align=True))])
+ self._check("T{b:a:=i:b:b:c:b:d:b:e:T{b:f0:xxxi:f1:}:sub:}", dt)
+
+ def test_padding_with_array_inside_struct(self):
+ dt = np.dtype(
+ [('a', 'b'), ('b', 'i'), ('c', 'b', (3,)),
+ ('d', 'i')],
+ align=True)
+ self._check("T{b:a:xxxi:b:3b:c:xi:d:}", dt)
+
+ def test_byteorder_inside_struct(self):
+ # The byte order after @T{=i} should be '=', not '@'.
+ # Check this by noting the absence of native alignment.
+ self._check('@T{^i}xi', {'f0': ({'f0': ('i', 0)}, 0),
+ 'f1': ('i', 5)})
+
+ def test_intra_padding(self):
+ # Natively aligned sub-arrays may require some internal padding
+ align = np.dtype('i').alignment
+ size = np.dtype('i').itemsize
+
+ def aligned(n):
+ return (align*(1 + (n-1)//align))
+
+ self._check('(3)T{ix}', (dict(
+ names=['f0'],
+ formats=['i'],
+ offsets=[0],
+ itemsize=aligned(size + 1)
+ ), (3,)))
+
+ def test_char_vs_string(self):
+ dt = np.dtype('c')
+ self._check('c', dt)
+
+ dt = np.dtype([('f0', 'S1', (4,)), ('f1', 'S4')])
+ self._check('4c4s', dt)
+
+ def test_field_order(self):
+ # gh-9053 - previously, we relied on dictionary key order
+ self._check("(0)I:a:f:b:", [('a', 'I', (0,)), ('b', 'f')])
+ self._check("(0)I:b:f:a:", [('b', 'I', (0,)), ('a', 'f')])
+
+ def test_unnamed_fields(self):
+ self._check('ii', [('f0', 'i'), ('f1', 'i')])
+ self._check('ii:f0:', [('f1', 'i'), ('f0', 'i')])
+
+ self._check('i', 'i')
+ self._check('i:f0:', [('f0', 'i')])
+
+
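+def _pep3118_spec_example():
+    # Illustrative sketch (not part of the upstream test file): how a PEP 3118
+    # format string is translated into a structured dtype by the helper under
+    # test.  'i' is a C int and 'd' a C double; unnamed fields receive the
+    # default names f0, f1, ...
+    dt = _dtype_from_pep3118('id')
+    assert dt.names == ('f0', 'f1')
+    assert dt['f0'] == np.dtype('i')
+    assert dt['f1'] == np.dtype('d')
+
+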
+class TestNewBufferProtocol(object):
+ """ Test PEP3118 buffers """
+
+ def _check_roundtrip(self, obj):
+ obj = np.asarray(obj)
+ x = memoryview(obj)
+ y = np.asarray(x)
+ y2 = np.array(x)
+ assert_(not y.flags.owndata)
+ assert_(y2.flags.owndata)
+
+ assert_equal(y.dtype, obj.dtype)
+ assert_equal(y.shape, obj.shape)
+ assert_array_equal(obj, y)
+
+ assert_equal(y2.dtype, obj.dtype)
+ assert_equal(y2.shape, obj.shape)
+ assert_array_equal(obj, y2)
+
+ def test_roundtrip(self):
+ x = np.array([1, 2, 3, 4, 5], dtype='i4')
+ self._check_roundtrip(x)
+
+ x = np.array([[1, 2], [3, 4]], dtype=np.float64)
+ self._check_roundtrip(x)
+
+ x = np.zeros((3, 3, 3), dtype=np.float32)[:, 0,:]
+ self._check_roundtrip(x)
+
+ dt = [('a', 'b'),
+ ('b', 'h'),
+ ('c', 'i'),
+ ('d', 'l'),
+ ('dx', 'q'),
+ ('e', 'B'),
+ ('f', 'H'),
+ ('g', 'I'),
+ ('h', 'L'),
+ ('hx', 'Q'),
+ ('i', np.single),
+ ('j', np.double),
+ ('k', np.longdouble),
+ ('ix', np.csingle),
+ ('jx', np.cdouble),
+ ('kx', np.clongdouble),
+ ('l', 'S4'),
+ ('m', 'U4'),
+ ('n', 'V3'),
+ ('o', '?'),
+ ('p', np.half),
+ ]
+ x = np.array(
+ [(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ b'aaaa', 'bbbb', b'xxx', True, 1.0)],
+ dtype=dt)
+ self._check_roundtrip(x)
+
+ x = np.array(([[1, 2], [3, 4]],), dtype=[('a', (int, (2, 2)))])
+ self._check_roundtrip(x)
+
+ x = np.array([1, 2, 3], dtype='>i2')
+ self._check_roundtrip(x)
+
+ x = np.array([1, 2, 3], dtype='<i2')
+ self._check_roundtrip(x)
+
+ x = np.array([1, 2, 3], dtype='>i4')
+ self._check_roundtrip(x)
+
+ x = np.array([1, 2, 3], dtype='<i4')
+ self._check_roundtrip(x)
+
+ # check long long can be represented as non-native
+ x = np.array([1, 2, 3], dtype='>q')
+ self._check_roundtrip(x)
+
+ # Native-only data types can be passed through the buffer interface
+ # only in native byte order
+ if sys.byteorder == 'little':
+ x = np.array([1, 2, 3], dtype='>g')
+ assert_raises(ValueError, self._check_roundtrip, x)
+ x = np.array([1, 2, 3], dtype='<g')
+ self._check_roundtrip(x)
+ else:
+ x = np.array([1, 2, 3], dtype='>g')
+ self._check_roundtrip(x)
+ x = np.array([1, 2, 3], dtype='<g')
+ assert_raises(ValueError, self._check_roundtrip, x)
+
+ def test_roundtrip_half(self):
+ half_list = [
+ 1.0,
+ -2.0,
+ 6.5504 * 10**4, # (max half precision)
+ 2**-14, # ~= 6.10352 * 10**-5 (minimum positive normal)
+ 2**-24, # ~= 5.96046 * 10**-8 (minimum strictly positive subnormal)
+ 0.0,
+ -0.0,
+ float('+inf'),
+ float('-inf'),
+ 0.333251953125, # ~= 1/3
+ ]
+
+ x = np.array(half_list, dtype='>e')
+ self._check_roundtrip(x)
+ x = np.array(half_list, dtype='<e')
+ self._check_roundtrip(x)
+
+ def test_roundtrip_single_types(self):
+ for typ in np.typeDict.values():
+ dtype = np.dtype(typ)
+
+ if dtype.char in 'Mm':
+ # datetimes cannot be used in buffers
+ continue
+ if dtype.char == 'V':
+ # skip void
+ continue
+
+ x = np.zeros(4, dtype=dtype)
+ self._check_roundtrip(x)
+
+ if dtype.char not in 'qQgG':
+ dt = dtype.newbyteorder('<')
+ x = np.zeros(4, dtype=dt)
+ self._check_roundtrip(x)
+
+ dt = dtype.newbyteorder('>')
+ x = np.zeros(4, dtype=dt)
+ self._check_roundtrip(x)
+
+ def test_roundtrip_scalar(self):
+ # Issue #4015.
+ self._check_roundtrip(0)
+
+ def test_invalid_buffer_format(self):
+ # datetime64 cannot be used fully in a buffer yet
+ # Should be fixed in the next Numpy major release
+ dt = np.dtype([('a', 'uint16'), ('b', 'M8[s]')])
+ a = np.empty(3, dt)
+ assert_raises((ValueError, BufferError), memoryview, a)
+ assert_raises((ValueError, BufferError), memoryview, np.array((3), 'M8[D]'))
+
+ def test_export_simple_1d(self):
+ x = np.array([1, 2, 3, 4, 5], dtype='i')
+ y = memoryview(x)
+ assert_equal(y.format, 'i')
+ assert_equal(y.shape, (5,))
+ assert_equal(y.ndim, 1)
+ assert_equal(y.strides, (4,))
+ assert_equal(y.suboffsets, EMPTY)
+ assert_equal(y.itemsize, 4)
+
+ def test_export_simple_nd(self):
+ x = np.array([[1, 2], [3, 4]], dtype=np.float64)
+ y = memoryview(x)
+ assert_equal(y.format, 'd')
+ assert_equal(y.shape, (2, 2))
+ assert_equal(y.ndim, 2)
+ assert_equal(y.strides, (16, 8))
+ assert_equal(y.suboffsets, EMPTY)
+ assert_equal(y.itemsize, 8)
+
+ def test_export_discontiguous(self):
+ x = np.zeros((3, 3, 3), dtype=np.float32)[:, 0,:]
+ y = memoryview(x)
+ assert_equal(y.format, 'f')
+ assert_equal(y.shape, (3, 3))
+ assert_equal(y.ndim, 2)
+ assert_equal(y.strides, (36, 4))
+ assert_equal(y.suboffsets, EMPTY)
+ assert_equal(y.itemsize, 4)
+
+ def test_export_record(self):
+ dt = [('a', 'b'),
+ ('b', 'h'),
+ ('c', 'i'),
+ ('d', 'l'),
+ ('dx', 'q'),
+ ('e', 'B'),
+ ('f', 'H'),
+ ('g', 'I'),
+ ('h', 'L'),
+ ('hx', 'Q'),
+ ('i', np.single),
+ ('j', np.double),
+ ('k', np.longdouble),
+ ('ix', np.csingle),
+ ('jx', np.cdouble),
+ ('kx', np.clongdouble),
+ ('l', 'S4'),
+ ('m', 'U4'),
+ ('n', 'V3'),
+ ('o', '?'),
+ ('p', np.half),
+ ]
+ x = np.array(
+ [(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ b'aaaa', 'bbbb', b' ', True, 1.0)],
+ dtype=dt)
+ y = memoryview(x)
+ assert_equal(y.shape, (1,))
+ assert_equal(y.ndim, 1)
+ assert_equal(y.suboffsets, EMPTY)
+
+ sz = sum([np.dtype(b).itemsize for a, b in dt])
+ if np.dtype('l').itemsize == 4:
+ assert_equal(y.format, 'T{b:a:=h:b:i:c:l:d:q:dx:B:e:@H:f:=I:g:L:h:Q:hx:f:i:d:j:^g:k:=Zf:ix:Zd:jx:^Zg:kx:4s:l:=4w:m:3x:n:?:o:@e:p:}')
+ else:
+ assert_equal(y.format, 'T{b:a:=h:b:i:c:q:d:q:dx:B:e:@H:f:=I:g:Q:h:Q:hx:f:i:d:j:^g:k:=Zf:ix:Zd:jx:^Zg:kx:4s:l:=4w:m:3x:n:?:o:@e:p:}')
+ # Cannot test if NPY_RELAXED_STRIDES_CHECKING changes the strides
+ if not (np.ones(1).strides[0] == np.iinfo(np.intp).max):
+ assert_equal(y.strides, (sz,))
+ assert_equal(y.itemsize, sz)
+
+ def test_export_subarray(self):
+ x = np.array(([[1, 2], [3, 4]],), dtype=[('a', ('i', (2, 2)))])
+ y = memoryview(x)
+ assert_equal(y.format, 'T{(2,2)i:a:}')
+ assert_equal(y.shape, EMPTY)
+ assert_equal(y.ndim, 0)
+ assert_equal(y.strides, EMPTY)
+ assert_equal(y.suboffsets, EMPTY)
+ assert_equal(y.itemsize, 16)
+
+ def test_export_endian(self):
+ x = np.array([1, 2, 3], dtype='>i')
+ y = memoryview(x)
+ if sys.byteorder == 'little':
+ assert_equal(y.format, '>i')
+ else:
+ assert_equal(y.format, 'i')
+
+ x = np.array([1, 2, 3], dtype='<i')
+ y = memoryview(x)
+ if sys.byteorder == 'little':
+ assert_equal(y.format, 'i')
+ else:
+ assert_equal(y.format, '<i')
+
+ def test_export_flags(self):
+ # Check SIMPLE flag, see also gh-3613 (exception should be BufferError)
+ assert_raises(ValueError,
+ _multiarray_tests.get_buffer_info,
+ np.arange(5)[::2], ('SIMPLE',))
+
+ def test_padding(self):
+ for j in range(8):
+ x = np.array([(1,), (2,)], dtype={'f0': (int, j)})
+ self._check_roundtrip(x)
+
+ def test_reference_leak(self):
+ if HAS_REFCOUNT:
+ count_1 = sys.getrefcount(np.core._internal)
+ a = np.zeros(4)
+ b = memoryview(a)
+ c = np.asarray(b)
+ if HAS_REFCOUNT:
+ count_2 = sys.getrefcount(np.core._internal)
+ assert_equal(count_1, count_2)
+ del c # avoid pyflakes unused variable warning.
+
+ def test_padded_struct_array(self):
+ dt1 = np.dtype(
+ [('a', 'b'), ('b', 'i'), ('sub', np.dtype('b,i')), ('c', 'i')],
+ align=True)
+ x1 = np.arange(dt1.itemsize, dtype=np.int8).view(dt1)
+ self._check_roundtrip(x1)
+
+ dt2 = np.dtype(
+ [('a', 'b'), ('b', 'i'), ('c', 'b', (3,)), ('d', 'i')],
+ align=True)
+ x2 = np.arange(dt2.itemsize, dtype=np.int8).view(dt2)
+ self._check_roundtrip(x2)
+
+ dt3 = np.dtype(
+ [('a', 'b'), ('b', 'i'), ('c', 'b'), ('d', 'b'),
+ ('e', 'b'), ('sub', np.dtype('b,i', align=True))])
+ x3 = np.arange(dt3.itemsize, dtype=np.int8).view(dt3)
+ self._check_roundtrip(x3)
+
+ def test_relaxed_strides(self):
+ # Test that relaxed strides are converted to non-relaxed
+ c = np.ones((1, 10, 10), dtype='i8')
+
+ # Check for NPY_RELAXED_STRIDES_CHECKING:
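+        # (with relaxed strides checking enabled, a C-ordered (10, 1) array is
+        # also flagged F-contiguous, and strides of length-1 axes may be set to
+        # arbitrary values, as done below)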
+ if np.ones((10, 1), order="C").flags.f_contiguous:
+ c.strides = (-1, 80, 8)
+
+ assert_(memoryview(c).strides == (800, 80, 8))
+
+ # Writing C-contiguous data to a BytesIO buffer should work
+ fd = io.BytesIO()
+ fd.write(c.data)
+
+ fortran = c.T
+ assert_(memoryview(fortran).strides == (8, 80, 800))
+
+ arr = np.ones((1, 10))
+ if arr.flags.f_contiguous:
+ shape, strides = _multiarray_tests.get_buffer_info(
+ arr, ['F_CONTIGUOUS'])
+ assert_(strides[0] == 8)
+ arr = np.ones((10, 1), order='F')
+ shape, strides = _multiarray_tests.get_buffer_info(
+ arr, ['C_CONTIGUOUS'])
+ assert_(strides[-1] == 8)
+
+ def test_out_of_order_fields(self):
+ dt = np.dtype(dict(
+ formats=['<i4', '<i4'],
+ names=['one', 'two'],
+ offsets=[4, 0],
+ itemsize=8
+ ))
+
+ # overlapping fields cannot be represented by PEP3118
+ arr = np.empty(1, dt)
+ with assert_raises(ValueError):
+ memoryview(arr)
+
+ def test_max_dims(self):
+ a = np.empty((1,) * 32)
+ self._check_roundtrip(a)
+
+ @pytest.mark.skipif(sys.version_info < (2, 7, 7), reason="See gh-11115")
+ def test_error_too_many_dims(self):
+ def make_ctype(shape, scalar_type):
+ t = scalar_type
+ for dim in shape[::-1]:
+ t = dim * t
+ return t
+
+ # construct a memoryview with 33 dimensions
+ c_u8_33d = make_ctype((1,)*33, ctypes.c_uint8)
+ m = memoryview(c_u8_33d())
+ assert_equal(m.ndim, 33)
+
+ assert_raises_regex(
+ RuntimeError, "ndim",
+ np.array, m)
+
+ def test_error_pointer_type(self):
+ # gh-6741
+ m = memoryview(ctypes.pointer(ctypes.c_uint8()))
+ assert_('&' in m.format)
+
+ assert_raises_regex(
+ ValueError, "format string",
+ np.array, m)
+
+ def test_error_message_unsupported(self):
+        # wchar has no corresponding numpy type - if this changes in the future,
+        # we need a better way to construct an invalid memoryview format.
+ t = ctypes.c_wchar * 4
+ with assert_raises(ValueError) as cm:
+ np.array(t())
+
+ exc = cm.exception
+ if sys.version_info.major > 2:
+ with assert_raises_regex(
+ NotImplementedError,
+ r"Unrepresentable .* 'u' \(UCS-2 strings\)"
+ ):
+ raise exc.__cause__
+
+ def test_ctypes_integer_via_memoryview(self):
+ # gh-11150, due to bpo-10746
+ for c_integer in {ctypes.c_int, ctypes.c_long, ctypes.c_longlong}:
+ value = c_integer(42)
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings('always', r'.*\bctypes\b', RuntimeWarning)
+ np.asarray(value)
+
+ def test_ctypes_struct_via_memoryview(self):
+ # gh-10528
+ class foo(ctypes.Structure):
+ _fields_ = [('a', ctypes.c_uint8), ('b', ctypes.c_uint32)]
+ f = foo(a=1, b=2)
+
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings('always', r'.*\bctypes\b', RuntimeWarning)
+ arr = np.asarray(f)
+
+ assert_equal(arr['a'], 1)
+ assert_equal(arr['b'], 2)
+ f.a = 3
+ assert_equal(arr['a'], 3)
+
+
+class TestArrayAttributeDeletion(object):
+
+ def test_multiarray_writable_attributes_deletion(self):
+        # ticket #2046: these should not segfault, but raise AttributeError
+ a = np.ones(2)
+ attr = ['shape', 'strides', 'data', 'dtype', 'real', 'imag', 'flat']
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "Assigning the 'data' attribute")
+ for s in attr:
+ assert_raises(AttributeError, delattr, a, s)
+
+ def test_multiarray_not_writable_attributes_deletion(self):
+ a = np.ones(2)
+ attr = ["ndim", "flags", "itemsize", "size", "nbytes", "base",
+ "ctypes", "T", "__array_interface__", "__array_struct__",
+ "__array_priority__", "__array_finalize__"]
+ for s in attr:
+ assert_raises(AttributeError, delattr, a, s)
+
+ def test_multiarray_flags_writable_attribute_deletion(self):
+ a = np.ones(2).flags
+ attr = ['writebackifcopy', 'updateifcopy', 'aligned', 'writeable']
+ for s in attr:
+ assert_raises(AttributeError, delattr, a, s)
+
+ def test_multiarray_flags_not_writable_attribute_deletion(self):
+ a = np.ones(2).flags
+ attr = ["contiguous", "c_contiguous", "f_contiguous", "fortran",
+ "owndata", "fnc", "forc", "behaved", "carray", "farray",
+ "num"]
+ for s in attr:
+ assert_raises(AttributeError, delattr, a, s)
+
+
+class TestArrayInterface():
+ class Foo(object):
+ def __init__(self, value):
+ self.value = value
+ self.iface = {'typestr': 'f8'}
+
+ def __float__(self):
+ return float(self.value)
+
+ @property
+ def __array_interface__(self):
+ return self.iface
+
+
+ f = Foo(0.5)
+
+ @pytest.mark.parametrize('val, iface, expected', [
+ (f, {}, 0.5),
+ ([f], {}, [0.5]),
+ ([f, f], {}, [0.5, 0.5]),
+ (f, {'shape': ()}, 0.5),
+ (f, {'shape': None}, TypeError),
+ (f, {'shape': (1, 1)}, [[0.5]]),
+ (f, {'shape': (2,)}, ValueError),
+ (f, {'strides': ()}, 0.5),
+ (f, {'strides': (2,)}, ValueError),
+ (f, {'strides': 16}, TypeError),
+ ])
+ def test_scalar_interface(self, val, iface, expected):
+ # Test scalar coercion within the array interface
+ self.f.iface = {'typestr': 'f8'}
+ self.f.iface.update(iface)
+ if HAS_REFCOUNT:
+ pre_cnt = sys.getrefcount(np.dtype('f8'))
+ if isinstance(expected, type):
+ assert_raises(expected, np.array, val)
+ else:
+ result = np.array(val)
+ assert_equal(np.array(val), expected)
+ assert result.dtype == 'f8'
+ del result
+ if HAS_REFCOUNT:
+ post_cnt = sys.getrefcount(np.dtype('f8'))
+ assert_equal(pre_cnt, post_cnt)
+
+def test_interface_no_shape():
+ class ArrayLike(object):
+ array = np.array(1)
+ __array_interface__ = array.__array_interface__
+ assert_equal(np.array(ArrayLike()), 1)
+
+
+def test_array_interface_itemsize():
+ # See gh-6361
+ my_dtype = np.dtype({'names': ['A', 'B'], 'formats': ['f4', 'f4'],
+ 'offsets': [0, 8], 'itemsize': 16})
+ a = np.ones(10, dtype=my_dtype)
+ descr_t = np.dtype(a.__array_interface__['descr'])
+ typestr_t = np.dtype(a.__array_interface__['typestr'])
+ assert_equal(descr_t.itemsize, typestr_t.itemsize)
+
+
+def test_array_interface_empty_shape():
+ # See gh-7994
+ arr = np.array([1, 2, 3])
+ interface1 = dict(arr.__array_interface__)
+ interface1['shape'] = ()
+
+ class DummyArray1(object):
+ __array_interface__ = interface1
+
+ # NOTE: Because Py2 str/Py3 bytes supports the buffer interface, setting
+ # the interface data to bytes would invoke the bug this tests for, that
+ # __array_interface__ with shape=() is not allowed if the data is an object
+ # exposing the buffer interface
+ interface2 = dict(interface1)
+ interface2['data'] = arr[0].tobytes()
+
+ class DummyArray2(object):
+ __array_interface__ = interface2
+
+ arr1 = np.asarray(DummyArray1())
+ arr2 = np.asarray(DummyArray2())
+ arr3 = arr[:1].reshape(())
+ assert_equal(arr1, arr2)
+ assert_equal(arr1, arr3)
+
[email protected](IS_PYPY, reason='PyDict_GetItemString(.., "data") segfaults')
+def test_array_interface_offset():
+ arr = np.array([1, 2, 3], dtype='int32')
+ interface = dict(arr.__array_interface__)
+ if sys.version_info[0] < 3:
+ interface['data'] = buffer(arr)
+ else:
+ interface['data'] = memoryview(arr)
+ interface['shape'] = (2,)
+ interface['offset'] = 4
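+    # a 4-byte offset skips the first int32 element, so with shape (2,) the
+    # interface exposes arr[1:] without copying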
+
+
+ class DummyArray(object):
+ __array_interface__ = interface
+
+ arr1 = np.asarray(DummyArray())
+ assert_equal(arr1, arr[1:])
+
+def test_flat_element_deletion():
+ it = np.ones(3).flat
+ try:
+ del it[1]
+ del it[1:2]
+ except TypeError:
+ pass
+ except Exception:
+ raise AssertionError
+
+
+def test_scalar_element_deletion():
+ a = np.zeros(2, dtype=[('x', 'int'), ('y', 'int')])
+ assert_raises(ValueError, a[0].__delitem__, 'x')
+
+
+class TestMemEventHook(object):
+ def test_mem_seteventhook(self):
+ # The actual tests are within the C code in
+ # multiarray/_multiarray_tests.c.src
+ _multiarray_tests.test_pydatamem_seteventhook_start()
+ # force an allocation and free of a numpy array
+        # needs to be larger than the limit of the small memory cache in ctors.c
+ a = np.zeros(1000)
+ del a
+ break_cycles()
+ _multiarray_tests.test_pydatamem_seteventhook_end()
+
+class TestMapIter(object):
+ def test_mapiter(self):
+ # The actual tests are within the C code in
+ # multiarray/_multiarray_tests.c.src
+
+ a = np.arange(12).reshape((3, 4)).astype(float)
+ index = ([1, 1, 2, 0],
+ [0, 0, 2, 3])
+ vals = [50, 50, 30, 16]
+
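+        # repeated indices accumulate: a[1, 0] -> 4 + 50 + 50 = 104,
+        # a[2, 2] -> 10 + 30 = 40, a[0, 3] -> 3 + 16 = 19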
+ _multiarray_tests.test_inplace_increment(a, index, vals)
+ assert_equal(a, [[0.00, 1., 2.0, 19.],
+ [104., 5., 6.0, 7.0],
+ [8.00, 9., 40., 11.]])
+
+ b = np.arange(6).astype(float)
+ index = (np.array([1, 2, 0]),)
+ vals = [50, 4, 100.1]
+ _multiarray_tests.test_inplace_increment(b, index, vals)
+ assert_equal(b, [100.1, 51., 6., 3., 4., 5.])
+
+
+class TestAsCArray(object):
+ def test_1darray(self):
+ array = np.arange(24, dtype=np.double)
+ from_c = _multiarray_tests.test_as_c_array(array, 3)
+ assert_equal(array[3], from_c)
+
+ def test_2darray(self):
+ array = np.arange(24, dtype=np.double).reshape(3, 8)
+ from_c = _multiarray_tests.test_as_c_array(array, 2, 4)
+ assert_equal(array[2, 4], from_c)
+
+ def test_3darray(self):
+ array = np.arange(24, dtype=np.double).reshape(2, 3, 4)
+ from_c = _multiarray_tests.test_as_c_array(array, 1, 2, 3)
+ assert_equal(array[1, 2, 3], from_c)
+
+
+class TestConversion(object):
+ def test_array_scalar_relational_operation(self):
+ # All integer
+ for dt1 in np.typecodes['AllInteger']:
+ assert_(1 > np.array(0, dtype=dt1), "type %s failed" % (dt1,))
+ assert_(not 1 < np.array(0, dtype=dt1), "type %s failed" % (dt1,))
+
+ for dt2 in np.typecodes['AllInteger']:
+ assert_(np.array(1, dtype=dt1) > np.array(0, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(not np.array(1, dtype=dt1) < np.array(0, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+
+ # Unsigned integers
+ for dt1 in 'BHILQP':
+ assert_(-1 < np.array(1, dtype=dt1), "type %s failed" % (dt1,))
+ assert_(not -1 > np.array(1, dtype=dt1), "type %s failed" % (dt1,))
+ assert_(-1 != np.array(1, dtype=dt1), "type %s failed" % (dt1,))
+
+ # Unsigned vs signed
+ for dt2 in 'bhilqp':
+ assert_(np.array(1, dtype=dt1) > np.array(-1, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(not np.array(1, dtype=dt1) < np.array(-1, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(np.array(1, dtype=dt1) != np.array(-1, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+
+ # Signed integers and floats
+ for dt1 in 'bhlqp' + np.typecodes['Float']:
+ assert_(1 > np.array(-1, dtype=dt1), "type %s failed" % (dt1,))
+ assert_(not 1 < np.array(-1, dtype=dt1), "type %s failed" % (dt1,))
+ assert_(-1 == np.array(-1, dtype=dt1), "type %s failed" % (dt1,))
+
+ for dt2 in 'bhlqp' + np.typecodes['Float']:
+ assert_(np.array(1, dtype=dt1) > np.array(-1, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(not np.array(1, dtype=dt1) < np.array(-1, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(np.array(-1, dtype=dt1) == np.array(-1, dtype=dt2),
+ "type %s and %s failed" % (dt1, dt2))
+
+ def test_to_bool_scalar(self):
+ assert_equal(bool(np.array([False])), False)
+ assert_equal(bool(np.array([True])), True)
+ assert_equal(bool(np.array([[42]])), True)
+ assert_raises(ValueError, bool, np.array([1, 2]))
+
+ class NotConvertible(object):
+ def __bool__(self):
+ raise NotImplementedError
+ __nonzero__ = __bool__ # python 2
+
+ assert_raises(NotImplementedError, bool, np.array(NotConvertible()))
+ assert_raises(NotImplementedError, bool, np.array([NotConvertible()]))
+
+ self_containing = np.array([None])
+ self_containing[0] = self_containing
+ try:
+ Error = RecursionError
+ except NameError:
+ Error = RuntimeError # python < 3.5
+ assert_raises(Error, bool, self_containing) # previously stack overflow
+ self_containing[0] = None # resolve circular reference
+
+ def test_to_int_scalar(self):
+ # gh-9972 means that these aren't always the same
+ int_funcs = (int, lambda x: x.__int__())
+ for int_func in int_funcs:
+ assert_equal(int_func(np.array([1])), 1)
+ assert_equal(int_func(np.array([0])), 0)
+ assert_equal(int_func(np.array([[42]])), 42)
+ assert_raises(TypeError, int_func, np.array([1, 2]))
+
+ # gh-9972
+ assert_equal(4, int_func(np.array('4')))
+ assert_equal(5, int_func(np.bytes_(b'5')))
+ assert_equal(6, int_func(np.unicode_(u'6')))
+
+ class HasTrunc:
+ def __trunc__(self):
+ return 3
+ assert_equal(3, int_func(np.array(HasTrunc())))
+ assert_equal(3, int_func(np.array([HasTrunc()])))
+
+ class NotConvertible(object):
+ def __int__(self):
+ raise NotImplementedError
+ assert_raises(NotImplementedError,
+ int_func, np.array(NotConvertible()))
+ assert_raises(NotImplementedError,
+ int_func, np.array([NotConvertible()]))
+
+
+class TestWhere(object):
+ def test_basic(self):
+ dts = [bool, np.int16, np.int32, np.int64, np.double, np.complex128,
+ np.longdouble, np.clongdouble]
+ for dt in dts:
+ c = np.ones(53, dtype=bool)
+ assert_equal(np.where( c, dt(0), dt(1)), dt(0))
+ assert_equal(np.where(~c, dt(0), dt(1)), dt(1))
+ assert_equal(np.where(True, dt(0), dt(1)), dt(0))
+ assert_equal(np.where(False, dt(0), dt(1)), dt(1))
+ d = np.ones_like(c).astype(dt)
+ e = np.zeros_like(d)
+ r = d.astype(dt)
+ c[7] = False
+ r[7] = e[7]
+ assert_equal(np.where(c, e, e), e)
+ assert_equal(np.where(c, d, e), r)
+ assert_equal(np.where(c, d, e[0]), r)
+ assert_equal(np.where(c, d[0], e), r)
+ assert_equal(np.where(c[::2], d[::2], e[::2]), r[::2])
+ assert_equal(np.where(c[1::2], d[1::2], e[1::2]), r[1::2])
+ assert_equal(np.where(c[::3], d[::3], e[::3]), r[::3])
+ assert_equal(np.where(c[1::3], d[1::3], e[1::3]), r[1::3])
+ assert_equal(np.where(c[::-2], d[::-2], e[::-2]), r[::-2])
+ assert_equal(np.where(c[::-3], d[::-3], e[::-3]), r[::-3])
+ assert_equal(np.where(c[1::-3], d[1::-3], e[1::-3]), r[1::-3])
+
+ def test_exotic(self):
+ # object
+ assert_array_equal(np.where(True, None, None), np.array(None))
+ # zero sized
+ m = np.array([], dtype=bool).reshape(0, 3)
+ b = np.array([], dtype=np.float64).reshape(0, 3)
+ assert_array_equal(np.where(m, 0, b), np.array([]).reshape(0, 3))
+
+ # object cast
+ d = np.array([-1.34, -0.16, -0.54, -0.31, -0.08, -0.95, 0.000, 0.313,
+ 0.547, -0.18, 0.876, 0.236, 1.969, 0.310, 0.699, 1.013,
+ 1.267, 0.229, -1.39, 0.487])
+ nan = float('NaN')
+ e = np.array(['5z', '0l', nan, 'Wz', nan, nan, 'Xq', 'cs', nan, nan,
+ 'QN', nan, nan, 'Fd', nan, nan, 'kp', nan, '36', 'i1'],
+ dtype=object)
+ m = np.array([0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
+ 0, 1, 1, 0, 1, 1, 0, 1, 0, 0], dtype=bool)
+
+ r = e[:]
+ r[np.where(m)] = d[np.where(m)]
+ assert_array_equal(np.where(m, d, e), r)
+
+ r = e[:]
+ r[np.where(~m)] = d[np.where(~m)]
+ assert_array_equal(np.where(m, e, d), r)
+
+ assert_array_equal(np.where(m, e, e), e)
+
+        # minimal dtype result with NaN scalar (e.g. as required by pandas)
+ d = np.array([1., 2.], dtype=np.float32)
+ e = float('NaN')
+ assert_equal(np.where(True, d, e).dtype, np.float32)
+ e = float('Infinity')
+ assert_equal(np.where(True, d, e).dtype, np.float32)
+ e = float('-Infinity')
+ assert_equal(np.where(True, d, e).dtype, np.float32)
+ # also check upcast
+ e = float(1e150)
+ assert_equal(np.where(True, d, e).dtype, np.float64)
+
+ def test_ndim(self):
+ c = [True, False]
+ a = np.zeros((2, 25))
+ b = np.ones((2, 25))
+ r = np.where(np.array(c)[:,np.newaxis], a, b)
+ assert_array_equal(r[0], a[0])
+ assert_array_equal(r[1], b[0])
+
+ a = a.T
+ b = b.T
+ r = np.where(c, a, b)
+ assert_array_equal(r[:,0], a[:,0])
+ assert_array_equal(r[:,1], b[:,0])
+
+ def test_dtype_mix(self):
+ c = np.array([False, True, False, False, False, False, True, False,
+ False, False, True, False])
+ a = np.uint32(1)
+ b = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.],
+ dtype=np.float64)
+ r = np.array([5., 1., 3., 2., -1., -4., 1., -10., 10., 1., 1., 3.],
+ dtype=np.float64)
+ assert_equal(np.where(c, a, b), r)
+
+ a = a.astype(np.float32)
+ b = b.astype(np.int64)
+ assert_equal(np.where(c, a, b), r)
+
+ # non bool mask
+ c = c.astype(int)
+ c[c != 0] = 34242324
+ assert_equal(np.where(c, a, b), r)
+ # invert
+ tmpmask = c != 0
+ c[c == 0] = 41247212
+ c[tmpmask] = 0
+ assert_equal(np.where(c, b, a), r)
+
+ def test_foreign(self):
+ c = np.array([False, True, False, False, False, False, True, False,
+ False, False, True, False])
+ r = np.array([5., 1., 3., 2., -1., -4., 1., -10., 10., 1., 1., 3.],
+ dtype=np.float64)
+ a = np.ones(1, dtype='>i4')
+ b = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.],
+ dtype=np.float64)
+ assert_equal(np.where(c, a, b), r)
+
+ b = b.astype('>f8')
+ assert_equal(np.where(c, a, b), r)
+
+ a = a.astype('<i4')
+ assert_equal(np.where(c, a, b), r)
+
+ c = c.astype('>i4')
+ assert_equal(np.where(c, a, b), r)
+
+ def test_error(self):
+ c = [True, True]
+ a = np.ones((4, 5))
+ b = np.ones((5, 5))
+ assert_raises(ValueError, np.where, c, a, a)
+ assert_raises(ValueError, np.where, c[0], a, b)
+
+ def test_string(self):
+ # gh-4778 check strings are properly filled with nulls
+ a = np.array("abc")
+ b = np.array("x" * 753)
+ assert_equal(np.where(True, a, b), "abc")
+ assert_equal(np.where(False, b, a), "abc")
+
+ # check native datatype sized strings
+ a = np.array("abcd")
+ b = np.array("x" * 8)
+ assert_equal(np.where(True, a, b), "abcd")
+ assert_equal(np.where(False, b, a), "abcd")
+
+ def test_empty_result(self):
+ # pass empty where result through an assignment which reads the data of
+ # empty arrays, error detectable with valgrind, see gh-8922
+ x = np.zeros((1, 1))
+ ibad = np.vstack(np.where(x == 99.))
+ assert_array_equal(ibad,
+ np.atleast_2d(np.array([[],[]], dtype=np.intp)))
+
+ def test_largedim(self):
+ # invalid read regression gh-9304
+ shape = [10, 2, 3, 4, 5, 6]
+ np.random.seed(2)
+ array = np.random.rand(*shape)
+
+ for i in range(10):
+ benchmark = array.nonzero()
+ result = array.nonzero()
+ assert_array_equal(benchmark, result)
+
+
+if not IS_PYPY:
+ # sys.getsizeof() is not valid on PyPy
+ class TestSizeOf(object):
+
+ def test_empty_array(self):
+ x = np.array([])
+ assert_(sys.getsizeof(x) > 0)
+
+ def check_array(self, dtype):
+ elem_size = dtype(0).itemsize
+
+ for length in [10, 50, 100, 500]:
+ x = np.arange(length, dtype=dtype)
+ assert_(sys.getsizeof(x) > length * elem_size)
+
+ def test_array_int32(self):
+ self.check_array(np.int32)
+
+ def test_array_int64(self):
+ self.check_array(np.int64)
+
+ def test_array_float32(self):
+ self.check_array(np.float32)
+
+ def test_array_float64(self):
+ self.check_array(np.float64)
+
+ def test_view(self):
+ d = np.ones(100)
+ assert_(sys.getsizeof(d[...]) < sys.getsizeof(d))
+
+ def test_reshape(self):
+ d = np.ones(100)
+ assert_(sys.getsizeof(d) < sys.getsizeof(d.reshape(100, 1, 1).copy()))
+
+ def test_resize(self):
+ d = np.ones(100)
+ old = sys.getsizeof(d)
+ d.resize(50)
+ assert_(old > sys.getsizeof(d))
+ d.resize(150)
+ assert_(old < sys.getsizeof(d))
+
+ def test_error(self):
+ d = np.ones(100)
+ assert_raises(TypeError, d.__sizeof__, "a")
+
+
+class TestHashing(object):
+
+ def test_arrays_not_hashable(self):
+ x = np.ones(3)
+ assert_raises(TypeError, hash, x)
+
+ def test_collections_hashable(self):
+ x = np.array([])
+ assert_(not isinstance(x, collections_abc.Hashable))
+
+
+class TestArrayPriority(object):
+    # This will go away when __array_priority__ is settled; in the meantime
+    # it serves to check for unintended changes.
+ op = operator
+ binary_ops = [
+ op.pow, op.add, op.sub, op.mul, op.floordiv, op.truediv, op.mod,
+ op.and_, op.or_, op.xor, op.lshift, op.rshift, op.mod, op.gt,
+ op.ge, op.lt, op.le, op.ne, op.eq
+ ]
+
+    # See #7949. Don't use the "/" operator with the -3 switch, since python
+    # reports it as a DeprecationWarning
+ if sys.version_info[0] < 3 and not sys.py3kwarning:
+ binary_ops.append(op.div)
+
+ class Foo(np.ndarray):
+ __array_priority__ = 100.
+
+ def __new__(cls, *args, **kwargs):
+ return np.array(*args, **kwargs).view(cls)
+
+ class Bar(np.ndarray):
+ __array_priority__ = 101.
+
+ def __new__(cls, *args, **kwargs):
+ return np.array(*args, **kwargs).view(cls)
+
+ class Other(object):
+ __array_priority__ = 1000.
+
+ def _all(self, other):
+ return self.__class__()
+
+ __add__ = __radd__ = _all
+ __sub__ = __rsub__ = _all
+ __mul__ = __rmul__ = _all
+ __pow__ = __rpow__ = _all
+ __div__ = __rdiv__ = _all
+ __mod__ = __rmod__ = _all
+ __truediv__ = __rtruediv__ = _all
+ __floordiv__ = __rfloordiv__ = _all
+ __and__ = __rand__ = _all
+ __xor__ = __rxor__ = _all
+ __or__ = __ror__ = _all
+ __lshift__ = __rlshift__ = _all
+ __rshift__ = __rrshift__ = _all
+ __eq__ = _all
+ __ne__ = _all
+ __gt__ = _all
+ __ge__ = _all
+ __lt__ = _all
+ __le__ = _all
+
+ def test_ndarray_subclass(self):
+ a = np.array([1, 2])
+ b = self.Bar([1, 2])
+ for f in self.binary_ops:
+ msg = repr(f)
+ assert_(isinstance(f(a, b), self.Bar), msg)
+ assert_(isinstance(f(b, a), self.Bar), msg)
+
+ def test_ndarray_other(self):
+ a = np.array([1, 2])
+ b = self.Other()
+ for f in self.binary_ops:
+ msg = repr(f)
+ assert_(isinstance(f(a, b), self.Other), msg)
+ assert_(isinstance(f(b, a), self.Other), msg)
+
+ def test_subclass_subclass(self):
+ a = self.Foo([1, 2])
+ b = self.Bar([1, 2])
+ for f in self.binary_ops:
+ msg = repr(f)
+ assert_(isinstance(f(a, b), self.Bar), msg)
+ assert_(isinstance(f(b, a), self.Bar), msg)
+
+ def test_subclass_other(self):
+ a = self.Foo([1, 2])
+ b = self.Other()
+ for f in self.binary_ops:
+ msg = repr(f)
+ assert_(isinstance(f(a, b), self.Other), msg)
+ assert_(isinstance(f(b, a), self.Other), msg)
+
+
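+def _array_priority_example():
+    # Illustrative sketch (not part of the upstream test file): between two
+    # ndarray subclasses, the operand with the higher __array_priority__
+    # determines the class of the result of a binary operation.
+    class Low(np.ndarray):
+        __array_priority__ = 1.0
+
+    class High(np.ndarray):
+        __array_priority__ = 2.0
+
+    a = np.arange(3).view(Low)
+    b = np.arange(3).view(High)
+    assert isinstance(a + b, High)
+    assert isinstance(b + a, High)
+
+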
+class TestBytestringArrayNonzero(object):
+
+ def test_empty_bstring_array_is_falsey(self):
+ assert_(not np.array([''], dtype=str))
+
+ def test_whitespace_bstring_array_is_falsey(self):
+ a = np.array(['spam'], dtype=str)
+ a[0] = ' \0\0'
+ assert_(not a)
+
+ def test_all_null_bstring_array_is_falsey(self):
+ a = np.array(['spam'], dtype=str)
+ a[0] = '\0\0\0\0'
+ assert_(not a)
+
+ def test_null_inside_bstring_array_is_truthy(self):
+ a = np.array(['spam'], dtype=str)
+ a[0] = ' \0 \0'
+ assert_(a)
+
+
+class TestUnicodeArrayNonzero(object):
+
+ def test_empty_ustring_array_is_falsey(self):
+ assert_(not np.array([''], dtype=np.unicode))
+
+ def test_whitespace_ustring_array_is_falsey(self):
+ a = np.array(['eggs'], dtype=np.unicode)
+ a[0] = ' \0\0'
+ assert_(not a)
+
+ def test_all_null_ustring_array_is_falsey(self):
+ a = np.array(['eggs'], dtype=np.unicode)
+ a[0] = '\0\0\0\0'
+ assert_(not a)
+
+ def test_null_inside_ustring_array_is_truthy(self):
+ a = np.array(['eggs'], dtype=np.unicode)
+ a[0] = ' \0 \0'
+ assert_(a)
+
+
+class TestFormat(object):
+
+ def test_0d(self):
+ a = np.array(np.pi)
+ assert_equal('{:0.3g}'.format(a), '3.14')
+ assert_equal('{:0.3g}'.format(a[()]), '3.14')
+
+ def test_1d_no_format(self):
+ a = np.array([np.pi])
+ assert_equal('{}'.format(a), str(a))
+
+ def test_1d_format(self):
+ # until gh-5543, ensure that the behaviour matches what it used to be
+ a = np.array([np.pi])
+ if sys.version_info[:2] >= (3, 4):
+ assert_raises(TypeError, '{:30}'.format, a)
+ else:
+ with suppress_warnings() as sup:
+ sup.filter(PendingDeprecationWarning)
+ res = '{:30}'.format(a)
+ dst = object.__format__(a, '30')
+ assert_equal(res, dst)
+
+from numpy.testing import IS_PYPY
+
+class TestCTypes(object):
+
+ def test_ctypes_is_available(self):
+ test_arr = np.array([[1, 2, 3], [4, 5, 6]])
+
+ assert_equal(ctypes, test_arr.ctypes._ctypes)
+ assert_equal(tuple(test_arr.ctypes.shape), (2, 3))
+
+ def test_ctypes_is_not_available(self):
+ from numpy.core import _internal
+ _internal.ctypes = None
+ try:
+ test_arr = np.array([[1, 2, 3], [4, 5, 6]])
+
+ assert_(isinstance(test_arr.ctypes._ctypes,
+ _internal._missing_ctypes))
+ assert_equal(tuple(test_arr.ctypes.shape), (2, 3))
+ finally:
+ _internal.ctypes = ctypes
+
+ def _make_readonly(x):
+ x.flags.writeable = False
+ return x
+
+ @pytest.mark.parametrize('arr', [
+ np.array([1, 2, 3]),
+ np.array([['one', 'two'], ['three', 'four']]),
+ np.array((1, 2), dtype='i4,i4'),
+ np.zeros((2,), dtype=
+ np.dtype(dict(
+ formats=['<i4', '<i4'],
+ names=['a', 'b'],
+ offsets=[0, 2],
+ itemsize=6
+ ))
+ ),
+ np.array([None], dtype=object),
+ np.array([]),
+ np.empty((0, 0)),
+ _make_readonly(np.array([1, 2, 3])),
+ ], ids=[
+ '1d',
+ '2d',
+ 'structured',
+ 'overlapping',
+ 'object',
+ 'empty',
+ 'empty-2d',
+ 'readonly'
+ ])
+ def test_ctypes_data_as_holds_reference(self, arr):
+ # gh-9647
+ # create a copy to ensure that pytest does not mess with the refcounts
+ arr = arr.copy()
+
+ arr_ref = weakref.ref(arr)
+
+ ctypes_ptr = arr.ctypes.data_as(ctypes.c_void_p)
+
+ # `ctypes_ptr` should hold onto `arr`
+ del arr
+ break_cycles()
+ assert_(arr_ref() is not None, "ctypes pointer did not hold onto a reference")
+
+ # but when the `ctypes_ptr` object dies, so should `arr`
+ del ctypes_ptr
+ if IS_PYPY:
+ # Pypy does not recycle arr objects immediately. Trigger gc to
+ # release arr. Cpython uses refcounts. An explicit call to gc
+ # should not be needed here.
+ break_cycles()
+ assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
+
+ def test_ctypes_as_parameter_holds_reference(self):
+ arr = np.array([None]).copy()
+
+ arr_ref = weakref.ref(arr)
+
+ ctypes_ptr = arr.ctypes._as_parameter_
+
+ # `ctypes_ptr` should hold onto `arr`
+ del arr
+ break_cycles()
+ assert_(arr_ref() is not None, "ctypes pointer did not hold onto a reference")
+
+ # but when the `ctypes_ptr` object dies, so should `arr`
+ del ctypes_ptr
+ if IS_PYPY:
+ break_cycles()
+ assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
+
+
+class TestWritebackIfCopy(object):
+ # all these tests use the WRITEBACKIFCOPY mechanism
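+    # (when an operation needs a buffer with a different dtype or layout than
+    # the provided `out` array, it works on a temporary copy whose base is the
+    # original; resolving the copy writes the data back into the original)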
+ def test_argmax_with_out(self):
+ mat = np.eye(5)
+ out = np.empty(5, dtype='i2')
+ res = np.argmax(mat, 0, out=out)
+ assert_equal(res, range(5))
+
+ def test_argmin_with_out(self):
+ mat = -np.eye(5)
+ out = np.empty(5, dtype='i2')
+ res = np.argmin(mat, 0, out=out)
+ assert_equal(res, range(5))
+
+ def test_clip_with_out(self):
+ mat = np.eye(5)
+ out = np.eye(5, dtype='i2')
+ res = np.clip(mat, a_min=-10, a_max=0, out=out)
+ assert_(res is out)
+ assert_equal(np.sum(out), 0)
+
+ def test_insert_noncontiguous(self):
+ a = np.arange(6).reshape(2,3).T # force non-c-contiguous
+ # uses arr_insert
+ np.place(a, a>2, [44, 55])
+ assert_equal(a, np.array([[0, 44], [1, 55], [2, 44]]))
+ # hit one of the failing paths
+ assert_raises(ValueError, np.place, a, a>20, [])
+
+ def test_put_noncontiguous(self):
+ a = np.arange(6).reshape(2,3).T # force non-c-contiguous
+ np.put(a, [0, 2], [44, 55])
+ assert_equal(a, np.array([[44, 3], [55, 4], [2, 5]]))
+
+ def test_putmask_noncontiguous(self):
+ a = np.arange(6).reshape(2,3).T # force non-c-contiguous
+ # uses arr_putmask
+ np.putmask(a, a>2, a**2)
+ assert_equal(a, np.array([[0, 9], [1, 16], [2, 25]]))
+
+ def test_take_mode_raise(self):
+ a = np.arange(6, dtype='int')
+ out = np.empty(2, dtype='int')
+ np.take(a, [0, 2], out=out, mode='raise')
+ assert_equal(out, np.array([0, 2]))
+
+ def test_choose_mod_raise(self):
+ a = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1]])
+ out = np.empty((3,3), dtype='int')
+ choices = [-10, 10]
+ np.choose(a, choices, out=out, mode='raise')
+ assert_equal(out, np.array([[ 10, -10, 10],
+ [-10, 10, -10],
+ [ 10, -10, 10]]))
+
+ def test_flatiter__array__(self):
+ a = np.arange(9).reshape(3,3)
+ b = a.T.flat
+ c = b.__array__()
+ # triggers the WRITEBACKIFCOPY resolution, assuming refcount semantics
+ del c
+
+ def test_dot_out(self):
+ # if HAVE_CBLAS, will use WRITEBACKIFCOPY
+ a = np.arange(9, dtype=float).reshape(3,3)
+ b = np.dot(a, a, out=a)
+ assert_equal(b, np.array([[15, 18, 21], [42, 54, 66], [69, 90, 111]]))
+
+ def test_view_assign(self):
+ from numpy.core._multiarray_tests import npy_create_writebackifcopy, npy_resolve
+
+ arr = np.arange(9).reshape(3, 3).T
+ arr_wb = npy_create_writebackifcopy(arr)
+ assert_(arr_wb.flags.writebackifcopy)
+ assert_(arr_wb.base is arr)
+ arr_wb[...] = -100
+ npy_resolve(arr_wb)
+ # arr changes after resolve, even though we assigned to arr_wb
+ assert_equal(arr, -100)
+ # after resolve, the two arrays no longer reference each other
+ assert_(arr_wb.ctypes.data != 0)
+ assert_equal(arr_wb.base, None)
+ # assigning to arr_wb does not get transferred to arr
+ arr_wb[...] = 100
+ assert_equal(arr, -100)
+
+ def test_dealloc_warning(self):
+ with suppress_warnings() as sup:
+ sup.record(RuntimeWarning)
+ arr = np.arange(9).reshape(3, 3)
+ v = arr.T
+ _multiarray_tests.npy_abuse_writebackifcopy(v)
+ assert len(sup.log) == 1
+
+ def test_view_discard_refcount(self):
+ from numpy.core._multiarray_tests import npy_create_writebackifcopy, npy_discard
+
+ arr = np.arange(9).reshape(3, 3).T
+ orig = arr.copy()
+ if HAS_REFCOUNT:
+ arr_cnt = sys.getrefcount(arr)
+ arr_wb = npy_create_writebackifcopy(arr)
+ assert_(arr_wb.flags.writebackifcopy)
+ assert_(arr_wb.base is arr)
+ arr_wb[...] = -100
+ npy_discard(arr_wb)
+ # arr remains unchanged after discard
+ assert_equal(arr, orig)
+ # after discard, the two arrays no longer reference each other
+ assert_(arr_wb.ctypes.data != 0)
+ assert_equal(arr_wb.base, None)
+ if HAS_REFCOUNT:
+ assert_equal(arr_cnt, sys.getrefcount(arr))
+ # assigning to arr_wb does not get transferred to arr
+ arr_wb[...] = 100
+ assert_equal(arr, orig)
+
+
+class TestArange(object):
+ def test_infinite(self):
+ assert_raises_regex(
+ ValueError, "size exceeded",
+ np.arange, 0, np.inf
+ )
+
+ def test_nan_step(self):
+ assert_raises_regex(
+ ValueError, "cannot compute length",
+ np.arange, 0, 1, np.nan
+ )
+
+ def test_zero_step(self):
+ assert_raises(ZeroDivisionError, np.arange, 0, 10, 0)
+ assert_raises(ZeroDivisionError, np.arange, 0.0, 10.0, 0.0)
+
+ # empty range
+ assert_raises(ZeroDivisionError, np.arange, 0, 0, 0)
+ assert_raises(ZeroDivisionError, np.arange, 0.0, 0.0, 0.0)
+
+
+class TestArrayFinalize(object):
+ """ Tests __array_finalize__ """
+
+ def test_receives_base(self):
+ # gh-11237
+ class SavesBase(np.ndarray):
+ def __array_finalize__(self, obj):
+ self.saved_base = self.base
+
+ a = np.array(1).view(SavesBase)
+ assert_(a.saved_base is a.base)
+
+ def test_lifetime_on_error(self):
+ # gh-11237
+ class RaisesInFinalize(np.ndarray):
+ def __array_finalize__(self, obj):
+ # crash, but keep this object alive
+ raise Exception(self)
+
+ # a plain object can't be weakref'd
+ class Dummy(object): pass
+
+ # get a weak reference to an object within an array
+ obj_arr = np.array(Dummy())
+ obj_ref = weakref.ref(obj_arr[()])
+
+ # get an array that crashed in __array_finalize__
+ with assert_raises(Exception) as e:
+ obj_arr.view(RaisesInFinalize)
+ if sys.version_info.major == 2:
+ # prevent an extra reference being kept
+ sys.exc_clear()
+
+ obj_subarray = e.exception.args[0]
+ del e
+ assert_(isinstance(obj_subarray, RaisesInFinalize))
+
+ # reference should still be held by obj_arr
+ break_cycles()
+ assert_(obj_ref() is not None, "object should not already be dead")
+
+ del obj_arr
+ break_cycles()
+ assert_(obj_ref() is not None, "obj_arr should not hold the last reference")
+
+ del obj_subarray
+ break_cycles()
+ assert_(obj_ref() is None, "no references should remain")
+
+
+def test_orderconverter_with_nonASCII_unicode_ordering():
+ # gh-7475
+ a = np.arange(5)
+ assert_raises(ValueError, a.flatten, order=u'\xe2')
+
+
+def test_equal_override():
+ # gh-9153: ndarray.__eq__ uses special logic for structured arrays, which
+ # did not respect overrides with __array_priority__ or __array_ufunc__.
+ # The PR fixed this for __array_priority__ and __array_ufunc__ = None.
+ class MyAlwaysEqual(object):
+ def __eq__(self, other):
+ return "eq"
+
+ def __ne__(self, other):
+ return "ne"
+
+ class MyAlwaysEqualOld(MyAlwaysEqual):
+ __array_priority__ = 10000
+
+ class MyAlwaysEqualNew(MyAlwaysEqual):
+ __array_ufunc__ = None
+
+ array = np.array([(0, 1), (2, 3)], dtype='i4,i4')
+ for my_always_equal_cls in MyAlwaysEqualOld, MyAlwaysEqualNew:
+ my_always_equal = my_always_equal_cls()
+ assert_equal(my_always_equal == array, 'eq')
+ assert_equal(array == my_always_equal, 'eq')
+ assert_equal(my_always_equal != array, 'ne')
+ assert_equal(array != my_always_equal, 'ne')
+
+
+def test_npymath_complex():
+ # Smoketest npymath functions
+ from numpy.core._multiarray_tests import (
+ npy_cabs, npy_carg)
+
+ funcs = {npy_cabs: np.absolute,
+ npy_carg: np.angle}
+ vals = (1, np.inf, -np.inf, np.nan)
+ types = (np.complex64, np.complex128, np.clongdouble)
+
+ for fun, npfun in funcs.items():
+ for x, y in itertools.product(vals, vals):
+ for t in types:
+ z = t(complex(x, y))
+ got = fun(z)
+ expected = npfun(z)
+ assert_allclose(got, expected)
+
+
+def test_npymath_real():
+ # Smoketest npymath functions
+ from numpy.core._multiarray_tests import (
+ npy_log10, npy_cosh, npy_sinh, npy_tan, npy_tanh)
+
+ funcs = {npy_log10: np.log10,
+ npy_cosh: np.cosh,
+ npy_sinh: np.sinh,
+ npy_tan: np.tan,
+ npy_tanh: np.tanh}
+ vals = (1, np.inf, -np.inf, np.nan)
+ types = (np.float32, np.float64, np.longdouble)
+
+ with np.errstate(all='ignore'):
+ for fun, npfun in funcs.items():
+ for x, t in itertools.product(vals, types):
+ z = t(x)
+ got = fun(z)
+ expected = npfun(z)
+ assert_allclose(got, expected)
+
+def test_uintalignment_and_alignment():
+    # alignment code needs to satisfy these requirements:
+    #  1. numpy structs match C struct layout
+    #  2. ufuncs/casting is safe w.r.t. aligned access
+    #  3. copy code is safe w.r.t. "uint aligned" access
+ #
+ # Complex types are the main problem, whose alignment may not be the same
+ # as their "uint alignment".
+ #
+ # This test might only fail on certain platforms, where uint64 alignment is
+    # not equal to complex64 alignment. Checks 2 and 3 above will only fail
+    # when numpy is built with DEBUG=1.
+
+ d1 = np.dtype('u1,c8', align=True)
+ d2 = np.dtype('u4,c8', align=True)
+ d3 = np.dtype({'names': ['a', 'b'], 'formats': ['u1', d1]}, align=True)
+
+ assert_equal(np.zeros(1, dtype=d1)['f1'].flags['ALIGNED'], True)
+ assert_equal(np.zeros(1, dtype=d2)['f1'].flags['ALIGNED'], True)
+ assert_equal(np.zeros(1, dtype='u1,c8')['f1'].flags['ALIGNED'], False)
+
+ # check that C struct matches numpy struct size
+ s = _multiarray_tests.get_struct_alignments()
+ for d, (alignment, size) in zip([d1,d2,d3], s):
+ assert_equal(d.alignment, alignment)
+ assert_equal(d.itemsize, size)
+
+ # check that ufuncs don't complain in debug mode
+ # (this is probably OK if the aligned flag is true above)
+ src = np.zeros((2,2), dtype=d1)['f1'] # 4-byte aligned, often
+ np.exp(src) # assert fails?
+
+ # check that copy code doesn't complain in debug mode
+ dst = np.zeros((2,2), dtype='c8')
+ dst[:,1] = src[:,1] # assert in lowlevel_strided_loops fails?
+
+class TestAlignment(object):
+ # adapted from scipy._lib.tests.test__util.test__aligned_zeros
+ # Checks that unusual memory alignments don't trip up numpy.
+    # In particular, check that RELAXED_STRIDES don't trip alignment assertions
+    # for size-0 arrays in debug (non-NDEBUG) builds (gh-12503)
+
+ def check(self, shape, dtype, order, align):
+ err_msg = repr((shape, dtype, order, align))
+ x = _aligned_zeros(shape, dtype, order, align=align)
+ if align is None:
+ align = np.dtype(dtype).alignment
+ assert_equal(x.__array_interface__['data'][0] % align, 0)
+ if hasattr(shape, '__len__'):
+ assert_equal(x.shape, shape, err_msg)
+ else:
+ assert_equal(x.shape, (shape,), err_msg)
+ assert_equal(x.dtype, dtype)
+ if order == "C":
+ assert_(x.flags.c_contiguous, err_msg)
+ elif order == "F":
+ if x.size > 0:
+ assert_(x.flags.f_contiguous, err_msg)
+ elif order is None:
+ assert_(x.flags.c_contiguous, err_msg)
+ else:
+ raise ValueError()
+
+ def test_various_alignments(self):
+ for align in [1, 2, 3, 4, 8, 12, 16, 32, 64, None]:
+ for n in [0, 1, 3, 11]:
+ for order in ["C", "F", None]:
+ for dtype in list(np.typecodes["All"]) + ['i4,i4,i4']:
+ if dtype == 'O':
+ # object dtype can't be misaligned
+ continue
+ for shape in [n, (1, 2, 3, n)]:
+ self.check(shape, np.dtype(dtype), order, align)
+
+ def test_strided_loop_alignments(self):
+        # test in particular that complex64 and float128 use the right alignment
+        # code-paths, since these are the problematic cases. It is useful to
+        # turn on USE_DEBUG for this test, so that the lowlevel-loop asserts are run.
+ for align in [1, 2, 4, 8, 12, 16, None]:
+ xf64 = _aligned_zeros(3, np.float64)
+
+ xc64 = _aligned_zeros(3, np.complex64, align=align)
+ xf128 = _aligned_zeros(3, np.longdouble, align=align)
+
+ # test casting, both to and from misaligned
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning, "Casting complex values")
+ xc64.astype('f8')
+ xf64.astype(np.complex64)
+ test = xc64 + xf64
+
+ xf128.astype('f8')
+ xf64.astype(np.longdouble)
+ test = xf128 + xf64
+
+ test = xf128 + xc64
+
+ # test copy, both to and from misaligned
+ # contig copy
+ xf64[:] = xf64.copy()
+ xc64[:] = xc64.copy()
+ xf128[:] = xf128.copy()
+ # strided copy
+ xf64[::2] = xf64[::2].copy()
+ xc64[::2] = xc64[::2].copy()
+ xf128[::2] = xf128[::2].copy()
+
+def test_getfield():
+ a = np.arange(32, dtype='uint16')
+ if sys.byteorder == 'little':
+ i = 0
+ j = 1
+ else:
+ i = 1
+ j = 0
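+    # offset `i` is the byte holding the low-order bits of each uint16, so the
+    # int8 view reproduces the original values (all < 256); offset `j` is the
+    # high-order byte, which is zero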
+ b = a.getfield('int8', i)
+ assert_equal(b, a)
+ b = a.getfield('int8', j)
+ assert_equal(b, 0)
+ pytest.raises(ValueError, a.getfield, 'uint8', -1)
+ pytest.raises(ValueError, a.getfield, 'uint8', 16)
+ pytest.raises(ValueError, a.getfield, 'uint64', 0)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_nditer.py b/contrib/python/numpy/py2/numpy/core/tests/test_nditer.py
new file mode 100644
index 00000000000..3a24ce55ec1
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_nditer.py
@@ -0,0 +1,2861 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import pytest
+
+import numpy as np
+import numpy.core._multiarray_tests as _multiarray_tests
+from numpy import array, arange, nditer, all
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_raises,
+ HAS_REFCOUNT, suppress_warnings
+ )
+
+
+def iter_multi_index(i):
+ ret = []
+ while not i.finished:
+ ret.append(i.multi_index)
+ i.iternext()
+ return ret
+
+def iter_indices(i):
+ ret = []
+ while not i.finished:
+ ret.append(i.index)
+ i.iternext()
+ return ret
+
+def iter_iterindices(i):
+ ret = []
+ while not i.finished:
+ ret.append(i.iterindex)
+ i.iternext()
+ return ret
+
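+def _nditer_multi_index_example():
+    # Illustrative sketch (not part of the upstream test file): iterate a small
+    # C-contiguous array while tracking the multi-index of each element.
+    it = nditer(np.arange(4).reshape(2, 2), ['multi_index'], [['readonly']])
+    assert iter_multi_index(it) == [(0, 0), (0, 1), (1, 0), (1, 1)]
+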
[email protected](not HAS_REFCOUNT, reason="Python lacks refcounts")
+def test_iter_refcount():
+ # Make sure the iterator doesn't leak
+
+ # Basic
+ a = arange(6)
+ dt = np.dtype('f4').newbyteorder()
+ rc_a = sys.getrefcount(a)
+ rc_dt = sys.getrefcount(dt)
+ with nditer(a, [],
+ [['readwrite', 'updateifcopy']],
+ casting='unsafe',
+ op_dtypes=[dt]) as it:
+ assert_(not it.iterationneedsapi)
+ assert_(sys.getrefcount(a) > rc_a)
+ assert_(sys.getrefcount(dt) > rc_dt)
+ # del 'it'
+ it = None
+ assert_equal(sys.getrefcount(a), rc_a)
+ assert_equal(sys.getrefcount(dt), rc_dt)
+
+ # With a copy
+ a = arange(6, dtype='f4')
+ dt = np.dtype('f4')
+ rc_a = sys.getrefcount(a)
+ rc_dt = sys.getrefcount(dt)
+ it = nditer(a, [],
+ [['readwrite']],
+ op_dtypes=[dt])
+ rc2_a = sys.getrefcount(a)
+ rc2_dt = sys.getrefcount(dt)
+ it2 = it.copy()
+ assert_(sys.getrefcount(a) > rc2_a)
+ assert_(sys.getrefcount(dt) > rc2_dt)
+ it = None
+ assert_equal(sys.getrefcount(a), rc2_a)
+ assert_equal(sys.getrefcount(dt), rc2_dt)
+ it2 = None
+ assert_equal(sys.getrefcount(a), rc_a)
+ assert_equal(sys.getrefcount(dt), rc_dt)
+
+ del it2 # avoid pyflakes unused variable warning
+
+def test_iter_best_order():
+ # The iterator should always find the iteration order
+ # with increasing memory addresses
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3, 4), (2, 3, 4), (2, 3, 4, 3), (2, 3, 2, 2, 3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit) & dirs):
+ dirs_index[bit] = slice(None, None, -1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = nditer(aview, [], [['readonly']])
+ assert_equal([x for x in i], a)
+ # Fortran-order
+ i = nditer(aview.T, [], [['readonly']])
+ assert_equal([x for x in i], a)
+ # Other order
+ if len(shape) > 2:
+ i = nditer(aview.swapaxes(0, 1), [], [['readonly']])
+ assert_equal([x for x in i], a)
+
+def test_iter_c_order():
+ # Test forcing C order
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3, 4), (2, 3, 4), (2, 3, 4, 3), (2, 3, 2, 2, 3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit) & dirs):
+ dirs_index[bit] = slice(None, None, -1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = nditer(aview, order='C')
+ assert_equal([x for x in i], aview.ravel(order='C'))
+ # Fortran-order
+ i = nditer(aview.T, order='C')
+ assert_equal([x for x in i], aview.T.ravel(order='C'))
+ # Other order
+ if len(shape) > 2:
+ i = nditer(aview.swapaxes(0, 1), order='C')
+ assert_equal([x for x in i],
+ aview.swapaxes(0, 1).ravel(order='C'))
+
+def test_iter_f_order():
+ # Test forcing F order
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3, 4), (2, 3, 4), (2, 3, 4, 3), (2, 3, 2, 2, 3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit) & dirs):
+ dirs_index[bit] = slice(None, None, -1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = nditer(aview, order='F')
+ assert_equal([x for x in i], aview.ravel(order='F'))
+ # Fortran-order
+ i = nditer(aview.T, order='F')
+ assert_equal([x for x in i], aview.T.ravel(order='F'))
+ # Other order
+ if len(shape) > 2:
+ i = nditer(aview.swapaxes(0, 1), order='F')
+ assert_equal([x for x in i],
+ aview.swapaxes(0, 1).ravel(order='F'))
+
+def test_iter_c_or_f_order():
+ # Test forcing any contiguous (C or F) order
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3, 4), (2, 3, 4), (2, 3, 4, 3), (2, 3, 2, 2, 3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit) & dirs):
+ dirs_index[bit] = slice(None, None, -1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = nditer(aview, order='A')
+ assert_equal([x for x in i], aview.ravel(order='A'))
+ # Fortran-order
+ i = nditer(aview.T, order='A')
+ assert_equal([x for x in i], aview.T.ravel(order='A'))
+ # Other order
+ if len(shape) > 2:
+ i = nditer(aview.swapaxes(0, 1), order='A')
+ assert_equal([x for x in i],
+ aview.swapaxes(0, 1).ravel(order='A'))
+
+def test_iter_best_order_multi_index_1d():
+ # The multi-indices should be correct with any reordering
+
+ a = arange(4)
+ # 1D order
+ i = nditer(a, ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(0,), (1,), (2,), (3,)])
+ # 1D reversed order
+ i = nditer(a[::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(3,), (2,), (1,), (0,)])
+
+def test_iter_best_order_multi_index_2d():
+ # The multi-indices should be correct with any reordering
+
+ a = arange(6)
+ # 2D C-order
+ i = nditer(a.reshape(2, 3), ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)])
+ # 2D Fortran-order
+ i = nditer(a.reshape(2, 3).copy(order='F'), ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(0, 0), (1, 0), (0, 1), (1, 1), (0, 2), (1, 2)])
+ # 2D reversed C-order
+ i = nditer(a.reshape(2, 3)[::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(1, 0), (1, 1), (1, 2), (0, 0), (0, 1), (0, 2)])
+ i = nditer(a.reshape(2, 3)[:, ::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(0, 2), (0, 1), (0, 0), (1, 2), (1, 1), (1, 0)])
+ i = nditer(a.reshape(2, 3)[::-1, ::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(1, 2), (1, 1), (1, 0), (0, 2), (0, 1), (0, 0)])
+ # 2D reversed Fortran-order
+ i = nditer(a.reshape(2, 3).copy(order='F')[::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(1, 0), (0, 0), (1, 1), (0, 1), (1, 2), (0, 2)])
+ i = nditer(a.reshape(2, 3).copy(order='F')[:, ::-1],
+ ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(0, 2), (1, 2), (0, 1), (1, 1), (0, 0), (1, 0)])
+ i = nditer(a.reshape(2, 3).copy(order='F')[::-1, ::-1],
+ ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i), [(1, 2), (0, 2), (1, 1), (0, 1), (1, 0), (0, 0)])
+
+def test_iter_best_order_multi_index_3d():
+ # The multi-indices should be correct with any reordering
+
+ a = arange(12)
+ # 3D C-order
+ i = nditer(a.reshape(2, 3, 2), ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (0, 2, 0), (0, 2, 1),
+ (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1), (1, 2, 0), (1, 2, 1)])
+ # 3D Fortran-order
+ i = nditer(a.reshape(2, 3, 2).copy(order='F'), ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(0, 0, 0), (1, 0, 0), (0, 1, 0), (1, 1, 0), (0, 2, 0), (1, 2, 0),
+ (0, 0, 1), (1, 0, 1), (0, 1, 1), (1, 1, 1), (0, 2, 1), (1, 2, 1)])
+ # 3D reversed C-order
+ i = nditer(a.reshape(2, 3, 2)[::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1), (1, 2, 0), (1, 2, 1),
+ (0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (0, 2, 0), (0, 2, 1)])
+ i = nditer(a.reshape(2, 3, 2)[:, ::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(0, 2, 0), (0, 2, 1), (0, 1, 0), (0, 1, 1), (0, 0, 0), (0, 0, 1),
+ (1, 2, 0), (1, 2, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)])
+ i = nditer(a.reshape(2, 3, 2)[:,:, ::-1], ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(0, 0, 1), (0, 0, 0), (0, 1, 1), (0, 1, 0), (0, 2, 1), (0, 2, 0),
+ (1, 0, 1), (1, 0, 0), (1, 1, 1), (1, 1, 0), (1, 2, 1), (1, 2, 0)])
+ # 3D reversed Fortran-order
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[::-1],
+ ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(1, 0, 0), (0, 0, 0), (1, 1, 0), (0, 1, 0), (1, 2, 0), (0, 2, 0),
+ (1, 0, 1), (0, 0, 1), (1, 1, 1), (0, 1, 1), (1, 2, 1), (0, 2, 1)])
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[:, ::-1],
+ ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(0, 2, 0), (1, 2, 0), (0, 1, 0), (1, 1, 0), (0, 0, 0), (1, 0, 0),
+ (0, 2, 1), (1, 2, 1), (0, 1, 1), (1, 1, 1), (0, 0, 1), (1, 0, 1)])
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[:,:, ::-1],
+ ['multi_index'], [['readonly']])
+ assert_equal(iter_multi_index(i),
+ [(0, 0, 1), (1, 0, 1), (0, 1, 1), (1, 1, 1), (0, 2, 1), (1, 2, 1),
+ (0, 0, 0), (1, 0, 0), (0, 1, 0), (1, 1, 0), (0, 2, 0), (1, 2, 0)])
+
+def test_iter_best_order_c_index_1d():
+ # The C index should be correct with any reordering
+
+ a = arange(4)
+ # 1D order
+ i = nditer(a, ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [0, 1, 2, 3])
+ # 1D reversed order
+ i = nditer(a[::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [3, 2, 1, 0])
+
+def test_iter_best_order_c_index_2d():
+ # The C index should be correct with any reordering
+
+ a = arange(6)
+ # 2D C-order
+ i = nditer(a.reshape(2, 3), ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [0, 1, 2, 3, 4, 5])
+ # 2D Fortran-order
+ i = nditer(a.reshape(2, 3).copy(order='F'),
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [0, 3, 1, 4, 2, 5])
+ # 2D reversed C-order
+ i = nditer(a.reshape(2, 3)[::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [3, 4, 5, 0, 1, 2])
+ i = nditer(a.reshape(2, 3)[:, ::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [2, 1, 0, 5, 4, 3])
+ i = nditer(a.reshape(2, 3)[::-1, ::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [5, 4, 3, 2, 1, 0])
+ # 2D reversed Fortran-order
+ i = nditer(a.reshape(2, 3).copy(order='F')[::-1],
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [3, 0, 4, 1, 5, 2])
+ i = nditer(a.reshape(2, 3).copy(order='F')[:, ::-1],
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [2, 5, 1, 4, 0, 3])
+ i = nditer(a.reshape(2, 3).copy(order='F')[::-1, ::-1],
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i), [5, 2, 4, 1, 3, 0])
+
+def test_iter_best_order_c_index_3d():
+ # The C index should be correct with any reordering
+
+ a = arange(12)
+ # 3D C-order
+ i = nditer(a.reshape(2, 3, 2), ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+ # 3D Fortran-order
+ i = nditer(a.reshape(2, 3, 2).copy(order='F'),
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [0, 6, 2, 8, 4, 10, 1, 7, 3, 9, 5, 11])
+ # 3D reversed C-order
+ i = nditer(a.reshape(2, 3, 2)[::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5])
+ i = nditer(a.reshape(2, 3, 2)[:, ::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [4, 5, 2, 3, 0, 1, 10, 11, 8, 9, 6, 7])
+ i = nditer(a.reshape(2, 3, 2)[:,:, ::-1], ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10])
+ # 3D reversed Fortran-order
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[::-1],
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [6, 0, 8, 2, 10, 4, 7, 1, 9, 3, 11, 5])
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[:, ::-1],
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [4, 10, 2, 8, 0, 6, 5, 11, 3, 9, 1, 7])
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[:,:, ::-1],
+ ['c_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [1, 7, 3, 9, 5, 11, 0, 6, 2, 8, 4, 10])
+
+def test_iter_best_order_f_index_1d():
+ # The Fortran index should be correct with any reordering
+
+ a = arange(4)
+ # 1D order
+ i = nditer(a, ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [0, 1, 2, 3])
+ # 1D reversed order
+ i = nditer(a[::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [3, 2, 1, 0])
+
+def test_iter_best_order_f_index_2d():
+ # The Fortran index should be correct with any reordering
+
+ a = arange(6)
+ # 2D C-order
+ i = nditer(a.reshape(2, 3), ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [0, 2, 4, 1, 3, 5])
+ # 2D Fortran-order
+ i = nditer(a.reshape(2, 3).copy(order='F'),
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [0, 1, 2, 3, 4, 5])
+ # 2D reversed C-order
+ i = nditer(a.reshape(2, 3)[::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [1, 3, 5, 0, 2, 4])
+ i = nditer(a.reshape(2, 3)[:, ::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [4, 2, 0, 5, 3, 1])
+ i = nditer(a.reshape(2, 3)[::-1, ::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [5, 3, 1, 4, 2, 0])
+ # 2D reversed Fortran-order
+ i = nditer(a.reshape(2, 3).copy(order='F')[::-1],
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [1, 0, 3, 2, 5, 4])
+ i = nditer(a.reshape(2, 3).copy(order='F')[:, ::-1],
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [4, 5, 2, 3, 0, 1])
+ i = nditer(a.reshape(2, 3).copy(order='F')[::-1, ::-1],
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i), [5, 4, 3, 2, 1, 0])
+
+def test_iter_best_order_f_index_3d():
+ # The Fortran index should be correct with any reordering
+
+ a = arange(12)
+ # 3D C-order
+ i = nditer(a.reshape(2, 3, 2), ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [0, 6, 2, 8, 4, 10, 1, 7, 3, 9, 5, 11])
+ # 3D Fortran-order
+ i = nditer(a.reshape(2, 3, 2).copy(order='F'),
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+ # 3D reversed C-order
+ i = nditer(a.reshape(2, 3, 2)[::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [1, 7, 3, 9, 5, 11, 0, 6, 2, 8, 4, 10])
+ i = nditer(a.reshape(2, 3, 2)[:, ::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [4, 10, 2, 8, 0, 6, 5, 11, 3, 9, 1, 7])
+ i = nditer(a.reshape(2, 3, 2)[:,:, ::-1], ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [6, 0, 8, 2, 10, 4, 7, 1, 9, 3, 11, 5])
+ # 3D reversed Fortran-order
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[::-1],
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10])
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[:, ::-1],
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [4, 5, 2, 3, 0, 1, 10, 11, 8, 9, 6, 7])
+ i = nditer(a.reshape(2, 3, 2).copy(order='F')[:,:, ::-1],
+ ['f_index'], [['readonly']])
+ assert_equal(iter_indices(i),
+ [6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5])
+
+def test_iter_no_inner_full_coalesce():
+    # Check that external_loop ('no_inner') iterators coalesce into a
+    # single inner loop
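+    # With 'external_loop', i[0] yields 1-D chunks instead of scalars; when
+    # the view's strides line up, every dimension coalesces and the whole
+    # array is delivered as a single chunk of length prod(shape), which is
+    # what the assertions below check for each reversed/transposed view.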
+
+ for shape in [(5,), (3, 4), (2, 3, 4), (2, 3, 4, 3), (2, 3, 2, 2, 3)]:
+ size = np.prod(shape)
+ a = arange(size)
+ # Test each combination of forward and backwards indexing
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit) & dirs):
+ dirs_index[bit] = slice(None, None, -1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = nditer(aview, ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (size,))
+ # Fortran-order
+ i = nditer(aview.T, ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (size,))
+ # Other order
+ if len(shape) > 2:
+ i = nditer(aview.swapaxes(0, 1),
+ ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (size,))
+
+def test_iter_no_inner_dim_coalescing():
+    # Check external_loop ('no_inner') iterators whose dimensions may not
+    # coalesce completely
+
+ # Skipping the last element in a dimension prevents coalescing
+ # with the next-bigger dimension
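+    # Worked example: (2, 3, 4) sliced to (2, 3, 3) leaves a one-element gap
+    # after every run of 3, so only chunks of 3 can be exposed; slicing the
+    # middle axis to (2, 2, 4) keeps runs of 8 contiguous; dropping the last
+    # outer block leaves (1, 3, 4), which is one contiguous run of 12.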
+ a = arange(24).reshape(2, 3, 4)[:,:, :-1]
+ i = nditer(a, ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 2)
+ assert_equal(i[0].shape, (3,))
+ a = arange(24).reshape(2, 3, 4)[:, :-1,:]
+ i = nditer(a, ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 2)
+ assert_equal(i[0].shape, (8,))
+ a = arange(24).reshape(2, 3, 4)[:-1,:,:]
+ i = nditer(a, ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (12,))
+
+ # Even with lots of 1-sized dimensions, should still coalesce
+ a = arange(24).reshape(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1)
+ i = nditer(a, ['external_loop'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (24,))
+
+def test_iter_dim_coalescing():
+ # Check that the correct number of dimensions are coalesced
+
+ # Tracking a multi-index disables coalescing
+ a = arange(24).reshape(2, 3, 4)
+ i = nditer(a, ['multi_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+
+ # A tracked index can allow coalescing if it's compatible with the array
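+    # A C index on a C-contiguous array advances by 1 per element in memory
+    # order, so everything can still collapse to one dimension; on the
+    # transposed/swapped views the index would jump around, so all three
+    # dimensions have to be kept.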
+ a3d = arange(24).reshape(2, 3, 4)
+ i = nditer(a3d, ['c_index'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ i = nditer(a3d.swapaxes(0, 1), ['c_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+ i = nditer(a3d.T, ['c_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+ i = nditer(a3d.T, ['f_index'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ i = nditer(a3d.T.swapaxes(0, 1), ['f_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+
+ # When C or F order is forced, coalescing may still occur
+ a3d = arange(24).reshape(2, 3, 4)
+ i = nditer(a3d, order='C')
+ assert_equal(i.ndim, 1)
+ i = nditer(a3d.T, order='C')
+ assert_equal(i.ndim, 3)
+ i = nditer(a3d, order='F')
+ assert_equal(i.ndim, 3)
+ i = nditer(a3d.T, order='F')
+ assert_equal(i.ndim, 1)
+ i = nditer(a3d, order='A')
+ assert_equal(i.ndim, 1)
+ i = nditer(a3d.T, order='A')
+ assert_equal(i.ndim, 1)
+
+def test_iter_broadcasting():
+ # Standard NumPy broadcasting rules
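+    # Shapes are aligned from the trailing dimensions, and size-1 or missing
+    # dimensions are stretched to match; e.g. (2, 1) with (3,) broadcasts to
+    # (2, 3).  The iterator's shape and itersize reflect the broadcast result.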
+
+ # 1D with scalar
+ i = nditer([arange(6), np.int32(2)], ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (6,))
+
+ # 2D with scalar
+ i = nditer([arange(6).reshape(2, 3), np.int32(2)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2, 3))
+ # 2D with 1D
+ i = nditer([arange(6).reshape(2, 3), arange(3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2, 3))
+ i = nditer([arange(2).reshape(2, 1), arange(3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2, 3))
+ # 2D with 2D
+ i = nditer([arange(2).reshape(2, 1), arange(3).reshape(1, 3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2, 3))
+
+ # 3D with scalar
+ i = nditer([np.int32(2), arange(24).reshape(4, 2, 3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ # 3D with 1D
+ i = nditer([arange(3), arange(24).reshape(4, 2, 3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ i = nditer([arange(3), arange(8).reshape(4, 2, 1)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ # 3D with 2D
+ i = nditer([arange(6).reshape(2, 3), arange(24).reshape(4, 2, 3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ i = nditer([arange(2).reshape(2, 1), arange(24).reshape(4, 2, 3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ i = nditer([arange(3).reshape(1, 3), arange(8).reshape(4, 2, 1)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ # 3D with 3D
+ i = nditer([arange(2).reshape(1, 2, 1), arange(3).reshape(1, 1, 3),
+ arange(4).reshape(4, 1, 1)],
+ ['multi_index'], [['readonly']]*3)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ i = nditer([arange(6).reshape(1, 2, 3), arange(4).reshape(4, 1, 1)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+ i = nditer([arange(24).reshape(4, 2, 3), arange(12).reshape(4, 1, 3)],
+ ['multi_index'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4, 2, 3))
+
+def test_iter_itershape():
+ # Check that allocated outputs work with a specified shape
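+    # op_axes=[[0, 1, None], None] maps the input's two axes to the first
+    # two iterator dimensions and broadcasts it over a new third one; in
+    # itershape the explicit 4 fixes that new dimension's size, while the
+    # -1 entries let those sizes come from the operands (giving the
+    # (2, 3, 4) output checked below).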
+ a = np.arange(6, dtype='i2').reshape(2, 3)
+ i = nditer([a, None], [], [['readonly'], ['writeonly', 'allocate']],
+ op_axes=[[0, 1, None], None],
+ itershape=(-1, -1, 4))
+ assert_equal(i.operands[1].shape, (2, 3, 4))
+ assert_equal(i.operands[1].strides, (24, 8, 2))
+
+ i = nditer([a.T, None], [], [['readonly'], ['writeonly', 'allocate']],
+ op_axes=[[0, 1, None], None],
+ itershape=(-1, -1, 4))
+ assert_equal(i.operands[1].shape, (3, 2, 4))
+ assert_equal(i.operands[1].strides, (8, 24, 2))
+
+ i = nditer([a.T, None], [], [['readonly'], ['writeonly', 'allocate']],
+ order='F',
+ op_axes=[[0, 1, None], None],
+ itershape=(-1, -1, 4))
+ assert_equal(i.operands[1].shape, (3, 2, 4))
+ assert_equal(i.operands[1].strides, (2, 6, 12))
+
+ # If we specify 1 in the itershape, it shouldn't allow broadcasting
+ # of that dimension to a bigger value
+ assert_raises(ValueError, nditer, [a, None], [],
+ [['readonly'], ['writeonly', 'allocate']],
+ op_axes=[[0, 1, None], None],
+ itershape=(-1, 1, 4))
+    # Regression test: with itershape given but no op_axes, the internal
+    # op_axes are NULLed correctly
+ i = np.nditer([np.ones(2), None, None], itershape=(2,))
+
+def test_iter_broadcasting_errors():
+ # Check that errors are thrown for bad broadcasting shapes
+
+ # 1D with 1D
+ assert_raises(ValueError, nditer, [arange(2), arange(3)],
+ [], [['readonly']]*2)
+ # 2D with 1D
+ assert_raises(ValueError, nditer,
+ [arange(6).reshape(2, 3), arange(2)],
+ [], [['readonly']]*2)
+ # 2D with 2D
+ assert_raises(ValueError, nditer,
+ [arange(6).reshape(2, 3), arange(9).reshape(3, 3)],
+ [], [['readonly']]*2)
+ assert_raises(ValueError, nditer,
+ [arange(6).reshape(2, 3), arange(4).reshape(2, 2)],
+ [], [['readonly']]*2)
+ # 3D with 3D
+ assert_raises(ValueError, nditer,
+ [arange(36).reshape(3, 3, 4), arange(24).reshape(2, 3, 4)],
+ [], [['readonly']]*2)
+ assert_raises(ValueError, nditer,
+ [arange(8).reshape(2, 4, 1), arange(24).reshape(2, 3, 4)],
+ [], [['readonly']]*2)
+
+ # Verify that the error message mentions the right shapes
+ try:
+ nditer([arange(2).reshape(1, 2, 1),
+ arange(3).reshape(1, 3),
+ arange(6).reshape(2, 3)],
+ [],
+ [['readonly'], ['readonly'], ['writeonly', 'no_broadcast']])
+ raise AssertionError('Should have raised a broadcast error')
+ except ValueError as e:
+ msg = str(e)
+ # The message should contain the shape of the 3rd operand
+ assert_(msg.find('(2,3)') >= 0,
+ 'Message "%s" doesn\'t contain operand shape (2,3)' % msg)
+ # The message should contain the broadcast shape
+ assert_(msg.find('(1,2,3)') >= 0,
+ 'Message "%s" doesn\'t contain broadcast shape (1,2,3)' % msg)
+
+ try:
+ nditer([arange(6).reshape(2, 3), arange(2)],
+ [],
+ [['readonly'], ['readonly']],
+ op_axes=[[0, 1], [0, np.newaxis]],
+ itershape=(4, 3))
+ raise AssertionError('Should have raised a broadcast error')
+ except ValueError as e:
+ msg = str(e)
+ # The message should contain "shape->remappedshape" for each operand
+ assert_(msg.find('(2,3)->(2,3)') >= 0,
+ 'Message "%s" doesn\'t contain operand shape (2,3)->(2,3)' % msg)
+ assert_(msg.find('(2,)->(2,newaxis)') >= 0,
+                ('Message "%s" doesn\'t contain remapped operand shape ' +
+ '(2,)->(2,newaxis)') % msg)
+ # The message should contain the itershape parameter
+ assert_(msg.find('(4,3)') >= 0,
+ 'Message "%s" doesn\'t contain itershape parameter (4,3)' % msg)
+
+ try:
+ nditer([np.zeros((2, 1, 1)), np.zeros((2,))],
+ [],
+ [['writeonly', 'no_broadcast'], ['readonly']])
+ raise AssertionError('Should have raised a broadcast error')
+ except ValueError as e:
+ msg = str(e)
+ # The message should contain the shape of the bad operand
+ assert_(msg.find('(2,1,1)') >= 0,
+ 'Message "%s" doesn\'t contain operand shape (2,1,1)' % msg)
+ # The message should contain the broadcast shape
+ assert_(msg.find('(2,1,2)') >= 0,
+ 'Message "%s" doesn\'t contain the broadcast shape (2,1,2)' % msg)
+
+def test_iter_flags_errors():
+ # Check that bad combinations of flags produce errors
+
+ a = arange(6)
+
+ # Not enough operands
+ assert_raises(ValueError, nditer, [], [], [])
+ # Too many operands
+ assert_raises(ValueError, nditer, [a]*100, [], [['readonly']]*100)
+ # Bad global flag
+ assert_raises(ValueError, nditer, [a], ['bad flag'], [['readonly']])
+ # Bad op flag
+ assert_raises(ValueError, nditer, [a], [], [['readonly', 'bad flag']])
+ # Bad order parameter
+ assert_raises(ValueError, nditer, [a], [], [['readonly']], order='G')
+ # Bad casting parameter
+ assert_raises(ValueError, nditer, [a], [], [['readonly']], casting='noon')
+ # op_flags must match ops
+ assert_raises(ValueError, nditer, [a]*3, [], [['readonly']]*2)
+ # Cannot track both a C and an F index
+ assert_raises(ValueError, nditer, a,
+ ['c_index', 'f_index'], [['readonly']])
+ # Inner iteration and multi-indices/indices are incompatible
+ assert_raises(ValueError, nditer, a,
+ ['external_loop', 'multi_index'], [['readonly']])
+ assert_raises(ValueError, nditer, a,
+ ['external_loop', 'c_index'], [['readonly']])
+ assert_raises(ValueError, nditer, a,
+ ['external_loop', 'f_index'], [['readonly']])
+ # Must specify exactly one of readwrite/readonly/writeonly per operand
+ assert_raises(ValueError, nditer, a, [], [[]])
+ assert_raises(ValueError, nditer, a, [], [['readonly', 'writeonly']])
+ assert_raises(ValueError, nditer, a, [], [['readonly', 'readwrite']])
+ assert_raises(ValueError, nditer, a, [], [['writeonly', 'readwrite']])
+ assert_raises(ValueError, nditer, a,
+ [], [['readonly', 'writeonly', 'readwrite']])
+ # Python scalars are always readonly
+ assert_raises(TypeError, nditer, 1.5, [], [['writeonly']])
+ assert_raises(TypeError, nditer, 1.5, [], [['readwrite']])
+ # Array scalars are always readonly
+ assert_raises(TypeError, nditer, np.int32(1), [], [['writeonly']])
+ assert_raises(TypeError, nditer, np.int32(1), [], [['readwrite']])
+ # Check readonly array
+ a.flags.writeable = False
+ assert_raises(ValueError, nditer, a, [], [['writeonly']])
+ assert_raises(ValueError, nditer, a, [], [['readwrite']])
+ a.flags.writeable = True
+ # Multi-indices available only with the multi_index flag
+ i = nditer(arange(6), [], [['readonly']])
+ assert_raises(ValueError, lambda i:i.multi_index, i)
+ # Index available only with an index flag
+ assert_raises(ValueError, lambda i:i.index, i)
+    # Seeking by multi_index or index (GotoCoords/GotoIndex) is incompatible
+    # with buffering or external_loop
+
+ def assign_multi_index(i):
+ i.multi_index = (0,)
+
+ def assign_index(i):
+ i.index = 0
+
+ def assign_iterindex(i):
+ i.iterindex = 0
+
+ def assign_iterrange(i):
+ i.iterrange = (0, 1)
+ i = nditer(arange(6), ['external_loop'])
+ assert_raises(ValueError, assign_multi_index, i)
+ assert_raises(ValueError, assign_index, i)
+ assert_raises(ValueError, assign_iterindex, i)
+ assert_raises(ValueError, assign_iterrange, i)
+ i = nditer(arange(6), ['buffered'])
+ assert_raises(ValueError, assign_multi_index, i)
+ assert_raises(ValueError, assign_index, i)
+ assert_raises(ValueError, assign_iterrange, i)
+ # Can't iterate if size is zero
+ assert_raises(ValueError, nditer, np.array([]))
+
+def test_iter_slice():
+ a, b, c = np.arange(3), np.arange(3), np.arange(3.)
+ i = nditer([a, b, c], [], ['readwrite'])
+ with i:
+ i[0:2] = (3, 3)
+ assert_equal(a, [3, 1, 2])
+ assert_equal(b, [3, 1, 2])
+ assert_equal(c, [0, 1, 2])
+ i[1] = 12
+ assert_equal(i[0:2], [3, 12])
+
+def test_iter_assign_mapping():
+ a = np.arange(24, dtype='f8').reshape(2, 3, 4).T
+ it = np.nditer(a, [], [['readwrite', 'updateifcopy']],
+ casting='same_kind', op_dtypes=[np.dtype('f4')])
+ with it:
+ it.operands[0][...] = 3
+ it.operands[0][...] = 14
+ assert_equal(a, 14)
+ it = np.nditer(a, [], [['readwrite', 'updateifcopy']],
+ casting='same_kind', op_dtypes=[np.dtype('f4')])
+ with it:
+ x = it.operands[0][-1:1]
+ x[...] = 14
+ it.operands[0][...] = -1234
+ assert_equal(a, -1234)
+ # check for no warnings on dealloc
+ x = None
+ it = None
+
+def test_iter_nbo_align_contig():
+ # Check that byte order, alignment, and contig changes work
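+    # The op flags request properties of the operand the iterator exposes:
+    # 'nbo' native byte order, 'aligned' aligned data, 'contig' contiguous
+    # data.  When the original array doesn't satisfy them the iterator works
+    # on a copy or buffer, and 'updateifcopy' writes the result back when
+    # the iterator is closed (hence the context managers below).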
+
+ # Byte order change by requesting a specific dtype
+ a = np.arange(6, dtype='f4')
+ au = a.byteswap().newbyteorder()
+ assert_(a.dtype.byteorder != au.dtype.byteorder)
+ i = nditer(au, [], [['readwrite', 'updateifcopy']],
+ casting='equiv',
+ op_dtypes=[np.dtype('f4')])
+ with i:
+ # context manager triggers UPDATEIFCOPY on i at exit
+ assert_equal(i.dtypes[0].byteorder, a.dtype.byteorder)
+ assert_equal(i.operands[0].dtype.byteorder, a.dtype.byteorder)
+ assert_equal(i.operands[0], a)
+ i.operands[0][:] = 2
+ assert_equal(au, [2]*6)
+ del i # should not raise a warning
+ # Byte order change by requesting NBO
+ a = np.arange(6, dtype='f4')
+ au = a.byteswap().newbyteorder()
+ assert_(a.dtype.byteorder != au.dtype.byteorder)
+ with nditer(au, [], [['readwrite', 'updateifcopy', 'nbo']],
+ casting='equiv') as i:
+ # context manager triggers UPDATEIFCOPY on i at exit
+ assert_equal(i.dtypes[0].byteorder, a.dtype.byteorder)
+ assert_equal(i.operands[0].dtype.byteorder, a.dtype.byteorder)
+ assert_equal(i.operands[0], a)
+ i.operands[0][:] = 12345
+ i.operands[0][:] = 2
+ assert_equal(au, [2]*6)
+
+ # Unaligned input
+ a = np.zeros((6*4+1,), dtype='i1')[1:]
+ a.dtype = 'f4'
+ a[:] = np.arange(6, dtype='f4')
+ assert_(not a.flags.aligned)
+ # Without 'aligned', shouldn't copy
+ i = nditer(a, [], [['readonly']])
+ assert_(not i.operands[0].flags.aligned)
+ assert_equal(i.operands[0], a)
+ # With 'aligned', should make a copy
+ with nditer(a, [], [['readwrite', 'updateifcopy', 'aligned']]) as i:
+ assert_(i.operands[0].flags.aligned)
+ # context manager triggers UPDATEIFCOPY on i at exit
+ assert_equal(i.operands[0], a)
+ i.operands[0][:] = 3
+ assert_equal(a, [3]*6)
+
+ # Discontiguous input
+ a = arange(12)
+ # If it is contiguous, shouldn't copy
+ i = nditer(a[:6], [], [['readonly']])
+ assert_(i.operands[0].flags.contiguous)
+ assert_equal(i.operands[0], a[:6])
+ # If it isn't contiguous, should buffer
+ i = nditer(a[::2], ['buffered', 'external_loop'],
+ [['readonly', 'contig']],
+ buffersize=10)
+ assert_(i[0].flags.contiguous)
+ assert_equal(i[0], a[::2])
+
+def test_iter_array_cast():
+ # Check that arrays are cast as requested
+
+ # No cast 'f4' -> 'f4'
+ a = np.arange(6, dtype='f4').reshape(2, 3)
+ i = nditer(a, [], [['readwrite']], op_dtypes=[np.dtype('f4')])
+ with i:
+ assert_equal(i.operands[0], a)
+ assert_equal(i.operands[0].dtype, np.dtype('f4'))
+
+ # Byte-order cast '<f4' -> '>f4'
+ a = np.arange(6, dtype='<f4').reshape(2, 3)
+ with nditer(a, [], [['readwrite', 'updateifcopy']],
+ casting='equiv',
+ op_dtypes=[np.dtype('>f4')]) as i:
+ assert_equal(i.operands[0], a)
+ assert_equal(i.operands[0].dtype, np.dtype('>f4'))
+
+ # Safe case 'f4' -> 'f8'
+ a = np.arange(24, dtype='f4').reshape(2, 3, 4).swapaxes(1, 2)
+ i = nditer(a, [], [['readonly', 'copy']],
+ casting='safe',
+ op_dtypes=[np.dtype('f8')])
+ assert_equal(i.operands[0], a)
+ assert_equal(i.operands[0].dtype, np.dtype('f8'))
+ # The memory layout of the temporary should match a (a is (48,4,16))
+ # except negative strides get flipped to positive strides.
+ assert_equal(i.operands[0].strides, (96, 8, 32))
+ a = a[::-1,:, ::-1]
+ i = nditer(a, [], [['readonly', 'copy']],
+ casting='safe',
+ op_dtypes=[np.dtype('f8')])
+ assert_equal(i.operands[0], a)
+ assert_equal(i.operands[0].dtype, np.dtype('f8'))
+ assert_equal(i.operands[0].strides, (96, 8, 32))
+
+ # Same-kind cast 'f8' -> 'f4' -> 'f8'
+ a = np.arange(24, dtype='f8').reshape(2, 3, 4).T
+ with nditer(a, [],
+ [['readwrite', 'updateifcopy']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('f4')]) as i:
+ assert_equal(i.operands[0], a)
+ assert_equal(i.operands[0].dtype, np.dtype('f4'))
+ assert_equal(i.operands[0].strides, (4, 16, 48))
+ # Check that WRITEBACKIFCOPY is activated at exit
+ i.operands[0][2, 1, 1] = -12.5
+ assert_(a[2, 1, 1] != -12.5)
+ assert_equal(a[2, 1, 1], -12.5)
+
+ a = np.arange(6, dtype='i4')[::-2]
+ with nditer(a, [],
+ [['writeonly', 'updateifcopy']],
+ casting='unsafe',
+ op_dtypes=[np.dtype('f4')]) as i:
+ assert_equal(i.operands[0].dtype, np.dtype('f4'))
+ # Even though the stride was negative in 'a', it
+ # becomes positive in the temporary
+ assert_equal(i.operands[0].strides, (4,))
+ i.operands[0][:] = [1, 2, 3]
+ assert_equal(a, [1, 2, 3])
+
+def test_iter_array_cast_errors():
+ # Check that invalid casts are caught
+
+ # Need to enable copying for casts to occur
+ assert_raises(TypeError, nditer, arange(2, dtype='f4'), [],
+ [['readonly']], op_dtypes=[np.dtype('f8')])
+ # Also need to allow casting for casts to occur
+ assert_raises(TypeError, nditer, arange(2, dtype='f4'), [],
+ [['readonly', 'copy']], casting='no',
+ op_dtypes=[np.dtype('f8')])
+ assert_raises(TypeError, nditer, arange(2, dtype='f4'), [],
+ [['readonly', 'copy']], casting='equiv',
+ op_dtypes=[np.dtype('f8')])
+ assert_raises(TypeError, nditer, arange(2, dtype='f8'), [],
+ [['writeonly', 'updateifcopy']],
+ casting='no',
+ op_dtypes=[np.dtype('f4')])
+ assert_raises(TypeError, nditer, arange(2, dtype='f8'), [],
+ [['writeonly', 'updateifcopy']],
+ casting='equiv',
+ op_dtypes=[np.dtype('f4')])
+ # '<f4' -> '>f4' should not work with casting='no'
+ assert_raises(TypeError, nditer, arange(2, dtype='<f4'), [],
+ [['readonly', 'copy']], casting='no',
+ op_dtypes=[np.dtype('>f4')])
+ # 'f4' -> 'f8' is a safe cast, but 'f8' -> 'f4' isn't
+ assert_raises(TypeError, nditer, arange(2, dtype='f4'), [],
+ [['readwrite', 'updateifcopy']],
+ casting='safe',
+ op_dtypes=[np.dtype('f8')])
+ assert_raises(TypeError, nditer, arange(2, dtype='f8'), [],
+ [['readwrite', 'updateifcopy']],
+ casting='safe',
+ op_dtypes=[np.dtype('f4')])
+ # 'f4' -> 'i4' is neither a safe nor a same-kind cast
+ assert_raises(TypeError, nditer, arange(2, dtype='f4'), [],
+ [['readonly', 'copy']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('i4')])
+ assert_raises(TypeError, nditer, arange(2, dtype='i4'), [],
+ [['writeonly', 'updateifcopy']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('f4')])
+
+def test_iter_scalar_cast():
+ # Check that scalars are cast as requested
+
+ # No cast 'f4' -> 'f4'
+ i = nditer(np.float32(2.5), [], [['readonly']],
+ op_dtypes=[np.dtype('f4')])
+ assert_equal(i.dtypes[0], np.dtype('f4'))
+ assert_equal(i.value.dtype, np.dtype('f4'))
+ assert_equal(i.value, 2.5)
+ # Safe cast 'f4' -> 'f8'
+ i = nditer(np.float32(2.5), [],
+ [['readonly', 'copy']],
+ casting='safe',
+ op_dtypes=[np.dtype('f8')])
+ assert_equal(i.dtypes[0], np.dtype('f8'))
+ assert_equal(i.value.dtype, np.dtype('f8'))
+ assert_equal(i.value, 2.5)
+ # Same-kind cast 'f8' -> 'f4'
+ i = nditer(np.float64(2.5), [],
+ [['readonly', 'copy']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('f4')])
+ assert_equal(i.dtypes[0], np.dtype('f4'))
+ assert_equal(i.value.dtype, np.dtype('f4'))
+ assert_equal(i.value, 2.5)
+ # Unsafe cast 'f8' -> 'i4'
+ i = nditer(np.float64(3.0), [],
+ [['readonly', 'copy']],
+ casting='unsafe',
+ op_dtypes=[np.dtype('i4')])
+ assert_equal(i.dtypes[0], np.dtype('i4'))
+ assert_equal(i.value.dtype, np.dtype('i4'))
+ assert_equal(i.value, 3)
+ # Readonly scalars may be cast even without setting COPY or BUFFERED
+ i = nditer(3, [], [['readonly']], op_dtypes=[np.dtype('f8')])
+ assert_equal(i[0].dtype, np.dtype('f8'))
+ assert_equal(i[0], 3.)
+
+def test_iter_scalar_cast_errors():
+ # Check that invalid casts are caught
+
+ # Need to allow copying/buffering for write casts of scalars to occur
+ assert_raises(TypeError, nditer, np.float32(2), [],
+ [['readwrite']], op_dtypes=[np.dtype('f8')])
+ assert_raises(TypeError, nditer, 2.5, [],
+ [['readwrite']], op_dtypes=[np.dtype('f4')])
+ # 'f8' -> 'f4' isn't a safe cast if the value would overflow
+ assert_raises(TypeError, nditer, np.float64(1e60), [],
+ [['readonly']],
+ casting='safe',
+ op_dtypes=[np.dtype('f4')])
+ # 'f4' -> 'i4' is neither a safe nor a same-kind cast
+ assert_raises(TypeError, nditer, np.float32(2), [],
+ [['readonly']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('i4')])
+
+def test_iter_object_arrays_basic():
+ # Check that object arrays work
+
+ obj = {'a':3,'b':'d'}
+ a = np.array([[1, 2, 3], None, obj, None], dtype='O')
+ if HAS_REFCOUNT:
+ rc = sys.getrefcount(obj)
+
+ # Need to allow references for object arrays
+ assert_raises(TypeError, nditer, a)
+ if HAS_REFCOUNT:
+ assert_equal(sys.getrefcount(obj), rc)
+
+ i = nditer(a, ['refs_ok'], ['readonly'])
+ vals = [x_[()] for x_ in i]
+ assert_equal(np.array(vals, dtype='O'), a)
+ vals, i, x = [None]*3
+ if HAS_REFCOUNT:
+ assert_equal(sys.getrefcount(obj), rc)
+
+ i = nditer(a.reshape(2, 2).T, ['refs_ok', 'buffered'],
+ ['readonly'], order='C')
+ assert_(i.iterationneedsapi)
+ vals = [x_[()] for x_ in i]
+ assert_equal(np.array(vals, dtype='O'), a.reshape(2, 2).ravel(order='F'))
+ vals, i, x = [None]*3
+ if HAS_REFCOUNT:
+ assert_equal(sys.getrefcount(obj), rc)
+
+ i = nditer(a.reshape(2, 2).T, ['refs_ok', 'buffered'],
+ ['readwrite'], order='C')
+ with i:
+ for x in i:
+ x[...] = None
+ vals, i, x = [None]*3
+ if HAS_REFCOUNT:
+ assert_(sys.getrefcount(obj) == rc-1)
+ assert_equal(a, np.array([None]*4, dtype='O'))
+
+def test_iter_object_arrays_conversions():
+ # Conversions to/from objects
+ a = np.arange(6, dtype='O')
+ i = nditer(a, ['refs_ok', 'buffered'], ['readwrite'],
+ casting='unsafe', op_dtypes='i4')
+ with i:
+ for x in i:
+ x[...] += 1
+ assert_equal(a, np.arange(6)+1)
+
+ a = np.arange(6, dtype='i4')
+ i = nditer(a, ['refs_ok', 'buffered'], ['readwrite'],
+ casting='unsafe', op_dtypes='O')
+ with i:
+ for x in i:
+ x[...] += 1
+ assert_equal(a, np.arange(6)+1)
+
+ # Non-contiguous object array
+ a = np.zeros((6,), dtype=[('p', 'i1'), ('a', 'O')])
+ a = a['a']
+ a[:] = np.arange(6)
+ i = nditer(a, ['refs_ok', 'buffered'], ['readwrite'],
+ casting='unsafe', op_dtypes='i4')
+ with i:
+ for x in i:
+ x[...] += 1
+ assert_equal(a, np.arange(6)+1)
+
+    # Non-contiguous value array
+ a = np.zeros((6,), dtype=[('p', 'i1'), ('a', 'i4')])
+ a = a['a']
+ a[:] = np.arange(6) + 98172488
+ i = nditer(a, ['refs_ok', 'buffered'], ['readwrite'],
+ casting='unsafe', op_dtypes='O')
+ with i:
+ ob = i[0][()]
+ if HAS_REFCOUNT:
+ rc = sys.getrefcount(ob)
+ for x in i:
+ x[...] += 1
+ if HAS_REFCOUNT:
+ assert_(sys.getrefcount(ob) == rc-1)
+ assert_equal(a, np.arange(6)+98172489)
+
+def test_iter_common_dtype():
+ # Check that the iterator finds a common data type correctly
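+    # With the 'common_dtype' flag every operand is exposed using a single
+    # promoted type computed from all of them; e.g. f4 with f8 gives f8,
+    # and a u4 array with a negative i4 scalar promotes to i8 under the
+    # value-based scalar rules of this NumPy version.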
+
+ i = nditer([array([3], dtype='f4'), array([0], dtype='f8')],
+ ['common_dtype'],
+ [['readonly', 'copy']]*2,
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('f8'))
+ assert_equal(i.dtypes[1], np.dtype('f8'))
+ i = nditer([array([3], dtype='i4'), array([0], dtype='f4')],
+ ['common_dtype'],
+ [['readonly', 'copy']]*2,
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('f8'))
+ assert_equal(i.dtypes[1], np.dtype('f8'))
+ i = nditer([array([3], dtype='f4'), array(0, dtype='f8')],
+ ['common_dtype'],
+ [['readonly', 'copy']]*2,
+ casting='same_kind')
+ assert_equal(i.dtypes[0], np.dtype('f4'))
+ assert_equal(i.dtypes[1], np.dtype('f4'))
+ i = nditer([array([3], dtype='u4'), array(0, dtype='i4')],
+ ['common_dtype'],
+ [['readonly', 'copy']]*2,
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('u4'))
+ assert_equal(i.dtypes[1], np.dtype('u4'))
+ i = nditer([array([3], dtype='u4'), array(-12, dtype='i4')],
+ ['common_dtype'],
+ [['readonly', 'copy']]*2,
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('i8'))
+ assert_equal(i.dtypes[1], np.dtype('i8'))
+ i = nditer([array([3], dtype='u4'), array(-12, dtype='i4'),
+ array([2j], dtype='c8'), array([9], dtype='f8')],
+ ['common_dtype'],
+ [['readonly', 'copy']]*4,
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('c16'))
+ assert_equal(i.dtypes[1], np.dtype('c16'))
+ assert_equal(i.dtypes[2], np.dtype('c16'))
+ assert_equal(i.dtypes[3], np.dtype('c16'))
+ assert_equal(i.value, (3, -12, 2j, 9))
+
+ # When allocating outputs, other outputs aren't factored in
+ i = nditer([array([3], dtype='i4'), None, array([2j], dtype='c16')], [],
+ [['readonly', 'copy'],
+ ['writeonly', 'allocate'],
+ ['writeonly']],
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('i4'))
+ assert_equal(i.dtypes[1], np.dtype('i4'))
+ assert_equal(i.dtypes[2], np.dtype('c16'))
+ # But, if common data types are requested, they are
+ i = nditer([array([3], dtype='i4'), None, array([2j], dtype='c16')],
+ ['common_dtype'],
+ [['readonly', 'copy'],
+ ['writeonly', 'allocate'],
+ ['writeonly']],
+ casting='safe')
+ assert_equal(i.dtypes[0], np.dtype('c16'))
+ assert_equal(i.dtypes[1], np.dtype('c16'))
+ assert_equal(i.dtypes[2], np.dtype('c16'))
+
+def test_iter_copy_if_overlap():
+ # Ensure the iterator makes copies on read/write overlap, if requested
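+    # With 'copy_if_overlap', an operand that is written gets a temporary
+    # copy whenever it may share memory with another operand; read-only
+    # overlap is left alone, and 'overlap_assume_elementwise' tells the
+    # iterator the operands are only accessed elementwise, so identical
+    # arrays need no copy.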
+
+ # Copy not needed, 1 op
+ for flag in ['readonly', 'writeonly', 'readwrite']:
+ a = arange(10)
+ i = nditer([a], ['copy_if_overlap'], [[flag]])
+ with i:
+ assert_(i.operands[0] is a)
+
+ # Copy needed, 2 ops, read-write overlap
+ x = arange(10)
+ a = x[1:]
+ b = x[:-1]
+ with nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']]) as i:
+ assert_(not np.shares_memory(*i.operands))
+
+ # Copy not needed with elementwise, 2 ops, exactly same arrays
+ x = arange(10)
+ a = x
+ b = x
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly', 'overlap_assume_elementwise'],
+ ['readwrite', 'overlap_assume_elementwise']])
+ with i:
+ assert_(i.operands[0] is a and i.operands[1] is b)
+ with nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']]) as i:
+ assert_(i.operands[0] is a and not np.shares_memory(i.operands[1], b))
+
+ # Copy not needed, 2 ops, no overlap
+ x = arange(10)
+ a = x[::2]
+ b = x[1::2]
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']])
+ assert_(i.operands[0] is a and i.operands[1] is b)
+
+ # Copy needed, 2 ops, read-write overlap
+ x = arange(4, dtype=np.int8)
+ a = x[3:]
+ b = x.view(np.int32)[:1]
+ with nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']]) as i:
+ assert_(not np.shares_memory(*i.operands))
+
+ # Copy needed, 3 ops, read-write overlap
+ for flag in ['writeonly', 'readwrite']:
+ x = np.ones([10, 10])
+ a = x
+ b = x.T
+ c = x
+ with nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['readonly'], [flag]]) as i:
+ a2, b2, c2 = i.operands
+ assert_(not np.shares_memory(a2, c2))
+ assert_(not np.shares_memory(b2, c2))
+
+ # Copy not needed, 3 ops, read-only overlap
+ x = np.ones([10, 10])
+ a = x
+ b = x.T
+ c = x
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['readonly'], ['readonly']])
+ a2, b2, c2 = i.operands
+ assert_(a is a2)
+ assert_(b is b2)
+ assert_(c is c2)
+
+ # Copy not needed, 3 ops, read-only overlap
+ x = np.ones([10, 10])
+ a = x
+ b = np.ones([10, 10])
+ c = x.T
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['writeonly'], ['readonly']])
+ a2, b2, c2 = i.operands
+ assert_(a is a2)
+ assert_(b is b2)
+ assert_(c is c2)
+
+ # Copy not needed, 3 ops, write-only overlap
+ x = np.arange(7)
+ a = x[:3]
+ b = x[3:6]
+ c = x[4:7]
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['writeonly'], ['writeonly']])
+ a2, b2, c2 = i.operands
+ assert_(a is a2)
+ assert_(b is b2)
+ assert_(c is c2)
+
+def test_iter_op_axes():
+ # Check that custom axes work
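+    # Each op_axes entry lists, per iterator dimension, which axis of that
+    # operand supplies it; -1 (or np.newaxis) broadcasts the operand over
+    # that dimension, and None for a whole entry keeps the default mapping.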
+
+ # Reverse the axes
+ a = arange(6).reshape(2, 3)
+ i = nditer([a, a.T], [], [['readonly']]*2, op_axes=[[0, 1], [1, 0]])
+ assert_(all([x == y for (x, y) in i]))
+ a = arange(24).reshape(2, 3, 4)
+ i = nditer([a.T, a], [], [['readonly']]*2, op_axes=[[2, 1, 0], None])
+ assert_(all([x == y for (x, y) in i]))
+
+ # Broadcast 1D to any dimension
+ a = arange(1, 31).reshape(2, 3, 5)
+ b = arange(1, 3)
+ i = nditer([a, b], [], [['readonly']]*2, op_axes=[None, [0, -1, -1]])
+ assert_equal([x*y for (x, y) in i], (a*b.reshape(2, 1, 1)).ravel())
+ b = arange(1, 4)
+ i = nditer([a, b], [], [['readonly']]*2, op_axes=[None, [-1, 0, -1]])
+ assert_equal([x*y for (x, y) in i], (a*b.reshape(1, 3, 1)).ravel())
+ b = arange(1, 6)
+ i = nditer([a, b], [], [['readonly']]*2,
+ op_axes=[None, [np.newaxis, np.newaxis, 0]])
+ assert_equal([x*y for (x, y) in i], (a*b.reshape(1, 1, 5)).ravel())
+
+ # Inner product-style broadcasting
+ a = arange(24).reshape(2, 3, 4)
+ b = arange(40).reshape(5, 2, 4)
+ i = nditer([a, b], ['multi_index'], [['readonly']]*2,
+ op_axes=[[0, 1, -1, -1], [-1, -1, 0, 1]])
+ assert_equal(i.shape, (2, 3, 5, 2))
+
+ # Matrix product-style broadcasting
+ a = arange(12).reshape(3, 4)
+ b = arange(20).reshape(4, 5)
+ i = nditer([a, b], ['multi_index'], [['readonly']]*2,
+ op_axes=[[0, -1], [-1, 1]])
+ assert_equal(i.shape, (3, 5))
+
+def test_iter_op_axes_errors():
+ # Check that custom axes throws errors for bad inputs
+
+ # Wrong number of items in op_axes
+ a = arange(6).reshape(2, 3)
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[0], [1], [0]])
+ # Out of bounds items in op_axes
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[2, 1], [0, 1]])
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[0, 1], [2, -1]])
+ # Duplicate items in op_axes
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[0, 0], [0, 1]])
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[0, 1], [1, 1]])
+
+    # op_axes entries of different lengths
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[0, 1], [0, 1, 0]])
+
+ # Non-broadcastable dimensions in the result
+ assert_raises(ValueError, nditer, [a, a], [], [['readonly']]*2,
+ op_axes=[[0, 1], [1, 0]])
+
+def test_iter_copy():
+ # Check that copying the iterator works correctly
+ a = arange(24).reshape(2, 3, 4)
+
+ # Simple iterator
+ i = nditer(a)
+ j = i.copy()
+ assert_equal([x[()] for x in i], [x[()] for x in j])
+
+ i.iterindex = 3
+ j = i.copy()
+ assert_equal([x[()] for x in i], [x[()] for x in j])
+
+ # Buffered iterator
+ i = nditer(a, ['buffered', 'ranged'], order='F', buffersize=3)
+ j = i.copy()
+ assert_equal([x[()] for x in i], [x[()] for x in j])
+
+ i.iterindex = 3
+ j = i.copy()
+ assert_equal([x[()] for x in i], [x[()] for x in j])
+
+ i.iterrange = (3, 9)
+ j = i.copy()
+ assert_equal([x[()] for x in i], [x[()] for x in j])
+
+ i.iterrange = (2, 18)
+ next(i)
+ next(i)
+ j = i.copy()
+ assert_equal([x[()] for x in i], [x[()] for x in j])
+
+ # Casting iterator
+ with nditer(a, ['buffered'], order='F', casting='unsafe',
+ op_dtypes='f8', buffersize=5) as i:
+ j = i.copy()
+ assert_equal([x[()] for x in j], a.ravel(order='F'))
+
+ a = arange(24, dtype='<i4').reshape(2, 3, 4)
+ with nditer(a, ['buffered'], order='F', casting='unsafe',
+ op_dtypes='>f8', buffersize=5) as i:
+ j = i.copy()
+ assert_equal([x[()] for x in j], a.ravel(order='F'))
+
+def test_iter_allocate_output_simple():
+ # Check that the iterator will properly allocate outputs
+
+ # Simple case
+ a = arange(6)
+ i = nditer([a, None], [], [['readonly'], ['writeonly', 'allocate']],
+ op_dtypes=[None, np.dtype('f4')])
+ assert_equal(i.operands[1].shape, a.shape)
+ assert_equal(i.operands[1].dtype, np.dtype('f4'))
+
+def test_iter_allocate_output_buffered_readwrite():
+ # Allocated output with buffering + delay_bufalloc
+
+ a = arange(6)
+ i = nditer([a, None], ['buffered', 'delay_bufalloc'],
+ [['readonly'], ['allocate', 'readwrite']])
+ with i:
+ i.operands[1][:] = 1
+ i.reset()
+ for x in i:
+ x[1][...] += x[0][...]
+ assert_equal(i.operands[1], a+1)
+
+def test_iter_allocate_output_itorder():
+ # The allocated output should match the iteration order
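+    # That is, the allocated output's strides follow the layout the iterator
+    # picks: matching the input for the default ordering, and C-contiguous
+    # strides (32, 16, 4) for the 4-byte dtype when order='C' is forced.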
+
+ # C-order input, best iteration order
+ a = arange(6, dtype='i4').reshape(2, 3)
+ i = nditer([a, None], [], [['readonly'], ['writeonly', 'allocate']],
+ op_dtypes=[None, np.dtype('f4')])
+ assert_equal(i.operands[1].shape, a.shape)
+ assert_equal(i.operands[1].strides, a.strides)
+ assert_equal(i.operands[1].dtype, np.dtype('f4'))
+ # F-order input, best iteration order
+ a = arange(24, dtype='i4').reshape(2, 3, 4).T
+ i = nditer([a, None], [], [['readonly'], ['writeonly', 'allocate']],
+ op_dtypes=[None, np.dtype('f4')])
+ assert_equal(i.operands[1].shape, a.shape)
+ assert_equal(i.operands[1].strides, a.strides)
+ assert_equal(i.operands[1].dtype, np.dtype('f4'))
+ # Non-contiguous input, C iteration order
+ a = arange(24, dtype='i4').reshape(2, 3, 4).swapaxes(0, 1)
+ i = nditer([a, None], [],
+ [['readonly'], ['writeonly', 'allocate']],
+ order='C',
+ op_dtypes=[None, np.dtype('f4')])
+ assert_equal(i.operands[1].shape, a.shape)
+ assert_equal(i.operands[1].strides, (32, 16, 4))
+ assert_equal(i.operands[1].dtype, np.dtype('f4'))
+
+def test_iter_allocate_output_opaxes():
+ # Specifying op_axes should work
+
+ a = arange(24, dtype='i4').reshape(2, 3, 4)
+ i = nditer([None, a], [], [['writeonly', 'allocate'], ['readonly']],
+ op_dtypes=[np.dtype('u4'), None],
+ op_axes=[[1, 2, 0], None])
+ assert_equal(i.operands[0].shape, (4, 2, 3))
+ assert_equal(i.operands[0].strides, (4, 48, 16))
+ assert_equal(i.operands[0].dtype, np.dtype('u4'))
+
+def test_iter_allocate_output_types_promotion():
+ # Check type promotion of automatic outputs
+
+ i = nditer([array([3], dtype='f4'), array([0], dtype='f8'), None], [],
+ [['readonly']]*2+[['writeonly', 'allocate']])
+ assert_equal(i.dtypes[2], np.dtype('f8'))
+ i = nditer([array([3], dtype='i4'), array([0], dtype='f4'), None], [],
+ [['readonly']]*2+[['writeonly', 'allocate']])
+ assert_equal(i.dtypes[2], np.dtype('f8'))
+ i = nditer([array([3], dtype='f4'), array(0, dtype='f8'), None], [],
+ [['readonly']]*2+[['writeonly', 'allocate']])
+ assert_equal(i.dtypes[2], np.dtype('f4'))
+ i = nditer([array([3], dtype='u4'), array(0, dtype='i4'), None], [],
+ [['readonly']]*2+[['writeonly', 'allocate']])
+ assert_equal(i.dtypes[2], np.dtype('u4'))
+ i = nditer([array([3], dtype='u4'), array(-12, dtype='i4'), None], [],
+ [['readonly']]*2+[['writeonly', 'allocate']])
+ assert_equal(i.dtypes[2], np.dtype('i8'))
+
+def test_iter_allocate_output_types_byte_order():
+ # Verify the rules for byte order changes
+
+ # When there's just one input, the output type exactly matches
+ a = array([3], dtype='u4').newbyteorder()
+ i = nditer([a, None], [],
+ [['readonly'], ['writeonly', 'allocate']])
+ assert_equal(i.dtypes[0], i.dtypes[1])
+ # With two or more inputs, the output type is in native byte order
+ i = nditer([a, a, None], [],
+ [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+ assert_(i.dtypes[0] != i.dtypes[2])
+ assert_equal(i.dtypes[0].newbyteorder('='), i.dtypes[2])
+
+def test_iter_allocate_output_types_scalar():
+ # If the inputs are all scalars, the output should be a scalar
+
+ i = nditer([None, 1, 2.3, np.float32(12), np.complex128(3)], [],
+ [['writeonly', 'allocate']] + [['readonly']]*4)
+ assert_equal(i.operands[0].dtype, np.dtype('complex128'))
+ assert_equal(i.operands[0].ndim, 0)
+
+def test_iter_allocate_output_subtype():
+ # Make sure that the subtype with priority wins
+ class MyNDArray(np.ndarray):
+ __array_priority__ = 15
+
+ # subclass vs ndarray
+ a = np.array([[1, 2], [3, 4]]).view(MyNDArray)
+ b = np.arange(4).reshape(2, 2).T
+ i = nditer([a, b, None], [],
+ [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+ assert_equal(type(a), type(i.operands[2]))
+ assert_(type(b) is not type(i.operands[2]))
+ assert_equal(i.operands[2].shape, (2, 2))
+
+ # If subtypes are disabled, we should get back an ndarray.
+ i = nditer([a, b, None], [],
+ [['readonly'], ['readonly'],
+ ['writeonly', 'allocate', 'no_subtype']])
+ assert_equal(type(b), type(i.operands[2]))
+ assert_(type(a) is not type(i.operands[2]))
+ assert_equal(i.operands[2].shape, (2, 2))
+
+def test_iter_allocate_output_errors():
+ # Check that the iterator will throw errors for bad output allocations
+
+ # Need an input if no output data type is specified
+ a = arange(6)
+ assert_raises(TypeError, nditer, [a, None], [],
+ [['writeonly'], ['writeonly', 'allocate']])
+ # Allocated output should be flagged for writing
+ assert_raises(ValueError, nditer, [a, None], [],
+ [['readonly'], ['allocate', 'readonly']])
+ # Allocated output can't have buffering without delayed bufalloc
+ assert_raises(ValueError, nditer, [a, None], ['buffered'],
+ ['allocate', 'readwrite'])
+ # Must specify at least one input
+ assert_raises(ValueError, nditer, [None, None], [],
+ [['writeonly', 'allocate'],
+ ['writeonly', 'allocate']],
+ op_dtypes=[np.dtype('f4'), np.dtype('f4')])
+ # If using op_axes, must specify all the axes
+ a = arange(24, dtype='i4').reshape(2, 3, 4)
+ assert_raises(ValueError, nditer, [a, None], [],
+ [['readonly'], ['writeonly', 'allocate']],
+ op_dtypes=[None, np.dtype('f4')],
+ op_axes=[None, [0, np.newaxis, 1]])
+ # If using op_axes, the axes must be within bounds
+ assert_raises(ValueError, nditer, [a, None], [],
+ [['readonly'], ['writeonly', 'allocate']],
+ op_dtypes=[None, np.dtype('f4')],
+ op_axes=[None, [0, 3, 1]])
+ # If using op_axes, there can't be duplicates
+ assert_raises(ValueError, nditer, [a, None], [],
+ [['readonly'], ['writeonly', 'allocate']],
+ op_dtypes=[None, np.dtype('f4')],
+ op_axes=[None, [0, 2, 1, 0]])
+
+def test_iter_remove_axis():
+ a = arange(24).reshape(2, 3, 4)
+
+ i = nditer(a, ['multi_index'])
+ i.remove_axis(1)
+ assert_equal([x for x in i], a[:, 0,:].ravel())
+
+ a = a[::-1,:,:]
+ i = nditer(a, ['multi_index'])
+ i.remove_axis(0)
+ assert_equal([x for x in i], a[0,:,:].ravel())
+
+def test_iter_remove_multi_index_inner_loop():
+ # Check that removing multi-index support works
+
+ a = arange(24).reshape(2, 3, 4)
+
+ i = nditer(a, ['multi_index'])
+ assert_equal(i.ndim, 3)
+ assert_equal(i.shape, (2, 3, 4))
+ assert_equal(i.itviews[0].shape, (2, 3, 4))
+
+ # Removing the multi-index tracking causes all dimensions to coalesce
+ before = [x for x in i]
+ i.remove_multi_index()
+ after = [x for x in i]
+
+ assert_equal(before, after)
+ assert_equal(i.ndim, 1)
+ assert_raises(ValueError, lambda i:i.shape, i)
+ assert_equal(i.itviews[0].shape, (24,))
+
+    # Enabling the external loop means there's just one iteration over
+    # all 24 elements
+ i.reset()
+ assert_equal(i.itersize, 24)
+ assert_equal(i[0].shape, tuple())
+ i.enable_external_loop()
+ assert_equal(i.itersize, 24)
+ assert_equal(i[0].shape, (24,))
+ assert_equal(i.value, arange(24))
+
+def test_iter_iterindex():
+ # Make sure iterindex works
+
+ buffersize = 5
+ a = arange(24).reshape(4, 3, 2)
+ for flags in ([], ['buffered']):
+ i = nditer(a, flags, buffersize=buffersize)
+ assert_equal(iter_iterindices(i), list(range(24)))
+ i.iterindex = 2
+ assert_equal(iter_iterindices(i), list(range(2, 24)))
+
+ i = nditer(a, flags, order='F', buffersize=buffersize)
+ assert_equal(iter_iterindices(i), list(range(24)))
+ i.iterindex = 5
+ assert_equal(iter_iterindices(i), list(range(5, 24)))
+
+ i = nditer(a[::-1], flags, order='F', buffersize=buffersize)
+ assert_equal(iter_iterindices(i), list(range(24)))
+ i.iterindex = 9
+ assert_equal(iter_iterindices(i), list(range(9, 24)))
+
+ i = nditer(a[::-1, ::-1], flags, order='C', buffersize=buffersize)
+ assert_equal(iter_iterindices(i), list(range(24)))
+ i.iterindex = 13
+ assert_equal(iter_iterindices(i), list(range(13, 24)))
+
+ i = nditer(a[::1, ::-1], flags, buffersize=buffersize)
+ assert_equal(iter_iterindices(i), list(range(24)))
+ i.iterindex = 23
+ assert_equal(iter_iterindices(i), list(range(23, 24)))
+ i.reset()
+ i.iterindex = 2
+ assert_equal(iter_iterindices(i), list(range(2, 24)))
+
+def test_iter_iterrange():
+ # Make sure getting and resetting the iterrange works
+
+ buffersize = 5
+ a = arange(24, dtype='i4').reshape(4, 3, 2)
+ a_fort = a.ravel(order='F')
+
+ i = nditer(a, ['ranged'], ['readonly'], order='F',
+ buffersize=buffersize)
+ assert_equal(i.iterrange, (0, 24))
+ assert_equal([x[()] for x in i], a_fort)
+ for r in [(0, 24), (1, 2), (3, 24), (5, 5), (0, 20), (23, 24)]:
+ i.iterrange = r
+ assert_equal(i.iterrange, r)
+ assert_equal([x[()] for x in i], a_fort[r[0]:r[1]])
+
+ i = nditer(a, ['ranged', 'buffered'], ['readonly'], order='F',
+ op_dtypes='f8', buffersize=buffersize)
+ assert_equal(i.iterrange, (0, 24))
+ assert_equal([x[()] for x in i], a_fort)
+ for r in [(0, 24), (1, 2), (3, 24), (5, 5), (0, 20), (23, 24)]:
+ i.iterrange = r
+ assert_equal(i.iterrange, r)
+ assert_equal([x[()] for x in i], a_fort[r[0]:r[1]])
+
+ def get_array(i):
+ val = np.array([], dtype='f8')
+ for x in i:
+ val = np.concatenate((val, x))
+ return val
+
+ i = nditer(a, ['ranged', 'buffered', 'external_loop'],
+ ['readonly'], order='F',
+ op_dtypes='f8', buffersize=buffersize)
+ assert_equal(i.iterrange, (0, 24))
+ assert_equal(get_array(i), a_fort)
+ for r in [(0, 24), (1, 2), (3, 24), (5, 5), (0, 20), (23, 24)]:
+ i.iterrange = r
+ assert_equal(i.iterrange, r)
+ assert_equal(get_array(i), a_fort[r[0]:r[1]])
+
+def test_iter_buffering():
+ # Test buffering with several buffer sizes and types
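+    # Buffering hands out chunks of at most `buffersize` elements, converted
+    # to the requested byte order and alignment; the loop below checks each
+    # chunk's size and that their concatenation reproduces the original data.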
+ arrays = []
+ # F-order swapped array
+ arrays.append(np.arange(24,
+ dtype='c16').reshape(2, 3, 4).T.newbyteorder().byteswap())
+ # Contiguous 1-dimensional array
+ arrays.append(np.arange(10, dtype='f4'))
+ # Unaligned array
+ a = np.zeros((4*16+1,), dtype='i1')[1:]
+ a.dtype = 'i4'
+ a[:] = np.arange(16, dtype='i4')
+ arrays.append(a)
+ # 4-D F-order array
+ arrays.append(np.arange(120, dtype='i4').reshape(5, 3, 2, 4).T)
+ for a in arrays:
+ for buffersize in (1, 2, 3, 5, 8, 11, 16, 1024):
+ vals = []
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readonly', 'nbo', 'aligned']],
+ order='C',
+ casting='equiv',
+ buffersize=buffersize)
+ while not i.finished:
+ assert_(i[0].size <= buffersize)
+ vals.append(i[0].copy())
+ i.iternext()
+ assert_equal(np.concatenate(vals), a.ravel(order='C'))
+
+def test_iter_write_buffering():
+ # Test that buffering of writes is working
+
+ # F-order swapped array
+ a = np.arange(24).reshape(2, 3, 4).T.newbyteorder().byteswap()
+ i = nditer(a, ['buffered'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='equiv',
+ order='C',
+ buffersize=16)
+ x = 0
+ with i:
+ while not i.finished:
+ i[0] = x
+ x += 1
+ i.iternext()
+ assert_equal(a.ravel(order='C'), np.arange(24))
+
+def test_iter_buffering_delayed_alloc():
+ # Test that delaying buffer allocation works
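+    # With 'delay_bufalloc' the buffers are not allocated (and the iterator
+    # contents cannot be accessed) until reset() is called, which allows
+    # 'allocate' operands to be initialized before buffering copies them,
+    # as done in test_iter_allocate_output_buffered_readwrite above.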
+
+ a = np.arange(6)
+ b = np.arange(1, dtype='f4')
+ i = nditer([a, b], ['buffered', 'delay_bufalloc', 'multi_index', 'reduce_ok'],
+ ['readwrite'],
+ casting='unsafe',
+ op_dtypes='f4')
+ assert_(i.has_delayed_bufalloc)
+ assert_raises(ValueError, lambda i:i.multi_index, i)
+ assert_raises(ValueError, lambda i:i[0], i)
+ assert_raises(ValueError, lambda i:i[0:2], i)
+
+ def assign_iter(i):
+ i[0] = 0
+ assert_raises(ValueError, assign_iter, i)
+
+ i.reset()
+ assert_(not i.has_delayed_bufalloc)
+ assert_equal(i.multi_index, (0,))
+ with i:
+ assert_equal(i[0], 0)
+ i[1] = 1
+ assert_equal(i[0:2], [0, 1])
+ assert_equal([[x[0][()], x[1][()]] for x in i], list(zip(range(6), [1]*6)))
+
+def test_iter_buffered_cast_simple():
+ # Test that buffering can handle a simple cast
+
+ a = np.arange(10, dtype='f4')
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('f8')],
+ buffersize=3)
+ with i:
+ for v in i:
+ v[...] *= 2
+
+ assert_equal(a, 2*np.arange(10, dtype='f4'))
+
+def test_iter_buffered_cast_byteswapped():
+ # Test that buffering can handle a cast which requires swap->cast->swap
+
+ a = np.arange(10, dtype='f4').newbyteorder().byteswap()
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('f8').newbyteorder()],
+ buffersize=3)
+ with i:
+ for v in i:
+ v[...] *= 2
+
+ assert_equal(a, 2*np.arange(10, dtype='f4'))
+
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning)
+
+ a = np.arange(10, dtype='f8').newbyteorder().byteswap()
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='unsafe',
+ op_dtypes=[np.dtype('c8').newbyteorder()],
+ buffersize=3)
+ with i:
+ for v in i:
+ v[...] *= 2
+
+ assert_equal(a, 2*np.arange(10, dtype='f8'))
+
+def test_iter_buffered_cast_byteswapped_complex():
+ # Test that buffering can handle a cast which requires swap->cast->copy
+
+ a = np.arange(10, dtype='c8').newbyteorder().byteswap()
+ a += 2j
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('c16')],
+ buffersize=3)
+ with i:
+ for v in i:
+ v[...] *= 2
+ assert_equal(a, 2*np.arange(10, dtype='c8') + 4j)
+
+ a = np.arange(10, dtype='c8')
+ a += 2j
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('c16').newbyteorder()],
+ buffersize=3)
+ with i:
+ for v in i:
+ v[...] *= 2
+ assert_equal(a, 2*np.arange(10, dtype='c8') + 4j)
+
+ a = np.arange(10, dtype=np.clongdouble).newbyteorder().byteswap()
+ a += 2j
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('c16')],
+ buffersize=3)
+ with i:
+ for v in i:
+ v[...] *= 2
+ assert_equal(a, 2*np.arange(10, dtype=np.clongdouble) + 4j)
+
+ a = np.arange(10, dtype=np.longdouble).newbyteorder().byteswap()
+ i = nditer(a, ['buffered', 'external_loop'],
+ [['readwrite', 'nbo', 'aligned']],
+ casting='same_kind',
+ op_dtypes=[np.dtype('f4')],
+ buffersize=7)
+ with i:
+ for v in i:
+ v[...] *= 2
+ assert_equal(a, 2*np.arange(10, dtype=np.longdouble))
+
+def test_iter_buffered_cast_structured_type():
+ # Tests buffering of structured types
+
+ # simple -> struct type (duplicates the value)
+ sdt = [('a', 'f4'), ('b', 'i8'), ('c', 'c8', (2, 3)), ('d', 'O')]
+ a = np.arange(3, dtype='f4') + 0.5
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt)
+ vals = [np.array(x) for x in i]
+ assert_equal(vals[0]['a'], 0.5)
+ assert_equal(vals[0]['b'], 0)
+ assert_equal(vals[0]['c'], [[(0.5)]*3]*2)
+ assert_equal(vals[0]['d'], 0.5)
+ assert_equal(vals[1]['a'], 1.5)
+ assert_equal(vals[1]['b'], 1)
+ assert_equal(vals[1]['c'], [[(1.5)]*3]*2)
+ assert_equal(vals[1]['d'], 1.5)
+ assert_equal(vals[0].dtype, np.dtype(sdt))
+
+ # object -> struct type
+ sdt = [('a', 'f4'), ('b', 'i8'), ('c', 'c8', (2, 3)), ('d', 'O')]
+ a = np.zeros((3,), dtype='O')
+ a[0] = (0.5, 0.5, [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 0.5)
+ a[1] = (1.5, 1.5, [[1.5, 1.5, 1.5], [1.5, 1.5, 1.5]], 1.5)
+ a[2] = (2.5, 2.5, [[2.5, 2.5, 2.5], [2.5, 2.5, 2.5]], 2.5)
+ if HAS_REFCOUNT:
+ rc = sys.getrefcount(a[0])
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt)
+ vals = [x.copy() for x in i]
+ assert_equal(vals[0]['a'], 0.5)
+ assert_equal(vals[0]['b'], 0)
+ assert_equal(vals[0]['c'], [[(0.5)]*3]*2)
+ assert_equal(vals[0]['d'], 0.5)
+ assert_equal(vals[1]['a'], 1.5)
+ assert_equal(vals[1]['b'], 1)
+ assert_equal(vals[1]['c'], [[(1.5)]*3]*2)
+ assert_equal(vals[1]['d'], 1.5)
+ assert_equal(vals[0].dtype, np.dtype(sdt))
+ vals, i, x = [None]*3
+ if HAS_REFCOUNT:
+ assert_equal(sys.getrefcount(a[0]), rc)
+
+ # single-field struct type -> simple
+ sdt = [('a', 'f4')]
+ a = np.array([(5.5,), (8,)], dtype=sdt)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes='i4')
+ assert_equal([x_[()] for x_ in i], [5, 8])
+
+ # make sure multi-field struct type -> simple doesn't work
+ sdt = [('a', 'f4'), ('b', 'i8'), ('d', 'O')]
+ a = np.array([(5.5, 7, 'test'), (8, 10, 11)], dtype=sdt)
+ assert_raises(TypeError, lambda: (
+ nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes='i4')))
+
+ # struct type -> struct type (field-wise copy)
+ sdt1 = [('a', 'f4'), ('b', 'i8'), ('d', 'O')]
+ sdt2 = [('d', 'u2'), ('a', 'O'), ('b', 'f8')]
+ a = np.array([(1, 2, 3), (4, 5, 6)], dtype=sdt1)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ assert_equal([np.array(x_) for x_ in i],
+ [np.array((1, 2, 3), dtype=sdt2),
+ np.array((4, 5, 6), dtype=sdt2)])
+
+ # make sure struct type -> struct type with different
+ # number of fields fails
+ sdt1 = [('a', 'f4'), ('b', 'i8'), ('d', 'O')]
+ sdt2 = [('b', 'O'), ('a', 'f8')]
+ a = np.array([(1, 2, 3), (4, 5, 6)], dtype=sdt1)
+
+ assert_raises(ValueError, lambda : (
+ nditer(a, ['buffered', 'refs_ok'], ['readwrite'],
+ casting='unsafe',
+ op_dtypes=sdt2)))
+
+
+def test_iter_buffered_cast_subarray():
+ # Tests buffering of subarrays
+
+ # one element -> many (copies it to all)
+ sdt1 = [('a', 'f4')]
+ sdt2 = [('a', 'f8', (3, 2, 2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ for x, count in zip(i, list(range(6))):
+ assert_(np.all(x['a'] == count))
+
+ # one element -> many -> back (copies it to all)
+ sdt1 = [('a', 'O', (1, 1))]
+ sdt2 = [('a', 'O', (3, 2, 2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:, 0, 0] = np.arange(6)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readwrite'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ with i:
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_(np.all(x['a'] == count))
+ x['a'][0] += 2
+ count += 1
+ assert_equal(a['a'], np.arange(6).reshape(6, 1, 1)+2)
+
+ # many -> one element -> back (copies just element 0)
+ sdt1 = [('a', 'O', (3, 2, 2))]
+ sdt2 = [('a', 'O', (1,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:, 0, 0, 0] = np.arange(6)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readwrite'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ with i:
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], count)
+ x['a'] += 2
+ count += 1
+ assert_equal(a['a'], np.arange(6).reshape(6, 1, 1, 1)*np.ones((1, 3, 2, 2))+2)
+
+ # many -> one element -> back (copies just element 0)
+ sdt1 = [('a', 'f8', (3, 2, 2))]
+ sdt2 = [('a', 'O', (1,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:, 0, 0, 0] = np.arange(6)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], count)
+ count += 1
+
+ # many -> one element (copies just element 0)
+ sdt1 = [('a', 'O', (3, 2, 2))]
+ sdt2 = [('a', 'f4', (1,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:, 0, 0, 0] = np.arange(6)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], count)
+ count += 1
+
+ # many -> matching shape (straightforward copy)
+ sdt1 = [('a', 'O', (3, 2, 2))]
+ sdt2 = [('a', 'f4', (3, 2, 2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*3*2*2).reshape(6, 3, 2, 2)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], a[count]['a'])
+ count += 1
+
+ # vector -> smaller vector (truncates)
+ sdt1 = [('a', 'f8', (6,))]
+ sdt2 = [('a', 'f4', (2,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*6).reshape(6, 6)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], a[count]['a'][:2])
+ count += 1
+
+ # vector -> bigger vector (pads with zeros)
+ sdt1 = [('a', 'f8', (2,))]
+ sdt2 = [('a', 'f4', (6,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2).reshape(6, 2)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][:2], a[count]['a'])
+ assert_equal(x['a'][2:], [0, 0, 0, 0])
+ count += 1
+
+ # vector -> matrix (broadcasts)
+ sdt1 = [('a', 'f8', (2,))]
+ sdt2 = [('a', 'f4', (2, 2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2).reshape(6, 2)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][0], a[count]['a'])
+ assert_equal(x['a'][1], a[count]['a'])
+ count += 1
+
+ # vector -> matrix (broadcasts and zero-pads)
+ sdt1 = [('a', 'f8', (2, 1))]
+ sdt2 = [('a', 'f4', (3, 2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2).reshape(6, 2, 1)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][:2, 0], a[count]['a'][:, 0])
+ assert_equal(x['a'][:2, 1], a[count]['a'][:, 0])
+ assert_equal(x['a'][2,:], [0, 0])
+ count += 1
+
+ # matrix -> matrix (truncates and zero-pads)
+ sdt1 = [('a', 'f8', (2, 3))]
+ sdt2 = [('a', 'f4', (3, 2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2*3).reshape(6, 2, 3)
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][:2, 0], a[count]['a'][:, 0])
+ assert_equal(x['a'][:2, 1], a[count]['a'][:, 1])
+ assert_equal(x['a'][2,:], [0, 0])
+ count += 1
+
+def test_iter_buffering_badwriteback():
+ # Writing back from a buffer cannot combine elements
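+    # (a writable operand with a broadcast, zero-stride dimension would need
+    # several buffered values combined into one array element on writeback,
+    # which is only allowed as a reduction with 'reduce_ok')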
+
+    # a needs write buffering, but has a broadcast dimension
+ a = np.arange(6).reshape(2, 3, 1)
+ b = np.arange(12).reshape(2, 3, 2)
+ assert_raises(ValueError, nditer, [a, b],
+ ['buffered', 'external_loop'],
+ [['readwrite'], ['writeonly']],
+ order='C')
+
+ # But if a is readonly, it's fine
+ nditer([a, b], ['buffered', 'external_loop'],
+ [['readonly'], ['writeonly']],
+ order='C')
+
+ # If a has just one element, it's fine too (constant 0 stride, a reduction)
+ a = np.arange(1).reshape(1, 1, 1)
+ nditer([a, b], ['buffered', 'external_loop', 'reduce_ok'],
+ [['readwrite'], ['writeonly']],
+ order='C')
+
+ # check that it fails on other dimensions too
+ a = np.arange(6).reshape(1, 3, 2)
+ assert_raises(ValueError, nditer, [a, b],
+ ['buffered', 'external_loop'],
+ [['readwrite'], ['writeonly']],
+ order='C')
+ a = np.arange(4).reshape(2, 1, 2)
+ assert_raises(ValueError, nditer, [a, b],
+ ['buffered', 'external_loop'],
+ [['readwrite'], ['writeonly']],
+ order='C')
+
+def test_iter_buffering_string():
+ # Safe casting disallows shrinking strings
+ a = np.array(['abc', 'a', 'abcd'], dtype=np.bytes_)
+ assert_equal(a.dtype, np.dtype('S4'))
+ assert_raises(TypeError, nditer, a, ['buffered'], ['readonly'],
+ op_dtypes='S2')
+ i = nditer(a, ['buffered'], ['readonly'], op_dtypes='S6')
+ assert_equal(i[0], b'abc')
+ assert_equal(i[0].dtype, np.dtype('S6'))
+
+ a = np.array(['abc', 'a', 'abcd'], dtype=np.unicode)
+ assert_equal(a.dtype, np.dtype('U4'))
+ assert_raises(TypeError, nditer, a, ['buffered'], ['readonly'],
+ op_dtypes='U2')
+ i = nditer(a, ['buffered'], ['readonly'], op_dtypes='U6')
+ assert_equal(i[0], u'abc')
+ assert_equal(i[0].dtype, np.dtype('U6'))
+
+def test_iter_buffering_growinner():
+ # Test that the inner loop grows when no buffering is needed
+ a = np.arange(30)
+ i = nditer(a, ['buffered', 'growinner', 'external_loop'],
+ buffersize=5)
+ # Should end up with just one inner loop here
+ assert_equal(i[0].size, a.size)
+
+
+def test_iter_buffered_reduce_reuse():
+ # large enough array for all views, including negative strides.
+ a = np.arange(2*3**5)[3**5:3**5+1]
+ flags = ['buffered', 'delay_bufalloc', 'multi_index', 'reduce_ok', 'refs_ok']
+ op_flags = [('readonly',), ('readwrite', 'allocate')]
+ op_axes_list = [[(0, 1, 2), (0, 1, -1)], [(0, 1, 2), (0, -1, -1)]]
+ # wrong dtype to force buffering
+ op_dtypes = [float, a.dtype]
+
+ def get_params():
+ for xs in range(-3**2, 3**2 + 1):
+ for ys in range(xs, 3**2 + 1):
+ for op_axes in op_axes_list:
+                    # The last stride is the reduced (inner) one, so its exact
+                    # value does not matter for this test.
+ strides = (xs * a.itemsize, ys * a.itemsize, a.itemsize)
+ arr = np.lib.stride_tricks.as_strided(a, (3, 3, 3), strides)
+
+ for skip in [0, 1]:
+ yield arr, op_axes, skip
+
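+    # For every strided view and starting iterindex, compute the reduction
+    # once with the default buffer size (nditer2) as a reference, then check
+    # that each explicit buffersize (nditer1) reproduces the same result.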
+ for arr, op_axes, skip in get_params():
+ nditer2 = np.nditer([arr.copy(), None],
+ op_axes=op_axes, flags=flags, op_flags=op_flags,
+ op_dtypes=op_dtypes)
+ with nditer2:
+ nditer2.operands[-1][...] = 0
+ nditer2.reset()
+ nditer2.iterindex = skip
+
+ for (a2_in, b2_in) in nditer2:
+ b2_in += a2_in.astype(np.int_)
+
+ comp_res = nditer2.operands[-1]
+
+ for bufsize in range(0, 3**3):
+ nditer1 = np.nditer([arr, None],
+ op_axes=op_axes, flags=flags, op_flags=op_flags,
+ buffersize=bufsize, op_dtypes=op_dtypes)
+ with nditer1:
+ nditer1.operands[-1][...] = 0
+ nditer1.reset()
+ nditer1.iterindex = skip
+
+ for (a1_in, b1_in) in nditer1:
+ b1_in += a1_in.astype(np.int_)
+
+ res = nditer1.operands[-1]
+ assert_array_equal(res, comp_res)
+
+
+def test_iter_no_broadcast():
+ # Test that the no_broadcast flag works
+ a = np.arange(24).reshape(2, 3, 4)
+ b = np.arange(6).reshape(2, 3, 1)
+ c = np.arange(12).reshape(3, 4)
+
+ nditer([a, b, c], [],
+ [['readonly', 'no_broadcast'],
+ ['readonly'], ['readonly']])
+ assert_raises(ValueError, nditer, [a, b, c], [],
+ [['readonly'], ['readonly', 'no_broadcast'], ['readonly']])
+ assert_raises(ValueError, nditer, [a, b, c], [],
+ [['readonly'], ['readonly'], ['readonly', 'no_broadcast']])
+
+
+class TestIterNested(object):
+
+ def test_basic(self):
+ # Test nested iteration basic usage
+ a = arange(12).reshape(2, 3, 2)
+
+ i, j = np.nested_iters(a, [[0], [1, 2]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
+
+ i, j = np.nested_iters(a, [[0, 1], [2]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]])
+
+ i, j = np.nested_iters(a, [[0, 2], [1]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 2, 4], [1, 3, 5], [6, 8, 10], [7, 9, 11]])
+
+ def test_reorder(self):
+        # Test nested iteration with reordered axis groupings
+ a = arange(12).reshape(2, 3, 2)
+
+ # In 'K' order (default), it gets reordered
+ i, j = np.nested_iters(a, [[0], [2, 1]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
+
+ i, j = np.nested_iters(a, [[1, 0], [2]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]])
+
+ i, j = np.nested_iters(a, [[2, 0], [1]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 2, 4], [1, 3, 5], [6, 8, 10], [7, 9, 11]])
+
+ # In 'C' order, it doesn't
+ i, j = np.nested_iters(a, [[0], [2, 1]], order='C')
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 2, 4, 1, 3, 5], [6, 8, 10, 7, 9, 11]])
+
+ i, j = np.nested_iters(a, [[1, 0], [2]], order='C')
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1], [6, 7], [2, 3], [8, 9], [4, 5], [10, 11]])
+
+ i, j = np.nested_iters(a, [[2, 0], [1]], order='C')
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 2, 4], [6, 8, 10], [1, 3, 5], [7, 9, 11]])
+
+ def test_flip_axes(self):
+ # Test nested iteration with negative axes
+ a = arange(12).reshape(2, 3, 2)[::-1, ::-1, ::-1]
+
+ # In 'K' order (default), the axes all get flipped
+ i, j = np.nested_iters(a, [[0], [1, 2]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
+
+ i, j = np.nested_iters(a, [[0, 1], [2]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]])
+
+ i, j = np.nested_iters(a, [[0, 2], [1]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 2, 4], [1, 3, 5], [6, 8, 10], [7, 9, 11]])
+
+ # In 'C' order, flipping axes is disabled
+ i, j = np.nested_iters(a, [[0], [1, 2]], order='C')
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[11, 10, 9, 8, 7, 6], [5, 4, 3, 2, 1, 0]])
+
+ i, j = np.nested_iters(a, [[0, 1], [2]], order='C')
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[11, 10], [9, 8], [7, 6], [5, 4], [3, 2], [1, 0]])
+
+ i, j = np.nested_iters(a, [[0, 2], [1]], order='C')
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[11, 9, 7], [10, 8, 6], [5, 3, 1], [4, 2, 0]])
+
+ def test_broadcast(self):
+ # Test nested iteration with broadcasting
+ a = arange(2).reshape(2, 1)
+ b = arange(3).reshape(1, 3)
+
+ i, j = np.nested_iters([a, b], [[0], [1]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[[0, 0], [0, 1], [0, 2]], [[1, 0], [1, 1], [1, 2]]])
+
+ i, j = np.nested_iters([a, b], [[1], [0]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[[0, 0], [1, 0]], [[0, 1], [1, 1]], [[0, 2], [1, 2]]])
+
+ def test_dtype_copy(self):
+ # Test nested iteration with a copy to change dtype
+
+ # copy
+ a = arange(6, dtype='i4').reshape(2, 3)
+ i, j = np.nested_iters(a, [[0], [1]],
+ op_flags=['readonly', 'copy'],
+ op_dtypes='f8')
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1, 2], [3, 4, 5]])
+ vals = None
+
+        # writebackifcopy - using context manager
+ a = arange(6, dtype='f4').reshape(2, 3)
+ i, j = np.nested_iters(a, [[0], [1]],
+ op_flags=['readwrite', 'updateifcopy'],
+ casting='same_kind',
+ op_dtypes='f8')
+ with i, j:
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ for x in i:
+ for y in j:
+ y[...] += 1
+ assert_equal(a, [[0, 1, 2], [3, 4, 5]])
+ assert_equal(a, [[1, 2, 3], [4, 5, 6]])
+
+ # writebackifcopy - using close()
+ a = arange(6, dtype='f4').reshape(2, 3)
+ i, j = np.nested_iters(a, [[0], [1]],
+ op_flags=['readwrite', 'updateifcopy'],
+ casting='same_kind',
+ op_dtypes='f8')
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ for x in i:
+ for y in j:
+ y[...] += 1
+ assert_equal(a, [[0, 1, 2], [3, 4, 5]])
+ i.close()
+ j.close()
+ assert_equal(a, [[1, 2, 3], [4, 5, 6]])
+
+ def test_dtype_buffered(self):
+ # Test nested iteration with buffering to change dtype
+
+ a = arange(6, dtype='f4').reshape(2, 3)
+ i, j = np.nested_iters(a, [[0], [1]],
+ flags=['buffered'],
+ op_flags=['readwrite'],
+ casting='same_kind',
+ op_dtypes='f8')
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ for x in i:
+ for y in j:
+ y[...] += 1
+ assert_equal(a, [[1, 2, 3], [4, 5, 6]])
+
+ def test_0d(self):
+ a = np.arange(12).reshape(2, 3, 2)
+ i, j = np.nested_iters(a, [[], [1, 0, 2]])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]])
+
+ i, j = np.nested_iters(a, [[1, 0, 2], []])
+ vals = [list(j) for _ in i]
+ assert_equal(vals, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11]])
+
+ i, j, k = np.nested_iters(a, [[2, 0], [], [1]])
+ vals = []
+ for x in i:
+ for y in j:
+ vals.append([z for z in k])
+ assert_equal(vals, [[0, 2, 4], [1, 3, 5], [6, 8, 10], [7, 9, 11]])
+
+ def test_iter_nested_iters_dtype_buffered(self):
+ # Test nested iteration with buffering to change dtype
+
+ a = arange(6, dtype='f4').reshape(2, 3)
+ i, j = np.nested_iters(a, [[0], [1]],
+ flags=['buffered'],
+ op_flags=['readwrite'],
+ casting='same_kind',
+ op_dtypes='f8')
+ with i, j:
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ for x in i:
+ for y in j:
+ y[...] += 1
+ assert_equal(a, [[1, 2, 3], [4, 5, 6]])
+
+def test_iter_reduction_error():
+
+ a = np.arange(6)
+ assert_raises(ValueError, nditer, [a, None], [],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[[0], [-1]])
+
+ a = np.arange(6).reshape(2, 3)
+ assert_raises(ValueError, nditer, [a, None], ['external_loop'],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[[0, 1], [-1, -1]])
+
+def test_iter_reduction():
+ # Test doing reductions with the iterator
+
+ a = np.arange(6)
+ i = nditer([a, None], ['reduce_ok'],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[[0], [-1]])
+ # Need to initialize the output operand to the addition unit
+ with i:
+ i.operands[1][...] = 0
+ # Do the reduction
+ for x, y in i:
+ y[...] += x
+ # Since no axes were specified, should have allocated a scalar
+ assert_equal(i.operands[1].ndim, 0)
+ assert_equal(i.operands[1], np.sum(a))
+
+ a = np.arange(6).reshape(2, 3)
+ i = nditer([a, None], ['reduce_ok', 'external_loop'],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[[0, 1], [-1, -1]])
+ # Need to initialize the output operand to the addition unit
+ with i:
+ i.operands[1][...] = 0
+ # Reduction shape/strides for the output
+ assert_equal(i[1].shape, (6,))
+ assert_equal(i[1].strides, (0,))
+ # Do the reduction
+ for x, y in i:
+ # Use a for loop instead of ``y[...] += x``
+ # (equivalent to ``y[...] = y[...].copy() + x``),
+ # because y has zero strides we use for the reduction
+ for j in range(len(y)):
+ y[j] += x[j]
+ # Since no axes were specified, should have allocated a scalar
+ assert_equal(i.operands[1].ndim, 0)
+ assert_equal(i.operands[1], np.sum(a))
+
+ # This is a tricky reduction case for the buffering double loop
+ # to handle
+ a = np.ones((2, 3, 5))
+ it1 = nditer([a, None], ['reduce_ok', 'external_loop'],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[None, [0, -1, 1]])
+ it2 = nditer([a, None], ['reduce_ok', 'external_loop',
+ 'buffered', 'delay_bufalloc'],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[None, [0, -1, 1]], buffersize=10)
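+    # The output is reduced over the middle iteration axis (op_axes [0, -1, 1]),
+    # so with a small buffersize the buffered reduce loop must accumulate
+    # correctly across buffer boundaries; it2 is checked against unbuffered it1.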
+ with it1, it2:
+ it1.operands[1].fill(0)
+ it2.operands[1].fill(0)
+ it2.reset()
+ for x in it1:
+ x[1][...] += x[0]
+ for x in it2:
+ x[1][...] += x[0]
+ assert_equal(it1.operands[1], it2.operands[1])
+ assert_equal(it2.operands[1].sum(), a.size)
+
+def test_iter_buffering_reduction():
+ # Test doing buffered reductions with the iterator
+
+ a = np.arange(6)
+ b = np.array(0., dtype='f8').byteswap().newbyteorder()
+ i = nditer([a, b], ['reduce_ok', 'buffered'],
+ [['readonly'], ['readwrite', 'nbo']],
+ op_axes=[[0], [-1]])
+ with i:
+ assert_equal(i[1].dtype, np.dtype('f8'))
+ assert_(i[1].dtype != b.dtype)
+ # Do the reduction
+ for x, y in i:
+ y[...] += x
+ # Since no axes were specified, should have allocated a scalar
+ assert_equal(b, np.sum(a))
+
+ a = np.arange(6).reshape(2, 3)
+ b = np.array([0, 0], dtype='f8').byteswap().newbyteorder()
+ i = nditer([a, b], ['reduce_ok', 'external_loop', 'buffered'],
+ [['readonly'], ['readwrite', 'nbo']],
+ op_axes=[[0, 1], [0, -1]])
+ # Reduction shape/strides for the output
+ with i:
+ assert_equal(i[1].shape, (3,))
+ assert_equal(i[1].strides, (0,))
+ # Do the reduction
+ for x, y in i:
+ # Use a for loop instead of ``y[...] += x``
+ # (equivalent to ``y[...] = y[...].copy() + x``),
+ # because y has zero strides we use for the reduction
+ for j in range(len(y)):
+ y[j] += x[j]
+ assert_equal(b, np.sum(a, axis=1))
+
+ # Iterator inner double loop was wrong on this one
+ p = np.arange(2) + 1
+ it = np.nditer([p, None],
+ ['delay_bufalloc', 'reduce_ok', 'buffered', 'external_loop'],
+ [['readonly'], ['readwrite', 'allocate']],
+ op_axes=[[-1, 0], [-1, -1]],
+ itershape=(2, 2))
+ with it:
+ it.operands[1].fill(0)
+ it.reset()
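+        # p is broadcast along the first itershape axis (op_axes [-1, 0]), so
+        # the flattened external loop should see its values repeated: [1, 2, 1, 2]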
+ assert_equal(it[0], [1, 2, 1, 2])
+
+ # Iterator inner loop should take argument contiguity into account
+ x = np.ones((7, 13, 8), np.int8)[4:6,1:11:6,1:5].transpose(1, 2, 0)
+ x[...] = np.arange(x.size).reshape(x.shape)
+ y_base = np.arange(4*4, dtype=np.int8).reshape(4, 4)
+ y_base_copy = y_base.copy()
+ y = y_base[::2,:,None]
+
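+    # y is a strided view of every other row of y_base with a broadcast
+    # trailing axis; filling it through the buffered iterator must touch only
+    # those rows, leaving y_base[1::2] unchanged.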
+ it = np.nditer([y, x],
+ ['buffered', 'external_loop', 'reduce_ok'],
+ [['readwrite'], ['readonly']])
+ with it:
+ for a, b in it:
+ a.fill(2)
+
+ assert_equal(y_base[1::2], y_base_copy[1::2])
+ assert_equal(y_base[::2], 2)
+
+def test_iter_buffering_reduction_reuse_reduce_loops():
+ # There was a bug triggering reuse of the reduce loop inappropriately,
+ # which caused processing to happen in unnecessarily small chunks
+ # and overran the buffer.
+
+ a = np.zeros((2, 7))
+ b = np.zeros((1, 7))
+ it = np.nditer([a, b], flags=['reduce_ok', 'external_loop', 'buffered'],
+ op_flags=[['readonly'], ['readwrite']],
+ buffersize=5)
+
+ with it:
+ bufsizes = [x.shape[0] for x, y in it]
+ assert_equal(bufsizes, [5, 2, 5, 2])
+ assert_equal(sum(bufsizes), a.size)
+
+def test_iter_writemasked_badinput():
+ a = np.zeros((2, 3))
+ b = np.zeros((3,))
+ m = np.array([[True, True, False], [False, True, False]])
+ m2 = np.array([True, True, False])
+ m3 = np.array([0, 1, 1], dtype='u1')
+ mbad1 = np.array([0, 1, 1], dtype='i1')
+ mbad2 = np.array([0, 1, 1], dtype='f4')
+
+ # Need an 'arraymask' if any operand is 'writemasked'
+ assert_raises(ValueError, nditer, [a, m], [],
+ [['readwrite', 'writemasked'], ['readonly']])
+
+ # A 'writemasked' operand must not be readonly
+ assert_raises(ValueError, nditer, [a, m], [],
+ [['readonly', 'writemasked'], ['readonly', 'arraymask']])
+
+    # 'writemasked' and 'arraymask' may not be used together on the same operand
+ assert_raises(ValueError, nditer, [a, m], [],
+ [['readonly'], ['readwrite', 'arraymask', 'writemasked']])
+
+ # 'arraymask' may only be specified once
+ assert_raises(ValueError, nditer, [a, m, m2], [],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask'],
+ ['readonly', 'arraymask']])
+
+ # An 'arraymask' with nothing 'writemasked' also doesn't make sense
+ assert_raises(ValueError, nditer, [a, m], [],
+ [['readwrite'], ['readonly', 'arraymask']])
+
+ # A writemasked reduction requires a similarly smaller mask
+ assert_raises(ValueError, nditer, [a, b, m], ['reduce_ok'],
+ [['readonly'],
+ ['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']])
+    # But it should work with a mask no larger than the reduced (writemasked) operand
+ np.nditer([a, b, m2], ['reduce_ok'],
+ [['readonly'],
+ ['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']])
+ # The arraymask itself cannot be a reduction
+ assert_raises(ValueError, nditer, [a, b, m2], ['reduce_ok'],
+ [['readonly'],
+ ['readwrite', 'writemasked'],
+ ['readwrite', 'arraymask']])
+
+ # A uint8 mask is ok too
+ np.nditer([a, m3], ['buffered'],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']],
+ op_dtypes=['f4', None],
+ casting='same_kind')
+ # An int8 mask isn't ok
+ assert_raises(TypeError, np.nditer, [a, mbad1], ['buffered'],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']],
+ op_dtypes=['f4', None],
+ casting='same_kind')
+ # A float32 mask isn't ok
+ assert_raises(TypeError, np.nditer, [a, mbad2], ['buffered'],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']],
+ op_dtypes=['f4', None],
+ casting='same_kind')
+
+def test_iter_writemasked():
+ a = np.zeros((3,), dtype='f8')
+ msk = np.array([True, True, False])
+
+ # When buffering is unused, 'writemasked' effectively does nothing.
+ # It's up to the user of the iterator to obey the requested semantics.
+ it = np.nditer([a, msk], [],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']])
+ with it:
+ for x, m in it:
+ x[...] = 1
+ # Because we violated the semantics, all the values became 1
+ assert_equal(a, [1, 1, 1])
+
+ # Even if buffering is enabled, we still may be accessing the array
+ # directly.
+ it = np.nditer([a, msk], ['buffered'],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']])
+ with it:
+ for x, m in it:
+ x[...] = 2.5
+ # Because we violated the semantics, all the values became 2.5
+ assert_equal(a, [2.5, 2.5, 2.5])
+
+    # If buffering will definitely happen, for instance because of
+ # a cast, only the items selected by the mask will be copied back from
+ # the buffer.
+ it = np.nditer([a, msk], ['buffered'],
+ [['readwrite', 'writemasked'],
+ ['readonly', 'arraymask']],
+ op_dtypes=['i8', None],
+ casting='unsafe')
+ with it:
+ for x, m in it:
+ x[...] = 3
+ # Even though we violated the semantics, only the selected values
+ # were copied back
+ assert_equal(a, [3, 3, 2.5])
+
+def test_iter_non_writable_attribute_deletion():
+ it = np.nditer(np.ones(2))
+ attr = ["value", "shape", "operands", "itviews", "has_delayed_bufalloc",
+ "iterationneedsapi", "has_multi_index", "has_index", "dtypes",
+ "ndim", "nop", "itersize", "finished"]
+
+ for s in attr:
+ assert_raises(AttributeError, delattr, it, s)
+
+
+def test_iter_writable_attribute_deletion():
+ it = np.nditer(np.ones(2))
+ attr = [ "multi_index", "index", "iterrange", "iterindex"]
+ for s in attr:
+ assert_raises(AttributeError, delattr, it, s)
+
+
+def test_iter_element_deletion():
+ it = np.nditer(np.ones(3))
+ try:
+ del it[1]
+ del it[1:2]
+ except TypeError:
+ pass
+ except Exception:
+ raise AssertionError
+
+def test_iter_allocated_array_dtypes():
+ # If the dtype of an allocated output has a shape, the shape gets
+ # tacked onto the end of the result.
+ it = np.nditer(([1, 3, 20], None), op_dtypes=[None, ('i4', (2,))])
+ for a, b in it:
+ b[0] = a - 1
+ b[1] = a + 1
+ assert_equal(it.operands[1], [[0, 2], [2, 4], [19, 21]])
+
+ # Make sure this works for scalars too
+ it = np.nditer((10, 2, None), op_dtypes=[None, None, ('i4', (2, 2))])
+ for a, b, c in it:
+ c[0, 0] = a - b
+ c[0, 1] = a + b
+ c[1, 0] = a * b
+ c[1, 1] = a / b
+ assert_equal(it.operands[2], [[8, 12], [20, 5]])
+
+
+def test_0d_iter():
+ # Basic test for iteration of 0-d arrays:
+ i = nditer([2, 3], ['multi_index'], [['readonly']]*2)
+ assert_equal(i.ndim, 0)
+ assert_equal(next(i), (2, 3))
+ assert_equal(i.multi_index, ())
+ assert_equal(i.iterindex, 0)
+ assert_raises(StopIteration, next, i)
+ # test reset:
+ i.reset()
+ assert_equal(next(i), (2, 3))
+ assert_raises(StopIteration, next, i)
+
+ # test forcing to 0-d
+ i = nditer(np.arange(5), ['multi_index'], [['readonly']], op_axes=[()])
+ assert_equal(i.ndim, 0)
+ assert_equal(len(i), 1)
+    # note that itershape=() still behaves like None due to the conversions
+
+ # Test a more complex buffered casting case (same as another test above)
+ sdt = [('a', 'f4'), ('b', 'i8'), ('c', 'c8', (2, 3)), ('d', 'O')]
+ a = np.array(0.5, dtype='f4')
+ i = nditer(a, ['buffered', 'refs_ok'], ['readonly'],
+ casting='unsafe', op_dtypes=sdt)
+ vals = next(i)
+ assert_equal(vals['a'], 0.5)
+ assert_equal(vals['b'], 0)
+ assert_equal(vals['c'], [[(0.5)]*3]*2)
+ assert_equal(vals['d'], 0.5)
+
+
+def test_iter_too_large():
+ # The total size of the iterator must not exceed the maximum intp due
+ # to broadcasting. Dividing by 1024 will keep it small enough to
+ # give a legal array.
+ size = np.iinfo(np.intp).max // 1024
+ arr = np.lib.stride_tricks.as_strided(np.zeros(1), (size,), (0,))
+ assert_raises(ValueError, nditer, (arr, arr[:, None]))
+    # test the same with multi_index tracking. That may get more interesting
+    # when removing a 0-d axis is allowed (since an iterator can grow then)
+ assert_raises(ValueError, nditer,
+ (arr, arr[:, None]), flags=['multi_index'])
+
+
+def test_iter_too_large_with_multiindex():
+    # When a multi index is being tracked, the error is delayed. This checks
+    # the delayed error messages, and that getting below the limit by
+    # removing an axis works.
+ base_size = 2**10
+ num = 1
+ while base_size**num < np.iinfo(np.intp).max:
+ num += 1
+
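+    # num is now the smallest count of 2**10-sized axes whose product reaches
+    # the intp maximum, so broadcasting the arrays below just exceeds it.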
+ shape_template = [1, 1] * num
+ arrays = []
+ for i in range(num):
+ shape = shape_template[:]
+ shape[i * 2] = 2**10
+ arrays.append(np.empty(shape))
+ arrays = tuple(arrays)
+
+ # arrays are now too large to be broadcast. The different modes test
+ # different nditer functionality with or without GIL.
+ for mode in range(6):
+ with assert_raises(ValueError):
+ _multiarray_tests.test_nditer_too_large(arrays, -1, mode)
+ # but if we do nothing with the nditer, it can be constructed:
+ _multiarray_tests.test_nditer_too_large(arrays, -1, 7)
+
+ # When an axis is removed, things should work again (half the time):
+ for i in range(num):
+ for mode in range(6):
+ # an axis with size 1024 is removed:
+ _multiarray_tests.test_nditer_too_large(arrays, i*2, mode)
+ # an axis with size 1 is removed:
+ with assert_raises(ValueError):
+ _multiarray_tests.test_nditer_too_large(arrays, i*2 + 1, mode)
+
+def test_writebacks():
+ a = np.arange(6, dtype='f4')
+ au = a.byteswap().newbyteorder()
+ assert_(a.dtype.byteorder != au.dtype.byteorder)
+ it = nditer(au, [], [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
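+    # au has non-native byte order, so requesting native 'f4' forces a copy;
+    # with 'updateifcopy' the copy is written back into au when the iterator
+    # is closed or its context exits.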
+ with it:
+ it.operands[0][:] = 100
+ assert_equal(au, 100)
+    # do it again, this time raising an error
+ it = nditer(au, [], [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ try:
+ with it:
+ assert_equal(au.flags.writeable, False)
+ it.operands[0][:] = 0
+ raise ValueError('exit context manager on exception')
+ except:
+ pass
+ assert_equal(au, 0)
+ assert_equal(au.flags.writeable, True)
+    # cannot reuse `it` outside the context manager
+ assert_raises(ValueError, getattr, it, 'operands')
+
+ it = nditer(au, [], [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ with it:
+ x = it.operands[0]
+ x[:] = 6
+ assert_(x.flags.writebackifcopy)
+ assert_equal(au, 6)
+ assert_(not x.flags.writebackifcopy)
+ x[:] = 123 # x.data still valid
+ assert_equal(au, 6) # but not connected to au
+
+ it = nditer(au, [],
+ [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ # reentering works
+ with it:
+ with it:
+ for x in it:
+ x[...] = 123
+
+ it = nditer(au, [],
+ [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ # make sure exiting the inner context manager closes the iterator
+ with it:
+ with it:
+ for x in it:
+ x[...] = 123
+ assert_raises(ValueError, getattr, it, 'operands')
+ # do not crash if original data array is decrefed
+ it = nditer(au, [],
+ [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ del au
+ with it:
+ for x in it:
+ x[...] = 123
+ # make sure we cannot reenter the closed iterator
+ enter = it.__enter__
+ assert_raises(RuntimeError, enter)
+
+def test_close_equivalent():
+    ''' Using a context manager and using nditer.close() are equivalent.
+    '''
+ def add_close(x, y, out=None):
+ addop = np.add
+ it = np.nditer([x, y, out], [],
+ [['readonly'], ['readonly'], ['writeonly','allocate']])
+ for (a, b, c) in it:
+ addop(a, b, out=c)
+ ret = it.operands[2]
+ it.close()
+ return ret
+
+ def add_context(x, y, out=None):
+ addop = np.add
+ it = np.nditer([x, y, out], [],
+ [['readonly'], ['readonly'], ['writeonly','allocate']])
+ with it:
+ for (a, b, c) in it:
+ addop(a, b, out=c)
+ return it.operands[2]
+ z = add_close(range(5), range(5))
+ assert_equal(z, range(0, 10, 2))
+ z = add_context(range(5), range(5))
+ assert_equal(z, range(0, 10, 2))
+
+def test_close_raises():
+ it = np.nditer(np.arange(3))
+    assert_equal(next(it), 0)
+ it.close()
+ assert_raises(StopIteration, next, it)
+ assert_raises(ValueError, getattr, it, 'operands')
+
[email protected](not HAS_REFCOUNT, reason="Python lacks refcounts")
+def test_warn_noclose():
+ a = np.arange(6, dtype='f4')
+ au = a.byteswap().newbyteorder()
+ with suppress_warnings() as sup:
+ sup.record(RuntimeWarning)
+ it = np.nditer(au, [], [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ del it
+ assert len(sup.log) == 1
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_numeric.py b/contrib/python/numpy/py2/numpy/core/tests/test_numeric.py
new file mode 100644
index 00000000000..ed02c156168
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_numeric.py
@@ -0,0 +1,2797 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import warnings
+import itertools
+import platform
+import pytest
+from decimal import Decimal
+
+import numpy as np
+from numpy.core import umath
+from numpy.random import rand, randint, randn
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_raises_regex,
+ assert_array_equal, assert_almost_equal, assert_array_almost_equal,
+ HAS_REFCOUNT
+ )
+
+
+class TestResize(object):
+ def test_copies(self):
+ A = np.array([[1, 2], [3, 4]])
+ Ar1 = np.array([[1, 2, 3, 4], [1, 2, 3, 4]])
+ assert_equal(np.resize(A, (2, 4)), Ar1)
+
+ Ar2 = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
+ assert_equal(np.resize(A, (4, 2)), Ar2)
+
+ Ar3 = np.array([[1, 2, 3], [4, 1, 2], [3, 4, 1], [2, 3, 4]])
+ assert_equal(np.resize(A, (4, 3)), Ar3)
+
+ def test_zeroresize(self):
+ A = np.array([[1, 2], [3, 4]])
+ Ar = np.resize(A, (0,))
+ assert_array_equal(Ar, np.array([]))
+ assert_equal(A.dtype, Ar.dtype)
+
+ Ar = np.resize(A, (0, 2))
+ assert_equal(Ar.shape, (0, 2))
+
+ Ar = np.resize(A, (2, 0))
+ assert_equal(Ar.shape, (2, 0))
+
+ def test_reshape_from_zero(self):
+ # See also gh-6740
+ A = np.zeros(0, dtype=[('a', np.float32, 1)])
+ Ar = np.resize(A, (2, 1))
+ assert_array_equal(Ar, np.zeros((2, 1), Ar.dtype))
+ assert_equal(A.dtype, Ar.dtype)
+
+
+class TestNonarrayArgs(object):
+ # check that non-array arguments to functions wrap them in arrays
+ def test_choose(self):
+ choices = [[0, 1, 2],
+ [3, 4, 5],
+ [5, 6, 7]]
+ tgt = [5, 1, 5]
+ a = [2, 0, 1]
+
+ out = np.choose(a, choices)
+ assert_equal(out, tgt)
+
+ def test_clip(self):
+ arr = [-1, 5, 2, 3, 10, -4, -9]
+ out = np.clip(arr, 2, 7)
+ tgt = [2, 5, 2, 3, 7, 2, 2]
+ assert_equal(out, tgt)
+
+ def test_compress(self):
+ arr = [[0, 1, 2, 3, 4],
+ [5, 6, 7, 8, 9]]
+ tgt = [[5, 6, 7, 8, 9]]
+ out = np.compress([0, 1], arr, axis=0)
+ assert_equal(out, tgt)
+
+ def test_count_nonzero(self):
+ arr = [[0, 1, 7, 0, 0],
+ [3, 0, 0, 2, 19]]
+ tgt = np.array([2, 3])
+ out = np.count_nonzero(arr, axis=1)
+ assert_equal(out, tgt)
+
+ def test_cumproduct(self):
+ A = [[1, 2, 3], [4, 5, 6]]
+ assert_(np.all(np.cumproduct(A) == np.array([1, 2, 6, 24, 120, 720])))
+
+ def test_diagonal(self):
+ a = [[0, 1, 2, 3],
+ [4, 5, 6, 7],
+ [8, 9, 10, 11]]
+ out = np.diagonal(a)
+ tgt = [0, 5, 10]
+
+ assert_equal(out, tgt)
+
+ def test_mean(self):
+ A = [[1, 2, 3], [4, 5, 6]]
+ assert_(np.mean(A) == 3.5)
+ assert_(np.all(np.mean(A, 0) == np.array([2.5, 3.5, 4.5])))
+ assert_(np.all(np.mean(A, 1) == np.array([2., 5.])))
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_(np.isnan(np.mean([])))
+ assert_(w[0].category is RuntimeWarning)
+
+ def test_ptp(self):
+ a = [3, 4, 5, 10, -3, -5, 6.0]
+ assert_equal(np.ptp(a, axis=0), 15.0)
+
+ def test_prod(self):
+ arr = [[1, 2, 3, 4],
+ [5, 6, 7, 9],
+ [10, 3, 4, 5]]
+ tgt = [24, 1890, 600]
+
+ assert_equal(np.prod(arr, axis=-1), tgt)
+
+ def test_ravel(self):
+ a = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
+ tgt = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ assert_equal(np.ravel(a), tgt)
+
+ def test_repeat(self):
+ a = [1, 2, 3]
+ tgt = [1, 1, 2, 2, 3, 3]
+
+ out = np.repeat(a, 2)
+ assert_equal(out, tgt)
+
+ def test_reshape(self):
+ arr = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
+ tgt = [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]
+ assert_equal(np.reshape(arr, (2, 6)), tgt)
+
+ def test_round(self):
+ arr = [1.56, 72.54, 6.35, 3.25]
+ tgt = [1.6, 72.5, 6.4, 3.2]
+ assert_equal(np.around(arr, decimals=1), tgt)
+
+ def test_searchsorted(self):
+ arr = [-8, -5, -1, 3, 6, 10]
+ out = np.searchsorted(arr, 0)
+ assert_equal(out, 3)
+
+ def test_size(self):
+ A = [[1, 2, 3], [4, 5, 6]]
+ assert_(np.size(A) == 6)
+ assert_(np.size(A, 0) == 2)
+ assert_(np.size(A, 1) == 3)
+
+ def test_squeeze(self):
+ A = [[[1, 1, 1], [2, 2, 2], [3, 3, 3]]]
+ assert_(np.squeeze(A).shape == (3, 3))
+
+ def test_std(self):
+ A = [[1, 2, 3], [4, 5, 6]]
+ assert_almost_equal(np.std(A), 1.707825127659933)
+ assert_almost_equal(np.std(A, 0), np.array([1.5, 1.5, 1.5]))
+ assert_almost_equal(np.std(A, 1), np.array([0.81649658, 0.81649658]))
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_(np.isnan(np.std([])))
+ assert_(w[0].category is RuntimeWarning)
+
+ def test_swapaxes(self):
+ tgt = [[[0, 4], [2, 6]], [[1, 5], [3, 7]]]
+ a = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
+ out = np.swapaxes(a, 0, 2)
+ assert_equal(out, tgt)
+
+ def test_sum(self):
+ m = [[1, 2, 3],
+ [4, 5, 6],
+ [7, 8, 9]]
+ tgt = [[6], [15], [24]]
+ out = np.sum(m, axis=1, keepdims=True)
+
+ assert_equal(tgt, out)
+
+ def test_take(self):
+ tgt = [2, 3, 5]
+ indices = [1, 2, 4]
+ a = [1, 2, 3, 4, 5]
+
+ out = np.take(a, indices)
+ assert_equal(out, tgt)
+
+ def test_trace(self):
+ c = [[1, 2], [3, 4], [5, 6]]
+ assert_equal(np.trace(c), 5)
+
+ def test_transpose(self):
+ arr = [[1, 2], [3, 4], [5, 6]]
+ tgt = [[1, 3, 5], [2, 4, 6]]
+ assert_equal(np.transpose(arr, (1, 0)), tgt)
+
+ def test_var(self):
+ A = [[1, 2, 3], [4, 5, 6]]
+ assert_almost_equal(np.var(A), 2.9166666666666665)
+ assert_almost_equal(np.var(A, 0), np.array([2.25, 2.25, 2.25]))
+ assert_almost_equal(np.var(A, 1), np.array([0.66666667, 0.66666667]))
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_(np.isnan(np.var([])))
+ assert_(w[0].category is RuntimeWarning)
+
+
+class TestIsscalar(object):
+ def test_isscalar(self):
+ assert_(np.isscalar(3.1))
+ assert_(np.isscalar(np.int16(12345)))
+ assert_(np.isscalar(False))
+ assert_(np.isscalar('numpy'))
+ assert_(not np.isscalar([3.1]))
+ assert_(not np.isscalar(None))
+
+ # PEP 3141
+ from fractions import Fraction
+ assert_(np.isscalar(Fraction(5, 17)))
+ from numbers import Number
+ assert_(np.isscalar(Number()))
+
+
+class TestBoolScalar(object):
+ def test_logical(self):
+ f = np.False_
+ t = np.True_
+ s = "xyz"
+ assert_((t and s) is s)
+ assert_((f and s) is f)
+
+ def test_bitwise_or(self):
+ f = np.False_
+ t = np.True_
+ assert_((t | t) is t)
+ assert_((f | t) is t)
+ assert_((t | f) is t)
+ assert_((f | f) is f)
+
+ def test_bitwise_and(self):
+ f = np.False_
+ t = np.True_
+ assert_((t & t) is t)
+ assert_((f & t) is f)
+ assert_((t & f) is f)
+ assert_((f & f) is f)
+
+ def test_bitwise_xor(self):
+ f = np.False_
+ t = np.True_
+ assert_((t ^ t) is f)
+ assert_((f ^ t) is t)
+ assert_((t ^ f) is t)
+ assert_((f ^ f) is f)
+
+
+class TestBoolArray(object):
+ def setup(self):
+ # offset for simd tests
+ self.t = np.array([True] * 41, dtype=bool)[1::]
+ self.f = np.array([False] * 41, dtype=bool)[1::]
+ self.o = np.array([False] * 42, dtype=bool)[2::]
+ self.nm = self.f.copy()
+ self.im = self.t.copy()
+ self.nm[3] = True
+ self.nm[-2] = True
+ self.im[3] = False
+ self.im[-2] = False
+
+ def test_all_any(self):
+ assert_(self.t.all())
+ assert_(self.t.any())
+ assert_(not self.f.all())
+ assert_(not self.f.any())
+ assert_(self.nm.any())
+ assert_(self.im.any())
+ assert_(not self.nm.all())
+ assert_(not self.im.all())
+ # check bad element in all positions
+ for i in range(256 - 7):
+ d = np.array([False] * 256, dtype=bool)[7::]
+ d[i] = True
+ assert_(np.any(d))
+ e = np.array([True] * 256, dtype=bool)[7::]
+ e[i] = False
+ assert_(not np.all(e))
+ assert_array_equal(e, ~d)
+ # big array test for blocked libc loops
+ for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+ d = np.array([False] * 100043, dtype=bool)
+ d[i] = True
+ assert_(np.any(d), msg="%r" % i)
+ e = np.array([True] * 100043, dtype=bool)
+ e[i] = False
+ assert_(not np.all(e), msg="%r" % i)
+
+ def test_logical_not_abs(self):
+ assert_array_equal(~self.t, self.f)
+ assert_array_equal(np.abs(~self.t), self.f)
+ assert_array_equal(np.abs(~self.f), self.t)
+ assert_array_equal(np.abs(self.f), self.f)
+ assert_array_equal(~np.abs(self.f), self.t)
+ assert_array_equal(~np.abs(self.t), self.f)
+ assert_array_equal(np.abs(~self.nm), self.im)
+ np.logical_not(self.t, out=self.o)
+ assert_array_equal(self.o, self.f)
+ np.abs(self.t, out=self.o)
+ assert_array_equal(self.o, self.t)
+
+ def test_logical_and_or_xor(self):
+ assert_array_equal(self.t | self.t, self.t)
+ assert_array_equal(self.f | self.f, self.f)
+ assert_array_equal(self.t | self.f, self.t)
+ assert_array_equal(self.f | self.t, self.t)
+ np.logical_or(self.t, self.t, out=self.o)
+ assert_array_equal(self.o, self.t)
+ assert_array_equal(self.t & self.t, self.t)
+ assert_array_equal(self.f & self.f, self.f)
+ assert_array_equal(self.t & self.f, self.f)
+ assert_array_equal(self.f & self.t, self.f)
+ np.logical_and(self.t, self.t, out=self.o)
+ assert_array_equal(self.o, self.t)
+ assert_array_equal(self.t ^ self.t, self.f)
+ assert_array_equal(self.f ^ self.f, self.f)
+ assert_array_equal(self.t ^ self.f, self.t)
+ assert_array_equal(self.f ^ self.t, self.t)
+ np.logical_xor(self.t, self.t, out=self.o)
+ assert_array_equal(self.o, self.f)
+
+ assert_array_equal(self.nm & self.t, self.nm)
+ assert_array_equal(self.im & self.f, False)
+ assert_array_equal(self.nm & True, self.nm)
+ assert_array_equal(self.im & False, self.f)
+ assert_array_equal(self.nm | self.t, self.t)
+ assert_array_equal(self.im | self.f, self.im)
+ assert_array_equal(self.nm | True, self.t)
+ assert_array_equal(self.im | False, self.im)
+ assert_array_equal(self.nm ^ self.t, self.im)
+ assert_array_equal(self.im ^ self.f, self.im)
+ assert_array_equal(self.nm ^ True, self.im)
+ assert_array_equal(self.im ^ False, self.im)
+
+
+class TestBoolCmp(object):
+ def setup(self):
+ self.f = np.ones(256, dtype=np.float32)
+ self.ef = np.ones(self.f.size, dtype=bool)
+ self.d = np.ones(128, dtype=np.float64)
+ self.ed = np.ones(self.d.size, dtype=bool)
+        # generate values for all permutations of 256-bit simd vectors
+ s = 0
+ for i in range(32):
+ self.f[s:s+8] = [i & 2**x for x in range(8)]
+ self.ef[s:s+8] = [(i & 2**x) != 0 for x in range(8)]
+ s += 8
+ s = 0
+ for i in range(16):
+ self.d[s:s+4] = [i & 2**x for x in range(4)]
+ self.ed[s:s+4] = [(i & 2**x) != 0 for x in range(4)]
+ s += 4
+
+ self.nf = self.f.copy()
+ self.nd = self.d.copy()
+ self.nf[self.ef] = np.nan
+ self.nd[self.ed] = np.nan
+
+ self.inff = self.f.copy()
+ self.infd = self.d.copy()
+ self.inff[::3][self.ef[::3]] = np.inf
+ self.infd[::3][self.ed[::3]] = np.inf
+ self.inff[1::3][self.ef[1::3]] = -np.inf
+ self.infd[1::3][self.ed[1::3]] = -np.inf
+ self.inff[2::3][self.ef[2::3]] = np.nan
+ self.infd[2::3][self.ed[2::3]] = np.nan
+ self.efnonan = self.ef.copy()
+ self.efnonan[2::3] = False
+ self.ednonan = self.ed.copy()
+ self.ednonan[2::3] = False
+
+ self.signf = self.f.copy()
+ self.signd = self.d.copy()
+ self.signf[self.ef] *= -1.
+ self.signd[self.ed] *= -1.
+ self.signf[1::6][self.ef[1::6]] = -np.inf
+ self.signd[1::6][self.ed[1::6]] = -np.inf
+ self.signf[3::6][self.ef[3::6]] = -np.nan
+ self.signd[3::6][self.ed[3::6]] = -np.nan
+ self.signf[4::6][self.ef[4::6]] = -0.
+ self.signd[4::6][self.ed[4::6]] = -0.
+
+ def test_float(self):
+ # offset for alignment test
+ for i in range(4):
+ assert_array_equal(self.f[i:] > 0, self.ef[i:])
+ assert_array_equal(self.f[i:] - 1 >= 0, self.ef[i:])
+ assert_array_equal(self.f[i:] == 0, ~self.ef[i:])
+ assert_array_equal(-self.f[i:] < 0, self.ef[i:])
+ assert_array_equal(-self.f[i:] + 1 <= 0, self.ef[i:])
+ r = self.f[i:] != 0
+ assert_array_equal(r, self.ef[i:])
+ r2 = self.f[i:] != np.zeros_like(self.f[i:])
+ r3 = 0 != self.f[i:]
+ assert_array_equal(r, r2)
+ assert_array_equal(r, r3)
+ # check bool == 0x1
+ assert_array_equal(r.view(np.int8), r.astype(np.int8))
+ assert_array_equal(r2.view(np.int8), r2.astype(np.int8))
+ assert_array_equal(r3.view(np.int8), r3.astype(np.int8))
+
+ # isnan on amd64 takes the same code path
+ assert_array_equal(np.isnan(self.nf[i:]), self.ef[i:])
+ assert_array_equal(np.isfinite(self.nf[i:]), ~self.ef[i:])
+ assert_array_equal(np.isfinite(self.inff[i:]), ~self.ef[i:])
+ assert_array_equal(np.isinf(self.inff[i:]), self.efnonan[i:])
+ assert_array_equal(np.signbit(self.signf[i:]), self.ef[i:])
+
+ def test_double(self):
+ # offset for alignment test
+ for i in range(2):
+ assert_array_equal(self.d[i:] > 0, self.ed[i:])
+ assert_array_equal(self.d[i:] - 1 >= 0, self.ed[i:])
+ assert_array_equal(self.d[i:] == 0, ~self.ed[i:])
+ assert_array_equal(-self.d[i:] < 0, self.ed[i:])
+ assert_array_equal(-self.d[i:] + 1 <= 0, self.ed[i:])
+ r = self.d[i:] != 0
+ assert_array_equal(r, self.ed[i:])
+ r2 = self.d[i:] != np.zeros_like(self.d[i:])
+ r3 = 0 != self.d[i:]
+ assert_array_equal(r, r2)
+ assert_array_equal(r, r3)
+ # check bool == 0x1
+ assert_array_equal(r.view(np.int8), r.astype(np.int8))
+ assert_array_equal(r2.view(np.int8), r2.astype(np.int8))
+ assert_array_equal(r3.view(np.int8), r3.astype(np.int8))
+
+ # isnan on amd64 takes the same code path
+ assert_array_equal(np.isnan(self.nd[i:]), self.ed[i:])
+ assert_array_equal(np.isfinite(self.nd[i:]), ~self.ed[i:])
+ assert_array_equal(np.isfinite(self.infd[i:]), ~self.ed[i:])
+ assert_array_equal(np.isinf(self.infd[i:]), self.ednonan[i:])
+ assert_array_equal(np.signbit(self.signd[i:]), self.ed[i:])
+
+
+class TestSeterr(object):
+ def test_default(self):
+ err = np.geterr()
+ assert_equal(err,
+ dict(divide='warn',
+ invalid='warn',
+ over='warn',
+ under='ignore')
+ )
+
+ def test_set(self):
+ with np.errstate():
+ err = np.seterr()
+ old = np.seterr(divide='print')
+ assert_(err == old)
+ new = np.seterr()
+ assert_(new['divide'] == 'print')
+ np.seterr(over='raise')
+ assert_(np.geterr()['over'] == 'raise')
+ assert_(new['divide'] == 'print')
+ np.seterr(**old)
+ assert_(np.geterr() == old)
+
+ @pytest.mark.skipif(platform.machine() == "armv5tel", reason="See gh-413.")
+ def test_divide_err(self):
+ with np.errstate(divide='raise'):
+ with assert_raises(FloatingPointError):
+ np.array([1.]) / np.array([0.])
+
+ np.seterr(divide='ignore')
+ np.array([1.]) / np.array([0.])
+
+ def test_errobj(self):
+ olderrobj = np.geterrobj()
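+        # errobj has the form [bufsize, errmask, errcall]; the low bits of
+        # errmask select the divide-error mode (here 1 ~ 'warn', 3 ~ 'call'),
+        # though that encoding is an implementation detail.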
+ self.called = 0
+ try:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ with np.errstate(divide='warn'):
+ np.seterrobj([20000, 1, None])
+ np.array([1.]) / np.array([0.])
+ assert_equal(len(w), 1)
+
+ def log_err(*args):
+ self.called += 1
+ extobj_err = args
+ assert_(len(extobj_err) == 2)
+ assert_("divide" in extobj_err[0])
+
+ with np.errstate(divide='ignore'):
+ np.seterrobj([20000, 3, log_err])
+ np.array([1.]) / np.array([0.])
+ assert_equal(self.called, 1)
+
+ np.seterrobj(olderrobj)
+ with np.errstate(divide='ignore'):
+ np.divide(1., 0., extobj=[20000, 3, log_err])
+ assert_equal(self.called, 2)
+ finally:
+ np.seterrobj(olderrobj)
+ del self.called
+
+ def test_errobj_noerrmask(self):
+ # errmask = 0 has a special code path for the default
+ olderrobj = np.geterrobj()
+ try:
+ # set errobj to something non default
+ np.seterrobj([umath.UFUNC_BUFSIZE_DEFAULT,
+ umath.ERR_DEFAULT + 1, None])
+ # call a ufunc
+ np.isnan(np.array([6]))
+ # same with the default, lots of times to get rid of possible
+ # pre-existing stack in the code
+ for i in range(10000):
+ np.seterrobj([umath.UFUNC_BUFSIZE_DEFAULT, umath.ERR_DEFAULT,
+ None])
+ np.isnan(np.array([6]))
+ finally:
+ np.seterrobj(olderrobj)
+
+
+class TestFloatExceptions(object):
+ def assert_raises_fpe(self, fpeerr, flop, x, y):
+ ftype = type(x)
+ try:
+ flop(x, y)
+ assert_(False,
+ "Type %s did not raise fpe error '%s'." % (ftype, fpeerr))
+ except FloatingPointError as exc:
+ assert_(str(exc).find(fpeerr) >= 0,
+ "Type %s raised wrong fpe error '%s'." % (ftype, exc))
+
+ def assert_op_raises_fpe(self, fpeerr, flop, sc1, sc2):
+ # Check that fpe exception is raised.
+ #
+ # Given a floating operation `flop` and two scalar values, check that
+ # the operation raises the floating point exception specified by
+ # `fpeerr`. Tests all variants with 0-d array scalars as well.
+
+ self.assert_raises_fpe(fpeerr, flop, sc1, sc2)
+ self.assert_raises_fpe(fpeerr, flop, sc1[()], sc2)
+ self.assert_raises_fpe(fpeerr, flop, sc1, sc2[()])
+ self.assert_raises_fpe(fpeerr, flop, sc1[()], sc2[()])
+
+ def test_floating_exceptions(self):
+ # Test basic arithmetic function errors
+ with np.errstate(all='raise'):
+ # Test for all real and complex float types
+ for typecode in np.typecodes['AllFloat']:
+ ftype = np.obj2sctype(typecode)
+ if np.dtype(ftype).kind == 'f':
+ # Get some extreme values for the type
+ fi = np.finfo(ftype)
+ ft_tiny = fi.tiny
+ ft_max = fi.max
+ ft_eps = fi.eps
+ underflow = 'underflow'
+ divbyzero = 'divide by zero'
+ else:
+                    # complex ('c' kind): use the finfo of the corresponding real dtype
+ rtype = type(ftype(0).real)
+ fi = np.finfo(rtype)
+ ft_tiny = ftype(fi.tiny)
+ ft_max = ftype(fi.max)
+ ft_eps = ftype(fi.eps)
+ # The complex types raise different exceptions
+ underflow = ''
+ divbyzero = ''
+ overflow = 'overflow'
+ invalid = 'invalid'
+
+ self.assert_raises_fpe(underflow,
+ lambda a, b: a/b, ft_tiny, ft_max)
+ self.assert_raises_fpe(underflow,
+ lambda a, b: a*b, ft_tiny, ft_tiny)
+ self.assert_raises_fpe(overflow,
+ lambda a, b: a*b, ft_max, ftype(2))
+ self.assert_raises_fpe(overflow,
+ lambda a, b: a/b, ft_max, ftype(0.5))
+ self.assert_raises_fpe(overflow,
+ lambda a, b: a+b, ft_max, ft_max*ft_eps)
+ self.assert_raises_fpe(overflow,
+ lambda a, b: a-b, -ft_max, ft_max*ft_eps)
+ self.assert_raises_fpe(overflow,
+ np.power, ftype(2), ftype(2**fi.nexp))
+ self.assert_raises_fpe(divbyzero,
+ lambda a, b: a/b, ftype(1), ftype(0))
+ self.assert_raises_fpe(invalid,
+ lambda a, b: a/b, ftype(np.inf), ftype(np.inf))
+ self.assert_raises_fpe(invalid,
+ lambda a, b: a/b, ftype(0), ftype(0))
+ self.assert_raises_fpe(invalid,
+ lambda a, b: a-b, ftype(np.inf), ftype(np.inf))
+ self.assert_raises_fpe(invalid,
+ lambda a, b: a+b, ftype(np.inf), ftype(-np.inf))
+ self.assert_raises_fpe(invalid,
+ lambda a, b: a*b, ftype(0), ftype(np.inf))
+
+ def test_warnings(self):
+ # test warning code path
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ with np.errstate(all="warn"):
+ np.divide(1, 0.)
+ assert_equal(len(w), 1)
+ assert_("divide by zero" in str(w[0].message))
+ np.array(1e300) * np.array(1e300)
+ assert_equal(len(w), 2)
+ assert_("overflow" in str(w[-1].message))
+ np.array(np.inf) - np.array(np.inf)
+ assert_equal(len(w), 3)
+ assert_("invalid value" in str(w[-1].message))
+ np.array(1e-300) * np.array(1e-300)
+ assert_equal(len(w), 4)
+ assert_("underflow" in str(w[-1].message))
+
+
+class TestTypes(object):
+ def check_promotion_cases(self, promote_func):
+ # tests that the scalars get coerced correctly.
+ b = np.bool_(0)
+ i8, i16, i32, i64 = np.int8(0), np.int16(0), np.int32(0), np.int64(0)
+ u8, u16, u32, u64 = np.uint8(0), np.uint16(0), np.uint32(0), np.uint64(0)
+ f32, f64, fld = np.float32(0), np.float64(0), np.longdouble(0)
+ c64, c128, cld = np.complex64(0), np.complex128(0), np.clongdouble(0)
+
+ # coercion within the same kind
+ assert_equal(promote_func(i8, i16), np.dtype(np.int16))
+ assert_equal(promote_func(i32, i8), np.dtype(np.int32))
+ assert_equal(promote_func(i16, i64), np.dtype(np.int64))
+ assert_equal(promote_func(u8, u32), np.dtype(np.uint32))
+ assert_equal(promote_func(f32, f64), np.dtype(np.float64))
+ assert_equal(promote_func(fld, f32), np.dtype(np.longdouble))
+ assert_equal(promote_func(f64, fld), np.dtype(np.longdouble))
+ assert_equal(promote_func(c128, c64), np.dtype(np.complex128))
+ assert_equal(promote_func(cld, c128), np.dtype(np.clongdouble))
+ assert_equal(promote_func(c64, fld), np.dtype(np.clongdouble))
+
+ # coercion between kinds
+ assert_equal(promote_func(b, i32), np.dtype(np.int32))
+ assert_equal(promote_func(b, u8), np.dtype(np.uint8))
+ assert_equal(promote_func(i8, u8), np.dtype(np.int16))
+ assert_equal(promote_func(u8, i32), np.dtype(np.int32))
+ assert_equal(promote_func(i64, u32), np.dtype(np.int64))
+ assert_equal(promote_func(u64, i32), np.dtype(np.float64))
+ assert_equal(promote_func(i32, f32), np.dtype(np.float64))
+ assert_equal(promote_func(i64, f32), np.dtype(np.float64))
+ assert_equal(promote_func(f32, i16), np.dtype(np.float32))
+ assert_equal(promote_func(f32, u32), np.dtype(np.float64))
+ assert_equal(promote_func(f32, c64), np.dtype(np.complex64))
+ assert_equal(promote_func(c128, f32), np.dtype(np.complex128))
+ assert_equal(promote_func(cld, f64), np.dtype(np.clongdouble))
+
+ # coercion between scalars and 1-D arrays
+ assert_equal(promote_func(np.array([b]), i8), np.dtype(np.int8))
+ assert_equal(promote_func(np.array([b]), u8), np.dtype(np.uint8))
+ assert_equal(promote_func(np.array([b]), i32), np.dtype(np.int32))
+ assert_equal(promote_func(np.array([b]), u32), np.dtype(np.uint32))
+ assert_equal(promote_func(np.array([i8]), i64), np.dtype(np.int8))
+ assert_equal(promote_func(u64, np.array([i32])), np.dtype(np.int32))
+ assert_equal(promote_func(i64, np.array([u32])), np.dtype(np.uint32))
+ assert_equal(promote_func(np.int32(-1), np.array([u64])),
+ np.dtype(np.float64))
+ assert_equal(promote_func(f64, np.array([f32])), np.dtype(np.float32))
+ assert_equal(promote_func(fld, np.array([f32])), np.dtype(np.float32))
+ assert_equal(promote_func(np.array([f64]), fld), np.dtype(np.float64))
+ assert_equal(promote_func(fld, np.array([c64])),
+ np.dtype(np.complex64))
+ assert_equal(promote_func(c64, np.array([f64])),
+ np.dtype(np.complex128))
+ assert_equal(promote_func(np.complex64(3j), np.array([f64])),
+ np.dtype(np.complex128))
+
+ # coercion between scalars and 1-D arrays, where
+ # the scalar has greater kind than the array
+ assert_equal(promote_func(np.array([b]), f64), np.dtype(np.float64))
+ assert_equal(promote_func(np.array([b]), i64), np.dtype(np.int64))
+ assert_equal(promote_func(np.array([b]), u64), np.dtype(np.uint64))
+ assert_equal(promote_func(np.array([i8]), f64), np.dtype(np.float64))
+ assert_equal(promote_func(np.array([u16]), f64), np.dtype(np.float64))
+
+ # uint and int are treated as the same "kind" for
+ # the purposes of array-scalar promotion.
+ assert_equal(promote_func(np.array([u16]), i32), np.dtype(np.uint16))
+
+ # float and complex are treated as the same "kind" for
+ # the purposes of array-scalar promotion, so that you can do
+ # (0j + float32array) to get a complex64 array instead of
+ # a complex128 array.
+ assert_equal(promote_func(np.array([f32]), c128),
+ np.dtype(np.complex64))
+
+ def test_coercion(self):
+ def res_type(a, b):
+ return np.add(a, b).dtype
+
+ self.check_promotion_cases(res_type)
+
+ # Use-case: float/complex scalar * bool/int8 array
+ # shouldn't narrow the float/complex type
+ for a in [np.array([True, False]), np.array([-3, 12], dtype=np.int8)]:
+ b = 1.234 * a
+ assert_equal(b.dtype, np.dtype('f8'), "array type %s" % a.dtype)
+ b = np.longdouble(1.234) * a
+ assert_equal(b.dtype, np.dtype(np.longdouble),
+ "array type %s" % a.dtype)
+ b = np.float64(1.234) * a
+ assert_equal(b.dtype, np.dtype('f8'), "array type %s" % a.dtype)
+ b = np.float32(1.234) * a
+ assert_equal(b.dtype, np.dtype('f4'), "array type %s" % a.dtype)
+ b = np.float16(1.234) * a
+ assert_equal(b.dtype, np.dtype('f2'), "array type %s" % a.dtype)
+
+ b = 1.234j * a
+ assert_equal(b.dtype, np.dtype('c16'), "array type %s" % a.dtype)
+ b = np.clongdouble(1.234j) * a
+ assert_equal(b.dtype, np.dtype(np.clongdouble),
+ "array type %s" % a.dtype)
+ b = np.complex128(1.234j) * a
+ assert_equal(b.dtype, np.dtype('c16'), "array type %s" % a.dtype)
+ b = np.complex64(1.234j) * a
+ assert_equal(b.dtype, np.dtype('c8'), "array type %s" % a.dtype)
+
+ # The following use-case is problematic, and to resolve its
+ # tricky side-effects requires more changes.
+ #
+ # Use-case: (1-t)*a, where 't' is a boolean array and 'a' is
+ # a float32, shouldn't promote to float64
+ #
+ # a = np.array([1.0, 1.5], dtype=np.float32)
+ # t = np.array([True, False])
+ # b = t*a
+ # assert_equal(b, [1.0, 0.0])
+ # assert_equal(b.dtype, np.dtype('f4'))
+ # b = (1-t)*a
+ # assert_equal(b, [0.0, 1.5])
+ # assert_equal(b.dtype, np.dtype('f4'))
+ #
+ # Probably ~t (bitwise negation) is more proper to use here,
+ # but this is arguably less intuitive to understand at a glance, and
+ # would fail if 't' is actually an integer array instead of boolean:
+ #
+ # b = (~t)*a
+ # assert_equal(b, [0.0, 1.5])
+ # assert_equal(b.dtype, np.dtype('f4'))
+
+ def test_result_type(self):
+ self.check_promotion_cases(np.result_type)
+ assert_(np.result_type(None) == np.dtype(None))
+
+ def test_promote_types_endian(self):
+ # promote_types should always return native-endian types
+ assert_equal(np.promote_types('<i8', '<i8'), np.dtype('i8'))
+ assert_equal(np.promote_types('>i8', '>i8'), np.dtype('i8'))
+
+ assert_equal(np.promote_types('>i8', '>U16'), np.dtype('U21'))
+ assert_equal(np.promote_types('<i8', '<U16'), np.dtype('U21'))
+ assert_equal(np.promote_types('>U16', '>i8'), np.dtype('U21'))
+ assert_equal(np.promote_types('<U16', '<i8'), np.dtype('U21'))
+
+ assert_equal(np.promote_types('<S5', '<U8'), np.dtype('U8'))
+ assert_equal(np.promote_types('>S5', '>U8'), np.dtype('U8'))
+ assert_equal(np.promote_types('<U8', '<S5'), np.dtype('U8'))
+ assert_equal(np.promote_types('>U8', '>S5'), np.dtype('U8'))
+ assert_equal(np.promote_types('<U5', '<U8'), np.dtype('U8'))
+ assert_equal(np.promote_types('>U8', '>U5'), np.dtype('U8'))
+
+ assert_equal(np.promote_types('<M8', '<M8'), np.dtype('M8'))
+ assert_equal(np.promote_types('>M8', '>M8'), np.dtype('M8'))
+ assert_equal(np.promote_types('<m8', '<m8'), np.dtype('m8'))
+ assert_equal(np.promote_types('>m8', '>m8'), np.dtype('m8'))
+
+ def test_promote_types_strings(self):
+ assert_equal(np.promote_types('bool', 'S'), np.dtype('S5'))
+ assert_equal(np.promote_types('b', 'S'), np.dtype('S4'))
+ assert_equal(np.promote_types('u1', 'S'), np.dtype('S3'))
+ assert_equal(np.promote_types('u2', 'S'), np.dtype('S5'))
+ assert_equal(np.promote_types('u4', 'S'), np.dtype('S10'))
+ assert_equal(np.promote_types('u8', 'S'), np.dtype('S20'))
+ assert_equal(np.promote_types('i1', 'S'), np.dtype('S4'))
+ assert_equal(np.promote_types('i2', 'S'), np.dtype('S6'))
+ assert_equal(np.promote_types('i4', 'S'), np.dtype('S11'))
+ assert_equal(np.promote_types('i8', 'S'), np.dtype('S21'))
+ assert_equal(np.promote_types('bool', 'U'), np.dtype('U5'))
+ assert_equal(np.promote_types('b', 'U'), np.dtype('U4'))
+ assert_equal(np.promote_types('u1', 'U'), np.dtype('U3'))
+ assert_equal(np.promote_types('u2', 'U'), np.dtype('U5'))
+ assert_equal(np.promote_types('u4', 'U'), np.dtype('U10'))
+ assert_equal(np.promote_types('u8', 'U'), np.dtype('U20'))
+ assert_equal(np.promote_types('i1', 'U'), np.dtype('U4'))
+ assert_equal(np.promote_types('i2', 'U'), np.dtype('U6'))
+ assert_equal(np.promote_types('i4', 'U'), np.dtype('U11'))
+ assert_equal(np.promote_types('i8', 'U'), np.dtype('U21'))
+ assert_equal(np.promote_types('bool', 'S1'), np.dtype('S5'))
+ assert_equal(np.promote_types('bool', 'S30'), np.dtype('S30'))
+ assert_equal(np.promote_types('b', 'S1'), np.dtype('S4'))
+ assert_equal(np.promote_types('b', 'S30'), np.dtype('S30'))
+ assert_equal(np.promote_types('u1', 'S1'), np.dtype('S3'))
+ assert_equal(np.promote_types('u1', 'S30'), np.dtype('S30'))
+ assert_equal(np.promote_types('u2', 'S1'), np.dtype('S5'))
+ assert_equal(np.promote_types('u2', 'S30'), np.dtype('S30'))
+ assert_equal(np.promote_types('u4', 'S1'), np.dtype('S10'))
+ assert_equal(np.promote_types('u4', 'S30'), np.dtype('S30'))
+ assert_equal(np.promote_types('u8', 'S1'), np.dtype('S20'))
+ assert_equal(np.promote_types('u8', 'S30'), np.dtype('S30'))
+
+ def test_can_cast(self):
+ assert_(np.can_cast(np.int32, np.int64))
+ assert_(np.can_cast(np.float64, complex))
+ assert_(not np.can_cast(complex, float))
+
+ assert_(np.can_cast('i8', 'f8'))
+ assert_(not np.can_cast('i8', 'f4'))
+ assert_(np.can_cast('i4', 'S11'))
+
+ assert_(np.can_cast('i8', 'i8', 'no'))
+ assert_(not np.can_cast('<i8', '>i8', 'no'))
+
+ assert_(np.can_cast('<i8', '>i8', 'equiv'))
+ assert_(not np.can_cast('<i4', '>i8', 'equiv'))
+
+ assert_(np.can_cast('<i4', '>i8', 'safe'))
+ assert_(not np.can_cast('<i8', '>i4', 'safe'))
+
+ assert_(np.can_cast('<i8', '>i4', 'same_kind'))
+ assert_(not np.can_cast('<i8', '>u4', 'same_kind'))
+
+ assert_(np.can_cast('<i8', '>u4', 'unsafe'))
+
+ assert_(np.can_cast('bool', 'S5'))
+ assert_(not np.can_cast('bool', 'S4'))
+
+ assert_(np.can_cast('b', 'S4'))
+ assert_(not np.can_cast('b', 'S3'))
+
+ assert_(np.can_cast('u1', 'S3'))
+ assert_(not np.can_cast('u1', 'S2'))
+ assert_(np.can_cast('u2', 'S5'))
+ assert_(not np.can_cast('u2', 'S4'))
+ assert_(np.can_cast('u4', 'S10'))
+ assert_(not np.can_cast('u4', 'S9'))
+ assert_(np.can_cast('u8', 'S20'))
+ assert_(not np.can_cast('u8', 'S19'))
+
+ assert_(np.can_cast('i1', 'S4'))
+ assert_(not np.can_cast('i1', 'S3'))
+ assert_(np.can_cast('i2', 'S6'))
+ assert_(not np.can_cast('i2', 'S5'))
+ assert_(np.can_cast('i4', 'S11'))
+ assert_(not np.can_cast('i4', 'S10'))
+ assert_(np.can_cast('i8', 'S21'))
+ assert_(not np.can_cast('i8', 'S20'))
+
+        assert_(np.can_cast('bool', 'U5'))
+        assert_(not np.can_cast('bool', 'U4'))
+
+ assert_(np.can_cast('b', 'U4'))
+ assert_(not np.can_cast('b', 'U3'))
+
+ assert_(np.can_cast('u1', 'U3'))
+ assert_(not np.can_cast('u1', 'U2'))
+ assert_(np.can_cast('u2', 'U5'))
+ assert_(not np.can_cast('u2', 'U4'))
+ assert_(np.can_cast('u4', 'U10'))
+ assert_(not np.can_cast('u4', 'U9'))
+ assert_(np.can_cast('u8', 'U20'))
+ assert_(not np.can_cast('u8', 'U19'))
+
+ assert_(np.can_cast('i1', 'U4'))
+ assert_(not np.can_cast('i1', 'U3'))
+ assert_(np.can_cast('i2', 'U6'))
+ assert_(not np.can_cast('i2', 'U5'))
+ assert_(np.can_cast('i4', 'U11'))
+ assert_(not np.can_cast('i4', 'U10'))
+ assert_(np.can_cast('i8', 'U21'))
+ assert_(not np.can_cast('i8', 'U20'))
+
+ assert_raises(TypeError, np.can_cast, 'i4', None)
+ assert_raises(TypeError, np.can_cast, None, 'i4')
+
+ # Also test keyword arguments
+ assert_(np.can_cast(from_=np.int32, to=np.int64))
+
+ def test_can_cast_simple_to_structured(self):
+ # Non-structured can only be cast to structured in 'unsafe' mode.
+ assert_(not np.can_cast('i4', 'i4,i4'))
+ assert_(not np.can_cast('i4', 'i4,i2'))
+ assert_(np.can_cast('i4', 'i4,i4', casting='unsafe'))
+ assert_(np.can_cast('i4', 'i4,i2', casting='unsafe'))
+ # Even if there is just a single field which is OK.
+ assert_(not np.can_cast('i2', [('f1', 'i4')]))
+ assert_(not np.can_cast('i2', [('f1', 'i4')], casting='same_kind'))
+ assert_(np.can_cast('i2', [('f1', 'i4')], casting='unsafe'))
+ # It should be the same for recursive structured or subarrays.
+ assert_(not np.can_cast('i2', [('f1', 'i4,i4')]))
+ assert_(np.can_cast('i2', [('f1', 'i4,i4')], casting='unsafe'))
+ assert_(not np.can_cast('i2', [('f1', '(2,3)i4')]))
+ assert_(np.can_cast('i2', [('f1', '(2,3)i4')], casting='unsafe'))
+
+ def test_can_cast_structured_to_simple(self):
+ # Need unsafe casting for structured to simple.
+ assert_(not np.can_cast([('f1', 'i4')], 'i4'))
+ assert_(np.can_cast([('f1', 'i4')], 'i4', casting='unsafe'))
+ assert_(np.can_cast([('f1', 'i4')], 'i2', casting='unsafe'))
+ # Since it is unclear what is being cast, multiple fields to
+ # single should not work even for unsafe casting.
+ assert_(not np.can_cast('i4,i4', 'i4', casting='unsafe'))
+ # But a single field inside a single field is OK.
+ assert_(not np.can_cast([('f1', [('x', 'i4')])], 'i4'))
+ assert_(np.can_cast([('f1', [('x', 'i4')])], 'i4', casting='unsafe'))
+ # And a subarray is fine too - it will just take the first element
+ # (arguably not very consistently; might also take the first field).
+ assert_(not np.can_cast([('f0', '(3,)i4')], 'i4'))
+ assert_(np.can_cast([('f0', '(3,)i4')], 'i4', casting='unsafe'))
+ # But a structured subarray with multiple fields should fail.
+ assert_(not np.can_cast([('f0', ('i4,i4'), (2,))], 'i4',
+ casting='unsafe'))
+
+ def test_can_cast_values(self):
+ # gh-5917
+ for dt in np.sctypes['int'] + np.sctypes['uint']:
+ ii = np.iinfo(dt)
+ assert_(np.can_cast(ii.min, dt))
+ assert_(np.can_cast(ii.max, dt))
+ assert_(not np.can_cast(ii.min - 1, dt))
+ assert_(not np.can_cast(ii.max + 1, dt))
+
+ for dt in np.sctypes['float']:
+ fi = np.finfo(dt)
+ assert_(np.can_cast(fi.min, dt))
+ assert_(np.can_cast(fi.max, dt))
+
+
+# Custom exception class to test exception propagation in fromiter
+class NIterError(Exception):
+ pass
+
+
+class TestFromiter(object):
+ def makegen(self):
+ for x in range(24):
+ yield x**2
+
+ def test_types(self):
+ ai32 = np.fromiter(self.makegen(), np.int32)
+ ai64 = np.fromiter(self.makegen(), np.int64)
+ af = np.fromiter(self.makegen(), float)
+ assert_(ai32.dtype == np.dtype(np.int32))
+ assert_(ai64.dtype == np.dtype(np.int64))
+ assert_(af.dtype == np.dtype(float))
+
+ def test_lengths(self):
+ expected = np.array(list(self.makegen()))
+ a = np.fromiter(self.makegen(), int)
+ a20 = np.fromiter(self.makegen(), int, 20)
+ assert_(len(a) == len(expected))
+ assert_(len(a20) == 20)
+ assert_raises(ValueError, np.fromiter,
+ self.makegen(), int, len(expected) + 10)
+
+ def test_values(self):
+ expected = np.array(list(self.makegen()))
+ a = np.fromiter(self.makegen(), int)
+ a20 = np.fromiter(self.makegen(), int, 20)
+ assert_(np.alltrue(a == expected, axis=0))
+ assert_(np.alltrue(a20 == expected[:20], axis=0))
+
+ def load_data(self, n, eindex):
+ # Utility method for the issue 2592 tests.
+ # Raise an exception at the desired index in the iterator.
+ for e in range(n):
+ if e == eindex:
+ raise NIterError('error at index %s' % eindex)
+ yield e
+
+ def test_2592(self):
+ # Test iteration exceptions are correctly raised.
+ count, eindex = 10, 5
+ assert_raises(NIterError, np.fromiter,
+ self.load_data(count, eindex), dtype=int, count=count)
+
+ def test_2592_edge(self):
+ # Test iter. exceptions, edge case (exception at end of iterator).
+ count = 10
+ eindex = count-1
+ assert_raises(NIterError, np.fromiter,
+ self.load_data(count, eindex), dtype=int, count=count)
+
+
+class TestNonzero(object):
+ def test_nonzero_trivial(self):
+ assert_equal(np.count_nonzero(np.array([])), 0)
+ assert_equal(np.count_nonzero(np.array([], dtype='?')), 0)
+ assert_equal(np.nonzero(np.array([])), ([],))
+
+ assert_equal(np.count_nonzero(np.array(0)), 0)
+ assert_equal(np.count_nonzero(np.array(0, dtype='?')), 0)
+ assert_equal(np.nonzero(np.array(0)), ([],))
+ assert_equal(np.count_nonzero(np.array(1)), 1)
+ assert_equal(np.count_nonzero(np.array(1, dtype='?')), 1)
+ assert_equal(np.nonzero(np.array(1)), ([0],))
+
+ def test_nonzero_onedim(self):
+ x = np.array([1, 0, 2, -1, 0, 0, 8])
+ assert_equal(np.count_nonzero(x), 4)
+ assert_equal(np.count_nonzero(x), 4)
+ assert_equal(np.nonzero(x), ([0, 2, 3, 6],))
+
+ x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
+ dtype=[('a', 'i4'), ('b', 'i2')])
+ assert_equal(np.count_nonzero(x['a']), 3)
+ assert_equal(np.count_nonzero(x['b']), 4)
+ assert_equal(np.nonzero(x['a']), ([0, 2, 3],))
+ assert_equal(np.nonzero(x['b']), ([0, 2, 3, 4],))
+
+ def test_nonzero_twodim(self):
+ x = np.array([[0, 1, 0], [2, 0, 3]])
+ assert_equal(np.count_nonzero(x), 3)
+ assert_equal(np.nonzero(x), ([0, 1, 1], [1, 0, 2]))
+
+ x = np.eye(3)
+ assert_equal(np.count_nonzero(x), 3)
+ assert_equal(np.nonzero(x), ([0, 1, 2], [0, 1, 2]))
+
+ x = np.array([[(0, 1), (0, 0), (1, 11)],
+ [(1, 1), (1, 0), (0, 0)],
+ [(0, 0), (1, 5), (0, 1)]], dtype=[('a', 'f4'), ('b', 'u1')])
+ assert_equal(np.count_nonzero(x['a']), 4)
+ assert_equal(np.count_nonzero(x['b']), 5)
+ assert_equal(np.nonzero(x['a']), ([0, 1, 1, 2], [2, 0, 1, 1]))
+ assert_equal(np.nonzero(x['b']), ([0, 0, 1, 2, 2], [0, 2, 0, 1, 2]))
+
+ assert_(not x['a'].T.flags.aligned)
+ assert_equal(np.count_nonzero(x['a'].T), 4)
+ assert_equal(np.count_nonzero(x['b'].T), 5)
+ assert_equal(np.nonzero(x['a'].T), ([0, 1, 1, 2], [1, 1, 2, 0]))
+ assert_equal(np.nonzero(x['b'].T), ([0, 0, 1, 2, 2], [0, 1, 2, 0, 2]))
+
+ def test_sparse(self):
+ # test special sparse condition boolean code path
+ for i in range(20):
+ c = np.zeros(200, dtype=bool)
+ c[i::20] = True
+ assert_equal(np.nonzero(c)[0], np.arange(i, 200 + i, 20))
+
+ c = np.zeros(400, dtype=bool)
+ c[10 + i:20 + i] = True
+ c[20 + i*2] = True
+ assert_equal(np.nonzero(c)[0],
+ np.concatenate((np.arange(10 + i, 20 + i), [20 + i*2])))
+
+ def test_return_type(self):
+ class C(np.ndarray):
+ pass
+
+ for view in (C, np.ndarray):
+ for nd in range(1, 4):
+ shape = tuple(range(2, 2+nd))
+ x = np.arange(np.prod(shape)).reshape(shape).view(view)
+ for nzx in (np.nonzero(x), x.nonzero()):
+ for nzx_i in nzx:
+ assert_(type(nzx_i) is np.ndarray)
+ assert_(nzx_i.flags.writeable)
+
+ def test_count_nonzero_axis(self):
+ # Basic check of functionality
+ m = np.array([[0, 1, 7, 0, 0], [3, 0, 0, 2, 19]])
+
+ expected = np.array([1, 1, 1, 1, 1])
+ assert_equal(np.count_nonzero(m, axis=0), expected)
+
+ expected = np.array([2, 3])
+ assert_equal(np.count_nonzero(m, axis=1), expected)
+
+ assert_raises(ValueError, np.count_nonzero, m, axis=(1, 1))
+ assert_raises(TypeError, np.count_nonzero, m, axis='foo')
+ assert_raises(np.AxisError, np.count_nonzero, m, axis=3)
+ assert_raises(TypeError, np.count_nonzero,
+ m, axis=np.array([[1], [2]]))
+
+ def test_count_nonzero_axis_all_dtypes(self):
+ # More thorough test that the axis argument is respected
+ # for all dtypes and responds correctly when presented with
+ # either integer or tuple arguments for axis
+ msg = "Mismatch for dtype: %s"
+
+ def assert_equal_w_dt(a, b, err_msg):
+ assert_equal(a.dtype, b.dtype, err_msg=err_msg)
+ assert_equal(a, b, err_msg=err_msg)
+
+ for dt in np.typecodes['All']:
+ err_msg = msg % (np.dtype(dt).name,)
+
+ if dt != 'V':
+ if dt != 'M':
+ m = np.zeros((3, 3), dtype=dt)
+ n = np.ones(1, dtype=dt)
+
+ m[0, 0] = n[0]
+ m[1, 0] = n[0]
+
+ else: # np.zeros doesn't work for np.datetime64
+ m = np.array(['1970-01-01'] * 9)
+ m = m.reshape((3, 3))
+
+ m[0, 0] = '1970-01-12'
+ m[1, 0] = '1970-01-12'
+ m = m.astype(dt)
+
+ expected = np.array([2, 0, 0], dtype=np.intp)
+ assert_equal_w_dt(np.count_nonzero(m, axis=0),
+ expected, err_msg=err_msg)
+
+ expected = np.array([1, 1, 0], dtype=np.intp)
+ assert_equal_w_dt(np.count_nonzero(m, axis=1),
+ expected, err_msg=err_msg)
+
+ expected = np.array(2)
+ assert_equal(np.count_nonzero(m, axis=(0, 1)),
+ expected, err_msg=err_msg)
+ assert_equal(np.count_nonzero(m, axis=None),
+ expected, err_msg=err_msg)
+ assert_equal(np.count_nonzero(m),
+ expected, err_msg=err_msg)
+
+ if dt == 'V':
+ # There are no 'nonzero' objects for np.void, so the testing
+ # setup is slightly different for this dtype
+ m = np.array([np.void(1)] * 6).reshape((2, 3))
+
+ expected = np.array([0, 0, 0], dtype=np.intp)
+ assert_equal_w_dt(np.count_nonzero(m, axis=0),
+ expected, err_msg=err_msg)
+
+ expected = np.array([0, 0], dtype=np.intp)
+ assert_equal_w_dt(np.count_nonzero(m, axis=1),
+ expected, err_msg=err_msg)
+
+ expected = np.array(0)
+ assert_equal(np.count_nonzero(m, axis=(0, 1)),
+ expected, err_msg=err_msg)
+ assert_equal(np.count_nonzero(m, axis=None),
+ expected, err_msg=err_msg)
+ assert_equal(np.count_nonzero(m),
+ expected, err_msg=err_msg)
+
+ def test_count_nonzero_axis_consistent(self):
+ # Check that the axis behaviour for valid axes in
+ # non-special cases is consistent (and therefore
+ # correct) by checking it against an integer array
+        # that is then cast to the generic object dtype
+ from itertools import combinations, permutations
+
+ axis = (0, 1, 2, 3)
+ size = (5, 5, 5, 5)
+ msg = "Mismatch for axis: %s"
+
+ rng = np.random.RandomState(1234)
+ m = rng.randint(-100, 100, size=size)
+ n = m.astype(object)
+
+ for length in range(len(axis)):
+ for combo in combinations(axis, length):
+ for perm in permutations(combo):
+ assert_equal(
+ np.count_nonzero(m, axis=perm),
+ np.count_nonzero(n, axis=perm),
+ err_msg=msg % (perm,))
+
+ def test_countnonzero_axis_empty(self):
+ a = np.array([[0, 0, 1], [1, 0, 1]])
+ assert_equal(np.count_nonzero(a, axis=()), a.astype(bool))
+
+ def test_array_method(self):
+ # Tests that the array method
+ # call to nonzero works
+ m = np.array([[1, 0, 0], [4, 0, 6]])
+ tgt = [[0, 1, 1], [0, 0, 2]]
+
+ assert_equal(m.nonzero(), tgt)
+
+ def test_nonzero_invalid_object(self):
+ # gh-9295
+ a = np.array([np.array([1, 2]), 3])
+ assert_raises(ValueError, np.nonzero, a)
+
+ class BoolErrors:
+ def __bool__(self):
+ raise ValueError("Not allowed")
+ def __nonzero__(self):
+ raise ValueError("Not allowed")
+
+ assert_raises(ValueError, np.nonzero, np.array([BoolErrors()]))
+
+
+class TestIndex(object):
+ def test_boolean(self):
+ a = rand(3, 5, 8)
+ V = rand(5, 8)
+ g1 = randint(0, 5, size=15)
+ g2 = randint(0, 8, size=15)
+ V[g1, g2] = -V[g1, g2]
+ assert_((np.array([a[0][V > 0], a[1][V > 0], a[2][V > 0]]) == a[:, V > 0]).all())
+
+ def test_boolean_edgecase(self):
+ a = np.array([], dtype='int32')
+ b = np.array([], dtype='bool')
+ c = a[b]
+ assert_equal(c, [])
+ assert_equal(c.dtype, np.dtype('int32'))
+
+
+class TestBinaryRepr(object):
+ def test_zero(self):
+ assert_equal(np.binary_repr(0), '0')
+
+ def test_positive(self):
+ assert_equal(np.binary_repr(10), '1010')
+ assert_equal(np.binary_repr(12522),
+ '11000011101010')
+ assert_equal(np.binary_repr(10736848),
+ '101000111101010011010000')
+
+ def test_negative(self):
+ assert_equal(np.binary_repr(-1), '-1')
+ assert_equal(np.binary_repr(-10), '-1010')
+ assert_equal(np.binary_repr(-12522),
+ '-11000011101010')
+ assert_equal(np.binary_repr(-10736848),
+ '-101000111101010011010000')
+
+ def test_sufficient_width(self):
+ assert_equal(np.binary_repr(0, width=5), '00000')
+ assert_equal(np.binary_repr(10, width=7), '0001010')
+ assert_equal(np.binary_repr(-5, width=7), '1111011')
+
+ def test_neg_width_boundaries(self):
+ # see gh-8670
+
+ # Ensure that the example in the issue does not
+ # break before proceeding to a more thorough test.
+ assert_equal(np.binary_repr(-128, width=8), '10000000')
+
+ for width in range(1, 11):
+ num = -2**(width - 1)
+ exp = '1' + (width - 1) * '0'
+ assert_equal(np.binary_repr(num, width=width), exp)
+
+
+class TestBaseRepr(object):
+ def test_base3(self):
+ assert_equal(np.base_repr(3**5, 3), '100000')
+
+ def test_positive(self):
+ assert_equal(np.base_repr(12, 10), '12')
+ assert_equal(np.base_repr(12, 10, 4), '000012')
+ assert_equal(np.base_repr(12, 4), '30')
+ assert_equal(np.base_repr(3731624803700888, 36), '10QR0ROFCEW')
+
+ def test_negative(self):
+ assert_equal(np.base_repr(-12, 10), '-12')
+ assert_equal(np.base_repr(-12, 10, 4), '-000012')
+ assert_equal(np.base_repr(-12, 4), '-30')
+
+ def test_base_range(self):
+ with assert_raises(ValueError):
+ np.base_repr(1, 1)
+ with assert_raises(ValueError):
+ np.base_repr(1, 37)
+
+
+class TestArrayComparisons(object):
+ def test_array_equal(self):
+ res = np.array_equal(np.array([1, 2]), np.array([1, 2]))
+ assert_(res)
+ assert_(type(res) is bool)
+ res = np.array_equal(np.array([1, 2]), np.array([1, 2, 3]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equal(np.array([1, 2]), np.array([3, 4]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equal(np.array([1, 2]), np.array([1, 3]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equal(np.array(['a'], dtype='S1'), np.array(['a'], dtype='S1'))
+ assert_(res)
+ assert_(type(res) is bool)
+ res = np.array_equal(np.array([('a', 1)], dtype='S1,u4'),
+ np.array([('a', 1)], dtype='S1,u4'))
+ assert_(res)
+ assert_(type(res) is bool)
+
+ def test_none_compares_elementwise(self):
+ a = np.array([None, 1, None], dtype=object)
+ assert_equal(a == None, [True, False, True])
+ assert_equal(a != None, [False, True, False])
+
+ a = np.ones(3)
+ assert_equal(a == None, [False, False, False])
+ assert_equal(a != None, [True, True, True])
+
+ def test_array_equiv(self):
+ res = np.array_equiv(np.array([1, 2]), np.array([1, 2]))
+ assert_(res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 2]), np.array([1, 2, 3]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 2]), np.array([3, 4]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 2]), np.array([1, 3]))
+ assert_(not res)
+ assert_(type(res) is bool)
+
+ res = np.array_equiv(np.array([1, 1]), np.array([1]))
+ assert_(res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 1]), np.array([[1], [1]]))
+ assert_(res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 2]), np.array([2]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 2]), np.array([[1], [2]]))
+ assert_(not res)
+ assert_(type(res) is bool)
+ res = np.array_equiv(np.array([1, 2]), np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
+ assert_(not res)
+ assert_(type(res) is bool)
+
+
+def assert_array_strict_equal(x, y):
+ assert_array_equal(x, y)
+ # Check flags, 32 bit arches typically don't provide 16 byte alignment
+ if ((x.dtype.alignment <= 8 or
+ np.intp().dtype.itemsize != 4) and
+ sys.platform != 'win32'):
+ assert_(x.flags == y.flags)
+ else:
+ assert_(x.flags.owndata == y.flags.owndata)
+ assert_(x.flags.writeable == y.flags.writeable)
+ assert_(x.flags.c_contiguous == y.flags.c_contiguous)
+ assert_(x.flags.f_contiguous == y.flags.f_contiguous)
+ assert_(x.flags.writebackifcopy == y.flags.writebackifcopy)
+ # check endianness
+ assert_(x.dtype.isnative == y.dtype.isnative)
+
+
+class TestClip(object):
+ def setup(self):
+ self.nr = 5
+ self.nc = 3
+
+ def fastclip(self, a, m, M, out=None):
+ if out is None:
+ return a.clip(m, M)
+ else:
+ return a.clip(m, M, out)
+
+ def clip(self, a, m, M, out=None):
+ # use slow-clip
+ selector = np.less(a, m) + 2*np.greater(a, M)
+ return selector.choose((a, m, M), out=out)
+
+ # Handy functions
+ def _generate_data(self, n, m):
+ return randn(n, m)
+
+ def _generate_data_complex(self, n, m):
+ return randn(n, m) + 1.j * rand(n, m)
+
+ def _generate_flt_data(self, n, m):
+ return (randn(n, m)).astype(np.float32)
+
+ def _neg_byteorder(self, a):
+ a = np.asarray(a)
+ if sys.byteorder == 'little':
+ a = a.astype(a.dtype.newbyteorder('>'))
+ else:
+ a = a.astype(a.dtype.newbyteorder('<'))
+ return a
+
+ def _generate_non_native_data(self, n, m):
+ data = randn(n, m)
+ data = self._neg_byteorder(data)
+ assert_(not data.dtype.isnative)
+ return data
+
+ def _generate_int_data(self, n, m):
+ return (10 * rand(n, m)).astype(np.int64)
+
+ def _generate_int32_data(self, n, m):
+ return (10 * rand(n, m)).astype(np.int32)
+
+ # Now the real test cases
+ def test_simple_double(self):
+ # Test native double input with scalar min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = 0.1
+ M = 0.6
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_int(self):
+ # Test native int input with scalar min/max.
+ a = self._generate_int_data(self.nr, self.nc)
+ a = a.astype(int)
+ m = -2
+ M = 4
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_array_double(self):
+ # Test native double input with array min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = np.zeros(a.shape)
+ M = m + 0.5
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_nonnative(self):
+ # Test non native double input with scalar min/max.
+ # Test native double input with non native double scalar min/max.
+ a = self._generate_non_native_data(self.nr, self.nc)
+ m = -0.5
+ M = 0.6
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_equal(ac, act)
+
+ # Test native double input with non native double scalar min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5
+ M = self._neg_byteorder(0.6)
+ assert_(not M.dtype.isnative)
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_equal(ac, act)
+
+ def test_simple_complex(self):
+ # Test native complex input with native double scalar min/max.
+ # Test native input with complex double scalar min/max.
+ a = 3 * self._generate_data_complex(self.nr, self.nc)
+ m = -0.5
+ M = 1.
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ # Test native input with complex double scalar min/max.
+ a = 3 * self._generate_data(self.nr, self.nc)
+ m = -0.5 + 1.j
+ M = 1. + 2.j
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_complex(self):
+ # Address Issue gh-5354 for clipping complex arrays
+ # Test native complex input without explicit min/max
+        # i.e., either min=None or max=None
+ a = np.ones(10, dtype=complex)
+ m = a.min()
+ M = a.max()
+ am = self.fastclip(a, m, None)
+ aM = self.fastclip(a, None, M)
+ assert_array_strict_equal(am, a)
+ assert_array_strict_equal(aM, a)
+
+ def test_clip_non_contig(self):
+ # Test clip for non contiguous native input and native scalar min/max.
+ a = self._generate_data(self.nr * 2, self.nc * 3)
+ a = a[::2, ::3]
+ assert_(not a.flags['F_CONTIGUOUS'])
+ assert_(not a.flags['C_CONTIGUOUS'])
+ ac = self.fastclip(a, -1.6, 1.7)
+ act = self.clip(a, -1.6, 1.7)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_out(self):
+ # Test native double input with scalar min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5
+ M = 0.6
+ ac = np.zeros(a.shape)
+ act = np.zeros(a.shape)
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_int32_inout(self):
+ # Test native int32 input with double min/max and int32 out.
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.float64(0)
+ M = np.float64(2)
+ ac = np.zeros(a.shape, dtype=np.int32)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_int64_out(self):
+ # Test native int32 input with int32 scalar min/max and int64 out.
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.int32(-1)
+ M = np.int32(1)
+ ac = np.zeros(a.shape, dtype=np.int64)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_int64_inout(self):
+ # Test native int32 input with double array min/max and int32 out.
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.zeros(a.shape, np.float64)
+ M = np.float64(1)
+ ac = np.zeros(a.shape, dtype=np.int32)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_int32_out(self):
+ # Test native double input with scalar min/max and int out.
+ a = self._generate_data(self.nr, self.nc)
+ m = -1.0
+ M = 2.0
+ ac = np.zeros(a.shape, dtype=np.int32)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_simple_inplace_01(self):
+ # Test native double input with array min/max in-place.
+ a = self._generate_data(self.nr, self.nc)
+ ac = a.copy()
+ m = np.zeros(a.shape)
+ M = 1.0
+ self.fastclip(a, m, M, a)
+ self.clip(a, m, M, ac)
+ assert_array_strict_equal(a, ac)
+
+ def test_simple_inplace_02(self):
+ # Test native double input with scalar min/max in-place.
+ a = self._generate_data(self.nr, self.nc)
+ ac = a.copy()
+ m = -0.5
+ M = 0.6
+ self.fastclip(a, m, M, a)
+ self.clip(ac, m, M, ac)
+ assert_array_strict_equal(a, ac)
+
+ def test_noncontig_inplace(self):
+ # Test non contiguous double input with double scalar min/max in-place.
+ a = self._generate_data(self.nr * 2, self.nc * 3)
+ a = a[::2, ::3]
+ assert_(not a.flags['F_CONTIGUOUS'])
+ assert_(not a.flags['C_CONTIGUOUS'])
+ ac = a.copy()
+ m = -0.5
+ M = 0.6
+ self.fastclip(a, m, M, a)
+ self.clip(ac, m, M, ac)
+ assert_array_equal(a, ac)
+
+ def test_type_cast_01(self):
+ # Test native double input with scalar min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5
+ M = 0.6
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_02(self):
+ # Test native int32 input with int32 scalar min/max.
+ a = self._generate_int_data(self.nr, self.nc)
+ a = a.astype(np.int32)
+ m = -2
+ M = 4
+ ac = self.fastclip(a, m, M)
+ act = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_03(self):
+ # Test native int32 input with float64 scalar min/max.
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = -2
+ M = 4
+ ac = self.fastclip(a, np.float64(m), np.float64(M))
+ act = self.clip(a, np.float64(m), np.float64(M))
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_04(self):
+ # Test native int32 input with float32 scalar min/max.
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.float32(-2)
+ M = np.float32(4)
+ act = self.fastclip(a, m, M)
+ ac = self.clip(a, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_05(self):
+ # Test native int32 with double arrays min/max.
+ a = self._generate_int_data(self.nr, self.nc)
+ m = -0.5
+ M = 1.
+ ac = self.fastclip(a, m * np.zeros(a.shape), M)
+ act = self.clip(a, m * np.zeros(a.shape), M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_06(self):
+ # Test native with NON native scalar min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = 0.5
+ m_s = self._neg_byteorder(m)
+ M = 1.
+ act = self.clip(a, m_s, M)
+ ac = self.fastclip(a, m_s, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_07(self):
+ # Test NON native with native array min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5 * np.ones(a.shape)
+ M = 1.
+ a_s = self._neg_byteorder(a)
+ assert_(not a_s.dtype.isnative)
+ act = a_s.clip(m, M)
+ ac = self.fastclip(a_s, m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_08(self):
+ # Test NON native with native scalar min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5
+ M = 1.
+ a_s = self._neg_byteorder(a)
+ assert_(not a_s.dtype.isnative)
+ ac = self.fastclip(a_s, m, M)
+ act = a_s.clip(m, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_09(self):
+ # Test native with NON native array min/max.
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5 * np.ones(a.shape)
+ M = 1.
+ m_s = self._neg_byteorder(m)
+ assert_(not m_s.dtype.isnative)
+ ac = self.fastclip(a, m_s, M)
+ act = self.clip(a, m_s, M)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_10(self):
+ # Test native int32 with float min/max and float out for output argument.
+ a = self._generate_int_data(self.nr, self.nc)
+ b = np.zeros(a.shape, dtype=np.float32)
+ m = np.float32(-0.5)
+ M = np.float32(1)
+ act = self.clip(a, m, M, out=b)
+ ac = self.fastclip(a, m, M, out=b)
+ assert_array_strict_equal(ac, act)
+
+ def test_type_cast_11(self):
+ # Test non native with native scalar, min/max, out non native
+ a = self._generate_non_native_data(self.nr, self.nc)
+ b = a.copy()
+ b = b.astype(b.dtype.newbyteorder('>'))
+ bt = b.copy()
+ m = -0.5
+ M = 1.
+ self.fastclip(a, m, M, out=b)
+ self.clip(a, m, M, out=bt)
+ assert_array_strict_equal(b, bt)
+
+ def test_type_cast_12(self):
+ # Test native int32 input and min/max and float out
+ a = self._generate_int_data(self.nr, self.nc)
+ b = np.zeros(a.shape, dtype=np.float32)
+ m = np.int32(0)
+ M = np.int32(1)
+ act = self.clip(a, m, M, out=b)
+ ac = self.fastclip(a, m, M, out=b)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_with_out_simple(self):
+ # Test native double input with scalar min/max
+ a = self._generate_data(self.nr, self.nc)
+ m = -0.5
+ M = 0.6
+ ac = np.zeros(a.shape)
+ act = np.zeros(a.shape)
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_with_out_simple2(self):
+ # Test native int32 input with double min/max and int32 out
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.float64(0)
+ M = np.float64(2)
+ ac = np.zeros(a.shape, dtype=np.int32)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_with_out_simple_int32(self):
+ # Test native int32 input with int32 scalar min/max and int64 out
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.int32(-1)
+ M = np.int32(1)
+ ac = np.zeros(a.shape, dtype=np.int64)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_with_out_array_int32(self):
+ # Test native int32 input with double array min/max and int32 out
+ a = self._generate_int32_data(self.nr, self.nc)
+ m = np.zeros(a.shape, np.float64)
+ M = np.float64(1)
+ ac = np.zeros(a.shape, dtype=np.int32)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_with_out_array_outint32(self):
+ # Test native double input with scalar min/max and int out
+ a = self._generate_data(self.nr, self.nc)
+ m = -1.0
+ M = 2.0
+ ac = np.zeros(a.shape, dtype=np.int32)
+ act = ac.copy()
+ self.fastclip(a, m, M, ac)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
+
+ def test_clip_with_out_transposed(self):
+        # Test that the out argument works when transposed
+ a = np.arange(16).reshape(4, 4)
+ out = np.empty_like(a).T
+ a.clip(4, 10, out=out)
+ expected = self.clip(a, 4, 10)
+ assert_array_equal(out, expected)
+
+ def test_clip_with_out_memory_overlap(self):
+ # Test that the out argument works when it has memory overlap
+ a = np.arange(16).reshape(4, 4)
+ ac = a.copy()
+ a[:-1].clip(4, 10, out=a[1:])
+ expected = self.clip(ac[:-1], 4, 10)
+ assert_array_equal(a[1:], expected)
+
+ def test_clip_inplace_array(self):
+ # Test native double input with array min/max
+ a = self._generate_data(self.nr, self.nc)
+ ac = a.copy()
+ m = np.zeros(a.shape)
+ M = 1.0
+ self.fastclip(a, m, M, a)
+ self.clip(a, m, M, ac)
+ assert_array_strict_equal(a, ac)
+
+ def test_clip_inplace_simple(self):
+ # Test native double input with scalar min/max
+ a = self._generate_data(self.nr, self.nc)
+ ac = a.copy()
+ m = -0.5
+ M = 0.6
+ self.fastclip(a, m, M, a)
+ self.clip(a, m, M, ac)
+ assert_array_strict_equal(a, ac)
+
+ def test_clip_func_takes_out(self):
+ # Ensure that the clip() function takes an out=argument.
+ a = self._generate_data(self.nr, self.nc)
+ ac = a.copy()
+ m = -0.5
+ M = 0.6
+ a2 = np.clip(a, m, M, out=a)
+ self.clip(a, m, M, ac)
+ assert_array_strict_equal(a2, ac)
+ assert_(a2 is a)
+
+ def test_clip_nan(self):
+ d = np.arange(7.)
+ assert_equal(d.clip(min=np.nan), d)
+ assert_equal(d.clip(max=np.nan), d)
+ assert_equal(d.clip(min=np.nan, max=np.nan), d)
+ assert_equal(d.clip(min=-2, max=np.nan), d)
+ assert_equal(d.clip(min=np.nan, max=10), d)
+
+
+class TestAllclose(object):
+ rtol = 1e-5
+ atol = 1e-8
+
+ def setup(self):
+ self.olderr = np.seterr(invalid='ignore')
+
+ def teardown(self):
+ np.seterr(**self.olderr)
+
+ def tst_allclose(self, x, y):
+ assert_(np.allclose(x, y), "%s and %s not close" % (x, y))
+
+ def tst_not_allclose(self, x, y):
+ assert_(not np.allclose(x, y), "%s and %s shouldn't be close" % (x, y))
+
+ def test_ip_allclose(self):
+ # Parametric test factory.
+ arr = np.array([100, 1000])
+ aran = np.arange(125).reshape((5, 5, 5))
+
+ atol = self.atol
+ rtol = self.rtol
+
+ data = [([1, 0], [1, 0]),
+ ([atol], [0]),
+ ([1], [1+rtol+atol]),
+ (arr, arr + arr*rtol),
+ (arr, arr + arr*rtol + atol*2),
+ (aran, aran + aran*rtol),
+ (np.inf, np.inf),
+ (np.inf, [np.inf])]
+
+ for (x, y) in data:
+ self.tst_allclose(x, y)
+
+ def test_ip_not_allclose(self):
+ # Parametric test factory.
+ aran = np.arange(125).reshape((5, 5, 5))
+
+ atol = self.atol
+ rtol = self.rtol
+
+ data = [([np.inf, 0], [1, np.inf]),
+ ([np.inf, 0], [1, 0]),
+ ([np.inf, np.inf], [1, np.inf]),
+ ([np.inf, np.inf], [1, 0]),
+ ([-np.inf, 0], [np.inf, 0]),
+ ([np.nan, 0], [np.nan, 0]),
+ ([atol*2], [0]),
+ ([1], [1+rtol+atol*2]),
+ (aran, aran + aran*atol + atol*2),
+ (np.array([np.inf, 1]), np.array([0, np.inf]))]
+
+ for (x, y) in data:
+ self.tst_not_allclose(x, y)
+
+ def test_no_parameter_modification(self):
+ x = np.array([np.inf, 1])
+ y = np.array([0, np.inf])
+ np.allclose(x, y)
+ assert_array_equal(x, np.array([np.inf, 1]))
+ assert_array_equal(y, np.array([0, np.inf]))
+
+ def test_min_int(self):
+        # Could cause problems because abs(min_int) == min_int
+ min_int = np.iinfo(np.int_).min
+ a = np.array([min_int], dtype=np.int_)
+ assert_(np.allclose(a, a))
+
+ def test_equalnan(self):
+ x = np.array([1.0, np.nan])
+ assert_(np.allclose(x, x, equal_nan=True))
+
+ def test_return_class_is_ndarray(self):
+ # Issue gh-6475
+ # Check that allclose does not preserve subtypes
+ class Foo(np.ndarray):
+ def __new__(cls, *args, **kwargs):
+ return np.array(*args, **kwargs).view(cls)
+
+ a = Foo([1])
+ assert_(type(np.allclose(a, a)) is bool)
+
+
+class TestIsclose(object):
+ rtol = 1e-5
+ atol = 1e-8
+
+ def setup(self):
+ atol = self.atol
+ rtol = self.rtol
+ arr = np.array([100, 1000])
+ aran = np.arange(125).reshape((5, 5, 5))
+
+ self.all_close_tests = [
+ ([1, 0], [1, 0]),
+ ([atol], [0]),
+ ([1], [1 + rtol + atol]),
+ (arr, arr + arr*rtol),
+ (arr, arr + arr*rtol + atol),
+ (aran, aran + aran*rtol),
+ (np.inf, np.inf),
+ (np.inf, [np.inf]),
+ ([np.inf, -np.inf], [np.inf, -np.inf]),
+ ]
+ self.none_close_tests = [
+ ([np.inf, 0], [1, np.inf]),
+ ([np.inf, -np.inf], [1, 0]),
+ ([np.inf, np.inf], [1, -np.inf]),
+ ([np.inf, np.inf], [1, 0]),
+ ([np.nan, 0], [np.nan, -np.inf]),
+ ([atol*2], [0]),
+ ([1], [1 + rtol + atol*2]),
+ (aran, aran + rtol*1.1*aran + atol*1.1),
+ (np.array([np.inf, 1]), np.array([0, np.inf])),
+ ]
+ self.some_close_tests = [
+ ([np.inf, 0], [np.inf, atol*2]),
+ ([atol, 1, 1e6*(1 + 2*rtol) + atol], [0, np.nan, 1e6]),
+ (np.arange(3), [0, 1, 2.1]),
+ (np.nan, [np.nan, np.nan, np.nan]),
+ ([0], [atol, np.inf, -np.inf, np.nan]),
+ (0, [atol, np.inf, -np.inf, np.nan]),
+ ]
+ self.some_close_results = [
+ [True, False],
+ [True, False, False],
+ [True, True, False],
+ [False, False, False],
+ [True, False, False, False],
+ [True, False, False, False],
+ ]
+
+ def test_ip_isclose(self):
+ self.setup()
+ tests = self.some_close_tests
+ results = self.some_close_results
+ for (x, y), result in zip(tests, results):
+ assert_array_equal(np.isclose(x, y), result)
+
+ def tst_all_isclose(self, x, y):
+ assert_(np.all(np.isclose(x, y)), "%s and %s not close" % (x, y))
+
+ def tst_none_isclose(self, x, y):
+ msg = "%s and %s shouldn't be close"
+ assert_(not np.any(np.isclose(x, y)), msg % (x, y))
+
+ def tst_isclose_allclose(self, x, y):
+ msg = "isclose.all() and allclose aren't same for %s and %s"
+ msg2 = "isclose and allclose aren't same for %s and %s"
+ if np.isscalar(x) and np.isscalar(y):
+ assert_(np.isclose(x, y) == np.allclose(x, y), msg=msg2 % (x, y))
+ else:
+ assert_array_equal(np.isclose(x, y).all(), np.allclose(x, y), msg % (x, y))
+
+ def test_ip_all_isclose(self):
+ self.setup()
+ for (x, y) in self.all_close_tests:
+ self.tst_all_isclose(x, y)
+
+ def test_ip_none_isclose(self):
+ self.setup()
+ for (x, y) in self.none_close_tests:
+ self.tst_none_isclose(x, y)
+
+ def test_ip_isclose_allclose(self):
+ self.setup()
+ tests = (self.all_close_tests + self.none_close_tests +
+ self.some_close_tests)
+ for (x, y) in tests:
+ self.tst_isclose_allclose(x, y)
+
+ def test_equal_nan(self):
+ assert_array_equal(np.isclose(np.nan, np.nan, equal_nan=True), [True])
+ arr = np.array([1.0, np.nan])
+ assert_array_equal(np.isclose(arr, arr, equal_nan=True), [True, True])
+
+ def test_masked_arrays(self):
+ # Make sure to test the output type when arguments are interchanged.
+
+ x = np.ma.masked_where([True, True, False], np.arange(3))
+ assert_(type(x) is type(np.isclose(2, x)))
+ assert_(type(x) is type(np.isclose(x, 2)))
+
+ x = np.ma.masked_where([True, True, False], [np.nan, np.inf, np.nan])
+ assert_(type(x) is type(np.isclose(np.inf, x)))
+ assert_(type(x) is type(np.isclose(x, np.inf)))
+
+ x = np.ma.masked_where([True, True, False], [np.nan, np.nan, np.nan])
+ y = np.isclose(np.nan, x, equal_nan=True)
+ assert_(type(x) is type(y))
+ # Ensure that the mask isn't modified...
+ assert_array_equal([True, True, False], y.mask)
+ y = np.isclose(x, np.nan, equal_nan=True)
+ assert_(type(x) is type(y))
+ # Ensure that the mask isn't modified...
+ assert_array_equal([True, True, False], y.mask)
+
+ x = np.ma.masked_where([True, True, False], [np.nan, np.nan, np.nan])
+ y = np.isclose(x, x, equal_nan=True)
+ assert_(type(x) is type(y))
+ # Ensure that the mask isn't modified...
+ assert_array_equal([True, True, False], y.mask)
+
+ def test_scalar_return(self):
+ assert_(np.isscalar(np.isclose(1, 1)))
+
+ def test_no_parameter_modification(self):
+ x = np.array([np.inf, 1])
+ y = np.array([0, np.inf])
+ np.isclose(x, y)
+ assert_array_equal(x, np.array([np.inf, 1]))
+ assert_array_equal(y, np.array([0, np.inf]))
+
+ def test_non_finite_scalar(self):
+ # GH7014, when two scalars are compared the output should also be a
+ # scalar
+ assert_(np.isclose(np.inf, -np.inf) is np.False_)
+ assert_(np.isclose(0, np.inf) is np.False_)
+ assert_(type(np.isclose(0, np.inf)) is np.bool_)
+
+
+class TestStdVar(object):
+ def setup(self):
+ self.A = np.array([1, -1, 1, -1])
+ self.real_var = 1
+
+ def test_basic(self):
+ assert_almost_equal(np.var(self.A), self.real_var)
+ assert_almost_equal(np.std(self.A)**2, self.real_var)
+
+ def test_scalars(self):
+ assert_equal(np.var(1), 0)
+ assert_equal(np.std(1), 0)
+
+ def test_ddof1(self):
+ assert_almost_equal(np.var(self.A, ddof=1),
+ self.real_var*len(self.A)/float(len(self.A)-1))
+ assert_almost_equal(np.std(self.A, ddof=1)**2,
+ self.real_var*len(self.A)/float(len(self.A)-1))
+
+ def test_ddof2(self):
+ assert_almost_equal(np.var(self.A, ddof=2),
+ self.real_var*len(self.A)/float(len(self.A)-2))
+ assert_almost_equal(np.std(self.A, ddof=2)**2,
+ self.real_var*len(self.A)/float(len(self.A)-2))
+
+ def test_out_scalar(self):
+ d = np.arange(10)
+ out = np.array(0.)
+ r = np.std(d, out=out)
+ assert_(r is out)
+ assert_array_equal(r, out)
+ r = np.var(d, out=out)
+ assert_(r is out)
+ assert_array_equal(r, out)
+ r = np.mean(d, out=out)
+ assert_(r is out)
+ assert_array_equal(r, out)
+
+
+class TestStdVarComplex(object):
+ def test_basic(self):
+ A = np.array([1, 1.j, -1, -1.j])
+ real_var = 1
+ assert_almost_equal(np.var(A), real_var)
+ assert_almost_equal(np.std(A)**2, real_var)
+
+ def test_scalars(self):
+ assert_equal(np.var(1j), 0)
+ assert_equal(np.std(1j), 0)
+
+
+class TestCreationFuncs(object):
+ # Test ones, zeros, empty and full.
+
+ def setup(self):
+ dtypes = {np.dtype(tp) for tp in itertools.chain(*np.sctypes.values())}
+ # void, bytes, str
+ variable_sized = {tp for tp in dtypes if tp.str.endswith('0')}
+ self.dtypes = sorted(dtypes - variable_sized |
+ {np.dtype(tp.str.replace("0", str(i)))
+ for tp in variable_sized for i in range(1, 10)},
+ key=lambda dtype: dtype.str)
+ self.orders = {'C': 'c_contiguous', 'F': 'f_contiguous'}
+ self.ndims = 10
+
+ def check_function(self, func, fill_value=None):
+ par = ((0, 1, 2),
+ range(self.ndims),
+ self.orders,
+ self.dtypes)
+ fill_kwarg = {}
+ if fill_value is not None:
+ fill_kwarg = {'fill_value': fill_value}
+
+ for size, ndims, order, dtype in itertools.product(*par):
+ shape = ndims * [size]
+
+ # do not fill void type
+ if fill_kwarg and dtype.str.startswith('|V'):
+ continue
+
+ arr = func(shape, order=order, dtype=dtype,
+ **fill_kwarg)
+
+ assert_equal(arr.dtype, dtype)
+ assert_(getattr(arr.flags, self.orders[order]))
+
+ if fill_value is not None:
+ if dtype.str.startswith('|S'):
+ val = str(fill_value)
+ else:
+ val = fill_value
+ assert_equal(arr, dtype.type(val))
+
+ def test_zeros(self):
+ self.check_function(np.zeros)
+
+ def test_ones(self):
+        self.check_function(np.ones)
+
+ def test_empty(self):
+ self.check_function(np.empty)
+
+ def test_full(self):
+ self.check_function(np.full, 0)
+ self.check_function(np.full, 1)
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_for_reference_leak(self):
+ # Make sure we have an object for reference
+ dim = 1
+ beg = sys.getrefcount(dim)
+ np.zeros([dim]*10)
+ assert_(sys.getrefcount(dim) == beg)
+ np.ones([dim]*10)
+ assert_(sys.getrefcount(dim) == beg)
+ np.empty([dim]*10)
+ assert_(sys.getrefcount(dim) == beg)
+ np.full([dim]*10, 0)
+ assert_(sys.getrefcount(dim) == beg)
+
+
+class TestLikeFuncs(object):
+ '''Test ones_like, zeros_like, empty_like and full_like'''
+
+ def setup(self):
+ self.data = [
+ # Array scalars
+ (np.array(3.), None),
+ (np.array(3), 'f8'),
+ # 1D arrays
+ (np.arange(6, dtype='f4'), None),
+ (np.arange(6), 'c16'),
+ # 2D C-layout arrays
+ (np.arange(6).reshape(2, 3), None),
+ (np.arange(6).reshape(3, 2), 'i1'),
+ # 2D F-layout arrays
+ (np.arange(6).reshape((2, 3), order='F'), None),
+ (np.arange(6).reshape((3, 2), order='F'), 'i1'),
+ # 3D C-layout arrays
+ (np.arange(24).reshape(2, 3, 4), None),
+ (np.arange(24).reshape(4, 3, 2), 'f4'),
+ # 3D F-layout arrays
+ (np.arange(24).reshape((2, 3, 4), order='F'), None),
+ (np.arange(24).reshape((4, 3, 2), order='F'), 'f4'),
+ # 3D non-C/F-layout arrays
+ (np.arange(24).reshape(2, 3, 4).swapaxes(0, 1), None),
+ (np.arange(24).reshape(4, 3, 2).swapaxes(0, 1), '?'),
+ ]
+
+ def compare_array_value(self, dz, value, fill_value):
+ if value is not None:
+ if fill_value:
+ try:
+ z = dz.dtype.type(value)
+ except OverflowError:
+ pass
+ else:
+ assert_(np.all(dz == z))
+ else:
+ assert_(np.all(dz == value))
+
+ def check_like_function(self, like_function, value, fill_value=False):
+ if fill_value:
+ fill_kwarg = {'fill_value': value}
+ else:
+ fill_kwarg = {}
+ for d, dtype in self.data:
+ # default (K) order, dtype
+ dz = like_function(d, dtype=dtype, **fill_kwarg)
+ assert_equal(dz.shape, d.shape)
+ assert_equal(np.array(dz.strides)*d.dtype.itemsize,
+ np.array(d.strides)*dz.dtype.itemsize)
+ assert_equal(d.flags.c_contiguous, dz.flags.c_contiguous)
+ assert_equal(d.flags.f_contiguous, dz.flags.f_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ self.compare_array_value(dz, value, fill_value)
+
+ # C order, default dtype
+ dz = like_function(d, order='C', dtype=dtype, **fill_kwarg)
+ assert_equal(dz.shape, d.shape)
+ assert_(dz.flags.c_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ self.compare_array_value(dz, value, fill_value)
+
+ # F order, default dtype
+ dz = like_function(d, order='F', dtype=dtype, **fill_kwarg)
+ assert_equal(dz.shape, d.shape)
+ assert_(dz.flags.f_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ self.compare_array_value(dz, value, fill_value)
+
+ # A order
+ dz = like_function(d, order='A', dtype=dtype, **fill_kwarg)
+ assert_equal(dz.shape, d.shape)
+ if d.flags.f_contiguous:
+ assert_(dz.flags.f_contiguous)
+ else:
+ assert_(dz.flags.c_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ self.compare_array_value(dz, value, fill_value)
+
+ # Test the 'subok' parameter
+ class MyNDArray(np.ndarray):
+ pass
+
+ a = np.array([[1, 2], [3, 4]]).view(MyNDArray)
+
+ b = like_function(a, **fill_kwarg)
+ assert_(type(b) is MyNDArray)
+
+ b = like_function(a, subok=False, **fill_kwarg)
+ assert_(type(b) is not MyNDArray)
+
+ def test_ones_like(self):
+ self.check_like_function(np.ones_like, 1)
+
+ def test_zeros_like(self):
+ self.check_like_function(np.zeros_like, 0)
+
+ def test_empty_like(self):
+ self.check_like_function(np.empty_like, None)
+
+ def test_filled_like(self):
+ self.check_like_function(np.full_like, 0, True)
+ self.check_like_function(np.full_like, 1, True)
+ self.check_like_function(np.full_like, 1000, True)
+ self.check_like_function(np.full_like, 123.456, True)
+ self.check_like_function(np.full_like, np.inf, True)
+
+
+class TestCorrelate(object):
+ def _setup(self, dt):
+ self.x = np.array([1, 2, 3, 4, 5], dtype=dt)
+ self.xs = np.arange(1, 20)[::3]
+ self.y = np.array([-1, -2, -3], dtype=dt)
+ self.z1 = np.array([ -3., -8., -14., -20., -26., -14., -5.], dtype=dt)
+ self.z1_4 = np.array([-2., -5., -8., -11., -14., -5.], dtype=dt)
+ self.z1r = np.array([-15., -22., -22., -16., -10., -4., -1.], dtype=dt)
+ self.z2 = np.array([-5., -14., -26., -20., -14., -8., -3.], dtype=dt)
+ self.z2r = np.array([-1., -4., -10., -16., -22., -22., -15.], dtype=dt)
+ self.zs = np.array([-3., -14., -30., -48., -66., -84.,
+ -102., -54., -19.], dtype=dt)
+
+ def test_float(self):
+ self._setup(float)
+ z = np.correlate(self.x, self.y, 'full')
+ assert_array_almost_equal(z, self.z1)
+ z = np.correlate(self.x, self.y[:-1], 'full')
+ assert_array_almost_equal(z, self.z1_4)
+ z = np.correlate(self.y, self.x, 'full')
+ assert_array_almost_equal(z, self.z2)
+ z = np.correlate(self.x[::-1], self.y, 'full')
+ assert_array_almost_equal(z, self.z1r)
+ z = np.correlate(self.y, self.x[::-1], 'full')
+ assert_array_almost_equal(z, self.z2r)
+ z = np.correlate(self.xs, self.y, 'full')
+ assert_array_almost_equal(z, self.zs)
+
+ def test_object(self):
+ self._setup(Decimal)
+ z = np.correlate(self.x, self.y, 'full')
+ assert_array_almost_equal(z, self.z1)
+ z = np.correlate(self.y, self.x, 'full')
+ assert_array_almost_equal(z, self.z2)
+
+ def test_no_overwrite(self):
+ d = np.ones(100)
+ k = np.ones(3)
+ np.correlate(d, k)
+ assert_array_equal(d, np.ones(100))
+ assert_array_equal(k, np.ones(3))
+
+ def test_complex(self):
+ x = np.array([1, 2, 3, 4+1j], dtype=complex)
+ y = np.array([-1, -2j, 3+1j], dtype=complex)
+ r_z = np.array([3-1j, 6, 8+1j, 11+5j, -5+8j, -4-1j], dtype=complex)
+ r_z = r_z[::-1].conjugate()
+ z = np.correlate(y, x, mode='full')
+ assert_array_almost_equal(z, r_z)
+
+
+class TestConvolve(object):
+ def test_object(self):
+ d = [1.] * 100
+ k = [1.] * 3
+ assert_array_almost_equal(np.convolve(d, k)[2:-2], np.full(98, 3))
+
+ def test_no_overwrite(self):
+ d = np.ones(100)
+ k = np.ones(3)
+ np.convolve(d, k)
+ assert_array_equal(d, np.ones(100))
+ assert_array_equal(k, np.ones(3))
+
+
+class TestArgwhere(object):
+ def test_2D(self):
+ x = np.arange(6).reshape((2, 3))
+ assert_array_equal(np.argwhere(x > 1),
+ [[0, 2],
+ [1, 0],
+ [1, 1],
+ [1, 2]])
+
+ def test_list(self):
+ assert_equal(np.argwhere([4, 0, 2, 1, 3]), [[0], [2], [3], [4]])
+
+
+class TestStringFunction(object):
+
+ def test_set_string_function(self):
+ a = np.array([1])
+ np.set_string_function(lambda x: "FOO", repr=True)
+ assert_equal(repr(a), "FOO")
+ np.set_string_function(None, repr=True)
+ assert_equal(repr(a), "array([1])")
+
+ np.set_string_function(lambda x: "FOO", repr=False)
+ assert_equal(str(a), "FOO")
+ np.set_string_function(None, repr=False)
+ assert_equal(str(a), "[1]")
+
+
+class TestRoll(object):
+ def test_roll1d(self):
+ x = np.arange(10)
+ xr = np.roll(x, 2)
+ assert_equal(xr, np.array([8, 9, 0, 1, 2, 3, 4, 5, 6, 7]))
+
+ def test_roll2d(self):
+ x2 = np.reshape(np.arange(10), (2, 5))
+ x2r = np.roll(x2, 1)
+ assert_equal(x2r, np.array([[9, 0, 1, 2, 3], [4, 5, 6, 7, 8]]))
+
+ x2r = np.roll(x2, 1, axis=0)
+ assert_equal(x2r, np.array([[5, 6, 7, 8, 9], [0, 1, 2, 3, 4]]))
+
+ x2r = np.roll(x2, 1, axis=1)
+ assert_equal(x2r, np.array([[4, 0, 1, 2, 3], [9, 5, 6, 7, 8]]))
+
+ # Roll multiple axes at once.
+ x2r = np.roll(x2, 1, axis=(0, 1))
+ assert_equal(x2r, np.array([[9, 5, 6, 7, 8], [4, 0, 1, 2, 3]]))
+
+ x2r = np.roll(x2, (1, 0), axis=(0, 1))
+ assert_equal(x2r, np.array([[5, 6, 7, 8, 9], [0, 1, 2, 3, 4]]))
+
+ x2r = np.roll(x2, (-1, 0), axis=(0, 1))
+ assert_equal(x2r, np.array([[5, 6, 7, 8, 9], [0, 1, 2, 3, 4]]))
+
+ x2r = np.roll(x2, (0, 1), axis=(0, 1))
+ assert_equal(x2r, np.array([[4, 0, 1, 2, 3], [9, 5, 6, 7, 8]]))
+
+ x2r = np.roll(x2, (0, -1), axis=(0, 1))
+ assert_equal(x2r, np.array([[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]))
+
+ x2r = np.roll(x2, (1, 1), axis=(0, 1))
+ assert_equal(x2r, np.array([[9, 5, 6, 7, 8], [4, 0, 1, 2, 3]]))
+
+ x2r = np.roll(x2, (-1, -1), axis=(0, 1))
+ assert_equal(x2r, np.array([[6, 7, 8, 9, 5], [1, 2, 3, 4, 0]]))
+
+ # Roll the same axis multiple times.
+ x2r = np.roll(x2, 1, axis=(0, 0))
+ assert_equal(x2r, np.array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]))
+
+ x2r = np.roll(x2, 1, axis=(1, 1))
+ assert_equal(x2r, np.array([[3, 4, 0, 1, 2], [8, 9, 5, 6, 7]]))
+
+ # Roll more than one turn in either direction.
+ x2r = np.roll(x2, 6, axis=1)
+ assert_equal(x2r, np.array([[4, 0, 1, 2, 3], [9, 5, 6, 7, 8]]))
+
+ x2r = np.roll(x2, -4, axis=1)
+ assert_equal(x2r, np.array([[4, 0, 1, 2, 3], [9, 5, 6, 7, 8]]))
+
+ def test_roll_empty(self):
+ x = np.array([])
+ assert_equal(np.roll(x, 1), np.array([]))
+
+
+class TestRollaxis(object):
+
+ # expected shape indexed by (axis, start) for array of
+ # shape (1, 2, 3, 4)
+ tgtshape = {(0, 0): (1, 2, 3, 4), (0, 1): (1, 2, 3, 4),
+ (0, 2): (2, 1, 3, 4), (0, 3): (2, 3, 1, 4),
+ (0, 4): (2, 3, 4, 1),
+ (1, 0): (2, 1, 3, 4), (1, 1): (1, 2, 3, 4),
+ (1, 2): (1, 2, 3, 4), (1, 3): (1, 3, 2, 4),
+ (1, 4): (1, 3, 4, 2),
+ (2, 0): (3, 1, 2, 4), (2, 1): (1, 3, 2, 4),
+ (2, 2): (1, 2, 3, 4), (2, 3): (1, 2, 3, 4),
+ (2, 4): (1, 2, 4, 3),
+ (3, 0): (4, 1, 2, 3), (3, 1): (1, 4, 2, 3),
+ (3, 2): (1, 2, 4, 3), (3, 3): (1, 2, 3, 4),
+ (3, 4): (1, 2, 3, 4)}
+
+ def test_exceptions(self):
+ a = np.arange(1*2*3*4).reshape(1, 2, 3, 4)
+ assert_raises(np.AxisError, np.rollaxis, a, -5, 0)
+ assert_raises(np.AxisError, np.rollaxis, a, 0, -5)
+ assert_raises(np.AxisError, np.rollaxis, a, 4, 0)
+ assert_raises(np.AxisError, np.rollaxis, a, 0, 5)
+
+ def test_results(self):
+ a = np.arange(1*2*3*4).reshape(1, 2, 3, 4).copy()
+ aind = np.indices(a.shape)
+ assert_(a.flags['OWNDATA'])
+ for (i, j) in self.tgtshape:
+ # positive axis, positive start
+ res = np.rollaxis(a, axis=i, start=j)
+ i0, i1, i2, i3 = aind[np.array(res.shape) - 1]
+ assert_(np.all(res[i0, i1, i2, i3] == a))
+ assert_(res.shape == self.tgtshape[(i, j)], str((i,j)))
+ assert_(not res.flags['OWNDATA'])
+
+ # negative axis, positive start
+ ip = i + 1
+ res = np.rollaxis(a, axis=-ip, start=j)
+ i0, i1, i2, i3 = aind[np.array(res.shape) - 1]
+ assert_(np.all(res[i0, i1, i2, i3] == a))
+ assert_(res.shape == self.tgtshape[(4 - ip, j)])
+ assert_(not res.flags['OWNDATA'])
+
+ # positive axis, negative start
+ jp = j + 1 if j < 4 else j
+ res = np.rollaxis(a, axis=i, start=-jp)
+ i0, i1, i2, i3 = aind[np.array(res.shape) - 1]
+ assert_(np.all(res[i0, i1, i2, i3] == a))
+ assert_(res.shape == self.tgtshape[(i, 4 - jp)])
+ assert_(not res.flags['OWNDATA'])
+
+ # negative axis, negative start
+ ip = i + 1
+ jp = j + 1 if j < 4 else j
+ res = np.rollaxis(a, axis=-ip, start=-jp)
+ i0, i1, i2, i3 = aind[np.array(res.shape) - 1]
+ assert_(np.all(res[i0, i1, i2, i3] == a))
+ assert_(res.shape == self.tgtshape[(4 - ip, 4 - jp)])
+ assert_(not res.flags['OWNDATA'])
+
+
+class TestMoveaxis(object):
+ def test_move_to_end(self):
+ x = np.random.randn(5, 6, 7)
+ for source, expected in [(0, (6, 7, 5)),
+ (1, (5, 7, 6)),
+ (2, (5, 6, 7)),
+ (-1, (5, 6, 7))]:
+ actual = np.moveaxis(x, source, -1).shape
+            assert_equal(actual, expected)
+
+ def test_move_new_position(self):
+ x = np.random.randn(1, 2, 3, 4)
+ for source, destination, expected in [
+ (0, 1, (2, 1, 3, 4)),
+ (1, 2, (1, 3, 2, 4)),
+ (1, -1, (1, 3, 4, 2)),
+ ]:
+ actual = np.moveaxis(x, source, destination).shape
+            assert_equal(actual, expected)
+
+ def test_preserve_order(self):
+ x = np.zeros((1, 2, 3, 4))
+ for source, destination in [
+ (0, 0),
+ (3, -1),
+ (-1, 3),
+ ([0, -1], [0, -1]),
+ ([2, 0], [2, 0]),
+ (range(4), range(4)),
+ ]:
+ actual = np.moveaxis(x, source, destination).shape
+            assert_equal(actual, (1, 2, 3, 4))
+
+ def test_move_multiples(self):
+ x = np.zeros((0, 1, 2, 3))
+ for source, destination, expected in [
+ ([0, 1], [2, 3], (2, 3, 0, 1)),
+ ([2, 3], [0, 1], (2, 3, 0, 1)),
+ ([0, 1, 2], [2, 3, 0], (2, 3, 0, 1)),
+ ([3, 0], [1, 0], (0, 3, 1, 2)),
+ ([0, 3], [0, 1], (0, 3, 1, 2)),
+ ]:
+ actual = np.moveaxis(x, source, destination).shape
+            assert_equal(actual, expected)
+
+ def test_errors(self):
+ x = np.random.randn(1, 2, 3)
+ assert_raises_regex(np.AxisError, 'source.*out of bounds',
+ np.moveaxis, x, 3, 0)
+ assert_raises_regex(np.AxisError, 'source.*out of bounds',
+ np.moveaxis, x, -4, 0)
+ assert_raises_regex(np.AxisError, 'destination.*out of bounds',
+ np.moveaxis, x, 0, 5)
+ assert_raises_regex(ValueError, 'repeated axis in `source`',
+ np.moveaxis, x, [0, 0], [0, 1])
+ assert_raises_regex(ValueError, 'repeated axis in `destination`',
+ np.moveaxis, x, [0, 1], [1, 1])
+ assert_raises_regex(ValueError, 'must have the same number',
+ np.moveaxis, x, 0, [0, 1])
+ assert_raises_regex(ValueError, 'must have the same number',
+ np.moveaxis, x, [0, 1], [0])
+
+ def test_array_likes(self):
+ x = np.ma.zeros((1, 2, 3))
+ result = np.moveaxis(x, 0, 0)
+        assert_equal(x.shape, result.shape)
+ assert_(isinstance(result, np.ma.MaskedArray))
+
+ x = [1, 2, 3]
+ result = np.moveaxis(x, 0, 0)
+        assert_equal(x, list(result))
+ assert_(isinstance(result, np.ndarray))
+
+
+class TestCross(object):
+ def test_2x2(self):
+ u = [1, 2]
+ v = [3, 4]
+ z = -2
+ cp = np.cross(u, v)
+ assert_equal(cp, z)
+ cp = np.cross(v, u)
+ assert_equal(cp, -z)
+
+ def test_2x3(self):
+ u = [1, 2]
+ v = [3, 4, 5]
+ z = np.array([10, -5, -2])
+ cp = np.cross(u, v)
+ assert_equal(cp, z)
+ cp = np.cross(v, u)
+ assert_equal(cp, -z)
+
+ def test_3x3(self):
+ u = [1, 2, 3]
+ v = [4, 5, 6]
+ z = np.array([-3, 6, -3])
+ cp = np.cross(u, v)
+ assert_equal(cp, z)
+ cp = np.cross(v, u)
+ assert_equal(cp, -z)
+
+ def test_broadcasting(self):
+ # Ticket #2624 (Trac #2032)
+ u = np.tile([1, 2], (11, 1))
+ v = np.tile([3, 4], (11, 1))
+ z = -2
+ assert_equal(np.cross(u, v), z)
+ assert_equal(np.cross(v, u), -z)
+ assert_equal(np.cross(u, u), 0)
+
+ u = np.tile([1, 2], (11, 1)).T
+ v = np.tile([3, 4, 5], (11, 1))
+ z = np.tile([10, -5, -2], (11, 1))
+ assert_equal(np.cross(u, v, axisa=0), z)
+ assert_equal(np.cross(v, u.T), -z)
+ assert_equal(np.cross(v, v), 0)
+
+ u = np.tile([1, 2, 3], (11, 1)).T
+ v = np.tile([3, 4], (11, 1)).T
+ z = np.tile([-12, 9, -2], (11, 1))
+ assert_equal(np.cross(u, v, axisa=0, axisb=0), z)
+ assert_equal(np.cross(v.T, u.T), -z)
+ assert_equal(np.cross(u.T, u.T), 0)
+
+ u = np.tile([1, 2, 3], (5, 1))
+ v = np.tile([4, 5, 6], (5, 1)).T
+ z = np.tile([-3, 6, -3], (5, 1))
+ assert_equal(np.cross(u, v, axisb=0), z)
+ assert_equal(np.cross(v.T, u), -z)
+ assert_equal(np.cross(u, u), 0)
+
+ def test_broadcasting_shapes(self):
+ u = np.ones((2, 1, 3))
+ v = np.ones((5, 3))
+ assert_equal(np.cross(u, v).shape, (2, 5, 3))
+ u = np.ones((10, 3, 5))
+ v = np.ones((2, 5))
+ assert_equal(np.cross(u, v, axisa=1, axisb=0).shape, (10, 5, 3))
+ assert_raises(np.AxisError, np.cross, u, v, axisa=1, axisb=2)
+ assert_raises(np.AxisError, np.cross, u, v, axisa=3, axisb=0)
+ u = np.ones((10, 3, 5, 7))
+ v = np.ones((5, 7, 2))
+ assert_equal(np.cross(u, v, axisa=1, axisc=2).shape, (10, 5, 3, 7))
+ assert_raises(np.AxisError, np.cross, u, v, axisa=-5, axisb=2)
+ assert_raises(np.AxisError, np.cross, u, v, axisa=1, axisb=-4)
+ # gh-5885
+ u = np.ones((3, 4, 2))
+ for axisc in range(-2, 2):
+ assert_equal(np.cross(u, u, axisc=axisc).shape, (3, 4))
+
+
+def test_outer_out_param():
+ arr1 = np.ones((5,))
+ arr2 = np.ones((2,))
+ arr3 = np.linspace(-2, 2, 5)
+ out1 = np.ndarray(shape=(5,5))
+ out2 = np.ndarray(shape=(2, 5))
+ res1 = np.outer(arr1, arr3, out1)
+ assert_equal(res1, out1)
+ assert_equal(np.outer(arr2, arr3, out2), out2)
+
+
+class TestRequire(object):
+ flag_names = ['C', 'C_CONTIGUOUS', 'CONTIGUOUS',
+ 'F', 'F_CONTIGUOUS', 'FORTRAN',
+ 'A', 'ALIGNED',
+ 'W', 'WRITEABLE',
+ 'O', 'OWNDATA']
+
+ def generate_all_false(self, dtype):
+ arr = np.zeros((2, 2), [('junk', 'i1'), ('a', dtype)])
+ arr.setflags(write=False)
+ a = arr['a']
+ assert_(not a.flags['C'])
+ assert_(not a.flags['F'])
+ assert_(not a.flags['O'])
+ assert_(not a.flags['W'])
+ assert_(not a.flags['A'])
+ return a
+
+ def set_and_check_flag(self, flag, dtype, arr):
+ if dtype is None:
+ dtype = arr.dtype
+ b = np.require(arr, dtype, [flag])
+ assert_(b.flags[flag])
+ assert_(b.dtype == dtype)
+
+ # a further call to np.require ought to return the same array
+ # unless OWNDATA is specified.
+ c = np.require(b, None, [flag])
+ if flag[0] != 'O':
+ assert_(c is b)
+ else:
+ assert_(c.flags[flag])
+
+ def test_require_each(self):
+
+ id = ['f8', 'i4']
+ fd = [None, 'f8', 'c16']
+ for idtype, fdtype, flag in itertools.product(id, fd, self.flag_names):
+ a = self.generate_all_false(idtype)
+ self.set_and_check_flag(flag, fdtype, a)
+
+ def test_unknown_requirement(self):
+ a = self.generate_all_false('f8')
+ assert_raises(KeyError, np.require, a, None, 'Q')
+
+ def test_non_array_input(self):
+ a = np.require([1, 2, 3, 4], 'i4', ['C', 'A', 'O'])
+ assert_(a.flags['O'])
+ assert_(a.flags['C'])
+ assert_(a.flags['A'])
+ assert_(a.dtype == 'i4')
+ assert_equal(a, [1, 2, 3, 4])
+
+ def test_C_and_F_simul(self):
+ a = self.generate_all_false('f8')
+ assert_raises(ValueError, np.require, a, None, ['C', 'F'])
+
+ def test_ensure_array(self):
+ class ArraySubclass(np.ndarray):
+ pass
+
+ a = ArraySubclass((2, 2))
+ b = np.require(a, None, ['E'])
+ assert_(type(b) is np.ndarray)
+
+ def test_preserve_subtype(self):
+ class ArraySubclass(np.ndarray):
+ pass
+
+ for flag in self.flag_names:
+ a = ArraySubclass((2, 2))
+ self.set_and_check_flag(flag, None, a)
+
+
+class TestBroadcast(object):
+ def test_broadcast_in_args(self):
+ # gh-5881
+ arrs = [np.empty((6, 7)), np.empty((5, 6, 1)), np.empty((7,)),
+ np.empty((5, 1, 7))]
+ mits = [np.broadcast(*arrs),
+ np.broadcast(np.broadcast(*arrs[:2]), np.broadcast(*arrs[2:])),
+ np.broadcast(arrs[0], np.broadcast(*arrs[1:-1]), arrs[-1])]
+ for mit in mits:
+ assert_equal(mit.shape, (5, 6, 7))
+ assert_equal(mit.ndim, 3)
+ assert_equal(mit.nd, 3)
+ assert_equal(mit.numiter, 4)
+ for a, ia in zip(arrs, mit.iters):
+ assert_(a is ia.base)
+
+ def test_broadcast_single_arg(self):
+ # gh-6899
+ arrs = [np.empty((5, 6, 7))]
+ mit = np.broadcast(*arrs)
+ assert_equal(mit.shape, (5, 6, 7))
+ assert_equal(mit.ndim, 3)
+ assert_equal(mit.nd, 3)
+ assert_equal(mit.numiter, 1)
+ assert_(arrs[0] is mit.iters[0].base)
+
+ def test_number_of_arguments(self):
+ arr = np.empty((5,))
+ for j in range(35):
+ arrs = [arr] * j
+ if j < 1 or j > 32:
+ assert_raises(ValueError, np.broadcast, *arrs)
+ else:
+ mit = np.broadcast(*arrs)
+ assert_equal(mit.numiter, j)
+
+ def test_broadcast_error_kwargs(self):
+        # gh-13455
+ arrs = [np.empty((5, 6, 7))]
+ mit = np.broadcast(*arrs)
+ mit2 = np.broadcast(*arrs, **{})
+ assert_equal(mit.shape, mit2.shape)
+ assert_equal(mit.ndim, mit2.ndim)
+ assert_equal(mit.nd, mit2.nd)
+ assert_equal(mit.numiter, mit2.numiter)
+ assert_(mit.iters[0].base is mit2.iters[0].base)
+
+ assert_raises(ValueError, np.broadcast, 1, **{'x': 1})
+
+class TestKeepdims(object):
+
+ class sub_array(np.ndarray):
+ def sum(self, axis=None, dtype=None, out=None):
+ return np.ndarray.sum(self, axis, dtype, out, keepdims=True)
+
+ def test_raise(self):
+ sub_class = self.sub_array
+ x = np.arange(30).view(sub_class)
+ assert_raises(TypeError, np.sum, x, keepdims=True)
+
+
+class TestTensordot(object):
+
+ def test_zero_dimension(self):
+ # Test resolution to issue #5663
+ a = np.ndarray((3,0))
+ b = np.ndarray((0,4))
+ td = np.tensordot(a, b, (1, 0))
+ assert_array_equal(td, np.dot(a, b))
+ assert_array_equal(td, np.einsum('ij,jk', a, b))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_numerictypes.py b/contrib/python/numpy/py2/numpy/core/tests/test_numerictypes.py
new file mode 100644
index 00000000000..d0ff5578a0a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_numerictypes.py
@@ -0,0 +1,500 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import itertools
+
+import pytest
+import numpy as np
+from numpy.testing import assert_, assert_equal, assert_raises, IS_PYPY
+
+# This is the structure of the table used for plain objects:
+#
+# +-+-+-+
+# |x|y|z|
+# +-+-+-+
+
+# Structure of a plain array description:
+Pdescr = [
+ ('x', 'i4', (2,)),
+ ('y', 'f8', (2, 2)),
+ ('z', 'u1')]
+
+# A plain list of tuples with values for testing:
+PbufferT = [
+ # x y z
+ ([3, 2], [[6., 4.], [6., 4.]], 8),
+ ([4, 3], [[7., 5.], [7., 5.]], 9),
+ ]
+
+
+# This is the structure of the table used for nested objects (DON'T PANIC!):
+#
+# +-+---------------------------------+-----+----------+-+-+
+# |x|Info |color|info |y|z|
+# | +-----+--+----------------+----+--+ +----+-----+ | |
+# | |value|y2|Info2 |name|z2| |Name|Value| | |
+# | | | +----+-----+--+--+ | | | | | | |
+# | | | |name|value|y3|z3| | | | | | | |
+# +-+-----+--+----+-----+--+--+----+--+-----+----+-----+-+-+
+#
+
+# The corresponding nested array description:
+Ndescr = [
+ ('x', 'i4', (2,)),
+ ('Info', [
+ ('value', 'c16'),
+ ('y2', 'f8'),
+ ('Info2', [
+ ('name', 'S2'),
+ ('value', 'c16', (2,)),
+ ('y3', 'f8', (2,)),
+ ('z3', 'u4', (2,))]),
+ ('name', 'S2'),
+ ('z2', 'b1')]),
+ ('color', 'S2'),
+ ('info', [
+ ('Name', 'U8'),
+ ('Value', 'c16')]),
+ ('y', 'f8', (2, 2)),
+ ('z', 'u1')]
+
+NbufferT = [
+ # x Info color info y z
+ # value y2 Info2 name z2 Name Value
+ # name value y3 z3
+ ([3, 2], (6j, 6., (b'nn', [6j, 4j], [6., 4.], [1, 2]), b'NN', True), b'cc', (u'NN', 6j), [[6., 4.], [6., 4.]], 8),
+ ([4, 3], (7j, 7., (b'oo', [7j, 5j], [7., 5.], [2, 1]), b'OO', False), b'dd', (u'OO', 7j), [[7., 5.], [7., 5.]], 9),
+ ]
+
+
+byteorder = {'little':'<', 'big':'>'}[sys.byteorder]
+
+def normalize_descr(descr):
+ "Normalize a description adding the platform byteorder."
+
+ out = []
+ for item in descr:
+ dtype = item[1]
+ if isinstance(dtype, str):
+ if dtype[0] not in ['|', '<', '>']:
+ onebyte = dtype[1:] == "1"
+ if onebyte or dtype[0] in ['S', 'V', 'b']:
+ dtype = "|" + dtype
+ else:
+ dtype = byteorder + dtype
+ if len(item) > 2 and np.prod(item[2]) > 1:
+ nitem = (item[0], dtype, item[2])
+ else:
+ nitem = (item[0], dtype)
+ out.append(nitem)
+ elif isinstance(dtype, list):
+ l = normalize_descr(dtype)
+ out.append((item[0], l))
+ else:
+ raise ValueError("Expected a str or list and got %s" %
+ (type(item)))
+ return out
+
+
+############################################################
+# Creation tests
+############################################################
+
+class CreateZeros(object):
+ """Check the creation of heterogeneous arrays zero-valued"""
+
+ def test_zeros0D(self):
+ """Check creation of 0-dimensional objects"""
+ h = np.zeros((), dtype=self._descr)
+ assert_(normalize_descr(self._descr) == h.dtype.descr)
+ assert_(h.dtype.fields['x'][0].name[:4] == 'void')
+ assert_(h.dtype.fields['x'][0].char == 'V')
+ assert_(h.dtype.fields['x'][0].type == np.void)
+ # A small check that data is ok
+ assert_equal(h['z'], np.zeros((), dtype='u1'))
+
+ def test_zerosSD(self):
+ """Check creation of single-dimensional objects"""
+ h = np.zeros((2,), dtype=self._descr)
+ assert_(normalize_descr(self._descr) == h.dtype.descr)
+ assert_(h.dtype['y'].name[:4] == 'void')
+ assert_(h.dtype['y'].char == 'V')
+ assert_(h.dtype['y'].type == np.void)
+ # A small check that data is ok
+ assert_equal(h['z'], np.zeros((2,), dtype='u1'))
+
+ def test_zerosMD(self):
+ """Check creation of multi-dimensional objects"""
+ h = np.zeros((2, 3), dtype=self._descr)
+ assert_(normalize_descr(self._descr) == h.dtype.descr)
+ assert_(h.dtype['z'].name == 'uint8')
+ assert_(h.dtype['z'].char == 'B')
+ assert_(h.dtype['z'].type == np.uint8)
+ # A small check that data is ok
+ assert_equal(h['z'], np.zeros((2, 3), dtype='u1'))
+
+
+class TestCreateZerosPlain(CreateZeros):
+ """Check the creation of heterogeneous arrays zero-valued (plain)"""
+ _descr = Pdescr
+
+class TestCreateZerosNested(CreateZeros):
+ """Check the creation of heterogeneous arrays zero-valued (nested)"""
+ _descr = Ndescr
+
+
+class CreateValues(object):
+ """Check the creation of heterogeneous arrays with values"""
+
+ def test_tuple(self):
+ """Check creation from tuples"""
+ h = np.array(self._buffer, dtype=self._descr)
+ assert_(normalize_descr(self._descr) == h.dtype.descr)
+ if self.multiple_rows:
+ assert_(h.shape == (2,))
+ else:
+ assert_(h.shape == ())
+
+ def test_list_of_tuple(self):
+ """Check creation from list of tuples"""
+ h = np.array([self._buffer], dtype=self._descr)
+ assert_(normalize_descr(self._descr) == h.dtype.descr)
+ if self.multiple_rows:
+ assert_(h.shape == (1, 2))
+ else:
+ assert_(h.shape == (1,))
+
+ def test_list_of_list_of_tuple(self):
+ """Check creation from list of list of tuples"""
+ h = np.array([[self._buffer]], dtype=self._descr)
+ assert_(normalize_descr(self._descr) == h.dtype.descr)
+ if self.multiple_rows:
+ assert_(h.shape == (1, 1, 2))
+ else:
+ assert_(h.shape == (1, 1))
+
+
+class TestCreateValuesPlainSingle(CreateValues):
+ """Check the creation of heterogeneous arrays (plain, single row)"""
+ _descr = Pdescr
+ multiple_rows = 0
+ _buffer = PbufferT[0]
+
+class TestCreateValuesPlainMultiple(CreateValues):
+ """Check the creation of heterogeneous arrays (plain, multiple rows)"""
+ _descr = Pdescr
+ multiple_rows = 1
+ _buffer = PbufferT
+
+class TestCreateValuesNestedSingle(CreateValues):
+ """Check the creation of heterogeneous arrays (nested, single row)"""
+ _descr = Ndescr
+ multiple_rows = 0
+ _buffer = NbufferT[0]
+
+class TestCreateValuesNestedMultiple(CreateValues):
+ """Check the creation of heterogeneous arrays (nested, multiple rows)"""
+ _descr = Ndescr
+ multiple_rows = 1
+ _buffer = NbufferT
+
+
+############################################################
+# Reading tests
+############################################################
+
+class ReadValuesPlain(object):
+ """Check the reading of values in heterogeneous arrays (plain)"""
+
+ def test_access_fields(self):
+ h = np.array(self._buffer, dtype=self._descr)
+ if not self.multiple_rows:
+ assert_(h.shape == ())
+ assert_equal(h['x'], np.array(self._buffer[0], dtype='i4'))
+ assert_equal(h['y'], np.array(self._buffer[1], dtype='f8'))
+ assert_equal(h['z'], np.array(self._buffer[2], dtype='u1'))
+ else:
+ assert_(len(h) == 2)
+ assert_equal(h['x'], np.array([self._buffer[0][0],
+ self._buffer[1][0]], dtype='i4'))
+ assert_equal(h['y'], np.array([self._buffer[0][1],
+ self._buffer[1][1]], dtype='f8'))
+ assert_equal(h['z'], np.array([self._buffer[0][2],
+ self._buffer[1][2]], dtype='u1'))
+
+
+class TestReadValuesPlainSingle(ReadValuesPlain):
+ """Check the creation of heterogeneous arrays (plain, single row)"""
+ _descr = Pdescr
+ multiple_rows = 0
+ _buffer = PbufferT[0]
+
+class TestReadValuesPlainMultiple(ReadValuesPlain):
+ """Check the values of heterogeneous arrays (plain, multiple rows)"""
+ _descr = Pdescr
+ multiple_rows = 1
+ _buffer = PbufferT
+
+class ReadValuesNested(object):
+ """Check the reading of values in heterogeneous arrays (nested)"""
+
+ def test_access_top_fields(self):
+ """Check reading the top fields of a nested array"""
+ h = np.array(self._buffer, dtype=self._descr)
+ if not self.multiple_rows:
+ assert_(h.shape == ())
+ assert_equal(h['x'], np.array(self._buffer[0], dtype='i4'))
+ assert_equal(h['y'], np.array(self._buffer[4], dtype='f8'))
+ assert_equal(h['z'], np.array(self._buffer[5], dtype='u1'))
+ else:
+ assert_(len(h) == 2)
+ assert_equal(h['x'], np.array([self._buffer[0][0],
+ self._buffer[1][0]], dtype='i4'))
+ assert_equal(h['y'], np.array([self._buffer[0][4],
+ self._buffer[1][4]], dtype='f8'))
+ assert_equal(h['z'], np.array([self._buffer[0][5],
+ self._buffer[1][5]], dtype='u1'))
+
+    def test_nested1_accessors(self):
+ """Check reading the nested fields of a nested array (1st level)"""
+ h = np.array(self._buffer, dtype=self._descr)
+ if not self.multiple_rows:
+ assert_equal(h['Info']['value'],
+ np.array(self._buffer[1][0], dtype='c16'))
+ assert_equal(h['Info']['y2'],
+ np.array(self._buffer[1][1], dtype='f8'))
+ assert_equal(h['info']['Name'],
+ np.array(self._buffer[3][0], dtype='U2'))
+ assert_equal(h['info']['Value'],
+ np.array(self._buffer[3][1], dtype='c16'))
+ else:
+ assert_equal(h['Info']['value'],
+ np.array([self._buffer[0][1][0],
+ self._buffer[1][1][0]],
+ dtype='c16'))
+ assert_equal(h['Info']['y2'],
+ np.array([self._buffer[0][1][1],
+ self._buffer[1][1][1]],
+ dtype='f8'))
+ assert_equal(h['info']['Name'],
+ np.array([self._buffer[0][3][0],
+ self._buffer[1][3][0]],
+ dtype='U2'))
+ assert_equal(h['info']['Value'],
+ np.array([self._buffer[0][3][1],
+ self._buffer[1][3][1]],
+ dtype='c16'))
+
+    def test_nested2_accessors(self):
+ """Check reading the nested fields of a nested array (2nd level)"""
+ h = np.array(self._buffer, dtype=self._descr)
+ if not self.multiple_rows:
+ assert_equal(h['Info']['Info2']['value'],
+ np.array(self._buffer[1][2][1], dtype='c16'))
+ assert_equal(h['Info']['Info2']['z3'],
+ np.array(self._buffer[1][2][3], dtype='u4'))
+ else:
+ assert_equal(h['Info']['Info2']['value'],
+ np.array([self._buffer[0][1][2][1],
+ self._buffer[1][1][2][1]],
+ dtype='c16'))
+ assert_equal(h['Info']['Info2']['z3'],
+ np.array([self._buffer[0][1][2][3],
+ self._buffer[1][1][2][3]],
+ dtype='u4'))
+
+ def test_nested1_descriptor(self):
+ """Check access nested descriptors of a nested array (1st level)"""
+ h = np.array(self._buffer, dtype=self._descr)
+ assert_(h.dtype['Info']['value'].name == 'complex128')
+ assert_(h.dtype['Info']['y2'].name == 'float64')
+ if sys.version_info[0] >= 3:
+ assert_(h.dtype['info']['Name'].name == 'str256')
+ else:
+ assert_(h.dtype['info']['Name'].name == 'unicode256')
+ assert_(h.dtype['info']['Value'].name == 'complex128')
+
+ def test_nested2_descriptor(self):
+ """Check access nested descriptors of a nested array (2nd level)"""
+ h = np.array(self._buffer, dtype=self._descr)
+ assert_(h.dtype['Info']['Info2']['value'].name == 'void256')
+ assert_(h.dtype['Info']['Info2']['z3'].name == 'void64')
+
+
+class TestReadValuesNestedSingle(ReadValuesNested):
+ """Check the values of heterogeneous arrays (nested, single row)"""
+ _descr = Ndescr
+ multiple_rows = False
+ _buffer = NbufferT[0]
+
+class TestReadValuesNestedMultiple(ReadValuesNested):
+ """Check the values of heterogeneous arrays (nested, multiple rows)"""
+ _descr = Ndescr
+ multiple_rows = True
+ _buffer = NbufferT
+
+class TestEmptyField(object):
+ def test_assign(self):
+ a = np.arange(10, dtype=np.float32)
+ a.dtype = [("int", "<0i4"), ("float", "<2f4")]
+ assert_(a['int'].shape == (5, 0))
+ assert_(a['float'].shape == (5, 2))
+
+class TestCommonType(object):
+ def test_scalar_loses1(self):
+ res = np.find_common_type(['f4', 'f4', 'i2'], ['f8'])
+ assert_(res == 'f4')
+
+ def test_scalar_loses2(self):
+ res = np.find_common_type(['f4', 'f4'], ['i8'])
+ assert_(res == 'f4')
+
+ def test_scalar_wins(self):
+ res = np.find_common_type(['f4', 'f4', 'i2'], ['c8'])
+ assert_(res == 'c8')
+
+ def test_scalar_wins2(self):
+ res = np.find_common_type(['u4', 'i4', 'i4'], ['f4'])
+ assert_(res == 'f8')
+
+ def test_scalar_wins3(self): # doesn't go up to 'f16' on purpose
+ res = np.find_common_type(['u8', 'i8', 'i8'], ['f8'])
+ assert_(res == 'f8')
+
+class TestMultipleFields(object):
+ def setup(self):
+ self.ary = np.array([(1, 2, 3, 4), (5, 6, 7, 8)], dtype='i4,f4,i2,c8')
+
+ def _bad_call(self):
+ return self.ary['f0', 'f1']
+
+ def test_no_tuple(self):
+ assert_raises(IndexError, self._bad_call)
+
+ def test_return(self):
+ res = self.ary[['f0', 'f2']].tolist()
+ assert_(res == [(1, 3), (5, 7)])
+
+
+class TestIsSubDType(object):
+ # scalar types can be promoted into dtypes
+ wrappers = [np.dtype, lambda x: x]
+
+ def test_both_abstract(self):
+ assert_(np.issubdtype(np.floating, np.inexact))
+ assert_(not np.issubdtype(np.inexact, np.floating))
+
+ def test_same(self):
+ for cls in (np.float32, np.int32):
+ for w1, w2 in itertools.product(self.wrappers, repeat=2):
+ assert_(np.issubdtype(w1(cls), w2(cls)))
+
+ def test_subclass(self):
+ # note we cannot promote floating to a dtype, as it would turn into a
+ # concrete type
+ for w in self.wrappers:
+ assert_(np.issubdtype(w(np.float32), np.floating))
+ assert_(np.issubdtype(w(np.float64), np.floating))
+
+ def test_subclass_backwards(self):
+ for w in self.wrappers:
+ assert_(not np.issubdtype(np.floating, w(np.float32)))
+ assert_(not np.issubdtype(np.floating, w(np.float64)))
+
+ def test_sibling_class(self):
+ for w1, w2 in itertools.product(self.wrappers, repeat=2):
+ assert_(not np.issubdtype(w1(np.float32), w2(np.float64)))
+ assert_(not np.issubdtype(w1(np.float64), w2(np.float32)))
+
+
+class TestSctypeDict(object):
+ def test_longdouble(self):
+ assert_(np.sctypeDict['f8'] is not np.longdouble)
+ assert_(np.sctypeDict['c16'] is not np.clongdouble)
+
+
+class TestBitName(object):
+ def test_abstract(self):
+ assert_raises(ValueError, np.core.numerictypes.bitname, np.floating)
+
+
+class TestMaximumSctype(object):
+
+    # note that parametrizing with sctypes['int'] and similar would skip types
+ # with the same size (gh-11923)
+
+ @pytest.mark.parametrize('t', [np.byte, np.short, np.intc, np.int_, np.longlong])
+ def test_int(self, t):
+ assert_equal(np.maximum_sctype(t), np.sctypes['int'][-1])
+
+ @pytest.mark.parametrize('t', [np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong])
+ def test_uint(self, t):
+ assert_equal(np.maximum_sctype(t), np.sctypes['uint'][-1])
+
+ @pytest.mark.parametrize('t', [np.half, np.single, np.double, np.longdouble])
+ def test_float(self, t):
+ assert_equal(np.maximum_sctype(t), np.sctypes['float'][-1])
+
+ @pytest.mark.parametrize('t', [np.csingle, np.cdouble, np.clongdouble])
+ def test_complex(self, t):
+ assert_equal(np.maximum_sctype(t), np.sctypes['complex'][-1])
+
+ @pytest.mark.parametrize('t', [np.bool_, np.object_, np.unicode_, np.bytes_, np.void])
+ def test_other(self, t):
+ assert_equal(np.maximum_sctype(t), t)
+
+
+class Test_sctype2char(object):
+ # This function is old enough that we're really just documenting the quirks
+ # at this point.
+
+ def test_scalar_type(self):
+ assert_equal(np.sctype2char(np.double), 'd')
+ assert_equal(np.sctype2char(np.int_), 'l')
+ assert_equal(np.sctype2char(np.unicode_), 'U')
+ assert_equal(np.sctype2char(np.bytes_), 'S')
+
+ def test_other_type(self):
+ assert_equal(np.sctype2char(float), 'd')
+ assert_equal(np.sctype2char(list), 'O')
+ assert_equal(np.sctype2char(np.ndarray), 'O')
+
+ def test_third_party_scalar_type(self):
+ from numpy.core._rational_tests import rational
+ assert_raises(KeyError, np.sctype2char, rational)
+ assert_raises(KeyError, np.sctype2char, rational(1))
+
+ def test_array_instance(self):
+ assert_equal(np.sctype2char(np.array([1.0, 2.0])), 'd')
+
+ def test_abstract_type(self):
+ assert_raises(KeyError, np.sctype2char, np.floating)
+
+ def test_non_type(self):
+ assert_raises(ValueError, np.sctype2char, 1)
+
[email protected]("rep, expected", [
+ (np.int32, True),
+ (list, False),
+ (1.1, False),
+ (str, True),
+ (np.dtype(np.float64), True),
+ (np.dtype((np.int16, (3, 4))), True),
+ (np.dtype([('a', np.int8)]), True),
+ ])
+def test_issctype(rep, expected):
+ # ensure proper identification of scalar
+ # data-types by issctype()
+ actual = np.issctype(rep)
+ assert_equal(actual, expected)
+
+
[email protected](sys.flags.optimize > 1,
+ reason="no docstrings present to inspect when PYTHONOPTIMIZE/Py_OptimizeFlag > 1")
[email protected](IS_PYPY, reason="PyPy does not modify tp_doc")
+class TestDocStrings(object):
+ def test_platform_dependent_aliases(self):
+ if np.int64 is np.int_:
+ assert_('int64' in np.int_.__doc__)
+ elif np.int64 is np.longlong:
+ assert_('int64' in np.longlong.__doc__)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_overrides.py b/contrib/python/numpy/py2/numpy/core/tests/test_overrides.py
new file mode 100644
index 00000000000..8f1c16539ba
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_overrides.py
@@ -0,0 +1,392 @@
+from __future__ import division, absolute_import, print_function
+
+import inspect
+import sys
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_raises_regex)
+from numpy.core.overrides import (
+ _get_implementing_args, array_function_dispatch,
+ verify_matching_signatures, ENABLE_ARRAY_FUNCTION)
+from numpy.core.numeric import pickle
+import pytest
+
+
+requires_array_function = pytest.mark.skipif(
+ not ENABLE_ARRAY_FUNCTION,
+ reason="__array_function__ dispatch not enabled.")
+
+
+def _return_not_implemented(self, *args, **kwargs):
+ return NotImplemented
+
+
+# need to define this at the top level to test pickling
+@array_function_dispatch(lambda array: (array,))
+def dispatched_one_arg(array):
+ """Docstring."""
+ return 'original'
+
+
+@array_function_dispatch(lambda array1, array2: (array1, array2))
+def dispatched_two_arg(array1, array2):
+ """Docstring."""
+ return 'original'
+
+
+@requires_array_function
+class TestGetImplementingArgs(object):
+
+ def test_ndarray(self):
+ array = np.array(1)
+
+ args = _get_implementing_args([array])
+ assert_equal(list(args), [array])
+
+ args = _get_implementing_args([array, array])
+ assert_equal(list(args), [array])
+
+ args = _get_implementing_args([array, 1])
+ assert_equal(list(args), [array])
+
+ args = _get_implementing_args([1, array])
+ assert_equal(list(args), [array])
+
+ def test_ndarray_subclasses(self):
+
+ class OverrideSub(np.ndarray):
+ __array_function__ = _return_not_implemented
+
+ class NoOverrideSub(np.ndarray):
+ pass
+
+ array = np.array(1).view(np.ndarray)
+ override_sub = np.array(1).view(OverrideSub)
+ no_override_sub = np.array(1).view(NoOverrideSub)
+
+ args = _get_implementing_args([array, override_sub])
+ assert_equal(list(args), [override_sub, array])
+
+ args = _get_implementing_args([array, no_override_sub])
+ assert_equal(list(args), [no_override_sub, array])
+
+ args = _get_implementing_args(
+ [override_sub, no_override_sub])
+ assert_equal(list(args), [override_sub, no_override_sub])
+
+ def test_ndarray_and_duck_array(self):
+
+ class Other(object):
+ __array_function__ = _return_not_implemented
+
+ array = np.array(1)
+ other = Other()
+
+ args = _get_implementing_args([other, array])
+ assert_equal(list(args), [other, array])
+
+ args = _get_implementing_args([array, other])
+ assert_equal(list(args), [array, other])
+
+ def test_ndarray_subclass_and_duck_array(self):
+
+ class OverrideSub(np.ndarray):
+ __array_function__ = _return_not_implemented
+
+ class Other(object):
+ __array_function__ = _return_not_implemented
+
+ array = np.array(1)
+ subarray = np.array(1).view(OverrideSub)
+ other = Other()
+
+ assert_equal(_get_implementing_args([array, subarray, other]),
+ [subarray, array, other])
+ assert_equal(_get_implementing_args([array, other, subarray]),
+ [subarray, array, other])
+
+ def test_many_duck_arrays(self):
+
+ class A(object):
+ __array_function__ = _return_not_implemented
+
+ class B(A):
+ __array_function__ = _return_not_implemented
+
+ class C(A):
+ __array_function__ = _return_not_implemented
+
+ class D(object):
+ __array_function__ = _return_not_implemented
+
+ a = A()
+ b = B()
+ c = C()
+ d = D()
+
+ assert_equal(_get_implementing_args([1]), [])
+ assert_equal(_get_implementing_args([a]), [a])
+ assert_equal(_get_implementing_args([a, 1]), [a])
+ assert_equal(_get_implementing_args([a, a, a]), [a])
+ assert_equal(_get_implementing_args([a, d, a]), [a, d])
+ assert_equal(_get_implementing_args([a, b]), [b, a])
+ assert_equal(_get_implementing_args([b, a]), [b, a])
+ assert_equal(_get_implementing_args([a, b, c]), [b, c, a])
+ assert_equal(_get_implementing_args([a, c, b]), [c, b, a])
+
+ def test_too_many_duck_arrays(self):
+ namespace = dict(__array_function__=_return_not_implemented)
+ types = [type('A' + str(i), (object,), namespace) for i in range(33)]
+ relevant_args = [t() for t in types]
+
+ actual = _get_implementing_args(relevant_args[:32])
+ assert_equal(actual, relevant_args[:32])
+
+ with assert_raises_regex(TypeError, 'distinct argument types'):
+ _get_implementing_args(relevant_args)
+
+
+@requires_array_function
+class TestNDArrayArrayFunction(object):
+
+ def test_method(self):
+
+ class Other(object):
+ __array_function__ = _return_not_implemented
+
+ class NoOverrideSub(np.ndarray):
+ pass
+
+ class OverrideSub(np.ndarray):
+ __array_function__ = _return_not_implemented
+
+ array = np.array([1])
+ other = Other()
+ no_override_sub = array.view(NoOverrideSub)
+ override_sub = array.view(OverrideSub)
+
+ result = array.__array_function__(func=dispatched_two_arg,
+ types=(np.ndarray,),
+ args=(array, 1.), kwargs={})
+ assert_equal(result, 'original')
+
+ result = array.__array_function__(func=dispatched_two_arg,
+ types=(np.ndarray, Other),
+ args=(array, other), kwargs={})
+ assert_(result is NotImplemented)
+
+ result = array.__array_function__(func=dispatched_two_arg,
+ types=(np.ndarray, NoOverrideSub),
+ args=(array, no_override_sub),
+ kwargs={})
+ assert_equal(result, 'original')
+
+ result = array.__array_function__(func=dispatched_two_arg,
+ types=(np.ndarray, OverrideSub),
+ args=(array, override_sub),
+ kwargs={})
+ assert_equal(result, 'original')
+
+ with assert_raises_regex(TypeError, 'no implementation found'):
+ np.concatenate((array, other))
+
+ expected = np.concatenate((array, array))
+ result = np.concatenate((array, no_override_sub))
+ assert_equal(result, expected.view(NoOverrideSub))
+ result = np.concatenate((array, override_sub))
+ assert_equal(result, expected.view(OverrideSub))
+
+ def test_no_wrapper(self):
+ array = np.array(1)
+ func = dispatched_one_arg.__wrapped__
+ with assert_raises_regex(AttributeError, '__wrapped__'):
+ array.__array_function__(func=func,
+ types=(np.ndarray,),
+ args=(array,), kwargs={})
+
+
+@requires_array_function
+class TestArrayFunctionDispatch(object):
+
+ def test_pickle(self):
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ roundtripped = pickle.loads(
+ pickle.dumps(dispatched_one_arg, protocol=proto))
+ assert_(roundtripped is dispatched_one_arg)
+
+ def test_name_and_docstring(self):
+ assert_equal(dispatched_one_arg.__name__, 'dispatched_one_arg')
+ if sys.flags.optimize < 2:
+ assert_equal(dispatched_one_arg.__doc__, 'Docstring.')
+
+ def test_interface(self):
+
+ class MyArray(object):
+ def __array_function__(self, func, types, args, kwargs):
+ return (self, func, types, args, kwargs)
+
+ original = MyArray()
+ (obj, func, types, args, kwargs) = dispatched_one_arg(original)
+ assert_(obj is original)
+ assert_(func is dispatched_one_arg)
+ assert_equal(set(types), {MyArray})
+ # assert_equal uses the overloaded np.iscomplexobj() internally
+ assert_(args == (original,))
+ assert_equal(kwargs, {})
+
+ def test_not_implemented(self):
+
+ class MyArray(object):
+ def __array_function__(self, func, types, args, kwargs):
+ return NotImplemented
+
+ array = MyArray()
+ with assert_raises_regex(TypeError, 'no implementation found'):
+ dispatched_one_arg(array)
+
+
+@requires_array_function
+class TestVerifyMatchingSignatures(object):
+
+ def test_verify_matching_signatures(self):
+
+ verify_matching_signatures(lambda x: 0, lambda x: 0)
+ verify_matching_signatures(lambda x=None: 0, lambda x=None: 0)
+ verify_matching_signatures(lambda x=1: 0, lambda x=None: 0)
+
+ with assert_raises(RuntimeError):
+ verify_matching_signatures(lambda a: 0, lambda b: 0)
+ with assert_raises(RuntimeError):
+ verify_matching_signatures(lambda x: 0, lambda x=None: 0)
+ with assert_raises(RuntimeError):
+ verify_matching_signatures(lambda x=None: 0, lambda y=None: 0)
+ with assert_raises(RuntimeError):
+ verify_matching_signatures(lambda x=1: 0, lambda y=1: 0)
+
+ def test_array_function_dispatch(self):
+
+ with assert_raises(RuntimeError):
+ @array_function_dispatch(lambda x: (x,))
+ def f(y):
+ pass
+
+ # should not raise
+ @array_function_dispatch(lambda x: (x,), verify=False)
+ def f(y):
+ pass
+
+
+def _new_duck_type_and_implements():
+ """Create a duck array type and implements functions."""
+ HANDLED_FUNCTIONS = {}
+
+ class MyArray(object):
+ def __array_function__(self, func, types, args, kwargs):
+ if func not in HANDLED_FUNCTIONS:
+ return NotImplemented
+ if not all(issubclass(t, MyArray) for t in types):
+ return NotImplemented
+ return HANDLED_FUNCTIONS[func](*args, **kwargs)
+
+ def implements(numpy_function):
+ """Register an __array_function__ implementations."""
+ def decorator(func):
+ HANDLED_FUNCTIONS[numpy_function] = func
+ return func
+ return decorator
+
+ return (MyArray, implements)
+
+
+@requires_array_function
+class TestArrayFunctionImplementation(object):
+
+ def test_one_arg(self):
+ MyArray, implements = _new_duck_type_and_implements()
+
+ @implements(dispatched_one_arg)
+ def _(array):
+ return 'myarray'
+
+ assert_equal(dispatched_one_arg(1), 'original')
+ assert_equal(dispatched_one_arg(MyArray()), 'myarray')
+
+ def test_optional_args(self):
+ MyArray, implements = _new_duck_type_and_implements()
+
+ @array_function_dispatch(lambda array, option=None: (array,))
+ def func_with_option(array, option='default'):
+ return option
+
+ @implements(func_with_option)
+ def my_array_func_with_option(array, new_option='myarray'):
+ return new_option
+
+ # we don't need to implement every option on __array_function__
+ # implementations
+ assert_equal(func_with_option(1), 'default')
+ assert_equal(func_with_option(1, option='extra'), 'extra')
+ assert_equal(func_with_option(MyArray()), 'myarray')
+ with assert_raises(TypeError):
+ func_with_option(MyArray(), option='extra')
+
+ # but new options on implementations can't be used
+ result = my_array_func_with_option(MyArray(), new_option='yes')
+ assert_equal(result, 'yes')
+ with assert_raises(TypeError):
+ func_with_option(MyArray(), new_option='no')
+
+ def test_not_implemented(self):
+ MyArray, implements = _new_duck_type_and_implements()
+
+ @array_function_dispatch(lambda array: (array,), module='my')
+ def func(array):
+ return array
+
+ array = np.array(1)
+ assert_(func(array) is array)
+ assert_equal(func.__module__, 'my')
+
+ with assert_raises_regex(
+ TypeError, "no implementation found for 'my.func'"):
+ func(MyArray())
+
+
+class TestNDArrayMethods(object):
+
+ def test_repr(self):
+ # gh-12162: should still be defined even if __array_function__ doesn't
+ # implement np.array_repr()
+
+ class MyArray(np.ndarray):
+ def __array_function__(*args, **kwargs):
+ return NotImplemented
+
+ array = np.array(1).view(MyArray)
+ assert_equal(repr(array), 'MyArray(1)')
+ assert_equal(str(array), '1')
+
+
+class TestNumPyFunctions(object):
+
+ def test_set_module(self):
+ assert_equal(np.sum.__module__, 'numpy')
+ assert_equal(np.char.equal.__module__, 'numpy.char')
+ assert_equal(np.fft.fft.__module__, 'numpy.fft')
+ assert_equal(np.linalg.solve.__module__, 'numpy.linalg')
+
+ @pytest.mark.skipif(sys.version_info[0] < 3, reason="Python 3 only")
+ def test_inspect_sum(self):
+ signature = inspect.signature(np.sum)
+ assert_('axis' in signature.parameters)
+
+ @requires_array_function
+ def test_override_sum(self):
+ MyArray, implements = _new_duck_type_and_implements()
+
+ @implements(np.sum)
+ def _(array):
+ return 'yes'
+
+ assert_equal(np.sum(MyArray()), 'yes')
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_print.py b/contrib/python/numpy/py2/numpy/core/tests/test_print.py
new file mode 100644
index 00000000000..c5c091e13ac
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_print.py
@@ -0,0 +1,205 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+import pytest
+
+import numpy as np
+from numpy.testing import assert_, assert_equal
+from numpy.core.tests._locales import CommaDecimalPointLocale
+
+
+if sys.version_info[0] >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
+_REF = {np.inf: 'inf', -np.inf: '-inf', np.nan: 'nan'}
+
+
[email protected]('tp', [np.float32, np.double, np.longdouble])
+def test_float_types(tp):
+ """ Check formatting.
+
+ This is only for the str function, and only for simple types.
+    The precision of np.float32 and np.longdouble isn't the same as the
+    Python float precision.
+
+ """
+ for x in [0, 1, -1, 1e20]:
+ assert_equal(str(tp(x)), str(float(x)),
+ err_msg='Failed str formatting for type %s' % tp)
+
+ if tp(1e16).itemsize > 4:
+ assert_equal(str(tp(1e16)), str(float('1e16')),
+ err_msg='Failed str formatting for type %s' % tp)
+ else:
+ ref = '1e+16'
+ assert_equal(str(tp(1e16)), ref,
+ err_msg='Failed str formatting for type %s' % tp)
+
+
[email protected]('tp', [np.float32, np.double, np.longdouble])
+def test_nan_inf_float(tp):
+ """ Check formatting of nan & inf.
+
+ This is only for the str function, and only for simple types.
+    The precision of np.float32 and np.longdouble isn't the same as the
+    Python float precision.
+
+ """
+ for x in [np.inf, -np.inf, np.nan]:
+ assert_equal(str(tp(x)), _REF[x],
+ err_msg='Failed str formatting for type %s' % tp)
+
+
[email protected]('tp', [np.complex64, np.cdouble, np.clongdouble])
+def test_complex_types(tp):
+ """Check formatting of complex types.
+
+ This is only for the str function, and only for simple types.
+    The precision of np.float32 and np.longdouble isn't the same as the
+    Python float precision.
+
+ """
+ for x in [0, 1, -1, 1e20]:
+ assert_equal(str(tp(x)), str(complex(x)),
+ err_msg='Failed str formatting for type %s' % tp)
+ assert_equal(str(tp(x*1j)), str(complex(x*1j)),
+ err_msg='Failed str formatting for type %s' % tp)
+ assert_equal(str(tp(x + x*1j)), str(complex(x + x*1j)),
+ err_msg='Failed str formatting for type %s' % tp)
+
+ if tp(1e16).itemsize > 8:
+ assert_equal(str(tp(1e16)), str(complex(1e16)),
+ err_msg='Failed str formatting for type %s' % tp)
+ else:
+ ref = '(1e+16+0j)'
+ assert_equal(str(tp(1e16)), ref,
+ err_msg='Failed str formatting for type %s' % tp)
+
+
[email protected]('dtype', [np.complex64, np.cdouble, np.clongdouble])
+def test_complex_inf_nan(dtype):
+ """Check inf/nan formatting of complex types."""
+ TESTS = {
+ complex(np.inf, 0): "(inf+0j)",
+ complex(0, np.inf): "infj",
+ complex(-np.inf, 0): "(-inf+0j)",
+ complex(0, -np.inf): "-infj",
+ complex(np.inf, 1): "(inf+1j)",
+ complex(1, np.inf): "(1+infj)",
+ complex(-np.inf, 1): "(-inf+1j)",
+ complex(1, -np.inf): "(1-infj)",
+ complex(np.nan, 0): "(nan+0j)",
+ complex(0, np.nan): "nanj",
+ complex(-np.nan, 0): "(nan+0j)",
+ complex(0, -np.nan): "nanj",
+ complex(np.nan, 1): "(nan+1j)",
+ complex(1, np.nan): "(1+nanj)",
+ complex(-np.nan, 1): "(nan+1j)",
+ complex(1, -np.nan): "(1+nanj)",
+ }
+ for c, s in TESTS.items():
+ assert_equal(str(dtype(c)), s)
+
+
+# print tests
+def _test_redirected_print(x, tp, ref=None):
+ file = StringIO()
+ file_tp = StringIO()
+ stdout = sys.stdout
+ try:
+ sys.stdout = file_tp
+ print(tp(x))
+ sys.stdout = file
+ if ref:
+ print(ref)
+ else:
+ print(x)
+ finally:
+ sys.stdout = stdout
+
+    assert_equal(file.getvalue(), file_tp.getvalue(),
+                 err_msg='print failed for type %s' % tp)
+
+
[email protected]('tp', [np.float32, np.double, np.longdouble])
+def test_float_type_print(tp):
+ """Check formatting when using print """
+ for x in [0, 1, -1, 1e20]:
+ _test_redirected_print(float(x), tp)
+
+ for x in [np.inf, -np.inf, np.nan]:
+ _test_redirected_print(float(x), tp, _REF[x])
+
+ if tp(1e16).itemsize > 4:
+ _test_redirected_print(float(1e16), tp)
+ else:
+ ref = '1e+16'
+ _test_redirected_print(float(1e16), tp, ref)
+
+
[email protected]('tp', [np.complex64, np.cdouble, np.clongdouble])
+def test_complex_type_print(tp):
+ """Check formatting when using print """
+ # We do not create complex with inf/nan directly because the feature is
+ # missing in python < 2.6
+ for x in [0, 1, -1, 1e20]:
+ _test_redirected_print(complex(x), tp)
+
+ if tp(1e16).itemsize > 8:
+ _test_redirected_print(complex(1e16), tp)
+ else:
+ ref = '(1e+16+0j)'
+ _test_redirected_print(complex(1e16), tp, ref)
+
+ _test_redirected_print(complex(np.inf, 1), tp, '(inf+1j)')
+ _test_redirected_print(complex(-np.inf, 1), tp, '(-inf+1j)')
+ _test_redirected_print(complex(-np.nan, 1), tp, '(nan+1j)')
+
+
+def test_scalar_format():
+ """Test the str.format method with NumPy scalar types"""
+ tests = [('{0}', True, np.bool_),
+ ('{0}', False, np.bool_),
+ ('{0:d}', 130, np.uint8),
+ ('{0:d}', 50000, np.uint16),
+ ('{0:d}', 3000000000, np.uint32),
+ ('{0:d}', 15000000000000000000, np.uint64),
+ ('{0:d}', -120, np.int8),
+ ('{0:d}', -30000, np.int16),
+ ('{0:d}', -2000000000, np.int32),
+ ('{0:d}', -7000000000000000000, np.int64),
+ ('{0:g}', 1.5, np.float16),
+ ('{0:g}', 1.5, np.float32),
+ ('{0:g}', 1.5, np.float64),
+ ('{0:g}', 1.5, np.longdouble),
+ ('{0:g}', 1.5+0.5j, np.complex64),
+ ('{0:g}', 1.5+0.5j, np.complex128),
+ ('{0:g}', 1.5+0.5j, np.clongdouble)]
+
+ for (fmat, val, valtype) in tests:
+ try:
+ assert_equal(fmat.format(val), fmat.format(valtype(val)),
+ "failed with val %s, type %s" % (val, valtype))
+ except ValueError as e:
+ assert_(False,
+ "format raised exception (fmt='%s', val=%s, type=%s, exc='%s')" %
+ (fmat, repr(val), repr(valtype), str(e)))
+
+
+#
+# Locale tests: scalar types formatting should be independent of the locale
+#
+
+class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
+
+ def test_locale_single(self):
+ assert_equal(str(np.float32(1.2)), str(float(1.2)))
+
+ def test_locale_double(self):
+ assert_equal(str(np.double(1.2)), str(float(1.2)))
+
+ def test_locale_longdouble(self):
+ assert_equal(str(np.longdouble('1.2')), str(float(1.2)))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_records.py b/contrib/python/numpy/py2/numpy/core/tests/test_records.py
new file mode 100644
index 00000000000..95ed1fa5bb0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_records.py
@@ -0,0 +1,499 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+try:
+    # Importing the abstract base classes directly from collections
+    # has been deprecated since Python 3.3
+ import collections.abc as collections_abc
+except ImportError:
+ import collections as collections_abc
+import textwrap
+from os import path
+import pytest
+
+import numpy as np
+from numpy.compat import Path
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_array_almost_equal,
+ assert_raises, temppath
+ )
+from numpy.core.numeric import pickle
+
+
+class TestFromrecords(object):
+ def test_fromrecords(self):
+ r = np.rec.fromrecords([[456, 'dbe', 1.2], [2, 'de', 1.3]],
+ names='col1,col2,col3')
+ assert_equal(r[0].item(), (456, 'dbe', 1.2))
+ assert_equal(r['col1'].dtype.kind, 'i')
+ if sys.version_info[0] >= 3:
+ assert_equal(r['col2'].dtype.kind, 'U')
+ assert_equal(r['col2'].dtype.itemsize, 12)
+ else:
+ assert_equal(r['col2'].dtype.kind, 'S')
+ assert_equal(r['col2'].dtype.itemsize, 3)
+ assert_equal(r['col3'].dtype.kind, 'f')
+
+ def test_fromrecords_0len(self):
+ """ Verify fromrecords works with a 0-length input """
+ dtype = [('a', float), ('b', float)]
+ r = np.rec.fromrecords([], dtype=dtype)
+ assert_equal(r.shape, (0,))
+
+ def test_fromrecords_2d(self):
+ data = [
+ [(1, 2), (3, 4), (5, 6)],
+ [(6, 5), (4, 3), (2, 1)]
+ ]
+ expected_a = [[1, 3, 5], [6, 4, 2]]
+ expected_b = [[2, 4, 6], [5, 3, 1]]
+
+ # try with dtype
+ r1 = np.rec.fromrecords(data, dtype=[('a', int), ('b', int)])
+ assert_equal(r1['a'], expected_a)
+ assert_equal(r1['b'], expected_b)
+
+ # try with names
+ r2 = np.rec.fromrecords(data, names=['a', 'b'])
+ assert_equal(r2['a'], expected_a)
+ assert_equal(r2['b'], expected_b)
+
+ assert_equal(r1, r2)
+
+ def test_method_array(self):
+ r = np.rec.array(b'abcdefg' * 100, formats='i2,a3,i4', shape=3, byteorder='big')
+ assert_equal(r[1].item(), (25444, b'efg', 1633837924))
+
+ def test_method_array2(self):
+ r = np.rec.array([(1, 11, 'a'), (2, 22, 'b'), (3, 33, 'c'), (4, 44, 'd'), (5, 55, 'ex'),
+ (6, 66, 'f'), (7, 77, 'g')], formats='u1,f4,a1')
+ assert_equal(r[1].item(), (2, 22.0, b'b'))
+
+ def test_recarray_slices(self):
+ r = np.rec.array([(1, 11, 'a'), (2, 22, 'b'), (3, 33, 'c'), (4, 44, 'd'), (5, 55, 'ex'),
+ (6, 66, 'f'), (7, 77, 'g')], formats='u1,f4,a1')
+ assert_equal(r[1::2][1].item(), (4, 44.0, b'd'))
+
+ def test_recarray_fromarrays(self):
+ x1 = np.array([1, 2, 3, 4])
+ x2 = np.array(['a', 'dd', 'xyz', '12'])
+ x3 = np.array([1.1, 2, 3, 4])
+ r = np.rec.fromarrays([x1, x2, x3], names='a,b,c')
+ assert_equal(r[1].item(), (2, 'dd', 2.0))
+ x1[1] = 34
+ assert_equal(r.a, np.array([1, 2, 3, 4]))
+
+ def test_recarray_fromfile(self):
+ data_dir = path.join(path.dirname(__file__), 'data')
+ filename = path.join(data_dir, 'recarray_from_file.fits')
+ fd = open(filename, 'rb')
+ fd.seek(2880 * 2)
+ r1 = np.rec.fromfile(fd, formats='f8,i4,a5', shape=3, byteorder='big')
+ fd.seek(2880 * 2)
+ r2 = np.rec.array(fd, formats='f8,i4,a5', shape=3, byteorder='big')
+ fd.close()
+ assert_equal(r1, r2)
+
+ def test_recarray_from_obj(self):
+ count = 10
+ a = np.zeros(count, dtype='O')
+ b = np.zeros(count, dtype='f8')
+ c = np.zeros(count, dtype='f8')
+ for i in range(len(a)):
+ a[i] = list(range(1, 10))
+
+ mine = np.rec.fromarrays([a, b, c], names='date,data1,data2')
+ for i in range(len(a)):
+ assert_((mine.date[i] == list(range(1, 10))))
+ assert_((mine.data1[i] == 0.0))
+ assert_((mine.data2[i] == 0.0))
+
+ def test_recarray_repr(self):
+ a = np.array([(1, 0.1), (2, 0.2)],
+ dtype=[('foo', '<i4'), ('bar', '<f8')])
+ a = np.rec.array(a)
+ assert_equal(
+ repr(a),
+ textwrap.dedent("""\
+ rec.array([(1, 0.1), (2, 0.2)],
+ dtype=[('foo', '<i4'), ('bar', '<f8')])""")
+ )
+
+ # make sure non-structured dtypes also show up as rec.array
+ a = np.array(np.ones(4, dtype='f8'))
+ assert_(repr(np.rec.array(a)).startswith('rec.array'))
+
+ # check that the 'np.record' part of the dtype isn't shown
+ a = np.rec.array(np.ones(3, dtype='i4,i4'))
+ assert_equal(repr(a).find('numpy.record'), -1)
+ a = np.rec.array(np.ones(3, dtype='i4'))
+ assert_(repr(a).find('dtype=int32') != -1)
+
+ def test_0d_recarray_repr(self):
+ arr_0d = np.rec.array((1, 2.0, '2003'), dtype='<i4,<f8,<M8[Y]')
+ assert_equal(repr(arr_0d), textwrap.dedent("""\
+ rec.array((1, 2., '2003'),
+ dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<M8[Y]')])"""))
+
+ record = arr_0d[()]
+ assert_equal(repr(record), "(1, 2., '2003')")
+ # 1.13 converted to python scalars before the repr
+ try:
+ np.set_printoptions(legacy='1.13')
+ assert_equal(repr(record), '(1, 2.0, datetime.date(2003, 1, 1))')
+ finally:
+ np.set_printoptions(legacy=False)
+
+ def test_recarray_from_repr(self):
+ a = np.array([(1,'ABC'), (2, "DEF")],
+ dtype=[('foo', int), ('bar', 'S4')])
+ recordarr = np.rec.array(a)
+ recarr = a.view(np.recarray)
+ recordview = a.view(np.dtype((np.record, a.dtype)))
+
+ recordarr_r = eval("numpy." + repr(recordarr), {'numpy': np})
+ recarr_r = eval("numpy." + repr(recarr), {'numpy': np})
+ recordview_r = eval("numpy." + repr(recordview), {'numpy': np})
+
+ assert_equal(type(recordarr_r), np.recarray)
+ assert_equal(recordarr_r.dtype.type, np.record)
+ assert_equal(recordarr, recordarr_r)
+
+ assert_equal(type(recarr_r), np.recarray)
+ assert_equal(recarr_r.dtype.type, np.record)
+ assert_equal(recarr, recarr_r)
+
+ assert_equal(type(recordview_r), np.ndarray)
+ assert_equal(recordview.dtype.type, np.record)
+ assert_equal(recordview, recordview_r)
+
+ def test_recarray_views(self):
+ a = np.array([(1,'ABC'), (2, "DEF")],
+ dtype=[('foo', int), ('bar', 'S4')])
+ b = np.array([1,2,3,4,5], dtype=np.int64)
+
+        # check that np.rec.array gives right dtypes
+ assert_equal(np.rec.array(a).dtype.type, np.record)
+ assert_equal(type(np.rec.array(a)), np.recarray)
+ assert_equal(np.rec.array(b).dtype.type, np.int64)
+ assert_equal(type(np.rec.array(b)), np.recarray)
+
+        # check that viewing as recarray does the same
+ assert_equal(a.view(np.recarray).dtype.type, np.record)
+ assert_equal(type(a.view(np.recarray)), np.recarray)
+ assert_equal(b.view(np.recarray).dtype.type, np.int64)
+ assert_equal(type(b.view(np.recarray)), np.recarray)
+
+        # check that view to non-structured dtype preserves type=np.recarray
+ r = np.rec.array(np.ones(4, dtype="f4,i4"))
+ rv = r.view('f8').view('f4,i4')
+ assert_equal(type(rv), np.recarray)
+ assert_equal(rv.dtype.type, np.record)
+
+        # check that getitem also preserves np.recarray and np.record
+ r = np.rec.array(np.ones(4, dtype=[('a', 'i4'), ('b', 'i4'),
+ ('c', 'i4,i4')]))
+ assert_equal(r['c'].dtype.type, np.record)
+ assert_equal(type(r['c']), np.recarray)
+
+        # and that it preserves subclasses (gh-6949)
+ class C(np.recarray):
+ pass
+
+ c = r.view(C)
+ assert_equal(type(c['c']), C)
+
+ # check that accessing nested structures keep record type, but
+ # not for subarrays, non-void structures, non-structured voids
+ test_dtype = [('a', 'f4,f4'), ('b', 'V8'), ('c', ('f4',2)),
+ ('d', ('i8', 'i4,i4'))]
+ r = np.rec.array([((1,1), b'11111111', [1,1], 1),
+ ((1,1), b'11111111', [1,1], 1)], dtype=test_dtype)
+ assert_equal(r.a.dtype.type, np.record)
+ assert_equal(r.b.dtype.type, np.void)
+ assert_equal(r.c.dtype.type, np.float32)
+ assert_equal(r.d.dtype.type, np.int64)
+ # check the same, but for views
+ r = np.rec.array(np.ones(4, dtype='i4,i4'))
+ assert_equal(r.view('f4,f4').dtype.type, np.record)
+ assert_equal(r.view(('i4',2)).dtype.type, np.int32)
+ assert_equal(r.view('V8').dtype.type, np.void)
+ assert_equal(r.view(('i8', 'i4,i4')).dtype.type, np.int64)
+
+        # check that we can undo the view
+ arrs = [np.ones(4, dtype='f4,i4'), np.ones(4, dtype='f8')]
+ for arr in arrs:
+ rec = np.rec.array(arr)
+ # recommended way to view as an ndarray:
+ arr2 = rec.view(rec.dtype.fields or rec.dtype, np.ndarray)
+ assert_equal(arr2.dtype.type, arr.dtype.type)
+ assert_equal(type(arr2), type(arr))
+
+ def test_recarray_from_names(self):
+ ra = np.rec.array([
+ (1, 'abc', 3.7000002861022949, 0),
+ (2, 'xy', 6.6999998092651367, 1),
+ (0, ' ', 0.40000000596046448, 0)],
+ names='c1, c2, c3, c4')
+ pa = np.rec.fromrecords([
+ (1, 'abc', 3.7000002861022949, 0),
+ (2, 'xy', 6.6999998092651367, 1),
+ (0, ' ', 0.40000000596046448, 0)],
+ names='c1, c2, c3, c4')
+ assert_(ra.dtype == pa.dtype)
+ assert_(ra.shape == pa.shape)
+ for k in range(len(ra)):
+ assert_(ra[k].item() == pa[k].item())
+
+ def test_recarray_conflict_fields(self):
+ ra = np.rec.array([(1, 'abc', 2.3), (2, 'xyz', 4.2),
+ (3, 'wrs', 1.3)],
+ names='field, shape, mean')
+ ra.mean = [1.1, 2.2, 3.3]
+ assert_array_almost_equal(ra['mean'], [1.1, 2.2, 3.3])
+ assert_(type(ra.mean) is type(ra.var))
+ ra.shape = (1, 3)
+ assert_(ra.shape == (1, 3))
+ ra.shape = ['A', 'B', 'C']
+ assert_array_equal(ra['shape'], [['A', 'B', 'C']])
+ ra.field = 5
+ assert_array_equal(ra['field'], [[5, 5, 5]])
+ assert_(isinstance(ra.field, collections_abc.Callable))
+
+ def test_fromrecords_with_explicit_dtype(self):
+ a = np.rec.fromrecords([(1, 'a'), (2, 'bbb')],
+ dtype=[('a', int), ('b', object)])
+ assert_equal(a.a, [1, 2])
+ assert_equal(a[0].a, 1)
+ assert_equal(a.b, ['a', 'bbb'])
+ assert_equal(a[-1].b, 'bbb')
+ #
+ ndtype = np.dtype([('a', int), ('b', object)])
+ a = np.rec.fromrecords([(1, 'a'), (2, 'bbb')], dtype=ndtype)
+ assert_equal(a.a, [1, 2])
+ assert_equal(a[0].a, 1)
+ assert_equal(a.b, ['a', 'bbb'])
+ assert_equal(a[-1].b, 'bbb')
+
+ def test_recarray_stringtypes(self):
+ # Issue #3993
+ a = np.array([('abc ', 1), ('abc', 2)],
+ dtype=[('foo', 'S4'), ('bar', int)])
+ a = a.view(np.recarray)
+ assert_equal(a.foo[0] == a.foo[1], False)
+
+ def test_recarray_returntypes(self):
+ qux_fields = {'C': (np.dtype('S5'), 0), 'D': (np.dtype('S5'), 6)}
+ a = np.rec.array([('abc ', (1,1), 1, ('abcde', 'fgehi')),
+ ('abc', (2,3), 1, ('abcde', 'jklmn'))],
+ dtype=[('foo', 'S4'),
+ ('bar', [('A', int), ('B', int)]),
+ ('baz', int), ('qux', qux_fields)])
+ assert_equal(type(a.foo), np.ndarray)
+ assert_equal(type(a['foo']), np.ndarray)
+ assert_equal(type(a.bar), np.recarray)
+ assert_equal(type(a['bar']), np.recarray)
+ assert_equal(a.bar.dtype.type, np.record)
+ assert_equal(type(a['qux']), np.recarray)
+ assert_equal(a.qux.dtype.type, np.record)
+ assert_equal(dict(a.qux.dtype.fields), qux_fields)
+ assert_equal(type(a.baz), np.ndarray)
+ assert_equal(type(a['baz']), np.ndarray)
+ assert_equal(type(a[0].bar), np.record)
+ assert_equal(type(a[0]['bar']), np.record)
+ assert_equal(a[0].bar.A, 1)
+ assert_equal(a[0].bar['A'], 1)
+ assert_equal(a[0]['bar'].A, 1)
+ assert_equal(a[0]['bar']['A'], 1)
+ assert_equal(a[0].qux.D, b'fgehi')
+ assert_equal(a[0].qux['D'], b'fgehi')
+ assert_equal(a[0]['qux'].D, b'fgehi')
+ assert_equal(a[0]['qux']['D'], b'fgehi')
+
+ def test_zero_width_strings(self):
+ # Test for #6430, based on the test case from #1901
+
+ cols = [['test'] * 3, [''] * 3]
+ rec = np.rec.fromarrays(cols)
+ assert_equal(rec['f0'], ['test', 'test', 'test'])
+ assert_equal(rec['f1'], ['', '', ''])
+
+ dt = np.dtype([('f0', '|S4'), ('f1', '|S')])
+ rec = np.rec.fromarrays(cols, dtype=dt)
+ assert_equal(rec.itemsize, 4)
+ assert_equal(rec['f0'], [b'test', b'test', b'test'])
+ assert_equal(rec['f1'], [b'', b'', b''])
+
+
[email protected](Path is None, reason="No pathlib.Path")
+class TestPathUsage(object):
+ # Test that pathlib.Path can be used
+ def test_tofile_fromfile(self):
+ with temppath(suffix='.bin') as path:
+ path = Path(path)
+ np.random.seed(123)
+ a = np.random.rand(10).astype('f8,i4,a5')
+ a[5] = (0.5,10,'abcde')
+ with path.open("wb") as fd:
+ a.tofile(fd)
+ x = np.core.records.fromfile(path,
+ formats='f8,i4,a5',
+ shape=10)
+ assert_array_equal(x, a)
+
+
+class TestRecord(object):
+ def setup(self):
+ self.data = np.rec.fromrecords([(1, 2, 3), (4, 5, 6)],
+ dtype=[("col1", "<i4"),
+ ("col2", "<i4"),
+ ("col3", "<i4")])
+
+ def test_assignment1(self):
+ a = self.data
+ assert_equal(a.col1[0], 1)
+ a[0].col1 = 0
+ assert_equal(a.col1[0], 0)
+
+ def test_assignment2(self):
+ a = self.data
+ assert_equal(a.col1[0], 1)
+ a.col1[0] = 0
+ assert_equal(a.col1[0], 0)
+
+ def test_invalid_assignment(self):
+ a = self.data
+
+ def assign_invalid_column(x):
+ x[0].col5 = 1
+
+ assert_raises(AttributeError, assign_invalid_column, a)
+
+ def test_nonwriteable_setfield(self):
+ # gh-8171
+ r = np.rec.array([(0,), (1,)], dtype=[('f', 'i4')])
+ r.flags.writeable = False
+ with assert_raises(ValueError):
+ r.f = [2, 3]
+ with assert_raises(ValueError):
+ r.setfield([2,3], *r.dtype.fields['f'])
+
+ def test_out_of_order_fields(self):
+ # names in the same order, padding added to descr
+ x = self.data[['col1', 'col2']]
+ assert_equal(x.dtype.names, ('col1', 'col2'))
+ assert_equal(x.dtype.descr,
+ [('col1', '<i4'), ('col2', '<i4'), ('', '|V4')])
+
+ # names change order to match indexing, as of 1.14 - descr can't
+ # represent that
+ y = self.data[['col2', 'col1']]
+ assert_equal(y.dtype.names, ('col2', 'col1'))
+ assert_raises(ValueError, lambda: y.dtype.descr)
+
+ def test_pickle_1(self):
+ # Issue #1529
+ a = np.array([(1, [])], dtype=[('a', np.int32), ('b', np.int32, 0)])
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ assert_equal(a, pickle.loads(pickle.dumps(a, protocol=proto)))
+ assert_equal(a[0], pickle.loads(pickle.dumps(a[0],
+ protocol=proto)))
+
+ def test_pickle_2(self):
+ a = self.data
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ assert_equal(a, pickle.loads(pickle.dumps(a, protocol=proto)))
+ assert_equal(a[0], pickle.loads(pickle.dumps(a[0],
+ protocol=proto)))
+
+ def test_pickle_3(self):
+ # Issue #7140
+ a = self.data
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ pa = pickle.loads(pickle.dumps(a[0], protocol=proto))
+ assert_(pa.flags.c_contiguous)
+ assert_(pa.flags.f_contiguous)
+ assert_(pa.flags.writeable)
+ assert_(pa.flags.aligned)
+
+ def test_objview_record(self):
+ # https://github.com/numpy/numpy/issues/2599
+ dt = np.dtype([('foo', 'i8'), ('bar', 'O')])
+ r = np.zeros((1,3), dtype=dt).view(np.recarray)
+ r.foo = np.array([1, 2, 3]) # TypeError?
+
+ # https://github.com/numpy/numpy/issues/3256
+ ra = np.recarray((2,), dtype=[('x', object), ('y', float), ('z', int)])
+ ra[['x','y']] # TypeError?
+
+ def test_record_scalar_setitem(self):
+ # https://github.com/numpy/numpy/issues/3561
+ rec = np.recarray(1, dtype=[('x', float, 5)])
+ rec[0].x = 1
+ assert_equal(rec[0].x, np.ones(5))
+
+ def test_missing_field(self):
+ # https://github.com/numpy/numpy/issues/4806
+ arr = np.zeros((3,), dtype=[('x', int), ('y', int)])
+ assert_raises(ValueError, lambda: arr[['nofield']])
+
+ @pytest.mark.parametrize('nfields', [0, 1, 2])
+ def test_assign_dtype_attribute(self, nfields):
+ dt = np.dtype([('a', np.uint8), ('b', np.uint8), ('c', np.uint8)][:nfields])
+ data = np.zeros(3, dt).view(np.recarray)
+
+ # the original and resulting dtypes differ on whether they are records
+ assert data.dtype.type == np.record
+ assert dt.type != np.record
+
+ # ensure that the dtype remains a record even when assigned
+ data.dtype = dt
+ assert data.dtype.type == np.record
+
+ @pytest.mark.parametrize('nfields', [0, 1, 2])
+ def test_nested_fields_are_records(self, nfields):
+ """ Test that nested structured types are treated as records too """
+ dt = np.dtype([('a', np.uint8), ('b', np.uint8), ('c', np.uint8)][:nfields])
+ dt_outer = np.dtype([('inner', dt)])
+
+ data = np.zeros(3, dt_outer).view(np.recarray)
+ assert isinstance(data, np.recarray)
+ assert isinstance(data['inner'], np.recarray)
+
+ data0 = data[0]
+ assert isinstance(data0, np.record)
+ assert isinstance(data0['inner'], np.record)
+
+ def test_nested_dtype_padding(self):
+ """ test that trailing padding is preserved """
+ # construct a dtype with padding at the end
+ dt = np.dtype([('a', np.uint8), ('b', np.uint8), ('c', np.uint8)])
+ dt_padded_end = np.dtype(dict(
+ names=['a', 'b'],
+ formats=[np.uint8, np.uint8],
+ offsets=[0, 1],
+ itemsize=3
+ )) # dt[['a', 'b']], but that's not supported in 1.16
+ assert dt_padded_end.itemsize == dt.itemsize
+
+ dt_outer = np.dtype([('inner', dt_padded_end)])
+
+ data = np.zeros(3, dt_outer).view(np.recarray)
+ assert_equal(data['inner'].dtype, dt_padded_end)
+
+ data0 = data[0]
+ assert_equal(data0['inner'].dtype, dt_padded_end)
+
+
+def test_find_duplicate():
+ l1 = [1, 2, 3, 4, 5, 6]
+ assert_(np.rec.find_duplicate(l1) == [])
+
+ l2 = [1, 2, 1, 4, 5, 6]
+ assert_(np.rec.find_duplicate(l2) == [1])
+
+ l3 = [1, 2, 1, 4, 1, 6, 2, 3]
+ assert_(np.rec.find_duplicate(l3) == [1, 2])
+
+ l3 = [2, 2, 1, 4, 1, 6, 2, 3]
+ assert_(np.rec.find_duplicate(l3) == [2, 1])
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_regression.py b/contrib/python/numpy/py2/numpy/core/tests/test_regression.py
new file mode 100644
index 00000000000..8d84b2c12db
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_regression.py
@@ -0,0 +1,2487 @@
+from __future__ import division, absolute_import, print_function
+
+import copy
+import sys
+import gc
+import tempfile
+import pytest
+from os import path
+from io import BytesIO
+from itertools import chain
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, IS_PYPY, assert_almost_equal,
+ assert_array_equal, assert_array_almost_equal, assert_raises,
+ assert_raises_regex, assert_warns, suppress_warnings,
+ _assert_valid_refcount, HAS_REFCOUNT,
+ )
+from numpy.compat import asbytes, asunicode, long
+from numpy.core.numeric import pickle
+
+try:
+ RecursionError
+except NameError:
+ RecursionError = RuntimeError # python < 3.5
+
+class TestRegression(object):
+ def test_invalid_round(self):
+ # Ticket #3
+ v = 4.7599999999999998
+ assert_array_equal(np.array([v]), np.array(v))
+
+ def test_mem_empty(self):
+ # Ticket #7
+ np.empty((1,), dtype=[('x', np.int64)])
+
+ def test_pickle_transposed(self):
+ # Ticket #16
+ a = np.transpose(np.array([[2, 9], [7, 0], [3, 8]]))
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ f = BytesIO()
+ pickle.dump(a, f, protocol=proto)
+ f.seek(0)
+ b = pickle.load(f)
+ f.close()
+ assert_array_equal(a, b)
+
+ def test_typeNA(self):
+ # Issue gh-515
+ with suppress_warnings() as sup:
+ sup.filter(np.VisibleDeprecationWarning)
+ assert_equal(np.typeNA[np.int64], 'Int64')
+ assert_equal(np.typeNA[np.uint64], 'UInt64')
+
+ def test_dtype_names(self):
+ # Ticket #35
+ # Should succeed
+ np.dtype([(('name', 'label'), np.int32, 3)])
+
+ def test_reduce(self):
+ # Ticket #40
+ assert_almost_equal(np.add.reduce([1., .5], dtype=None), 1.5)
+
+ def test_zeros_order(self):
+ # Ticket #43
+ np.zeros([3], int, 'C')
+ np.zeros([3], order='C')
+ np.zeros([3], int, order='C')
+
+ def test_asarray_with_order(self):
+ # Check that nothing is done when order='F' and array C/F-contiguous
+ a = np.ones(2)
+ assert_(a is np.asarray(a, order='F'))
+
+ def test_ravel_with_order(self):
+ # Check that ravel works when order='F' and array C/F-contiguous
+ a = np.ones(2)
+ assert_(not a.ravel('F').flags.owndata)
+
+ def test_sort_bigendian(self):
+ # Ticket #47
+ a = np.linspace(0, 10, 11)
+ c = a.astype(np.dtype('<f8'))
+ c.sort()
+ assert_array_almost_equal(c, a)
+
+ def test_negative_nd_indexing(self):
+ # Ticket #49
+ c = np.arange(125).reshape((5, 5, 5))
+ origidx = np.array([-1, 0, 1])
+ idx = np.array(origidx)
+ c[idx]
+ assert_array_equal(idx, origidx)
+
+ def test_char_dump(self):
+ # Ticket #50
+ ca = np.char.array(np.arange(1000, 1010), itemsize=4)
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ f = BytesIO()
+ pickle.dump(ca, f, protocol=proto)
+ f.seek(0)
+ ca = np.load(f, allow_pickle=True)
+ f.close()
+
+ def test_noncontiguous_fill(self):
+ # Ticket #58.
+ a = np.zeros((5, 3))
+ b = a[:, :2,]
+
+ def rs():
+ b.shape = (10,)
+
+ assert_raises(AttributeError, rs)
+
+ def test_bool(self):
+ # Ticket #60
+ np.bool_(1) # Should succeed
+
+ def test_indexing1(self):
+ # Ticket #64
+ descr = [('x', [('y', [('z', 'c16', (2,)),]),]),]
+ buffer = ((([6j, 4j],),),)
+ h = np.array(buffer, dtype=descr)
+ h['x']['y']['z']
+
+ def test_indexing2(self):
+ # Ticket #65
+ descr = [('x', 'i4', (2,))]
+ buffer = ([3, 2],)
+ h = np.array(buffer, dtype=descr)
+ h['x']
+
+ def test_round(self):
+ # Ticket #67
+ x = np.array([1+2j])
+ assert_almost_equal(x**(-1), [1/(1+2j)])
+
+ def test_scalar_compare(self):
+ # Trac Ticket #72
+ # https://github.com/numpy/numpy/issues/565
+ a = np.array(['test', 'auto'])
+ assert_array_equal(a == 'auto', np.array([False, True]))
+ assert_(a[1] == 'auto')
+ assert_(a[0] != 'auto')
+ b = np.linspace(0, 10, 11)
+ # This should return true for now, but will eventually raise an error:
+ with suppress_warnings() as sup:
+ sup.filter(FutureWarning)
+ assert_(b != 'auto')
+ assert_(b[0] != 'auto')
+
+ def test_unicode_swapping(self):
+ # Ticket #79
+ ulen = 1
+ ucs_value = u'\U0010FFFF'
+ ua = np.array([[[ucs_value*ulen]*2]*3]*4, dtype='U%s' % ulen)
+ ua.newbyteorder() # Should succeed.
+
+ def test_object_array_fill(self):
+ # Ticket #86
+ x = np.zeros(1, 'O')
+ x.fill([])
+
+ def test_mem_dtype_align(self):
+ # Ticket #93
+ assert_raises(TypeError, np.dtype,
+ {'names':['a'], 'formats':['foo']}, align=1)
+
+ def test_endian_bool_indexing(self):
+ # Ticket #105
+ a = np.arange(10., dtype='>f8')
+ b = np.arange(10., dtype='<f8')
+ xa = np.where((a > 2) & (a < 6))
+ xb = np.where((b > 2) & (b < 6))
+ ya = ((a > 2) & (a < 6))
+ yb = ((b > 2) & (b < 6))
+ assert_array_almost_equal(xa, ya.nonzero())
+ assert_array_almost_equal(xb, yb.nonzero())
+ assert_(np.all(a[ya] > 0.5))
+ assert_(np.all(b[yb] > 0.5))
+
+ def test_endian_where(self):
+ # GitHub issue #369
+ net = np.zeros(3, dtype='>f4')
+ net[1] = 0.00458849
+ net[2] = 0.605202
+ max_net = net.max()
+ test = np.where(net <= 0., max_net, net)
+ correct = np.array([ 0.60520202, 0.00458849, 0.60520202])
+ assert_array_almost_equal(test, correct)
+
+ def test_endian_recarray(self):
+ # Ticket #2185
+ dt = np.dtype([
+ ('head', '>u4'),
+ ('data', '>u4', 2),
+ ])
+ buf = np.recarray(1, dtype=dt)
+ buf[0]['head'] = 1
+ buf[0]['data'][:] = [1, 1]
+
+ h = buf[0]['head']
+ d = buf[0]['data'][0]
+ buf[0]['head'] = h
+ buf[0]['data'][0] = d
+ assert_(buf[0]['head'] == 1)
+
+ def test_mem_dot(self):
+ # Ticket #106
+ x = np.random.randn(0, 1)
+ y = np.random.randn(10, 1)
+ # Dummy array to detect bad memory access:
+ _z = np.ones(10)
+ _dummy = np.empty((0, 10))
+ z = np.lib.stride_tricks.as_strided(_z, _dummy.shape, _dummy.strides)
+ np.dot(x, np.transpose(y), out=z)
+ assert_equal(_z, np.ones(10))
+ # Do the same for the built-in dot:
+ np.core.multiarray.dot(x, np.transpose(y), out=z)
+ assert_equal(_z, np.ones(10))
+
+ def test_arange_endian(self):
+ # Ticket #111
+ ref = np.arange(10)
+ x = np.arange(10, dtype='<f8')
+ assert_array_equal(ref, x)
+ x = np.arange(10, dtype='>f8')
+ assert_array_equal(ref, x)
+
+ def test_arange_inf_step(self):
+ ref = np.arange(0, 1, 10)
+ x = np.arange(0, 1, np.inf)
+ assert_array_equal(ref, x)
+
+ ref = np.arange(0, 1, -10)
+ x = np.arange(0, 1, -np.inf)
+ assert_array_equal(ref, x)
+
+ ref = np.arange(0, -1, -10)
+ x = np.arange(0, -1, -np.inf)
+ assert_array_equal(ref, x)
+
+ ref = np.arange(0, -1, 10)
+ x = np.arange(0, -1, np.inf)
+ assert_array_equal(ref, x)
+
+ def test_arange_underflow_stop_and_step(self):
+ finfo = np.finfo(np.float64)
+
+ ref = np.arange(0, finfo.eps, 2 * finfo.eps)
+ x = np.arange(0, finfo.eps, finfo.max)
+ assert_array_equal(ref, x)
+
+ ref = np.arange(0, finfo.eps, -2 * finfo.eps)
+ x = np.arange(0, finfo.eps, -finfo.max)
+ assert_array_equal(ref, x)
+
+ ref = np.arange(0, -finfo.eps, -2 * finfo.eps)
+ x = np.arange(0, -finfo.eps, -finfo.max)
+ assert_array_equal(ref, x)
+
+ ref = np.arange(0, -finfo.eps, 2 * finfo.eps)
+ x = np.arange(0, -finfo.eps, finfo.max)
+ assert_array_equal(ref, x)
+
+ def test_argmax(self):
+ # Ticket #119
+ a = np.random.normal(0, 1, (4, 5, 6, 7, 8))
+ for i in range(a.ndim):
+ a.argmax(i) # Should succeed
+
+ def test_mem_divmod(self):
+ # Ticket #126
+ for i in range(10):
+ divmod(np.array([i])[0], 10)
+
+ def test_hstack_invalid_dims(self):
+ # Ticket #128
+ x = np.arange(9).reshape((3, 3))
+ y = np.array([0, 0, 0])
+ assert_raises(ValueError, np.hstack, (x, y))
+
+ def test_squeeze_type(self):
+ # Ticket #133
+ a = np.array([3])
+ b = np.array(3)
+ assert_(type(a.squeeze()) is np.ndarray)
+ assert_(type(b.squeeze()) is np.ndarray)
+
+ def test_add_identity(self):
+ # Ticket #143
+ assert_equal(0, np.add.identity)
+
+ def test_numpy_float_python_long_addition(self):
+ # Check that numpy float and python longs can be added correctly.
+ a = np.float_(23.) + 2**135
+ assert_equal(a, 23. + 2**135)
+
+ def test_binary_repr_0(self):
+ # Ticket #151
+ assert_equal('0', np.binary_repr(0))
+
+ def test_rec_iterate(self):
+ # Ticket #160
+ descr = np.dtype([('i', int), ('f', float), ('s', '|S3')])
+ x = np.rec.array([(1, 1.1, '1.0'),
+ (2, 2.2, '2.0')], dtype=descr)
+ x[0].tolist()
+ [i for i in x[0]]
+
+ def test_unicode_string_comparison(self):
+ # Ticket #190
+ a = np.array('hello', np.unicode_)
+ b = np.array('world')
+ a == b
+
+ def test_tobytes_FORTRANORDER_discontiguous(self):
+ # Fix in r2836
+ # Create non-contiguous Fortran ordered array
+ x = np.array(np.random.rand(3, 3), order='F')[:, :2]
+ assert_array_almost_equal(x.ravel(), np.frombuffer(x.tobytes()))
+
+ def test_flat_assignment(self):
+ # Correct behaviour of ticket #194
+ x = np.empty((3, 1))
+ x.flat = np.arange(3)
+ assert_array_almost_equal(x, [[0], [1], [2]])
+ x.flat = np.arange(3, dtype=float)
+ assert_array_almost_equal(x, [[0], [1], [2]])
+
+ def test_broadcast_flat_assignment(self):
+ # Ticket #194
+ x = np.empty((3, 1))
+
+ def bfa():
+ x[:] = np.arange(3)
+
+ def bfb():
+ x[:] = np.arange(3, dtype=float)
+
+ assert_raises(ValueError, bfa)
+ assert_raises(ValueError, bfb)
+
+ def test_nonarray_assignment(self):
+ # See also Issue gh-2870, test for non-array assignment
+ # and equivalent unsafely cast array assignment
+ a = np.arange(10)
+ b = np.ones(10, dtype=bool)
+ r = np.arange(10)
+
+ def assign(a, b, c):
+ a[b] = c
+
+ assert_raises(ValueError, assign, a, b, np.nan)
+ a[b] = np.array(np.nan) # but not this.
+ assert_raises(ValueError, assign, a, r, np.nan)
+ a[r] = np.array(np.nan)
+
+ def test_unpickle_dtype_with_object(self):
+ # Implemented in r2840
+ dt = np.dtype([('x', int), ('y', np.object_), ('z', 'O')])
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ f = BytesIO()
+ pickle.dump(dt, f, protocol=proto)
+ f.seek(0)
+ dt_ = pickle.load(f)
+ f.close()
+ assert_equal(dt, dt_)
+
+ def test_mem_array_creation_invalid_specification(self):
+ # Ticket #196
+ dt = np.dtype([('x', int), ('y', np.object_)])
+ # Wrong way
+ assert_raises(ValueError, np.array, [1, 'object'], dt)
+ # Correct way
+ np.array([(1, 'object')], dt)
+
+ def test_recarray_single_element(self):
+ # Ticket #202
+ a = np.array([1, 2, 3], dtype=np.int32)
+ b = a.copy()
+ r = np.rec.array(a, shape=1, formats=['3i4'], names=['d'])
+ assert_array_equal(a, b)
+ assert_equal(a, r[0][0])
+
+ def test_zero_sized_array_indexing(self):
+ # Ticket #205
+ tmp = np.array([])
+
+ def index_tmp():
+ tmp[np.array(10)]
+
+ assert_raises(IndexError, index_tmp)
+
+ def test_chararray_rstrip(self):
+ # Ticket #222
+ x = np.chararray((1,), 5)
+ x[0] = b'a '
+ x = x.rstrip()
+ assert_equal(x[0], b'a')
+
+ def test_object_array_shape(self):
+ # Ticket #239
+ assert_equal(np.array([[1, 2], 3, 4], dtype=object).shape, (3,))
+ assert_equal(np.array([[1, 2], [3, 4]], dtype=object).shape, (2, 2))
+ assert_equal(np.array([(1, 2), (3, 4)], dtype=object).shape, (2, 2))
+ assert_equal(np.array([], dtype=object).shape, (0,))
+ assert_equal(np.array([[], [], []], dtype=object).shape, (3, 0))
+ assert_equal(np.array([[3, 4], [5, 6], None], dtype=object).shape, (3,))
+
+ def test_mem_around(self):
+ # Ticket #243
+ x = np.zeros((1,))
+ y = [0]
+ decimal = 6
+ np.around(abs(x-y), decimal) <= 10.0**(-decimal)
+
+ def test_character_array_strip(self):
+ # Ticket #246
+ x = np.char.array(("x", "x ", "x "))
+ for c in x:
+ assert_equal(c, "x")
+
+ def test_lexsort(self):
+ # Lexsort memory error
+ v = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+ assert_equal(np.lexsort(v), 0)
+
+ def test_lexsort_invalid_sequence(self):
+ # Issue gh-4123
+ class BuggySequence(object):
+ def __len__(self):
+ return 4
+
+ def __getitem__(self, key):
+ raise KeyError
+
+ assert_raises(KeyError, np.lexsort, BuggySequence())
+
+ def test_pickle_py2_bytes_encoding(self):
+ # Check that arrays and scalars pickled on Py2 are
+ # unpickleable on Py3 using encoding='bytes'
+
+ test_data = [
+ # (original, py2_pickle)
+ (np.unicode_('\u6f2c'),
+ b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n"
+ b"(S'U1'\np2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI4\nI4\n"
+ b"I0\ntp6\nbS',o\\x00\\x00'\np7\ntp8\nRp9\n."),
+
+ (np.array([9e123], dtype=np.float64),
+ b"cnumpy.core.multiarray\n_reconstruct\np0\n(cnumpy\nndarray\n"
+ b"p1\n(I0\ntp2\nS'b'\np3\ntp4\nRp5\n(I1\n(I1\ntp6\ncnumpy\ndtype\n"
+ b"p7\n(S'f8'\np8\nI0\nI1\ntp9\nRp10\n(I3\nS'<'\np11\nNNNI-1\nI-1\n"
+ b"I0\ntp12\nbI00\nS'O\\x81\\xb7Z\\xaa:\\xabY'\np13\ntp14\nb."),
+
+ (np.array([(9e123,)], dtype=[('name', float)]),
+ b"cnumpy.core.multiarray\n_reconstruct\np0\n(cnumpy\nndarray\np1\n"
+ b"(I0\ntp2\nS'b'\np3\ntp4\nRp5\n(I1\n(I1\ntp6\ncnumpy\ndtype\np7\n"
+ b"(S'V8'\np8\nI0\nI1\ntp9\nRp10\n(I3\nS'|'\np11\nN(S'name'\np12\ntp13\n"
+ b"(dp14\ng12\n(g7\n(S'f8'\np15\nI0\nI1\ntp16\nRp17\n(I3\nS'<'\np18\nNNNI-1\n"
+ b"I-1\nI0\ntp19\nbI0\ntp20\nsI8\nI1\nI0\ntp21\n"
+ b"bI00\nS'O\\x81\\xb7Z\\xaa:\\xabY'\np22\ntp23\nb."),
+ ]
+
+ if sys.version_info[:2] >= (3, 4):
+ # encoding='bytes' was added in Py3.4
+ for original, data in test_data:
+ result = pickle.loads(data, encoding='bytes')
+ assert_equal(result, original)
+
+ if isinstance(result, np.ndarray) and result.dtype.names is not None:
+ for name in result.dtype.names:
+ assert_(isinstance(name, str))
+
+ def test_pickle_dtype(self):
+ # Ticket #251
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ pickle.dumps(float, protocol=proto)
+
+ def test_swap_real(self):
+ # Ticket #265
+ assert_equal(np.arange(4, dtype='>c8').imag.max(), 0.0)
+ assert_equal(np.arange(4, dtype='<c8').imag.max(), 0.0)
+ assert_equal(np.arange(4, dtype='>c8').real.max(), 3.0)
+ assert_equal(np.arange(4, dtype='<c8').real.max(), 3.0)
+
+ def test_object_array_from_list(self):
+ # Ticket #270
+ assert_(np.array([1, 'A', None]).shape == (3,))
+
+ def test_multiple_assign(self):
+ # Ticket #273
+ a = np.zeros((3, 1), int)
+ a[[1, 2]] = 1
+
+ def test_empty_array_type(self):
+ assert_equal(np.array([]).dtype, np.zeros(0).dtype)
+
+ def test_void_copyswap(self):
+ dt = np.dtype([('one', '<i4'), ('two', '<i4')])
+ x = np.array((1, 2), dtype=dt)
+ x = x.byteswap()
+ assert_(x['one'] > 1 and x['two'] > 2)
+
+ def test_method_args(self):
+ # Make sure methods and functions have the same default axis
+ # keyword and arguments
+ funcs1 = ['argmax', 'argmin', 'sum', ('product', 'prod'),
+ ('sometrue', 'any'),
+ ('alltrue', 'all'), 'cumsum', ('cumproduct', 'cumprod'),
+ 'ptp', 'cumprod', 'prod', 'std', 'var', 'mean',
+ 'round', 'min', 'max', 'argsort', 'sort']
+ funcs2 = ['compress', 'take', 'repeat']
+
+ for func in funcs1:
+ arr = np.random.rand(8, 7)
+ arr2 = arr.copy()
+ if isinstance(func, tuple):
+ func_meth = func[1]
+ func = func[0]
+ else:
+ func_meth = func
+ res1 = getattr(arr, func_meth)()
+ res2 = getattr(np, func)(arr2)
+ if res1 is None:
+ res1 = arr
+
+ if res1.dtype.kind in 'uib':
+ assert_((res1 == res2).all(), func)
+ else:
+ assert_(abs(res1-res2).max() < 1e-8, func)
+
+ for func in funcs2:
+ arr1 = np.random.rand(8, 7)
+ arr2 = np.random.rand(8, 7)
+ res1 = None
+ if func == 'compress':
+ arr1 = arr1.ravel()
+ res1 = getattr(arr2, func)(arr1)
+ else:
+ arr2 = (15*arr2).astype(int).ravel()
+ if res1 is None:
+ res1 = getattr(arr1, func)(arr2)
+ res2 = getattr(np, func)(arr1, arr2)
+ assert_(abs(res1-res2).max() < 1e-8, func)
+
+ def test_mem_lexsort_strings(self):
+ # Ticket #298
+ lst = ['abc', 'cde', 'fgh']
+ np.lexsort((lst,))
+
+ def test_fancy_index(self):
+ # Ticket #302
+ x = np.array([1, 2])[np.array([0])]
+ assert_equal(x.shape, (1,))
+
+ def test_recarray_copy(self):
+ # Ticket #312
+ dt = [('x', np.int16), ('y', np.float64)]
+ ra = np.array([(1, 2.3)], dtype=dt)
+ rb = np.rec.array(ra, dtype=dt)
+ rb['x'] = 2.
+ assert_(ra['x'] != rb['x'])
+
+ def test_rec_fromarray(self):
+ # Ticket #322
+ x1 = np.array([[1, 2], [3, 4], [5, 6]])
+ x2 = np.array(['a', 'dd', 'xyz'])
+ x3 = np.array([1.1, 2, 3])
+ np.rec.fromarrays([x1, x2, x3], formats="(2,)i4,a3,f8")
+
+ def test_object_array_assign(self):
+ x = np.empty((2, 2), object)
+ x.flat[2] = (1, 2, 3)
+ assert_equal(x.flat[2], (1, 2, 3))
+
+ def test_ndmin_float64(self):
+ # Ticket #324
+ x = np.array([1, 2, 3], dtype=np.float64)
+ assert_equal(np.array(x, dtype=np.float32, ndmin=2).ndim, 2)
+ assert_equal(np.array(x, dtype=np.float64, ndmin=2).ndim, 2)
+
+ def test_ndmin_order(self):
+ # Issue #465 and related checks
+ assert_(np.array([1, 2], order='C', ndmin=3).flags.c_contiguous)
+ assert_(np.array([1, 2], order='F', ndmin=3).flags.f_contiguous)
+ assert_(np.array(np.ones((2, 2), order='F'), ndmin=3).flags.f_contiguous)
+ assert_(np.array(np.ones((2, 2), order='C'), ndmin=3).flags.c_contiguous)
+
+ def test_mem_axis_minimization(self):
+ # Ticket #327
+ data = np.arange(5)
+ data = np.add.outer(data, data)
+
+ def test_mem_float_imag(self):
+ # Ticket #330
+ np.float64(1.0).imag
+
+ def test_dtype_tuple(self):
+ # Ticket #334
+ assert_(np.dtype('i4') == np.dtype(('i4', ())))
+
+ def test_dtype_posttuple(self):
+ # Ticket #335
+ np.dtype([('col1', '()i4')])
+
+ def test_numeric_carray_compare(self):
+ # Ticket #341
+ assert_equal(np.array(['X'], 'c'), b'X')
+
+ def test_string_array_size(self):
+ # Ticket #342
+ assert_raises(ValueError,
+ np.array, [['X'], ['X', 'X', 'X']], '|S1')
+
+ def test_dtype_repr(self):
+ # Ticket #344
+ dt1 = np.dtype(('uint32', 2))
+ dt2 = np.dtype(('uint32', (2,)))
+ assert_equal(dt1.__repr__(), dt2.__repr__())
+
+ def test_reshape_order(self):
+ # Make sure reshape order works.
+ a = np.arange(6).reshape(2, 3, order='F')
+ assert_equal(a, [[0, 2, 4], [1, 3, 5]])
+ a = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+ b = a[:, 1]
+ assert_equal(b.reshape(2, 2, order='F'), [[2, 6], [4, 8]])
+
+ def test_reshape_zero_strides(self):
+ # Issue #380, test reshaping of zero strided arrays
+ a = np.ones(1)
+ a = np.lib.stride_tricks.as_strided(a, shape=(5,), strides=(0,))
+ assert_(a.reshape(5, 1).strides[0] == 0)
+
+ def test_reshape_zero_size(self):
+ # GitHub Issue #2700, setting shape failed for 0-sized arrays
+ a = np.ones((0, 2))
+ a.shape = (-1, 2)
+
+ # Cannot test if NPY_RELAXED_STRIDES_CHECKING changes the strides.
+ # With NPY_RELAXED_STRIDES_CHECKING the test becomes superfluous.
+ @pytest.mark.skipif(np.ones(1).strides[0] == np.iinfo(np.intp).max,
+ reason="Using relaxed stride checking")
+ def test_reshape_trailing_ones_strides(self):
+ # GitHub issue gh-2949, bad strides for trailing ones of new shape
+ a = np.zeros(12, dtype=np.int32)[::2] # not contiguous
+ strides_c = (16, 8, 8, 8)
+ strides_f = (8, 24, 48, 48)
+ assert_equal(a.reshape(3, 2, 1, 1).strides, strides_c)
+ assert_equal(a.reshape(3, 2, 1, 1, order='F').strides, strides_f)
+ assert_equal(np.array(0, dtype=np.int32).reshape(1, 1).strides, (4, 4))
+
+ def test_repeat_discont(self):
+ # Ticket #352
+ a = np.arange(12).reshape(4, 3)[:, 2]
+ assert_equal(a.repeat(3), [2, 2, 2, 5, 5, 5, 8, 8, 8, 11, 11, 11])
+
+ def test_array_index(self):
+ # Make sure optimization is not called in this case.
+ a = np.array([1, 2, 3])
+ a2 = np.array([[1, 2, 3]])
+ assert_equal(a[np.where(a == 3)], a2[np.where(a2 == 3)])
+
+ def test_object_argmax(self):
+ a = np.array([1, 2, 3], dtype=object)
+ assert_(a.argmax() == 2)
+
+ def test_recarray_fields(self):
+ # Ticket #372
+ dt0 = np.dtype([('f0', 'i4'), ('f1', 'i4')])
+ dt1 = np.dtype([('f0', 'i8'), ('f1', 'i8')])
+ for a in [np.array([(1, 2), (3, 4)], "i4,i4"),
+ np.rec.array([(1, 2), (3, 4)], "i4,i4"),
+ np.rec.array([(1, 2), (3, 4)]),
+ np.rec.fromarrays([(1, 2), (3, 4)], "i4,i4"),
+ np.rec.fromarrays([(1, 2), (3, 4)])]:
+ assert_(a.dtype in [dt0, dt1])
+
+ def test_random_shuffle(self):
+ # Ticket #374
+ a = np.arange(5).reshape((5, 1))
+ b = a.copy()
+ np.random.shuffle(b)
+ assert_equal(np.sort(b, axis=0), a)
+
+ def test_refcount_vdot(self):
+ # Changeset #3443
+ _assert_valid_refcount(np.vdot)
+
+ def test_startswith(self):
+ ca = np.char.array(['Hi', 'There'])
+ assert_equal(ca.startswith('H'), [True, False])
+
+ def test_noncommutative_reduce_accumulate(self):
+ # Ticket #413
+ tosubtract = np.arange(5)
+ todivide = np.array([2.0, 0.5, 0.25])
+ assert_equal(np.subtract.reduce(tosubtract), -10)
+ assert_equal(np.divide.reduce(todivide), 16.0)
+ assert_array_equal(np.subtract.accumulate(tosubtract),
+ np.array([0, -1, -3, -6, -10]))
+ assert_array_equal(np.divide.accumulate(todivide),
+ np.array([2., 4., 16.]))
+
+ def test_convolve_empty(self):
+ # Convolve should raise an error for empty input array.
+ assert_raises(ValueError, np.convolve, [], [1])
+ assert_raises(ValueError, np.convolve, [1], [])
+
+ def test_multidim_byteswap(self):
+ # Ticket #449
+ r = np.array([(1, (0, 1, 2))], dtype="i2,3i2")
+ assert_array_equal(r.byteswap(),
+ np.array([(256, (0, 256, 512))], r.dtype))
+
+ def test_string_NULL(self):
+ # Changeset 3557
+ assert_equal(np.array("a\x00\x0b\x0c\x00").item(),
+ 'a\x00\x0b\x0c')
+
+ def test_junk_in_string_fields_of_recarray(self):
+ # Ticket #483
+ r = np.array([[b'abc']], dtype=[('var1', '|S20')])
+ assert_(asbytes(r['var1'][0][0]) == b'abc')
+
+ def test_take_output(self):
+ # Ensure that 'take' honours output parameter.
+ x = np.arange(12).reshape((3, 4))
+ a = np.take(x, [0, 2], axis=1)
+ b = np.zeros_like(a)
+ np.take(x, [0, 2], axis=1, out=b)
+ assert_array_equal(a, b)
+
+ def test_take_object_fail(self):
+ # Issue gh-3001
+ d = 123.
+ a = np.array([d, 1], dtype=object)
+ if HAS_REFCOUNT:
+ ref_d = sys.getrefcount(d)
+ try:
+ a.take([0, 100])
+ except IndexError:
+ pass
+ if HAS_REFCOUNT:
+ assert_(ref_d == sys.getrefcount(d))
+
+ def test_array_str_64bit(self):
+ # Ticket #501
+ s = np.array([1, np.nan], dtype=np.float64)
+ with np.errstate(all='raise'):
+ np.array_str(s) # Should succeed
+
+ def test_frompyfunc_endian(self):
+ # Ticket #503
+ from math import radians
+ uradians = np.frompyfunc(radians, 1, 1)
+ big_endian = np.array([83.4, 83.5], dtype='>f8')
+ little_endian = np.array([83.4, 83.5], dtype='<f8')
+ assert_almost_equal(uradians(big_endian).astype(float),
+ uradians(little_endian).astype(float))
+
+ def test_mem_string_arr(self):
+ # Ticket #514
+ s = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+ t = []
+ np.hstack((t, s))
+
+ def test_arr_transpose(self):
+ # Ticket #516
+ x = np.random.rand(*(2,)*16)
+ x.transpose(list(range(16))) # Should succeed
+
+ def test_string_mergesort(self):
+ # Ticket #540
+ x = np.array(['a']*32)
+ assert_array_equal(x.argsort(kind='m'), np.arange(32))
+
+ def test_argmax_byteorder(self):
+ # Ticket #546
+ a = np.arange(3, dtype='>f')
+ assert_(a[a.argmax()] == a.max())
+
+ def test_rand_seed(self):
+ # Ticket #555
+ for l in np.arange(4):
+ np.random.seed(l)
+
+ def test_mem_deallocation_leak(self):
+ # Ticket #562
+ a = np.zeros(5, dtype=float)
+ b = np.array(a, dtype=float)
+ del a, b
+
+ def test_mem_on_invalid_dtype(self):
+ "Ticket #583"
+ assert_raises(ValueError, np.fromiter, [['12', ''], ['13', '']], str)
+
+ def test_dot_negative_stride(self):
+ # Ticket #588
+ x = np.array([[1, 5, 25, 125., 625]])
+ y = np.array([[20.], [160.], [640.], [1280.], [1024.]])
+ z = y[::-1].copy()
+ y2 = y[::-1]
+ assert_equal(np.dot(x, z), np.dot(x, y2))
+
+ def test_object_casting(self):
+ # This used to trigger the object-type version of
+ # the bitwise_or operation, because float64 -> object
+ # casting succeeds
+ def rs():
+ x = np.ones([484, 286])
+ y = np.zeros([484, 286])
+ x |= y
+
+ assert_raises(TypeError, rs)
+
+ def test_unicode_scalar(self):
+ # Ticket #600
+ x = np.array(["DROND", "DROND1"], dtype="U6")
+ el = x[1]
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ new = pickle.loads(pickle.dumps(el, protocol=proto))
+ assert_equal(new, el)
+
+ def test_arange_non_native_dtype(self):
+ # Ticket #616
+ for T in ('>f4', '<f4'):
+ dt = np.dtype(T)
+ assert_equal(np.arange(0, dtype=dt).dtype, dt)
+ assert_equal(np.arange(0.5, dtype=dt).dtype, dt)
+ assert_equal(np.arange(5, dtype=dt).dtype, dt)
+
+ def test_bool_flat_indexing_invalid_nr_elements(self):
+ s = np.ones(10, dtype=float)
+ x = np.array((15,), dtype=float)
+
+ def ia(x, s, v):
+ x[(s > 0)] = v
+
+ assert_raises(IndexError, ia, x, s, np.zeros(9, dtype=float))
+ assert_raises(IndexError, ia, x, s, np.zeros(11, dtype=float))
+
+ # Old special case (different code path):
+ assert_raises(ValueError, ia, x.flat, s, np.zeros(9, dtype=float))
+ assert_raises(ValueError, ia, x.flat, s, np.zeros(11, dtype=float))
+
+ def test_mem_scalar_indexing(self):
+ # Ticket #603
+ x = np.array([0], dtype=float)
+ index = np.array(0, dtype=np.int32)
+ x[index]
+
+ def test_binary_repr_0_width(self):
+ assert_equal(np.binary_repr(0, width=3), '000')
+
+ def test_fromstring(self):
+ assert_equal(np.fromstring("12:09:09", dtype=int, sep=":"),
+ [12, 9, 9])
+
+ def test_searchsorted_variable_length(self):
+ x = np.array(['a', 'aa', 'b'])
+ y = np.array(['d', 'e'])
+ assert_equal(x.searchsorted(y), [3, 3])
+
+ def test_string_argsort_with_zeros(self):
+ # Check argsort for strings containing zeros.
+ x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2")
+ assert_array_equal(x.argsort(kind='m'), np.array([1, 0]))
+ assert_array_equal(x.argsort(kind='q'), np.array([1, 0]))
+
+ def test_string_sort_with_zeros(self):
+ # Check sort for strings containing zeros.
+ x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2")
+ y = np.frombuffer(b"\x00\x01\x00\x02", dtype="|S2")
+ assert_array_equal(np.sort(x, kind="q"), y)
+
+ def test_copy_detection_zero_dim(self):
+ # Ticket #658
+ np.indices((0, 3, 4)).T.reshape(-1, 3)
+
+ def test_flat_byteorder(self):
+ # Ticket #657
+ x = np.arange(10)
+ assert_array_equal(x.astype('>i4'), x.astype('<i4').flat[:])
+ assert_array_equal(x.astype('>i4').flat[:], x.astype('<i4'))
+
+ def test_sign_bit(self):
+ x = np.array([0, -0.0, 0])
+ assert_equal(str(np.abs(x)), '[0. 0. 0.]')
+
+ def test_flat_index_byteswap(self):
+ for dt in (np.dtype('<i4'), np.dtype('>i4')):
+ x = np.array([-1, 0, 1], dtype=dt)
+ assert_equal(x.flat[0].dtype, x[0].dtype)
+
+ def test_copy_detection_corner_case(self):
+ # Ticket #658
+ np.indices((0, 3, 4)).T.reshape(-1, 3)
+
+ # Cannot test if NPY_RELAXED_STRIDES_CHECKING changes the strides.
+ # With NPY_RELAXED_STRIDES_CHECKING the test becomes superfluous,
+ # 0-sized reshape itself is tested elsewhere.
+ @pytest.mark.skipif(np.ones(1).strides[0] == np.iinfo(np.intp).max,
+ reason="Using relaxed stride checking")
+ def test_copy_detection_corner_case2(self):
+ # Ticket #771: strides are not set correctly when reshaping 0-sized
+ # arrays
+ b = np.indices((0, 3, 4)).T.reshape(-1, 3)
+ assert_equal(b.strides, (3 * b.itemsize, b.itemsize))
+
+ def test_object_array_refcounting(self):
+ # Ticket #633
+ if not hasattr(sys, 'getrefcount'):
+ return
+
+ # NB. this is probably CPython-specific
+
+ cnt = sys.getrefcount
+
+ a = object()
+ b = object()
+ c = object()
+
+ cnt0_a = cnt(a)
+ cnt0_b = cnt(b)
+ cnt0_c = cnt(c)
+
+ # -- 0d -> 1-d broadcast slice assignment
+
+ arr = np.zeros(5, dtype=np.object_)
+
+ arr[:] = a
+ assert_equal(cnt(a), cnt0_a + 5)
+
+ arr[:] = b
+ assert_equal(cnt(a), cnt0_a)
+ assert_equal(cnt(b), cnt0_b + 5)
+
+ arr[:2] = c
+ assert_equal(cnt(b), cnt0_b + 3)
+ assert_equal(cnt(c), cnt0_c + 2)
+
+ del arr
+
+ # -- 1-d -> 2-d broadcast slice assignment
+
+ arr = np.zeros((5, 2), dtype=np.object_)
+ arr0 = np.zeros(2, dtype=np.object_)
+
+ arr0[0] = a
+ assert_(cnt(a) == cnt0_a + 1)
+ arr0[1] = b
+ assert_(cnt(b) == cnt0_b + 1)
+
+ arr[:, :] = arr0
+ assert_(cnt(a) == cnt0_a + 6)
+ assert_(cnt(b) == cnt0_b + 6)
+
+ arr[:, 0] = None
+ assert_(cnt(a) == cnt0_a + 1)
+
+ del arr, arr0
+
+ # -- 2-d copying + flattening
+
+ arr = np.zeros((5, 2), dtype=np.object_)
+
+ arr[:, 0] = a
+ arr[:, 1] = b
+ assert_(cnt(a) == cnt0_a + 5)
+ assert_(cnt(b) == cnt0_b + 5)
+
+ arr2 = arr.copy()
+ assert_(cnt(a) == cnt0_a + 10)
+ assert_(cnt(b) == cnt0_b + 10)
+
+ arr2 = arr[:, 0].copy()
+ assert_(cnt(a) == cnt0_a + 10)
+ assert_(cnt(b) == cnt0_b + 5)
+
+ arr2 = arr.flatten()
+ assert_(cnt(a) == cnt0_a + 10)
+ assert_(cnt(b) == cnt0_b + 10)
+
+ del arr, arr2
+
+ # -- concatenate, repeat, take, choose
+
+ arr1 = np.zeros((5, 1), dtype=np.object_)
+ arr2 = np.zeros((5, 1), dtype=np.object_)
+
+ arr1[...] = a
+ arr2[...] = b
+ assert_(cnt(a) == cnt0_a + 5)
+ assert_(cnt(b) == cnt0_b + 5)
+
+ tmp = np.concatenate((arr1, arr2))
+ assert_(cnt(a) == cnt0_a + 5 + 5)
+ assert_(cnt(b) == cnt0_b + 5 + 5)
+
+ tmp = arr1.repeat(3, axis=0)
+ assert_(cnt(a) == cnt0_a + 5 + 3*5)
+
+ tmp = arr1.take([1, 2, 3], axis=0)
+ assert_(cnt(a) == cnt0_a + 5 + 3)
+
+ x = np.array([[0], [1], [0], [1], [1]], int)
+ tmp = x.choose(arr1, arr2)
+ assert_(cnt(a) == cnt0_a + 5 + 2)
+ assert_(cnt(b) == cnt0_b + 5 + 3)
+
+ del tmp # Avoid pyflakes unused variable warning
+
+ def test_mem_custom_float_to_array(self):
+ # Ticket 702
+ class MyFloat(object):
+ def __float__(self):
+ return 1.0
+
+ tmp = np.atleast_1d([MyFloat()])
+ tmp.astype(float) # Should succeed
+
+ def test_object_array_refcount_self_assign(self):
+ # Ticket #711
+ class VictimObject(object):
+ deleted = False
+
+ def __del__(self):
+ self.deleted = True
+
+ d = VictimObject()
+ arr = np.zeros(5, dtype=np.object_)
+ arr[:] = d
+ del d
+ arr[:] = arr # refcount of 'd' might hit zero here
+ assert_(not arr[0].deleted)
+ arr[:] = arr # trying to induce a segfault by doing it again...
+ assert_(not arr[0].deleted)
+
+ def test_mem_fromiter_invalid_dtype_string(self):
+ x = [1, 2, 3]
+ assert_raises(ValueError,
+ np.fromiter, [xi for xi in x], dtype='S')
+
+ def test_reduce_big_object_array(self):
+ # Ticket #713
+ oldsize = np.setbufsize(10*16)
+ a = np.array([None]*161, object)
+ assert_(not np.any(a))
+ np.setbufsize(oldsize)
+
+ def test_mem_0d_array_index(self):
+ # Ticket #714
+ np.zeros(10)[np.array(0)]
+
+ def test_nonnative_endian_fill(self):
+ # Non-native endian arrays were incorrectly filled with scalars
+ # before r5034.
+ if sys.byteorder == 'little':
+ dtype = np.dtype('>i4')
+ else:
+ dtype = np.dtype('<i4')
+ x = np.empty([1], dtype=dtype)
+ x.fill(1)
+ assert_equal(x, np.array([1], dtype=dtype))
+
+ def test_dot_alignment_sse2(self):
+ # Test for ticket #551, changeset r5140
+ x = np.zeros((30, 40))
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ y = pickle.loads(pickle.dumps(x, protocol=proto))
+ # y is now typically not aligned on an 8-byte boundary
+ z = np.ones((1, y.shape[0]))
+ # This shouldn't cause a segmentation fault:
+ np.dot(z, y)
+
+ def test_astype_copy(self):
+ # Ticket #788, changeset r5155
+ # The test data file was generated by scipy.io.savemat.
+ # The dtype is float64, but the isbuiltin attribute is 0.
+ data_dir = path.join(path.dirname(__file__), 'data')
+ filename = path.join(data_dir, "astype_copy.pkl")
+ if sys.version_info[0] >= 3:
+ f = open(filename, 'rb')
+ xp = pickle.load(f, encoding='latin1')
+ f.close()
+ else:
+ f = open(filename)
+ xp = pickle.load(f)
+ f.close()
+ xpd = xp.astype(np.float64)
+ assert_((xp.__array_interface__['data'][0] !=
+ xpd.__array_interface__['data'][0]))
+
+ def test_compress_small_type(self):
+ # Ticket #789, changeset 5217.
+ # compress with an out argument segfaulted if it could not cast safely
+ import numpy as np
+ a = np.array([[1, 2], [3, 4]])
+ b = np.zeros((2, 1), dtype=np.single)
+ try:
+ a.compress([True, False], axis=1, out=b)
+ raise AssertionError("compress with an out which cannot be "
+ "safely casted should not return "
+ "successfully")
+ except TypeError:
+ pass
+
+ def test_attributes(self):
+ # Ticket #791
+ class TestArray(np.ndarray):
+ def __new__(cls, data, info):
+ result = np.array(data)
+ result = result.view(cls)
+ result.info = info
+ return result
+
+ def __array_finalize__(self, obj):
+ self.info = getattr(obj, 'info', '')
+
+ dat = TestArray([[1, 2, 3, 4], [5, 6, 7, 8]], 'jubba')
+ assert_(dat.info == 'jubba')
+ dat.resize((4, 2))
+ assert_(dat.info == 'jubba')
+ dat.sort()
+ assert_(dat.info == 'jubba')
+ dat.fill(2)
+ assert_(dat.info == 'jubba')
+ dat.put([2, 3, 4], [6, 3, 4])
+ assert_(dat.info == 'jubba')
+ dat.setfield(4, np.int32, 0)
+ assert_(dat.info == 'jubba')
+ dat.setflags()
+ assert_(dat.info == 'jubba')
+ assert_(dat.all(1).info == 'jubba')
+ assert_(dat.any(1).info == 'jubba')
+ assert_(dat.argmax(1).info == 'jubba')
+ assert_(dat.argmin(1).info == 'jubba')
+ assert_(dat.argsort(1).info == 'jubba')
+ assert_(dat.astype(TestArray).info == 'jubba')
+ assert_(dat.byteswap().info == 'jubba')
+ assert_(dat.clip(2, 7).info == 'jubba')
+ assert_(dat.compress([0, 1, 1]).info == 'jubba')
+ assert_(dat.conj().info == 'jubba')
+ assert_(dat.conjugate().info == 'jubba')
+ assert_(dat.copy().info == 'jubba')
+ dat2 = TestArray([2, 3, 1, 0], 'jubba')
+ choices = [[0, 1, 2, 3], [10, 11, 12, 13],
+ [20, 21, 22, 23], [30, 31, 32, 33]]
+ assert_(dat2.choose(choices).info == 'jubba')
+ assert_(dat.cumprod(1).info == 'jubba')
+ assert_(dat.cumsum(1).info == 'jubba')
+ assert_(dat.diagonal().info == 'jubba')
+ assert_(dat.flatten().info == 'jubba')
+ assert_(dat.getfield(np.int32, 0).info == 'jubba')
+ assert_(dat.imag.info == 'jubba')
+ assert_(dat.max(1).info == 'jubba')
+ assert_(dat.mean(1).info == 'jubba')
+ assert_(dat.min(1).info == 'jubba')
+ assert_(dat.newbyteorder().info == 'jubba')
+ assert_(dat.prod(1).info == 'jubba')
+ assert_(dat.ptp(1).info == 'jubba')
+ assert_(dat.ravel().info == 'jubba')
+ assert_(dat.real.info == 'jubba')
+ assert_(dat.repeat(2).info == 'jubba')
+ assert_(dat.reshape((2, 4)).info == 'jubba')
+ assert_(dat.round().info == 'jubba')
+ assert_(dat.squeeze().info == 'jubba')
+ assert_(dat.std(1).info == 'jubba')
+ assert_(dat.sum(1).info == 'jubba')
+ assert_(dat.swapaxes(0, 1).info == 'jubba')
+ assert_(dat.take([2, 3, 5]).info == 'jubba')
+ assert_(dat.transpose().info == 'jubba')
+ assert_(dat.T.info == 'jubba')
+ assert_(dat.var(1).info == 'jubba')
+ assert_(dat.view(TestArray).info == 'jubba')
+ # These methods do not preserve subclasses
+ assert_(type(dat.nonzero()[0]) is np.ndarray)
+ assert_(type(dat.nonzero()[1]) is np.ndarray)
+
+ def test_recarray_tolist(self):
+ # Ticket #793, changeset r5215
+ # Comparisons fail for NaN, so we can't use random memory
+ # for the test.
+ buf = np.zeros(40, dtype=np.int8)
+ a = np.recarray(2, formats="i4,f8,f8", names="id,x,y", buf=buf)
+ b = a.tolist()
+ assert_(a[0].tolist() == b[0])
+ assert_(a[1].tolist() == b[1])
+
+ def test_nonscalar_item_method(self):
+ # Make sure that .item() fails gracefully when it should
+ a = np.arange(5)
+ assert_raises(ValueError, a.item)
+
+ def test_char_array_creation(self):
+ a = np.array('123', dtype='c')
+ b = np.array([b'1', b'2', b'3'])
+ assert_equal(a, b)
+
+ def test_unaligned_unicode_access(self):
+ # Ticket #825
+ for i in range(1, 9):
+ msg = 'unicode offset: %d chars' % i
+ t = np.dtype([('a', 'S%d' % i), ('b', 'U2')])
+ x = np.array([(b'a', u'b')], dtype=t)
+ if sys.version_info[0] >= 3:
+ assert_equal(str(x), "[(b'a', 'b')]", err_msg=msg)
+ else:
+ assert_equal(str(x), "[('a', u'b')]", err_msg=msg)
+
+ def test_sign_for_complex_nan(self):
+ # Ticket 794.
+ with np.errstate(invalid='ignore'):
+ C = np.array([-np.inf, -2+1j, 0, 2-1j, np.inf, np.nan])
+ have = np.sign(C)
+ want = np.array([-1+0j, -1+0j, 0+0j, 1+0j, 1+0j, np.nan])
+ assert_equal(have, want)
+
+ def test_for_equal_names(self):
+ # Ticket #674
+ dt = np.dtype([('foo', float), ('bar', float)])
+ a = np.zeros(10, dt)
+ b = list(a.dtype.names)
+ b[0] = "notfoo"
+ a.dtype.names = b
+ assert_(a.dtype.names[0] == "notfoo")
+ assert_(a.dtype.names[1] == "bar")
+
+ def test_for_object_scalar_creation(self):
+ # Ticket #816
+ a = np.object_()
+ b = np.object_(3)
+ b2 = np.object_(3.0)
+ c = np.object_([4, 5])
+ d = np.object_([None, {}, []])
+ assert_(a is None)
+ assert_(type(b) is int)
+ assert_(type(b2) is float)
+ assert_(type(c) is np.ndarray)
+ assert_(c.dtype == object)
+ assert_(d.dtype == object)
+
+ def test_array_resize_method_system_error(self):
+ # Ticket #840 - order should be an invalid keyword.
+ x = np.array([[0, 1], [2, 3]])
+ assert_raises(TypeError, x.resize, (2, 2), order='C')
+
+ def test_for_zero_length_in_choose(self):
+ "Ticket #882"
+ a = np.array(1)
+ assert_raises(ValueError, lambda x: x.choose([]), a)
+
+ def test_array_ndmin_overflow(self):
+ "Ticket #947."
+ assert_raises(ValueError, lambda: np.array([1], ndmin=33))
+
+ def test_void_scalar_with_titles(self):
+ # No ticket
+ data = [('john', 4), ('mary', 5)]
+ dtype1 = [(('source:yy', 'name'), 'O'), (('source:xx', 'id'), int)]
+ arr = np.array(data, dtype=dtype1)
+ assert_(arr[0][0] == 'john')
+ assert_(arr[0][1] == 4)
+
+ def test_void_scalar_constructor(self):
+ # Issue #1550
+
+ # Create test string data, construct a void scalar from the data and assert
+ # that the void scalar contains the original data.
+ test_string = np.array("test")
+ test_string_void_scalar = np.core.multiarray.scalar(
+ np.dtype(("V", test_string.dtype.itemsize)), test_string.tobytes())
+
+ assert_(test_string_void_scalar.view(test_string.dtype) == test_string)
+
+ # Create a record scalar, construct it from the data and assert that
+ # the reconstructed scalar is correct.
+ test_record = np.ones((), "i,i")
+ test_record_void_scalar = np.core.multiarray.scalar(
+ test_record.dtype, test_record.tobytes())
+
+ assert_(test_record_void_scalar == test_record)
+
+ # Test pickle and unpickle of void and record scalars
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ assert_(pickle.loads(
+ pickle.dumps(test_string, protocol=proto)) == test_string)
+ assert_(pickle.loads(
+ pickle.dumps(test_record, protocol=proto)) == test_record)
+
+ def test_blasdot_uninitialized_memory(self):
+ # Ticket #950
+ for m in [0, 1, 2]:
+ for n in [0, 1, 2]:
+ for k in range(3):
+ # Try to ensure that x->data contains non-zero floats
+ x = np.array([123456789e199], dtype=np.float64)
+ if IS_PYPY:
+ x.resize((m, 0), refcheck=False)
+ else:
+ x.resize((m, 0))
+ y = np.array([123456789e199], dtype=np.float64)
+ if IS_PYPY:
+ y.resize((0, n), refcheck=False)
+ else:
+ y.resize((0, n))
+
+ # `dot` should just return zero (m, n) matrix
+ z = np.dot(x, y)
+ assert_(np.all(z == 0))
+ assert_(z.shape == (m, n))
+
+ def test_zeros(self):
+ # Regression test for #1061.
+ # Set a size which cannot fit into a 64-bit signed integer
+ sz = 2 ** 64
+ with assert_raises_regex(ValueError,
+ 'Maximum allowed dimension exceeded'):
+ np.empty(sz)
+
+ def test_huge_arange(self):
+ # Regression test for #1062.
+ # Set a size which cannot fit into a 64-bit signed integer
+ sz = 2 ** 64
+ with assert_raises_regex(ValueError,
+ 'Maximum allowed size exceeded'):
+ np.arange(sz)
+ assert_(np.size == sz)
+
+ def test_fromiter_bytes(self):
+ # Ticket #1058
+ a = np.fromiter(list(range(10)), dtype='b')
+ b = np.fromiter(list(range(10)), dtype='B')
+ assert_(np.alltrue(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+ assert_(np.alltrue(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+
+ def test_array_from_sequence_scalar_array(self):
+ # Ticket #1078: segfaults when creating an array with a sequence of
+ # 0d arrays.
+ a = np.array((np.ones(2), np.array(2)))
+ assert_equal(a.shape, (2,))
+ assert_equal(a.dtype, np.dtype(object))
+ assert_equal(a[0], np.ones(2))
+ assert_equal(a[1], np.array(2))
+
+ a = np.array(((1,), np.array(1)))
+ assert_equal(a.shape, (2,))
+ assert_equal(a.dtype, np.dtype(object))
+ assert_equal(a[0], (1,))
+ assert_equal(a[1], np.array(1))
+
+ def test_array_from_sequence_scalar_array2(self):
+ # Ticket #1081: weird array with strange input...
+ t = np.array([np.array([]), np.array(0, object)])
+ assert_equal(t.shape, (2,))
+ assert_equal(t.dtype, np.dtype(object))
+
+ def test_array_too_big(self):
+ # Ticket #1080.
+ assert_raises(ValueError, np.zeros, [975]*7, np.int8)
+ assert_raises(ValueError, np.zeros, [26244]*5, np.int8)
+
+ def test_dtype_keyerrors_(self):
+ # Ticket #1106.
+ dt = np.dtype([('f1', np.uint)])
+ assert_raises(KeyError, dt.__getitem__, "f2")
+ assert_raises(IndexError, dt.__getitem__, 1)
+ assert_raises(TypeError, dt.__getitem__, 0.0)
+
+ def test_lexsort_buffer_length(self):
+ # Ticket #1217, don't segfault.
+ a = np.ones(100, dtype=np.int8)
+ b = np.ones(100, dtype=np.int32)
+ i = np.lexsort((a[::-1], b))
+ assert_equal(i, np.arange(100, dtype=int))
+
+ def test_object_array_to_fixed_string(self):
+ # Ticket #1235.
+ a = np.array(['abcdefgh', 'ijklmnop'], dtype=np.object_)
+ b = np.array(a, dtype=(np.str_, 8))
+ assert_equal(a, b)
+ c = np.array(a, dtype=(np.str_, 5))
+ assert_equal(c, np.array(['abcde', 'ijklm']))
+ d = np.array(a, dtype=(np.str_, 12))
+ assert_equal(a, d)
+ e = np.empty((2, ), dtype=(np.str_, 8))
+ e[:] = a[:]
+ assert_equal(a, e)
+
+ def test_unicode_to_string_cast(self):
+ # Ticket #1240.
+ a = np.array([[u'abc', u'\u03a3'],
+ [u'asdf', u'erw']],
+ dtype='U')
+ assert_raises(UnicodeEncodeError, np.array, a, 'S4')
+
+ def test_mixed_string_unicode_array_creation(self):
+ a = np.array(['1234', u'123'])
+ assert_(a.itemsize == 16)
+ a = np.array([u'123', '1234'])
+ assert_(a.itemsize == 16)
+ a = np.array(['1234', u'123', '12345'])
+ assert_(a.itemsize == 20)
+ a = np.array([u'123', '1234', u'12345'])
+ assert_(a.itemsize == 20)
+ a = np.array([u'123', '1234', u'1234'])
+ assert_(a.itemsize == 16)
+
+ def test_misaligned_objects_segfault(self):
+ # Ticket #1198 and #1267
+ a1 = np.zeros((10,), dtype='O,c')
+ a2 = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], 'S10')
+ a1['f0'] = a2
+ repr(a1)
+ np.argmax(a1['f0'])
+ a1['f0'][1] = "FOO"
+ a1['f0'] = "FOO"
+ np.array(a1['f0'], dtype='S')
+ np.nonzero(a1['f0'])
+ a1.sort()
+ copy.deepcopy(a1)
+
+ def test_misaligned_scalars_segfault(self):
+ # Ticket #1267
+ s1 = np.array(('a', 'Foo'), dtype='c,O')
+ s2 = np.array(('b', 'Bar'), dtype='c,O')
+ s1['f1'] = s2['f1']
+ s1['f1'] = 'Baz'
+
+ def test_misaligned_dot_product_objects(self):
+ # Ticket #1267
+ # This didn't require a fix, but it's worth testing anyway, because
+ # it may fail if .dot stops enforcing that the arrays be BEHAVED
+ a = np.array([[(1, 'a'), (0, 'a')], [(0, 'a'), (1, 'a')]], dtype='O,c')
+ b = np.array([[(4, 'a'), (1, 'a')], [(2, 'a'), (2, 'a')]], dtype='O,c')
+ np.dot(a['f0'], b['f0'])
+
+ def test_byteswap_complex_scalar(self):
+ # Ticket #1259 and gh-441
+ for dtype in [np.dtype('<'+t) for t in np.typecodes['Complex']]:
+ z = np.array([2.2-1.1j], dtype)
+ x = z[0] # always native-endian
+ y = x.byteswap()
+ if x.dtype.byteorder == z.dtype.byteorder:
+ # little-endian machine
+ assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype.newbyteorder()))
+ else:
+ # big-endian machine
+ assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype))
+ # double check real and imaginary parts:
+ assert_equal(x.real, y.real.byteswap())
+ assert_equal(x.imag, y.imag.byteswap())
+
+ def test_structured_arrays_with_objects1(self):
+ # Ticket #1299
+ stra = 'aaaa'
+ strb = 'bbbb'
+ x = np.array([[(0, stra), (1, strb)]], 'i8,O')
+ x[x.nonzero()] = x.ravel()[:1]
+ assert_(x[0, 1] == x[0, 0])
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_structured_arrays_with_objects2(self):
+ # Ticket #1299 second test
+ stra = 'aaaa'
+ strb = 'bbbb'
+ numb = sys.getrefcount(strb)
+ numa = sys.getrefcount(stra)
+ x = np.array([[(0, stra), (1, strb)]], 'i8,O')
+ x[x.nonzero()] = x.ravel()[:1]
+ assert_(sys.getrefcount(strb) == numb)
+ assert_(sys.getrefcount(stra) == numa + 2)
+
+ def test_duplicate_title_and_name(self):
+ # Ticket #1254
+ dtspec = [(('a', 'a'), 'i'), ('b', 'i')]
+ assert_raises(ValueError, np.dtype, dtspec)
+
+ def test_signed_integer_division_overflow(self):
+ # Ticket #1317.
+ def test_type(t):
+ min = np.array([np.iinfo(t).min])
+ min //= -1
+
+ with np.errstate(divide="ignore"):
+ for t in (np.int8, np.int16, np.int32, np.int64, int, np.long):
+ test_type(t)
+
+ def test_buffer_hashlib(self):
+ try:
+ from hashlib import md5
+ except ImportError:
+ from md5 import new as md5
+
+ x = np.array([1, 2, 3], dtype=np.dtype('<i4'))
+ assert_equal(md5(x).hexdigest(), '2a1dd1e1e59d0a384c26951e316cd7e6')
+
+ def test_0d_string_scalar(self):
+ # Bug #1436; the following should succeed
+ np.asarray('x', '>c')
+
+ def test_log1p_compiler_shenanigans(self):
+ # Check if log1p is behaving on 32 bit intel systems.
+ assert_(np.isfinite(np.log1p(np.exp2(-53))))
+
+ def test_fromiter_comparison(self):
+ a = np.fromiter(list(range(10)), dtype='b')
+ b = np.fromiter(list(range(10)), dtype='B')
+ assert_(np.alltrue(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+ assert_(np.alltrue(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+
+ def test_fromstring_crash(self):
+ # Ticket #1345: the following should not cause a crash
+ np.fromstring(b'aa, aa, 1.0', sep=',')
+
+ def test_ticket_1539(self):
+ dtypes = [x for x in np.typeDict.values()
+ if (issubclass(x, np.number)
+ and not issubclass(x, np.timedelta64))]
+ a = np.array([], np.bool_) # not x[0] because it is unordered
+ failures = []
+
+ for x in dtypes:
+ b = a.astype(x)
+ for y in dtypes:
+ c = a.astype(y)
+ try:
+ np.dot(b, c)
+ except TypeError:
+ failures.append((x, y))
+ if failures:
+ raise AssertionError("Failures: %r" % failures)
+
+ def test_ticket_1538(self):
+ x = np.finfo(np.float32)
+ for name in 'eps epsneg max min resolution tiny'.split():
+ assert_equal(type(getattr(x, name)), np.float32,
+ err_msg=name)
+
+ def test_ticket_1434(self):
+ # Check that the out= argument in var and std has an effect
+ data = np.array(((1, 2, 3), (4, 5, 6), (7, 8, 9)))
+ out = np.zeros((3,))
+
+ ret = data.var(axis=1, out=out)
+ assert_(ret is out)
+ assert_array_equal(ret, data.var(axis=1))
+
+ ret = data.std(axis=1, out=out)
+ assert_(ret is out)
+ assert_array_equal(ret, data.std(axis=1))
+
+ def test_complex_nan_maximum(self):
+ cnan = complex(0, np.nan)
+ assert_equal(np.maximum(1, cnan), cnan)
+
+ def test_subclass_int_tuple_assignment(self):
+ # ticket #1563
+ class Subclass(np.ndarray):
+ def __new__(cls, i):
+ return np.ones((i,)).view(cls)
+
+ x = Subclass(5)
+ x[(0,)] = 2 # shouldn't raise an exception
+ assert_equal(x[0], 2)
+
+ def test_ufunc_no_unnecessary_views(self):
+ # ticket #1548
+ class Subclass(np.ndarray):
+ pass
+ x = np.array([1, 2, 3]).view(Subclass)
+ y = np.add(x, x, x)
+ assert_equal(id(x), id(y))
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_take_refcount(self):
+ # ticket #939
+ a = np.arange(16, dtype=float)
+ a.shape = (4, 4)
+ lut = np.ones((5 + 3, 4), float)
+ rgba = np.empty(shape=a.shape + (4,), dtype=lut.dtype)
+ c1 = sys.getrefcount(rgba)
+ try:
+ lut.take(a, axis=0, mode='clip', out=rgba)
+ except TypeError:
+ pass
+ c2 = sys.getrefcount(rgba)
+ assert_equal(c1, c2)
+
+ def test_fromfile_tofile_seeks(self):
+ # On Python 3, tofile/fromfile used to get the Python file
+ # handle out of sync (#1610)
+ f0 = tempfile.NamedTemporaryFile()
+ f = f0.file
+ f.write(np.arange(255, dtype='u1').tobytes())
+
+ f.seek(20)
+ ret = np.fromfile(f, count=4, dtype='u1')
+ assert_equal(ret, np.array([20, 21, 22, 23], dtype='u1'))
+ assert_equal(f.tell(), 24)
+
+ f.seek(40)
+ np.array([1, 2, 3], dtype='u1').tofile(f)
+ assert_equal(f.tell(), 43)
+
+ f.seek(40)
+ data = f.read(3)
+ assert_equal(data, b"\x01\x02\x03")
+
+ f.seek(80)
+ f.read(4)
+ data = np.fromfile(f, dtype='u1', count=4)
+ assert_equal(data, np.array([84, 85, 86, 87], dtype='u1'))
+
+ f.close()
+
+ def test_complex_scalar_warning(self):
+ for tp in [np.csingle, np.cdouble, np.clongdouble]:
+ x = tp(1+2j)
+ assert_warns(np.ComplexWarning, float, x)
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning)
+ assert_equal(float(x), float(x.real))
+
+ def test_complex_scalar_complex_cast(self):
+ for tp in [np.csingle, np.cdouble, np.clongdouble]:
+ x = tp(1+2j)
+ assert_equal(complex(x), 1+2j)
+
+ def test_complex_boolean_cast(self):
+ # Ticket #2218
+ for tp in [np.csingle, np.cdouble, np.clongdouble]:
+ x = np.array([0, 0+0.5j, 0.5+0j], dtype=tp)
+ assert_equal(x.astype(bool), np.array([0, 1, 1], dtype=bool))
+ assert_(np.any(x))
+ assert_(np.all(x[1:]))
+
+ def test_uint_int_conversion(self):
+ x = 2**64 - 1
+ assert_equal(int(np.uint64(x)), x)
+
+ def test_duplicate_field_names_assign(self):
+ ra = np.fromiter(((i*3, i*2) for i in range(10)), dtype='i8,f8')
+ ra.dtype.names = ('f1', 'f2')
+ repr(ra) # should not cause a segmentation fault
+ assert_raises(ValueError, setattr, ra.dtype, 'names', ('f1', 'f1'))
+
+ def test_eq_string_and_object_array(self):
+ # From e-mail thread "__eq__ with str and object" (Keith Goodman)
+ a1 = np.array(['a', 'b'], dtype=object)
+ a2 = np.array(['a', 'c'])
+ assert_array_equal(a1 == a2, [True, False])
+ assert_array_equal(a2 == a1, [True, False])
+
+ def test_nonzero_byteswap(self):
+ a = np.array([0x80000000, 0x00000080, 0], dtype=np.uint32)
+ a.dtype = np.float32
+ assert_equal(a.nonzero()[0], [1])
+ a = a.byteswap().newbyteorder()
+ assert_equal(a.nonzero()[0], [1]) # [0] if nonzero() ignores swap
+
+ def test_find_common_type_boolean(self):
+ # Ticket #1695
+ assert_(np.find_common_type([], ['?', '?']) == '?')
+
+ def test_empty_mul(self):
+ a = np.array([1.])
+ a[1:1] *= 2
+ assert_equal(a, [1.])
+
+ def test_array_side_effect(self):
+ # The second use of itemsize was throwing an exception because in
+ # ctors.c, discover_itemsize was calling PyObject_Length without
+ # checking the return code. This failed to get the length of the
+ # number 2, and the exception hung around until something checked
+ # PyErr_Occurred() and returned an error.
+ assert_equal(np.dtype('S10').itemsize, 10)
+ np.array([['abc', 2], ['long ', '0123456789']], dtype=np.string_)
+ assert_equal(np.dtype('S10').itemsize, 10)
+
+ def test_any_float(self):
+ # all and any for floats
+ a = np.array([0.1, 0.9])
+ assert_(np.any(a))
+ assert_(np.all(a))
+
+ def test_large_float_sum(self):
+ a = np.arange(10000, dtype='f')
+ assert_equal(a.sum(dtype='d'), a.astype('d').sum())
+
+ def test_ufunc_casting_out(self):
+ a = np.array(1.0, dtype=np.float32)
+ b = np.array(1.0, dtype=np.float64)
+ c = np.array(1.0, dtype=np.float32)
+ np.add(a, b, out=c)
+ assert_equal(c, 2.0)
+
+ def test_array_scalar_contiguous(self):
+ # Array scalars are both C and Fortran contiguous
+ assert_(np.array(1.0).flags.c_contiguous)
+ assert_(np.array(1.0).flags.f_contiguous)
+ assert_(np.array(np.float32(1.0)).flags.c_contiguous)
+ assert_(np.array(np.float32(1.0)).flags.f_contiguous)
+
+ def test_squeeze_contiguous(self):
+ # Similar to GitHub issue #387
+ a = np.zeros((1, 2)).squeeze()
+ b = np.zeros((2, 2, 2), order='F')[:, :, ::2].squeeze()
+ assert_(a.flags.c_contiguous)
+ assert_(a.flags.f_contiguous)
+ assert_(b.flags.f_contiguous)
+
+ def test_squeeze_axis_handling(self):
+ # Issue #10779
+ # Ensure proper handling of objects
+ # that don't support axis specification
+ # when squeezing
+
+ class OldSqueeze(np.ndarray):
+
+ def __new__(cls,
+ input_array):
+ obj = np.asarray(input_array).view(cls)
+ return obj
+
+ # it is perfectly reasonable that prior
+ # to numpy version 1.7.0 a subclass of ndarray
+ # might have been created that did not expect
+ # squeeze to have an axis argument
+ # NOTE: this example is somewhat artificial;
+ # it is designed to simulate an old API
+ # expectation to guard against regression
+ def squeeze(self):
+ return super(OldSqueeze, self).squeeze()
+
+ oldsqueeze = OldSqueeze(np.array([[1],[2],[3]]))
+
+ # if no axis argument is specified the old API
+ # expectation should give the correct result
+ assert_equal(np.squeeze(oldsqueeze),
+ np.array([1,2,3]))
+
+ # likewise, axis=None should work perfectly well
+ # with the old API expectation
+ assert_equal(np.squeeze(oldsqueeze, axis=None),
+ np.array([1,2,3]))
+
+ # however, specification of any particular axis
+ # should raise a TypeError in the context of the
+ # old API specification, even when using a valid
+ # axis specification like 1 for this array
+ with assert_raises(TypeError):
+ # this would silently succeed for array
+ # subclasses / objects that did not support
+ # squeeze axis argument handling before fixing
+ # Issue #10779
+ np.squeeze(oldsqueeze, axis=1)
+
+ # check for the same behavior when using an invalid
+ # axis specification -- in this case axis=0 does not
+ # have size 1, but the priority should be to raise
+ # a TypeError for the axis argument and NOT a
+ # ValueError for squeezing a non-empty dimension
+ with assert_raises(TypeError):
+ np.squeeze(oldsqueeze, axis=0)
+
+ # the new API knows how to handle the axis
+ # argument and will return a ValueError if
+ # attempting to squeeze an axis that is not
+ # of length 1
+ with assert_raises(ValueError):
+ np.squeeze(np.array([[1],[2],[3]]), axis=0)
+
+ def test_reduce_contiguous(self):
+ # GitHub issue #387
+ a = np.add.reduce(np.zeros((2, 1, 2)), (0, 1))
+ b = np.add.reduce(np.zeros((2, 1, 2)), 1)
+ assert_(a.flags.c_contiguous)
+ assert_(a.flags.f_contiguous)
+ assert_(b.flags.c_contiguous)
+
+ def test_object_array_self_reference(self):
+ # Object arrays with references to themselves can cause problems
+ a = np.array(0, dtype=object)
+ a[()] = a
+ assert_raises(RecursionError, int, a)
+ assert_raises(RecursionError, long, a)
+ assert_raises(RecursionError, float, a)
+ if sys.version_info.major == 2:
+ # in python 3, this falls back on operator.index, which fails
+ # on dtype=object
+ assert_raises(RecursionError, oct, a)
+ assert_raises(RecursionError, hex, a)
+ a[()] = None
+
+ def test_object_array_circular_reference(self):
+ # Test the same for a circular reference.
+ a = np.array(0, dtype=object)
+ b = np.array(0, dtype=object)
+ a[()] = b
+ b[()] = a
+ assert_raises(RecursionError, int, a)
+ # NumPy has no tp_traverse currently, so circular references
+ # cannot be detected. So resolve it:
+ a[()] = None
+
+ # This was causing `a` to become like the self-referencing case above
+ a = np.array(0, dtype=object)
+ a[...] += 1
+ assert_equal(a, 1)
+
+ def test_object_array_nested(self):
+ # but is fine with a reference to a different array
+ a = np.array(0, dtype=object)
+ b = np.array(0, dtype=object)
+ a[()] = b
+ assert_equal(int(a), int(0))
+ assert_equal(long(a), long(0))
+ assert_equal(float(a), float(0))
+ if sys.version_info.major == 2:
+ # in python 3, this falls back on operator.index, which fails
+ # on dtype=object
+ assert_equal(oct(a), oct(0))
+ assert_equal(hex(a), hex(0))
+
+ def test_object_array_self_copy(self):
+ # An object array being copied into itself DECREF'ed objects before
+ # INCREF'ing them, causing segmentation faults (gh-3787)
+ a = np.array(object(), dtype=object)
+ np.copyto(a, a)
+ if HAS_REFCOUNT:
+ assert_(sys.getrefcount(a[()]) == 2)
+ a[()].__class__ # will segfault if object was deleted
+
+ def test_zerosize_accumulate(self):
+ "Ticket #1733"
+ x = np.array([[42, 0]], dtype=np.uint32)
+ assert_equal(np.add.accumulate(x[:-1, 0]), [])
+
+ def test_objectarray_setfield(self):
+ # Setfield should not overwrite Object fields with non-Object data
+ x = np.array([1, 2, 3], dtype=object)
+ assert_raises(TypeError, x.setfield, 4, np.int32, 0)
+
+ def test_setting_rank0_string(self):
+ "Ticket #1736"
+ s1 = b"hello1"
+ s2 = b"hello2"
+ a = np.zeros((), dtype="S10")
+ a[()] = s1
+ assert_equal(a, np.array(s1))
+ a[()] = np.array(s2)
+ assert_equal(a, np.array(s2))
+
+ a = np.zeros((), dtype='f4')
+ a[()] = 3
+ assert_equal(a, np.array(3))
+ a[()] = np.array(4)
+ assert_equal(a, np.array(4))
+
+ def test_string_astype(self):
+ "Ticket #1748"
+ s1 = b'black'
+ s2 = b'white'
+ s3 = b'other'
+ a = np.array([[s1], [s2], [s3]])
+ assert_equal(a.dtype, np.dtype('S5'))
+ b = a.astype(np.dtype('S0'))
+ assert_equal(b.dtype, np.dtype('S5'))
+
+ def test_ticket_1756(self):
+ # Ticket #1756
+ s = b'0123456789abcdef'
+ a = np.array([s]*5)
+ for i in range(1, 17):
+ a1 = np.array(a, "|S%d" % i)
+ a2 = np.array([s[:i]]*5)
+ assert_equal(a1, a2)
+
+ def test_fields_strides(self):
+ "gh-2355"
+ r = np.frombuffer(b'abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2')
+ assert_equal(r[0:3:2]['f1'], r['f1'][0:3:2])
+ assert_equal(r[0:3:2]['f1'][0], r[0:3:2][0]['f1'])
+ assert_equal(r[0:3:2]['f1'][0][()], r[0:3:2][0]['f1'][()])
+ assert_equal(r[0:3:2]['f1'][0].strides, r[0:3:2][0]['f1'].strides)
+
+ def test_alignment_update(self):
+ # Check that alignment flag is updated on stride setting
+ a = np.arange(10)
+ assert_(a.flags.aligned)
+ a.strides = 3
+ assert_(not a.flags.aligned)
+
+ def test_ticket_1770(self):
+ "Should not segfault on python 3k"
+ import numpy as np
+ try:
+ a = np.zeros((1,), dtype=[('f1', 'f')])
+ a['f1'] = 1
+ a['f2'] = 1
+ except ValueError:
+ pass
+ except Exception:
+ raise AssertionError
+
+ def test_ticket_1608(self):
+ "x.flat shouldn't modify data"
+ x = np.array([[1, 2], [3, 4]]).T
+ np.array(x.flat)
+ assert_equal(x, [[1, 3], [2, 4]])
+
+ def test_pickle_string_overwrite(self):
+ import re
+
+ data = np.array([1], dtype='b')
+ blob = pickle.dumps(data, protocol=1)
+ data = pickle.loads(blob)
+
+ # Check that loads does not clobber interned strings
+ s = re.sub("a(.)", "\x01\\1", "a_")
+ assert_equal(s[0], "\x01")
+ data[0] = 0xbb
+ s = re.sub("a(.)", "\x01\\1", "a_")
+ assert_equal(s[0], "\x01")
+
+ def test_pickle_bytes_overwrite(self):
+ if sys.version_info[0] >= 3:
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ data = np.array([1], dtype='b')
+ data = pickle.loads(pickle.dumps(data, protocol=proto))
+ data[0] = 0xdd
+ bytestring = "\x01 ".encode('ascii')
+ assert_equal(bytestring[0:1], '\x01'.encode('ascii'))
+
+ def test_pickle_py2_array_latin1_hack(self):
+ # Check that unpickling hacks in Py3 that support
+ # encoding='latin1' work correctly.
+
+ # Python2 output for pickle.dumps(numpy.array([129], dtype='b'))
+ data = (b"cnumpy.core.multiarray\n_reconstruct\np0\n(cnumpy\nndarray\np1\n(I0\n"
+ b"tp2\nS'b'\np3\ntp4\nRp5\n(I1\n(I1\ntp6\ncnumpy\ndtype\np7\n(S'i1'\np8\n"
+ b"I0\nI1\ntp9\nRp10\n(I3\nS'|'\np11\nNNNI-1\nI-1\nI0\ntp12\nbI00\nS'\\x81'\n"
+ b"p13\ntp14\nb.")
+ if sys.version_info[0] >= 3:
+ # This should work:
+ result = pickle.loads(data, encoding='latin1')
+ assert_array_equal(result, np.array([129], dtype='b'))
+ # Should not segfault:
+ assert_raises(Exception, pickle.loads, data, encoding='koi8-r')
+
+ def test_pickle_py2_scalar_latin1_hack(self):
+ # Check that the scalar unpickling hack in Py3 that supports
+ # encoding='latin1' works correctly.
+
+ # Python2 output for pickle.dumps(...)
+ datas = [
+ # (original, python2_pickle, koi8r_validity)
+ (np.unicode_('\u6bd2'),
+ (b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n"
+ b"(S'U1'\np2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI4\nI4\nI0\n"
+ b"tp6\nbS'\\xd2k\\x00\\x00'\np7\ntp8\nRp9\n."),
+ 'invalid'),
+
+ (np.float64(9e123),
+ (b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n(S'f8'\n"
+ b"p2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI-1\nI-1\nI0\ntp6\n"
+ b"bS'O\\x81\\xb7Z\\xaa:\\xabY'\np7\ntp8\nRp9\n."),
+ 'invalid'),
+
+ (np.bytes_(b'\x9c'), # different 8-bit code point in KOI8-R vs latin1
+ (b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n(S'S1'\np2\n"
+ b"I0\nI1\ntp3\nRp4\n(I3\nS'|'\np5\nNNNI1\nI1\nI0\ntp6\nbS'\\x9c'\np7\n"
+ b"tp8\nRp9\n."),
+ 'different'),
+ ]
+ if sys.version_info[0] >= 3:
+ for original, data, koi8r_validity in datas:
+ result = pickle.loads(data, encoding='latin1')
+ assert_equal(result, original)
+
+ # Decoding under a non-latin1 encoding (e.g. KOI8-R) can
+ # produce bad results, but should not segfault.
+ if koi8r_validity == 'different':
+ # Unicode code points happen to lie within latin1,
+ # but are different in koi8-r, resulting in silent
+ # bogus results
+ result = pickle.loads(data, encoding='koi8-r')
+ assert_(result != original)
+ elif koi8r_validity == 'invalid':
+ # Unicode code points outside latin1, so this results
+ # in an encoding exception
+ assert_raises(ValueError, pickle.loads, data, encoding='koi8-r')
+ else:
+ raise ValueError(koi8r_validity)
+
+ def test_structured_type_to_object(self):
+ a_rec = np.array([(0, 1), (3, 2)], dtype='i4,i8')
+ a_obj = np.empty((2,), dtype=object)
+ a_obj[0] = (0, 1)
+ a_obj[1] = (3, 2)
+ # astype records -> object
+ assert_equal(a_rec.astype(object), a_obj)
+ # '=' records -> object
+ b = np.empty_like(a_obj)
+ b[...] = a_rec
+ assert_equal(b, a_obj)
+ # '=' object -> records
+ b = np.empty_like(a_rec)
+ b[...] = a_obj
+ assert_equal(b, a_rec)
+
+ def test_assign_obj_listoflists(self):
+ # Ticket # 1870
+ # The inner list should get assigned to the object elements
+ a = np.zeros(4, dtype=object)
+ b = a.copy()
+ a[0] = [1]
+ a[1] = [2]
+ a[2] = [3]
+ a[3] = [4]
+ b[...] = [[1], [2], [3], [4]]
+ assert_equal(a, b)
+ # The first dimension should get broadcast
+ a = np.zeros((2, 2), dtype=object)
+ a[...] = [[1, 2]]
+ assert_equal(a, [[1, 2], [1, 2]])
+
+ def test_memoryleak(self):
+ # Ticket #1917 - ensure that array data doesn't leak
+ for i in range(1000):
+ # 100MB times 1000 would give 100GB of memory usage if it leaks
+ a = np.empty((100000000,), dtype='i1')
+ del a
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_ufunc_reduce_memoryleak(self):
+ a = np.arange(6)
+ acnt = sys.getrefcount(a)
+ np.add.reduce(a)
+ assert_equal(sys.getrefcount(a), acnt)
+
+ def test_search_sorted_invalid_arguments(self):
+ # Ticket #2021, should not segfault.
+ x = np.arange(0, 4, dtype='datetime64[D]')
+ assert_raises(TypeError, x.searchsorted, 1)
+
+ def test_string_truncation(self):
+ # Ticket #1990 - Data can be truncated in creation of an array from a
+ # mixed sequence of numeric values and strings
+ for val in [True, 1234, 123.4, complex(1, 234)]:
+ for tostr in [asunicode, asbytes]:
+ b = np.array([val, tostr('xx')])
+ assert_equal(tostr(b[0]), tostr(val))
+ b = np.array([tostr('xx'), val])
+ assert_equal(tostr(b[1]), tostr(val))
+
+ # test also with longer strings
+ b = np.array([val, tostr('xxxxxxxxxx')])
+ assert_equal(tostr(b[0]), tostr(val))
+ b = np.array([tostr('xxxxxxxxxx'), val])
+ assert_equal(tostr(b[1]), tostr(val))
+
+ def test_string_truncation_ucs2(self):
+ # Ticket #2081. On Python builds compiled with two-byte unicode,
+ # data can be truncated if the itemsize is not properly
+ # adjusted for NumPy's four-byte unicode.
+ if sys.version_info[0] >= 3:
+ a = np.array(['abcd'])
+ else:
+ a = np.array([u'abcd'])
+ assert_equal(a.dtype.itemsize, 16)
+
+ def test_unique_stable(self):
+ # Ticket #2063: a stable sort must always be chosen for argsort to
+ # get consistent results
+ v = np.array(([0]*5 + [1]*6 + [2]*6)*4)
+ res = np.unique(v, return_index=True)
+ tgt = (np.array([0, 1, 2]), np.array([ 0, 5, 11]))
+ assert_equal(res, tgt)
+
+ def test_unicode_alloc_dealloc_match(self):
+ # Ticket #1578, the mismatch only showed up when running
+ # python-debug for python versions >= 2.7, and then as
+ # a core dump and error message.
+ a = np.array(['abc'], dtype=np.unicode)[0]
+ del a
+
+ def test_refcount_error_in_clip(self):
+ # Ticket #1588
+ a = np.zeros((2,), dtype='>i2').clip(min=0)
+ x = a + a
+ # This used to segfault:
+ y = str(x)
+ # Check the final string:
+ assert_(y == "[0 0]")
+
+ def test_searchsorted_wrong_dtype(self):
+ # Ticket #2189, it used to segfault, so we check that it raises the
+ # proper exception.
+ a = np.array([('a', 1)], dtype='S1, int')
+ assert_raises(TypeError, np.searchsorted, a, 1.2)
+ # Ticket #2066, similar problem:
+ dtype = np.format_parser(['i4', 'i4'], [], [])
+ a = np.recarray((2, ), dtype)
+ assert_raises(TypeError, np.searchsorted, a, 1)
+
+ def test_complex64_alignment(self):
+ # Issue gh-2668 (trac 2076), segfault on sparc due to misalignment
+ dtt = np.complex64
+ arr = np.arange(10, dtype=dtt)
+ # 2D array
+ arr2 = np.reshape(arr, (2, 5))
+ # Fortran write followed by (C or F) read caused bus error
+ data_str = arr2.tobytes('F')
+ data_back = np.ndarray(arr2.shape,
+ arr2.dtype,
+ buffer=data_str,
+ order='F')
+ assert_array_equal(arr2, data_back)
+
+ def test_structured_count_nonzero(self):
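+ # A structured element whose fields are all zero should not be
+ # counted as nonzero.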
+ arr = np.array([0, 1]).astype('i4, (2)i4')[:1]
+ count = np.count_nonzero(arr)
+ assert_equal(count, 0)
+
+ def test_copymodule_preserves_f_contiguity(self):
+ a = np.empty((2, 2), order='F')
+ b = copy.copy(a)
+ c = copy.deepcopy(a)
+ assert_(b.flags.fortran)
+ assert_(b.flags.f_contiguous)
+ assert_(c.flags.fortran)
+ assert_(c.flags.f_contiguous)
+
+ def test_fortran_order_buffer(self):
+ import numpy as np
+ a = np.array([['Hello', 'Foob']], dtype='U5', order='F')
+ arr = np.ndarray(shape=[1, 2, 5], dtype='U1', buffer=a)
+ arr2 = np.array([[[u'H', u'e', u'l', u'l', u'o'],
+ [u'F', u'o', u'o', u'b', u'']]])
+ assert_array_equal(arr, arr2)
+
+ def test_assign_from_sequence_error(self):
+ # Ticket #4024.
+ arr = np.array([1, 2, 3])
+ assert_raises(ValueError, arr.__setitem__, slice(None), [9, 9])
+ arr.__setitem__(slice(None), [9])
+ assert_equal(arr, [9, 9, 9])
+
+ def test_format_on_flex_array_element(self):
+ # Ticket #4369.
+ dt = np.dtype([('date', '<M8[D]'), ('val', '<f8')])
+ arr = np.array([('2000-01-01', 1)], dt)
+ formatted = '{0}'.format(arr[0])
+ assert_equal(formatted, str(arr[0]))
+
+ def test_deepcopy_on_0d_array(self):
+ # Ticket #3311.
+ arr = np.array(3)
+ arr_cp = copy.deepcopy(arr)
+
+ assert_equal(arr, arr_cp)
+ assert_equal(arr.shape, arr_cp.shape)
+ assert_equal(int(arr), int(arr_cp))
+ assert_(arr is not arr_cp)
+ assert_(isinstance(arr_cp, type(arr)))
+
+ def test_deepcopy_F_order_object_array(self):
+ # Ticket #6456.
+ a = {'a': 1}
+ b = {'b': 2}
+ arr = np.array([[a, b], [a, b]], order='F')
+ arr_cp = copy.deepcopy(arr)
+
+ assert_equal(arr, arr_cp)
+ assert_(arr is not arr_cp)
+ # Ensure that we have actually copied the item.
+ assert_(arr[0, 1] is not arr_cp[1, 1])
+ # Ensure we are allowed to have references to the same object.
+ assert_(arr[0, 1] is arr[1, 1])
+ # Check the references hold for the copied objects.
+ assert_(arr_cp[0, 1] is arr_cp[1, 1])
+
+ def test_deepcopy_empty_object_array(self):
+ # Ticket #8536.
+ # Deepcopy should succeed
+ a = np.array([], dtype=object)
+ b = copy.deepcopy(a)
+ assert_(a.shape == b.shape)
+
+ def test_bool_subscript_crash(self):
+ # gh-4494
+ c = np.rec.array([(1, 2, 3), (4, 5, 6)])
+ masked = c[np.array([True, False])]
+ base = masked.base
+ del masked, c
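+ # Accessing attributes of the surviving base array must not crash.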
+ base.dtype
+
+ def test_richcompare_crash(self):
+ # gh-4613
+ import operator as op
+
+ # dummy class where __array__ throws exception
+ class Foo(object):
+ __array_priority__ = 1002
+
+ def __array__(self, *args, **kwargs):
+ raise Exception()
+
+ rhs = Foo()
+ lhs = np.array(1)
+ for f in [op.lt, op.le, op.gt, op.ge]:
+ if sys.version_info[0] >= 3:
+ assert_raises(TypeError, f, lhs, rhs)
+ elif not sys.py3kwarning:
+ # With -3 switch in python 2, DeprecationWarning is raised
+ # which we are not interested in
+ f(lhs, rhs)
+ assert_(not op.eq(lhs, rhs))
+ assert_(op.ne(lhs, rhs))
+
+ def test_richcompare_scalar_and_subclass(self):
+ # gh-4709
+ class Foo(np.ndarray):
+ def __eq__(self, other):
+ return "OK"
+
+ x = np.array([1, 2, 3]).view(Foo)
+ assert_equal(10 == x, "OK")
+ assert_equal(np.int32(10) == x, "OK")
+ assert_equal(np.array([10]) == x, "OK")
+
+ def test_pickle_empty_string(self):
+ # gh-3926
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ test_string = np.string_('')
+ assert_equal(pickle.loads(
+ pickle.dumps(test_string, protocol=proto)), test_string)
+
+ def test_frompyfunc_many_args(self):
+ # gh-5672
+
+ def passer(*args):
+ pass
+
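+ # 32 inputs plus 1 output exceeds the maximum number of arguments a
+ # ufunc can take, so this must raise cleanly instead of crashing.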
+ assert_raises(ValueError, np.frompyfunc, passer, 32, 1)
+
+ def test_repeat_broadcasting(self):
+ # gh-5743
+ a = np.arange(60).reshape(3, 4, 5)
+ for axis in chain(range(-a.ndim, a.ndim), [None]):
+ assert_equal(a.repeat(2, axis=axis), a.repeat([2], axis=axis))
+
+ def test_frompyfunc_nout_0(self):
+ # gh-2014
+
+ def f(x):
+ x[0], x[-1] = x[-1], x[0]
+
+ uf = np.frompyfunc(f, 1, 0)
+ a = np.array([[1, 2, 3], [4, 5], [6, 7, 8, 9]])
+ assert_equal(uf(a), ())
+ assert_array_equal(a, [[3, 2, 1], [5, 4], [9, 7, 8, 6]])
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_leak_in_structured_dtype_comparison(self):
+ # gh-6250
+ recordtype = np.dtype([('a', np.float64),
+ ('b', np.int32),
+ ('d', (str, 5))])
+
+ # Simple case
+ a = np.zeros(2, dtype=recordtype)
+ for i in range(100):
+ a == a
+ assert_(sys.getrefcount(a) < 10)
+
+ # The case in the bug report.
+ before = sys.getrefcount(a)
+ u, v = a[0], a[1]
+ u == v
+ del u, v
+ gc.collect()
+ after = sys.getrefcount(a)
+ assert_equal(before, after)
+
+ def test_empty_percentile(self):
+ # gh-6530 / gh-6553
+ assert_array_equal(np.percentile(np.arange(10), []), np.array([]))
+
+ def test_void_compare_segfault(self):
+ # gh-6922. The following should not segfault
+ a = np.ones(3, dtype=[('object', 'O'), ('int', '<i2')])
+ a.sort()
+
+ def test_reshape_size_overflow(self):
+ # gh-7455
+ a = np.ones(20)[::2]
+ if np.dtype(np.intp).itemsize == 8:
+ # 64 bit. The following are the prime factors of 2**63 + 5,
+ # plus a leading 2, so when multiplied together as int64,
+ # the result overflows to a total size of 10.
+ new_shape = (2, 13, 419, 691, 823, 2977518503)
+ else:
+ # 32 bit. The following are the prime factors of 2**31 + 5,
+ # plus a leading 2, so when multiplied together as int32,
+ # the result overflows to a total size of 10.
+ new_shape = (2, 7, 7, 43826197)
+ assert_raises(ValueError, a.reshape, new_shape)
+
+ def test_invalid_structured_dtypes(self):
+ # gh-2865
+ # mapping python objects to other dtypes
+ assert_raises(ValueError, np.dtype, ('O', [('name', 'i8')]))
+ assert_raises(ValueError, np.dtype, ('i8', [('name', 'O')]))
+ assert_raises(ValueError, np.dtype,
+ ('i8', [('name', [('name', 'O')])]))
+ assert_raises(ValueError, np.dtype, ([('a', 'i4'), ('b', 'i4')], 'O'))
+ assert_raises(ValueError, np.dtype, ('i8', 'O'))
+ # wrong number/type of tuple elements in dict
+ assert_raises(ValueError, np.dtype,
+ ('i', {'name': ('i', 0, 'title', 'oops')}))
+ assert_raises(ValueError, np.dtype,
+ ('i', {'name': ('i', 'wrongtype', 'title')}))
+ # disallowed as of 1.13
+ assert_raises(ValueError, np.dtype,
+ ([('a', 'O'), ('b', 'O')], [('c', 'O'), ('d', 'O')]))
+ # allowed as a special case due to existing use, see gh-2798
+ a = np.ones(1, dtype=('O', [('name', 'O')]))
+ assert_equal(a[0], 1)
+
+ def test_correct_hash_dict(self):
+ # gh-8887 - __hash__ would be None despite tp_hash being set
+ all_types = set(np.typeDict.values()) - {np.void}
+ for t in all_types:
+ val = t()
+
+ try:
+ hash(val)
+ except TypeError as e:
+ assert_equal(t.__hash__, None)
+ else:
+ assert_(t.__hash__ != None)
+
+ def test_scalar_copy(self):
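+ # copy.copy() of a scalar of any type should produce an equal scalar.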
+ scalar_types = set(np.sctypeDict.values())
+ values = {
+ np.void: b"a",
+ np.bytes_: b"a",
+ np.unicode_: "a",
+ np.datetime64: "2017-08-25",
+ }
+ for sctype in scalar_types:
+ item = sctype(values.get(sctype, 1))
+ item2 = copy.copy(item)
+ assert_equal(item, item2)
+
+ def test_void_item_memview(self):
+ va = np.zeros(10, 'V4')
+ x = va[:1].item()
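+ # The extracted item must own its data: modifying or deleting the
+ # source array should not change it.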
+ va[0] = b'\xff\xff\xff\xff'
+ del va
+ assert_equal(x, b'\x00\x00\x00\x00')
+
+ def test_void_getitem(self):
+ # Test fix for gh-11668.
+ assert_(np.array([b'a'], 'V1').astype('O') == b'a')
+ assert_(np.array([b'ab'], 'V2').astype('O') == b'ab')
+ assert_(np.array([b'abc'], 'V3').astype('O') == b'abc')
+ assert_(np.array([b'abcd'], 'V4').astype('O') == b'abcd')
+
+ def test_structarray_title(self):
+ # The following used to segfault on pypy, due to NPY_TITLE_KEY
+ # not working properly and resulting in a double-decref of the
+ # structured array field items:
+ # See: https://bitbucket.org/pypy/pypy/issues/2789
+ for j in range(5):
+ structure = np.array([1], dtype=[(('x', 'X'), np.object_)])
+ structure[0]['x'] = np.array([2])
+ gc.collect()
+
+ def test_dtype_scalar_squeeze(self):
+ # gh-11384
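+ # squeeze() on a zero-dimensional scalar should be a no-op that
+ # preserves both the value and the scalar type.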
+ values = {
+ 'S': b"a",
+ 'M': "2018-06-20",
+ }
+ for ch in np.typecodes['All']:
+ if ch in 'O':
+ continue
+ sctype = np.dtype(ch).type
+ scvalue = sctype(values.get(ch, 3))
+ for axis in [None, ()]:
+ squeezed = scvalue.squeeze(axis=axis)
+ assert_equal(squeezed, scvalue)
+ assert_equal(type(squeezed), type(scvalue))
+
+ def test_field_access_by_title(self):
+ # gh-11507
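+ # Repeated field access by title must not erroneously drop references
+ # to the title string.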
+ s = 'Some long field name'
+ if HAS_REFCOUNT:
+ base = sys.getrefcount(s)
+ t = np.dtype([((s, 'f1'), np.float64)])
+ data = np.zeros(10, t)
+ for i in range(10):
+ str(data[['f1']])
+ if HAS_REFCOUNT:
+ assert_(base <= sys.getrefcount(s))
+
+ @pytest.mark.parametrize('val', [
+ # arrays and scalars
+ np.ones((10, 10), dtype='int32'),
+ np.uint64(10),
+ ])
+ @pytest.mark.parametrize('protocol',
+ range(2, pickle.HIGHEST_PROTOCOL + 1)
+ )
+ def test_pickle_module(self, protocol, val):
+ # gh-12837
+ s = pickle.dumps(val, protocol)
+ assert b'_multiarray_umath' not in s
+ if protocol == 5 and len(val.shape) > 0:
+ # unpickling ndarray goes through _frombuffer for protocol 5
+ assert b'numpy.core.numeric' in s
+ else:
+ assert b'numpy.core.multiarray' in s
+
+ def test_object_casting_errors(self):
+ # gh-11993
+ arr = np.array(['AAAAA', 18465886.0, 18465886.0], dtype=object)
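+ # Casting an object array holding strings to complex should raise,
+ # not crash.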
+ assert_raises(TypeError, arr.astype, 'c8')
+
+ def test_ediff1d_casting(self):
+ # gh-12711
+ x = np.array([1, 2, 4, 7, 0], dtype=np.int16)
+ res = np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
+ assert_equal(res, [-99, 1, 2, 3, -7, 88, 99])
+ assert_raises(ValueError, np.ediff1d, x, to_begin=(1<<20))
+ assert_raises(ValueError, np.ediff1d, x, to_end=(1<<20))
+
+ def test_pickle_datetime64_array(self):
+ # gh-12745 (would fail with pickle5 installed)
+ d = np.datetime64('2015-07-04 12:59:59.50', 'ns')
+ arr = np.array([d])
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ dumped = pickle.dumps(arr, protocol=proto)
+ assert_equal(pickle.loads(dumped), arr)
+
+ def test_bad_array_interface(self):
+ class T(object):
+ __array_interface__ = {}
+
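+ # An empty __array_interface__ should not crash array creation.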
+ np.array([T()])
+
+ def test_2d__array__shape(self):
+ class T(object):
+ def __array__(self):
+ return np.ndarray(shape=(0,0))
+
+ # Make sure __array__ is used instead of Sequence methods.
+ def __iter__(self):
+ return iter([])
+
+ def __getitem__(self, idx):
+ raise AssertionError("__getitem__ was called")
+
+ def __len__(self):
+ return 0
+
+
+ t = T()
+ # gh-13659: would raise in the broadcast assignment [x = t for x in result]
+ np.array([t])
+
+ @pytest.mark.skipif(sys.maxsize < 2 ** 31 + 1, reason='overflows 32-bit python')
+ @pytest.mark.skipif(sys.platform == 'win32' and sys.version_info[:2] < (3, 8),
+ reason='overflows on windows, fixed in bpo-16865')
+ def test_to_ctypes(self):
+ # gh-14214
+ arr = np.zeros((2 ** 31 + 1,), 'b')
+ assert arr.size * arr.itemsize > 2 ** 31
+ c_arr = np.ctypeslib.as_ctypes(arr)
+ assert_equal(c_arr._length_, arr.size)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_scalar_ctors.py b/contrib/python/numpy/py2/numpy/core/tests/test_scalar_ctors.py
new file mode 100644
index 00000000000..b21bc9dad07
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_scalar_ctors.py
@@ -0,0 +1,65 @@
+"""
+Test the scalar constructors, which also do type-coercion
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys
+import platform
+import pytest
+
+import numpy as np
+from numpy.testing import (
+ assert_equal, assert_almost_equal, assert_raises, assert_warns,
+ )
+
+class TestFromString(object):
+ def test_floating(self):
+ # Ticket #640, floats from string
+ fsingle = np.single('1.234')
+ fdouble = np.double('1.234')
+ flongdouble = np.longdouble('1.234')
+ assert_almost_equal(fsingle, 1.234)
+ assert_almost_equal(fdouble, 1.234)
+ assert_almost_equal(flongdouble, 1.234)
+
+ def test_floating_overflow(self):
+ """ Strings containing an unrepresentable float overflow """
+ fhalf = np.half('1e10000')
+ assert_equal(fhalf, np.inf)
+ fsingle = np.single('1e10000')
+ assert_equal(fsingle, np.inf)
+ fdouble = np.double('1e10000')
+ assert_equal(fdouble, np.inf)
+ flongdouble = assert_warns(RuntimeWarning, np.longdouble, '1e10000')
+ assert_equal(flongdouble, np.inf)
+
+ fhalf = np.half('-1e10000')
+ assert_equal(fhalf, -np.inf)
+ fsingle = np.single('-1e10000')
+ assert_equal(fsingle, -np.inf)
+ fdouble = np.double('-1e10000')
+ assert_equal(fdouble, -np.inf)
+ flongdouble = assert_warns(RuntimeWarning, np.longdouble, '-1e10000')
+ assert_equal(flongdouble, -np.inf)
+
+ @pytest.mark.skipif((sys.version_info[0] >= 3)
+ or (sys.platform == "win32"
+ and platform.architecture()[0] == "64bit"),
+ reason="numpy.intp('0xff', 16) not supported on Py3 "
+ "or 64 bit Windows")
+ def test_intp(self):
+ # Ticket #99
+ i_width = np.int_(0).nbytes*2 - 1
+ np.intp('0x' + 'f'*i_width, 16)
+ assert_raises(OverflowError, np.intp, '0x' + 'f'*(i_width+1), 16)
+ assert_raises(ValueError, np.intp, '0x1', 32)
+ assert_equal(255, np.intp('0xFF', 16))
+
+
+class TestFromInt(object):
+ def test_intp(self):
+ # Ticket #99
+ assert_equal(1024, np.intp(1024))
+
+ def test_uint64_from_negative(self):
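+ # Negative values wrap around modulo 2**64 when converted to uint64.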
+ assert_equal(np.uint64(-2), np.uint64(18446744073709551614))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_scalarbuffer.py b/contrib/python/numpy/py2/numpy/core/tests/test_scalarbuffer.py
new file mode 100644
index 00000000000..cd520d99b6d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_scalarbuffer.py
@@ -0,0 +1,105 @@
+"""
+Test that the scalar buffer interface adheres to PEP 3118
+"""
+import sys
+import numpy as np
+import pytest
+
+from numpy.testing import assert_, assert_equal, assert_raises
+
+# PEP3118 format strings for native (standard alignment and byteorder) types
+scalars_and_codes = [
+ (np.bool_, '?'),
+ (np.byte, 'b'),
+ (np.short, 'h'),
+ (np.intc, 'i'),
+ (np.int_, 'l'),
+ (np.longlong, 'q'),
+ (np.ubyte, 'B'),
+ (np.ushort, 'H'),
+ (np.uintc, 'I'),
+ (np.uint, 'L'),
+ (np.ulonglong, 'Q'),
+ (np.half, 'e'),
+ (np.single, 'f'),
+ (np.double, 'd'),
+ (np.longdouble, 'g'),
+ (np.csingle, 'Zf'),
+ (np.cdouble, 'Zd'),
+ (np.clongdouble, 'Zg'),
+]
+scalars_only, codes_only = zip(*scalars_and_codes)
+
+
[email protected](sys.version_info.major < 3,
+ reason="Python 2 scalars lack a buffer interface")
+class TestScalarPEP3118(object):
+
+ @pytest.mark.parametrize('scalar', scalars_only, ids=codes_only)
+ def test_scalar_match_array(self, scalar):
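+ # The PEP 3118 format of a scalar should match that of an empty
+ # array of the same dtype.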
+ x = scalar()
+ a = np.array([], dtype=np.dtype(scalar))
+ mv_x = memoryview(x)
+ mv_a = memoryview(a)
+ assert_equal(mv_x.format, mv_a.format)
+
+ @pytest.mark.parametrize('scalar', scalars_only, ids=codes_only)
+ def test_scalar_dim(self, scalar):
+ x = scalar()
+ mv_x = memoryview(x)
+ assert_equal(mv_x.itemsize, np.dtype(scalar).itemsize)
+ assert_equal(mv_x.ndim, 0)
+ assert_equal(mv_x.shape, ())
+ assert_equal(mv_x.strides, ())
+ assert_equal(mv_x.suboffsets, ())
+
+ @pytest.mark.parametrize('scalar, code', scalars_and_codes, ids=codes_only)
+ def test_scalar_known_code(self, scalar, code):
+ x = scalar()
+ mv_x = memoryview(x)
+ assert_equal(mv_x.format, code)
+
+ def test_void_scalar_structured_data(self):
+ dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
+ x = np.array(('ndarray_scalar', (1.2, 3.0)), dtype=dt)[()]
+ assert_(isinstance(x, np.void))
+ mv_x = memoryview(x)
+ expected_size = 16 * np.dtype((np.unicode_, 1)).itemsize
+ expected_size += 2 * np.dtype((np.float64, 1)).itemsize
+ assert_equal(mv_x.itemsize, expected_size)
+ assert_equal(mv_x.ndim, 0)
+ assert_equal(mv_x.shape, ())
+ assert_equal(mv_x.strides, ())
+ assert_equal(mv_x.suboffsets, ())
+
+ # check scalar format string against ndarray format string
+ a = np.array([('Sarah', (8.0, 7.0)), ('John', (6.0, 7.0))], dtype=dt)
+ assert_(isinstance(a, np.ndarray))
+ mv_a = memoryview(a)
+ assert_equal(mv_x.itemsize, mv_a.itemsize)
+ assert_equal(mv_x.format, mv_a.format)
+
+ def test_datetime_memoryview(self):
+ # gh-11656
+ # Values verified with v1.13.3, shape is not () as in test_scalar_dim
+ def as_dict(m):
+ return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
+ ndim=m.ndim, format=m.format)
+
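+ # A datetime64 scalar exposes its data as a one-dimensional buffer of
+ # 8 unsigned bytes.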
+ dt1 = np.datetime64('2016-01-01')
+ dt2 = np.datetime64('2017-01-01')
+ expected = {'strides': (1,), 'itemsize': 1, 'ndim': 1,
+ 'shape': (8,), 'format': 'B'}
+ v = memoryview(dt1)
+ res = as_dict(v)
+ assert_equal(res, expected)
+
+ v = memoryview(dt2 - dt1)
+ res = as_dict(v)
+ assert_equal(res, expected)
+
+ dt = np.dtype([('a', 'uint16'), ('b', 'M8[s]')])
+ a = np.empty(1, dt)
+ # Fails to create a PEP 3118 valid buffer
+ assert_raises((ValueError, BufferError), memoryview, a[0])
+
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_scalarinherit.py b/contrib/python/numpy/py2/numpy/core/tests/test_scalarinherit.py
new file mode 100644
index 00000000000..9e32cf624de
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_scalarinherit.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+""" Test printing of scalar types.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_
+
+
+class A(object):
+ pass
+class B(A, np.float64):
+ pass
+
+class C(B):
+ pass
+class D(C, B):
+ pass
+
+class B0(np.float64, A):
+ pass
+class C0(B0):
+ pass
+
+class TestInherit(object):
+ def test_init(self):
+ x = B(1.0)
+ assert_(str(x) == '1.0')
+ y = C(2.0)
+ assert_(str(y) == '2.0')
+ z = D(3.0)
+ assert_(str(z) == '3.0')
+
+ def test_init2(self):
+ x = B0(1.0)
+ assert_(str(x) == '1.0')
+ y = C0(2.0)
+ assert_(str(y) == '2.0')
+
+
+class TestCharacter(object):
+ def test_char_radd(self):
+ # GH issue 9620, reached gentype_add and raise TypeError
+ np_s = np.string_('abc')
+ np_u = np.unicode_('abc')
+ s = b'def'
+ u = u'def'
+ assert_(np_s.__radd__(np_s) is NotImplemented)
+ assert_(np_s.__radd__(np_u) is NotImplemented)
+ assert_(np_s.__radd__(s) is NotImplemented)
+ assert_(np_s.__radd__(u) is NotImplemented)
+ assert_(np_u.__radd__(np_s) is NotImplemented)
+ assert_(np_u.__radd__(np_u) is NotImplemented)
+ assert_(np_u.__radd__(s) is NotImplemented)
+ assert_(np_u.__radd__(u) is NotImplemented)
+ assert_(s + np_s == b'defabc')
+ assert_(u + np_u == u'defabc')
+
+
+ class Mystr(str, np.generic):
+ # would segfault
+ pass
+
+ ret = s + Mystr('abc')
+ assert_(type(ret) is type(s))
+
+ def test_char_repeat(self):
+ np_s = np.string_('abc')
+ np_u = np.unicode_('abc')
+ np_i = np.int(5)
+ res_s = b'abc' * 5
+ res_u = u'abc' * 5
+ assert_(np_s * np_i == res_s)
+ assert_(np_u * np_i == res_u)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_scalarmath.py b/contrib/python/numpy/py2/numpy/core/tests/test_scalarmath.py
new file mode 100644
index 00000000000..ebba457e3fa
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_scalarmath.py
@@ -0,0 +1,666 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import warnings
+import itertools
+import operator
+import platform
+import pytest
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_almost_equal,
+ assert_array_equal, IS_PYPY, suppress_warnings, _gen_alignment_data,
+ assert_warns
+ )
+
+types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc,
+ np.int_, np.uint, np.longlong, np.ulonglong,
+ np.single, np.double, np.longdouble, np.csingle,
+ np.cdouble, np.clongdouble]
+
+floating_types = np.floating.__subclasses__()
+complex_floating_types = np.complexfloating.__subclasses__()
+
+
+# This compares scalarmath against ufuncs.
+
+class TestTypes(object):
+ def test_types(self):
+ for atype in types:
+ a = atype(1)
+ assert_(a == 1, "error with %r: got %r" % (atype, a))
+
+ def test_type_add(self):
+ # list of types
+ for k, atype in enumerate(types):
+ a_scalar = atype(3)
+ a_array = np.array([3], dtype=atype)
+ for l, btype in enumerate(types):
+ b_scalar = btype(1)
+ b_array = np.array([1], dtype=btype)
+ c_scalar = a_scalar + b_scalar
+ c_array = a_array + b_array
+ # It was comparing the type numbers, but the new ufunc
+ # function-finding mechanism finds the lowest function
+ # to which both inputs can be cast - which produces 'l'
+ # when you do 'q' + 'b'. The old function finding mechanism
+ # skipped ahead based on the first argument, but that
+ # does not produce properly symmetric results...
+ assert_equal(c_scalar.dtype, c_array.dtype,
+ "error with types (%d/'%c' + %d/'%c')" %
+ (k, np.dtype(atype).char, l, np.dtype(btype).char))
+
+ def test_type_create(self):
+ for k, atype in enumerate(types):
+ a = np.array([1, 2, 3], atype)
+ b = atype([1, 2, 3])
+ assert_equal(a, b)
+
+ def test_leak(self):
+ # test leak of scalar objects
+ # a leak would show up in valgrind as still-reachable of ~2.6MB
+ for i in range(200000):
+ np.add(1, 1)
+
+
+class TestBaseMath(object):
+ def test_blocked(self):
+ # test alignment offsets for simd instructions
+ # alignments for vz + 2 * (vs - 1) + 1
+ for dt, sz in [(np.float32, 11), (np.float64, 7), (np.int32, 11)]:
+ for out, inp1, inp2, msg in _gen_alignment_data(dtype=dt,
+ type='binary',
+ max_size=sz):
+ exp1 = np.ones_like(inp1)
+ inp1[...] = np.ones_like(inp1)
+ inp2[...] = np.zeros_like(inp2)
+ assert_almost_equal(np.add(inp1, inp2), exp1, err_msg=msg)
+ assert_almost_equal(np.add(inp1, 2), exp1 + 2, err_msg=msg)
+ assert_almost_equal(np.add(1, inp2), exp1, err_msg=msg)
+
+ np.add(inp1, inp2, out=out)
+ assert_almost_equal(out, exp1, err_msg=msg)
+
+ inp2[...] += np.arange(inp2.size, dtype=dt) + 1
+ assert_almost_equal(np.square(inp2),
+ np.multiply(inp2, inp2), err_msg=msg)
+ # skip true divide for ints
+ if dt != np.int32 or (sys.version_info.major < 3 and not sys.py3kwarning):
+ assert_almost_equal(np.reciprocal(inp2),
+ np.divide(1, inp2), err_msg=msg)
+
+ inp1[...] = np.ones_like(inp1)
+ np.add(inp1, 2, out=out)
+ assert_almost_equal(out, exp1 + 2, err_msg=msg)
+ inp2[...] = np.ones_like(inp2)
+ np.add(2, inp2, out=out)
+ assert_almost_equal(out, exp1 + 2, err_msg=msg)
+
+ def test_lower_align(self):
+ # check data that is not aligned to element size
+ # i.e. doubles are aligned to 4 bytes on i386
+ d = np.zeros(23 * 8, dtype=np.int8)[4:-4].view(np.float64)
+ o = np.zeros(23 * 8, dtype=np.int8)[4:-4].view(np.float64)
+ assert_almost_equal(d + d, d * 2)
+ np.add(d, d, out=o)
+ np.add(np.ones_like(d), d, out=o)
+ np.add(d, np.ones_like(d), out=o)
+ np.add(np.ones_like(d), d)
+ np.add(d, np.ones_like(d))
+
+
+class TestPower(object):
+ def test_small_types(self):
+ for t in [np.int8, np.int16, np.float16]:
+ a = t(3)
+ b = a ** 4
+ assert_(b == 81, "error with %r: got %r" % (t, b))
+
+ def test_large_types(self):
+ for t in [np.int32, np.int64, np.float32, np.float64, np.longdouble]:
+ a = t(51)
+ b = a ** 4
+ msg = "error with %r: got %r" % (t, b)
+ if np.issubdtype(t, np.integer):
+ assert_(b == 6765201, msg)
+ else:
+ assert_almost_equal(b, 6765201, err_msg=msg)
+
+ def test_integers_to_negative_integer_power(self):
+ # Note that the combination of uint64 with a signed integer
+ # has common type np.float64. The other combinations should all
+ # raise a ValueError for integer ** negative integer.
+ exp = [np.array(-1, dt)[()] for dt in 'bhilq']
+
+ # 1 ** -1 possible special case
+ base = [np.array(1, dt)[()] for dt in 'bhilqBHILQ']
+ for i1, i2 in itertools.product(base, exp):
+ if i1.dtype != np.uint64:
+ assert_raises(ValueError, operator.pow, i1, i2)
+ else:
+ res = operator.pow(i1, i2)
+ assert_(res.dtype.type is np.float64)
+ assert_almost_equal(res, 1.)
+
+ # -1 ** -1 possible special case
+ base = [np.array(-1, dt)[()] for dt in 'bhilq']
+ for i1, i2 in itertools.product(base, exp):
+ if i1.dtype != np.uint64:
+ assert_raises(ValueError, operator.pow, i1, i2)
+ else:
+ res = operator.pow(i1, i2)
+ assert_(res.dtype.type is np.float64)
+ assert_almost_equal(res, -1.)
+
+ # 2 ** -1 perhaps generic
+ base = [np.array(2, dt)[()] for dt in 'bhilqBHILQ']
+ for i1, i2 in itertools.product(base, exp):
+ if i1.dtype != np.uint64:
+ assert_raises(ValueError, operator.pow, i1, i2)
+ else:
+ res = operator.pow(i1, i2)
+ assert_(res.dtype.type is np.float64)
+ assert_almost_equal(res, .5)
+
+ def test_mixed_types(self):
+ typelist = [np.int8, np.int16, np.float16,
+ np.float32, np.float64, np.int8,
+ np.int16, np.int32, np.int64]
+ for t1 in typelist:
+ for t2 in typelist:
+ a = t1(3)
+ b = t2(2)
+ result = a**b
+ msg = ("error with %r and %r:"
+ "got %r, expected %r") % (t1, t2, result, 9)
+ if np.issubdtype(np.dtype(result), np.integer):
+ assert_(result == 9, msg)
+ else:
+ assert_almost_equal(result, 9, err_msg=msg)
+
+ def test_modular_power(self):
+ # modular power is not implemented, so ensure it errors
+ a = 5
+ b = 4
+ c = 10
+ expected = pow(a, b, c) # noqa: F841
+ for t in (np.int32, np.float32, np.complex64):
+ # note that 3-operand power only dispatches on the first argument
+ assert_raises(TypeError, operator.pow, t(a), b, c)
+ assert_raises(TypeError, operator.pow, np.array(t(a)), b, c)
+
+
+def floordiv_and_mod(x, y):
+ return (x // y, x % y)
+
+
+def _signs(dt):
+ if dt in np.typecodes['UnsignedInteger']:
+ return (+1,)
+ else:
+ return (+1, -1)
+
+
+class TestModulus(object):
+
+ def test_modulus_basic(self):
+ dt = np.typecodes['AllInteger'] + np.typecodes['Float']
+ for op in [floordiv_and_mod, divmod]:
+ for dt1, dt2 in itertools.product(dt, dt):
+ for sg1, sg2 in itertools.product(_signs(dt1), _signs(dt2)):
+ fmt = 'op: %s, dt1: %s, dt2: %s, sg1: %s, sg2: %s'
+ msg = fmt % (op.__name__, dt1, dt2, sg1, sg2)
+ a = np.array(sg1*71, dtype=dt1)[()]
+ b = np.array(sg2*19, dtype=dt2)[()]
+ div, rem = op(a, b)
+ assert_equal(div*b + rem, a, err_msg=msg)
+ if sg2 == -1:
+ assert_(b < rem <= 0, msg)
+ else:
+ assert_(b > rem >= 0, msg)
+
+ def test_float_modulus_exact(self):
+ # test that float results are exact for small integers. This also
+ # holds for the same integers scaled by powers of two.
+ nlst = list(range(-127, 0))
+ plst = list(range(1, 128))
+ dividend = nlst + [0] + plst
+ divisor = nlst + plst
+ arg = list(itertools.product(dividend, divisor))
+ tgt = list(divmod(*t) for t in arg)
+
+ a, b = np.array(arg, dtype=int).T
+ # convert exact integer results from Python to float so that
+ # signed zero can be used; it is checked below.
+ tgtdiv, tgtrem = np.array(tgt, dtype=float).T
+ tgtdiv = np.where((tgtdiv == 0.0) & ((b < 0) ^ (a < 0)), -0.0, tgtdiv)
+ tgtrem = np.where((tgtrem == 0.0) & (b < 0), -0.0, tgtrem)
+
+ for op in [floordiv_and_mod, divmod]:
+ for dt in np.typecodes['Float']:
+ msg = 'op: %s, dtype: %s' % (op.__name__, dt)
+ fa = a.astype(dt)
+ fb = b.astype(dt)
+ # use list comprehension so a_ and b_ are scalars
+ div, rem = zip(*[op(a_, b_) for a_, b_ in zip(fa, fb)])
+ assert_equal(div, tgtdiv, err_msg=msg)
+ assert_equal(rem, tgtrem, err_msg=msg)
+
+ def test_float_modulus_roundoff(self):
+ # gh-6127
+ dt = np.typecodes['Float']
+ for op in [floordiv_and_mod, divmod]:
+ for dt1, dt2 in itertools.product(dt, dt):
+ for sg1, sg2 in itertools.product((+1, -1), (+1, -1)):
+ fmt = 'op: %s, dt1: %s, dt2: %s, sg1: %s, sg2: %s'
+ msg = fmt % (op.__name__, dt1, dt2, sg1, sg2)
+ a = np.array(sg1*78*6e-8, dtype=dt1)[()]
+ b = np.array(sg2*6e-8, dtype=dt2)[()]
+ div, rem = op(a, b)
+ # Equal assertion should hold when fmod is used
+ assert_equal(div*b + rem, a, err_msg=msg)
+ if sg2 == -1:
+ assert_(b < rem <= 0, msg)
+ else:
+ assert_(b > rem >= 0, msg)
+
+ def test_float_modulus_corner_cases(self):
+ # Check remainder magnitude.
+ for dt in np.typecodes['Float']:
+ b = np.array(1.0, dtype=dt)
+ a = np.nextafter(np.array(0.0, dtype=dt), -b)
+ rem = operator.mod(a, b)
+ assert_(rem <= b, 'dt: %s' % dt)
+ rem = operator.mod(-a, -b)
+ assert_(rem >= -b, 'dt: %s' % dt)
+
+ # Check nans, inf
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in remainder")
+ for dt in np.typecodes['Float']:
+ fone = np.array(1.0, dtype=dt)
+ fzer = np.array(0.0, dtype=dt)
+ finf = np.array(np.inf, dtype=dt)
+ fnan = np.array(np.nan, dtype=dt)
+ rem = operator.mod(fone, fzer)
+ assert_(np.isnan(rem), 'dt: %s' % dt)
+ # MSVC 2008 returns NaN here, so disable the check.
+ #rem = operator.mod(fone, finf)
+ #assert_(rem == fone, 'dt: %s' % dt)
+ rem = operator.mod(fone, fnan)
+ assert_(np.isnan(rem), 'dt: %s' % dt)
+ rem = operator.mod(finf, fone)
+ assert_(np.isnan(rem), 'dt: %s' % dt)
+
+
+class TestComplexDivision(object):
+ def test_zero_division(self):
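+ # Dividing by complex zero: numerators that are finite nonzero or
+ # contain an infinity give inf, while all-nan or zero numerators give nan.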
+ with np.errstate(all="ignore"):
+ for t in [np.complex64, np.complex128]:
+ a = t(0.0)
+ b = t(1.0)
+ assert_(np.isinf(b/a))
+ b = t(complex(np.inf, np.inf))
+ assert_(np.isinf(b/a))
+ b = t(complex(np.inf, np.nan))
+ assert_(np.isinf(b/a))
+ b = t(complex(np.nan, np.inf))
+ assert_(np.isinf(b/a))
+ b = t(complex(np.nan, np.nan))
+ assert_(np.isnan(b/a))
+ b = t(0.)
+ assert_(np.isnan(b/a))
+
+ def test_signed_zeros(self):
+ with np.errstate(all="ignore"):
+ for t in [np.complex64, np.complex128]:
+ # tuples of (numerator, denominator, expected)
+ # for testing as expected == numerator/denominator
+ data = (
+ (( 0.0,-1.0), ( 0.0, 1.0), (-1.0,-0.0)),
+ (( 0.0,-1.0), ( 0.0,-1.0), ( 1.0,-0.0)),
+ (( 0.0,-1.0), (-0.0,-1.0), ( 1.0, 0.0)),
+ (( 0.0,-1.0), (-0.0, 1.0), (-1.0, 0.0)),
+ (( 0.0, 1.0), ( 0.0,-1.0), (-1.0, 0.0)),
+ (( 0.0,-1.0), ( 0.0,-1.0), ( 1.0,-0.0)),
+ ((-0.0,-1.0), ( 0.0,-1.0), ( 1.0,-0.0)),
+ ((-0.0, 1.0), ( 0.0,-1.0), (-1.0,-0.0))
+ )
+ for cases in data:
+ n = cases[0]
+ d = cases[1]
+ ex = cases[2]
+ result = t(complex(n[0], n[1])) / t(complex(d[0], d[1]))
+ # check real and imag parts separately to avoid comparison
+ # in array context, which does not account for signed zeros
+ assert_equal(result.real, ex[0])
+ assert_equal(result.imag, ex[1])
+
+ def test_branches(self):
+ with np.errstate(all="ignore"):
+ for t in [np.complex64, np.complex128]:
+ # tuples of (numerator, denominator, expected)
+ # for testing as expected == numerator/denominator
+ data = list()
+
+ # trigger branch: real(fabs(denom)) > imag(fabs(denom))
+ # followed by else condition as neither are == 0
+ data.append((( 2.0, 1.0), ( 2.0, 1.0), (1.0, 0.0)))
+
+ # trigger branch: real(fabs(denom)) > imag(fabs(denom))
+ # followed by if condition as both are == 0
+ # is performed in test_zero_division(), so this is skipped
+
+ # trigger else if branch: real(fabs(denom)) < imag(fabs(denom))
+ data.append((( 1.0, 2.0), ( 1.0, 2.0), (1.0, 0.0)))
+
+ for cases in data:
+ n = cases[0]
+ d = cases[1]
+ ex = cases[2]
+ result = t(complex(n[0], n[1])) / t(complex(d[0], d[1]))
+ # check real and imag parts separately to avoid comparison
+ # in array context, which does not account for signed zeros
+ assert_equal(result.real, ex[0])
+ assert_equal(result.imag, ex[1])
+
+
+class TestConversion(object):
+ def test_int_from_long(self):
+ l = [1e6, 1e12, 1e18, -1e6, -1e12, -1e18]
+ li = [10**6, 10**12, 10**18, -10**6, -10**12, -10**18]
+ for T in [None, np.float64, np.int64]:
+ a = np.array(l, dtype=T)
+ assert_equal([int(_m) for _m in a], li)
+
+ a = np.array(l[:3], dtype=np.uint64)
+ assert_equal([int(_m) for _m in a], li[:3])
+
+ def test_iinfo_long_values(self):
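+ # max + 1 wraps around to min for the small integer types.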
+ for code in 'bBhH':
+ res = np.array(np.iinfo(code).max + 1, dtype=code)
+ tgt = np.iinfo(code).min
+ assert_(res == tgt)
+
+ for code in np.typecodes['AllInteger']:
+ res = np.array(np.iinfo(code).max, dtype=code)
+ tgt = np.iinfo(code).max
+ assert_(res == tgt)
+
+ for code in np.typecodes['AllInteger']:
+ res = np.typeDict[code](np.iinfo(code).max)
+ tgt = np.iinfo(code).max
+ assert_(res == tgt)
+
+ def test_int_raise_behaviour(self):
+ def overflow_error_func(dtype):
+ np.typeDict[dtype](np.iinfo(dtype).max + 1)
+
+ for code in 'lLqQ':
+ assert_raises(OverflowError, overflow_error_func, code)
+
+ def test_int_from_infinite_longdouble(self):
+ # gh-627
+ x = np.longdouble(np.inf)
+ assert_raises(OverflowError, int, x)
+ with suppress_warnings() as sup:
+ sup.record(np.ComplexWarning)
+ x = np.clongdouble(np.inf)
+ assert_raises(OverflowError, int, x)
+ assert_equal(len(sup.log), 1)
+
+ @pytest.mark.skipif(not IS_PYPY, reason="Test is PyPy only (gh-9972)")
+ def test_int_from_infinite_longdouble___int__(self):
+ x = np.longdouble(np.inf)
+ assert_raises(OverflowError, x.__int__)
+ with suppress_warnings() as sup:
+ sup.record(np.ComplexWarning)
+ x = np.clongdouble(np.inf)
+ assert_raises(OverflowError, x.__int__)
+ assert_equal(len(sup.log), 1)
+
+ @pytest.mark.skipif(np.finfo(np.double) == np.finfo(np.longdouble),
+ reason="long double is same as double")
+ @pytest.mark.skipif(platform.machine().startswith("ppc"),
+ reason="IBM double double")
+ def test_int_from_huge_longdouble(self):
+ # Produce a longdouble that would overflow a double,
+ # use exponent that avoids bug in Darwin pow function.
+ exp = np.finfo(np.double).maxexp - 1
+ huge_ld = 2 * 1234 * np.longdouble(2) ** exp
+ huge_i = 2 * 1234 * 2 ** exp
+ assert_(huge_ld != np.inf)
+ assert_equal(int(huge_ld), huge_i)
+
+ def test_int_from_longdouble(self):
+ x = np.longdouble(1.5)
+ assert_equal(int(x), 1)
+ x = np.longdouble(-10.5)
+ assert_equal(int(x), -10)
+
+ def test_numpy_scalar_relational_operators(self):
+ # All integer
+ for dt1 in np.typecodes['AllInteger']:
+ assert_(1 > np.array(0, dtype=dt1)[()], "type %s failed" % (dt1,))
+ assert_(not 1 < np.array(0, dtype=dt1)[()], "type %s failed" % (dt1,))
+
+ for dt2 in np.typecodes['AllInteger']:
+ assert_(np.array(1, dtype=dt1)[()] > np.array(0, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(not np.array(1, dtype=dt1)[()] < np.array(0, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+
+ #Unsigned integers
+ for dt1 in 'BHILQP':
+ assert_(-1 < np.array(1, dtype=dt1)[()], "type %s failed" % (dt1,))
+ assert_(not -1 > np.array(1, dtype=dt1)[()], "type %s failed" % (dt1,))
+ assert_(-1 != np.array(1, dtype=dt1)[()], "type %s failed" % (dt1,))
+
+ #unsigned vs signed
+ for dt2 in 'bhilqp':
+ assert_(np.array(1, dtype=dt1)[()] > np.array(-1, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(not np.array(1, dtype=dt1)[()] < np.array(-1, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(np.array(1, dtype=dt1)[()] != np.array(-1, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+
+ #Signed integers and floats
+ for dt1 in 'bhlqp' + np.typecodes['Float']:
+ assert_(1 > np.array(-1, dtype=dt1)[()], "type %s failed" % (dt1,))
+ assert_(not 1 < np.array(-1, dtype=dt1)[()], "type %s failed" % (dt1,))
+ assert_(-1 == np.array(-1, dtype=dt1)[()], "type %s failed" % (dt1,))
+
+ for dt2 in 'bhlqp' + np.typecodes['Float']:
+ assert_(np.array(1, dtype=dt1)[()] > np.array(-1, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(not np.array(1, dtype=dt1)[()] < np.array(-1, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+ assert_(np.array(-1, dtype=dt1)[()] == np.array(-1, dtype=dt2)[()],
+ "type %s and %s failed" % (dt1, dt2))
+
+ def test_scalar_comparison_to_none(self):
+ # Scalars should just return False and not give a warning.
+ # The comparisons are flagged by pep8, ignore that.
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', FutureWarning)
+ assert_(not np.float32(1) == None)
+ assert_(not np.str_('test') == None)
+ # This is dubious (see below):
+ assert_(not np.datetime64('NaT') == None)
+
+ assert_(np.float32(1) != None)
+ assert_(np.str_('test') != None)
+ # This is dubious (see below):
+ assert_(np.datetime64('NaT') != None)
+ assert_(len(w) == 0)
+
+ # For documentation purposes, this is why the datetime is dubious.
+ # At the time of deprecation this was not a behaviour change, but
+ # it has to be considered when the deprecations are done.
+ assert_(np.equal(np.datetime64('NaT'), None))
+
+
+#class TestRepr(object):
+# def test_repr(self):
+# for t in types:
+# val = t(1197346475.0137341)
+# val_repr = repr(val)
+# val2 = eval(val_repr)
+# assert_equal( val, val2 )
+
+
+class TestRepr(object):
+ def _test_type_repr(self, t):
+ finfo = np.finfo(t)
+ last_fraction_bit_idx = finfo.nexp + finfo.nmant
+ last_exponent_bit_idx = finfo.nexp
+ storage_bytes = np.dtype(t).itemsize*8
+ # could add some more types to the list below
+ for which in ['small denorm', 'small norm']:
+ # Values from https://en.wikipedia.org/wiki/IEEE_754
+ constr = np.array([0x00]*storage_bytes, dtype=np.uint8)
+ if which == 'small denorm':
+ byte = last_fraction_bit_idx // 8
+ bytebit = 7-(last_fraction_bit_idx % 8)
+ constr[byte] = 1 << bytebit
+ elif which == 'small norm':
+ byte = last_exponent_bit_idx // 8
+ bytebit = 7-(last_exponent_bit_idx % 8)
+ constr[byte] = 1 << bytebit
+ else:
+ raise ValueError('hmm')
+ val = constr.view(t)[0]
+ val_repr = repr(val)
+ val2 = t(eval(val_repr))
+ if not (val2 == 0 and val < 1e-100):
+ assert_equal(val, val2)
+
+ def test_float_repr(self):
+ # long double test cannot work, because eval goes through a python
+ # float
+ for t in [np.float32, np.float64]:
+ self._test_type_repr(t)
+
+
+if not IS_PYPY:
+ # sys.getsizeof() is not valid on PyPy
+ class TestSizeOf(object):
+
+ def test_equal_nbytes(self):
+ for type in types:
+ x = type(0)
+ assert_(sys.getsizeof(x) > x.nbytes)
+
+ def test_error(self):
+ d = np.float32()
+ assert_raises(TypeError, d.__sizeof__, "a")
+
+
+class TestMultiply(object):
+ def test_seq_repeat(self):
+ # Test that basic sequences get repeated when multiplied with
+ # numpy integers, and that errors are raised when multiplied with
+ # other scalar types.
+ # Some of this behaviour may be controversial and could be open for
+ # change.
+ accepted_types = set(np.typecodes["AllInteger"])
+ deprecated_types = {'?'}
+ forbidden_types = (
+ set(np.typecodes["All"]) - accepted_types - deprecated_types)
+ forbidden_types -= {'V'} # can't default-construct void scalars
+
+ for seq_type in (list, tuple):
+ seq = seq_type([1, 2, 3])
+ for numpy_type in accepted_types:
+ i = np.dtype(numpy_type).type(2)
+ assert_equal(seq * i, seq * int(i))
+ assert_equal(i * seq, int(i) * seq)
+
+ for numpy_type in deprecated_types:
+ i = np.dtype(numpy_type).type()
+ assert_equal(
+ assert_warns(DeprecationWarning, operator.mul, seq, i),
+ seq * int(i))
+ assert_equal(
+ assert_warns(DeprecationWarning, operator.mul, i, seq),
+ int(i) * seq)
+
+ for numpy_type in forbidden_types:
+ i = np.dtype(numpy_type).type()
+ assert_raises(TypeError, operator.mul, seq, i)
+ assert_raises(TypeError, operator.mul, i, seq)
+
+ def test_no_seq_repeat_basic_array_like(self):
+ # Test that an array-like which does not know how to be multiplied
+ # does not attempt sequence repeat (raise TypeError).
+ # See also gh-7428.
+ class ArrayLike(object):
+ def __init__(self, arr):
+ self.arr = arr
+ def __array__(self):
+ return self.arr
+
+ # Test for simple ArrayLike above and memoryviews (original report)
+ for arr_like in (ArrayLike(np.ones(3)), memoryview(np.ones(3))):
+ assert_array_equal(arr_like * np.float32(3.), np.full(3, 3.))
+ assert_array_equal(np.float32(3.) * arr_like, np.full(3, 3.))
+ assert_array_equal(arr_like * np.int_(3), np.full(3, 3))
+ assert_array_equal(np.int_(3) * arr_like, np.full(3, 3))
+
+
+class TestNegative(object):
+ def test_exceptions(self):
+ a = np.ones((), dtype=np.bool_)[()]
+ assert_raises(TypeError, operator.neg, a)
+
+ def test_result(self):
+ types = np.typecodes['AllInteger'] + np.typecodes['AllFloat']
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning)
+ for dt in types:
+ a = np.ones((), dtype=dt)[()]
+ assert_equal(operator.neg(a) + a, 0)
+
+
+class TestSubtract(object):
+ def test_exceptions(self):
+ a = np.ones((), dtype=np.bool_)[()]
+ assert_raises(TypeError, operator.sub, a, a)
+
+ def test_result(self):
+ types = np.typecodes['AllInteger'] + np.typecodes['AllFloat']
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning)
+ for dt in types:
+ a = np.ones((), dtype=dt)[()]
+ assert_equal(operator.sub(a, a), 0)
+
+
+class TestAbs(object):
+ def _test_abs_func(self, absfunc):
+ for tp in floating_types + complex_floating_types:
+ x = tp(-1.5)
+ assert_equal(absfunc(x), 1.5)
+ x = tp(0.0)
+ res = absfunc(x)
+ # assert_equal() checks zero signedness
+ assert_equal(res, 0.0)
+ x = tp(-0.0)
+ res = absfunc(x)
+ assert_equal(res, 0.0)
+
+ x = tp(np.finfo(tp).max)
+ assert_equal(absfunc(x), x.real)
+
+ x = tp(np.finfo(tp).tiny)
+ assert_equal(absfunc(x), x.real)
+
+ x = tp(np.finfo(tp).min)
+ assert_equal(absfunc(x), -x.real)
+
+ def test_builtin_abs(self):
+ self._test_abs_func(abs)
+
+ def test_numpy_abs(self):
+ self._test_abs_func(np.abs)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_scalarprint.py b/contrib/python/numpy/py2/numpy/core/tests/test_scalarprint.py
new file mode 100644
index 00000000000..cde1355aa15
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_scalarprint.py
@@ -0,0 +1,326 @@
+# -*- coding: utf-8 -*-
+""" Test printing of scalar types.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import code, sys
+import platform
+import pytest
+
+from tempfile import TemporaryFile
+import numpy as np
+from numpy.testing import assert_, assert_equal, suppress_warnings
+
+class TestRealScalars(object):
+ def test_str(self):
+ svals = [0.0, -0.0, 1, -1, np.inf, -np.inf, np.nan]
+ styps = [np.float16, np.float32, np.float64, np.longdouble]
+ wanted = [
+ ['0.0', '0.0', '0.0', '0.0' ],
+ ['-0.0', '-0.0', '-0.0', '-0.0'],
+ ['1.0', '1.0', '1.0', '1.0' ],
+ ['-1.0', '-1.0', '-1.0', '-1.0'],
+ ['inf', 'inf', 'inf', 'inf' ],
+ ['-inf', '-inf', '-inf', '-inf'],
+ ['nan', 'nan', 'nan', 'nan']]
+
+ for wants, val in zip(wanted, svals):
+ for want, styp in zip(wants, styps):
+ msg = 'for str({}({}))'.format(np.dtype(styp).name, repr(val))
+ assert_equal(str(styp(val)), want, err_msg=msg)
+
+ def test_scalar_cutoffs(self):
+ # test that both the str and repr of np.float64 behave
+ # like python floats in python3. Note that in python2
+ # the str has truncated digits, but we do not do this
+ def check(v):
+ # we compare str to repr, to avoid python2 truncation behavior
+ assert_equal(str(np.float64(v)), repr(v))
+ assert_equal(repr(np.float64(v)), repr(v))
+
+ # check we use the same number of significant digits
+ check(1.12345678901234567890)
+ check(0.0112345678901234567890)
+
+ # check switch from scientific output to positional and back
+ check(1e-5)
+ check(1e-4)
+ check(1e15)
+ check(1e16)
+
+ def test_py2_float_print(self):
+ # gh-10753
+ # In python2, the python float type implements an obsolete method
+ # tp_print, which overrides tp_repr and tp_str when using "print" to
+ # output to a "real file" (ie, not a StringIO). Make sure we don't
+ # inherit it.
+ x = np.double(0.1999999999999)
+ with TemporaryFile('r+t') as f:
+ print(x, file=f)
+ f.seek(0)
+ output = f.read()
+ assert_equal(output, str(x) + '\n')
+ # In python2 the value float('0.1999999999999') prints with reduced
+ # precision as '0.2', but we want numpy's np.double('0.1999999999999')
+ # to print the unique value, '0.1999999999999'.
+
+ # gh-11031
+ # Only in the python2 interactive shell and when stdout is a "real"
+ # file, the output of the last command is printed to stdout without
+ # Py_PRINT_RAW (unlike the print statement) so `>>> x` and `>>> print
+ # x` are potentially different. Make sure they are the same. The only
+ # way I found to get prompt-like output is using an actual prompt from
+ # the 'code' module. Again, must use tempfile to get a "real" file.
+
+ # dummy user-input which enters one line and then ctrl-Ds.
+ def userinput():
+ yield 'np.sqrt(2)'
+ raise EOFError
+ gen = userinput()
+ input_func = lambda prompt="": next(gen)
+
+ with TemporaryFile('r+t') as fo, TemporaryFile('r+t') as fe:
+ orig_stdout, orig_stderr = sys.stdout, sys.stderr
+ sys.stdout, sys.stderr = fo, fe
+
+ # py2 code.interact sends irrelevant internal DeprecationWarnings
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning)
+ code.interact(local={'np': np}, readfunc=input_func, banner='')
+
+ sys.stdout, sys.stderr = orig_stdout, orig_stderr
+
+ fo.seek(0)
+ capture = fo.read().strip()
+
+ assert_equal(capture, repr(np.sqrt(2)))
+
+ def test_dragon4(self):
+ # these tests are adapted from Ryan Juckett's dragon4 implementation,
+ # see dragon4.c for details.
+
+ fpos32 = lambda x, **k: np.format_float_positional(np.float32(x), **k)
+ fsci32 = lambda x, **k: np.format_float_scientific(np.float32(x), **k)
+ fpos64 = lambda x, **k: np.format_float_positional(np.float64(x), **k)
+ fsci64 = lambda x, **k: np.format_float_scientific(np.float64(x), **k)
+
+ preckwd = lambda prec: {'unique': False, 'precision': prec}
+
+ assert_equal(fpos32('1.0'), "1.")
+ assert_equal(fsci32('1.0'), "1.e+00")
+ assert_equal(fpos32('10.234'), "10.234")
+ assert_equal(fpos32('-10.234'), "-10.234")
+ assert_equal(fsci32('10.234'), "1.0234e+01")
+ assert_equal(fsci32('-10.234'), "-1.0234e+01")
+ assert_equal(fpos32('1000.0'), "1000.")
+ assert_equal(fpos32('1.0', precision=0), "1.")
+ assert_equal(fsci32('1.0', precision=0), "1.e+00")
+ assert_equal(fpos32('10.234', precision=0), "10.")
+ assert_equal(fpos32('-10.234', precision=0), "-10.")
+ assert_equal(fsci32('10.234', precision=0), "1.e+01")
+ assert_equal(fsci32('-10.234', precision=0), "-1.e+01")
+ assert_equal(fpos32('10.234', precision=2), "10.23")
+ assert_equal(fsci32('-10.234', precision=2), "-1.02e+01")
+ assert_equal(fsci64('9.9999999999999995e-08', **preckwd(16)),
+ '9.9999999999999995e-08')
+ assert_equal(fsci64('9.8813129168249309e-324', **preckwd(16)),
+ '9.8813129168249309e-324')
+ assert_equal(fsci64('9.9999999999999694e-311', **preckwd(16)),
+ '9.9999999999999694e-311')
+
+
+ # test rounding
+ # 3.1415927410 is closest float32 to np.pi
+ assert_equal(fpos32('3.14159265358979323846', **preckwd(10)),
+ "3.1415927410")
+ assert_equal(fsci32('3.14159265358979323846', **preckwd(10)),
+ "3.1415927410e+00")
+ assert_equal(fpos64('3.14159265358979323846', **preckwd(10)),
+ "3.1415926536")
+ assert_equal(fsci64('3.14159265358979323846', **preckwd(10)),
+ "3.1415926536e+00")
+ # 299792448 is closest float32 to 299792458
+ assert_equal(fpos32('299792458.0', **preckwd(5)), "299792448.00000")
+ assert_equal(fsci32('299792458.0', **preckwd(5)), "2.99792e+08")
+ assert_equal(fpos64('299792458.0', **preckwd(5)), "299792458.00000")
+ assert_equal(fsci64('299792458.0', **preckwd(5)), "2.99792e+08")
+
+ assert_equal(fpos32('3.14159265358979323846', **preckwd(25)),
+ "3.1415927410125732421875000")
+ assert_equal(fpos64('3.14159265358979323846', **preckwd(50)),
+ "3.14159265358979311599796346854418516159057617187500")
+ assert_equal(fpos64('3.14159265358979323846'), "3.141592653589793")
+
+
+ # smallest numbers
+ assert_equal(fpos32(0.5**(126 + 23), unique=False, precision=149),
+ "0.00000000000000000000000000000000000000000000140129846432"
+ "4817070923729583289916131280261941876515771757068283889791"
+ "08268586060148663818836212158203125")
+ assert_equal(fpos64(0.5**(1022 + 52), unique=False, precision=1074),
+ "0.00000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000049406564584124654417656"
+ "8792868221372365059802614324764425585682500675507270208751"
+ "8652998363616359923797965646954457177309266567103559397963"
+ "9877479601078187812630071319031140452784581716784898210368"
+ "8718636056998730723050006387409153564984387312473397273169"
+ "6151400317153853980741262385655911710266585566867681870395"
+ "6031062493194527159149245532930545654440112748012970999954"
+ "1931989409080416563324524757147869014726780159355238611550"
+ "1348035264934720193790268107107491703332226844753335720832"
+ "4319360923828934583680601060115061698097530783422773183292"
+ "4790498252473077637592724787465608477820373446969953364701"
+ "7972677717585125660551199131504891101451037862738167250955"
+ "8373897335989936648099411642057026370902792427675445652290"
+ "87538682506419718265533447265625")
+
+ # largest numbers
+ assert_equal(fpos32(np.finfo(np.float32).max, **preckwd(0)),
+ "340282346638528859811704183484516925440.")
+ assert_equal(fpos64(np.finfo(np.float64).max, **preckwd(0)),
+ "1797693134862315708145274237317043567980705675258449965989"
+ "1747680315726078002853876058955863276687817154045895351438"
+ "2464234321326889464182768467546703537516986049910576551282"
+ "0762454900903893289440758685084551339423045832369032229481"
+ "6580855933212334827479782620414472316873817718091929988125"
+ "0404026184124858368.")
+ # Warning: In unique mode only the integer digits necessary for
+ # uniqueness are computed, the rest are 0. Should we change this?
+ assert_equal(fpos32(np.finfo(np.float32).max, precision=0),
+ "340282350000000000000000000000000000000.")
+
+ # test trailing zeros
+ assert_equal(fpos32('1.0', unique=False, precision=3), "1.000")
+ assert_equal(fpos64('1.0', unique=False, precision=3), "1.000")
+ assert_equal(fsci32('1.0', unique=False, precision=3), "1.000e+00")
+ assert_equal(fsci64('1.0', unique=False, precision=3), "1.000e+00")
+ assert_equal(fpos32('1.5', unique=False, precision=3), "1.500")
+ assert_equal(fpos64('1.5', unique=False, precision=3), "1.500")
+ assert_equal(fsci32('1.5', unique=False, precision=3), "1.500e+00")
+ assert_equal(fsci64('1.5', unique=False, precision=3), "1.500e+00")
+ # gh-10713
+ assert_equal(fpos64('324', unique=False, precision=5, fractional=False), "324.00")
+
+ def test_dragon4_interface(self):
+ tps = [np.float16, np.float32, np.float64]
+ if hasattr(np, 'float128'):
+ tps.append(np.float128)
+
+ fpos = np.format_float_positional
+ fsci = np.format_float_scientific
+
+ for tp in tps:
+ # test padding
+ assert_equal(fpos(tp('1.0'), pad_left=4, pad_right=4), " 1. ")
+ assert_equal(fpos(tp('-1.0'), pad_left=4, pad_right=4), " -1. ")
+ assert_equal(fpos(tp('-10.2'),
+ pad_left=4, pad_right=4), " -10.2 ")
+
+ # test exp_digits
+ assert_equal(fsci(tp('1.23e1'), exp_digits=5), "1.23e+00001")
+
+ # test fixed (non-unique) mode
+ assert_equal(fpos(tp('1.0'), unique=False, precision=4), "1.0000")
+ assert_equal(fsci(tp('1.0'), unique=False, precision=4),
+ "1.0000e+00")
+
+ # test trimming
+ # trim of 'k' or '.' only affects non-unique mode, since unique
+ # mode will not output trailing 0s.
+ assert_equal(fpos(tp('1.'), unique=False, precision=4, trim='k'),
+ "1.0000")
+
+ assert_equal(fpos(tp('1.'), unique=False, precision=4, trim='.'),
+ "1.")
+ assert_equal(fpos(tp('1.2'), unique=False, precision=4, trim='.'),
+ "1.2" if tp != np.float16 else "1.2002")
+
+ assert_equal(fpos(tp('1.'), unique=False, precision=4, trim='0'),
+ "1.0")
+ assert_equal(fpos(tp('1.2'), unique=False, precision=4, trim='0'),
+ "1.2" if tp != np.float16 else "1.2002")
+ assert_equal(fpos(tp('1.'), trim='0'), "1.0")
+
+ assert_equal(fpos(tp('1.'), unique=False, precision=4, trim='-'),
+ "1")
+ assert_equal(fpos(tp('1.2'), unique=False, precision=4, trim='-'),
+ "1.2" if tp != np.float16 else "1.2002")
+ assert_equal(fpos(tp('1.'), trim='-'), "1")
+
+ @pytest.mark.skipif(not platform.machine().startswith("ppc64"),
+ reason="only applies to ppc float128 values")
+ def test_ppc64_ibm_double_double128(self):
+ # check that the precision decreases once we get into the subnormal
+ # range. Unlike float64, this starts around 1e-292 instead of 1e-308,
+ # which happens when the first double is normal and the second is
+ # subnormal.
+ x = np.float128('2.123123123123123123123123123123123e-286')
+ got = [str(x/np.float128('2e' + str(i))) for i in range(0,40)]
+ expected = [
+ "1.06156156156156156156156156156157e-286",
+ "1.06156156156156156156156156156158e-287",
+ "1.06156156156156156156156156156159e-288",
+ "1.0615615615615615615615615615616e-289",
+ "1.06156156156156156156156156156157e-290",
+ "1.06156156156156156156156156156156e-291",
+ "1.0615615615615615615615615615616e-292",
+ "1.0615615615615615615615615615615e-293",
+ "1.061561561561561561561561561562e-294",
+ "1.06156156156156156156156156155e-295",
+ "1.0615615615615615615615615616e-296",
+ "1.06156156156156156156156156e-297",
+ "1.06156156156156156156156157e-298",
+ "1.0615615615615615615615616e-299",
+ "1.06156156156156156156156e-300",
+ "1.06156156156156156156155e-301",
+ "1.0615615615615615615616e-302",
+ "1.061561561561561561562e-303",
+ "1.06156156156156156156e-304",
+ "1.0615615615615615618e-305",
+ "1.06156156156156156e-306",
+ "1.06156156156156157e-307",
+ "1.0615615615615616e-308",
+ "1.06156156156156e-309",
+ "1.06156156156157e-310",
+ "1.0615615615616e-311",
+ "1.06156156156e-312",
+ "1.06156156154e-313",
+ "1.0615615616e-314",
+ "1.06156156e-315",
+ "1.06156155e-316",
+ "1.061562e-317",
+ "1.06156e-318",
+ "1.06155e-319",
+ "1.0617e-320",
+ "1.06e-321",
+ "1.04e-322",
+ "1e-323",
+ "0.0",
+ "0.0"]
+ assert_equal(got, expected)
+
+ # Note: we follow glibc behavior, but it (or gcc) might not be right.
+ # In particular we can get two values that print the same but are not
+ # equal:
+ a = np.float128('2')/np.float128('3')
+ b = np.float128(str(a))
+ assert_equal(str(a), str(b))
+ assert_(a != b)
+
+ def test_float32_roundtrip(self):
+ # gh-9360
+ x = np.float32(1024 - 2**-14)
+ y = np.float32(1024 - 2**-13)
+ assert_(repr(x) != repr(y))
+ assert_equal(np.float32(repr(x)), x)
+ assert_equal(np.float32(repr(y)), y)
+
+ def test_float64_vs_python(self):
+ # gh-2643, gh-6136, gh-6908
+ assert_equal(repr(np.float64(0.1)), repr(0.1))
+ assert_(repr(np.float64(0.20000000000000004)) != repr(0.2))
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_shape_base.py b/contrib/python/numpy/py2/numpy/core/tests/test_shape_base.py
new file mode 100644
index 00000000000..b996321c2ee
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_shape_base.py
@@ -0,0 +1,706 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+import sys
+import numpy as np
+from numpy.core import (
+ array, arange, atleast_1d, atleast_2d, atleast_3d, block, vstack, hstack,
+ newaxis, concatenate, stack
+ )
+from numpy.core.shape_base import (_block_dispatcher, _block_setup,
+ _block_concatenate, _block_slicing)
+from numpy.testing import (
+ assert_, assert_raises, assert_array_equal, assert_equal,
+ assert_raises_regex, assert_warns
+ )
+
+from numpy.compat import long
+
+class TestAtleast1d(object):
+ def test_0D_array(self):
+ a = array(1)
+ b = array(2)
+ res = [atleast_1d(a), atleast_1d(b)]
+ desired = [array([1]), array([2])]
+ assert_array_equal(res, desired)
+
+ def test_1D_array(self):
+ a = array([1, 2])
+ b = array([2, 3])
+ res = [atleast_1d(a), atleast_1d(b)]
+ desired = [array([1, 2]), array([2, 3])]
+ assert_array_equal(res, desired)
+
+ def test_2D_array(self):
+ a = array([[1, 2], [1, 2]])
+ b = array([[2, 3], [2, 3]])
+ res = [atleast_1d(a), atleast_1d(b)]
+ desired = [a, b]
+ assert_array_equal(res, desired)
+
+ def test_3D_array(self):
+ a = array([[1, 2], [1, 2]])
+ b = array([[2, 3], [2, 3]])
+ a = array([a, a])
+ b = array([b, b])
+ res = [atleast_1d(a), atleast_1d(b)]
+ desired = [a, b]
+ assert_array_equal(res, desired)
+
+ def test_r1array(self):
+ """ Test to make sure equivalent Travis O's r1array function
+ """
+ assert_(atleast_1d(3).shape == (1,))
+ assert_(atleast_1d(3j).shape == (1,))
+ assert_(atleast_1d(long(3)).shape == (1,))
+ assert_(atleast_1d(3.0).shape == (1,))
+ assert_(atleast_1d([[2, 3], [4, 5]]).shape == (2, 2))
+
+
+class TestAtleast2d(object):
+ def test_0D_array(self):
+ a = array(1)
+ b = array(2)
+ res = [atleast_2d(a), atleast_2d(b)]
+ desired = [array([[1]]), array([[2]])]
+ assert_array_equal(res, desired)
+
+ def test_1D_array(self):
+ a = array([1, 2])
+ b = array([2, 3])
+ res = [atleast_2d(a), atleast_2d(b)]
+ desired = [array([[1, 2]]), array([[2, 3]])]
+ assert_array_equal(res, desired)
+
+ def test_2D_array(self):
+ a = array([[1, 2], [1, 2]])
+ b = array([[2, 3], [2, 3]])
+ res = [atleast_2d(a), atleast_2d(b)]
+ desired = [a, b]
+ assert_array_equal(res, desired)
+
+ def test_3D_array(self):
+ a = array([[1, 2], [1, 2]])
+ b = array([[2, 3], [2, 3]])
+ a = array([a, a])
+ b = array([b, b])
+ res = [atleast_2d(a), atleast_2d(b)]
+ desired = [a, b]
+ assert_array_equal(res, desired)
+
+ def test_r2array(self):
+ """ Test to make sure equivalent Travis O's r2array function
+ """
+ assert_(atleast_2d(3).shape == (1, 1))
+ assert_(atleast_2d([3j, 1]).shape == (1, 2))
+ assert_(atleast_2d([[[3, 1], [4, 5]], [[3, 5], [1, 2]]]).shape == (2, 2, 2))
+
+
+class TestAtleast3d(object):
+ def test_0D_array(self):
+ a = array(1)
+ b = array(2)
+ res = [atleast_3d(a), atleast_3d(b)]
+ desired = [array([[[1]]]), array([[[2]]])]
+ assert_array_equal(res, desired)
+
+ def test_1D_array(self):
+ a = array([1, 2])
+ b = array([2, 3])
+ res = [atleast_3d(a), atleast_3d(b)]
+ desired = [array([[[1], [2]]]), array([[[2], [3]]])]
+ assert_array_equal(res, desired)
+
+ def test_2D_array(self):
+ a = array([[1, 2], [1, 2]])
+ b = array([[2, 3], [2, 3]])
+ res = [atleast_3d(a), atleast_3d(b)]
+ desired = [a[:,:, newaxis], b[:,:, newaxis]]
+ assert_array_equal(res, desired)
+
+ def test_3D_array(self):
+ a = array([[1, 2], [1, 2]])
+ b = array([[2, 3], [2, 3]])
+ a = array([a, a])
+ b = array([b, b])
+ res = [atleast_3d(a), atleast_3d(b)]
+ desired = [a, b]
+ assert_array_equal(res, desired)
+
+
+class TestHstack(object):
+ def test_non_iterable(self):
+ assert_raises(TypeError, hstack, 1)
+
+ def test_empty_input(self):
+ assert_raises(ValueError, hstack, ())
+
+ def test_0D_array(self):
+ a = array(1)
+ b = array(2)
+ res = hstack([a, b])
+ desired = array([1, 2])
+ assert_array_equal(res, desired)
+
+ def test_1D_array(self):
+ a = array([1])
+ b = array([2])
+ res = hstack([a, b])
+ desired = array([1, 2])
+ assert_array_equal(res, desired)
+
+ def test_2D_array(self):
+ a = array([[1], [2]])
+ b = array([[1], [2]])
+ res = hstack([a, b])
+ desired = array([[1, 1], [2, 2]])
+ assert_array_equal(res, desired)
+
+ def test_generator(self):
+ with assert_warns(FutureWarning):
+ hstack((np.arange(3) for _ in range(2)))
+ if sys.version_info.major > 2:
+ # map returns a list on Python 2
+ with assert_warns(FutureWarning):
+ hstack(map(lambda x: x, np.ones((3, 2))))
+
+
+class TestVstack(object):
+ def test_non_iterable(self):
+ assert_raises(TypeError, vstack, 1)
+
+ def test_empty_input(self):
+ assert_raises(ValueError, vstack, ())
+
+ def test_0D_array(self):
+ a = array(1)
+ b = array(2)
+ res = vstack([a, b])
+ desired = array([[1], [2]])
+ assert_array_equal(res, desired)
+
+ def test_1D_array(self):
+ a = array([1])
+ b = array([2])
+ res = vstack([a, b])
+ desired = array([[1], [2]])
+ assert_array_equal(res, desired)
+
+ def test_2D_array(self):
+ a = array([[1], [2]])
+ b = array([[1], [2]])
+ res = vstack([a, b])
+ desired = array([[1], [2], [1], [2]])
+ assert_array_equal(res, desired)
+
+ def test_2D_array2(self):
+ a = array([1, 2])
+ b = array([1, 2])
+ res = vstack([a, b])
+ desired = array([[1, 2], [1, 2]])
+ assert_array_equal(res, desired)
+
+ def test_generator(self):
+ with assert_warns(FutureWarning):
+ vstack((np.arange(3) for _ in range(2)))
+
+
+class TestConcatenate(object):
+ def test_returns_copy(self):
+ a = np.eye(3)
+ b = np.concatenate([a])
+ b[0, 0] = 2
+ assert b[0, 0] != a[0, 0]
+
+ def test_exceptions(self):
+ # test axis must be in bounds
+ for ndim in [1, 2, 3]:
+ a = np.ones((1,)*ndim)
+ np.concatenate((a, a), axis=0) # OK
+ assert_raises(np.AxisError, np.concatenate, (a, a), axis=ndim)
+ assert_raises(np.AxisError, np.concatenate, (a, a), axis=-(ndim + 1))
+
+ # Scalars cannot be concatenated
+ assert_raises(ValueError, concatenate, (0,))
+ assert_raises(ValueError, concatenate, (np.array(0),))
+
+ # test shapes must match except for concatenation axis
+ a = np.ones((1, 2, 3))
+ b = np.ones((2, 2, 3))
+ axis = list(range(3))
+ for i in range(3):
+ np.concatenate((a, b), axis=axis[0]) # OK
+ assert_raises(ValueError, np.concatenate, (a, b), axis=axis[1])
+ assert_raises(ValueError, np.concatenate, (a, b), axis=axis[2])
+ a = np.moveaxis(a, -1, 0)
+ b = np.moveaxis(b, -1, 0)
+ axis.append(axis.pop(0))
+
+ # No arrays to concatenate raises ValueError
+ assert_raises(ValueError, concatenate, ())
+
+ def test_concatenate_axis_None(self):
+ a = np.arange(4, dtype=np.float64).reshape((2, 2))
+ b = list(range(3))
+ c = ['x']
+ r = np.concatenate((a, a), axis=None)
+ assert_equal(r.dtype, a.dtype)
+ assert_equal(r.ndim, 1)
+ r = np.concatenate((a, b), axis=None)
+ assert_equal(r.size, a.size + len(b))
+ assert_equal(r.dtype, a.dtype)
+ r = np.concatenate((a, b, c), axis=None)
+ d = array(['0.0', '1.0', '2.0', '3.0',
+ '0', '1', '2', 'x'])
+ assert_array_equal(r, d)
+
+ out = np.zeros(a.size + len(b))
+ r = np.concatenate((a, b), axis=None)
+ rout = np.concatenate((a, b), axis=None, out=out)
+ assert_(out is rout)
+ assert_equal(r, rout)
+
+ def test_large_concatenate_axis_None(self):
+ # When no axis is given, concatenate uses flattened versions.
+ # This also had a bug with many arrays (see gh-5979).
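+ # (Added illustration, not part of the original test) with axis=None
+ # each input is flattened first, so e.g.
+ # np.concatenate(([1, 2], [[3], [4]]), axis=None) gives
+ # array([1, 2, 3, 4]).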
+ x = np.arange(1, 100)
+ r = np.concatenate(x, None)
+ assert_array_equal(x, r)
+
+ # This should probably be deprecated:
+ r = np.concatenate(x, 100) # axis is >= MAXDIMS
+ assert_array_equal(x, r)
+
+ def test_concatenate(self):
+ # Test concatenate function
+ # One sequence returns unmodified (but as array)
+ r4 = list(range(4))
+ assert_array_equal(concatenate((r4,)), r4)
+ # Any sequence
+ assert_array_equal(concatenate((tuple(r4),)), r4)
+ assert_array_equal(concatenate((array(r4),)), r4)
+ # 1D default concatenation
+ r3 = list(range(3))
+ assert_array_equal(concatenate((r4, r3)), r4 + r3)
+ # Mixed sequence types
+ assert_array_equal(concatenate((tuple(r4), r3)), r4 + r3)
+ assert_array_equal(concatenate((array(r4), r3)), r4 + r3)
+ # Explicit axis specification
+ assert_array_equal(concatenate((r4, r3), 0), r4 + r3)
+ # Including negative
+ assert_array_equal(concatenate((r4, r3), -1), r4 + r3)
+ # 2D
+ a23 = array([[10, 11, 12], [13, 14, 15]])
+ a13 = array([[0, 1, 2]])
+ res = array([[10, 11, 12], [13, 14, 15], [0, 1, 2]])
+ assert_array_equal(concatenate((a23, a13)), res)
+ assert_array_equal(concatenate((a23, a13), 0), res)
+ assert_array_equal(concatenate((a23.T, a13.T), 1), res.T)
+ assert_array_equal(concatenate((a23.T, a13.T), -1), res.T)
+ # Arrays must match shape
+ assert_raises(ValueError, concatenate, (a23.T, a13.T), 0)
+ # 3D
+ res = arange(2 * 3 * 7).reshape((2, 3, 7))
+ a0 = res[..., :4]
+ a1 = res[..., 4:6]
+ a2 = res[..., 6:]
+ assert_array_equal(concatenate((a0, a1, a2), 2), res)
+ assert_array_equal(concatenate((a0, a1, a2), -1), res)
+ assert_array_equal(concatenate((a0.T, a1.T, a2.T), 0), res.T)
+
+ out = res.copy()
+ rout = concatenate((a0, a1, a2), 2, out=out)
+ assert_(out is rout)
+ assert_equal(res, rout)
+
+ def test_bad_out_shape(self):
+ a = array([1, 2])
+ b = array([3, 4])
+
+ assert_raises(ValueError, concatenate, (a, b), out=np.empty(5))
+ assert_raises(ValueError, concatenate, (a, b), out=np.empty((4,1)))
+ assert_raises(ValueError, concatenate, (a, b), out=np.empty((1,4)))
+ concatenate((a, b), out=np.empty(4))
+
+ def test_out_dtype(self):
+ out = np.empty(4, np.float32)
+ res = concatenate((array([1, 2]), array([3, 4])), out=out)
+ assert_(out is res)
+
+ out = np.empty(4, np.complex64)
+ res = concatenate((array([0.1, 0.2]), array([0.3, 0.4])), out=out)
+ assert_(out is res)
+
+ # invalid cast
+ out = np.empty(4, np.int32)
+ assert_raises(TypeError, concatenate,
+ (array([0.1, 0.2]), array([0.3, 0.4])), out=out)
+
+
+def test_stack():
+ # non-iterable input
+ assert_raises(TypeError, stack, 1)
+
+ # 0d input
+ for input_ in [(1, 2, 3),
+ [np.int32(1), np.int32(2), np.int32(3)],
+ [np.array(1), np.array(2), np.array(3)]]:
+ assert_array_equal(stack(input_), [1, 2, 3])
+ # 1d input examples
+ a = np.array([1, 2, 3])
+ b = np.array([4, 5, 6])
+ r1 = array([[1, 2, 3], [4, 5, 6]])
+ assert_array_equal(np.stack((a, b)), r1)
+ assert_array_equal(np.stack((a, b), axis=1), r1.T)
+ # all input types
+ assert_array_equal(np.stack(list([a, b])), r1)
+ assert_array_equal(np.stack(array([a, b])), r1)
+ # all shapes for 1d input
+ arrays = [np.random.randn(3) for _ in range(10)]
+ axes = [0, 1, -1, -2]
+ expected_shapes = [(10, 3), (3, 10), (3, 10), (10, 3)]
+ for axis, expected_shape in zip(axes, expected_shapes):
+ assert_equal(np.stack(arrays, axis).shape, expected_shape)
+ assert_raises_regex(np.AxisError, 'out of bounds', stack, arrays, axis=2)
+ assert_raises_regex(np.AxisError, 'out of bounds', stack, arrays, axis=-3)
+ # all shapes for 2d input
+ arrays = [np.random.randn(3, 4) for _ in range(10)]
+ axes = [0, 1, 2, -1, -2, -3]
+ expected_shapes = [(10, 3, 4), (3, 10, 4), (3, 4, 10),
+ (3, 4, 10), (3, 10, 4), (10, 3, 4)]
+ for axis, expected_shape in zip(axes, expected_shapes):
+ assert_equal(np.stack(arrays, axis).shape, expected_shape)
+ # empty arrays
+ assert_(stack([[], [], []]).shape == (3, 0))
+ assert_(stack([[], [], []], axis=1).shape == (0, 3))
+ # out
+ out = np.zeros_like(r1)
+ np.stack((a, b), out=out)
+ assert_array_equal(out, r1)
+ # edge cases
+ assert_raises_regex(ValueError, 'need at least one array', stack, [])
+ assert_raises_regex(ValueError, 'must have the same shape',
+ stack, [1, np.arange(3)])
+ assert_raises_regex(ValueError, 'must have the same shape',
+ stack, [np.arange(3), 1])
+ assert_raises_regex(ValueError, 'must have the same shape',
+ stack, [np.arange(3), 1], axis=1)
+ assert_raises_regex(ValueError, 'must have the same shape',
+ stack, [np.zeros((3, 3)), np.zeros(3)], axis=1)
+ assert_raises_regex(ValueError, 'must have the same shape',
+ stack, [np.arange(2), np.arange(3)])
+ # generator is deprecated
+ with assert_warns(FutureWarning):
+ result = stack((x for x in range(3)))
+ assert_array_equal(result, np.array([0, 1, 2]))
+
+
+class TestBlock(object):
+ @pytest.fixture(params=['block', 'force_concatenate', 'force_slicing'])
+ def block(self, request):
+ # Blocking small arrays and blocking large arrays go through
+ # different paths; which algorithm is used depends on the number of
+ # element copies required. This fixture forces most tests to exercise
+ # both code paths. Ultimately, it should be removed if a single
+ # algorithm is found to be faster for both small and large arrays.
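+ # Usage note (added): pytest runs each test below once per param of
+ # this fixture, so every test exercises np.block as well as both
+ # forced helpers via its `block` argument.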
+ def _block_force_concatenate(arrays):
+ arrays, list_ndim, result_ndim, _ = _block_setup(arrays)
+ return _block_concatenate(arrays, list_ndim, result_ndim)
+
+ def _block_force_slicing(arrays):
+ arrays, list_ndim, result_ndim, _ = _block_setup(arrays)
+ return _block_slicing(arrays, list_ndim, result_ndim)
+
+ if request.param == 'force_concatenate':
+ return _block_force_concatenate
+ elif request.param == 'force_slicing':
+ return _block_force_slicing
+ elif request.param == 'block':
+ return block
+ else:
+ raise ValueError('Unknown blocking request. There is a typo in the tests.')
+
+ def test_returns_copy(self, block):
+ a = np.eye(3)
+ b = block(a)
+ b[0, 0] = 2
+ assert b[0, 0] != a[0, 0]
+
+ def test_block_total_size_estimate(self, block):
+ _, _, _, total_size = _block_setup([1])
+ assert total_size == 1
+
+ _, _, _, total_size = _block_setup([[1]])
+ assert total_size == 1
+
+ _, _, _, total_size = _block_setup([[1, 1]])
+ assert total_size == 2
+
+ _, _, _, total_size = _block_setup([[1], [1]])
+ assert total_size == 2
+
+ _, _, _, total_size = _block_setup([[1, 2], [3, 4]])
+ assert total_size == 4
+
+ def test_block_simple_row_wise(self, block):
+ a_2d = np.ones((2, 2))
+ b_2d = 2 * a_2d
+ desired = np.array([[1, 1, 2, 2],
+ [1, 1, 2, 2]])
+ result = block([a_2d, b_2d])
+ assert_equal(desired, result)
+
+ def test_block_simple_column_wise(self, block):
+ a_2d = np.ones((2, 2))
+ b_2d = 2 * a_2d
+ expected = np.array([[1, 1],
+ [1, 1],
+ [2, 2],
+ [2, 2]])
+ result = block([[a_2d], [b_2d]])
+ assert_equal(expected, result)
+
+ def test_block_with_1d_arrays_row_wise(self, block):
+ # 1-D vectors are treated as row arrays
+ a = np.array([1, 2, 3])
+ b = np.array([2, 3, 4])
+ expected = np.array([1, 2, 3, 2, 3, 4])
+ result = block([a, b])
+ assert_equal(expected, result)
+
+ def test_block_with_1d_arrays_multiple_rows(self, block):
+ a = np.array([1, 2, 3])
+ b = np.array([2, 3, 4])
+ expected = np.array([[1, 2, 3, 2, 3, 4],
+ [1, 2, 3, 2, 3, 4]])
+ result = block([[a, b], [a, b]])
+ assert_equal(expected, result)
+
+ def test_block_with_1d_arrays_column_wise(self, block):
+ # 1-D vectors are treated as row arrays
+ a_1d = np.array([1, 2, 3])
+ b_1d = np.array([2, 3, 4])
+ expected = np.array([[1, 2, 3],
+ [2, 3, 4]])
+ result = block([[a_1d], [b_1d]])
+ assert_equal(expected, result)
+
+ def test_block_mixed_1d_and_2d(self, block):
+ a_2d = np.ones((2, 2))
+ b_1d = np.array([2, 2])
+ result = block([[a_2d], [b_1d]])
+ expected = np.array([[1, 1],
+ [1, 1],
+ [2, 2]])
+ assert_equal(expected, result)
+
+ def test_block_complicated(self, block):
+ # a bit more complicated
+ one_2d = np.array([[1, 1, 1]])
+ two_2d = np.array([[2, 2, 2]])
+ three_2d = np.array([[3, 3, 3, 3, 3, 3]])
+ four_1d = np.array([4, 4, 4, 4, 4, 4])
+ five_0d = np.array(5)
+ six_1d = np.array([6, 6, 6, 6, 6])
+ zero_2d = np.zeros((2, 6))
+
+ expected = np.array([[1, 1, 1, 2, 2, 2],
+ [3, 3, 3, 3, 3, 3],
+ [4, 4, 4, 4, 4, 4],
+ [5, 6, 6, 6, 6, 6],
+ [0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0]])
+
+ result = block([[one_2d, two_2d],
+ [three_2d],
+ [four_1d],
+ [five_0d, six_1d],
+ [zero_2d]])
+ assert_equal(result, expected)
+
+ def test_nested(self, block):
+ one = np.array([1, 1, 1])
+ two = np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]])
+ three = np.array([3, 3, 3])
+ four = np.array([4, 4, 4])
+ five = np.array(5)
+ six = np.array([6, 6, 6, 6, 6])
+ zero = np.zeros((2, 6))
+
+ result = block([
+ [
+ block([
+ [one],
+ [three],
+ [four]
+ ]),
+ two
+ ],
+ [five, six],
+ [zero]
+ ])
+ expected = np.array([[1, 1, 1, 2, 2, 2],
+ [3, 3, 3, 2, 2, 2],
+ [4, 4, 4, 2, 2, 2],
+ [5, 6, 6, 6, 6, 6],
+ [0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0]])
+
+ assert_equal(result, expected)
+
+ def test_3d(self, block):
+ a000 = np.ones((2, 2, 2), int) * 1
+
+ a100 = np.ones((3, 2, 2), int) * 2
+ a010 = np.ones((2, 3, 2), int) * 3
+ a001 = np.ones((2, 2, 3), int) * 4
+
+ a011 = np.ones((2, 3, 3), int) * 5
+ a101 = np.ones((3, 2, 3), int) * 6
+ a110 = np.ones((3, 3, 2), int) * 7
+
+ a111 = np.ones((3, 3, 3), int) * 8
+
+ result = block([
+ [
+ [a000, a001],
+ [a010, a011],
+ ],
+ [
+ [a100, a101],
+ [a110, a111],
+ ]
+ ])
+ expected = array([[[1, 1, 4, 4, 4],
+ [1, 1, 4, 4, 4],
+ [3, 3, 5, 5, 5],
+ [3, 3, 5, 5, 5],
+ [3, 3, 5, 5, 5]],
+
+ [[1, 1, 4, 4, 4],
+ [1, 1, 4, 4, 4],
+ [3, 3, 5, 5, 5],
+ [3, 3, 5, 5, 5],
+ [3, 3, 5, 5, 5]],
+
+ [[2, 2, 6, 6, 6],
+ [2, 2, 6, 6, 6],
+ [7, 7, 8, 8, 8],
+ [7, 7, 8, 8, 8],
+ [7, 7, 8, 8, 8]],
+
+ [[2, 2, 6, 6, 6],
+ [2, 2, 6, 6, 6],
+ [7, 7, 8, 8, 8],
+ [7, 7, 8, 8, 8],
+ [7, 7, 8, 8, 8]],
+
+ [[2, 2, 6, 6, 6],
+ [2, 2, 6, 6, 6],
+ [7, 7, 8, 8, 8],
+ [7, 7, 8, 8, 8],
+ [7, 7, 8, 8, 8]]])
+
+ assert_array_equal(result, expected)
+
+ def test_block_with_mismatched_shape(self, block):
+ a = np.array([0, 0])
+ b = np.eye(2)
+ assert_raises(ValueError, block, [a, b])
+ assert_raises(ValueError, block, [b, a])
+
+ to_block = [[np.ones((2,3)), np.ones((2,2))],
+ [np.ones((2,2)), np.ones((2,2))]]
+ assert_raises(ValueError, block, to_block)
+
+ def test_no_lists(self, block):
+ assert_equal(block(1), np.array(1))
+ assert_equal(block(np.eye(3)), np.eye(3))
+
+ def test_invalid_nesting(self, block):
+ msg = 'depths are mismatched'
+ assert_raises_regex(ValueError, msg, block, [1, [2]])
+ assert_raises_regex(ValueError, msg, block, [1, []])
+ assert_raises_regex(ValueError, msg, block, [[1], 2])
+ assert_raises_regex(ValueError, msg, block, [[], 2])
+ assert_raises_regex(ValueError, msg, block, [
+ [[1], [2]],
+ [[3, 4]],
+ [5] # missing brackets
+ ])
+
+ def test_empty_lists(self, block):
+ assert_raises_regex(ValueError, 'empty', block, [])
+ assert_raises_regex(ValueError, 'empty', block, [[]])
+ assert_raises_regex(ValueError, 'empty', block, [[1], []])
+
+ def test_tuple(self, block):
+ assert_raises_regex(TypeError, 'tuple', block, ([1, 2], [3, 4]))
+ assert_raises_regex(TypeError, 'tuple', block, [(1, 2), (3, 4)])
+
+ def test_different_ndims(self, block):
+ a = 1.
+ b = 2 * np.ones((1, 2))
+ c = 3 * np.ones((1, 1, 3))
+
+ result = block([a, b, c])
+ expected = np.array([[[1., 2., 2., 3., 3., 3.]]])
+
+ assert_equal(result, expected)
+
+ def test_different_ndims_depths(self, block):
+ a = 1.
+ b = 2 * np.ones((1, 2))
+ c = 3 * np.ones((1, 2, 3))
+
+ result = block([[a, b], [c]])
+ expected = np.array([[[1., 2., 2.],
+ [3., 3., 3.],
+ [3., 3., 3.]]])
+
+ assert_equal(result, expected)
+
+ def test_block_memory_order(self, block):
+ # 3D
+ arr_c = np.zeros((3,)*3, order='C')
+ arr_f = np.zeros((3,)*3, order='F')
+
+ b_c = [[[arr_c, arr_c],
+ [arr_c, arr_c]],
+ [[arr_c, arr_c],
+ [arr_c, arr_c]]]
+
+ b_f = [[[arr_f, arr_f],
+ [arr_f, arr_f]],
+ [[arr_f, arr_f],
+ [arr_f, arr_f]]]
+
+ assert block(b_c).flags['C_CONTIGUOUS']
+ assert block(b_f).flags['F_CONTIGUOUS']
+
+ arr_c = np.zeros((3, 3), order='C')
+ arr_f = np.zeros((3, 3), order='F')
+ # 2D
+ b_c = [[arr_c, arr_c],
+ [arr_c, arr_c]]
+
+ b_f = [[arr_f, arr_f],
+ [arr_f, arr_f]]
+
+ assert block(b_c).flags['C_CONTIGUOUS']
+ assert block(b_f).flags['F_CONTIGUOUS']
+
+
+def test_block_dispatcher():
+ class ArrayLike(object):
+ pass
+ a = ArrayLike()
+ b = ArrayLike()
+ c = ArrayLike()
+ assert_equal(list(_block_dispatcher(a)), [a])
+ assert_equal(list(_block_dispatcher([a])), [a])
+ assert_equal(list(_block_dispatcher([a, b])), [a, b])
+ assert_equal(list(_block_dispatcher([[a], [b, [c]]])), [a, b, c])
+ # don't recurse into non-lists
+ assert_equal(list(_block_dispatcher((a, b))), [(a, b)])
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_ufunc.py b/contrib/python/numpy/py2/numpy/core/tests/test_ufunc.py
new file mode 100644
index 00000000000..b83b8ccffaf
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_ufunc.py
@@ -0,0 +1,1859 @@
+from __future__ import division, absolute_import, print_function
+
+import warnings
+import itertools
+
+import numpy as np
+import numpy.core._umath_tests as umt
+import numpy.linalg._umath_linalg as uml
+import numpy.core._operand_flag_tests as opflag_tests
+import numpy.core._rational_tests as _rational_tests
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_array_equal,
+ assert_almost_equal, assert_array_almost_equal, assert_no_warnings,
+ assert_allclose,
+ )
+from numpy.core.numeric import pickle
+
+
+class TestUfuncKwargs(object):
+ def test_kwarg_exact(self):
+ assert_raises(TypeError, np.add, 1, 2, castingx='safe')
+ assert_raises(TypeError, np.add, 1, 2, dtypex=int)
+ assert_raises(TypeError, np.add, 1, 2, extobjx=[4096])
+ assert_raises(TypeError, np.add, 1, 2, outx=None)
+ assert_raises(TypeError, np.add, 1, 2, sigx='ii->i')
+ assert_raises(TypeError, np.add, 1, 2, signaturex='ii->i')
+ assert_raises(TypeError, np.add, 1, 2, subokx=False)
+ assert_raises(TypeError, np.add, 1, 2, wherex=[True])
+
+ def test_sig_signature(self):
+ assert_raises(ValueError, np.add, 1, 2, sig='ii->i',
+ signature='ii->i')
+
+ def test_sig_dtype(self):
+ assert_raises(RuntimeError, np.add, 1, 2, sig='ii->i',
+ dtype=int)
+ assert_raises(RuntimeError, np.add, 1, 2, signature='ii->i',
+ dtype=int)
+
+ def test_extobj_refcount(self):
+ # Should not segfault with USE_DEBUG.
+ assert_raises(TypeError, np.add, 1, 2, extobj=[4096], parrot=True)
+
+
+class TestUfunc(object):
+ def test_pickle(self):
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ assert_(pickle.loads(pickle.dumps(np.sin,
+ protocol=proto)) is np.sin)
+
+ # Check that ufunc not defined in the top level numpy namespace
+ # such as numpy.core._rational_tests.test_add can also be pickled
+ res = pickle.loads(pickle.dumps(_rational_tests.test_add,
+ protocol=proto))
+ assert_(res is _rational_tests.test_add)
+
+ def test_pickle_withstring(self):
+ astring = (b"cnumpy.core\n_ufunc_reconstruct\np0\n"
+ b"(S'numpy.core.umath'\np1\nS'cos'\np2\ntp3\nRp4\n.")
+ assert_(pickle.loads(astring) is np.cos)
+
+ def test_reduceat_shifting_sum(self):
+ L = 6
+ x = np.arange(L)
+ idx = np.array(list(zip(np.arange(L - 2), np.arange(L - 2) + 2))).ravel()
+ assert_array_equal(np.add.reduceat(x, idx)[::2], [1, 3, 5, 7])
+
+ def test_generic_loops(self):
+ """Test generic loops.
+
+ The loops to be tested are:
+
+ PyUFunc_ff_f_As_dd_d
+ PyUFunc_ff_f
+ PyUFunc_dd_d
+ PyUFunc_gg_g
+ PyUFunc_FF_F_As_DD_D
+ PyUFunc_DD_D
+ PyUFunc_FF_F
+ PyUFunc_GG_G
+ PyUFunc_OO_O
+ PyUFunc_OO_O_method
+ PyUFunc_f_f_As_d_d
+ PyUFunc_d_d
+ PyUFunc_f_f
+ PyUFunc_g_g
+ PyUFunc_F_F_As_D_D
+ PyUFunc_F_F
+ PyUFunc_D_D
+ PyUFunc_G_G
+ PyUFunc_O_O
+ PyUFunc_O_O_method
+ PyUFunc_On_Om
+
+ Where:
+
+ f -- float
+ d -- double
+ g -- long double
+ F -- complex float
+ D -- complex double
+ G -- complex long double
+ O -- python object
+
+ It is difficult to ensure that each of these loops is entered from the
+ Python level, as the special-cased loops are a moving target and the
+ corresponding types are architecture dependent. We probably need to
+ define C-level testing ufuncs to get at them. For the time being, I've
+ just looked at the signatures registered in the build directory to find
+ relevant functions.
+
+ Fixme, currently untested:
+
+ PyUFunc_ff_f_As_dd_d
+ PyUFunc_FF_F_As_DD_D
+ PyUFunc_f_f_As_d_d
+ PyUFunc_F_F_As_D_D
+ PyUFunc_On_Om
+
+ """
+ fone = np.exp
+ ftwo = lambda x, y: x**y
+ fone_val = 1
+ ftwo_val = 1
+ # check unary PyUFunc_f_f.
+ msg = "PyUFunc_f_f"
+ x = np.zeros(10, dtype=np.single)[0::2]
+ assert_almost_equal(fone(x), fone_val, err_msg=msg)
+ # check unary PyUFunc_d_d.
+ msg = "PyUFunc_d_d"
+ x = np.zeros(10, dtype=np.double)[0::2]
+ assert_almost_equal(fone(x), fone_val, err_msg=msg)
+ # check unary PyUFunc_g_g.
+ msg = "PyUFunc_g_g"
+ x = np.zeros(10, dtype=np.longdouble)[0::2]
+ assert_almost_equal(fone(x), fone_val, err_msg=msg)
+ # check unary PyUFunc_F_F.
+ msg = "PyUFunc_F_F"
+ x = np.zeros(10, dtype=np.csingle)[0::2]
+ assert_almost_equal(fone(x), fone_val, err_msg=msg)
+ # check unary PyUFunc_D_D.
+ msg = "PyUFunc_D_D"
+ x = np.zeros(10, dtype=np.cdouble)[0::2]
+ assert_almost_equal(fone(x), fone_val, err_msg=msg)
+ # check unary PyUFunc_G_G.
+ msg = "PyUFunc_G_G"
+ x = np.zeros(10, dtype=np.clongdouble)[0::2]
+ assert_almost_equal(fone(x), fone_val, err_msg=msg)
+
+ # check binary PyUFunc_ff_f.
+ msg = "PyUFunc_ff_f"
+ x = np.ones(10, dtype=np.single)[0::2]
+ assert_almost_equal(ftwo(x, x), ftwo_val, err_msg=msg)
+ # check binary PyUFunc_dd_d.
+ msg = "PyUFunc_dd_d"
+ x = np.ones(10, dtype=np.double)[0::2]
+ assert_almost_equal(ftwo(x, x), ftwo_val, err_msg=msg)
+ # check binary PyUFunc_gg_g.
+ msg = "PyUFunc_gg_g"
+ x = np.ones(10, dtype=np.longdouble)[0::2]
+ assert_almost_equal(ftwo(x, x), ftwo_val, err_msg=msg)
+ # check binary PyUFunc_FF_F.
+ msg = "PyUFunc_FF_F"
+ x = np.ones(10, dtype=np.csingle)[0::2]
+ assert_almost_equal(ftwo(x, x), ftwo_val, err_msg=msg)
+ # check binary PyUFunc_DD_D.
+ msg = "PyUFunc_DD_D"
+ x = np.ones(10, dtype=np.cdouble)[0::2]
+ assert_almost_equal(ftwo(x, x), ftwo_val, err_msg=msg)
+ # check binary PyUFunc_GG_G.
+ msg = "PyUFunc_GG_G"
+ x = np.ones(10, dtype=np.clongdouble)[0::2]
+ assert_almost_equal(ftwo(x, x), ftwo_val, err_msg=msg)
+
+ # class to use in testing object method loops
+ class foo(object):
+ def conjugate(self):
+ return np.bool_(1)
+
+ def logical_xor(self, obj):
+ return np.bool_(1)
+
+ # check unary PyUFunc_O_O
+ msg = "PyUFunc_O_O"
+ x = np.ones(10, dtype=object)[0::2]
+ assert_(np.all(np.abs(x) == 1), msg)
+ # check unary PyUFunc_O_O_method
+ msg = "PyUFunc_O_O_method"
+ x = np.zeros(10, dtype=object)[0::2]
+ for i in range(len(x)):
+ x[i] = foo()
+ assert_(np.all(np.conjugate(x) == True), msg)
+
+ # check binary PyUFunc_OO_O
+ msg = "PyUFunc_OO_O"
+ x = np.ones(10, dtype=object)[0::2]
+ assert_(np.all(np.add(x, x) == 2), msg)
+ # check binary PyUFunc_OO_O_method
+ msg = "PyUFunc_OO_O_method"
+ x = np.zeros(10, dtype=object)[0::2]
+ for i in range(len(x)):
+ x[i] = foo()
+ assert_(np.all(np.logical_xor(x, x)), msg)
+
+ # check PyUFunc_On_Om
+ # fixme -- I don't know how to do this yet
+
+ def test_all_ufunc(self):
+ """Try to check presence and results of all ufuncs.
+
+ The list of ufuncs comes from generate_umath.py and is as follows:
+
+ ===== ==== ============= =============== ========================
+ done args function types notes
+ ===== ==== ============= =============== ========================
+ n 1 conjugate nums + O
+ n 1 absolute nums + O complex -> real
+ n 1 negative nums + O
+ n 1 sign nums + O -> int
+ n 1 invert bool + ints + O flts raise an error
+ n 1 degrees real + M cmplx raise an error
+ n 1 radians real + M cmplx raise an error
+ n 1 arccos flts + M
+ n 1 arccosh flts + M
+ n 1 arcsin flts + M
+ n 1 arcsinh flts + M
+ n 1 arctan flts + M
+ n 1 arctanh flts + M
+ n 1 cos flts + M
+ n 1 sin flts + M
+ n 1 tan flts + M
+ n 1 cosh flts + M
+ n 1 sinh flts + M
+ n 1 tanh flts + M
+ n 1 exp flts + M
+ n 1 expm1 flts + M
+ n 1 log flts + M
+ n 1 log10 flts + M
+ n 1 log1p flts + M
+ n 1 sqrt flts + M real x < 0 raises error
+ n 1 ceil real + M
+ n 1 trunc real + M
+ n 1 floor real + M
+ n 1 fabs real + M
+ n 1 rint flts + M
+ n 1 isnan flts -> bool
+ n 1 isinf flts -> bool
+ n 1 isfinite flts -> bool
+ n 1 signbit real -> bool
+ n 1 modf real -> (frac, int)
+ n 1 logical_not bool + nums + M -> bool
+ n 2 left_shift ints + O flts raise an error
+ n 2 right_shift ints + O flts raise an error
+ n 2 add bool + nums + O boolean + is ||
+ n 2 subtract bool + nums + O boolean - is ^
+ n 2 multiply bool + nums + O boolean * is &
+ n 2 divide nums + O
+ n 2 floor_divide nums + O
+ n 2 true_divide nums + O bBhH -> f, iIlLqQ -> d
+ n 2 fmod nums + M
+ n 2 power nums + O
+ n 2 greater bool + nums + O -> bool
+ n 2 greater_equal bool + nums + O -> bool
+ n 2 less bool + nums + O -> bool
+ n 2 less_equal bool + nums + O -> bool
+ n 2 equal bool + nums + O -> bool
+ n 2 not_equal bool + nums + O -> bool
+ n 2 logical_and bool + nums + M -> bool
+ n 2 logical_or bool + nums + M -> bool
+ n 2 logical_xor bool + nums + M -> bool
+ n 2 maximum bool + nums + O
+ n 2 minimum bool + nums + O
+ n 2 bitwise_and bool + ints + O flts raise an error
+ n 2 bitwise_or bool + ints + O flts raise an error
+ n 2 bitwise_xor bool + ints + O flts raise an error
+ n 2 arctan2 real + M
+ n 2 remainder ints + real + O
+ n 2 hypot real + M
+ ===== ==== ============= =============== ========================
+
+ Types other than those listed will be accepted, but they are cast to
+ the smallest compatible type for which the function is defined. The
+ casting rules are:
+
+ bool -> int8 -> float32
+ ints -> double
+
+ """
+ pass
+
+ # from include/numpy/ufuncobject.h
+ size_inferred = 2
+ can_ignore = 4
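+ # (Added note) these appear to correspond to the
+ # UFUNC_CORE_DIM_SIZE_INFERRED (0x2) and UFUNC_CORE_DIM_CAN_IGNORE
+ # (0x4) bit flags in numpy/core/include/numpy/ufuncobject.h.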
+ def test_signature0(self):
+ # the arguments to test_signature are: nin, nout, core_signature
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 2, 1, "(i),(i)->()")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (1, 1, 0))
+ assert_equal(ixs, (0, 0))
+ assert_equal(flags, (self.size_inferred,))
+ assert_equal(sizes, (-1,))
+
+ def test_signature1(self):
+ # empty core signature; treat as plain ufunc (with trivial core)
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 2, 1, "(),()->()")
+ assert_equal(enabled, 0)
+ assert_equal(num_dims, (0, 0, 0))
+ assert_equal(ixs, ())
+ assert_equal(flags, ())
+ assert_equal(sizes, ())
+
+ def test_signature2(self):
+ # more complicated names for variables
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 2, 1, "(i1,i2),(J_1)->(_kAB)")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (2, 1, 1))
+ assert_equal(ixs, (0, 1, 2, 3))
+ assert_equal(flags, (self.size_inferred,)*4)
+ assert_equal(sizes, (-1, -1, -1, -1))
+
+ def test_signature3(self):
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 2, 1, u"(i1, i12), (J_1)->(i12, i2)")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (2, 1, 2))
+ assert_equal(ixs, (0, 1, 2, 1, 3))
+ assert_equal(flags, (self.size_inferred,)*4)
+ assert_equal(sizes, (-1, -1, -1, -1))
+
+ def test_signature4(self):
+ # matrix_multiply signature from _umath_tests
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 2, 1, "(n,k),(k,m)->(n,m)")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (2, 2, 2))
+ assert_equal(ixs, (0, 1, 1, 2, 0, 2))
+ assert_equal(flags, (self.size_inferred,)*3)
+ assert_equal(sizes, (-1, -1, -1))
+
+ def test_signature5(self):
+ # matmul signature from _umath_tests
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 2, 1, "(n?,k),(k,m?)->(n?,m?)")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (2, 2, 2))
+ assert_equal(ixs, (0, 1, 1, 2, 0, 2))
+ assert_equal(flags, (self.size_inferred | self.can_ignore,
+ self.size_inferred,
+ self.size_inferred | self.can_ignore))
+ assert_equal(sizes, (-1, -1, -1))
+
+ def test_signature6(self):
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 1, 1, "(3)->()")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (1, 0))
+ assert_equal(ixs, (0,))
+ assert_equal(flags, (0,))
+ assert_equal(sizes, (3,))
+
+ def test_signature7(self):
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 3, 1, "(3),(03,3),(n)->(9)")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (1, 2, 1, 1))
+ assert_equal(ixs, (0, 0, 0, 1, 2))
+ assert_equal(flags, (0, self.size_inferred, 0))
+ assert_equal(sizes, (3, -1, 9))
+
+ def test_signature8(self):
+ enabled, num_dims, ixs, flags, sizes = umt.test_signature(
+ 3, 1, "(3?),(3?,3?),(n)->(9)")
+ assert_equal(enabled, 1)
+ assert_equal(num_dims, (1, 2, 1, 1))
+ assert_equal(ixs, (0, 0, 0, 1, 2))
+ assert_equal(flags, (self.can_ignore, self.size_inferred, 0))
+ assert_equal(sizes, (3, -1, 9))
+
+ def test_signature_failure0(self):
+ # In the following calls, a ValueError should be raised because
+ # of an error in the core signature.
+ # FIXME: these should be using assert_raises
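+ # A possible cleanup (sketch only, not applied here) would be:
+ #   assert_raises(ValueError, umt.test_signature, 2, 1, "((i)),(i)->()")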
+
+ # error: extra parenthesis
+ msg = "core_sig: extra parenthesis"
+ try:
+ ret = umt.test_signature(2, 1, "((i)),(i)->()")
+ assert_equal(ret, None, err_msg=msg)
+ except ValueError:
+ pass
+
+ def test_signature_failure1(self):
+ # error: parenthesis matching
+ msg = "core_sig: parenthesis matching"
+ try:
+ ret = umt.test_signature(2, 1, "(i),)i(->()")
+ assert_equal(ret, None, err_msg=msg)
+ except ValueError:
+ pass
+
+ def test_signature_failure2(self):
+ # error: incomplete signature. Letters outside of parentheses are ignored.
+ msg = "core_sig: incomplete signature"
+ try:
+ ret = umt.test_signature(2, 1, "(i),->()")
+ assert_equal(ret, None, err_msg=msg)
+ except ValueError:
+ pass
+
+ def test_signature_failure3(self):
+ # error: incomplete signature. 2 output arguments are specified
+ msg = "core_sig: incomplete signature"
+ try:
+ ret = umt.test_signature(2, 2, "(i),(i)->()")
+ assert_equal(ret, None, err_msg=msg)
+ except ValueError:
+ pass
+
+ def test_get_signature(self):
+ assert_equal(umt.inner1d.signature, "(i),(i)->()")
+
+ def test_forced_sig(self):
+ a = 0.5*np.arange(3, dtype='f8')
+ assert_equal(np.add(a, 0.5), [0.5, 1, 1.5])
+ assert_equal(np.add(a, 0.5, sig='i', casting='unsafe'), [0, 0, 1])
+ assert_equal(np.add(a, 0.5, sig='ii->i', casting='unsafe'), [0, 0, 1])
+ assert_equal(np.add(a, 0.5, sig=('i4',), casting='unsafe'), [0, 0, 1])
+ assert_equal(np.add(a, 0.5, sig=('i4', 'i4', 'i4'),
+ casting='unsafe'), [0, 0, 1])
+
+ b = np.zeros((3,), dtype='f8')
+ np.add(a, 0.5, out=b)
+ assert_equal(b, [0.5, 1, 1.5])
+ b[:] = 0
+ np.add(a, 0.5, sig='i', out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+ b[:] = 0
+ np.add(a, 0.5, sig='ii->i', out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+ b[:] = 0
+ np.add(a, 0.5, sig=('i4',), out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+ b[:] = 0
+ np.add(a, 0.5, sig=('i4', 'i4', 'i4'), out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+
+ def test_true_divide(self):
+ a = np.array(10)
+ b = np.array(20)
+ tgt = np.array(0.5)
+
+ for tc in 'bhilqBHILQefdgFDG':
+ dt = np.dtype(tc)
+ aa = a.astype(dt)
+ bb = b.astype(dt)
+
+ # Check result value and dtype.
+ for x, y in itertools.product([aa, -aa], [bb, -bb]):
+
+ # Check with no output type specified
+ if tc in 'FDG':
+ tgt = complex(x)/complex(y)
+ else:
+ tgt = float(x)/float(y)
+
+ res = np.true_divide(x, y)
+ rtol = max(np.finfo(res).resolution, 1e-15)
+ assert_allclose(res, tgt, rtol=rtol)
+
+ if tc in 'bhilqBHILQ':
+ assert_(res.dtype.name == 'float64')
+ else:
+ assert_(res.dtype.name == dt.name)
+
+ # Check with output type specified. This also checks for the
+ # incorrect casts in issue gh-3484, because the unary '-' does
+ # not change types, even for unsigned types. Hence, casts in the
+ # ufunc from signed to unsigned and vice versa would lead to
+ # errors in the values.
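+ # (Added example) e.g. negating np.uint8(10) wraps around to 246, so
+ # any signed/unsigned mix-up inside the ufunc would show up as a
+ # wildly wrong quotient rather than being silently masked.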
+ for tcout in 'bhilqBHILQ':
+ dtout = np.dtype(tcout)
+ assert_raises(TypeError, np.true_divide, x, y, dtype=dtout)
+
+ for tcout in 'efdg':
+ dtout = np.dtype(tcout)
+ if tc in 'FDG':
+ # Casting complex to float is not allowed
+ assert_raises(TypeError, np.true_divide, x, y, dtype=dtout)
+ else:
+ tgt = float(x)/float(y)
+ rtol = max(np.finfo(dtout).resolution, 1e-15)
+ atol = max(np.finfo(dtout).tiny, 3e-308)
+ # Some test values result in invalid floating-point operations for float16.
+ with np.errstate(invalid='ignore'):
+ res = np.true_divide(x, y, dtype=dtout)
+ if not np.isfinite(res) and tcout == 'e':
+ continue
+ assert_allclose(res, tgt, rtol=rtol, atol=atol)
+ assert_(res.dtype.name == dtout.name)
+
+ for tcout in 'FDG':
+ dtout = np.dtype(tcout)
+ tgt = complex(x)/complex(y)
+ rtol = max(np.finfo(dtout).resolution, 1e-15)
+ atol = max(np.finfo(dtout).tiny, 3e-308)
+ res = np.true_divide(x, y, dtype=dtout)
+ if not np.isfinite(res):
+ continue
+ assert_allclose(res, tgt, rtol=rtol, atol=atol)
+ assert_(res.dtype.name == dtout.name)
+
+ # Check booleans
+ a = np.ones((), dtype=np.bool_)
+ res = np.true_divide(a, a)
+ assert_(res == 1.0)
+ assert_(res.dtype.name == 'float64')
+ res = np.true_divide(~a, a)
+ assert_(res == 0.0)
+ assert_(res.dtype.name == 'float64')
+
+ def test_sum_stability(self):
+ a = np.ones(500, dtype=np.float32)
+ assert_almost_equal((a / 10.).sum() - a.size / 10., 0, 4)
+
+ a = np.ones(500, dtype=np.float64)
+ assert_almost_equal((a / 10.).sum() - a.size / 10., 0, 13)
+
+ def test_sum(self):
+ for dt in (int, np.float16, np.float32, np.float64, np.longdouble):
+ for v in (0, 1, 2, 7, 8, 9, 15, 16, 19, 127,
+ 128, 1024, 1235):
+ tgt = dt(v * (v + 1) / 2)
+ d = np.arange(1, v + 1, dtype=dt)
+
+ # warning if sum overflows, which it does in float16
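+ # (Added note) e.g. for v=1235 the target is 1235*1236/2 = 763230,
+ # which exceeds the float16 maximum (~65504), so the summation
+ # overflows to inf and a RuntimeWarning is expected -- hence the
+ # `1 * overflow` count below.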
+ overflow = not np.isfinite(tgt)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ assert_almost_equal(np.sum(d), tgt)
+ assert_equal(len(w), 1 * overflow)
+
+ assert_almost_equal(np.sum(d[::-1]), tgt)
+ assert_equal(len(w), 2 * overflow)
+
+ d = np.ones(500, dtype=dt)
+ assert_almost_equal(np.sum(d[::2]), 250.)
+ assert_almost_equal(np.sum(d[1::2]), 250.)
+ assert_almost_equal(np.sum(d[::3]), 167.)
+ assert_almost_equal(np.sum(d[1::3]), 167.)
+ assert_almost_equal(np.sum(d[::-2]), 250.)
+ assert_almost_equal(np.sum(d[-1::-2]), 250.)
+ assert_almost_equal(np.sum(d[::-3]), 167.)
+ assert_almost_equal(np.sum(d[-1::-3]), 167.)
+ # sum with first reduction entry != 0
+ d = np.ones((1,), dtype=dt)
+ d += d
+ assert_almost_equal(d, 2.)
+
+ def test_sum_complex(self):
+ for dt in (np.complex64, np.complex128, np.clongdouble):
+ for v in (0, 1, 2, 7, 8, 9, 15, 16, 19, 127,
+ 128, 1024, 1235):
+ tgt = dt(v * (v + 1) / 2) - dt((v * (v + 1) / 2) * 1j)
+ d = np.empty(v, dtype=dt)
+ d.real = np.arange(1, v + 1)
+ d.imag = -np.arange(1, v + 1)
+ assert_almost_equal(np.sum(d), tgt)
+ assert_almost_equal(np.sum(d[::-1]), tgt)
+
+ d = np.ones(500, dtype=dt) + 1j
+ assert_almost_equal(np.sum(d[::2]), 250. + 250j)
+ assert_almost_equal(np.sum(d[1::2]), 250. + 250j)
+ assert_almost_equal(np.sum(d[::3]), 167. + 167j)
+ assert_almost_equal(np.sum(d[1::3]), 167. + 167j)
+ assert_almost_equal(np.sum(d[::-2]), 250. + 250j)
+ assert_almost_equal(np.sum(d[-1::-2]), 250. + 250j)
+ assert_almost_equal(np.sum(d[::-3]), 167. + 167j)
+ assert_almost_equal(np.sum(d[-1::-3]), 167. + 167j)
+ # sum with first reduction entry != 0
+ d = np.ones((1,), dtype=dt) + 1j
+ d += d
+ assert_almost_equal(d, 2. + 2j)
+
+ def test_sum_initial(self):
+ # Integer, single axis
+ assert_equal(np.sum([3], initial=2), 5)
+
+ # Floating point
+ assert_almost_equal(np.sum([0.2], initial=0.1), 0.3)
+
+ # Multiple non-adjacent axes
+ assert_equal(np.sum(np.ones((2, 3, 5), dtype=np.int64), axis=(0, 2), initial=2),
+ [12, 12, 12])
+
+ def test_inner1d(self):
+ a = np.arange(6).reshape((2, 3))
+ assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1))
+ a = np.arange(6)
+ assert_array_equal(umt.inner1d(a, a), np.sum(a*a))
+
+ def test_broadcast(self):
+ msg = "broadcast"
+ a = np.arange(4).reshape((2, 1, 2))
+ b = np.arange(4).reshape((1, 2, 2))
+ assert_array_equal(umt.inner1d(a, b), np.sum(a*b, axis=-1), err_msg=msg)
+ msg = "extend & broadcast loop dimensions"
+ b = np.arange(4).reshape((2, 2))
+ assert_array_equal(umt.inner1d(a, b), np.sum(a*b, axis=-1), err_msg=msg)
+ # Broadcast in core dimensions should fail
+ a = np.arange(8).reshape((4, 2))
+ b = np.arange(4).reshape((4, 1))
+ assert_raises(ValueError, umt.inner1d, a, b)
+ # Extending core dimensions should fail
+ a = np.arange(8).reshape((4, 2))
+ b = np.array(7)
+ assert_raises(ValueError, umt.inner1d, a, b)
+ # Broadcast should fail
+ a = np.arange(2).reshape((2, 1, 1))
+ b = np.arange(3).reshape((3, 1, 1))
+ assert_raises(ValueError, umt.inner1d, a, b)
+
+ def test_type_cast(self):
+ msg = "type cast"
+ a = np.arange(6, dtype='short').reshape((2, 3))
+ assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1),
+ err_msg=msg)
+ msg = "type cast on one argument"
+ a = np.arange(6).reshape((2, 3))
+ b = a + 0.1
+ assert_array_almost_equal(umt.inner1d(a, b), np.sum(a*b, axis=-1),
+ err_msg=msg)
+
+ def test_endian(self):
+ msg = "big endian"
+ a = np.arange(6, dtype='>i4').reshape((2, 3))
+ assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1),
+ err_msg=msg)
+ msg = "little endian"
+ a = np.arange(6, dtype='<i4').reshape((2, 3))
+ assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1),
+ err_msg=msg)
+
+ # Output should always be native-endian
+ Ba = np.arange(1, dtype='>f8')
+ La = np.arange(1, dtype='<f8')
+ assert_equal((Ba+Ba).dtype, np.dtype('f8'))
+ assert_equal((Ba+La).dtype, np.dtype('f8'))
+ assert_equal((La+Ba).dtype, np.dtype('f8'))
+ assert_equal((La+La).dtype, np.dtype('f8'))
+
+ assert_equal(np.absolute(La).dtype, np.dtype('f8'))
+ assert_equal(np.absolute(Ba).dtype, np.dtype('f8'))
+ assert_equal(np.negative(La).dtype, np.dtype('f8'))
+ assert_equal(np.negative(Ba).dtype, np.dtype('f8'))
+
+ def test_incontiguous_array(self):
+ msg = "incontiguous memory layout of array"
+ x = np.arange(64).reshape((2, 2, 2, 2, 2, 2))
+ a = x[:, 0,:, 0,:, 0]
+ b = x[:, 1,:, 1,:, 1]
+ a[0, 0, 0] = -1
+ msg2 = "make sure it references to the original array"
+ assert_equal(x[0, 0, 0, 0, 0, 0], -1, err_msg=msg2)
+ assert_array_equal(umt.inner1d(a, b), np.sum(a*b, axis=-1), err_msg=msg)
+ x = np.arange(24).reshape(2, 3, 4)
+ a = x.T
+ b = x.T
+ a[0, 0, 0] = -1
+ assert_equal(x[0, 0, 0], -1, err_msg=msg2)
+ assert_array_equal(umt.inner1d(a, b), np.sum(a*b, axis=-1), err_msg=msg)
+
+ def test_output_argument(self):
+ msg = "output argument"
+ a = np.arange(12).reshape((2, 3, 2))
+ b = np.arange(4).reshape((2, 1, 2)) + 1
+ c = np.zeros((2, 3), dtype='int')
+ umt.inner1d(a, b, c)
+ assert_array_equal(c, np.sum(a*b, axis=-1), err_msg=msg)
+ c[:] = -1
+ umt.inner1d(a, b, out=c)
+ assert_array_equal(c, np.sum(a*b, axis=-1), err_msg=msg)
+
+ msg = "output argument with type cast"
+ c = np.zeros((2, 3), dtype='int16')
+ umt.inner1d(a, b, c)
+ assert_array_equal(c, np.sum(a*b, axis=-1), err_msg=msg)
+ c[:] = -1
+ umt.inner1d(a, b, out=c)
+ assert_array_equal(c, np.sum(a*b, axis=-1), err_msg=msg)
+
+ msg = "output argument with incontiguous layout"
+ c = np.zeros((2, 3, 4), dtype='int16')
+ umt.inner1d(a, b, c[..., 0])
+ assert_array_equal(c[..., 0], np.sum(a*b, axis=-1), err_msg=msg)
+ c[:] = -1
+ umt.inner1d(a, b, out=c[..., 0])
+ assert_array_equal(c[..., 0], np.sum(a*b, axis=-1), err_msg=msg)
+
+ def test_axes_argument(self):
+ # inner1d signature: '(i),(i)->()'
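+ # (Added background) for a gufunc, axes is a list with one entry per
+ # operand; each entry is a tuple naming that operand's core axes (a
+ # bare int is accepted when there is a single core dimension), and
+ # operands with no core dimensions use an empty tuple.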
+ inner1d = umt.inner1d
+ a = np.arange(27.).reshape((3, 3, 3))
+ b = np.arange(10., 19.).reshape((3, 1, 3))
+ # basic tests on inputs (outputs tested below with matrix_multiply).
+ c = inner1d(a, b)
+ assert_array_equal(c, (a * b).sum(-1))
+ # default
+ c = inner1d(a, b, axes=[(-1,), (-1,), ()])
+ assert_array_equal(c, (a * b).sum(-1))
+ # integers ok for single axis.
+ c = inner1d(a, b, axes=[-1, -1, ()])
+ assert_array_equal(c, (a * b).sum(-1))
+ # mix fine
+ c = inner1d(a, b, axes=[(-1,), -1, ()])
+ assert_array_equal(c, (a * b).sum(-1))
+ # can omit last axis.
+ c = inner1d(a, b, axes=[-1, -1])
+ assert_array_equal(c, (a * b).sum(-1))
+ # can pass in other types of integer (with __index__ protocol)
+ c = inner1d(a, b, axes=[np.int8(-1), np.array(-1, dtype=np.int32)])
+ assert_array_equal(c, (a * b).sum(-1))
+ # swap some axes
+ c = inner1d(a, b, axes=[0, 0])
+ assert_array_equal(c, (a * b).sum(0))
+ c = inner1d(a, b, axes=[0, 2])
+ assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1))
+ # Check errors for improperly constructed axes arguments.
+ # should have list.
+ assert_raises(TypeError, inner1d, a, b, axes=-1)
+ # needs enough elements
+ assert_raises(ValueError, inner1d, a, b, axes=[-1])
+ # should pass in indices.
+ assert_raises(TypeError, inner1d, a, b, axes=[-1.0, -1.0])
+ assert_raises(TypeError, inner1d, a, b, axes=[(-1.0,), -1])
+ assert_raises(TypeError, inner1d, a, b, axes=[None, 1])
+ # cannot pass an index unless there is only one dimension
+ # (output is wrong in this case)
+ assert_raises(TypeError, inner1d, a, b, axes=[-1, -1, -1])
+ # or pass in generally the wrong number of axes
+ assert_raises(ValueError, inner1d, a, b, axes=[-1, -1, (-1,)])
+ assert_raises(ValueError, inner1d, a, b, axes=[-1, (-2, -1), ()])
+ # axes need to have same length.
+ assert_raises(ValueError, inner1d, a, b, axes=[0, 1])
+
+ # matrix_multiply signature: '(m,n),(n,p)->(m,p)'
+ mm = umt.matrix_multiply
+ a = np.arange(12).reshape((2, 3, 2))
+ b = np.arange(8).reshape((2, 2, 2, 1)) + 1
+ # Sanity check.
+ c = mm(a, b)
+ assert_array_equal(c, np.matmul(a, b))
+ # Default axes.
+ c = mm(a, b, axes=[(-2, -1), (-2, -1), (-2, -1)])
+ assert_array_equal(c, np.matmul(a, b))
+ # Default with explicit axes.
+ c = mm(a, b, axes=[(1, 2), (2, 3), (2, 3)])
+ assert_array_equal(c, np.matmul(a, b))
+ # swap some axes.
+ c = mm(a, b, axes=[(0, -1), (1, 2), (-2, -1)])
+ assert_array_equal(c, np.matmul(a.transpose(1, 0, 2),
+ b.transpose(0, 3, 1, 2)))
+ # Default with output array.
+ c = np.empty((2, 2, 3, 1))
+ d = mm(a, b, out=c, axes=[(1, 2), (2, 3), (2, 3)])
+ assert_(c is d)
+ assert_array_equal(c, np.matmul(a, b))
+ # Transposed output array
+ c = np.empty((1, 2, 2, 3))
+ d = mm(a, b, out=c, axes=[(-2, -1), (-2, -1), (3, 0)])
+ assert_(c is d)
+ assert_array_equal(c, np.matmul(a, b).transpose(3, 0, 1, 2))
+ # Check errors for improperly constructed axes arguments.
+ # wrong argument
+ assert_raises(TypeError, mm, a, b, axis=1)
+ # axes should be list
+ assert_raises(TypeError, mm, a, b, axes=1)
+ assert_raises(TypeError, mm, a, b, axes=((-2, -1), (-2, -1), (-2, -1)))
+ # list needs to have right length
+ assert_raises(ValueError, mm, a, b, axes=[])
+ assert_raises(ValueError, mm, a, b, axes=[(-2, -1)])
+ # list should contain tuples for multiple axes
+ assert_raises(TypeError, mm, a, b, axes=[-1, -1, -1])
+ assert_raises(TypeError, mm, a, b, axes=[(-2, -1), (-2, -1), -1])
+ assert_raises(TypeError,
+ mm, a, b, axes=[[-2, -1], [-2, -1], [-2, -1]])
+ assert_raises(TypeError,
+ mm, a, b, axes=[(-2, -1), (-2, -1), [-2, -1]])
+ assert_raises(TypeError, mm, a, b, axes=[(-2, -1), (-2, -1), None])
+ # tuples should not have duplicated values
+ assert_raises(ValueError, mm, a, b, axes=[(-2, -1), (-2, -1), (-2, -2)])
+ # arrays should have enough axes.
+ z = np.zeros((2, 2))
+ assert_raises(ValueError, mm, z, z[0])
+ assert_raises(ValueError, mm, z, z, out=z[:, 0])
+ assert_raises(ValueError, mm, z[1], z, axes=[0, 1])
+ assert_raises(ValueError, mm, z, z, out=z[0], axes=[0, 1])
+ # Regular ufuncs should not accept axes.
+ assert_raises(TypeError, np.add, 1., 1., axes=[0])
+ # should be able to deal with bad unrelated kwargs.
+ assert_raises(TypeError, mm, z, z, axes=[0, 1], parrot=True)
+
+ def test_axis_argument(self):
+ # inner1d signature: '(i),(i)->()'
+ inner1d = umt.inner1d
+ a = np.arange(27.).reshape((3, 3, 3))
+ b = np.arange(10., 19.).reshape((3, 1, 3))
+ c = inner1d(a, b)
+ assert_array_equal(c, (a * b).sum(-1))
+ c = inner1d(a, b, axis=-1)
+ assert_array_equal(c, (a * b).sum(-1))
+ out = np.zeros_like(c)
+ d = inner1d(a, b, axis=-1, out=out)
+ assert_(d is out)
+ assert_array_equal(d, c)
+ c = inner1d(a, b, axis=0)
+ assert_array_equal(c, (a * b).sum(0))
+ # Sanity checks on innerwt and cumsum.
+ a = np.arange(6).reshape((2, 3))
+ b = np.arange(10, 16).reshape((2, 3))
+ w = np.arange(20, 26).reshape((2, 3))
+ assert_array_equal(umt.innerwt(a, b, w, axis=0),
+ np.sum(a * b * w, axis=0))
+ assert_array_equal(umt.cumsum(a, axis=0), np.cumsum(a, axis=0))
+ assert_array_equal(umt.cumsum(a, axis=-1), np.cumsum(a, axis=-1))
+ out = np.empty_like(a)
+ b = umt.cumsum(a, out=out, axis=0)
+ assert_(out is b)
+ assert_array_equal(b, np.cumsum(a, axis=0))
+ b = umt.cumsum(a, out=out, axis=1)
+ assert_(out is b)
+ assert_array_equal(b, np.cumsum(a, axis=-1))
+ # Check errors.
+ # Cannot pass in both axis and axes.
+ assert_raises(TypeError, inner1d, a, b, axis=0, axes=[0, 0])
+ # Not an integer.
+ assert_raises(TypeError, inner1d, a, b, axis=[0])
+ # more than 1 core dimensions.
+ mm = umt.matrix_multiply
+ assert_raises(TypeError, mm, a, b, axis=1)
+ # Output wrong size in axis.
+ out = np.empty((1, 2, 3), dtype=a.dtype)
+ assert_raises(ValueError, umt.cumsum, a, out=out, axis=0)
+ # Regular ufuncs should not accept axis.
+ assert_raises(TypeError, np.add, 1., 1., axis=0)
+
+ def test_keepdims_argument(self):
+ # inner1d signature: '(i),(i)->()'
+ inner1d = umt.inner1d
+ a = np.arange(27.).reshape((3, 3, 3))
+ b = np.arange(10., 19.).reshape((3, 1, 3))
+ c = inner1d(a, b)
+ assert_array_equal(c, (a * b).sum(-1))
+ c = inner1d(a, b, keepdims=False)
+ assert_array_equal(c, (a * b).sum(-1))
+ c = inner1d(a, b, keepdims=True)
+ assert_array_equal(c, (a * b).sum(-1, keepdims=True))
+ out = np.zeros_like(c)
+ d = inner1d(a, b, keepdims=True, out=out)
+ assert_(d is out)
+ assert_array_equal(d, c)
+ # Now combined with axis and axes.
+ c = inner1d(a, b, axis=-1, keepdims=False)
+ assert_array_equal(c, (a * b).sum(-1, keepdims=False))
+ c = inner1d(a, b, axis=-1, keepdims=True)
+ assert_array_equal(c, (a * b).sum(-1, keepdims=True))
+ c = inner1d(a, b, axis=0, keepdims=False)
+ assert_array_equal(c, (a * b).sum(0, keepdims=False))
+ c = inner1d(a, b, axis=0, keepdims=True)
+ assert_array_equal(c, (a * b).sum(0, keepdims=True))
+ c = inner1d(a, b, axes=[(-1,), (-1,), ()], keepdims=False)
+ assert_array_equal(c, (a * b).sum(-1))
+ c = inner1d(a, b, axes=[(-1,), (-1,), (-1,)], keepdims=True)
+ assert_array_equal(c, (a * b).sum(-1, keepdims=True))
+ c = inner1d(a, b, axes=[0, 0], keepdims=False)
+ assert_array_equal(c, (a * b).sum(0))
+ c = inner1d(a, b, axes=[0, 0, 0], keepdims=True)
+ assert_array_equal(c, (a * b).sum(0, keepdims=True))
+ c = inner1d(a, b, axes=[0, 2], keepdims=False)
+ assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1))
+ c = inner1d(a, b, axes=[0, 2], keepdims=True)
+ assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1,
+ keepdims=True))
+ c = inner1d(a, b, axes=[0, 2, 2], keepdims=True)
+ assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1,
+ keepdims=True))
+ c = inner1d(a, b, axes=[0, 2, 0], keepdims=True)
+ assert_array_equal(c, (a * b.transpose(2, 0, 1)).sum(0, keepdims=True))
+ # Hardly useful, but should work.
+ c = inner1d(a, b, axes=[0, 2, 1], keepdims=True)
+ assert_array_equal(c, (a.transpose(1, 0, 2) * b.transpose(0, 2, 1))
+ .sum(1, keepdims=True))
+ # Check with two core dimensions.
+ a = np.eye(3) * np.arange(4.)[:, np.newaxis, np.newaxis]
+ expected = uml.det(a)
+ c = uml.det(a, keepdims=False)
+ assert_array_equal(c, expected)
+ c = uml.det(a, keepdims=True)
+ assert_array_equal(c, expected[:, np.newaxis, np.newaxis])
+ a = np.eye(3) * np.arange(4.)[:, np.newaxis, np.newaxis]
+ expected_s, expected_l = uml.slogdet(a)
+ cs, cl = uml.slogdet(a, keepdims=False)
+ assert_array_equal(cs, expected_s)
+ assert_array_equal(cl, expected_l)
+ cs, cl = uml.slogdet(a, keepdims=True)
+ assert_array_equal(cs, expected_s[:, np.newaxis, np.newaxis])
+ assert_array_equal(cl, expected_l[:, np.newaxis, np.newaxis])
+ # Sanity check on innerwt.
+ a = np.arange(6).reshape((2, 3))
+ b = np.arange(10, 16).reshape((2, 3))
+ w = np.arange(20, 26).reshape((2, 3))
+ assert_array_equal(umt.innerwt(a, b, w, keepdims=True),
+ np.sum(a * b * w, axis=-1, keepdims=True))
+ assert_array_equal(umt.innerwt(a, b, w, axis=0, keepdims=True),
+ np.sum(a * b * w, axis=0, keepdims=True))
+ # Check errors.
+ # Not a boolean
+ assert_raises(TypeError, inner1d, a, b, keepdims='true')
+ # More than 1 core dimension, and core output dimensions.
+ mm = umt.matrix_multiply
+ assert_raises(TypeError, mm, a, b, keepdims=True)
+ assert_raises(TypeError, mm, a, b, keepdims=False)
+ # Regular ufuncs should not accept keepdims.
+ assert_raises(TypeError, np.add, 1., 1., keepdims=False)
+
+ def test_innerwt(self):
+ a = np.arange(6).reshape((2, 3))
+ b = np.arange(10, 16).reshape((2, 3))
+ w = np.arange(20, 26).reshape((2, 3))
+ assert_array_equal(umt.innerwt(a, b, w), np.sum(a*b*w, axis=-1))
+ a = np.arange(100, 124).reshape((2, 3, 4))
+ b = np.arange(200, 224).reshape((2, 3, 4))
+ w = np.arange(300, 324).reshape((2, 3, 4))
+ assert_array_equal(umt.innerwt(a, b, w), np.sum(a*b*w, axis=-1))
+
+ def test_innerwt_empty(self):
+ """Test generalized ufunc with zero-sized operands"""
+ a = np.array([], dtype='f8')
+ b = np.array([], dtype='f8')
+ w = np.array([], dtype='f8')
+ assert_array_equal(umt.innerwt(a, b, w), np.sum(a*b*w, axis=-1))
+
+ def test_cross1d(self):
+ """Test with fixed-sized signature."""
+ a = np.eye(3)
+ assert_array_equal(umt.cross1d(a, a), np.zeros((3, 3)))
+ out = np.zeros((3, 3))
+ result = umt.cross1d(a[0], a, out)
+ assert_(result is out)
+ assert_array_equal(result, np.vstack((np.zeros(3), a[2], -a[1])))
+ assert_raises(ValueError, umt.cross1d, np.eye(4), np.eye(4))
+ assert_raises(ValueError, umt.cross1d, a, np.arange(4.))
+ assert_raises(ValueError, umt.cross1d, a, np.arange(3.), np.zeros((3, 4)))
+
+ def test_can_ignore_signature(self):
+ # Comparing the effects of ? in signature:
+ # matrix_multiply: (m,n),(n,p)->(m,p) # all must be there.
+ # matmul: (m?,n),(n,p?)->(m?,p?) # allow missing m, p.
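+ # (Added illustration) with the '?'-marked signature, passing a 1-D
+ # vector of shape (n,) as the second operand drops the p? dimension,
+ # so matmul(mat, vec) returns shape (..., m) instead of (..., m, 1).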
+ mat = np.arange(12).reshape((2, 3, 2))
+ single_vec = np.arange(2)
+ col_vec = single_vec[:, np.newaxis]
+ col_vec_array = np.arange(8).reshape((2, 2, 2, 1)) + 1
+ # matrix @ single column vector with proper dimension
+ mm_col_vec = umt.matrix_multiply(mat, col_vec)
+ # matmul does the same thing
+ matmul_col_vec = umt.matmul(mat, col_vec)
+ assert_array_equal(matmul_col_vec, mm_col_vec)
+ # matrix @ vector without dimension making it a column vector.
+ # matrix multiply fails -> missing core dim.
+ assert_raises(ValueError, umt.matrix_multiply, mat, single_vec)
+ # matmul mimicker passes, and returns a vector.
+ matmul_col = umt.matmul(mat, single_vec)
+ assert_array_equal(matmul_col, mm_col_vec.squeeze())
+ # Now with a column array: same as for column vector,
+ # broadcasting sensibly.
+ mm_col_vec = umt.matrix_multiply(mat, col_vec_array)
+ matmul_col_vec = umt.matmul(mat, col_vec_array)
+ assert_array_equal(matmul_col_vec, mm_col_vec)
+ # As above, but for row vector
+ single_vec = np.arange(3)
+ row_vec = single_vec[np.newaxis, :]
+ row_vec_array = np.arange(24).reshape((4, 2, 1, 1, 3)) + 1
+ # row vector @ matrix
+ mm_row_vec = umt.matrix_multiply(row_vec, mat)
+ matmul_row_vec = umt.matmul(row_vec, mat)
+ assert_array_equal(matmul_row_vec, mm_row_vec)
+ # single row vector @ matrix
+ assert_raises(ValueError, umt.matrix_multiply, single_vec, mat)
+ matmul_row = umt.matmul(single_vec, mat)
+ assert_array_equal(matmul_row, mm_row_vec.squeeze())
+ # row vector array @ matrix
+ mm_row_vec = umt.matrix_multiply(row_vec_array, mat)
+ matmul_row_vec = umt.matmul(row_vec_array, mat)
+ assert_array_equal(matmul_row_vec, mm_row_vec)
+ # Now for vector combinations
+ # row vector @ column vector
+ col_vec = row_vec.T
+ col_vec_array = row_vec_array.swapaxes(-2, -1)
+ mm_row_col_vec = umt.matrix_multiply(row_vec, col_vec)
+ matmul_row_col_vec = umt.matmul(row_vec, col_vec)
+ assert_array_equal(matmul_row_col_vec, mm_row_col_vec)
+ # single row vector @ single col vector
+ assert_raises(ValueError, umt.matrix_multiply, single_vec, single_vec)
+ matmul_row_col = umt.matmul(single_vec, single_vec)
+ assert_array_equal(matmul_row_col, mm_row_col_vec.squeeze())
+ # row vector array @ column vector array
+ mm_row_col_array = umt.matrix_multiply(row_vec_array, col_vec_array)
+ matmul_row_col_array = umt.matmul(row_vec_array, col_vec_array)
+ assert_array_equal(matmul_row_col_array, mm_row_col_array)
+ # Finally, check that things are *not* squeezed if one gives an
+ # output.
+ out = np.zeros_like(mm_row_col_array)
+ out = umt.matrix_multiply(row_vec_array, col_vec_array, out=out)
+ assert_array_equal(out, mm_row_col_array)
+ out[:] = 0
+ out = umt.matmul(row_vec_array, col_vec_array, out=out)
+ assert_array_equal(out, mm_row_col_array)
+ # And check one cannot put missing dimensions back.
+ out = np.zeros_like(mm_row_col_vec)
+ assert_raises(ValueError, umt.matrix_multiply, single_vec, single_vec,
+ out)
+ # But fine for matmul, since it is just a broadcast.
+ out = umt.matmul(single_vec, single_vec, out)
+ assert_array_equal(out, mm_row_col_vec.squeeze())
+
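+    # Illustrative sketch (not part of the upstream test suite): the '?'
+    # marker in a gufunc signature flags an optional core dimension.  When an
+    # operand lacks it, the dimension is treated as 1 for the computation and
+    # squeezed out of the result again; np.matmul behaves like the umt.matmul
+    # mimicker used above.
+    @staticmethod
+    def _demo_optional_core_dims():
+        mat = np.arange(6).reshape(2, 3)
+        vec = np.arange(3)
+        # (2, 3) @ (3,) -> (2,): the missing core dimension is not in the output
+        assert_array_equal(np.matmul(mat, vec), [5, 14])
+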
+ def test_matrix_multiply(self):
+ self.compare_matrix_multiply_results(np.long)
+ self.compare_matrix_multiply_results(np.double)
+
+ def test_matrix_multiply_umath_empty(self):
+ res = umt.matrix_multiply(np.ones((0, 10)), np.ones((10, 0)))
+ assert_array_equal(res, np.zeros((0, 0)))
+ res = umt.matrix_multiply(np.ones((10, 0)), np.ones((0, 10)))
+ assert_array_equal(res, np.zeros((10, 10)))
+
+ def compare_matrix_multiply_results(self, tp):
+ d1 = np.array(np.random.rand(2, 3, 4), dtype=tp)
+ d2 = np.array(np.random.rand(2, 3, 4), dtype=tp)
+ msg = "matrix multiply on type %s" % d1.dtype.name
+
+ def permute_n(n):
+ if n == 1:
+ return ([0],)
+ ret = ()
+ base = permute_n(n-1)
+ for perm in base:
+ for i in range(n):
+ new = perm + [n-1]
+ new[n-1] = new[i]
+ new[i] = n-1
+ ret += (new,)
+ return ret
+
+ def slice_n(n):
+ if n == 0:
+ return ((),)
+ ret = ()
+ base = slice_n(n-1)
+ for sl in base:
+ ret += (sl+(slice(None),),)
+ ret += (sl+(slice(0, 1),),)
+ return ret
+
+ def broadcastable(s1, s2):
+ return s1 == s2 or s1 == 1 or s2 == 1
+
+ permute_3 = permute_n(3)
+ slice_3 = slice_n(3) + ((slice(None, None, -1),)*3,)
+
+ ref = True
+ for p1 in permute_3:
+ for p2 in permute_3:
+ for s1 in slice_3:
+ for s2 in slice_3:
+ a1 = d1.transpose(p1)[s1]
+ a2 = d2.transpose(p2)[s2]
+ ref = ref and a1.base is not None
+ ref = ref and a2.base is not None
+ if (a1.shape[-1] == a2.shape[-2] and
+ broadcastable(a1.shape[0], a2.shape[0])):
+ assert_array_almost_equal(
+ umt.matrix_multiply(a1, a2),
+ np.sum(a2[..., np.newaxis].swapaxes(-3, -1) *
+ a1[..., np.newaxis,:], axis=-1),
+ err_msg=msg + ' %s %s' % (str(a1.shape),
+ str(a2.shape)))
+
+ assert_equal(ref, True, err_msg="reference check")
+
+ def test_euclidean_pdist(self):
+ a = np.arange(12, dtype=float).reshape(4, 3)
+ out = np.empty((a.shape[0] * (a.shape[0] - 1) // 2,), dtype=a.dtype)
+ umt.euclidean_pdist(a, out)
+ b = np.sqrt(np.sum((a[:, None] - a)**2, axis=-1))
+ b = b[~np.tri(a.shape[0], dtype=bool)]
+ assert_almost_equal(out, b)
+ # An output array is required to determine p with signature (n,d)->(p)
+ assert_raises(ValueError, umt.euclidean_pdist, a)
+
+ def test_cumsum(self):
+ a = np.arange(10)
+ result = umt.cumsum(a)
+ assert_array_equal(result, a.cumsum())
+
+ def test_object_logical(self):
+ a = np.array([3, None, True, False, "test", ""], dtype=object)
+ assert_equal(np.logical_or(a, None),
+ np.array([x or None for x in a], dtype=object))
+ assert_equal(np.logical_or(a, True),
+ np.array([x or True for x in a], dtype=object))
+ assert_equal(np.logical_or(a, 12),
+ np.array([x or 12 for x in a], dtype=object))
+ assert_equal(np.logical_or(a, "blah"),
+ np.array([x or "blah" for x in a], dtype=object))
+
+ assert_equal(np.logical_and(a, None),
+ np.array([x and None for x in a], dtype=object))
+ assert_equal(np.logical_and(a, True),
+ np.array([x and True for x in a], dtype=object))
+ assert_equal(np.logical_and(a, 12),
+ np.array([x and 12 for x in a], dtype=object))
+ assert_equal(np.logical_and(a, "blah"),
+ np.array([x and "blah" for x in a], dtype=object))
+
+ assert_equal(np.logical_not(a),
+ np.array([not x for x in a], dtype=object))
+
+ assert_equal(np.logical_or.reduce(a), 3)
+ assert_equal(np.logical_and.reduce(a), None)
+
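+    # Illustrative sketch (not part of the upstream test suite): for object
+    # arrays the logical ufuncs defer to Python's own `or`/`and`, which return
+    # one of their operands rather than a bool; that is why the reductions
+    # above end in 3 and None instead of True/False.
+    @staticmethod
+    def _demo_python_or_and_semantics():
+        assert_equal(3 or None, 3)       # `or` returns the first truthy operand
+        assert_equal(3 and None, None)   # `and` returns the first falsy operand
+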
+ def test_object_comparison(self):
+ class HasComparisons(object):
+ def __eq__(self, other):
+ return '=='
+
+ arr0d = np.array(HasComparisons())
+ assert_equal(arr0d == arr0d, True)
+ assert_equal(np.equal(arr0d, arr0d), True) # normal behavior is a cast
+ assert_equal(np.equal(arr0d, arr0d, dtype=object), '==')
+
+ arr1d = np.array([HasComparisons()])
+ assert_equal(arr1d == arr1d, np.array([True]))
+ assert_equal(np.equal(arr1d, arr1d), np.array([True])) # normal behavior is a cast
+ assert_equal(np.equal(arr1d, arr1d, dtype=object), np.array(['==']))
+
+ def test_object_array_reduction(self):
+ # Reductions on object arrays
+ a = np.array(['a', 'b', 'c'], dtype=object)
+ assert_equal(np.sum(a), 'abc')
+ assert_equal(np.max(a), 'c')
+ assert_equal(np.min(a), 'a')
+ a = np.array([True, False, True], dtype=object)
+ assert_equal(np.sum(a), 2)
+ assert_equal(np.prod(a), 0)
+ assert_equal(np.any(a), True)
+ assert_equal(np.all(a), False)
+ assert_equal(np.max(a), True)
+ assert_equal(np.min(a), False)
+ assert_equal(np.array([[1]], dtype=object).sum(), 1)
+ assert_equal(np.array([[[1, 2]]], dtype=object).sum((0, 1)), [1, 2])
+ assert_equal(np.array([1], dtype=object).sum(initial=1), 2)
+
+ def test_object_array_accumulate_inplace(self):
+ # Checks that in-place accumulates work, see also gh-7402
+ arr = np.ones(4, dtype=object)
+ arr[:] = [[1] for i in range(4)]
+        # Accumulate twice; the issue was also reproduced with tuples:
+ np.add.accumulate(arr, out=arr)
+ np.add.accumulate(arr, out=arr)
+ assert_array_equal(arr, np.array([[1]*i for i in [1, 3, 6, 10]]))
+
+ # And the same if the axis argument is used
+ arr = np.ones((2, 4), dtype=object)
+ arr[0, :] = [[2] for i in range(4)]
+ np.add.accumulate(arr, out=arr, axis=-1)
+ np.add.accumulate(arr, out=arr, axis=-1)
+ assert_array_equal(arr[0, :], np.array([[2]*i for i in [1, 3, 6, 10]]))
+
+ def test_object_array_reduceat_inplace(self):
+ # Checks that in-place reduceats work, see also gh-7465
+ arr = np.empty(4, dtype=object)
+ arr[:] = [[1] for i in range(4)]
+ out = np.empty(4, dtype=object)
+ out[:] = [[1] for i in range(4)]
+ np.add.reduceat(arr, np.arange(4), out=arr)
+ np.add.reduceat(arr, np.arange(4), out=arr)
+ assert_array_equal(arr, out)
+
+ # And the same if the axis argument is used
+ arr = np.ones((2, 4), dtype=object)
+ arr[0, :] = [[2] for i in range(4)]
+ out = np.ones((2, 4), dtype=object)
+ out[0, :] = [[2] for i in range(4)]
+ np.add.reduceat(arr, np.arange(4), out=arr, axis=-1)
+ np.add.reduceat(arr, np.arange(4), out=arr, axis=-1)
+ assert_array_equal(arr, out)
+
+ def test_zerosize_reduction(self):
+ # Test with default dtype and object dtype
+ for a in [[], np.array([], dtype=object)]:
+ assert_equal(np.sum(a), 0)
+ assert_equal(np.prod(a), 1)
+ assert_equal(np.any(a), False)
+ assert_equal(np.all(a), True)
+ assert_raises(ValueError, np.max, a)
+ assert_raises(ValueError, np.min, a)
+
+ def test_axis_out_of_bounds(self):
+ a = np.array([False, False])
+ assert_raises(np.AxisError, a.all, axis=1)
+ a = np.array([False, False])
+ assert_raises(np.AxisError, a.all, axis=-2)
+
+ a = np.array([False, False])
+ assert_raises(np.AxisError, a.any, axis=1)
+ a = np.array([False, False])
+ assert_raises(np.AxisError, a.any, axis=-2)
+
+ def test_scalar_reduction(self):
+ # The functions 'sum', 'prod', etc allow specifying axis=0
+ # even for scalars
+ assert_equal(np.sum(3, axis=0), 3)
+ assert_equal(np.prod(3.5, axis=0), 3.5)
+ assert_equal(np.any(True, axis=0), True)
+ assert_equal(np.all(False, axis=0), False)
+ assert_equal(np.max(3, axis=0), 3)
+ assert_equal(np.min(2.5, axis=0), 2.5)
+
+ # Check scalar behaviour for ufuncs without an identity
+ assert_equal(np.power.reduce(3), 3)
+
+ # Make sure that scalars are coming out from this operation
+ assert_(type(np.prod(np.float32(2.5), axis=0)) is np.float32)
+ assert_(type(np.sum(np.float32(2.5), axis=0)) is np.float32)
+ assert_(type(np.max(np.float32(2.5), axis=0)) is np.float32)
+ assert_(type(np.min(np.float32(2.5), axis=0)) is np.float32)
+
+ # check if scalars/0-d arrays get cast
+ assert_(type(np.any(0, axis=0)) is np.bool_)
+
+ # assert that 0-d arrays get wrapped
+ class MyArray(np.ndarray):
+ pass
+ a = np.array(1).view(MyArray)
+ assert_(type(np.any(a)) is MyArray)
+
+ def test_casting_out_param(self):
+ # Test that it's possible to do casts on output
+ a = np.ones((200, 100), np.int64)
+ b = np.ones((200, 100), np.int64)
+ c = np.ones((200, 100), np.float64)
+ np.add(a, b, out=c)
+ assert_equal(c, 2)
+
+ a = np.zeros(65536)
+ b = np.zeros(65536, dtype=np.float32)
+ np.subtract(a, 0, out=b)
+ assert_equal(b, 0)
+
+ def test_where_param(self):
+ # Test that the where= ufunc parameter works with regular arrays
+ a = np.arange(7)
+ b = np.ones(7)
+ c = np.zeros(7)
+ np.add(a, b, out=c, where=(a % 2 == 1))
+ assert_equal(c, [0, 2, 0, 4, 0, 6, 0])
+
+ a = np.arange(4).reshape(2, 2) + 2
+ np.power(a, [2, 3], out=a, where=[[0, 1], [1, 0]])
+ assert_equal(a, [[2, 27], [16, 5]])
+ # Broadcasting the where= parameter
+ np.subtract(a, 2, out=a, where=[True, False])
+ assert_equal(a, [[0, 27], [14, 5]])
+
+ def test_where_param_buffer_output(self):
+ # This test is temporarily skipped because it requires
+ # adding masking features to the nditer to work properly
+
+ # With casting on output
+ a = np.ones(10, np.int64)
+ b = np.ones(10, np.int64)
+ c = 1.5 * np.ones(10, np.float64)
+ np.add(a, b, out=c, where=[1, 0, 0, 1, 0, 0, 1, 1, 1, 0])
+ assert_equal(c, [2, 1.5, 1.5, 2, 1.5, 1.5, 2, 2, 2, 1.5])
+
+ def test_where_param_alloc(self):
+ # With casting and allocated output
+ a = np.array([1], dtype=np.int64)
+ m = np.array([True], dtype=bool)
+ assert_equal(np.sqrt(a, where=m), [1])
+
+ # No casting and allocated output
+ a = np.array([1], dtype=np.float64)
+ m = np.array([True], dtype=bool)
+ assert_equal(np.sqrt(a, where=m), [1])
+
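+    # Illustrative sketch (not part of the upstream test suite): where the
+    # mask passed via `where=` is False, the ufunc leaves the corresponding
+    # elements of `out` untouched, so only the selected entries are updated.
+    @staticmethod
+    def _demo_where_leaves_out_untouched():
+        out = np.full(3, -1.)
+        np.add(np.arange(3.), 10., out=out, where=[True, False, True])
+        assert_array_equal(out, [10., -1., 12.])
+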
+ def check_identityless_reduction(self, a):
+ # np.minimum.reduce is an identityless reduction
+
+ # Verify that it sees the zero at various positions
+ a[...] = 1
+ a[1, 0, 0] = 0
+ assert_equal(np.minimum.reduce(a, axis=None), 0)
+ assert_equal(np.minimum.reduce(a, axis=(0, 1)), [0, 1, 1, 1])
+ assert_equal(np.minimum.reduce(a, axis=(0, 2)), [0, 1, 1])
+ assert_equal(np.minimum.reduce(a, axis=(1, 2)), [1, 0])
+ assert_equal(np.minimum.reduce(a, axis=0),
+ [[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=1),
+ [[1, 1, 1, 1], [0, 1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=2),
+ [[1, 1, 1], [0, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=()), a)
+
+ a[...] = 1
+ a[0, 1, 0] = 0
+ assert_equal(np.minimum.reduce(a, axis=None), 0)
+ assert_equal(np.minimum.reduce(a, axis=(0, 1)), [0, 1, 1, 1])
+ assert_equal(np.minimum.reduce(a, axis=(0, 2)), [1, 0, 1])
+ assert_equal(np.minimum.reduce(a, axis=(1, 2)), [0, 1])
+ assert_equal(np.minimum.reduce(a, axis=0),
+ [[1, 1, 1, 1], [0, 1, 1, 1], [1, 1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=1),
+ [[0, 1, 1, 1], [1, 1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=2),
+ [[1, 0, 1], [1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=()), a)
+
+ a[...] = 1
+ a[0, 0, 1] = 0
+ assert_equal(np.minimum.reduce(a, axis=None), 0)
+ assert_equal(np.minimum.reduce(a, axis=(0, 1)), [1, 0, 1, 1])
+ assert_equal(np.minimum.reduce(a, axis=(0, 2)), [0, 1, 1])
+ assert_equal(np.minimum.reduce(a, axis=(1, 2)), [0, 1])
+ assert_equal(np.minimum.reduce(a, axis=0),
+ [[1, 0, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=1),
+ [[1, 0, 1, 1], [1, 1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=2),
+ [[0, 1, 1], [1, 1, 1]])
+ assert_equal(np.minimum.reduce(a, axis=()), a)
+
+ def test_identityless_reduction_corder(self):
+ a = np.empty((2, 3, 4), order='C')
+ self.check_identityless_reduction(a)
+
+ def test_identityless_reduction_forder(self):
+ a = np.empty((2, 3, 4), order='F')
+ self.check_identityless_reduction(a)
+
+ def test_identityless_reduction_otherorder(self):
+ a = np.empty((2, 4, 3), order='C').swapaxes(1, 2)
+ self.check_identityless_reduction(a)
+
+ def test_identityless_reduction_noncontig(self):
+ a = np.empty((3, 5, 4), order='C').swapaxes(1, 2)
+ a = a[1:, 1:, 1:]
+ self.check_identityless_reduction(a)
+
+ def test_identityless_reduction_noncontig_unaligned(self):
+ a = np.empty((3*4*5*8 + 1,), dtype='i1')
+ a = a[1:].view(dtype='f8')
+ a.shape = (3, 4, 5)
+ a = a[1:, 1:, 1:]
+ self.check_identityless_reduction(a)
+
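+    # Illustrative sketch (not part of the upstream test suite): `axis` may be
+    # a tuple, in which case all listed axes are reduced in one call, and the
+    # empty tuple reduces over nothing at all, leaving the values unchanged,
+    # which the axis=() checks above rely on.
+    @staticmethod
+    def _demo_axis_tuple_and_empty_axis():
+        a = np.arange(6).reshape(2, 3)
+        assert_equal(np.add.reduce(a, axis=(0, 1)), 15)
+        assert_array_equal(np.add.reduce(a, axis=()), a)
+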
+ def test_initial_reduction(self):
+ # np.minimum.reduce is an identityless reduction
+
+ # For cases like np.maximum(np.abs(...), initial=0)
+ # More generally, a supremum over non-negative numbers.
+ assert_equal(np.maximum.reduce([], initial=0), 0)
+
+ # For cases like reduction of an empty array over the reals.
+ assert_equal(np.minimum.reduce([], initial=np.inf), np.inf)
+ assert_equal(np.maximum.reduce([], initial=-np.inf), -np.inf)
+
+ # Random tests
+ assert_equal(np.minimum.reduce([5], initial=4), 4)
+ assert_equal(np.maximum.reduce([4], initial=5), 5)
+ assert_equal(np.maximum.reduce([5], initial=4), 5)
+ assert_equal(np.minimum.reduce([4], initial=5), 4)
+
+ # Check initial=None raises ValueError for both types of ufunc reductions
+ assert_raises(ValueError, np.minimum.reduce, [], initial=None)
+ assert_raises(ValueError, np.add.reduce, [], initial=None)
+
+ # Check that np._NoValue gives default behavior.
+ assert_equal(np.add.reduce([], initial=np._NoValue), 0)
+
+ # Check that initial kwarg behaves as intended for dtype=object
+ a = np.array([10], dtype=object)
+ res = np.add.reduce(a, initial=5)
+ assert_equal(res, 15)
+
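+    # Illustrative sketch (not part of the upstream test suite): `initial`
+    # acts as the starting value of the fold, so it seeds a non-empty
+    # reduction and defines the result of an empty one.
+    @staticmethod
+    def _demo_initial_seeds_the_fold():
+        assert_equal(np.add.reduce([1, 2, 3], initial=10), 16)
+        assert_equal(np.minimum.reduce([], initial=np.inf), np.inf)
+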
+ def test_identityless_reduction_nonreorderable(self):
+ a = np.array([[8.0, 2.0, 2.0], [1.0, 0.5, 0.25]])
+
+ res = np.divide.reduce(a, axis=0)
+ assert_equal(res, [8.0, 4.0, 8.0])
+
+ res = np.divide.reduce(a, axis=1)
+ assert_equal(res, [2.0, 8.0])
+
+ res = np.divide.reduce(a, axis=())
+ assert_equal(res, a)
+
+ assert_raises(ValueError, np.divide.reduce, a, axis=(0, 1))
+
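+    # Illustrative sketch (not part of the upstream test suite): np.divide has
+    # no identity and is not reorderable, so its reduction folds strictly left
+    # to right along a single axis; combining axes would change the order of
+    # operations, hence the ValueError above.
+    @staticmethod
+    def _demo_left_to_right_fold():
+        assert_equal(np.divide.reduce([8., 4., 2.]), 1.)  # (8 / 4) / 2
+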
+ def test_reduce_zero_axis(self):
+ # If we have a n x m array and do a reduction with axis=1, then we are
+ # doing n reductions, and each reduction takes an m-element array. For
+        # a reduction operation without an identity:
+ # n > 0, m > 0: fine
+ # n = 0, m > 0: fine, doing 0 reductions of m-element arrays
+ # n > 0, m = 0: can't reduce a 0-element array, ValueError
+ # n = 0, m = 0: can't reduce a 0-element array, ValueError (for
+ # consistency with the above case)
+ # This test doesn't actually look at return values, it just checks to
+        # make sure that we get an error in exactly those cases where we
+ # expect one, and assumes the calculations themselves are done
+ # correctly.
+
+ def ok(f, *args, **kwargs):
+ f(*args, **kwargs)
+
+ def err(f, *args, **kwargs):
+ assert_raises(ValueError, f, *args, **kwargs)
+
+ def t(expect, func, n, m):
+ expect(func, np.zeros((n, m)), axis=1)
+ expect(func, np.zeros((m, n)), axis=0)
+ expect(func, np.zeros((n // 2, n // 2, m)), axis=2)
+ expect(func, np.zeros((n // 2, m, n // 2)), axis=1)
+ expect(func, np.zeros((n, m // 2, m // 2)), axis=(1, 2))
+ expect(func, np.zeros((m // 2, n, m // 2)), axis=(0, 2))
+ expect(func, np.zeros((m // 3, m // 3, m // 3,
+ n // 2, n // 2)),
+ axis=(0, 1, 2))
+ # Check what happens if the inner (resp. outer) dimensions are a
+ # mix of zero and non-zero:
+ expect(func, np.zeros((10, m, n)), axis=(0, 1))
+ expect(func, np.zeros((10, n, m)), axis=(0, 2))
+ expect(func, np.zeros((m, 10, n)), axis=0)
+ expect(func, np.zeros((10, m, n)), axis=1)
+ expect(func, np.zeros((10, n, m)), axis=2)
+
+ # np.maximum is just an arbitrary ufunc with no reduction identity
+ assert_equal(np.maximum.identity, None)
+ t(ok, np.maximum.reduce, 30, 30)
+ t(ok, np.maximum.reduce, 0, 30)
+ t(err, np.maximum.reduce, 30, 0)
+ t(err, np.maximum.reduce, 0, 0)
+ err(np.maximum.reduce, [])
+ np.maximum.reduce(np.zeros((0, 0)), axis=())
+
+ # all of the combinations are fine for a reduction that has an
+ # identity
+ t(ok, np.add.reduce, 30, 30)
+ t(ok, np.add.reduce, 0, 30)
+ t(ok, np.add.reduce, 30, 0)
+ t(ok, np.add.reduce, 0, 0)
+ np.add.reduce([])
+ np.add.reduce(np.zeros((0, 0)), axis=())
+
+ # OTOH, accumulate always makes sense for any combination of n and m,
+ # because it maps an m-element array to an m-element array. These
+ # tests are simpler because accumulate doesn't accept multiple axes.
+ for uf in (np.maximum, np.add):
+ uf.accumulate(np.zeros((30, 0)), axis=0)
+ uf.accumulate(np.zeros((0, 30)), axis=0)
+ uf.accumulate(np.zeros((30, 30)), axis=0)
+ uf.accumulate(np.zeros((0, 0)), axis=0)
+
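+    # Illustrative sketch (not part of the upstream test suite): reducing an
+    # identityless ufunc over a length-0 axis has no value to return and
+    # raises, while a ufunc with an identity simply returns that identity.
+    @staticmethod
+    def _demo_empty_axis_reduction():
+        empty = np.zeros((3, 0))
+        assert_raises(ValueError, np.maximum.reduce, empty, axis=1)
+        assert_array_equal(np.add.reduce(empty, axis=1), np.zeros(3))
+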
+ def test_safe_casting(self):
+ # In old versions of numpy, in-place operations used the 'unsafe'
+ # casting rules. In versions >= 1.10, 'same_kind' is the
+        # default and an exception is raised instead of a warning
+        # when 'same_kind' is not satisfied.
+ a = np.array([1, 2, 3], dtype=int)
+ # Non-in-place addition is fine
+ assert_array_equal(assert_no_warnings(np.add, a, 1.1),
+ [2.1, 3.1, 4.1])
+ assert_raises(TypeError, np.add, a, 1.1, out=a)
+
+ def add_inplace(a, b):
+ a += b
+
+ assert_raises(TypeError, add_inplace, a, 1.1)
+ # Make sure that explicitly overriding the exception is allowed:
+ assert_no_warnings(np.add, a, 1.1, out=a, casting="unsafe")
+ assert_array_equal(a, [2, 3, 4])
+
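+    # Illustrative sketch (not part of the upstream test suite): the in-place
+    # failure above comes from the 'same_kind' casting rule; float64 -> int64
+    # is only permitted under 'unsafe' casting.
+    @staticmethod
+    def _demo_same_kind_casting_rule():
+        assert_(not np.can_cast(np.float64, np.int64, casting='same_kind'))
+        assert_(np.can_cast(np.float64, np.int64, casting='unsafe'))
+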
+ def test_ufunc_custom_out(self):
+ # Test ufunc with built in input types and custom output type
+
+ a = np.array([0, 1, 2], dtype='i8')
+ b = np.array([0, 1, 2], dtype='i8')
+ c = np.empty(3, dtype=_rational_tests.rational)
+
+ # Output must be specified so numpy knows what
+ # ufunc signature to look for
+ result = _rational_tests.test_add(a, b, c)
+ target = np.array([0, 2, 4], dtype=_rational_tests.rational)
+ assert_equal(result, target)
+
+        # with no output argument, a TypeError should be raised
+ with assert_raises(TypeError):
+ _rational_tests.test_add(a, b)
+
+ def test_operand_flags(self):
+ a = np.arange(16, dtype='l').reshape(4, 4)
+ b = np.arange(9, dtype='l').reshape(3, 3)
+ opflag_tests.inplace_add(a[:-1, :-1], b)
+ assert_equal(a, np.array([[0, 2, 4, 3], [7, 9, 11, 7],
+ [14, 16, 18, 11], [12, 13, 14, 15]], dtype='l'))
+
+ a = np.array(0)
+ opflag_tests.inplace_add(a, 3)
+ assert_equal(a, 3)
+ opflag_tests.inplace_add(a, [3, 4])
+ assert_equal(a, 10)
+
+ def test_struct_ufunc(self):
+ import numpy.core._struct_ufunc_tests as struct_ufunc
+
+ a = np.array([(1, 2, 3)], dtype='u8,u8,u8')
+ b = np.array([(1, 2, 3)], dtype='u8,u8,u8')
+
+ result = struct_ufunc.add_triplet(a, b)
+ assert_equal(result, np.array([(2, 4, 6)], dtype='u8,u8,u8'))
+
+ def test_custom_ufunc(self):
+ a = np.array(
+ [_rational_tests.rational(1, 2),
+ _rational_tests.rational(1, 3),
+ _rational_tests.rational(1, 4)],
+ dtype=_rational_tests.rational)
+ b = np.array(
+ [_rational_tests.rational(1, 2),
+ _rational_tests.rational(1, 3),
+ _rational_tests.rational(1, 4)],
+ dtype=_rational_tests.rational)
+
+ result = _rational_tests.test_add_rationals(a, b)
+ expected = np.array(
+ [_rational_tests.rational(1),
+ _rational_tests.rational(2, 3),
+ _rational_tests.rational(1, 2)],
+ dtype=_rational_tests.rational)
+ assert_equal(result, expected)
+
+ def test_custom_ufunc_forced_sig(self):
+ # gh-9351 - looking for a non-first userloop would previously hang
+ with assert_raises(TypeError):
+ np.multiply(_rational_tests.rational(1), 1,
+ signature=(_rational_tests.rational, int, None))
+
+ def test_custom_array_like(self):
+
+ class MyThing(object):
+ __array_priority__ = 1000
+
+ rmul_count = 0
+ getitem_count = 0
+
+ def __init__(self, shape):
+ self.shape = shape
+
+ def __len__(self):
+ return self.shape[0]
+
+ def __getitem__(self, i):
+ MyThing.getitem_count += 1
+ if not isinstance(i, tuple):
+ i = (i,)
+ if len(i) > self.ndim:
+ raise IndexError("boo")
+
+ return MyThing(self.shape[len(i):])
+
+ def __rmul__(self, other):
+ MyThing.rmul_count += 1
+ return self
+
+ np.float64(5)*MyThing((3, 3))
+ assert_(MyThing.rmul_count == 1, MyThing.rmul_count)
+ assert_(MyThing.getitem_count <= 2, MyThing.getitem_count)
+
+ def test_inplace_fancy_indexing(self):
+
+ a = np.arange(10)
+ np.add.at(a, [2, 5, 2], 1)
+ assert_equal(a, [0, 1, 4, 3, 4, 6, 6, 7, 8, 9])
+
+ a = np.arange(10)
+ b = np.array([100, 100, 100])
+ np.add.at(a, [2, 5, 2], b)
+ assert_equal(a, [0, 1, 202, 3, 4, 105, 6, 7, 8, 9])
+
+ a = np.arange(9).reshape(3, 3)
+ b = np.array([[100, 100, 100], [200, 200, 200], [300, 300, 300]])
+ np.add.at(a, (slice(None), [1, 2, 1]), b)
+ assert_equal(a, [[0, 201, 102], [3, 404, 205], [6, 607, 308]])
+
+ a = np.arange(27).reshape(3, 3, 3)
+ b = np.array([100, 200, 300])
+ np.add.at(a, (slice(None), slice(None), [1, 2, 1]), b)
+ assert_equal(a,
+ [[[0, 401, 202],
+ [3, 404, 205],
+ [6, 407, 208]],
+
+ [[9, 410, 211],
+ [12, 413, 214],
+ [15, 416, 217]],
+
+ [[18, 419, 220],
+ [21, 422, 223],
+ [24, 425, 226]]])
+
+ a = np.arange(9).reshape(3, 3)
+ b = np.array([[100, 100, 100], [200, 200, 200], [300, 300, 300]])
+ np.add.at(a, ([1, 2, 1], slice(None)), b)
+ assert_equal(a, [[0, 1, 2], [403, 404, 405], [206, 207, 208]])
+
+ a = np.arange(27).reshape(3, 3, 3)
+ b = np.array([100, 200, 300])
+ np.add.at(a, (slice(None), [1, 2, 1], slice(None)), b)
+ assert_equal(a,
+ [[[0, 1, 2],
+ [203, 404, 605],
+ [106, 207, 308]],
+
+ [[9, 10, 11],
+ [212, 413, 614],
+ [115, 216, 317]],
+
+ [[18, 19, 20],
+ [221, 422, 623],
+ [124, 225, 326]]])
+
+ a = np.arange(9).reshape(3, 3)
+ b = np.array([100, 200, 300])
+ np.add.at(a, (0, [1, 2, 1]), b)
+ assert_equal(a, [[0, 401, 202], [3, 4, 5], [6, 7, 8]])
+
+ a = np.arange(27).reshape(3, 3, 3)
+ b = np.array([100, 200, 300])
+ np.add.at(a, ([1, 2, 1], 0, slice(None)), b)
+ assert_equal(a,
+ [[[0, 1, 2],
+ [3, 4, 5],
+ [6, 7, 8]],
+
+ [[209, 410, 611],
+ [12, 13, 14],
+ [15, 16, 17]],
+
+ [[118, 219, 320],
+ [21, 22, 23],
+ [24, 25, 26]]])
+
+ a = np.arange(27).reshape(3, 3, 3)
+ b = np.array([100, 200, 300])
+ np.add.at(a, (slice(None), slice(None), slice(None)), b)
+ assert_equal(a,
+ [[[100, 201, 302],
+ [103, 204, 305],
+ [106, 207, 308]],
+
+ [[109, 210, 311],
+ [112, 213, 314],
+ [115, 216, 317]],
+
+ [[118, 219, 320],
+ [121, 222, 323],
+ [124, 225, 326]]])
+
+ a = np.arange(10)
+ np.negative.at(a, [2, 5, 2])
+ assert_equal(a, [0, 1, 2, 3, 4, -5, 6, 7, 8, 9])
+
+ # Test 0-dim array
+ a = np.array(0)
+ np.add.at(a, (), 1)
+ assert_equal(a, 1)
+
+ assert_raises(IndexError, np.add.at, a, 0, 1)
+ assert_raises(IndexError, np.add.at, a, [], 1)
+
+ # Test mixed dtypes
+ a = np.arange(10)
+ np.power.at(a, [1, 2, 3, 2], 3.5)
+ assert_equal(a, np.array([0, 1, 4414, 46, 4, 5, 6, 7, 8, 9]))
+
+ # Test boolean indexing and boolean ufuncs
+ a = np.arange(10)
+ index = a % 2 == 0
+ np.equal.at(a, index, [0, 2, 4, 6, 8])
+ assert_equal(a, [1, 1, 1, 3, 1, 5, 1, 7, 1, 9])
+
+ # Test unary operator
+ a = np.arange(10, dtype='u4')
+ np.invert.at(a, [2, 5, 2])
+ assert_equal(a, [0, 1, 2, 3, 4, 5 ^ 0xffffffff, 6, 7, 8, 9])
+
+ # Test empty subspace
+ orig = np.arange(4)
+ a = orig[:, None][:, 0:0]
+ np.add.at(a, [0, 1], 3)
+ assert_array_equal(orig, np.arange(4))
+
+ # Test with swapped byte order
+ index = np.array([1, 2, 1], np.dtype('i').newbyteorder())
+ values = np.array([1, 2, 3, 4], np.dtype('f').newbyteorder())
+ np.add.at(values, index, 3)
+ assert_array_equal(values, [1, 8, 6, 4])
+
+ # Test exception thrown
+ values = np.array(['a', 1], dtype=object)
+ assert_raises(TypeError, np.add.at, values, [0, 1], 1)
+ assert_array_equal(values, np.array(['a', 1], dtype=object))
+
+ # Test multiple output ufuncs raise error, gh-5665
+ assert_raises(ValueError, np.modf.at, np.arange(10), [1])
+
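+    # Illustrative sketch (not part of the upstream test suite): ufunc.at
+    # applies the operation unbuffered, so repeated indices accumulate,
+    # unlike ordinary fancy in-place assignment where the repeat is lost to
+    # buffering.
+    @staticmethod
+    def _demo_at_vs_fancy_assignment():
+        a = np.zeros(3)
+        a[[0, 0]] += 1.             # buffered: only one increment survives
+        assert_equal(a[0], 1.)
+        b = np.zeros(3)
+        np.add.at(b, [0, 0], 1.)    # unbuffered: both increments land
+        assert_equal(b[0], 2.)
+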
+ def test_reduce_arguments(self):
+ f = np.add.reduce
+ d = np.ones((5,2), dtype=int)
+ o = np.ones((2,), dtype=d.dtype)
+ r = o * 5
+ assert_equal(f(d), r)
+ # a, axis=0, dtype=None, out=None, keepdims=False
+ assert_equal(f(d, axis=0), r)
+ assert_equal(f(d, 0), r)
+ assert_equal(f(d, 0, dtype=None), r)
+ assert_equal(f(d, 0, dtype='i'), r)
+ assert_equal(f(d, 0, 'i'), r)
+ assert_equal(f(d, 0, None), r)
+ assert_equal(f(d, 0, None, out=None), r)
+ assert_equal(f(d, 0, None, out=o), r)
+ assert_equal(f(d, 0, None, o), r)
+ assert_equal(f(d, 0, None, None), r)
+ assert_equal(f(d, 0, None, None, keepdims=False), r)
+ assert_equal(f(d, 0, None, None, True), r.reshape((1,) + r.shape))
+ assert_equal(f(d, 0, None, None, False, 0), r)
+ assert_equal(f(d, 0, None, None, False, initial=0), r)
+ # multiple keywords
+ assert_equal(f(d, axis=0, dtype=None, out=None, keepdims=False), r)
+ assert_equal(f(d, 0, dtype=None, out=None, keepdims=False), r)
+ assert_equal(f(d, 0, None, out=None, keepdims=False), r)
+ assert_equal(f(d, 0, None, out=None, keepdims=False, initial=0), r)
+
+ # too little
+ assert_raises(TypeError, f)
+ # too much
+ assert_raises(TypeError, f, d, 0, None, None, False, 0, 1)
+ # invalid axis
+ assert_raises(TypeError, f, d, "invalid")
+ assert_raises(TypeError, f, d, axis="invalid")
+ assert_raises(TypeError, f, d, axis="invalid", dtype=None,
+ keepdims=True)
+ # invalid dtype
+ assert_raises(TypeError, f, d, 0, "invalid")
+ assert_raises(TypeError, f, d, dtype="invalid")
+ assert_raises(TypeError, f, d, dtype="invalid", out=None)
+ # invalid out
+ assert_raises(TypeError, f, d, 0, None, "invalid")
+ assert_raises(TypeError, f, d, out="invalid")
+ assert_raises(TypeError, f, d, out="invalid", dtype=None)
+ # keepdims boolean, no invalid value
+ # assert_raises(TypeError, f, d, 0, None, None, "invalid")
+ # assert_raises(TypeError, f, d, keepdims="invalid", axis=0, dtype=None)
+ # invalid mix
+ assert_raises(TypeError, f, d, 0, keepdims="invalid", dtype="invalid",
+ out=None)
+
+        # invalid keyword
+ assert_raises(TypeError, f, d, axis=0, dtype=None, invalid=0)
+ assert_raises(TypeError, f, d, invalid=0)
+ assert_raises(TypeError, f, d, 0, keepdims=True, invalid="invalid",
+ out=None)
+ assert_raises(TypeError, f, d, axis=0, dtype=None, keepdims=True,
+ out=None, invalid=0)
+ assert_raises(TypeError, f, d, axis=0, dtype=None,
+ out=None, invalid=0)
+
+ def test_structured_equal(self):
+ # https://github.com/numpy/numpy/issues/4855
+
+ class MyA(np.ndarray):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ return getattr(ufunc, method)(*(input.view(np.ndarray)
+ for input in inputs), **kwargs)
+ a = np.arange(12.).reshape(4,3)
+ ra = a.view(dtype=('f8,f8,f8')).squeeze()
+ mra = ra.view(MyA)
+
+ target = np.array([ True, False, False, False], dtype=bool)
+ assert_equal(np.all(target == (mra == ra[0])), True)
+
+ def test_scalar_equal(self):
+        # Scalar comparisons should always work, without deprecation warnings,
+        # even when the ufunc fails.
+ a = np.array(0.)
+ b = np.array('a')
+ assert_(a != b)
+ assert_(b != a)
+ assert_(not (a == b))
+ assert_(not (b == a))
+
+ def test_NotImplemented_not_returned(self):
+ # See gh-5964 and gh-2091. Some of these functions are not operator
+ # related and were fixed for other reasons in the past.
+ binary_funcs = [
+ np.power, np.add, np.subtract, np.multiply, np.divide,
+ np.true_divide, np.floor_divide, np.bitwise_and, np.bitwise_or,
+ np.bitwise_xor, np.left_shift, np.right_shift, np.fmax,
+ np.fmin, np.fmod, np.hypot, np.logaddexp, np.logaddexp2,
+ np.logical_and, np.logical_or, np.logical_xor, np.maximum,
+ np.minimum, np.mod,
+ np.greater, np.greater_equal, np.less, np.less_equal,
+ np.equal, np.not_equal]
+
+ a = np.array('1')
+ b = 1
+ c = np.array([1., 2.])
+ for f in binary_funcs:
+ assert_raises(TypeError, f, a, b)
+ assert_raises(TypeError, f, c, a)
+
+ def test_reduce_noncontig_output(self):
+ # Check that reduction deals with non-contiguous output arrays
+ # appropriately.
+ #
+ # gh-8036
+
+ x = np.arange(7*13*8, dtype=np.int16).reshape(7, 13, 8)
+ x = x[4:6,1:11:6,1:5].transpose(1, 2, 0)
+ y_base = np.arange(4*4, dtype=np.int16).reshape(4, 4)
+ y = y_base[::2,:]
+
+ y_base_copy = y_base.copy()
+
+ r0 = np.add.reduce(x, out=y.copy(), axis=2)
+ r1 = np.add.reduce(x, out=y, axis=2)
+
+ # The results should match, and y_base shouldn't get clobbered
+ assert_equal(r0, r1)
+ assert_equal(y_base[1,:], y_base_copy[1,:])
+ assert_equal(y_base[3,:], y_base_copy[3,:])
+
+ def test_no_doc_string(self):
+ # gh-9337
+ assert_('\n' not in umt.inner1d_no_doc.__doc__)
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_umath.py b/contrib/python/numpy/py2/numpy/core/tests/test_umath.py
new file mode 100644
index 00000000000..eb6a67fa3f6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_umath.py
@@ -0,0 +1,2920 @@
+from __future__ import division, absolute_import, print_function
+
+import platform
+import warnings
+import fnmatch
+import itertools
+import pytest
+
+import numpy.core.umath as ncu
+from numpy.core import _umath_tests as ncu_tests
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_raises_regex,
+ assert_array_equal, assert_almost_equal, assert_array_almost_equal,
+ assert_allclose, assert_no_warnings, suppress_warnings,
+ _gen_alignment_data
+ )
+
+
+def on_powerpc():
+ """ True if we are running on a Power PC platform."""
+ return platform.processor() == 'powerpc' or \
+ platform.machine().startswith('ppc')
+
+
+class _FilterInvalids(object):
+ def setup(self):
+ self.olderr = np.seterr(invalid='ignore')
+
+ def teardown(self):
+ np.seterr(**self.olderr)
+
+
+class TestConstants(object):
+ def test_pi(self):
+ assert_allclose(ncu.pi, 3.141592653589793, 1e-15)
+
+ def test_e(self):
+ assert_allclose(ncu.e, 2.718281828459045, 1e-15)
+
+ def test_euler_gamma(self):
+ assert_allclose(ncu.euler_gamma, 0.5772156649015329, 1e-15)
+
+
+class TestOut(object):
+ def test_out_subok(self):
+ for subok in (True, False):
+ a = np.array(0.5)
+ o = np.empty(())
+
+ r = np.add(a, 2, o, subok=subok)
+ assert_(r is o)
+ r = np.add(a, 2, out=o, subok=subok)
+ assert_(r is o)
+ r = np.add(a, 2, out=(o,), subok=subok)
+ assert_(r is o)
+
+ d = np.array(5.7)
+ o1 = np.empty(())
+ o2 = np.empty((), dtype=np.int32)
+
+ r1, r2 = np.frexp(d, o1, None, subok=subok)
+ assert_(r1 is o1)
+ r1, r2 = np.frexp(d, None, o2, subok=subok)
+ assert_(r2 is o2)
+ r1, r2 = np.frexp(d, o1, o2, subok=subok)
+ assert_(r1 is o1)
+ assert_(r2 is o2)
+
+ r1, r2 = np.frexp(d, out=(o1, None), subok=subok)
+ assert_(r1 is o1)
+ r1, r2 = np.frexp(d, out=(None, o2), subok=subok)
+ assert_(r2 is o2)
+ r1, r2 = np.frexp(d, out=(o1, o2), subok=subok)
+ assert_(r1 is o1)
+ assert_(r2 is o2)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', DeprecationWarning)
+ r1, r2 = np.frexp(d, out=o1, subok=subok)
+ assert_(r1 is o1)
+ assert_(w[0].category is DeprecationWarning)
+
+ assert_raises(ValueError, np.add, a, 2, o, o, subok=subok)
+ assert_raises(ValueError, np.add, a, 2, o, out=o, subok=subok)
+ assert_raises(ValueError, np.add, a, 2, None, out=o, subok=subok)
+ assert_raises(ValueError, np.add, a, 2, out=(o, o), subok=subok)
+ assert_raises(ValueError, np.add, a, 2, out=(), subok=subok)
+ assert_raises(TypeError, np.add, a, 2, [], subok=subok)
+ assert_raises(TypeError, np.add, a, 2, out=[], subok=subok)
+ assert_raises(TypeError, np.add, a, 2, out=([],), subok=subok)
+ o.flags.writeable = False
+ assert_raises(ValueError, np.add, a, 2, o, subok=subok)
+ assert_raises(ValueError, np.add, a, 2, out=o, subok=subok)
+ assert_raises(ValueError, np.add, a, 2, out=(o,), subok=subok)
+
+ def test_out_wrap_subok(self):
+ class ArrayWrap(np.ndarray):
+ __array_priority__ = 10
+
+ def __new__(cls, arr):
+ return np.asarray(arr).view(cls).copy()
+
+ def __array_wrap__(self, arr, context):
+ return arr.view(type(self))
+
+ for subok in (True, False):
+ a = ArrayWrap([0.5])
+
+ r = np.add(a, 2, subok=subok)
+ if subok:
+ assert_(isinstance(r, ArrayWrap))
+ else:
+ assert_(type(r) == np.ndarray)
+
+ r = np.add(a, 2, None, subok=subok)
+ if subok:
+ assert_(isinstance(r, ArrayWrap))
+ else:
+ assert_(type(r) == np.ndarray)
+
+ r = np.add(a, 2, out=None, subok=subok)
+ if subok:
+ assert_(isinstance(r, ArrayWrap))
+ else:
+ assert_(type(r) == np.ndarray)
+
+ r = np.add(a, 2, out=(None,), subok=subok)
+ if subok:
+ assert_(isinstance(r, ArrayWrap))
+ else:
+ assert_(type(r) == np.ndarray)
+
+ d = ArrayWrap([5.7])
+ o1 = np.empty((1,))
+ o2 = np.empty((1,), dtype=np.int32)
+
+ r1, r2 = np.frexp(d, o1, subok=subok)
+ if subok:
+ assert_(isinstance(r2, ArrayWrap))
+ else:
+ assert_(type(r2) == np.ndarray)
+
+ r1, r2 = np.frexp(d, o1, None, subok=subok)
+ if subok:
+ assert_(isinstance(r2, ArrayWrap))
+ else:
+ assert_(type(r2) == np.ndarray)
+
+ r1, r2 = np.frexp(d, None, o2, subok=subok)
+ if subok:
+ assert_(isinstance(r1, ArrayWrap))
+ else:
+ assert_(type(r1) == np.ndarray)
+
+ r1, r2 = np.frexp(d, out=(o1, None), subok=subok)
+ if subok:
+ assert_(isinstance(r2, ArrayWrap))
+ else:
+ assert_(type(r2) == np.ndarray)
+
+ r1, r2 = np.frexp(d, out=(None, o2), subok=subok)
+ if subok:
+ assert_(isinstance(r1, ArrayWrap))
+ else:
+ assert_(type(r1) == np.ndarray)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', DeprecationWarning)
+ r1, r2 = np.frexp(d, out=o1, subok=subok)
+ if subok:
+ assert_(isinstance(r2, ArrayWrap))
+ else:
+ assert_(type(r2) == np.ndarray)
+ assert_(w[0].category is DeprecationWarning)
+
+
+class TestComparisons(object):
+ def test_ignore_object_identity_in_equal(self):
+ # Check error raised when comparing identical objects whose comparison
+ # is not a simple boolean, e.g., arrays that are compared elementwise.
+ a = np.array([np.array([1, 2, 3]), None], dtype=object)
+ assert_raises(ValueError, np.equal, a, a)
+
+ # Check error raised when comparing identical non-comparable objects.
+ class FunkyType(object):
+ def __eq__(self, other):
+ raise TypeError("I won't compare")
+
+ a = np.array([FunkyType()])
+ assert_raises(TypeError, np.equal, a, a)
+
+ # Check identity doesn't override comparison mismatch.
+ a = np.array([np.nan], dtype=object)
+ assert_equal(np.equal(a, a), [False])
+
+ def test_ignore_object_identity_in_not_equal(self):
+ # Check error raised when comparing identical objects whose comparison
+ # is not a simple boolean, e.g., arrays that are compared elementwise.
+ a = np.array([np.array([1, 2, 3]), None], dtype=object)
+ assert_raises(ValueError, np.not_equal, a, a)
+
+ # Check error raised when comparing identical non-comparable objects.
+ class FunkyType(object):
+ def __ne__(self, other):
+ raise TypeError("I won't compare")
+
+ a = np.array([FunkyType()])
+ assert_raises(TypeError, np.not_equal, a, a)
+
+ # Check identity doesn't override comparison mismatch.
+ a = np.array([np.nan], dtype=object)
+ assert_equal(np.not_equal(a, a), [True])
+
+
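+# Illustrative sketch (not part of the upstream test suite): the identity
+# checks above exist because np.equal must not short-circuit when both
+# operands are the very same object; it has to defer to the objects' own
+# comparison, matching plain Python semantics for NaN.
+def _demo_nan_not_equal_to_itself():
+    nan = float('nan')
+    assert_(not (nan == nan))
+
+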
+class TestAdd(object):
+ def test_reduce_alignment(self):
+ # gh-9876
+ # make sure arrays with weird strides work with the optimizations in
+ # pairwise_sum_@TYPE@. On x86, the 'b' field will count as aligned at a
+ # 4 byte offset, even though its itemsize is 8.
+ a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
+ a['a'] = -1
+ assert_equal(a['b'].sum(), 0)
+
+
+class TestDivision(object):
+ def test_division_int(self):
+ # int division should follow Python
+ x = np.array([5, 10, 90, 100, -5, -10, -90, -100, -120])
+ if 5 / 10 == 0.5:
+ assert_equal(x / 100, [0.05, 0.1, 0.9, 1,
+ -0.05, -0.1, -0.9, -1, -1.2])
+ else:
+ assert_equal(x / 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
+ assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
+ assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
+
+ def test_division_complex(self):
+ # check that implementation is correct
+ msg = "Complex division implementation check"
+ x = np.array([1. + 1.*1j, 1. + .5*1j, 1. + 2.*1j], dtype=np.complex128)
+ assert_almost_equal(x**2/x, x, err_msg=msg)
+ # check overflow, underflow
+ msg = "Complex division overflow/underflow check"
+ x = np.array([1.e+110, 1.e-110], dtype=np.complex128)
+ y = x**2/x
+ assert_almost_equal(y/x, [1, 1], err_msg=msg)
+
+ def test_zero_division_complex(self):
+ with np.errstate(invalid="ignore", divide="ignore"):
+ x = np.array([0.0], dtype=np.complex128)
+ y = 1.0/x
+ assert_(np.isinf(y)[0])
+ y = complex(np.inf, np.nan)/x
+ assert_(np.isinf(y)[0])
+ y = complex(np.nan, np.inf)/x
+ assert_(np.isinf(y)[0])
+ y = complex(np.inf, np.inf)/x
+ assert_(np.isinf(y)[0])
+ y = 0.0/x
+ assert_(np.isnan(y)[0])
+
+ def test_floor_division_complex(self):
+ # check that implementation is correct
+ msg = "Complex floor division implementation check"
+ x = np.array([.9 + 1j, -.1 + 1j, .9 + .5*1j, .9 + 2.*1j], dtype=np.complex128)
+ y = np.array([0., -1., 0., 0.], dtype=np.complex128)
+ assert_equal(np.floor_divide(x**2, x), y, err_msg=msg)
+ # check overflow, underflow
+ msg = "Complex floor division overflow/underflow check"
+ x = np.array([1.e+110, 1.e-110], dtype=np.complex128)
+ y = np.floor_divide(x**2, x)
+ assert_equal(y, [1.e+110, 0], err_msg=msg)
+
+ def test_floor_division_signed_zero(self):
+ # Check that the sign bit is correctly set when dividing positive and
+ # negative zero by one.
+ x = np.zeros(10)
+ assert_equal(np.signbit(x//1), 0)
+ assert_equal(np.signbit((-x)//1), 1)
+
+def floor_divide_and_remainder(x, y):
+ return (np.floor_divide(x, y), np.remainder(x, y))
+
+
+def _signs(dt):
+ if dt in np.typecodes['UnsignedInteger']:
+ return (+1,)
+ else:
+ return (+1, -1)
+
+
+class TestRemainder(object):
+
+ def test_remainder_basic(self):
+ dt = np.typecodes['AllInteger'] + np.typecodes['Float']
+ for op in [floor_divide_and_remainder, np.divmod]:
+ for dt1, dt2 in itertools.product(dt, dt):
+ for sg1, sg2 in itertools.product(_signs(dt1), _signs(dt2)):
+ fmt = 'op: %s, dt1: %s, dt2: %s, sg1: %s, sg2: %s'
+ msg = fmt % (op.__name__, dt1, dt2, sg1, sg2)
+ a = np.array(sg1*71, dtype=dt1)
+ b = np.array(sg2*19, dtype=dt2)
+ div, rem = op(a, b)
+ assert_equal(div*b + rem, a, err_msg=msg)
+ if sg2 == -1:
+ assert_(b < rem <= 0, msg)
+ else:
+ assert_(b > rem >= 0, msg)
+
+ def test_float_remainder_exact(self):
+ # test that float results are exact for small integers. This also
+ # holds for the same integers scaled by powers of two.
+ nlst = list(range(-127, 0))
+ plst = list(range(1, 128))
+ dividend = nlst + [0] + plst
+ divisor = nlst + plst
+ arg = list(itertools.product(dividend, divisor))
+ tgt = list(divmod(*t) for t in arg)
+
+ a, b = np.array(arg, dtype=int).T
+        # convert exact integer results from Python to float so that
+        # signed zero can be used; it is checked below.
+ tgtdiv, tgtrem = np.array(tgt, dtype=float).T
+ tgtdiv = np.where((tgtdiv == 0.0) & ((b < 0) ^ (a < 0)), -0.0, tgtdiv)
+ tgtrem = np.where((tgtrem == 0.0) & (b < 0), -0.0, tgtrem)
+
+ for op in [floor_divide_and_remainder, np.divmod]:
+ for dt in np.typecodes['Float']:
+ msg = 'op: %s, dtype: %s' % (op.__name__, dt)
+ fa = a.astype(dt)
+ fb = b.astype(dt)
+ div, rem = op(fa, fb)
+ assert_equal(div, tgtdiv, err_msg=msg)
+ assert_equal(rem, tgtrem, err_msg=msg)
+
+ def test_float_remainder_roundoff(self):
+ # gh-6127
+ dt = np.typecodes['Float']
+ for op in [floor_divide_and_remainder, np.divmod]:
+ for dt1, dt2 in itertools.product(dt, dt):
+ for sg1, sg2 in itertools.product((+1, -1), (+1, -1)):
+ fmt = 'op: %s, dt1: %s, dt2: %s, sg1: %s, sg2: %s'
+ msg = fmt % (op.__name__, dt1, dt2, sg1, sg2)
+ a = np.array(sg1*78*6e-8, dtype=dt1)
+ b = np.array(sg2*6e-8, dtype=dt2)
+ div, rem = op(a, b)
+ # Equal assertion should hold when fmod is used
+ assert_equal(div*b + rem, a, err_msg=msg)
+ if sg2 == -1:
+ assert_(b < rem <= 0, msg)
+ else:
+ assert_(b > rem >= 0, msg)
+
+ def test_float_remainder_corner_cases(self):
+ # Check remainder magnitude.
+ for dt in np.typecodes['Float']:
+ b = np.array(1.0, dtype=dt)
+ a = np.nextafter(np.array(0.0, dtype=dt), -b)
+ rem = np.remainder(a, b)
+ assert_(rem <= b, 'dt: %s' % dt)
+ rem = np.remainder(-a, -b)
+ assert_(rem >= -b, 'dt: %s' % dt)
+
+ # Check nans, inf
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in remainder")
+ for dt in np.typecodes['Float']:
+ fone = np.array(1.0, dtype=dt)
+ fzer = np.array(0.0, dtype=dt)
+ finf = np.array(np.inf, dtype=dt)
+ fnan = np.array(np.nan, dtype=dt)
+ rem = np.remainder(fone, fzer)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ # MSVC 2008 returns NaN here, so disable the check.
+ #rem = np.remainder(fone, finf)
+ #assert_(rem == fone, 'dt: %s, rem: %s' % (dt, rem))
+ rem = np.remainder(fone, fnan)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ rem = np.remainder(finf, fone)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+
+
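+# Illustrative sketch (not part of the upstream test suite): np.remainder
+# follows Python's convention that the result takes the sign of the divisor,
+# and np.floor_divide is defined so that div*b + rem == a, the invariant
+# checked throughout TestRemainder above.
+def _demo_remainder_sign_convention():
+    div, rem = np.floor_divide(7, -3), np.remainder(7, -3)
+    assert_equal((div, rem), (-3, -2))
+    assert_equal(div * -3 + rem, 7)
+
+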
+class TestCbrt(object):
+ def test_cbrt_scalar(self):
+ assert_almost_equal((np.cbrt(np.float32(-2.5)**3)), -2.5)
+
+ def test_cbrt(self):
+ x = np.array([1., 2., -3., np.inf, -np.inf])
+ assert_almost_equal(np.cbrt(x**3), x)
+
+ assert_(np.isnan(np.cbrt(np.nan)))
+ assert_equal(np.cbrt(np.inf), np.inf)
+ assert_equal(np.cbrt(-np.inf), -np.inf)
+
+
+class TestPower(object):
+ def test_power_float(self):
+ x = np.array([1., 2., 3.])
+ assert_equal(x**0, [1., 1., 1.])
+ assert_equal(x**1, x)
+ assert_equal(x**2, [1., 4., 9.])
+ y = x.copy()
+ y **= 2
+ assert_equal(y, [1., 4., 9.])
+ assert_almost_equal(x**(-1), [1., 0.5, 1./3])
+ assert_almost_equal(x**(0.5), [1., ncu.sqrt(2), ncu.sqrt(3)])
+
+ for out, inp, msg in _gen_alignment_data(dtype=np.float32,
+ type='unary',
+ max_size=11):
+ exp = [ncu.sqrt(i) for i in inp]
+ assert_almost_equal(inp**(0.5), exp, err_msg=msg)
+ np.sqrt(inp, out=out)
+ assert_equal(out, exp, err_msg=msg)
+
+ for out, inp, msg in _gen_alignment_data(dtype=np.float64,
+ type='unary',
+ max_size=7):
+ exp = [ncu.sqrt(i) for i in inp]
+ assert_almost_equal(inp**(0.5), exp, err_msg=msg)
+ np.sqrt(inp, out=out)
+ assert_equal(out, exp, err_msg=msg)
+
+ def test_power_complex(self):
+ x = np.array([1+2j, 2+3j, 3+4j])
+ assert_equal(x**0, [1., 1., 1.])
+ assert_equal(x**1, x)
+ assert_almost_equal(x**2, [-3+4j, -5+12j, -7+24j])
+ assert_almost_equal(x**3, [(1+2j)**3, (2+3j)**3, (3+4j)**3])
+ assert_almost_equal(x**4, [(1+2j)**4, (2+3j)**4, (3+4j)**4])
+ assert_almost_equal(x**(-1), [1/(1+2j), 1/(2+3j), 1/(3+4j)])
+ assert_almost_equal(x**(-2), [1/(1+2j)**2, 1/(2+3j)**2, 1/(3+4j)**2])
+ assert_almost_equal(x**(-3), [(-11+2j)/125, (-46-9j)/2197,
+ (-117-44j)/15625])
+ assert_almost_equal(x**(0.5), [ncu.sqrt(1+2j), ncu.sqrt(2+3j),
+ ncu.sqrt(3+4j)])
+ norm = 1./((x**14)[0])
+ assert_almost_equal(x**14 * norm,
+ [i * norm for i in [-76443+16124j, 23161315+58317492j,
+ 5583548873 + 2465133864j]])
+
+ # Ticket #836
+ def assert_complex_equal(x, y):
+ assert_array_equal(x.real, y.real)
+ assert_array_equal(x.imag, y.imag)
+
+ for z in [complex(0, np.inf), complex(1, np.inf)]:
+ z = np.array([z], dtype=np.complex_)
+ with np.errstate(invalid="ignore"):
+ assert_complex_equal(z**1, z)
+ assert_complex_equal(z**2, z*z)
+ assert_complex_equal(z**3, z*z*z)
+
+ def test_power_zero(self):
+ # ticket #1271
+ zero = np.array([0j])
+ one = np.array([1+0j])
+ cnan = np.array([complex(np.nan, np.nan)])
+ # FIXME cinf not tested.
+ #cinf = np.array([complex(np.inf, 0)])
+
+ def assert_complex_equal(x, y):
+ x, y = np.asarray(x), np.asarray(y)
+ assert_array_equal(x.real, y.real)
+ assert_array_equal(x.imag, y.imag)
+
+ # positive powers
+ for p in [0.33, 0.5, 1, 1.5, 2, 3, 4, 5, 6.6]:
+ assert_complex_equal(np.power(zero, p), zero)
+
+ # zero power
+ assert_complex_equal(np.power(zero, 0), one)
+ with np.errstate(invalid="ignore"):
+ assert_complex_equal(np.power(zero, 0+1j), cnan)
+
+ # negative power
+ for p in [0.33, 0.5, 1, 1.5, 2, 3, 4, 5, 6.6]:
+ assert_complex_equal(np.power(zero, -p), cnan)
+ assert_complex_equal(np.power(zero, -1+0.2j), cnan)
+
+ def test_fast_power(self):
+ x = np.array([1, 2, 3], np.int16)
+ res = x**2.0
+ assert_((x**2.00001).dtype is res.dtype)
+ assert_array_equal(res, [1, 4, 9])
+ # check the inplace operation on the casted copy doesn't mess with x
+ assert_(not np.may_share_memory(res, x))
+ assert_array_equal(x, [1, 2, 3])
+
+        # Check that the fast path ignores 1-element arrays that are not 0-d
+ res = x ** np.array([[[2]]])
+ assert_equal(res.shape, (1, 1, 3))
+
+ def test_integer_power(self):
+ a = np.array([15, 15], 'i8')
+ b = np.power(a, a)
+ assert_equal(b, [437893890380859375, 437893890380859375])
+
+ def test_integer_power_with_integer_zero_exponent(self):
+ dtypes = np.typecodes['Integer']
+ for dt in dtypes:
+ arr = np.arange(-10, 10, dtype=dt)
+ assert_equal(np.power(arr, 0), np.ones_like(arr))
+
+ dtypes = np.typecodes['UnsignedInteger']
+ for dt in dtypes:
+ arr = np.arange(10, dtype=dt)
+ assert_equal(np.power(arr, 0), np.ones_like(arr))
+
+ def test_integer_power_of_1(self):
+ dtypes = np.typecodes['AllInteger']
+ for dt in dtypes:
+ arr = np.arange(10, dtype=dt)
+ assert_equal(np.power(1, arr), np.ones_like(arr))
+
+ def test_integer_power_of_zero(self):
+ dtypes = np.typecodes['AllInteger']
+ for dt in dtypes:
+ arr = np.arange(1, 10, dtype=dt)
+ assert_equal(np.power(0, arr), np.zeros_like(arr))
+
+ def test_integer_to_negative_power(self):
+ dtypes = np.typecodes['Integer']
+ for dt in dtypes:
+ a = np.array([0, 1, 2, 3], dtype=dt)
+ b = np.array([0, 1, 2, -3], dtype=dt)
+ one = np.array(1, dtype=dt)
+ minusone = np.array(-1, dtype=dt)
+ assert_raises(ValueError, np.power, a, b)
+ assert_raises(ValueError, np.power, a, minusone)
+ assert_raises(ValueError, np.power, one, b)
+ assert_raises(ValueError, np.power, one, minusone)
+
+
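+# Illustrative sketch (not part of the upstream test suite): an integer array
+# raised to a negative integer power cannot be represented in an integer
+# dtype, hence the ValueError checked above; np.float_power (or a float
+# exponent) is the usual way around it.
+def _demo_negative_integer_power_workaround():
+    a = np.array([1, 2, 4])
+    assert_array_equal(np.float_power(a, -1), [1.0, 0.5, 0.25])
+
+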
+class TestFloat_power(object):
+ def test_type_conversion(self):
+ arg_type = '?bhilBHILefdgFDG'
+ res_type = 'ddddddddddddgDDG'
+ for dtin, dtout in zip(arg_type, res_type):
+ msg = "dtin: %s, dtout: %s" % (dtin, dtout)
+ arg = np.ones(1, dtype=dtin)
+ res = np.float_power(arg, arg)
+ assert_(res.dtype.name == np.dtype(dtout).name, msg)
+
+
+class TestLog2(object):
+ def test_log2_values(self):
+ x = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+ y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_almost_equal(np.log2(xf), yf)
+
+ def test_log2_ints(self):
+        # a good log2 implementation should provide this;
+        # it might fail on an OS with a bad libm
+ for i in range(1, 65):
+ v = np.log2(2.**i)
+ assert_equal(v, float(i), err_msg='at exponent %d' % i)
+
+ def test_log2_special(self):
+ assert_equal(np.log2(1.), 0.)
+ assert_equal(np.log2(np.inf), np.inf)
+ assert_(np.isnan(np.log2(np.nan)))
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_(np.isnan(np.log2(-1.)))
+ assert_(np.isnan(np.log2(-np.inf)))
+ assert_equal(np.log2(0.), -np.inf)
+ assert_(w[0].category is RuntimeWarning)
+ assert_(w[1].category is RuntimeWarning)
+ assert_(w[2].category is RuntimeWarning)
+
+
+class TestExp2(object):
+ def test_exp2_values(self):
+ x = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+ y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_almost_equal(np.exp2(yf), xf)
+
+
+class TestLogAddExp2(_FilterInvalids):
+ # Need test for intermediate precisions
+ def test_logaddexp2_values(self):
+ x = [1, 2, 3, 4, 5]
+ y = [5, 4, 3, 2, 1]
+ z = [6, 6, 6, 6, 6]
+ for dt, dec_ in zip(['f', 'd', 'g'], [6, 15, 15]):
+ xf = np.log2(np.array(x, dtype=dt))
+ yf = np.log2(np.array(y, dtype=dt))
+ zf = np.log2(np.array(z, dtype=dt))
+ assert_almost_equal(np.logaddexp2(xf, yf), zf, decimal=dec_)
+
+ def test_logaddexp2_range(self):
+ x = [1000000, -1000000, 1000200, -1000200]
+ y = [1000200, -1000200, 1000000, -1000000]
+ z = [1000200, -1000000, 1000200, -1000000]
+ for dt in ['f', 'd', 'g']:
+ logxf = np.array(x, dtype=dt)
+ logyf = np.array(y, dtype=dt)
+ logzf = np.array(z, dtype=dt)
+ assert_almost_equal(np.logaddexp2(logxf, logyf), logzf)
+
+ def test_inf(self):
+ inf = np.inf
+ x = [inf, -inf, inf, -inf, inf, 1, -inf, 1]
+ y = [inf, inf, -inf, -inf, 1, inf, 1, -inf]
+ z = [inf, inf, inf, -inf, inf, inf, 1, 1]
+ with np.errstate(invalid='raise'):
+ for dt in ['f', 'd', 'g']:
+ logxf = np.array(x, dtype=dt)
+ logyf = np.array(y, dtype=dt)
+ logzf = np.array(z, dtype=dt)
+ assert_equal(np.logaddexp2(logxf, logyf), logzf)
+
+ def test_nan(self):
+ assert_(np.isnan(np.logaddexp2(np.nan, np.inf)))
+ assert_(np.isnan(np.logaddexp2(np.inf, np.nan)))
+ assert_(np.isnan(np.logaddexp2(np.nan, 0)))
+ assert_(np.isnan(np.logaddexp2(0, np.nan)))
+ assert_(np.isnan(np.logaddexp2(np.nan, np.nan)))
+
+
+class TestLog(object):
+ def test_log_values(self):
+ x = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+ y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ for dt in ['f', 'd', 'g']:
+ log2_ = 0.69314718055994530943
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)*log2_
+ assert_almost_equal(np.log(xf), yf)
+
+
+class TestExp(object):
+ def test_exp_values(self):
+ x = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+ y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ for dt in ['f', 'd', 'g']:
+ log2_ = 0.69314718055994530943
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)*log2_
+ assert_almost_equal(np.exp(yf), xf)
+
+
+class TestLogAddExp(_FilterInvalids):
+ def test_logaddexp_values(self):
+ x = [1, 2, 3, 4, 5]
+ y = [5, 4, 3, 2, 1]
+ z = [6, 6, 6, 6, 6]
+ for dt, dec_ in zip(['f', 'd', 'g'], [6, 15, 15]):
+ xf = np.log(np.array(x, dtype=dt))
+ yf = np.log(np.array(y, dtype=dt))
+ zf = np.log(np.array(z, dtype=dt))
+ assert_almost_equal(np.logaddexp(xf, yf), zf, decimal=dec_)
+
+ def test_logaddexp_range(self):
+ x = [1000000, -1000000, 1000200, -1000200]
+ y = [1000200, -1000200, 1000000, -1000000]
+ z = [1000200, -1000000, 1000200, -1000000]
+ for dt in ['f', 'd', 'g']:
+ logxf = np.array(x, dtype=dt)
+ logyf = np.array(y, dtype=dt)
+ logzf = np.array(z, dtype=dt)
+ assert_almost_equal(np.logaddexp(logxf, logyf), logzf)
+
+ def test_inf(self):
+ inf = np.inf
+ x = [inf, -inf, inf, -inf, inf, 1, -inf, 1]
+ y = [inf, inf, -inf, -inf, 1, inf, 1, -inf]
+ z = [inf, inf, inf, -inf, inf, inf, 1, 1]
+ with np.errstate(invalid='raise'):
+ for dt in ['f', 'd', 'g']:
+ logxf = np.array(x, dtype=dt)
+ logyf = np.array(y, dtype=dt)
+ logzf = np.array(z, dtype=dt)
+ assert_equal(np.logaddexp(logxf, logyf), logzf)
+
+ def test_nan(self):
+ assert_(np.isnan(np.logaddexp(np.nan, np.inf)))
+ assert_(np.isnan(np.logaddexp(np.inf, np.nan)))
+ assert_(np.isnan(np.logaddexp(np.nan, 0)))
+ assert_(np.isnan(np.logaddexp(0, np.nan)))
+ assert_(np.isnan(np.logaddexp(np.nan, np.nan)))
+
+ def test_reduce(self):
+ assert_equal(np.logaddexp.identity, -np.inf)
+ assert_equal(np.logaddexp.reduce([]), -np.inf)
+
+
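+# Illustrative sketch (not part of the upstream test suite): logaddexp(x, y)
+# is log(exp(x) + exp(y)) evaluated without overflowing for large arguments,
+# which is what the range tests above rely on.
+def _demo_logaddexp_is_stable():
+    # the naive form np.log(np.exp(1000.) + np.exp(1000.)) would overflow
+    assert_allclose(np.logaddexp(1000., 1000.), 1000. + np.log(2.))
+
+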
+class TestLog1p(object):
+ def test_log1p(self):
+ assert_almost_equal(ncu.log1p(0.2), ncu.log(1.2))
+ assert_almost_equal(ncu.log1p(1e-6), ncu.log(1+1e-6))
+
+ def test_special(self):
+ with np.errstate(invalid="ignore", divide="ignore"):
+ assert_equal(ncu.log1p(np.nan), np.nan)
+ assert_equal(ncu.log1p(np.inf), np.inf)
+ assert_equal(ncu.log1p(-1.), -np.inf)
+ assert_equal(ncu.log1p(-2.), np.nan)
+ assert_equal(ncu.log1p(-np.inf), np.nan)
+
+
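+# Illustrative sketch (not part of the upstream test suite): log1p exists
+# because np.log(1 + x) loses all precision once x drops below the spacing of
+# floating-point numbers around 1.0.
+def _demo_log1p_avoids_cancellation():
+    assert_equal(np.log(1. + 1e-20), 0.)    # 1 + 1e-20 rounds to exactly 1.0
+    assert_allclose(np.log1p(1e-20), 1e-20)
+
+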
+class TestExpm1(object):
+ def test_expm1(self):
+ assert_almost_equal(ncu.expm1(0.2), ncu.exp(0.2)-1)
+ assert_almost_equal(ncu.expm1(1e-6), ncu.exp(1e-6)-1)
+
+ def test_special(self):
+ assert_equal(ncu.expm1(np.inf), np.inf)
+ assert_equal(ncu.expm1(0.), 0.)
+ assert_equal(ncu.expm1(-0.), -0.)
+ assert_equal(ncu.expm1(np.inf), np.inf)
+ assert_equal(ncu.expm1(-np.inf), -1.)
+
+
+class TestHypot(object):
+ def test_simple(self):
+ assert_almost_equal(ncu.hypot(1, 1), ncu.sqrt(2))
+ assert_almost_equal(ncu.hypot(0, 0), 0)
+
+ def test_reduce(self):
+ assert_almost_equal(ncu.hypot.reduce([3.0, 4.0]), 5.0)
+ assert_almost_equal(ncu.hypot.reduce([3.0, 4.0, 0]), 5.0)
+ assert_almost_equal(ncu.hypot.reduce([9.0, 12.0, 20.0]), 25.0)
+ assert_equal(ncu.hypot.reduce([]), 0.0)
+
+
+def assert_hypot_isnan(x, y):
+ with np.errstate(invalid='ignore'):
+ assert_(np.isnan(ncu.hypot(x, y)),
+ "hypot(%s, %s) is %s, not nan" % (x, y, ncu.hypot(x, y)))
+
+
+def assert_hypot_isinf(x, y):
+ with np.errstate(invalid='ignore'):
+ assert_(np.isinf(ncu.hypot(x, y)),
+ "hypot(%s, %s) is %s, not inf" % (x, y, ncu.hypot(x, y)))
+
+
+class TestHypotSpecialValues(object):
+ def test_nan_outputs(self):
+ assert_hypot_isnan(np.nan, np.nan)
+ assert_hypot_isnan(np.nan, 1)
+
+ def test_nan_outputs2(self):
+ assert_hypot_isinf(np.nan, np.inf)
+ assert_hypot_isinf(np.inf, np.nan)
+ assert_hypot_isinf(np.inf, 0)
+ assert_hypot_isinf(0, np.inf)
+ assert_hypot_isinf(np.inf, np.inf)
+ assert_hypot_isinf(np.inf, 23.0)
+
+ def test_no_fpe(self):
+ assert_no_warnings(ncu.hypot, np.inf, 0)
+
+
+def assert_arctan2_isnan(x, y):
+ assert_(np.isnan(ncu.arctan2(x, y)), "arctan(%s, %s) is %s, not nan" % (x, y, ncu.arctan2(x, y)))
+
+
+def assert_arctan2_ispinf(x, y):
+ assert_((np.isinf(ncu.arctan2(x, y)) and ncu.arctan2(x, y) > 0), "arctan(%s, %s) is %s, not +inf" % (x, y, ncu.arctan2(x, y)))
+
+
+def assert_arctan2_isninf(x, y):
+ assert_((np.isinf(ncu.arctan2(x, y)) and ncu.arctan2(x, y) < 0), "arctan(%s, %s) is %s, not -inf" % (x, y, ncu.arctan2(x, y)))
+
+
+def assert_arctan2_ispzero(x, y):
+ assert_((ncu.arctan2(x, y) == 0 and not np.signbit(ncu.arctan2(x, y))), "arctan(%s, %s) is %s, not +0" % (x, y, ncu.arctan2(x, y)))
+
+
+def assert_arctan2_isnzero(x, y):
+ assert_((ncu.arctan2(x, y) == 0 and np.signbit(ncu.arctan2(x, y))), "arctan(%s, %s) is %s, not -0" % (x, y, ncu.arctan2(x, y)))
+
+
+class TestArctan2SpecialValues(object):
+ def test_one_one(self):
+ # atan2(1, 1) returns pi/4.
+ assert_almost_equal(ncu.arctan2(1, 1), 0.25 * np.pi)
+ assert_almost_equal(ncu.arctan2(-1, 1), -0.25 * np.pi)
+ assert_almost_equal(ncu.arctan2(1, -1), 0.75 * np.pi)
+
+ def test_zero_nzero(self):
+ # atan2(+-0, -0) returns +-pi.
+ assert_almost_equal(ncu.arctan2(np.PZERO, np.NZERO), np.pi)
+ assert_almost_equal(ncu.arctan2(np.NZERO, np.NZERO), -np.pi)
+
+ def test_zero_pzero(self):
+ # atan2(+-0, +0) returns +-0.
+ assert_arctan2_ispzero(np.PZERO, np.PZERO)
+ assert_arctan2_isnzero(np.NZERO, np.PZERO)
+
+ def test_zero_negative(self):
+ # atan2(+-0, x) returns +-pi for x < 0.
+ assert_almost_equal(ncu.arctan2(np.PZERO, -1), np.pi)
+ assert_almost_equal(ncu.arctan2(np.NZERO, -1), -np.pi)
+
+ def test_zero_positive(self):
+ # atan2(+-0, x) returns +-0 for x > 0.
+ assert_arctan2_ispzero(np.PZERO, 1)
+ assert_arctan2_isnzero(np.NZERO, 1)
+
+ def test_positive_zero(self):
+ # atan2(y, +-0) returns +pi/2 for y > 0.
+ assert_almost_equal(ncu.arctan2(1, np.PZERO), 0.5 * np.pi)
+ assert_almost_equal(ncu.arctan2(1, np.NZERO), 0.5 * np.pi)
+
+ def test_negative_zero(self):
+ # atan2(y, +-0) returns -pi/2 for y < 0.
+ assert_almost_equal(ncu.arctan2(-1, np.PZERO), -0.5 * np.pi)
+ assert_almost_equal(ncu.arctan2(-1, np.NZERO), -0.5 * np.pi)
+
+ def test_any_ninf(self):
+ # atan2(+-y, -infinity) returns +-pi for finite y > 0.
+ assert_almost_equal(ncu.arctan2(1, np.NINF), np.pi)
+ assert_almost_equal(ncu.arctan2(-1, np.NINF), -np.pi)
+
+ def test_any_pinf(self):
+ # atan2(+-y, +infinity) returns +-0 for finite y > 0.
+ assert_arctan2_ispzero(1, np.inf)
+ assert_arctan2_isnzero(-1, np.inf)
+
+ def test_inf_any(self):
+ # atan2(+-infinity, x) returns +-pi/2 for finite x.
+ assert_almost_equal(ncu.arctan2( np.inf, 1), 0.5 * np.pi)
+ assert_almost_equal(ncu.arctan2(-np.inf, 1), -0.5 * np.pi)
+
+ def test_inf_ninf(self):
+ # atan2(+-infinity, -infinity) returns +-3*pi/4.
+ assert_almost_equal(ncu.arctan2( np.inf, -np.inf), 0.75 * np.pi)
+ assert_almost_equal(ncu.arctan2(-np.inf, -np.inf), -0.75 * np.pi)
+
+ def test_inf_pinf(self):
+ # atan2(+-infinity, +infinity) returns +-pi/4.
+ assert_almost_equal(ncu.arctan2( np.inf, np.inf), 0.25 * np.pi)
+ assert_almost_equal(ncu.arctan2(-np.inf, np.inf), -0.25 * np.pi)
+
+ def test_nan_any(self):
+ # atan2(nan, x) returns nan for any x, including inf
+ assert_arctan2_isnan(np.nan, np.inf)
+ assert_arctan2_isnan(np.inf, np.nan)
+ assert_arctan2_isnan(np.nan, np.nan)
+
+
+class TestLdexp(object):
+ def _check_ldexp(self, tp):
+ assert_almost_equal(ncu.ldexp(np.array(2., np.float32),
+ np.array(3, tp)), 16.)
+ assert_almost_equal(ncu.ldexp(np.array(2., np.float64),
+ np.array(3, tp)), 16.)
+ assert_almost_equal(ncu.ldexp(np.array(2., np.longdouble),
+ np.array(3, tp)), 16.)
+
+ def test_ldexp(self):
+ # The default Python int type should work
+ assert_almost_equal(ncu.ldexp(2., 3), 16.)
+ # The following int types should all be accepted
+ self._check_ldexp(np.int8)
+ self._check_ldexp(np.int16)
+ self._check_ldexp(np.int32)
+ self._check_ldexp('i')
+ self._check_ldexp('l')
+
+ def test_ldexp_overflow(self):
+ # silence warning emitted on overflow
+ with np.errstate(over="ignore"):
+ imax = np.iinfo(np.dtype('l')).max
+ imin = np.iinfo(np.dtype('l')).min
+ assert_equal(ncu.ldexp(2., imax), np.inf)
+ assert_equal(ncu.ldexp(2., imin), 0)
+
+
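+# Illustrative sketch (not part of the upstream test suite): ldexp(m, e)
+# computes m * 2**e exactly by adjusting the floating-point exponent, and
+# frexp is its inverse.
+def _demo_ldexp_frexp_round_trip():
+    m, e = np.frexp(6.0)                    # 6.0 == 0.75 * 2**3
+    assert_equal((m, e), (0.75, 3))
+    assert_equal(np.ldexp(m, e), 6.0)
+
+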
+class TestMaximum(_FilterInvalids):
+ def test_reduce(self):
+ dflt = np.typecodes['AllFloat']
+ dint = np.typecodes['AllInteger']
+ seq1 = np.arange(11)
+ seq2 = seq1[::-1]
+ func = np.maximum.reduce
+ for dt in dint:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 10)
+ assert_equal(func(tmp2), 10)
+ for dt in dflt:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 10)
+ assert_equal(func(tmp2), 10)
+ tmp1[::2] = np.nan
+ tmp2[::2] = np.nan
+ assert_equal(func(tmp1), np.nan)
+ assert_equal(func(tmp2), np.nan)
+
+ def test_reduce_complex(self):
+ assert_equal(np.maximum.reduce([1, 2j]), 1)
+ assert_equal(np.maximum.reduce([1+3j, 2j]), 1+3j)
+
+ def test_float_nans(self):
+ nan = np.nan
+ arg1 = np.array([0, nan, nan])
+ arg2 = np.array([nan, 0, nan])
+ out = np.array([nan, nan, nan])
+ assert_equal(np.maximum(arg1, arg2), out)
+
+ def test_object_nans(self):
+ # Multiple checks to give this a chance to
+ # fail if cmp is used instead of rich compare.
+ # Failure cannot be guaranteed.
+ for i in range(1):
+ x = np.array(float('nan'), object)
+ y = 1.0
+ z = np.array(float('nan'), object)
+ assert_(np.maximum(x, y) == 1.0)
+ assert_(np.maximum(z, y) == 1.0)
+
+ def test_complex_nans(self):
+ nan = np.nan
+ for cnan in [complex(nan, 0), complex(0, nan), complex(nan, nan)]:
+ arg1 = np.array([0, cnan, cnan], dtype=complex)
+ arg2 = np.array([cnan, 0, cnan], dtype=complex)
+ out = np.array([nan, nan, nan], dtype=complex)
+ assert_equal(np.maximum(arg1, arg2), out)
+
+ def test_object_array(self):
+ arg1 = np.arange(5, dtype=object)
+ arg2 = arg1 + 1
+ assert_equal(np.maximum(arg1, arg2), arg2)
+
+
+class TestMinimum(_FilterInvalids):
+ def test_reduce(self):
+ dflt = np.typecodes['AllFloat']
+ dint = np.typecodes['AllInteger']
+ seq1 = np.arange(11)
+ seq2 = seq1[::-1]
+ func = np.minimum.reduce
+ for dt in dint:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 0)
+ assert_equal(func(tmp2), 0)
+ for dt in dflt:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 0)
+ assert_equal(func(tmp2), 0)
+ tmp1[::2] = np.nan
+ tmp2[::2] = np.nan
+ assert_equal(func(tmp1), np.nan)
+ assert_equal(func(tmp2), np.nan)
+
+ def test_reduce_complex(self):
+ assert_equal(np.minimum.reduce([1, 2j]), 2j)
+ assert_equal(np.minimum.reduce([1+3j, 2j]), 2j)
+
+ def test_float_nans(self):
+ nan = np.nan
+ arg1 = np.array([0, nan, nan])
+ arg2 = np.array([nan, 0, nan])
+ out = np.array([nan, nan, nan])
+ assert_equal(np.minimum(arg1, arg2), out)
+
+ def test_object_nans(self):
+ # Multiple checks to give this a chance to
+ # fail if cmp is used instead of rich compare.
+ # Failure cannot be guaranteed.
+ for i in range(1):
+ x = np.array(float('nan'), object)
+ y = 1.0
+ z = np.array(float('nan'), object)
+ assert_(np.minimum(x, y) == 1.0)
+ assert_(np.minimum(z, y) == 1.0)
+
+ def test_complex_nans(self):
+ nan = np.nan
+ for cnan in [complex(nan, 0), complex(0, nan), complex(nan, nan)]:
+ arg1 = np.array([0, cnan, cnan], dtype=complex)
+ arg2 = np.array([cnan, 0, cnan], dtype=complex)
+ out = np.array([nan, nan, nan], dtype=complex)
+ assert_equal(np.minimum(arg1, arg2), out)
+
+ def test_object_array(self):
+ arg1 = np.arange(5, dtype=object)
+ arg2 = arg1 + 1
+ assert_equal(np.minimum(arg1, arg2), arg1)
+
+
+class TestFmax(_FilterInvalids):
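+    # Unlike np.maximum, np.fmax ignores a NaN operand when the other
+    # operand is a number; NaN is returned only if both operands are NaN.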
+ def test_reduce(self):
+ dflt = np.typecodes['AllFloat']
+ dint = np.typecodes['AllInteger']
+ seq1 = np.arange(11)
+ seq2 = seq1[::-1]
+ func = np.fmax.reduce
+ for dt in dint:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 10)
+ assert_equal(func(tmp2), 10)
+ for dt in dflt:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 10)
+ assert_equal(func(tmp2), 10)
+ tmp1[::2] = np.nan
+ tmp2[::2] = np.nan
+ assert_equal(func(tmp1), 9)
+ assert_equal(func(tmp2), 9)
+
+ def test_reduce_complex(self):
+ assert_equal(np.fmax.reduce([1, 2j]), 1)
+ assert_equal(np.fmax.reduce([1+3j, 2j]), 1+3j)
+
+ def test_float_nans(self):
+ nan = np.nan
+ arg1 = np.array([0, nan, nan])
+ arg2 = np.array([nan, 0, nan])
+ out = np.array([0, 0, nan])
+ assert_equal(np.fmax(arg1, arg2), out)
+
+ def test_complex_nans(self):
+ nan = np.nan
+ for cnan in [complex(nan, 0), complex(0, nan), complex(nan, nan)]:
+ arg1 = np.array([0, cnan, cnan], dtype=complex)
+ arg2 = np.array([cnan, 0, cnan], dtype=complex)
+ out = np.array([0, 0, nan], dtype=complex)
+ assert_equal(np.fmax(arg1, arg2), out)
+
+
+class TestFmin(_FilterInvalids):
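+    # Like np.fmax, np.fmin returns NaN only when both operands are NaN.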
+ def test_reduce(self):
+ dflt = np.typecodes['AllFloat']
+ dint = np.typecodes['AllInteger']
+ seq1 = np.arange(11)
+ seq2 = seq1[::-1]
+ func = np.fmin.reduce
+ for dt in dint:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 0)
+ assert_equal(func(tmp2), 0)
+ for dt in dflt:
+ tmp1 = seq1.astype(dt)
+ tmp2 = seq2.astype(dt)
+ assert_equal(func(tmp1), 0)
+ assert_equal(func(tmp2), 0)
+ tmp1[::2] = np.nan
+ tmp2[::2] = np.nan
+ assert_equal(func(tmp1), 1)
+ assert_equal(func(tmp2), 1)
+
+ def test_reduce_complex(self):
+ assert_equal(np.fmin.reduce([1, 2j]), 2j)
+ assert_equal(np.fmin.reduce([1+3j, 2j]), 2j)
+
+ def test_float_nans(self):
+ nan = np.nan
+ arg1 = np.array([0, nan, nan])
+ arg2 = np.array([nan, 0, nan])
+ out = np.array([0, 0, nan])
+ assert_equal(np.fmin(arg1, arg2), out)
+
+ def test_complex_nans(self):
+ nan = np.nan
+ for cnan in [complex(nan, 0), complex(0, nan), complex(nan, nan)]:
+ arg1 = np.array([0, cnan, cnan], dtype=complex)
+ arg2 = np.array([cnan, 0, cnan], dtype=complex)
+ out = np.array([0, 0, nan], dtype=complex)
+ assert_equal(np.fmin(arg1, arg2), out)
+
+
+class TestBool(object):
+ def test_exceptions(self):
+ a = np.ones(1, dtype=np.bool_)
+ assert_raises(TypeError, np.negative, a)
+ assert_raises(TypeError, np.positive, a)
+ assert_raises(TypeError, np.subtract, a, a)
+
+ def test_truth_table_logical(self):
+        # 2, 3 and 4 serve as true values
+ input1 = [0, 0, 3, 2]
+ input2 = [0, 4, 0, 2]
+
+ typecodes = (np.typecodes['AllFloat']
+ + np.typecodes['AllInteger']
+ + '?') # boolean
+ for dtype in map(np.dtype, typecodes):
+ arg1 = np.asarray(input1, dtype=dtype)
+ arg2 = np.asarray(input2, dtype=dtype)
+
+ # OR
+ out = [False, True, True, True]
+ for func in (np.logical_or, np.maximum):
+ assert_equal(func(arg1, arg2).astype(bool), out)
+ # AND
+ out = [False, False, False, True]
+ for func in (np.logical_and, np.minimum):
+ assert_equal(func(arg1, arg2).astype(bool), out)
+ # XOR
+ out = [False, True, True, False]
+ for func in (np.logical_xor, np.not_equal):
+ assert_equal(func(arg1, arg2).astype(bool), out)
+
+ def test_truth_table_bitwise(self):
+ arg1 = [False, False, True, True]
+ arg2 = [False, True, False, True]
+
+ out = [False, True, True, True]
+ assert_equal(np.bitwise_or(arg1, arg2), out)
+
+ out = [False, False, False, True]
+ assert_equal(np.bitwise_and(arg1, arg2), out)
+
+ out = [False, True, True, False]
+ assert_equal(np.bitwise_xor(arg1, arg2), out)
+
+ def test_reduce(self):
+ none = np.array([0, 0, 0, 0], bool)
+ some = np.array([1, 0, 1, 1], bool)
+ every = np.array([1, 1, 1, 1], bool)
+ empty = np.array([], bool)
+
+ arrs = [none, some, every, empty]
+
+ for arr in arrs:
+ assert_equal(np.logical_and.reduce(arr), all(arr))
+
+ for arr in arrs:
+ assert_equal(np.logical_or.reduce(arr), any(arr))
+
+ for arr in arrs:
+ assert_equal(np.logical_xor.reduce(arr), arr.sum() % 2 == 1)
+
+
+class TestBitwiseUFuncs(object):
+
+ bitwise_types = [np.dtype(c) for c in '?' + 'bBhHiIlLqQ' + 'O']
+
+ def test_values(self):
+ for dt in self.bitwise_types:
+ zeros = np.array([0], dtype=dt)
+ ones = np.array([-1], dtype=dt)
+ msg = "dt = '%s'" % dt.char
+
+ assert_equal(np.bitwise_not(zeros), ones, err_msg=msg)
+ assert_equal(np.bitwise_not(ones), zeros, err_msg=msg)
+
+ assert_equal(np.bitwise_or(zeros, zeros), zeros, err_msg=msg)
+ assert_equal(np.bitwise_or(zeros, ones), ones, err_msg=msg)
+ assert_equal(np.bitwise_or(ones, zeros), ones, err_msg=msg)
+ assert_equal(np.bitwise_or(ones, ones), ones, err_msg=msg)
+
+ assert_equal(np.bitwise_xor(zeros, zeros), zeros, err_msg=msg)
+ assert_equal(np.bitwise_xor(zeros, ones), ones, err_msg=msg)
+ assert_equal(np.bitwise_xor(ones, zeros), ones, err_msg=msg)
+ assert_equal(np.bitwise_xor(ones, ones), zeros, err_msg=msg)
+
+ assert_equal(np.bitwise_and(zeros, zeros), zeros, err_msg=msg)
+ assert_equal(np.bitwise_and(zeros, ones), zeros, err_msg=msg)
+ assert_equal(np.bitwise_and(ones, zeros), zeros, err_msg=msg)
+ assert_equal(np.bitwise_and(ones, ones), ones, err_msg=msg)
+
+ def test_types(self):
+ for dt in self.bitwise_types:
+ zeros = np.array([0], dtype=dt)
+ ones = np.array([-1], dtype=dt)
+ msg = "dt = '%s'" % dt.char
+
+ assert_(np.bitwise_not(zeros).dtype == dt, msg)
+ assert_(np.bitwise_or(zeros, zeros).dtype == dt, msg)
+ assert_(np.bitwise_xor(zeros, zeros).dtype == dt, msg)
+ assert_(np.bitwise_and(zeros, zeros).dtype == dt, msg)
+
+ def test_identity(self):
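+        # -1 has all bits set, which makes it the identity for AND.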
+ assert_(np.bitwise_or.identity == 0, 'bitwise_or')
+ assert_(np.bitwise_xor.identity == 0, 'bitwise_xor')
+ assert_(np.bitwise_and.identity == -1, 'bitwise_and')
+
+ def test_reduction(self):
+ binary_funcs = (np.bitwise_or, np.bitwise_xor, np.bitwise_and)
+
+ for dt in self.bitwise_types:
+ zeros = np.array([0], dtype=dt)
+ ones = np.array([-1], dtype=dt)
+ for f in binary_funcs:
+ msg = "dt: '%s', f: '%s'" % (dt, f)
+ assert_equal(f.reduce(zeros), zeros, err_msg=msg)
+ assert_equal(f.reduce(ones), ones, err_msg=msg)
+
+ # Test empty reduction, no object dtype
+ for dt in self.bitwise_types[:-1]:
+ # No object array types
+ empty = np.array([], dtype=dt)
+ for f in binary_funcs:
+ msg = "dt: '%s', f: '%s'" % (dt, f)
+ tgt = np.array(f.identity, dtype=dt)
+ res = f.reduce(empty)
+ assert_equal(res, tgt, err_msg=msg)
+ assert_(res.dtype == tgt.dtype, msg)
+
+ # Empty object arrays use the identity. Note that the types may
+ # differ, the actual type used is determined by the assign_identity
+ # function and is not the same as the type returned by the identity
+ # method.
+ for f in binary_funcs:
+ msg = "dt: '%s'" % (f,)
+ empty = np.array([], dtype=object)
+ tgt = f.identity
+ res = f.reduce(empty)
+ assert_equal(res, tgt, err_msg=msg)
+
+ # Non-empty object arrays do not use the identity
+ for f in binary_funcs:
+ msg = "dt: '%s'" % (f,)
+ btype = np.array([True], dtype=object)
+ assert_(type(f.reduce(btype)) is bool, msg)
+
+
+class TestInt(object):
+ def test_logical_not(self):
+ x = np.ones(10, dtype=np.int16)
+ o = np.ones(10 * 2, dtype=bool)
+ tgt = o.copy()
+ tgt[::2] = False
+ os = o[::2]
+ assert_array_equal(np.logical_not(x, out=os), False)
+ assert_array_equal(o, tgt)
+
+
+class TestFloatingPoint(object):
+ def test_floating_point(self):
+ assert_equal(ncu.FLOATING_POINT_SUPPORT, 1)
+
+
+class TestDegrees(object):
+ def test_degrees(self):
+ assert_almost_equal(ncu.degrees(np.pi), 180.0)
+ assert_almost_equal(ncu.degrees(-0.5*np.pi), -90.0)
+
+
+class TestRadians(object):
+ def test_radians(self):
+ assert_almost_equal(ncu.radians(180.0), np.pi)
+ assert_almost_equal(ncu.radians(-90.0), -0.5*np.pi)
+
+
+class TestHeaviside(object):
+ def test_heaviside(self):
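+        # The second argument of heaviside() is the value returned where x == 0.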
+ x = np.array([[-30.0, -0.1, 0.0, 0.2], [7.5, np.nan, np.inf, -np.inf]])
+ expectedhalf = np.array([[0.0, 0.0, 0.5, 1.0], [1.0, np.nan, 1.0, 0.0]])
+ expected1 = expectedhalf.copy()
+ expected1[0, 2] = 1
+
+ h = ncu.heaviside(x, 0.5)
+ assert_equal(h, expectedhalf)
+
+ h = ncu.heaviside(x, 1.0)
+ assert_equal(h, expected1)
+
+ x = x.astype(np.float32)
+
+ h = ncu.heaviside(x, np.float32(0.5))
+ assert_equal(h, expectedhalf.astype(np.float32))
+
+ h = ncu.heaviside(x, np.float32(1.0))
+ assert_equal(h, expected1.astype(np.float32))
+
+
+class TestSign(object):
+ def test_sign(self):
+ a = np.array([np.inf, -np.inf, np.nan, 0.0, 3.0, -3.0])
+ out = np.zeros(a.shape)
+ tgt = np.array([1., -1., np.nan, 0.0, 1.0, -1.0])
+
+ with np.errstate(invalid='ignore'):
+ res = ncu.sign(a)
+ assert_equal(res, tgt)
+ res = ncu.sign(a, out)
+ assert_equal(res, tgt)
+ assert_equal(out, tgt)
+
+ def test_sign_dtype_object(self):
+ # In reference to github issue #6229
+
+ foo = np.array([-.1, 0, .1])
+ a = np.sign(foo.astype(object))
+ b = np.sign(foo)
+
+ assert_array_equal(a, b)
+
+ def test_sign_dtype_nan_object(self):
+ # In reference to github issue #6229
+ def test_nan():
+ foo = np.array([np.nan])
+ # FIXME: a not used
+ a = np.sign(foo.astype(object))
+
+ assert_raises(TypeError, test_nan)
+
+class TestMinMax(object):
+ def test_minmax_blocked(self):
+ # simd tests on max/min, test all alignments, slow but important
+ # for 2 * vz + 2 * (vs - 1) + 1 (unrolled once)
+ for dt, sz in [(np.float32, 15), (np.float64, 7)]:
+ for out, inp, msg in _gen_alignment_data(dtype=dt, type='unary',
+ max_size=sz):
+ for i in range(inp.size):
+ inp[:] = np.arange(inp.size, dtype=dt)
+ inp[i] = np.nan
+ emsg = lambda: '%r\n%s' % (inp, msg)
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning,
+ "invalid value encountered in reduce")
+ assert_(np.isnan(inp.max()), msg=emsg)
+ assert_(np.isnan(inp.min()), msg=emsg)
+
+ inp[i] = 1e10
+ assert_equal(inp.max(), 1e10, err_msg=msg)
+ inp[i] = -1e10
+ assert_equal(inp.min(), -1e10, err_msg=msg)
+
+ def test_lower_align(self):
+ # check data that is not aligned to element size
+        # i.e. doubles are aligned to 4 bytes on i386
+ d = np.zeros(23 * 8, dtype=np.int8)[4:-4].view(np.float64)
+ assert_equal(d.max(), d[0])
+ assert_equal(d.min(), d[0])
+
+ def test_reduce_reorder(self):
+        # gh-10370, gh-11029: some compilers reorder the call to npy_getfloatstatus
+        # and put it before the call to an intrinsic function that causes
+        # invalid status to be set. Also make sure warnings are not emitted.
+ for n in (2, 4, 8, 16, 32):
+ for dt in (np.float32, np.float16, np.complex64):
+ for r in np.diagflat(np.array([np.nan] * n, dtype=dt)):
+ assert_equal(np.min(r), np.nan)
+
+ def test_minimize_no_warns(self):
+ a = np.minimum(np.nan, 1)
+ assert_equal(a, np.nan)
+
+
+class TestAbsoluteNegative(object):
+ def test_abs_neg_blocked(self):
+ # simd tests on abs, test all alignments for vz + 2 * (vs - 1) + 1
+ for dt, sz in [(np.float32, 11), (np.float64, 5)]:
+ for out, inp, msg in _gen_alignment_data(dtype=dt, type='unary',
+ max_size=sz):
+ tgt = [ncu.absolute(i) for i in inp]
+ np.absolute(inp, out=out)
+ assert_equal(out, tgt, err_msg=msg)
+ assert_((out >= 0).all())
+
+ tgt = [-1*(i) for i in inp]
+ np.negative(inp, out=out)
+ assert_equal(out, tgt, err_msg=msg)
+
+ for v in [np.nan, -np.inf, np.inf]:
+ for i in range(inp.size):
+ d = np.arange(inp.size, dtype=dt)
+ inp[:] = -d
+ inp[i] = v
+ d[i] = -v if v == -np.inf else v
+ assert_array_equal(np.abs(inp), d, err_msg=msg)
+ np.abs(inp, out=out)
+ assert_array_equal(out, d, err_msg=msg)
+
+ assert_array_equal(-inp, -1*inp, err_msg=msg)
+ d = -1 * inp
+ np.negative(inp, out=out)
+ assert_array_equal(out, d, err_msg=msg)
+
+ def test_lower_align(self):
+ # check data that is not aligned to element size
+        # i.e. doubles are aligned to 4 bytes on i386
+ d = np.zeros(23 * 8, dtype=np.int8)[4:-4].view(np.float64)
+ assert_equal(np.abs(d), d)
+ assert_equal(np.negative(d), -d)
+ np.negative(d, out=d)
+ np.negative(np.ones_like(d), out=d)
+ np.abs(d, out=d)
+ np.abs(np.ones_like(d), out=d)
+
+
+class TestPositive(object):
+ def test_valid(self):
+ valid_dtypes = [int, float, complex, object]
+ for dtype in valid_dtypes:
+ x = np.arange(5, dtype=dtype)
+ result = np.positive(x)
+ assert_equal(x, result, err_msg=str(dtype))
+
+ def test_invalid(self):
+ with assert_raises(TypeError):
+ np.positive(True)
+ with assert_raises(TypeError):
+ np.positive(np.datetime64('2000-01-01'))
+ with assert_raises(TypeError):
+ np.positive(np.array(['foo'], dtype=str))
+ with assert_raises(TypeError):
+ np.positive(np.array(['bar'], dtype=object))
+
+
+class TestSpecialMethods(object):
+ def test_wrap(self):
+
+ class with_wrap(object):
+ def __array__(self):
+ return np.zeros(1)
+
+ def __array_wrap__(self, arr, context):
+ r = with_wrap()
+ r.arr = arr
+ r.context = context
+ return r
+
+ a = with_wrap()
+ x = ncu.minimum(a, a)
+ assert_equal(x.arr, np.zeros(1))
+ func, args, i = x.context
+ assert_(func is ncu.minimum)
+ assert_equal(len(args), 2)
+ assert_equal(args[0], a)
+ assert_equal(args[1], a)
+ assert_equal(i, 0)
+
+ def test_wrap_and_prepare_out(self):
+ # Calling convention for out should not affect how special methods are
+ # called
+
+ class StoreArrayPrepareWrap(np.ndarray):
+ _wrap_args = None
+ _prepare_args = None
+ def __new__(cls):
+ return np.empty(()).view(cls)
+ def __array_wrap__(self, obj, context):
+ self._wrap_args = context[1]
+ return obj
+ def __array_prepare__(self, obj, context):
+ self._prepare_args = context[1]
+ return obj
+ @property
+ def args(self):
+ # We need to ensure these are fetched at the same time, before
+                # any other ufuncs are called by the assertions
+ return (self._prepare_args, self._wrap_args)
+ def __repr__(self):
+ return "a" # for short test output
+
+ def do_test(f_call, f_expected):
+ a = StoreArrayPrepareWrap()
+ f_call(a)
+ p, w = a.args
+ expected = f_expected(a)
+ try:
+ assert_equal(p, expected)
+ assert_equal(w, expected)
+ except AssertionError as e:
+ # assert_equal produces truly useless error messages
+ raise AssertionError("\n".join([
+ "Bad arguments passed in ufunc call",
+ " expected: {}".format(expected),
+ " __array_prepare__ got: {}".format(p),
+ " __array_wrap__ got: {}".format(w)
+ ]))
+
+ # method not on the out argument
+ do_test(lambda a: np.add(a, 0), lambda a: (a, 0))
+ do_test(lambda a: np.add(a, 0, None), lambda a: (a, 0))
+ do_test(lambda a: np.add(a, 0, out=None), lambda a: (a, 0))
+ do_test(lambda a: np.add(a, 0, out=(None,)), lambda a: (a, 0))
+
+ # method on the out argument
+ do_test(lambda a: np.add(0, 0, a), lambda a: (0, 0, a))
+ do_test(lambda a: np.add(0, 0, out=a), lambda a: (0, 0, a))
+ do_test(lambda a: np.add(0, 0, out=(a,)), lambda a: (0, 0, a))
+
+ def test_wrap_with_iterable(self):
+ # test fix for bug #1026:
+
+ class with_wrap(np.ndarray):
+ __array_priority__ = 10
+
+ def __new__(cls):
+ return np.asarray(1).view(cls).copy()
+
+ def __array_wrap__(self, arr, context):
+ return arr.view(type(self))
+
+ a = with_wrap()
+ x = ncu.multiply(a, (1, 2, 3))
+ assert_(isinstance(x, with_wrap))
+ assert_array_equal(x, np.array((1, 2, 3)))
+
+ def test_priority_with_scalar(self):
+ # test fix for bug #826:
+
+ class A(np.ndarray):
+ __array_priority__ = 10
+
+ def __new__(cls):
+ return np.asarray(1.0, 'float64').view(cls).copy()
+
+ a = A()
+ x = np.float64(1)*a
+ assert_(isinstance(x, A))
+ assert_array_equal(x, np.array(1))
+
+ def test_old_wrap(self):
+
+ class with_wrap(object):
+ def __array__(self):
+ return np.zeros(1)
+
+ def __array_wrap__(self, arr):
+ r = with_wrap()
+ r.arr = arr
+ return r
+
+ a = with_wrap()
+ x = ncu.minimum(a, a)
+ assert_equal(x.arr, np.zeros(1))
+
+ def test_priority(self):
+
+ class A(object):
+ def __array__(self):
+ return np.zeros(1)
+
+ def __array_wrap__(self, arr, context):
+ r = type(self)()
+ r.arr = arr
+ r.context = context
+ return r
+
+ class B(A):
+ __array_priority__ = 20.
+
+ class C(A):
+ __array_priority__ = 40.
+
+ x = np.zeros(1)
+ a = A()
+ b = B()
+ c = C()
+ f = ncu.minimum
+ assert_(type(f(x, x)) is np.ndarray)
+ assert_(type(f(x, a)) is A)
+ assert_(type(f(x, b)) is B)
+ assert_(type(f(x, c)) is C)
+ assert_(type(f(a, x)) is A)
+ assert_(type(f(b, x)) is B)
+ assert_(type(f(c, x)) is C)
+
+ assert_(type(f(a, a)) is A)
+ assert_(type(f(a, b)) is B)
+ assert_(type(f(b, a)) is B)
+ assert_(type(f(b, b)) is B)
+ assert_(type(f(b, c)) is C)
+ assert_(type(f(c, b)) is C)
+ assert_(type(f(c, c)) is C)
+
+        assert_(type(ncu.exp(a)) is A)
+        assert_(type(ncu.exp(b)) is B)
+        assert_(type(ncu.exp(c)) is C)
+
+ def test_failing_wrap(self):
+
+ class A(object):
+ def __array__(self):
+ return np.zeros(2)
+
+ def __array_wrap__(self, arr, context):
+ raise RuntimeError
+
+ a = A()
+ assert_raises(RuntimeError, ncu.maximum, a, a)
+ assert_raises(RuntimeError, ncu.maximum.reduce, a)
+
+ def test_failing_out_wrap(self):
+
+ singleton = np.array([1.0])
+
+ class Ok(np.ndarray):
+ def __array_wrap__(self, obj):
+ return singleton
+
+ class Bad(np.ndarray):
+ def __array_wrap__(self, obj):
+ raise RuntimeError
+
+ ok = np.empty(1).view(Ok)
+ bad = np.empty(1).view(Bad)
+
+ # double-free (segfault) of "ok" if "bad" raises an exception
+ for i in range(10):
+ assert_raises(RuntimeError, ncu.frexp, 1, ok, bad)
+
+ def test_none_wrap(self):
+ # Tests that issue #8507 is resolved. Previously, this would segfault
+
+ class A(object):
+ def __array__(self):
+ return np.zeros(1)
+
+ def __array_wrap__(self, arr, context=None):
+ return None
+
+ a = A()
+ assert_equal(ncu.maximum(a, a), None)
+
+ def test_default_prepare(self):
+
+ class with_wrap(object):
+ __array_priority__ = 10
+
+ def __array__(self):
+ return np.zeros(1)
+
+ def __array_wrap__(self, arr, context):
+ return arr
+
+ a = with_wrap()
+ x = ncu.minimum(a, a)
+ assert_equal(x, np.zeros(1))
+ assert_equal(type(x), np.ndarray)
+
+ def test_prepare(self):
+
+ class with_prepare(np.ndarray):
+ __array_priority__ = 10
+
+ def __array_prepare__(self, arr, context):
+                # make sure we can return a new array
+ return np.array(arr).view(type=with_prepare)
+
+ a = np.array(1).view(type=with_prepare)
+ x = np.add(a, a)
+ assert_equal(x, np.array(2))
+ assert_equal(type(x), with_prepare)
+
+ def test_prepare_out(self):
+
+ class with_prepare(np.ndarray):
+ __array_priority__ = 10
+
+ def __array_prepare__(self, arr, context):
+ return np.array(arr).view(type=with_prepare)
+
+ a = np.array([1]).view(type=with_prepare)
+ x = np.add(a, a, a)
+ # Returned array is new, because of the strange
+ # __array_prepare__ above
+ assert_(not np.shares_memory(x, a))
+ assert_equal(x, np.array([2]))
+ assert_equal(type(x), with_prepare)
+
+ def test_failing_prepare(self):
+
+ class A(object):
+ def __array__(self):
+ return np.zeros(1)
+
+ def __array_prepare__(self, arr, context=None):
+ raise RuntimeError
+
+ a = A()
+ assert_raises(RuntimeError, ncu.maximum, a, a)
+
+ def test_array_with_context(self):
+
+ class A(object):
+ def __array__(self, dtype=None, context=None):
+ func, args, i = context
+ self.func = func
+ self.args = args
+ self.i = i
+ return np.zeros(1)
+
+ class B(object):
+ def __array__(self, dtype=None):
+ return np.zeros(1, dtype)
+
+ class C(object):
+ def __array__(self):
+ return np.zeros(1)
+
+ a = A()
+ ncu.maximum(np.zeros(1), a)
+ assert_(a.func is ncu.maximum)
+ assert_equal(a.args[0], 0)
+ assert_(a.args[1] is a)
+ assert_(a.i == 1)
+ assert_equal(ncu.maximum(a, B()), 0)
+ assert_equal(ncu.maximum(a, C()), 0)
+
+ def test_ufunc_override(self):
+ # check override works even with instance with high priority.
+ class A(object):
+ def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ return self, func, method, inputs, kwargs
+
+ class MyNDArray(np.ndarray):
+ __array_priority__ = 100
+
+ a = A()
+ b = np.array([1]).view(MyNDArray)
+ res0 = np.multiply(a, b)
+ res1 = np.multiply(b, b, out=a)
+
+ # self
+ assert_equal(res0[0], a)
+ assert_equal(res1[0], a)
+ assert_equal(res0[1], np.multiply)
+ assert_equal(res1[1], np.multiply)
+ assert_equal(res0[2], '__call__')
+ assert_equal(res1[2], '__call__')
+ assert_equal(res0[3], (a, b))
+ assert_equal(res1[3], (b, b))
+ assert_equal(res0[4], {})
+ assert_equal(res1[4], {'out': (a,)})
+
+ def test_ufunc_override_mro(self):
+
+ # Some multi arg functions for testing.
+ def tres_mul(a, b, c):
+ return a * b * c
+
+ def quatro_mul(a, b, c, d):
+ return a * b * c * d
+
+ # Make these into ufuncs.
+ three_mul_ufunc = np.frompyfunc(tres_mul, 3, 1)
+ four_mul_ufunc = np.frompyfunc(quatro_mul, 4, 1)
+
+ class A(object):
+ def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ return "A"
+
+ class ASub(A):
+ def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ return "ASub"
+
+ class B(object):
+ def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ return "B"
+
+ class C(object):
+ def __init__(self):
+ self.count = 0
+
+ def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ self.count += 1
+ return NotImplemented
+
+ class CSub(C):
+ def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ self.count += 1
+ return NotImplemented
+
+ a = A()
+ a_sub = ASub()
+ b = B()
+ c = C()
+
+ # Standard
+ res = np.multiply(a, a_sub)
+ assert_equal(res, "ASub")
+ res = np.multiply(a_sub, b)
+ assert_equal(res, "ASub")
+
+ # With 1 NotImplemented
+ res = np.multiply(c, a)
+ assert_equal(res, "A")
+ assert_equal(c.count, 1)
+ # Check our counter works, so we can trust tests below.
+ res = np.multiply(c, a)
+ assert_equal(c.count, 2)
+
+ # Both NotImplemented.
+ c = C()
+ c_sub = CSub()
+ assert_raises(TypeError, np.multiply, c, c_sub)
+ assert_equal(c.count, 1)
+ assert_equal(c_sub.count, 1)
+ c.count = c_sub.count = 0
+ assert_raises(TypeError, np.multiply, c_sub, c)
+ assert_equal(c.count, 1)
+ assert_equal(c_sub.count, 1)
+ c.count = 0
+ assert_raises(TypeError, np.multiply, c, c)
+ assert_equal(c.count, 1)
+ c.count = 0
+ assert_raises(TypeError, np.multiply, 2, c)
+ assert_equal(c.count, 1)
+
+ # Ternary testing.
+ assert_equal(three_mul_ufunc(a, 1, 2), "A")
+ assert_equal(three_mul_ufunc(1, a, 2), "A")
+ assert_equal(three_mul_ufunc(1, 2, a), "A")
+
+ assert_equal(three_mul_ufunc(a, a, 6), "A")
+ assert_equal(three_mul_ufunc(a, 2, a), "A")
+ assert_equal(three_mul_ufunc(a, 2, b), "A")
+ assert_equal(three_mul_ufunc(a, 2, a_sub), "ASub")
+ assert_equal(three_mul_ufunc(a, a_sub, 3), "ASub")
+ c.count = 0
+ assert_equal(three_mul_ufunc(c, a_sub, 3), "ASub")
+ assert_equal(c.count, 1)
+ c.count = 0
+ assert_equal(three_mul_ufunc(1, a_sub, c), "ASub")
+ assert_equal(c.count, 0)
+
+ c.count = 0
+ assert_equal(three_mul_ufunc(a, b, c), "A")
+ assert_equal(c.count, 0)
+ c_sub.count = 0
+ assert_equal(three_mul_ufunc(a, b, c_sub), "A")
+ assert_equal(c_sub.count, 0)
+ assert_equal(three_mul_ufunc(1, 2, b), "B")
+
+ assert_raises(TypeError, three_mul_ufunc, 1, 2, c)
+ assert_raises(TypeError, three_mul_ufunc, c_sub, 2, c)
+ assert_raises(TypeError, three_mul_ufunc, c_sub, 2, 3)
+
+ # Quaternary testing.
+ assert_equal(four_mul_ufunc(a, 1, 2, 3), "A")
+ assert_equal(four_mul_ufunc(1, a, 2, 3), "A")
+ assert_equal(four_mul_ufunc(1, 1, a, 3), "A")
+ assert_equal(four_mul_ufunc(1, 1, 2, a), "A")
+
+ assert_equal(four_mul_ufunc(a, b, 2, 3), "A")
+ assert_equal(four_mul_ufunc(1, a, 2, b), "A")
+ assert_equal(four_mul_ufunc(b, 1, a, 3), "B")
+ assert_equal(four_mul_ufunc(a_sub, 1, 2, a), "ASub")
+ assert_equal(four_mul_ufunc(a, 1, 2, a_sub), "ASub")
+
+ c = C()
+ c_sub = CSub()
+ assert_raises(TypeError, four_mul_ufunc, 1, 2, 3, c)
+ assert_equal(c.count, 1)
+ c.count = 0
+ assert_raises(TypeError, four_mul_ufunc, 1, 2, c_sub, c)
+ assert_equal(c_sub.count, 1)
+ assert_equal(c.count, 1)
+ c2 = C()
+ c.count = c_sub.count = 0
+ assert_raises(TypeError, four_mul_ufunc, 1, c, c_sub, c2)
+ assert_equal(c_sub.count, 1)
+ assert_equal(c.count, 1)
+ assert_equal(c2.count, 0)
+ c.count = c2.count = c_sub.count = 0
+ assert_raises(TypeError, four_mul_ufunc, c2, c, c_sub, c)
+ assert_equal(c_sub.count, 1)
+ assert_equal(c.count, 0)
+ assert_equal(c2.count, 1)
+
+ def test_ufunc_override_methods(self):
+
+ class A(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ return self, ufunc, method, inputs, kwargs
+
+ # __call__
+ a = A()
+ res = np.multiply.__call__(1, a, foo='bar', answer=42)
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], '__call__')
+ assert_equal(res[3], (1, a))
+ assert_equal(res[4], {'foo': 'bar', 'answer': 42})
+
+ # __call__, wrong args
+ assert_raises(TypeError, np.multiply, a)
+ assert_raises(TypeError, np.multiply, a, a, a, a)
+ assert_raises(TypeError, np.multiply, a, a, sig='a', signature='a')
+ assert_raises(TypeError, ncu_tests.inner1d, a, a, axis=0, axes=[0, 0])
+
+ # reduce, positional args
+ res = np.multiply.reduce(a, 'axis0', 'dtype0', 'out0', 'keep0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'reduce')
+ assert_equal(res[3], (a,))
+ assert_equal(res[4], {'dtype':'dtype0',
+ 'out': ('out0',),
+ 'keepdims': 'keep0',
+ 'axis': 'axis0'})
+
+ # reduce, kwargs
+ res = np.multiply.reduce(a, axis='axis0', dtype='dtype0', out='out0',
+ keepdims='keep0', initial='init0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'reduce')
+ assert_equal(res[3], (a,))
+ assert_equal(res[4], {'dtype':'dtype0',
+ 'out': ('out0',),
+ 'keepdims': 'keep0',
+ 'axis': 'axis0',
+ 'initial': 'init0'})
+
+ # reduce, output equal to None removed, but not other explicit ones,
+ # even if they are at their default value.
+ res = np.multiply.reduce(a, 0, None, None, False)
+ assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False})
+ res = np.multiply.reduce(a, out=None, axis=0, keepdims=True)
+ assert_equal(res[4], {'axis': 0, 'keepdims': True})
+ res = np.multiply.reduce(a, None, out=(None,), dtype=None)
+ assert_equal(res[4], {'axis': None, 'dtype': None})
+ res = np.multiply.reduce(a, 0, None, None, False, 2)
+ assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': 2})
+ # np._NoValue ignored for initial.
+ res = np.multiply.reduce(a, 0, None, None, False, np._NoValue)
+ assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False})
+ # None kept for initial.
+ res = np.multiply.reduce(a, 0, None, None, False, None)
+ assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': None})
+
+ # reduce, wrong args
+ assert_raises(ValueError, np.multiply.reduce, a, out=())
+ assert_raises(ValueError, np.multiply.reduce, a, out=('out0', 'out1'))
+ assert_raises(TypeError, np.multiply.reduce, a, 'axis0', axis='axis0')
+
+ # accumulate, pos args
+ res = np.multiply.accumulate(a, 'axis0', 'dtype0', 'out0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'accumulate')
+ assert_equal(res[3], (a,))
+ assert_equal(res[4], {'dtype':'dtype0',
+ 'out': ('out0',),
+ 'axis': 'axis0'})
+
+ # accumulate, kwargs
+ res = np.multiply.accumulate(a, axis='axis0', dtype='dtype0',
+ out='out0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'accumulate')
+ assert_equal(res[3], (a,))
+ assert_equal(res[4], {'dtype':'dtype0',
+ 'out': ('out0',),
+ 'axis': 'axis0'})
+
+ # accumulate, output equal to None removed.
+ res = np.multiply.accumulate(a, 0, None, None)
+ assert_equal(res[4], {'axis': 0, 'dtype': None})
+ res = np.multiply.accumulate(a, out=None, axis=0, dtype='dtype1')
+ assert_equal(res[4], {'axis': 0, 'dtype': 'dtype1'})
+ res = np.multiply.accumulate(a, None, out=(None,), dtype=None)
+ assert_equal(res[4], {'axis': None, 'dtype': None})
+
+ # accumulate, wrong args
+ assert_raises(ValueError, np.multiply.accumulate, a, out=())
+ assert_raises(ValueError, np.multiply.accumulate, a,
+ out=('out0', 'out1'))
+ assert_raises(TypeError, np.multiply.accumulate, a,
+ 'axis0', axis='axis0')
+
+ # reduceat, pos args
+ res = np.multiply.reduceat(a, [4, 2], 'axis0', 'dtype0', 'out0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'reduceat')
+ assert_equal(res[3], (a, [4, 2]))
+ assert_equal(res[4], {'dtype':'dtype0',
+ 'out': ('out0',),
+ 'axis': 'axis0'})
+
+ # reduceat, kwargs
+ res = np.multiply.reduceat(a, [4, 2], axis='axis0', dtype='dtype0',
+ out='out0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'reduceat')
+ assert_equal(res[3], (a, [4, 2]))
+ assert_equal(res[4], {'dtype':'dtype0',
+ 'out': ('out0',),
+ 'axis': 'axis0'})
+
+ # reduceat, output equal to None removed.
+ res = np.multiply.reduceat(a, [4, 2], 0, None, None)
+ assert_equal(res[4], {'axis': 0, 'dtype': None})
+ res = np.multiply.reduceat(a, [4, 2], axis=None, out=None, dtype='dt')
+ assert_equal(res[4], {'axis': None, 'dtype': 'dt'})
+ res = np.multiply.reduceat(a, [4, 2], None, None, out=(None,))
+ assert_equal(res[4], {'axis': None, 'dtype': None})
+
+ # reduceat, wrong args
+        assert_raises(ValueError, np.multiply.reduceat, a, [4, 2], out=())
+        assert_raises(ValueError, np.multiply.reduceat, a, [4, 2],
+                      out=('out0', 'out1'))
+        assert_raises(TypeError, np.multiply.reduceat, a, [4, 2],
+                      'axis0', axis='axis0')
+
+ # outer
+ res = np.multiply.outer(a, 42)
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'outer')
+ assert_equal(res[3], (a, 42))
+ assert_equal(res[4], {})
+
+ # outer, wrong args
+ assert_raises(TypeError, np.multiply.outer, a)
+ assert_raises(TypeError, np.multiply.outer, a, a, a, a)
+ assert_raises(TypeError, np.multiply.outer, a, a, sig='a', signature='a')
+
+ # at
+ res = np.multiply.at(a, [4, 2], 'b0')
+ assert_equal(res[0], a)
+ assert_equal(res[1], np.multiply)
+ assert_equal(res[2], 'at')
+ assert_equal(res[3], (a, [4, 2], 'b0'))
+
+ # at, wrong args
+ assert_raises(TypeError, np.multiply.at, a)
+ assert_raises(TypeError, np.multiply.at, a, a, a, a)
+
+ def test_ufunc_override_out(self):
+
+ class A(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ return kwargs
+
+ class B(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ return kwargs
+
+ a = A()
+ b = B()
+ res0 = np.multiply(a, b, 'out_arg')
+ res1 = np.multiply(a, b, out='out_arg')
+ res2 = np.multiply(2, b, 'out_arg')
+ res3 = np.multiply(3, b, out='out_arg')
+ res4 = np.multiply(a, 4, 'out_arg')
+ res5 = np.multiply(a, 5, out='out_arg')
+
+ assert_equal(res0['out'][0], 'out_arg')
+ assert_equal(res1['out'][0], 'out_arg')
+ assert_equal(res2['out'][0], 'out_arg')
+ assert_equal(res3['out'][0], 'out_arg')
+ assert_equal(res4['out'][0], 'out_arg')
+ assert_equal(res5['out'][0], 'out_arg')
+
+ # ufuncs with multiple output modf and frexp.
+ res6 = np.modf(a, 'out0', 'out1')
+ res7 = np.frexp(a, 'out0', 'out1')
+ assert_equal(res6['out'][0], 'out0')
+ assert_equal(res6['out'][1], 'out1')
+ assert_equal(res7['out'][0], 'out0')
+ assert_equal(res7['out'][1], 'out1')
+
+ # While we're at it, check that default output is never passed on.
+ assert_(np.sin(a, None) == {})
+ assert_(np.sin(a, out=None) == {})
+ assert_(np.sin(a, out=(None,)) == {})
+ assert_(np.modf(a, None) == {})
+ assert_(np.modf(a, None, None) == {})
+ assert_(np.modf(a, out=(None, None)) == {})
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', DeprecationWarning)
+ assert_(np.modf(a, out=None) == {})
+ assert_(w[0].category is DeprecationWarning)
+
+ # don't give positional and output argument, or too many arguments.
+ # wrong number of arguments in the tuple is an error too.
+ assert_raises(TypeError, np.multiply, a, b, 'one', out='two')
+ assert_raises(TypeError, np.multiply, a, b, 'one', 'two')
+ assert_raises(ValueError, np.multiply, a, b, out=('one', 'two'))
+ assert_raises(ValueError, np.multiply, a, out=())
+ assert_raises(TypeError, np.modf, a, 'one', out=('two', 'three'))
+ assert_raises(TypeError, np.modf, a, 'one', 'two', 'three')
+ assert_raises(ValueError, np.modf, a, out=('one', 'two', 'three'))
+ assert_raises(ValueError, np.modf, a, out=('one',))
+
+ def test_ufunc_override_exception(self):
+
+ class A(object):
+ def __array_ufunc__(self, *a, **kwargs):
+ raise ValueError("oops")
+
+ a = A()
+ assert_raises(ValueError, np.negative, 1, out=a)
+ assert_raises(ValueError, np.negative, a)
+ assert_raises(ValueError, np.divide, 1., a)
+
+ def test_ufunc_override_not_implemented(self):
+
+ class A(object):
+ def __array_ufunc__(self, *args, **kwargs):
+ return NotImplemented
+
+ msg = ("operand type(s) all returned NotImplemented from "
+ "__array_ufunc__(<ufunc 'negative'>, '__call__', <*>): 'A'")
+ with assert_raises_regex(TypeError, fnmatch.translate(msg)):
+ np.negative(A())
+
+ msg = ("operand type(s) all returned NotImplemented from "
+ "__array_ufunc__(<ufunc 'add'>, '__call__', <*>, <object *>, "
+ "out=(1,)): 'A', 'object', 'int'")
+ with assert_raises_regex(TypeError, fnmatch.translate(msg)):
+ np.add(A(), object(), out=1)
+
+ def test_ufunc_override_disabled(self):
+
+ class OptOut(object):
+ __array_ufunc__ = None
+
+ opt_out = OptOut()
+
+ # ufuncs always raise
+ msg = "operand 'OptOut' does not support ufuncs"
+ with assert_raises_regex(TypeError, msg):
+ np.add(opt_out, 1)
+ with assert_raises_regex(TypeError, msg):
+ np.add(1, opt_out)
+ with assert_raises_regex(TypeError, msg):
+ np.negative(opt_out)
+
+ # opt-outs still hold even when other arguments have pathological
+ # __array_ufunc__ implementations
+
+ class GreedyArray(object):
+ def __array_ufunc__(self, *args, **kwargs):
+ return self
+
+ greedy = GreedyArray()
+ assert_(np.negative(greedy) is greedy)
+ with assert_raises_regex(TypeError, msg):
+ np.add(greedy, opt_out)
+ with assert_raises_regex(TypeError, msg):
+ np.add(greedy, 1, out=opt_out)
+
+ def test_gufunc_override(self):
+        # gufuncs are just ufunc instances, but follow a different path,
+ # so check __array_ufunc__ overrides them properly.
+ class A(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ return self, ufunc, method, inputs, kwargs
+
+ inner1d = ncu_tests.inner1d
+ a = A()
+ res = inner1d(a, a)
+ assert_equal(res[0], a)
+ assert_equal(res[1], inner1d)
+ assert_equal(res[2], '__call__')
+ assert_equal(res[3], (a, a))
+ assert_equal(res[4], {})
+
+ res = inner1d(1, 1, out=a)
+ assert_equal(res[0], a)
+ assert_equal(res[1], inner1d)
+ assert_equal(res[2], '__call__')
+ assert_equal(res[3], (1, 1))
+ assert_equal(res[4], {'out': (a,)})
+
+ # wrong number of arguments in the tuple is an error too.
+ assert_raises(TypeError, inner1d, a, out='two')
+ assert_raises(TypeError, inner1d, a, a, 'one', out='two')
+ assert_raises(TypeError, inner1d, a, a, 'one', 'two')
+ assert_raises(ValueError, inner1d, a, a, out=('one', 'two'))
+ assert_raises(ValueError, inner1d, a, a, out=())
+
+ def test_ufunc_override_with_super(self):
+ # NOTE: this class is given as an example in doc/subclassing.py;
+ # if you make any changes here, do update it there too.
+ class A(np.ndarray):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ args = []
+ in_no = []
+ for i, input_ in enumerate(inputs):
+ if isinstance(input_, A):
+ in_no.append(i)
+ args.append(input_.view(np.ndarray))
+ else:
+ args.append(input_)
+
+ outputs = kwargs.pop('out', None)
+ out_no = []
+ if outputs:
+ out_args = []
+ for j, output in enumerate(outputs):
+ if isinstance(output, A):
+ out_no.append(j)
+ out_args.append(output.view(np.ndarray))
+ else:
+ out_args.append(output)
+ kwargs['out'] = tuple(out_args)
+ else:
+ outputs = (None,) * ufunc.nout
+
+ info = {}
+ if in_no:
+ info['inputs'] = in_no
+ if out_no:
+ info['outputs'] = out_no
+
+ results = super(A, self).__array_ufunc__(ufunc, method,
+ *args, **kwargs)
+ if results is NotImplemented:
+ return NotImplemented
+
+ if method == 'at':
+ if isinstance(inputs[0], A):
+ inputs[0].info = info
+ return
+
+ if ufunc.nout == 1:
+ results = (results,)
+
+ results = tuple((np.asarray(result).view(A)
+ if output is None else output)
+ for result, output in zip(results, outputs))
+ if results and isinstance(results[0], A):
+ results[0].info = info
+
+ return results[0] if len(results) == 1 else results
+
+ class B(object):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ if any(isinstance(input_, A) for input_ in inputs):
+ return "A!"
+ else:
+ return NotImplemented
+
+ d = np.arange(5.)
+ # 1 input, 1 output
+ a = np.arange(5.).view(A)
+ b = np.sin(a)
+ check = np.sin(d)
+ assert_(np.all(check == b))
+ assert_equal(b.info, {'inputs': [0]})
+ b = np.sin(d, out=(a,))
+ assert_(np.all(check == b))
+ assert_equal(b.info, {'outputs': [0]})
+ assert_(b is a)
+ a = np.arange(5.).view(A)
+ b = np.sin(a, out=a)
+ assert_(np.all(check == b))
+ assert_equal(b.info, {'inputs': [0], 'outputs': [0]})
+
+ # 1 input, 2 outputs
+ a = np.arange(5.).view(A)
+ b1, b2 = np.modf(a)
+ assert_equal(b1.info, {'inputs': [0]})
+ b1, b2 = np.modf(d, out=(None, a))
+ assert_(b2 is a)
+ assert_equal(b1.info, {'outputs': [1]})
+ a = np.arange(5.).view(A)
+ b = np.arange(5.).view(A)
+ c1, c2 = np.modf(a, out=(a, b))
+ assert_(c1 is a)
+ assert_(c2 is b)
+ assert_equal(c1.info, {'inputs': [0], 'outputs': [0, 1]})
+
+ # 2 input, 1 output
+ a = np.arange(5.).view(A)
+ b = np.arange(5.).view(A)
+ c = np.add(a, b, out=a)
+ assert_(c is a)
+ assert_equal(c.info, {'inputs': [0, 1], 'outputs': [0]})
+ # some tests with a non-ndarray subclass
+ a = np.arange(5.)
+ b = B()
+ assert_(a.__array_ufunc__(np.add, '__call__', a, b) is NotImplemented)
+ assert_(b.__array_ufunc__(np.add, '__call__', a, b) is NotImplemented)
+ assert_raises(TypeError, np.add, a, b)
+ a = a.view(A)
+ assert_(a.__array_ufunc__(np.add, '__call__', a, b) is NotImplemented)
+ assert_(b.__array_ufunc__(np.add, '__call__', a, b) == "A!")
+ assert_(np.add(a, b) == "A!")
+ # regression check for gh-9102 -- tests ufunc.reduce implicitly.
+ d = np.array([[1, 2, 3], [1, 2, 3]])
+ a = d.view(A)
+ c = a.any()
+ check = d.any()
+ assert_equal(c, check)
+ assert_(c.info, {'inputs': [0]})
+ c = a.max()
+ check = d.max()
+ assert_equal(c, check)
+ assert_(c.info, {'inputs': [0]})
+ b = np.array(0).view(A)
+ c = a.max(out=b)
+ assert_equal(c, check)
+ assert_(c is b)
+ assert_(c.info, {'inputs': [0], 'outputs': [0]})
+ check = a.max(axis=0)
+ b = np.zeros_like(check).view(A)
+ c = a.max(axis=0, out=b)
+ assert_equal(c, check)
+ assert_(c is b)
+ assert_(c.info, {'inputs': [0], 'outputs': [0]})
+ # simple explicit tests of reduce, accumulate, reduceat
+ check = np.add.reduce(d, axis=1)
+ c = np.add.reduce(a, axis=1)
+ assert_equal(c, check)
+ assert_(c.info, {'inputs': [0]})
+ b = np.zeros_like(c)
+ c = np.add.reduce(a, 1, None, b)
+ assert_equal(c, check)
+ assert_(c is b)
+ assert_(c.info, {'inputs': [0], 'outputs': [0]})
+ check = np.add.accumulate(d, axis=0)
+ c = np.add.accumulate(a, axis=0)
+ assert_equal(c, check)
+ assert_(c.info, {'inputs': [0]})
+ b = np.zeros_like(c)
+ c = np.add.accumulate(a, 0, None, b)
+ assert_equal(c, check)
+ assert_(c is b)
+ assert_(c.info, {'inputs': [0], 'outputs': [0]})
+ indices = [0, 2, 1]
+ check = np.add.reduceat(d, indices, axis=1)
+ c = np.add.reduceat(a, indices, axis=1)
+ assert_equal(c, check)
+ assert_(c.info, {'inputs': [0]})
+ b = np.zeros_like(c)
+ c = np.add.reduceat(a, indices, 1, None, b)
+ assert_equal(c, check)
+ assert_(c is b)
+ assert_(c.info, {'inputs': [0], 'outputs': [0]})
+ # and a few tests for at
+ d = np.array([[1, 2, 3], [1, 2, 3]])
+ check = d.copy()
+ a = d.copy().view(A)
+ np.add.at(check, ([0, 1], [0, 2]), 1.)
+ np.add.at(a, ([0, 1], [0, 2]), 1.)
+ assert_equal(a, check)
+ assert_(a.info, {'inputs': [0]})
+ b = np.array(1.).view(A)
+ a = d.copy().view(A)
+ np.add.at(a, ([0, 1], [0, 2]), b)
+ assert_equal(a, check)
+ assert_(a.info, {'inputs': [0, 2]})
+
+
+class TestChoose(object):
+ def test_mixed(self):
+ c = np.array([True, True])
+ a = np.array([True, True])
+ assert_equal(np.choose(c, (a, 1)), np.array([1, 1]))
+
+
+class TestRationalFunctions(object):
+ def test_lcm(self):
+ self._test_lcm_inner(np.int16)
+ self._test_lcm_inner(np.uint16)
+
+ def test_lcm_object(self):
+ self._test_lcm_inner(np.object_)
+
+ def test_gcd(self):
+ self._test_gcd_inner(np.int16)
+        self._test_gcd_inner(np.uint16)
+
+ def test_gcd_object(self):
+ self._test_gcd_inner(np.object_)
+
+ def _test_lcm_inner(self, dtype):
+ # basic use
+ a = np.array([12, 120], dtype=dtype)
+ b = np.array([20, 200], dtype=dtype)
+ assert_equal(np.lcm(a, b), [60, 600])
+
+ if not issubclass(dtype, np.unsignedinteger):
+ # negatives are ignored
+ a = np.array([12, -12, 12, -12], dtype=dtype)
+ b = np.array([20, 20, -20, -20], dtype=dtype)
+ assert_equal(np.lcm(a, b), [60]*4)
+
+ # reduce
+ a = np.array([3, 12, 20], dtype=dtype)
+        assert_equal(np.lcm.reduce(a), 60)
+
+ # broadcasting, and a test including 0
+ a = np.arange(6).astype(dtype)
+ b = 20
+ assert_equal(np.lcm(a, b), [0, 20, 20, 60, 20, 20])
+
+ def _test_gcd_inner(self, dtype):
+ # basic use
+ a = np.array([12, 120], dtype=dtype)
+ b = np.array([20, 200], dtype=dtype)
+ assert_equal(np.gcd(a, b), [4, 40])
+
+ if not issubclass(dtype, np.unsignedinteger):
+ # negatives are ignored
+ a = np.array([12, -12, 12, -12], dtype=dtype)
+ b = np.array([20, 20, -20, -20], dtype=dtype)
+ assert_equal(np.gcd(a, b), [4]*4)
+
+ # reduce
+ a = np.array([15, 25, 35], dtype=dtype)
+ assert_equal(np.gcd.reduce(a), 5)
+
+ # broadcasting, and a test including 0
+ a = np.arange(6).astype(dtype)
+ b = 20
+ assert_equal(np.gcd(a, b), [20, 1, 2, 1, 4, 5])
+
+ def test_lcm_overflow(self):
+ # verify that we don't overflow when a*b does overflow
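+        # (lcm(a, b) equals a // gcd(a, b) * b, so the full product a*b
+        # never needs to be representable.)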
+ big = np.int32(np.iinfo(np.int32).max // 11)
+ a = 2*big
+ b = 5*big
+ assert_equal(np.lcm(a, b), 10*big)
+
+ def test_gcd_overflow(self):
+ for dtype in (np.int32, np.int64):
+ # verify that we don't overflow when taking abs(x)
+ # not relevant for lcm, where the result is unrepresentable anyway
+ a = dtype(np.iinfo(dtype).min) # negative power of two
+ q = -(a // 4)
+ assert_equal(np.gcd(a, q*3), q)
+ assert_equal(np.gcd(a, -q*3), q)
+
+ def test_decimal(self):
+ from decimal import Decimal
+ a = np.array([1, 1, -1, -1]) * Decimal('0.20')
+ b = np.array([1, -1, 1, -1]) * Decimal('0.12')
+
+ assert_equal(np.gcd(a, b), 4*[Decimal('0.04')])
+ assert_equal(np.lcm(a, b), 4*[Decimal('0.60')])
+
+ def test_float(self):
+ # not well-defined on float due to rounding errors
+ assert_raises(TypeError, np.gcd, 0.3, 0.4)
+ assert_raises(TypeError, np.lcm, 0.3, 0.4)
+
+ def test_builtin_long(self):
+ # sanity check that array coercion is alright for builtin longs
+ assert_equal(np.array(2**200).item(), 2**200)
+
+ # expressed as prime factors
+ a = np.array(2**100 * 3**5)
+ b = np.array([2**100 * 5**7, 2**50 * 3**10])
+ assert_equal(np.gcd(a, b), [2**100, 2**50 * 3**5])
+ assert_equal(np.lcm(a, b), [2**100 * 3**5 * 5**7, 2**100 * 3**10])
+
+ assert_equal(np.gcd(2**100, 3**100), 1)
+
+
+class TestComplexFunctions(object):
+ funcs = [np.arcsin, np.arccos, np.arctan, np.arcsinh, np.arccosh,
+ np.arctanh, np.sin, np.cos, np.tan, np.exp,
+ np.exp2, np.log, np.sqrt, np.log10, np.log2,
+ np.log1p]
+
+ def test_it(self):
+ for f in self.funcs:
+ if f is np.arccosh:
+ x = 1.5
+ else:
+ x = .5
+ fr = f(x)
+ fz = f(complex(x))
+ assert_almost_equal(fz.real, fr, err_msg='real part %s' % f)
+ assert_almost_equal(fz.imag, 0., err_msg='imag part %s' % f)
+
+ def test_precisions_consistent(self):
+ z = 1 + 1j
+ for f in self.funcs:
+ fcf = f(np.csingle(z))
+ fcd = f(np.cdouble(z))
+ fcl = f(np.clongdouble(z))
+ assert_almost_equal(fcf, fcd, decimal=6, err_msg='fch-fcd %s' % f)
+ assert_almost_equal(fcl, fcd, decimal=15, err_msg='fch-fcl %s' % f)
+
+ def test_branch_cuts(self):
+ # check branch cuts and continuity on them
+ _check_branch_cut(np.log, -0.5, 1j, 1, -1, True)
+ _check_branch_cut(np.log2, -0.5, 1j, 1, -1, True)
+ _check_branch_cut(np.log10, -0.5, 1j, 1, -1, True)
+ _check_branch_cut(np.log1p, -1.5, 1j, 1, -1, True)
+ _check_branch_cut(np.sqrt, -0.5, 1j, 1, -1, True)
+
+ _check_branch_cut(np.arcsin, [ -2, 2], [1j, 1j], 1, -1, True)
+ _check_branch_cut(np.arccos, [ -2, 2], [1j, 1j], 1, -1, True)
+ _check_branch_cut(np.arctan, [0-2j, 2j], [1, 1], -1, 1, True)
+
+ _check_branch_cut(np.arcsinh, [0-2j, 2j], [1, 1], -1, 1, True)
+ _check_branch_cut(np.arccosh, [ -1, 0.5], [1j, 1j], 1, -1, True)
+ _check_branch_cut(np.arctanh, [ -2, 2], [1j, 1j], 1, -1, True)
+
+ # check against bogus branch cuts: assert continuity between quadrants
+ _check_branch_cut(np.arcsin, [0-2j, 2j], [ 1, 1], 1, 1)
+ _check_branch_cut(np.arccos, [0-2j, 2j], [ 1, 1], 1, 1)
+ _check_branch_cut(np.arctan, [ -2, 2], [1j, 1j], 1, 1)
+
+ _check_branch_cut(np.arcsinh, [ -2, 2, 0], [1j, 1j, 1], 1, 1)
+ _check_branch_cut(np.arccosh, [0-2j, 2j, 2], [1, 1, 1j], 1, 1)
+ _check_branch_cut(np.arctanh, [0-2j, 2j, 0], [1, 1, 1j], 1, 1)
+
+ def test_branch_cuts_complex64(self):
+ # check branch cuts and continuity on them
+ _check_branch_cut(np.log, -0.5, 1j, 1, -1, True, np.complex64)
+ _check_branch_cut(np.log2, -0.5, 1j, 1, -1, True, np.complex64)
+ _check_branch_cut(np.log10, -0.5, 1j, 1, -1, True, np.complex64)
+ _check_branch_cut(np.log1p, -1.5, 1j, 1, -1, True, np.complex64)
+ _check_branch_cut(np.sqrt, -0.5, 1j, 1, -1, True, np.complex64)
+
+ _check_branch_cut(np.arcsin, [ -2, 2], [1j, 1j], 1, -1, True, np.complex64)
+ _check_branch_cut(np.arccos, [ -2, 2], [1j, 1j], 1, -1, True, np.complex64)
+ _check_branch_cut(np.arctan, [0-2j, 2j], [1, 1], -1, 1, True, np.complex64)
+
+ _check_branch_cut(np.arcsinh, [0-2j, 2j], [1, 1], -1, 1, True, np.complex64)
+ _check_branch_cut(np.arccosh, [ -1, 0.5], [1j, 1j], 1, -1, True, np.complex64)
+ _check_branch_cut(np.arctanh, [ -2, 2], [1j, 1j], 1, -1, True, np.complex64)
+
+ # check against bogus branch cuts: assert continuity between quadrants
+ _check_branch_cut(np.arcsin, [0-2j, 2j], [ 1, 1], 1, 1, False, np.complex64)
+ _check_branch_cut(np.arccos, [0-2j, 2j], [ 1, 1], 1, 1, False, np.complex64)
+ _check_branch_cut(np.arctan, [ -2, 2], [1j, 1j], 1, 1, False, np.complex64)
+
+ _check_branch_cut(np.arcsinh, [ -2, 2, 0], [1j, 1j, 1], 1, 1, False, np.complex64)
+ _check_branch_cut(np.arccosh, [0-2j, 2j, 2], [1, 1, 1j], 1, 1, False, np.complex64)
+ _check_branch_cut(np.arctanh, [0-2j, 2j, 0], [1, 1, 1j], 1, 1, False, np.complex64)
+
+ def test_against_cmath(self):
+ import cmath
+
+ points = [-1-1j, -1+1j, +1-1j, +1+1j]
+ name_map = {'arcsin': 'asin', 'arccos': 'acos', 'arctan': 'atan',
+ 'arcsinh': 'asinh', 'arccosh': 'acosh', 'arctanh': 'atanh'}
+ atol = 4*np.finfo(complex).eps
+ for func in self.funcs:
+ fname = func.__name__.split('.')[-1]
+ cname = name_map.get(fname, fname)
+ try:
+ cfunc = getattr(cmath, cname)
+ except AttributeError:
+ continue
+ for p in points:
+ a = complex(func(np.complex_(p)))
+ b = cfunc(p)
+ assert_(abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b))
+
+ @pytest.mark.parametrize('dtype', [np.complex64, np.complex_, np.longcomplex])
+ def test_loss_of_precision(self, dtype):
+ """Check loss of precision in complex arc* functions"""
+
+ # Check against known-good functions
+
+ info = np.finfo(dtype)
+ real_dtype = dtype(0.).real.dtype
+ eps = info.eps
+
+ def check(x, rtol):
+ x = x.astype(real_dtype)
+
+ z = x.astype(dtype)
+ d = np.absolute(np.arcsinh(x)/np.arcsinh(z).real - 1)
+ assert_(np.all(d < rtol), (np.argmax(d), x[np.argmax(d)], d.max(),
+ 'arcsinh'))
+
+ z = (1j*x).astype(dtype)
+ d = np.absolute(np.arcsinh(x)/np.arcsin(z).imag - 1)
+ assert_(np.all(d < rtol), (np.argmax(d), x[np.argmax(d)], d.max(),
+ 'arcsin'))
+
+ z = x.astype(dtype)
+ d = np.absolute(np.arctanh(x)/np.arctanh(z).real - 1)
+ assert_(np.all(d < rtol), (np.argmax(d), x[np.argmax(d)], d.max(),
+ 'arctanh'))
+
+ z = (1j*x).astype(dtype)
+ d = np.absolute(np.arctanh(x)/np.arctan(z).imag - 1)
+ assert_(np.all(d < rtol), (np.argmax(d), x[np.argmax(d)], d.max(),
+ 'arctan'))
+
+ # The switchover was chosen as 1e-3; hence there can be up to
+ # ~eps/1e-3 of relative cancellation error before it
+
+ x_series = np.logspace(-20, -3.001, 200)
+ x_basic = np.logspace(-2.999, 0, 10, endpoint=False)
+
+ if dtype is np.longcomplex:
+            # It's not guaranteed that the system-provided arc functions
+            # are accurate down to a few epsilons (e.g. on 64-bit Linux),
+            # so give more leeway for long complex tests here.
+            # Can use 2.1 for > Ubuntu LTS Trusty (2014), glibc = 2.19.
+ check(x_series, 50.0*eps)
+ else:
+ check(x_series, 2.1*eps)
+ check(x_basic, 2.0*eps/1e-3)
+
+ # Check a few points
+
+ z = np.array([1e-5*(1+1j)], dtype=dtype)
+ p = 9.999999999333333333e-6 + 1.000000000066666666e-5j
+ d = np.absolute(1-np.arctanh(z)/p)
+ assert_(np.all(d < 1e-15))
+
+ p = 1.0000000000333333333e-5 + 9.999999999666666667e-6j
+ d = np.absolute(1-np.arcsinh(z)/p)
+ assert_(np.all(d < 1e-15))
+
+ p = 9.999999999333333333e-6j + 1.000000000066666666e-5
+ d = np.absolute(1-np.arctan(z)/p)
+ assert_(np.all(d < 1e-15))
+
+ p = 1.0000000000333333333e-5j + 9.999999999666666667e-6
+ d = np.absolute(1-np.arcsin(z)/p)
+ assert_(np.all(d < 1e-15))
+
+ # Check continuity across switchover points
+
+ def check(func, z0, d=1):
+ z0 = np.asarray(z0, dtype=dtype)
+ zp = z0 + abs(z0) * d * eps * 2
+ zm = z0 - abs(z0) * d * eps * 2
+ assert_(np.all(zp != zm), (zp, zm))
+
+ # NB: the cancellation error at the switchover is at least eps
+ good = (abs(func(zp) - func(zm)) < 2*eps)
+ assert_(np.all(good), (func, z0[~good]))
+
+        for func in (np.arcsinh, np.arcsin, np.arctanh, np.arctan):
+ pts = [rp+1j*ip for rp in (-1e-3, 0, 1e-3) for ip in(-1e-3, 0, 1e-3)
+ if rp != 0 or ip != 0]
+ check(func, pts, 1)
+ check(func, pts, 1j)
+ check(func, pts, 1+1j)
+
+
+class TestAttributes(object):
+ def test_attributes(self):
+ add = ncu.add
+ assert_equal(add.__name__, 'add')
+ assert_(add.ntypes >= 18) # don't fail if types added
+ assert_('ii->i' in add.types)
+ assert_equal(add.nin, 2)
+ assert_equal(add.nout, 1)
+ assert_equal(add.identity, 0)
+
+ def test_doc(self):
+ # don't bother checking the long list of kwargs, which are likely to
+ # change
+ assert_(ncu.add.__doc__.startswith(
+ "add(x1, x2, /, out=None, *, where=True"))
+ assert_(ncu.frexp.__doc__.startswith(
+ "frexp(x[, out1, out2], / [, out=(None, None)], *, where=True"))
+
+
+class TestSubclass(object):
+
+ def test_subclass_op(self):
+
+ class simple(np.ndarray):
+ def __new__(subtype, shape):
+ self = np.ndarray.__new__(subtype, shape, dtype=object)
+ self.fill(0)
+ return self
+
+ a = simple((3, 4))
+ assert_equal(a+a, a)
+
+def _check_branch_cut(f, x0, dx, re_sign=1, im_sign=-1, sig_zero_ok=False,
+ dtype=complex):
+ """
+ Check for a branch cut in a function.
+
+ Assert that `x0` lies on a branch cut of function `f` and `f` is
+ continuous from the direction `dx`.
+
+ Parameters
+ ----------
+ f : func
+ Function to check
+ x0 : array-like
+ Point on branch cut
+ dx : array-like
+ Direction to check continuity in
+ re_sign, im_sign : {1, -1}
+ Change of sign of the real or imaginary part expected
+ sig_zero_ok : bool
+ Whether to check if the branch cut respects signed zero (if applicable)
+ dtype : dtype
+ Dtype to check (should be complex)
+
+ """
+ x0 = np.atleast_1d(x0).astype(dtype)
+ dx = np.atleast_1d(dx).astype(dtype)
+
+ if np.dtype(dtype).char == 'F':
+ scale = np.finfo(dtype).eps * 1e2
+ atol = np.float32(1e-2)
+ else:
+ scale = np.finfo(dtype).eps * 1e3
+ atol = 1e-4
+
+ y0 = f(x0)
+ yp = f(x0 + dx*scale*np.absolute(x0)/np.absolute(dx))
+ ym = f(x0 - dx*scale*np.absolute(x0)/np.absolute(dx))
+
+ assert_(np.all(np.absolute(y0.real - yp.real) < atol), (y0, yp))
+ assert_(np.all(np.absolute(y0.imag - yp.imag) < atol), (y0, yp))
+ assert_(np.all(np.absolute(y0.real - ym.real*re_sign) < atol), (y0, ym))
+ assert_(np.all(np.absolute(y0.imag - ym.imag*im_sign) < atol), (y0, ym))
+
+ if sig_zero_ok:
+ # check that signed zeros also work as a displacement
+ jr = (x0.real == 0) & (dx.real != 0)
+ ji = (x0.imag == 0) & (dx.imag != 0)
+ if np.any(jr):
+ x = x0[jr]
+ x.real = np.NZERO
+ ym = f(x)
+ assert_(np.all(np.absolute(y0[jr].real - ym.real*re_sign) < atol), (y0[jr], ym))
+ assert_(np.all(np.absolute(y0[jr].imag - ym.imag*im_sign) < atol), (y0[jr], ym))
+
+ if np.any(ji):
+ x = x0[ji]
+ x.imag = np.NZERO
+ ym = f(x)
+ assert_(np.all(np.absolute(y0[ji].real - ym.real*re_sign) < atol), (y0[ji], ym))
+ assert_(np.all(np.absolute(y0[ji].imag - ym.imag*im_sign) < atol), (y0[ji], ym))
+
+def test_copysign():
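+    # copysign transfers only the sign bit, so it also distinguishes
+    # signed zeros (observable via 1/x) and signed NaNs (via signbit).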
+ assert_(np.copysign(1, -1) == -1)
+ with np.errstate(divide="ignore"):
+ assert_(1 / np.copysign(0, -1) < 0)
+ assert_(1 / np.copysign(0, 1) > 0)
+ assert_(np.signbit(np.copysign(np.nan, -1)))
+ assert_(not np.signbit(np.copysign(np.nan, 1)))
+
+def _test_nextafter(t):
+ one = t(1)
+ two = t(2)
+ zero = t(0)
+ eps = np.finfo(t).eps
+ assert_(np.nextafter(one, two) - one == eps)
+ assert_(np.nextafter(one, zero) - one < 0)
+ assert_(np.isnan(np.nextafter(np.nan, one)))
+ assert_(np.isnan(np.nextafter(one, np.nan)))
+ assert_(np.nextafter(one, one) == one)
+
+def test_nextafter():
+ return _test_nextafter(np.float64)
+
+
+def test_nextafterf():
+ return _test_nextafter(np.float32)
+
+
[email protected](np.finfo(np.double) == np.finfo(np.longdouble),
+ reason="long double is same as double")
[email protected](condition=platform.machine().startswith("ppc64"),
+ reason="IBM double double")
+def test_nextafterl():
+ return _test_nextafter(np.longdouble)
+
+
+def test_nextafter_0():
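+    # The first step away from zero is the smallest subnormal, which lies
+    # strictly between 0 and tiny (the smallest normal number).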
+ for t, direction in itertools.product(np.sctypes['float'], (1, -1)):
+ tiny = np.finfo(t).tiny
+ assert_(0. < direction * np.nextafter(t(0), t(direction)) < tiny)
+ assert_equal(np.nextafter(t(0), t(direction)) / t(2.1), direction * 0.0)
+
+def _test_spacing(t):
+ one = t(1)
+ eps = np.finfo(t).eps
+ nan = t(np.nan)
+ inf = t(np.inf)
+ with np.errstate(invalid='ignore'):
+ assert_(np.spacing(one) == eps)
+ assert_(np.isnan(np.spacing(nan)))
+ assert_(np.isnan(np.spacing(inf)))
+ assert_(np.isnan(np.spacing(-inf)))
+ assert_(np.spacing(t(1e30)) != 0)
+
+def test_spacing():
+ return _test_spacing(np.float64)
+
+def test_spacingf():
+ return _test_spacing(np.float32)
+
+
[email protected](np.finfo(np.double) == np.finfo(np.longdouble),
+ reason="long double is same as double")
[email protected](condition=platform.machine().startswith("ppc64"),
+ reason="IBM double double")
+def test_spacingl():
+ return _test_spacing(np.longdouble)
+
+def test_spacing_gfortran():
+ # Reference from this fortran file, built with gfortran 4.3.3 on linux
+ # 32bits:
+ # PROGRAM test_spacing
+ # INTEGER, PARAMETER :: SGL = SELECTED_REAL_KIND(p=6, r=37)
+ # INTEGER, PARAMETER :: DBL = SELECTED_REAL_KIND(p=13, r=200)
+ #
+ # WRITE(*,*) spacing(0.00001_DBL)
+ # WRITE(*,*) spacing(1.0_DBL)
+ # WRITE(*,*) spacing(1000._DBL)
+ # WRITE(*,*) spacing(10500._DBL)
+ #
+ # WRITE(*,*) spacing(0.00001_SGL)
+ # WRITE(*,*) spacing(1.0_SGL)
+ # WRITE(*,*) spacing(1000._SGL)
+ # WRITE(*,*) spacing(10500._SGL)
+ # END PROGRAM
+ ref = {np.float64: [1.69406589450860068E-021,
+ 2.22044604925031308E-016,
+ 1.13686837721616030E-013,
+ 1.81898940354585648E-012],
+ np.float32: [9.09494702E-13,
+ 1.19209290E-07,
+ 6.10351563E-05,
+ 9.76562500E-04]}
+
+ for dt, dec_ in zip([np.float32, np.float64], (10, 20)):
+ x = np.array([1e-5, 1, 1000, 10500], dtype=dt)
+ assert_array_almost_equal(np.spacing(x), ref[dt], decimal=dec_)
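+ # Editorial note (not in the original file): these references follow from
+ # spacing(x) being one unit in the last place at x; in particular
+ # ref[np.float64][1] equals np.finfo(np.float64).eps (2**-52) and
+ # ref[np.float32][1] equals np.finfo(np.float32).eps (2**-23).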
+
+def test_nextafter_vs_spacing():
+ # XXX: spacing does not handle long double yet
+ for t in [np.float32, np.float64]:
+ for _f in [1, 1e-5, 1000]:
+ f = t(_f)
+ f1 = t(_f + 1)
+ assert_(np.nextafter(f, f1) - f == np.spacing(f))
+
+def test_pos_nan():
+ """Check np.nan is a positive nan."""
+ assert_(np.signbit(np.nan) == 0)
+
+def test_reduceat():
+ """Test bug in reduceat when structured arrays are not copied."""
+ db = np.dtype([('name', 'S11'), ('time', np.int64), ('value', np.float32)])
+ a = np.empty([100], dtype=db)
+ a['name'] = 'Simple'
+ a['time'] = 10
+ a['value'] = 100
+ indx = [0, 7, 15, 25]
+
+ h2 = []
+ val1 = indx[0]
+ for val2 in indx[1:]:
+ h2.append(np.add.reduce(a['value'][val1:val2]))
+ val1 = val2
+ h2.append(np.add.reduce(a['value'][val1:]))
+ h2 = np.array(h2)
+
+ # test buffered -- this should work
+ h1 = np.add.reduceat(a['value'], indx)
+ assert_array_almost_equal(h1, h2)
+
+ # This is when the error occurs.
+ # test no buffer
+ np.setbufsize(32)
+ h1 = np.add.reduceat(a['value'], indx)
+ np.setbufsize(np.UFUNC_BUFSIZE_DEFAULT)
+ assert_array_almost_equal(h1, h2)
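+ # Editorial note (not in the original file): np.add.reduceat(v, [i0, ..., ik])
+ # returns [v[i0:i1].sum(), v[i1:i2].sum(), ..., v[ik:].sum()] for increasing
+ # indices, which is exactly what the hand-written loop computes into h2; the
+ # small buffer size forces the unbuffered code path that used to misbehave.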
+
+def test_reduceat_empty():
+ """Reduceat should work with empty arrays"""
+ indices = np.array([], 'i4')
+ x = np.array([], 'f8')
+ result = np.add.reduceat(x, indices)
+ assert_equal(result.dtype, x.dtype)
+ assert_equal(result.shape, (0,))
+ # Another case with a slightly different zero-sized shape
+ x = np.ones((5, 2))
+ result = np.add.reduceat(x, [], axis=0)
+ assert_equal(result.dtype, x.dtype)
+ assert_equal(result.shape, (0, 2))
+ result = np.add.reduceat(x, [], axis=1)
+ assert_equal(result.dtype, x.dtype)
+ assert_equal(result.shape, (5, 0))
+
+def test_complex_nan_comparisons():
+ nans = [complex(np.nan, 0), complex(0, np.nan), complex(np.nan, np.nan)]
+ fins = [complex(1, 0), complex(-1, 0), complex(0, 1), complex(0, -1),
+ complex(1, 1), complex(-1, -1), complex(0, 0)]
+
+ with np.errstate(invalid='ignore'):
+ for x in nans + fins:
+ x = np.array([x])
+ for y in nans + fins:
+ y = np.array([y])
+
+ if np.isfinite(x) and np.isfinite(y):
+ continue
+
+ assert_equal(x < y, False, err_msg="%r < %r" % (x, y))
+ assert_equal(x > y, False, err_msg="%r > %r" % (x, y))
+ assert_equal(x <= y, False, err_msg="%r <= %r" % (x, y))
+ assert_equal(x >= y, False, err_msg="%r >= %r" % (x, y))
+ assert_equal(x == y, False, err_msg="%r == %r" % (x, y))
+
+
+def test_rint_big_int():
+ # np.rint bug for large integer values on Windows 32-bit and MKL
+ # https://github.com/numpy/numpy/issues/6685
+ val = 4607998452777363968
+ # This is exactly representable in floating point
+ assert_equal(val, int(float(val)))
+ # Rint should not change the value
+ assert_equal(val, np.rint(val))
+
+
+def test_signaling_nan_exceptions():
+ with assert_no_warnings():
+ a = np.ndarray(shape=(), dtype='float32', buffer=b'\x00\xe0\xbf\xff')
+ np.isnan(a)
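+ # Editorial note (not in the original file): on a little-endian build the
+ # byte pattern above is a negative signaling NaN (quiet bit clear), so the
+ # test asserts that np.isnan does not emit an FPU warning for it.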
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_umath_complex.py b/contrib/python/numpy/py2/numpy/core/tests/test_umath_complex.py
new file mode 100644
index 00000000000..785ae8c57d0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_umath_complex.py
@@ -0,0 +1,543 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import platform
+import pytest
+
+import numpy as np
+import numpy.core.umath as ncu
+from numpy.testing import (
+ assert_raises, assert_equal, assert_array_equal, assert_almost_equal
+ )
+
+# TODO: branch cuts (use Pauli code)
+# TODO: conj 'symmetry'
+# TODO: FPU exceptions
+
+# At least on Windows the results of many complex functions are not conforming
+# to the C99 standard. See ticket 1574.
+# Ditto for Solaris (ticket 1642) and OS X on PowerPC.
+# FIXME: this will probably change when we require full C99 compatibility
+with np.errstate(all='ignore'):
+ functions_seem_flaky = ((np.exp(complex(np.inf, 0)).imag != 0)
+ or (np.log(complex(np.NZERO, 0)).imag != np.pi))
+# TODO: replace with a check on whether platform-provided C99 funcs are used
+xfail_complex_tests = (not sys.platform.startswith('linux') or functions_seem_flaky)
+
+# TODO: this can become xfail once the generator functions are removed.
+platform_skip = pytest.mark.skipif(xfail_complex_tests,
+ reason="Inadequate C99 complex support")
+
+
+
+class TestCexp(object):
+ def test_simple(self):
+ check = check_complex_value
+ f = np.exp
+
+ check(f, 1, 0, np.exp(1), 0, False)
+ check(f, 0, 1, np.cos(1), np.sin(1), False)
+
+ ref = np.exp(1) * complex(np.cos(1), np.sin(1))
+ check(f, 1, 1, ref.real, ref.imag, False)
+
+ @platform_skip
+ def test_special_values(self):
+ # C99: Section G 6.3.1
+
+ check = check_complex_value
+ f = np.exp
+
+ # cexp(+-0 + 0i) is 1 + 0i
+ check(f, np.PZERO, 0, 1, 0, False)
+ check(f, np.NZERO, 0, 1, 0, False)
+
+ # cexp(x + infi) is nan + nani for finite x and raises 'invalid' FPU
+ # exception
+ check(f, 1, np.inf, np.nan, np.nan)
+ check(f, -1, np.inf, np.nan, np.nan)
+ check(f, 0, np.inf, np.nan, np.nan)
+
+ # cexp(inf + 0i) is inf + 0i
+ check(f, np.inf, 0, np.inf, 0)
+
+ # cexp(-inf + yi) is +0 * (cos(y) + i sin(y)) for finite y
+ check(f, -np.inf, 1, np.PZERO, np.PZERO)
+ check(f, -np.inf, 0.75 * np.pi, np.NZERO, np.PZERO)
+
+ # cexp(inf + yi) is +inf * (cos(y) + i sin(y)) for finite y
+ check(f, np.inf, 1, np.inf, np.inf)
+ check(f, np.inf, 0.75 * np.pi, -np.inf, np.inf)
+
+ # cexp(-inf + inf i) is +-0 +- 0i (signs unspecified)
+ def _check_ninf_inf(dummy):
+ msgform = "cexp(-inf, inf) is (%f, %f), expected (+-0, +-0)"
+ with np.errstate(invalid='ignore'):
+ z = f(np.array(complex(-np.inf, np.inf)))
+ if z.real != 0 or z.imag != 0:
+ raise AssertionError(msgform % (z.real, z.imag))
+
+ _check_ninf_inf(None)
+
+ # cexp(inf + inf i) is +-inf + NaNi and raises the 'invalid' FPU exception.
+ def _check_inf_inf(dummy):
+ msgform = "cexp(inf, inf) is (%f, %f), expected (+-inf, nan)"
+ with np.errstate(invalid='ignore'):
+ z = f(np.array(complex(np.inf, np.inf)))
+ if not np.isinf(z.real) or not np.isnan(z.imag):
+ raise AssertionError(msgform % (z.real, z.imag))
+
+ _check_inf_inf(None)
+
+ # cexp(-inf + nan i) is +-0 +- 0i
+ def _check_ninf_nan(dummy):
+ msgform = "cexp(-inf, nan) is (%f, %f), expected (+-0, +-0)"
+ with np.errstate(invalid='ignore'):
+ z = f(np.array(complex(-np.inf, np.nan)))
+ if z.real != 0 or z.imag != 0:
+ raise AssertionError(msgform % (z.real, z.imag))
+
+ _check_ninf_nan(None)
+
+ # cexp(inf + nan i) is +-inf + nan
+ def _check_inf_nan(dummy):
+ msgform = "cexp(-inf, nan) is (%f, %f), expected (+-inf, nan)"
+ with np.errstate(invalid='ignore'):
+ z = f(np.array(complex(np.inf, np.nan)))
+ if not np.isinf(z.real) or not np.isnan(z.imag):
+ raise AssertionError(msgform % (z.real, z.imag))
+
+ _check_inf_nan(None)
+
+ # cexp(nan + yi) is nan + nani for y != 0 (optional: raises invalid FPU
+ # ex)
+ check(f, np.nan, 1, np.nan, np.nan)
+ check(f, np.nan, -1, np.nan, np.nan)
+
+ check(f, np.nan, np.inf, np.nan, np.nan)
+ check(f, np.nan, -np.inf, np.nan, np.nan)
+
+ # cexp(nan + nani) is nan + nani
+ check(f, np.nan, np.nan, np.nan, np.nan)
+
+ # TODO: this can become xfail once the generator functions are removed.
+ @pytest.mark.skip(reason="cexp(nan + 0I) is wrong on most platforms")
+ def test_special_values2(self):
+ # XXX: most implementations get it wrong here (including glibc <= 2.10)
+ # cexp(nan + 0i) is nan + 0i
+ check = check_complex_value
+ f = np.exp
+
+ check(f, np.nan, 0, np.nan, 0)
+
+class TestClog(object):
+ def test_simple(self):
+ x = np.array([1+0j, 1+2j])
+ y_r = np.log(np.abs(x)) + 1j * np.angle(x)
+ y = np.log(x)
+ for i in range(len(x)):
+ assert_almost_equal(y[i], y_r[i])
+
+ @platform_skip
+ @pytest.mark.skipif(platform.machine() == "armv5tel", reason="See gh-413.")
+ def test_special_values(self):
+ xl = []
+ yl = []
+
+ # From C99 std (Sec 6.3.2)
+ # XXX: check exceptions raised
+ # --- raising on 'invalid' fails on some platforms, so it is not asserted.
+
+ # clog(-0 + i0) returns -inf + i pi and raises the 'divide-by-zero'
+ # floating-point exception.
+ with np.errstate(divide='raise'):
+ x = np.array([np.NZERO], dtype=complex)
+ y = complex(-np.inf, np.pi)
+ assert_raises(FloatingPointError, np.log, x)
+ with np.errstate(divide='ignore'):
+ assert_almost_equal(np.log(x), y)
+
+ xl.append(x)
+ yl.append(y)
+
+ # clog(+0 + i0) returns -inf + i0 and raises the 'divide-by-zero'
+ # floating-point exception.
+ with np.errstate(divide='raise'):
+ x = np.array([0], dtype=complex)
+ y = complex(-np.inf, 0)
+ assert_raises(FloatingPointError, np.log, x)
+ with np.errstate(divide='ignore'):
+ assert_almost_equal(np.log(x), y)
+
+ xl.append(x)
+ yl.append(y)
+
+ # clog(x + i inf) returns +inf + i pi/2, for finite x.
+ x = np.array([complex(1, np.inf)], dtype=complex)
+ y = complex(np.inf, 0.5 * np.pi)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ x = np.array([complex(-1, np.inf)], dtype=complex)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(x + iNaN) returns NaN + iNaN and optionally raises the
+ # 'invalid' floating-point exception, for finite x.
+ with np.errstate(invalid='raise'):
+ x = np.array([complex(1., np.nan)], dtype=complex)
+ y = complex(np.nan, np.nan)
+ #assert_raises(FloatingPointError, np.log, x)
+ with np.errstate(invalid='ignore'):
+ assert_almost_equal(np.log(x), y)
+
+ xl.append(x)
+ yl.append(y)
+
+ with np.errstate(invalid='raise'):
+ x = np.array([np.inf + 1j * np.nan], dtype=complex)
+ #assert_raises(FloatingPointError, np.log, x)
+ with np.errstate(invalid='ignore'):
+ assert_almost_equal(np.log(x), y)
+
+ xl.append(x)
+ yl.append(y)
+
+ # clog(- inf + iy) returns +inf + i pi, for finite positive-signed y.
+ x = np.array([-np.inf + 1j], dtype=complex)
+ y = complex(np.inf, np.pi)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(+ inf + iy) returns +inf + i0, for finite positive-signed y.
+ x = np.array([np.inf + 1j], dtype=complex)
+ y = complex(np.inf, 0)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(- inf + i inf) returns +inf + i 3pi/4.
+ x = np.array([complex(-np.inf, np.inf)], dtype=complex)
+ y = complex(np.inf, 0.75 * np.pi)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(+ inf + i inf) returns +inf + i pi/4.
+ x = np.array([complex(np.inf, np.inf)], dtype=complex)
+ y = complex(np.inf, 0.25 * np.pi)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(+/- inf + iNaN) returns +inf + iNaN.
+ x = np.array([complex(np.inf, np.nan)], dtype=complex)
+ y = complex(np.inf, np.nan)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ x = np.array([complex(-np.inf, np.nan)], dtype=complex)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(NaN + iy) returns NaN + iNaN and optionally raises the
+ # 'invalid' floating-point exception, for finite y.
+ x = np.array([complex(np.nan, 1)], dtype=complex)
+ y = complex(np.nan, np.nan)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(NaN + i inf) returns +inf + iNaN.
+ x = np.array([complex(np.nan, np.inf)], dtype=complex)
+ y = complex(np.inf, np.nan)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(NaN + iNaN) returns NaN + iNaN.
+ x = np.array([complex(np.nan, np.nan)], dtype=complex)
+ y = complex(np.nan, np.nan)
+ assert_almost_equal(np.log(x), y)
+ xl.append(x)
+ yl.append(y)
+
+ # clog(conj(z)) = conj(clog(z)).
+ xa = np.array(xl, dtype=complex)
+ ya = np.array(yl, dtype=complex)
+ with np.errstate(divide='ignore'):
+ for i in range(len(xa)):
+ assert_almost_equal(np.log(xa[i].conj()), ya[i].conj())
+
+
+class TestCsqrt(object):
+
+ def test_simple(self):
+ # sqrt(1)
+ check_complex_value(np.sqrt, 1, 0, 1, 0)
+
+ # sqrt(1i)
+ rres = 0.5*np.sqrt(2)
+ ires = rres
+ check_complex_value(np.sqrt, 0, 1, rres, ires, False)
+
+ # sqrt(-1)
+ check_complex_value(np.sqrt, -1, 0, 0, 1)
+
+ def test_simple_conjugate(self):
+ ref = np.conj(np.sqrt(complex(1, 1)))
+
+ def f(z):
+ return np.sqrt(np.conj(z))
+
+ check_complex_value(f, 1, 1, ref.real, ref.imag, False)
+
+ #def test_branch_cut(self):
+ # _check_branch_cut(f, -1, 0, 1, -1)
+
+ @platform_skip
+ def test_special_values(self):
+ # C99: Sec G 6.4.2
+
+ check = check_complex_value
+ f = np.sqrt
+
+ # csqrt(+-0 + 0i) is 0 + 0i
+ check(f, np.PZERO, 0, 0, 0)
+ check(f, np.NZERO, 0, 0, 0)
+
+ # csqrt(x + infi) is inf + infi for any x (including NaN)
+ check(f, 1, np.inf, np.inf, np.inf)
+ check(f, -1, np.inf, np.inf, np.inf)
+
+ check(f, np.PZERO, np.inf, np.inf, np.inf)
+ check(f, np.NZERO, np.inf, np.inf, np.inf)
+ check(f, np.inf, np.inf, np.inf, np.inf)
+ check(f, -np.inf, np.inf, np.inf, np.inf)
+ check(f, -np.nan, np.inf, np.inf, np.inf)
+
+ # csqrt(x + nani) is nan + nani for any finite x
+ check(f, 1, np.nan, np.nan, np.nan)
+ check(f, -1, np.nan, np.nan, np.nan)
+ check(f, 0, np.nan, np.nan, np.nan)
+
+ # csqrt(-inf + yi) is +0 + infi for any finite y > 0
+ check(f, -np.inf, 1, np.PZERO, np.inf)
+
+ # csqrt(inf + yi) is +inf + 0i for any finite y > 0
+ check(f, np.inf, 1, np.inf, np.PZERO)
+
+ # csqrt(-inf + nani) is nan +- infi (either sign of the infinite imaginary part is valid)
+ def _check_ninf_nan(dummy):
+ msgform = "csqrt(-inf, nan) is (%f, %f), expected (nan, +-inf)"
+ z = np.sqrt(np.array(complex(-np.inf, np.nan)))
+ #Fixme: ugly workaround for isinf bug.
+ with np.errstate(invalid='ignore'):
+ if not (np.isnan(z.real) and np.isinf(z.imag)):
+ raise AssertionError(msgform % (z.real, z.imag))
+
+ _check_ninf_nan(None)
+
+ # csqrt(+inf + nani) is inf + nani
+ check(f, np.inf, np.nan, np.inf, np.nan)
+
+ # csqrt(nan + yi) is nan + nani for any finite y (infinite handled in x
+ # + nani)
+ check(f, np.nan, 0, np.nan, np.nan)
+ check(f, np.nan, 1, np.nan, np.nan)
+ check(f, np.nan, np.nan, np.nan, np.nan)
+
+ # XXX: check for conj(csqrt(z)) == csqrt(conj(z)) (need to fix branch
+ # cuts first)
+
+class TestCpow(object):
+ def setup(self):
+ self.olderr = np.seterr(invalid='ignore')
+
+ def teardown(self):
+ np.seterr(**self.olderr)
+
+ def test_simple(self):
+ x = np.array([1+1j, 0+2j, 1+2j, np.inf, np.nan])
+ y_r = x ** 2
+ y = np.power(x, 2)
+ for i in range(len(x)):
+ assert_almost_equal(y[i], y_r[i])
+
+ def test_scalar(self):
+ x = np.array([1, 1j, 2, 2.5+.37j, np.inf, np.nan])
+ y = np.array([1, 1j, -0.5+1.5j, -0.5+1.5j, 2, 3])
+ lx = list(range(len(x)))
+ # Compute the values for complex type in python
+ p_r = [complex(x[i]) ** complex(y[i]) for i in lx]
+ # Substitute a result allowed by C99 standard
+ p_r[4] = complex(np.inf, np.nan)
+ # Do the same with numpy complex scalars
+ n_r = [x[i] ** y[i] for i in lx]
+ for i in lx:
+ assert_almost_equal(n_r[i], p_r[i], err_msg='Loop %d\n' % i)
+
+ def test_array(self):
+ x = np.array([1, 1j, 2, 2.5+.37j, np.inf, np.nan])
+ y = np.array([1, 1j, -0.5+1.5j, -0.5+1.5j, 2, 3])
+ lx = list(range(len(x)))
+ # Compute the values for complex type in python
+ p_r = [complex(x[i]) ** complex(y[i]) for i in lx]
+ # Substitute a result allowed by C99 standard
+ p_r[4] = complex(np.inf, np.nan)
+ # Do the same with numpy arrays
+ n_r = x ** y
+ for i in lx:
+ assert_almost_equal(n_r[i], p_r[i], err_msg='Loop %d\n' % i)
+
+class TestCabs(object):
+ def setup(self):
+ self.olderr = np.seterr(invalid='ignore')
+
+ def teardown(self):
+ np.seterr(**self.olderr)
+
+ def test_simple(self):
+ x = np.array([1+1j, 0+2j, 1+2j, np.inf, np.nan])
+ y_r = np.array([np.sqrt(2.), 2, np.sqrt(5), np.inf, np.nan])
+ y = np.abs(x)
+ for i in range(len(x)):
+ assert_almost_equal(y[i], y_r[i])
+
+ def test_fabs(self):
+ # Test that np.abs(x +- 0j) == np.abs(x) (as mandated by C99 for cabs)
+ x = np.array([1+0j], dtype=complex)
+ assert_array_equal(np.abs(x), np.real(x))
+
+ x = np.array([complex(1, np.NZERO)], dtype=complex)
+ assert_array_equal(np.abs(x), np.real(x))
+
+ x = np.array([complex(np.inf, np.NZERO)], dtype=complex)
+ assert_array_equal(np.abs(x), np.real(x))
+
+ x = np.array([complex(np.nan, np.NZERO)], dtype=complex)
+ assert_array_equal(np.abs(x), np.real(x))
+
+ def test_cabs_inf_nan(self):
+ x, y = [], []
+
+ # cabs(+-nan + nani) returns nan
+ x.append(np.nan)
+ y.append(np.nan)
+ check_real_value(np.abs, np.nan, np.nan, np.nan)
+
+ x.append(np.nan)
+ y.append(-np.nan)
+ check_real_value(np.abs, -np.nan, np.nan, np.nan)
+
+ # According to the C99 standard, if exactly one of the real/imaginary parts
+ # is inf and the other is nan, then cabs should return inf
+ x.append(np.inf)
+ y.append(np.nan)
+ check_real_value(np.abs, np.inf, np.nan, np.inf)
+
+ x.append(-np.inf)
+ y.append(np.nan)
+ check_real_value(np.abs, -np.inf, np.nan, np.inf)
+
+ # cabs(conj(z)) == conj(cabs(z)) (= cabs(z))
+ def f(a):
+ return np.abs(np.conj(a))
+
+ def g(a, b):
+ return np.abs(complex(a, b))
+
+ xa = np.array(x, dtype=complex)
+ for i in range(len(xa)):
+ ref = g(x[i], y[i])
+ check_real_value(f, x[i], y[i], ref)
+
+class TestCarg(object):
+ def test_simple(self):
+ check_real_value(ncu._arg, 1, 0, 0, False)
+ check_real_value(ncu._arg, 0, 1, 0.5*np.pi, False)
+
+ check_real_value(ncu._arg, 1, 1, 0.25*np.pi, False)
+ check_real_value(ncu._arg, np.PZERO, np.PZERO, np.PZERO)
+
+ # TODO: this can become xfail once the generator functions are removed.
+ @pytest.mark.skip(
+ reason="Complex arithmetic with signed zero fails on most platforms")
+ def test_zero(self):
+ # carg(-0 +- 0i) returns +- pi
+ check_real_value(ncu._arg, np.NZERO, np.PZERO, np.pi, False)
+ check_real_value(ncu._arg, np.NZERO, np.NZERO, -np.pi, False)
+
+ # carg(+0 +- 0i) returns +- 0
+ check_real_value(ncu._arg, np.PZERO, np.PZERO, np.PZERO)
+ check_real_value(ncu._arg, np.PZERO, np.NZERO, np.NZERO)
+
+ # carg(x +- 0i) returns +- 0 for x > 0
+ check_real_value(ncu._arg, 1, np.PZERO, np.PZERO, False)
+ check_real_value(ncu._arg, 1, np.NZERO, np.NZERO, False)
+
+ # carg(x +- 0i) returns +- pi for x < 0
+ check_real_value(ncu._arg, -1, np.PZERO, np.pi, False)
+ check_real_value(ncu._arg, -1, np.NZERO, -np.pi, False)
+
+ # carg(+- 0 + yi) returns pi/2 for y > 0
+ check_real_value(ncu._arg, np.PZERO, 1, 0.5 * np.pi, False)
+ check_real_value(ncu._arg, np.NZERO, 1, 0.5 * np.pi, False)
+
+ # carg(+- 0 + yi) returns -pi/2 for y < 0
+ check_real_value(ncu._arg, np.PZERO, -1, -0.5 * np.pi, False)
+ check_real_value(ncu._arg, np.NZERO, -1, -0.5 * np.pi, False)
+
+ #def test_branch_cuts(self):
+ # _check_branch_cut(ncu._arg, -1, 1j, -1, 1)
+
+ def test_special_values(self):
+ # carg(-np.inf +- yi) returns +-pi for finite y > 0
+ check_real_value(ncu._arg, -np.inf, 1, np.pi, False)
+ check_real_value(ncu._arg, -np.inf, -1, -np.pi, False)
+
+ # carg(np.inf +- yi) returns +-0 for finite y > 0
+ check_real_value(ncu._arg, np.inf, 1, np.PZERO, False)
+ check_real_value(ncu._arg, np.inf, -1, np.NZERO, False)
+
+ # carg(x +- np.infi) returns +-pi/2 for finite x
+ check_real_value(ncu._arg, 1, np.inf, 0.5 * np.pi, False)
+ check_real_value(ncu._arg, 1, -np.inf, -0.5 * np.pi, False)
+
+ # carg(-np.inf +- np.infi) returns +-3pi/4
+ check_real_value(ncu._arg, -np.inf, np.inf, 0.75 * np.pi, False)
+ check_real_value(ncu._arg, -np.inf, -np.inf, -0.75 * np.pi, False)
+
+ # carg(np.inf +- np.infi) returns +-pi/4
+ check_real_value(ncu._arg, np.inf, np.inf, 0.25 * np.pi, False)
+ check_real_value(ncu._arg, np.inf, -np.inf, -0.25 * np.pi, False)
+
+ # carg(x + yi) returns np.nan if x or y is nan
+ check_real_value(ncu._arg, np.nan, 0, np.nan, False)
+ check_real_value(ncu._arg, 0, np.nan, np.nan, False)
+
+ check_real_value(ncu._arg, np.nan, np.inf, np.nan, False)
+ check_real_value(ncu._arg, np.inf, np.nan, np.nan, False)
+
+
+def check_real_value(f, x1, y1, x, exact=True):
+ z1 = np.array([complex(x1, y1)])
+ if exact:
+ assert_equal(f(z1), x)
+ else:
+ assert_almost_equal(f(z1), x)
+
+
+def check_complex_value(f, x1, y1, x2, y2, exact=True):
+ z1 = np.array([complex(x1, y1)])
+ z2 = complex(x2, y2)
+ with np.errstate(invalid='ignore'):
+ if exact:
+ assert_equal(f(z1), z2)
+ else:
+ assert_almost_equal(f(z1), z2)
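+
+# Editorial sketch (not part of the original file): how the two helpers above
+# are typically invoked, exact by default and approximate when exact=False:
+#
+#     check_real_value(np.abs, 3, 4, 5.0)          # |3 + 4j| == 5
+#     check_complex_value(np.exp, 0, 0, 1, 0)      # exp(0 + 0j) == 1 + 0j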
diff --git a/contrib/python/numpy/py2/numpy/core/tests/test_unicode.py b/contrib/python/numpy/py2/numpy/core/tests/test_unicode.py
new file mode 100644
index 00000000000..2ffd8801b7e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/core/tests/test_unicode.py
@@ -0,0 +1,396 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+import numpy as np
+from numpy.compat import unicode
+from numpy.testing import assert_, assert_equal, assert_array_equal
+
+# Guess the UCS length for this python interpreter
+if sys.version_info[:2] >= (3, 3):
+ # Python 3.3 uses a flexible string representation
+ ucs4 = False
+
+ def buffer_length(arr):
+ if isinstance(arr, unicode):
+ arr = str(arr)
+ if not arr:
+ charmax = 0
+ else:
+ charmax = max([ord(c) for c in arr])
+ if charmax < 256:
+ size = 1
+ elif charmax < 65536:
+ size = 2
+ else:
+ size = 4
+ return size * len(arr)
+ v = memoryview(arr)
+ if v.shape is None:
+ return len(v) * v.itemsize
+ else:
+ return np.prod(v.shape) * v.itemsize
+else:
+ if len(buffer(u'u')) == 4:
+ ucs4 = True
+ else:
+ ucs4 = False
+
+ def buffer_length(arr):
+ if isinstance(arr, np.ndarray):
+ return len(arr.data)
+ return len(buffer(arr))
+
+# In both cases below we need to make sure that the byte swapped value (as
+# UCS4) is still a valid unicode:
+# Value that can be represented in UCS2 interpreters
+ucs2_value = u'\u0900'
+# Value that cannot be represented in UCS2 interpreters (but can in UCS4)
+ucs4_value = u'\U00100900'
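+
+# Editorial note (not in the original file): u'\u0900' fits in a single UCS2
+# code unit (2 bytes on a narrow build), while u'\U00100900' lies outside the
+# BMP and needs a surrogate pair there (4 bytes); on UCS4/flexible builds both
+# occupy one code point, which is what the buffer_length checks below rely on.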
+
+
+def test_string_cast():
+ str_arr = np.array(["1234", "1234\0\0"], dtype='S')
+ uni_arr1 = str_arr.astype('>U')
+ uni_arr2 = str_arr.astype('<U')
+
+ if sys.version_info[0] < 3:
+ assert_array_equal(str_arr, uni_arr1)
+ assert_array_equal(str_arr, uni_arr2)
+ else:
+ assert_(str_arr != uni_arr1)
+ assert_(str_arr != uni_arr2)
+ assert_array_equal(uni_arr1, uni_arr2)
+
+
+############################################################
+# Creation tests
+############################################################
+
+class CreateZeros(object):
+ """Check the creation of zero-valued arrays"""
+
+ def content_check(self, ua, ua_scalar, nbytes):
+
+ # Check the length of the unicode base type
+ assert_(int(ua.dtype.str[2:]) == self.ulen)
+ # Check the length of the data buffer
+ assert_(buffer_length(ua) == nbytes)
+ # Small check that data in array element is ok
+ assert_(ua_scalar == u'')
+ # Encode to ascii and double check
+ assert_(ua_scalar.encode('ascii') == b'')
+ # Check buffer lengths for scalars
+ if ucs4:
+ assert_(buffer_length(ua_scalar) == 0)
+ else:
+ assert_(buffer_length(ua_scalar) == 0)
+
+ def test_zeros0D(self):
+ # Check creation of 0-dimensional objects
+ ua = np.zeros((), dtype='U%s' % self.ulen)
+ self.content_check(ua, ua[()], 4*self.ulen)
+
+ def test_zerosSD(self):
+ # Check creation of single-dimensional objects
+ ua = np.zeros((2,), dtype='U%s' % self.ulen)
+ self.content_check(ua, ua[0], 4*self.ulen*2)
+ self.content_check(ua, ua[1], 4*self.ulen*2)
+
+ def test_zerosMD(self):
+ # Check creation of multi-dimensional objects
+ ua = np.zeros((2, 3, 4), dtype='U%s' % self.ulen)
+ self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
+ self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
+
+
+class TestCreateZeros_1(CreateZeros):
+ """Check the creation of zero-valued arrays (size 1)"""
+ ulen = 1
+
+
+class TestCreateZeros_2(CreateZeros):
+ """Check the creation of zero-valued arrays (size 2)"""
+ ulen = 2
+
+
+class TestCreateZeros_1009(CreateZeros):
+ """Check the creation of zero-valued arrays (size 1009)"""
+ ulen = 1009
+
+
+class CreateValues(object):
+ """Check the creation of unicode arrays with values"""
+
+ def content_check(self, ua, ua_scalar, nbytes):
+
+ # Check the length of the unicode base type
+ assert_(int(ua.dtype.str[2:]) == self.ulen)
+ # Check the length of the data buffer
+ assert_(buffer_length(ua) == nbytes)
+ # Small check that data in array element is ok
+ assert_(ua_scalar == self.ucs_value*self.ulen)
+ # Encode to UTF-8 and double check
+ assert_(ua_scalar.encode('utf-8') ==
+ (self.ucs_value*self.ulen).encode('utf-8'))
+ # Check buffer lengths for scalars
+ if ucs4:
+ assert_(buffer_length(ua_scalar) == 4*self.ulen)
+ else:
+ if self.ucs_value == ucs4_value:
+ # In UCS2, the \U0010FFFF will be represented using a
+ # surrogate *pair*
+ assert_(buffer_length(ua_scalar) == 2*2*self.ulen)
+ else:
+ # In UCS2, the \uFFFF will be represented using a
+ # regular 2-byte word
+ assert_(buffer_length(ua_scalar) == 2*self.ulen)
+
+ def test_values0D(self):
+ # Check creation of 0-dimensional objects with values
+ ua = np.array(self.ucs_value*self.ulen, dtype='U%s' % self.ulen)
+ self.content_check(ua, ua[()], 4*self.ulen)
+
+ def test_valuesSD(self):
+ # Check creation of single-dimensional objects with values
+ ua = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
+ self.content_check(ua, ua[0], 4*self.ulen*2)
+ self.content_check(ua, ua[1], 4*self.ulen*2)
+
+ def test_valuesMD(self):
+ # Check creation of multi-dimensional objects with values
+ ua = np.array([[[self.ucs_value*self.ulen]*2]*3]*4, dtype='U%s' % self.ulen)
+ self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
+ self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
+
+
+class TestCreateValues_1_UCS2(CreateValues):
+ """Check the creation of valued arrays (size 1, UCS2 values)"""
+ ulen = 1
+ ucs_value = ucs2_value
+
+
+class TestCreateValues_1_UCS4(CreateValues):
+ """Check the creation of valued arrays (size 1, UCS4 values)"""
+ ulen = 1
+ ucs_value = ucs4_value
+
+
+class TestCreateValues_2_UCS2(CreateValues):
+ """Check the creation of valued arrays (size 2, UCS2 values)"""
+ ulen = 2
+ ucs_value = ucs2_value
+
+
+class TestCreateValues_2_UCS4(CreateValues):
+ """Check the creation of valued arrays (size 2, UCS4 values)"""
+ ulen = 2
+ ucs_value = ucs4_value
+
+
+class TestCreateValues_1009_UCS2(CreateValues):
+ """Check the creation of valued arrays (size 1009, UCS2 values)"""
+ ulen = 1009
+ ucs_value = ucs2_value
+
+
+class TestCreateValues_1009_UCS4(CreateValues):
+ """Check the creation of valued arrays (size 1009, UCS4 values)"""
+ ulen = 1009
+ ucs_value = ucs4_value
+
+
+############################################################
+# Assignment tests
+############################################################
+
+class AssignValues(object):
+ """Check the assignment of unicode arrays with values"""
+
+ def content_check(self, ua, ua_scalar, nbytes):
+
+ # Check the length of the unicode base type
+ assert_(int(ua.dtype.str[2:]) == self.ulen)
+ # Check the length of the data buffer
+ assert_(buffer_length(ua) == nbytes)
+ # Small check that data in array element is ok
+ assert_(ua_scalar == self.ucs_value*self.ulen)
+ # Encode to UTF-8 and double check
+ assert_(ua_scalar.encode('utf-8') ==
+ (self.ucs_value*self.ulen).encode('utf-8'))
+ # Check buffer lengths for scalars
+ if ucs4:
+ assert_(buffer_length(ua_scalar) == 4*self.ulen)
+ else:
+ if self.ucs_value == ucs4_value:
+ # In UCS2, the \U0010FFFF will be represented using a
+ # surrogate *pair*
+ assert_(buffer_length(ua_scalar) == 2*2*self.ulen)
+ else:
+ # In UCS2, the \uFFFF will be represented using a
+ # regular 2-byte word
+ assert_(buffer_length(ua_scalar) == 2*self.ulen)
+
+ def test_values0D(self):
+ # Check assignment of 0-dimensional objects with values
+ ua = np.zeros((), dtype='U%s' % self.ulen)
+ ua[()] = self.ucs_value*self.ulen
+ self.content_check(ua, ua[()], 4*self.ulen)
+
+ def test_valuesSD(self):
+ # Check assignment of single-dimensional objects with values
+ ua = np.zeros((2,), dtype='U%s' % self.ulen)
+ ua[0] = self.ucs_value*self.ulen
+ self.content_check(ua, ua[0], 4*self.ulen*2)
+ ua[1] = self.ucs_value*self.ulen
+ self.content_check(ua, ua[1], 4*self.ulen*2)
+
+ def test_valuesMD(self):
+ # Check assignment of multi-dimensional objects with values
+ ua = np.zeros((2, 3, 4), dtype='U%s' % self.ulen)
+ ua[0, 0, 0] = self.ucs_value*self.ulen
+ self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
+ ua[-1, -1, -1] = self.ucs_value*self.ulen
+ self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
+
+
+class TestAssignValues_1_UCS2(AssignValues):
+ """Check the assignment of valued arrays (size 1, UCS2 values)"""
+ ulen = 1
+ ucs_value = ucs2_value
+
+
+class TestAssignValues_1_UCS4(AssignValues):
+ """Check the assignment of valued arrays (size 1, UCS4 values)"""
+ ulen = 1
+ ucs_value = ucs4_value
+
+
+class TestAssignValues_2_UCS2(AssignValues):
+ """Check the assignment of valued arrays (size 2, UCS2 values)"""
+ ulen = 2
+ ucs_value = ucs2_value
+
+
+class TestAssignValues_2_UCS4(AssignValues):
+ """Check the assignment of valued arrays (size 2, UCS4 values)"""
+ ulen = 2
+ ucs_value = ucs4_value
+
+
+class TestAssignValues_1009_UCS2(AssignValues):
+ """Check the assignment of valued arrays (size 1009, UCS2 values)"""
+ ulen = 1009
+ ucs_value = ucs2_value
+
+
+class TestAssignValues_1009_UCS4(AssignValues):
+ """Check the assignment of valued arrays (size 1009, UCS4 values)"""
+ ulen = 1009
+ ucs_value = ucs4_value
+
+
+############################################################
+# Byteorder tests
+############################################################
+
+class ByteorderValues(object):
+ """Check the byteorder of unicode arrays in round-trip conversions"""
+
+ def test_values0D(self):
+ # Check byteorder of 0-dimensional objects
+ ua = np.array(self.ucs_value*self.ulen, dtype='U%s' % self.ulen)
+ ua2 = ua.newbyteorder()
+ # This changes the interpretation of the data region (but not the
+ # actual data), therefore the returned scalars are not
+ # the same (they are byte-swapped versions of each other).
+ assert_(ua[()] != ua2[()])
+ ua3 = ua2.newbyteorder()
+ # Arrays must be equal after the round-trip
+ assert_equal(ua, ua3)
+
+ def test_valuesSD(self):
+ # Check byteorder of single-dimensional objects
+ ua = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
+ ua2 = ua.newbyteorder()
+ assert_((ua != ua2).all())
+ assert_(ua[-1] != ua2[-1])
+ ua3 = ua2.newbyteorder()
+ # Arrays must be equal after the round-trip
+ assert_equal(ua, ua3)
+
+ def test_valuesMD(self):
+ # Check byteorder of multi-dimensional objects
+ ua = np.array([[[self.ucs_value*self.ulen]*2]*3]*4,
+ dtype='U%s' % self.ulen)
+ ua2 = ua.newbyteorder()
+ assert_((ua != ua2).all())
+ assert_(ua[-1, -1, -1] != ua2[-1, -1, -1])
+ ua3 = ua2.newbyteorder()
+ # Arrays must be equal after the round-trip
+ assert_equal(ua, ua3)
+
+ def test_values_cast(self):
+ # Check byteorder of when casting the array for a strided and
+ # contiguous array:
+ test1 = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
+ test2 = np.repeat(test1, 2)[::2]
+ for ua in (test1, test2):
+ ua2 = ua.astype(dtype=ua.dtype.newbyteorder())
+ assert_((ua == ua2).all())
+ assert_(ua[-1] == ua2[-1])
+ ua3 = ua2.astype(dtype=ua.dtype)
+ # Arrays must be equal after the round-trip
+ assert_equal(ua, ua3)
+
+ def test_values_updowncast(self):
+ # Check byteorder of when casting the array to a longer and shorter
+ # string length for strided and contiguous arrays
+ test1 = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
+ test2 = np.repeat(test1, 2)[::2]
+ for ua in (test1, test2):
+ # Cast to a longer type with zero padding
+ longer_type = np.dtype('U%s' % (self.ulen+1)).newbyteorder()
+ ua2 = ua.astype(dtype=longer_type)
+ assert_((ua == ua2).all())
+ assert_(ua[-1] == ua2[-1])
+ # Cast back again with truncating:
+ ua3 = ua2.astype(dtype=ua.dtype)
+ # Arrays must be equal after the round-trip
+ assert_equal(ua, ua3)
+
+
+class TestByteorder_1_UCS2(ByteorderValues):
+ """Check the byteorder in unicode (size 1, UCS2 values)"""
+ ulen = 1
+ ucs_value = ucs2_value
+
+
+class TestByteorder_1_UCS4(ByteorderValues):
+ """Check the byteorder in unicode (size 1, UCS4 values)"""
+ ulen = 1
+ ucs_value = ucs4_value
+
+
+class TestByteorder_2_UCS2(ByteorderValues):
+ """Check the byteorder in unicode (size 2, UCS2 values)"""
+ ulen = 2
+ ucs_value = ucs2_value
+
+
+class TestByteorder_2_UCS4(ByteorderValues):
+ """Check the byteorder in unicode (size 2, UCS4 values)"""
+ ulen = 2
+ ucs_value = ucs4_value
+
+
+class TestByteorder_1009_UCS2(ByteorderValues):
+ """Check the byteorder in unicode (size 1009, UCS2 values)"""
+ ulen = 1009
+ ucs_value = ucs2_value
+
+
+class TestByteorder_1009_UCS4(ByteorderValues):
+ """Check the byteorder in unicode (size 1009, UCS4 values)"""
+ ulen = 1009
+ ucs_value = ucs4_value
diff --git a/contrib/python/numpy/py2/numpy/distutils/mingw/gfortran_vs2003_hack.c b/contrib/python/numpy/py2/numpy/distutils/mingw/gfortran_vs2003_hack.c
new file mode 100644
index 00000000000..485a675d8a1
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/mingw/gfortran_vs2003_hack.c
@@ -0,0 +1,6 @@
+int _get_output_format(void)
+{
+ return 0;
+}
+
+int _imp____lc_codepage = 0;
diff --git a/contrib/python/numpy/py2/numpy/distutils/setup.py b/contrib/python/numpy/py2/numpy/distutils/setup.py
new file mode 100644
index 00000000000..82a53bd08db
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/setup.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+from __future__ import division, print_function
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('distutils', parent_package, top_path)
+ config.add_subpackage('command')
+ config.add_subpackage('fcompiler')
+ config.add_data_dir('tests')
+ config.add_data_files('site.cfg')
+ config.add_data_files('mingw/gfortran_vs2003_hack.c')
+ config.make_config_py()
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/__init__.py b/contrib/python/numpy/py2/numpy/distutils/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_exec_command.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_exec_command.py
new file mode 100644
index 00000000000..8bd26500748
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_exec_command.py
@@ -0,0 +1,215 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+from tempfile import TemporaryFile
+
+from numpy.distutils import exec_command
+from numpy.distutils.exec_command import get_pythonexe
+from numpy.testing import tempdir, assert_
+
+# In python 3 stdout, stderr are text (unicode compliant) devices, so to
+# emulate them import StringIO from the io module.
+if sys.version_info[0] >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
+class redirect_stdout(object):
+ """Context manager to redirect stdout for exec_command test."""
+ def __init__(self, stdout=None):
+ self._stdout = stdout or sys.stdout
+
+ def __enter__(self):
+ self.old_stdout = sys.stdout
+ sys.stdout = self._stdout
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self._stdout.flush()
+ sys.stdout = self.old_stdout
+ # note: closing sys.stdout won't close it.
+ self._stdout.close()
+
+class redirect_stderr(object):
+ """Context manager to redirect stderr for exec_command test."""
+ def __init__(self, stderr=None):
+ self._stderr = stderr or sys.stderr
+
+ def __enter__(self):
+ self.old_stderr = sys.stderr
+ sys.stderr = self._stderr
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self._stderr.flush()
+ sys.stderr = self.old_stderr
+ # note: closing sys.stderr won't close it.
+ self._stderr.close()
+
+class emulate_nonposix(object):
+ """Context manager to emulate os.name != 'posix' """
+ def __init__(self, osname='non-posix'):
+ self._new_name = osname
+
+ def __enter__(self):
+ self._old_name = os.name
+ os.name = self._new_name
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ os.name = self._old_name
+
+
+def test_exec_command_stdout():
+ # Regression test for gh-2999 and gh-2915.
+ # There are several packages (nose, scipy.weave.inline, Sage inline
+ # Fortran) that replace stdout, in which case it doesn't have a fileno
+ # method. This is tested here, with a do-nothing command that fails if the
+ # presence of fileno() is assumed in exec_command.
+
+ # The code has a special case for posix systems, so if we are on posix test
+ # both that the special case works and that the generic code works.
+
+ # Test posix version:
+ with redirect_stdout(StringIO()):
+ with redirect_stderr(TemporaryFile()):
+ exec_command.exec_command("cd '.'")
+
+ if os.name == 'posix':
+ # Test general (non-posix) version:
+ with emulate_nonposix():
+ with redirect_stdout(StringIO()):
+ with redirect_stderr(TemporaryFile()):
+ exec_command.exec_command("cd '.'")
+
+def test_exec_command_stderr():
+ # Test posix version:
+ with redirect_stdout(TemporaryFile(mode='w+')):
+ with redirect_stderr(StringIO()):
+ exec_command.exec_command("cd '.'")
+
+ if os.name == 'posix':
+ # Test general (non-posix) version:
+ with emulate_nonposix():
+ with redirect_stdout(TemporaryFile()):
+ with redirect_stderr(StringIO()):
+ exec_command.exec_command("cd '.'")
+
+
+class TestExecCommand(object):
+ def setup(self):
+ self.pyexe = get_pythonexe()
+
+ def check_nt(self, **kws):
+ s, o = exec_command.exec_command('cmd /C echo path=%path%')
+ assert_(s == 0)
+ assert_(o != '')
+
+ s, o = exec_command.exec_command(
+ '"%s" -c "import sys;sys.stderr.write(sys.platform)"' % self.pyexe)
+ assert_(s == 0)
+ assert_(o == 'win32')
+
+ def check_posix(self, **kws):
+ s, o = exec_command.exec_command("echo Hello", **kws)
+ assert_(s == 0)
+ assert_(o == 'Hello')
+
+ s, o = exec_command.exec_command('echo $AAA', **kws)
+ assert_(s == 0)
+ assert_(o == '')
+
+ s, o = exec_command.exec_command('echo "$AAA"', AAA='Tere', **kws)
+ assert_(s == 0)
+ assert_(o == 'Tere')
+
+ s, o = exec_command.exec_command('echo "$AAA"', **kws)
+ assert_(s == 0)
+ assert_(o == '')
+
+ if 'BBB' not in os.environ:
+ os.environ['BBB'] = 'Hi'
+ s, o = exec_command.exec_command('echo "$BBB"', **kws)
+ assert_(s == 0)
+ assert_(o == 'Hi')
+
+ s, o = exec_command.exec_command('echo "$BBB"', BBB='Hey', **kws)
+ assert_(s == 0)
+ assert_(o == 'Hey')
+
+ s, o = exec_command.exec_command('echo "$BBB"', **kws)
+ assert_(s == 0)
+ assert_(o == 'Hi')
+
+ del os.environ['BBB']
+
+ s, o = exec_command.exec_command('echo "$BBB"', **kws)
+ assert_(s == 0)
+ assert_(o == '')
+
+
+ s, o = exec_command.exec_command('this_is_not_a_command', **kws)
+ assert_(s != 0)
+ assert_(o != '')
+
+ s, o = exec_command.exec_command('echo path=$PATH', **kws)
+ assert_(s == 0)
+ assert_(o != '')
+
+ s, o = exec_command.exec_command(
+ '"%s" -c "import sys,os;sys.stderr.write(os.name)"' %
+ self.pyexe, **kws)
+ assert_(s == 0)
+ assert_(o == 'posix')
+
+ def check_basic(self, **kws):
+ s, o = exec_command.exec_command(
+ '"%s" -c "raise \'Ignore me.\'"' % self.pyexe, **kws)
+ assert_(s != 0)
+ assert_(o != '')
+
+ s, o = exec_command.exec_command(
+ '"%s" -c "import sys;sys.stderr.write(\'0\');'
+ 'sys.stderr.write(\'1\');sys.stderr.write(\'2\')"' %
+ self.pyexe, **kws)
+ assert_(s == 0)
+ assert_(o == '012')
+
+ s, o = exec_command.exec_command(
+ '"%s" -c "import sys;sys.exit(15)"' % self.pyexe, **kws)
+ assert_(s == 15)
+ assert_(o == '')
+
+ s, o = exec_command.exec_command(
+ '"%s" -c "print(\'Heipa\'")' % self.pyexe, **kws)
+ assert_(s == 0)
+ assert_(o == 'Heipa')
+
+ def check_execute_in(self, **kws):
+ with tempdir() as tmpdir:
+ fn = "file"
+ tmpfile = os.path.join(tmpdir, fn)
+ f = open(tmpfile, 'w')
+ f.write('Hello')
+ f.close()
+
+ s, o = exec_command.exec_command(
+ '"%s" -c "f = open(\'%s\', \'r\'); f.close()"' %
+ (self.pyexe, fn), **kws)
+ assert_(s != 0)
+ assert_(o != '')
+ s, o = exec_command.exec_command(
+ '"%s" -c "f = open(\'%s\', \'r\'); print(f.read()); '
+ 'f.close()"' % (self.pyexe, fn), execute_in=tmpdir, **kws)
+ assert_(s == 0)
+ assert_(o == 'Hello')
+
+ def test_basic(self):
+ with redirect_stdout(StringIO()):
+ with redirect_stderr(StringIO()):
+ if os.name == "posix":
+ self.check_posix(use_tee=0)
+ self.check_posix(use_tee=1)
+ elif os.name == "nt":
+ self.check_nt(use_tee=0)
+ self.check_nt(use_tee=1)
+ self.check_execute_in(use_tee=0)
+ self.check_execute_in(use_tee=1)
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler.py
new file mode 100644
index 00000000000..ba19a97ea69
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler.py
@@ -0,0 +1,81 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+from numpy.testing import assert_, suppress_warnings
+import numpy.distutils.fcompiler
+
+customizable_flags = [
+ ('f77', 'F77FLAGS'),
+ ('f90', 'F90FLAGS'),
+ ('free', 'FREEFLAGS'),
+ ('arch', 'FARCH'),
+ ('debug', 'FDEBUG'),
+ ('flags', 'FFLAGS'),
+ ('linker_so', 'LDFLAGS'),
+]
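+
+# Editorial note (not in the original file): each pair maps an fcompiler flag
+# attribute to the environment variable that can override it, e.g. FFLAGS for
+# the general flags; with NPY_DISTUTILS_APPEND_FLAGS=1 the variable is appended
+# to the compiler defaults, while 0 (the historical behaviour) replaces them.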
+
+
+def test_fcompiler_flags(monkeypatch):
+ monkeypatch.setenv('NPY_DISTUTILS_APPEND_FLAGS', '0')
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='none')
+ flag_vars = fc.flag_vars.clone(lambda *args, **kwargs: None)
+
+ for opt, envvar in customizable_flags:
+ new_flag = '-dummy-{}-flag'.format(opt)
+ prev_flags = getattr(flag_vars, opt)
+
+ monkeypatch.setenv(envvar, new_flag)
+ new_flags = getattr(flag_vars, opt)
+
+ monkeypatch.delenv(envvar)
+ assert_(new_flags == [new_flag])
+
+ monkeypatch.setenv('NPY_DISTUTILS_APPEND_FLAGS', '1')
+
+ for opt, envvar in customizable_flags:
+ new_flag = '-dummy-{}-flag'.format(opt)
+ prev_flags = getattr(flag_vars, opt)
+ monkeypatch.setenv(envvar, new_flag)
+ new_flags = getattr(flag_vars, opt)
+
+ monkeypatch.delenv(envvar)
+ if prev_flags is None:
+ assert_(new_flags == [new_flag])
+ else:
+ assert_(new_flags == prev_flags + [new_flag])
+
+
+def test_fcompiler_flags_append_warning(monkeypatch):
+ # Test to check that the warning for append behavior changing in future
+ # is triggered. Need to use a real compiler instance so that we have
+ # non-empty flags to start with (otherwise the "if var and append" check
+ # will always be false).
+ try:
+ with suppress_warnings() as sup:
+ sup.record()
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu95')
+ fc.customize()
+ except numpy.distutils.fcompiler.CompilerNotFound:
+ pytest.skip("gfortran not found, so can't execute this test")
+
+ # Ensure NPY_DISTUTILS_APPEND_FLAGS not defined
+ monkeypatch.delenv('NPY_DISTUTILS_APPEND_FLAGS', raising=False)
+
+ for opt, envvar in customizable_flags:
+ new_flag = '-dummy-{}-flag'.format(opt)
+ with suppress_warnings() as sup:
+ sup.record()
+ prev_flags = getattr(fc.flag_vars, opt)
+
+ monkeypatch.setenv(envvar, new_flag)
+ with suppress_warnings() as sup:
+ sup.record()
+ new_flags = getattr(fc.flag_vars, opt)
+ if prev_flags:
+ # Check that warning was issued
+ assert len(sup.log) == 1
+
+ monkeypatch.delenv(envvar)
+ assert_(new_flags == [new_flag])
+
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_gnu.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_gnu.py
new file mode 100644
index 00000000000..49208aaced3
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_gnu.py
@@ -0,0 +1,57 @@
+from __future__ import division, absolute_import, print_function
+
+from numpy.testing import assert_
+
+import numpy.distutils.fcompiler
+
+g77_version_strings = [
+ ('GNU Fortran 0.5.25 20010319 (prerelease)', '0.5.25'),
+ ('GNU Fortran (GCC 3.2) 3.2 20020814 (release)', '3.2'),
+ ('GNU Fortran (GCC) 3.3.3 20040110 (prerelease) (Debian)', '3.3.3'),
+ ('GNU Fortran (GCC) 3.3.3 (Debian 20040401)', '3.3.3'),
+ ('GNU Fortran (GCC 3.2.2 20030222 (Red Hat Linux 3.2.2-5)) 3.2.2'
+ ' 20030222 (Red Hat Linux 3.2.2-5)', '3.2.2'),
+]
+
+gfortran_version_strings = [
+ ('GNU Fortran 95 (GCC 4.0.3 20051023 (prerelease) (Debian 4.0.2-3))',
+ '4.0.3'),
+ ('GNU Fortran 95 (GCC) 4.1.0', '4.1.0'),
+ ('GNU Fortran 95 (GCC) 4.2.0 20060218 (experimental)', '4.2.0'),
+ ('GNU Fortran (GCC) 4.3.0 20070316 (experimental)', '4.3.0'),
+ ('GNU Fortran (rubenvb-4.8.0) 4.8.0', '4.8.0'),
+ ('4.8.0', '4.8.0'),
+ ('4.0.3-7', '4.0.3'),
+ ("gfortran: warning: couldn't understand kern.osversion '14.1.0\n4.9.1",
+ '4.9.1'),
+ ("gfortran: warning: couldn't understand kern.osversion '14.1.0\n"
+ "gfortran: warning: yet another warning\n4.9.1",
+ '4.9.1'),
+ ('GNU Fortran (crosstool-NG 8a21ab48) 7.2.0', '7.2.0')
+]
+
+class TestG77Versions(object):
+ def test_g77_version(self):
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu')
+ for vs, version in g77_version_strings:
+ v = fc.version_match(vs)
+ assert_(v == version, (vs, v))
+
+ def test_not_g77(self):
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu')
+ for vs, _ in gfortran_version_strings:
+ v = fc.version_match(vs)
+ assert_(v is None, (vs, v))
+
+class TestGFortranVersions(object):
+ def test_gfortran_version(self):
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu95')
+ for vs, version in gfortran_version_strings:
+ v = fc.version_match(vs)
+ assert_(v == version, (vs, v))
+
+ def test_not_gfortran(self):
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu95')
+ for vs, _ in g77_version_strings:
+ v = fc.version_match(vs)
+ assert_(v is None, (vs, v))
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_intel.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_intel.py
new file mode 100644
index 00000000000..5e014bada34
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_intel.py
@@ -0,0 +1,32 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy.distutils.fcompiler
+from numpy.testing import assert_
+
+
+intel_32bit_version_strings = [
+ ("Intel(R) Fortran Intel(R) 32-bit Compiler Professional for applications"
+ "running on Intel(R) 32, Version 11.1", '11.1'),
+]
+
+intel_64bit_version_strings = [
+ ("Intel(R) Fortran IA-64 Compiler Professional for applications"
+ "running on IA-64, Version 11.0", '11.0'),
+ ("Intel(R) Fortran Intel(R) 64 Compiler Professional for applications"
+ "running on Intel(R) 64, Version 11.1", '11.1')
+]
+
+class TestIntelFCompilerVersions(object):
+ def test_32bit_version(self):
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='intel')
+ for vs, version in intel_32bit_version_strings:
+ v = fc.version_match(vs)
+ assert_(v == version)
+
+
+class TestIntelEM64TFCompilerVersions(object):
+ def test_64bit_version(self):
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler='intelem')
+ for vs, version in intel_64bit_version_strings:
+ v = fc.version_match(vs)
+ assert_(v == version)
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_nagfor.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_nagfor.py
new file mode 100644
index 00000000000..1c936056a89
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_fcompiler_nagfor.py
@@ -0,0 +1,24 @@
+from __future__ import division, absolute_import, print_function
+
+from numpy.testing import assert_
+import numpy.distutils.fcompiler
+
+nag_version_strings = [('nagfor', 'NAG Fortran Compiler Release '
+ '6.2(Chiyoda) Build 6200', '6.2'),
+ ('nagfor', 'NAG Fortran Compiler Release '
+ '6.1(Tozai) Build 6136', '6.1'),
+ ('nagfor', 'NAG Fortran Compiler Release '
+ '6.0(Hibiya) Build 1021', '6.0'),
+ ('nagfor', 'NAG Fortran Compiler Release '
+ '5.3.2(971)', '5.3.2'),
+ ('nag', 'NAGWare Fortran 95 compiler Release 5.1'
+ '(347,355-367,375,380-383,389,394,399,401-402,407,'
+ '431,435,437,446,459-460,463,472,494,496,503,508,'
+ '511,517,529,555,557,565)', '5.1')]
+
+class TestNagFCompilerVersions(object):
+ def test_version_match(self):
+ for comp, vs, version in nag_version_strings:
+ fc = numpy.distutils.fcompiler.new_fcompiler(compiler=comp)
+ v = fc.version_match(vs)
+ assert_(v == version)
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_from_template.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_from_template.py
new file mode 100644
index 00000000000..58817549629
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_from_template.py
@@ -0,0 +1,44 @@
+
+from numpy.distutils.from_template import process_str
+from numpy.testing import assert_equal
+
+
+pyf_src = """
+python module foo
+ <_rd=real,double precision>
+ interface
+ subroutine <s,d>foosub(tol)
+ <_rd>, intent(in,out) :: tol
+ end subroutine <s,d>foosub
+ end interface
+end python module foo
+"""
+
+expected_pyf = """
+python module foo
+ interface
+ subroutine sfoosub(tol)
+ real, intent(in,out) :: tol
+ end subroutine sfoosub
+ subroutine dfoosub(tol)
+ double precision, intent(in,out) :: tol
+ end subroutine dfoosub
+ end interface
+end python module foo
+"""
+
+
+def normalize_whitespace(s):
+ """
+ Remove leading and trailing whitespace, and convert internal
+ stretches of whitespace to a single space.
+ """
+ return ' '.join(s.split())
+
+
+def test_from_template():
+ """Regression test for gh-10712."""
+ pyf = process_str(pyf_src)
+ normalized_pyf = normalize_whitespace(pyf)
+ normalized_expected_pyf = normalize_whitespace(expected_pyf)
+ assert_equal(normalized_pyf, normalized_expected_pyf)
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_misc_util.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_misc_util.py
new file mode 100644
index 00000000000..3e239cf48cb
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_misc_util.py
@@ -0,0 +1,84 @@
+from __future__ import division, absolute_import, print_function
+
+from os.path import join, sep, dirname
+
+from numpy.distutils.misc_util import (
+ appendpath, minrelpath, gpaths, get_shared_lib_extension, get_info
+ )
+from numpy.testing import (
+ assert_, assert_equal
+ )
+
+ajoin = lambda *paths: join(*((sep,)+paths))
+
+class TestAppendpath(object):
+
+ def test_1(self):
+ assert_equal(appendpath('prefix', 'name'), join('prefix', 'name'))
+ assert_equal(appendpath('/prefix', 'name'), ajoin('prefix', 'name'))
+ assert_equal(appendpath('/prefix', '/name'), ajoin('prefix', 'name'))
+ assert_equal(appendpath('prefix', '/name'), join('prefix', 'name'))
+
+ def test_2(self):
+ assert_equal(appendpath('prefix/sub', 'name'),
+ join('prefix', 'sub', 'name'))
+ assert_equal(appendpath('prefix/sub', 'sup/name'),
+ join('prefix', 'sub', 'sup', 'name'))
+ assert_equal(appendpath('/prefix/sub', '/prefix/name'),
+ ajoin('prefix', 'sub', 'name'))
+
+ def test_3(self):
+ assert_equal(appendpath('/prefix/sub', '/prefix/sup/name'),
+ ajoin('prefix', 'sub', 'sup', 'name'))
+ assert_equal(appendpath('/prefix/sub/sub2', '/prefix/sup/sup2/name'),
+ ajoin('prefix', 'sub', 'sub2', 'sup', 'sup2', 'name'))
+ assert_equal(appendpath('/prefix/sub/sub2', '/prefix/sub/sup/name'),
+ ajoin('prefix', 'sub', 'sub2', 'sup', 'name'))
+
+class TestMinrelpath(object):
+
+ def test_1(self):
+ n = lambda path: path.replace('/', sep)
+ assert_equal(minrelpath(n('aa/bb')), n('aa/bb'))
+ assert_equal(minrelpath('..'), '..')
+ assert_equal(minrelpath(n('aa/..')), '')
+ assert_equal(minrelpath(n('aa/../bb')), 'bb')
+ assert_equal(minrelpath(n('aa/bb/..')), 'aa')
+ assert_equal(minrelpath(n('aa/bb/../..')), '')
+ assert_equal(minrelpath(n('aa/bb/../cc/../dd')), n('aa/dd'))
+ assert_equal(minrelpath(n('.././..')), n('../..'))
+ assert_equal(minrelpath(n('aa/bb/.././../dd')), n('dd'))
+
+class TestGpaths(object):
+
+ def test_gpaths(self):
+ local_path = minrelpath(join(dirname(__file__), '..'))
+ ls = gpaths('command/*.py', local_path)
+ assert_(join(local_path, 'command', 'build_src.py') in ls, repr(ls))
+ f = gpaths('system_info.py', local_path)
+ assert_(join(local_path, 'system_info.py') == f[0], repr(f))
+
+class TestSharedExtension(object):
+
+ def test_get_shared_lib_extension(self):
+ import sys
+ ext = get_shared_lib_extension(is_python_ext=False)
+ if sys.platform.startswith('linux'):
+ assert_equal(ext, '.so')
+ elif sys.platform.startswith('gnukfreebsd'):
+ assert_equal(ext, '.so')
+ elif sys.platform.startswith('darwin'):
+ assert_equal(ext, '.dylib')
+ elif sys.platform.startswith('win'):
+ assert_equal(ext, '.dll')
+ # just check for no crash
+ assert_(get_shared_lib_extension(is_python_ext=True))
+
+
+def test_installed_npymath_ini():
+ # Regression test for gh-7707. If npymath.ini wasn't installed, then this
+ # will give an error.
+ info = get_info('npymath')
+
+ assert isinstance(info, dict)
+ assert "define_macros" in info
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_npy_pkg_config.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_npy_pkg_config.py
new file mode 100644
index 00000000000..537e16e90d2
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_npy_pkg_config.py
@@ -0,0 +1,86 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+
+from numpy.distutils.npy_pkg_config import read_config, parse_flags
+from numpy.testing import temppath, assert_
+
+simple = """\
+[meta]
+Name = foo
+Description = foo lib
+Version = 0.1
+
+[default]
+cflags = -I/usr/include
+libs = -L/usr/lib
+"""
+simple_d = {'cflags': '-I/usr/include', 'libflags': '-L/usr/lib',
+ 'version': '0.1', 'name': 'foo'}
+
+simple_variable = """\
+[meta]
+Name = foo
+Description = foo lib
+Version = 0.1
+
+[variables]
+prefix = /foo/bar
+libdir = ${prefix}/lib
+includedir = ${prefix}/include
+
+[default]
+cflags = -I${includedir}
+libs = -L${libdir}
+"""
+simple_variable_d = {'cflags': '-I/foo/bar/include', 'libflags': '-L/foo/bar/lib',
+ 'version': '0.1', 'name': 'foo'}
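+
+# Editorial note (not in the original file): read_config() interpolates the
+# [variables] section, so ${includedir} resolves to /foo/bar/include and
+# ${libdir} to /foo/bar/lib, matching simple_variable_d; the test below also
+# shows that reassigning out.vars['prefix'] re-expands the flags.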
+
+class TestLibraryInfo(object):
+ def test_simple(self):
+ with temppath('foo.ini') as path:
+ with open(path, 'w') as f:
+ f.write(simple)
+ pkg = os.path.splitext(path)[0]
+ out = read_config(pkg)
+
+ assert_(out.cflags() == simple_d['cflags'])
+ assert_(out.libs() == simple_d['libflags'])
+ assert_(out.name == simple_d['name'])
+ assert_(out.version == simple_d['version'])
+
+ def test_simple_variable(self):
+ with temppath('foo.ini') as path:
+ with open(path, 'w') as f:
+ f.write(simple_variable)
+ pkg = os.path.splitext(path)[0]
+ out = read_config(pkg)
+
+ assert_(out.cflags() == simple_variable_d['cflags'])
+ assert_(out.libs() == simple_variable_d['libflags'])
+ assert_(out.name == simple_variable_d['name'])
+ assert_(out.version == simple_variable_d['version'])
+ out.vars['prefix'] = '/Users/david'
+ assert_(out.cflags() == '-I/Users/david/include')
+
+class TestParseFlags(object):
+ def test_simple_cflags(self):
+ d = parse_flags("-I/usr/include")
+ assert_(d['include_dirs'] == ['/usr/include'])
+
+ d = parse_flags("-I/usr/include -DFOO")
+ assert_(d['include_dirs'] == ['/usr/include'])
+ assert_(d['macros'] == ['FOO'])
+
+ d = parse_flags("-I /usr/include -DFOO")
+ assert_(d['include_dirs'] == ['/usr/include'])
+ assert_(d['macros'] == ['FOO'])
+
+ def test_simple_lflags(self):
+ d = parse_flags("-L/usr/lib -lfoo -L/usr/lib -lbar")
+ assert_(d['library_dirs'] == ['/usr/lib', '/usr/lib'])
+ assert_(d['libraries'] == ['foo', 'bar'])
+
+ d = parse_flags("-L /usr/lib -lfoo -L/usr/lib -lbar")
+ assert_(d['library_dirs'] == ['/usr/lib', '/usr/lib'])
+ assert_(d['libraries'] == ['foo', 'bar'])
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_shell_utils.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_shell_utils.py
new file mode 100644
index 00000000000..a0344244fe9
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_shell_utils.py
@@ -0,0 +1,79 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+import subprocess
+import os
+import json
+import sys
+
+from numpy.distutils import _shell_utils
+
+argv_cases = [
+ [r'exe'],
+ [r'path/exe'],
+ [r'path\exe'],
+ [r'\\server\path\exe'],
+ [r'path to/exe'],
+ [r'path to\exe'],
+
+ [r'exe', '--flag'],
+ [r'path/exe', '--flag'],
+ [r'path\exe', '--flag'],
+ [r'path to/exe', '--flag'],
+ [r'path to\exe', '--flag'],
+
+ # flags containing literal quotes in their name
+ [r'path to/exe', '--flag-"quoted"'],
+ [r'path to\exe', '--flag-"quoted"'],
+ [r'path to/exe', '"--flag-quoted"'],
+ [r'path to\exe', '"--flag-quoted"'],
+]
+
+
+@pytest.fixture(params=[
+ _shell_utils.WindowsParser,
+ _shell_utils.PosixParser
+])
+def Parser(request):
+ return request.param
+
+
+@pytest.fixture
+def runner(Parser):
+ if Parser != _shell_utils.NativeParser:
+ pytest.skip('Unable to run with non-native parser')
+
+ if Parser == _shell_utils.WindowsParser:
+ return lambda cmd: subprocess.check_output(cmd)
+ elif Parser == _shell_utils.PosixParser:
+ # posix has no non-shell string parsing
+ return lambda cmd: subprocess.check_output(cmd, shell=True)
+ else:
+ raise NotImplementedError
+
+
+@pytest.mark.parametrize('argv', argv_cases)
+def test_join_matches_subprocess(Parser, runner, argv):
+ """
+ Test that join produces strings understood by subprocess
+ """
+ # invoke python to return its arguments as json
+ cmd = [
+ sys.executable, '-c',
+ 'import json, sys; print(json.dumps(sys.argv[1:]))'
+ ]
+ joined = Parser.join(cmd + argv)
+ json_out = runner(joined).decode()
+ assert json.loads(json_out) == argv
+
+
+@pytest.mark.parametrize('argv', argv_cases)
+def test_roundtrip(Parser, argv):
+ """
+ Test that split is the inverse operation of join
+ """
+ try:
+ joined = Parser.join(argv)
+ assert argv == Parser.split(joined)
+ except NotImplementedError:
+ pytest.skip("Not implemented")
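The _shell_utils helpers under test translate between argv lists and command strings: WindowsParser follows the CommandLineToArgvW quoting rules, PosixParser quotes for a POSIX shell (shlex-style), and NativeParser aliases whichever of the two matches the running platform. A rough sketch of the round-trip property that test_roundtrip asserts (the exact quoting in the joined string may vary):

    from numpy.distutils._shell_utils import PosixParser

    argv = ['path to/exe', '--flag']
    joined = PosixParser.join(argv)          # e.g. 'path to/exe' --flag
    assert PosixParser.split(joined) == argv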
diff --git a/contrib/python/numpy/py2/numpy/distutils/tests/test_system_info.py b/contrib/python/numpy/py2/numpy/distutils/tests/test_system_info.py
new file mode 100644
index 00000000000..f7e275a2e7a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/distutils/tests/test_system_info.py
@@ -0,0 +1,237 @@
+from __future__ import division, print_function
+
+import os
+import shutil
+import pytest
+from tempfile import mkstemp, mkdtemp
+from subprocess import Popen, PIPE
+from distutils.errors import DistutilsError
+
+from numpy.distutils import ccompiler, customized_ccompiler
+from numpy.testing import assert_, assert_equal
+from numpy.distutils.system_info import system_info, ConfigParser
+from numpy.distutils.system_info import default_lib_dirs, default_include_dirs
+from numpy.distutils import _shell_utils
+
+
+def get_class(name, notfound_action=1):
+ """
+ notfound_action:
+ 0 - do nothing
+ 1 - display warning message
+ 2 - raise error
+ """
+ cl = {'temp1': Temp1Info,
+ 'temp2': Temp2Info
+ }.get(name.lower(), _system_info)
+ return cl()
+
+simple_site = """
+[ALL]
+library_dirs = {dir1:s}{pathsep:s}{dir2:s}
+libraries = {lib1:s},{lib2:s}
+extra_compile_args = -I/fake/directory -I"/path with/spaces" -Os
+runtime_library_dirs = {dir1:s}
+
+[temp1]
+library_dirs = {dir1:s}
+libraries = {lib1:s}
+runtime_library_dirs = {dir1:s}
+
+[temp2]
+library_dirs = {dir2:s}
+libraries = {lib2:s}
+extra_link_args = -Wl,-rpath={lib2_escaped:s}
+rpath = {dir2:s}
+"""
+site_cfg = simple_site
+
+fakelib_c_text = """
+/* This file is generated from numpy/distutils/testing/test_system_info.py */
+#include<stdio.h>
+void foo(void) {
+ printf("Hello foo");
+}
+void bar(void) {
+ printf("Hello bar");
+}
+"""
+
+def have_compiler():
+ """ Return True if there appears to be an executable compiler
+ """
+ compiler = customized_ccompiler()
+ try:
+ cmd = compiler.compiler # Unix compilers
+ except AttributeError:
+ try:
+ if not compiler.initialized:
+ compiler.initialize() # MSVC is different
+ except (DistutilsError, ValueError):
+ return False
+ cmd = [compiler.cc]
+ try:
+ p = Popen(cmd, stdout=PIPE, stderr=PIPE)
+ p.stdout.close()
+ p.stderr.close()
+ p.wait()
+ except OSError:
+ return False
+ return True
+
+
+HAVE_COMPILER = have_compiler()
+
+
+class _system_info(system_info):
+
+ def __init__(self,
+ default_lib_dirs=default_lib_dirs,
+ default_include_dirs=default_include_dirs,
+ verbosity=1,
+ ):
+ self.__class__.info = {}
+ self.local_prefixes = []
+ defaults = {'library_dirs': '',
+ 'include_dirs': '',
+ 'runtime_library_dirs': '',
+ 'rpath': '',
+ 'src_dirs': '',
+ 'search_static_first': "0",
+ 'extra_compile_args': '',
+ 'extra_link_args': ''}
+ self.cp = ConfigParser(defaults)
+ # We have to parse the config files afterwards
+ # to have a consistent temporary filepath
+
+ def _check_libs(self, lib_dirs, libs, opt_libs, exts):
+ """Override _check_libs to return with all dirs """
+ info = {'libraries': libs, 'library_dirs': lib_dirs}
+ return info
+
+
+class Temp1Info(_system_info):
+ """For testing purposes"""
+ section = 'temp1'
+
+
+class Temp2Info(_system_info):
+ """For testing purposes"""
+ section = 'temp2'
+
+
+class TestSystemInfoReading(object):
+
+ def setup(self):
+ """ Create the libraries """
+ # Create 2 sources and 2 libraries
+ self._dir1 = mkdtemp()
+ self._src1 = os.path.join(self._dir1, 'foo.c')
+ self._lib1 = os.path.join(self._dir1, 'libfoo.so')
+ self._dir2 = mkdtemp()
+ self._src2 = os.path.join(self._dir2, 'bar.c')
+ self._lib2 = os.path.join(self._dir2, 'libbar.so')
+ # Update local site.cfg
+ global simple_site, site_cfg
+ site_cfg = simple_site.format(**{
+ 'dir1': self._dir1,
+ 'lib1': self._lib1,
+ 'dir2': self._dir2,
+ 'lib2': self._lib2,
+ 'pathsep': os.pathsep,
+ 'lib2_escaped': _shell_utils.NativeParser.join([self._lib2])
+ })
+ # Write site.cfg
+ fd, self._sitecfg = mkstemp()
+ os.close(fd)
+ with open(self._sitecfg, 'w') as fd:
+ fd.write(site_cfg)
+ # Write the sources
+ with open(self._src1, 'w') as fd:
+ fd.write(fakelib_c_text)
+ with open(self._src2, 'w') as fd:
+ fd.write(fakelib_c_text)
+ # We create all class-instances
+
+ def site_and_parse(c, site_cfg):
+ c.files = [site_cfg]
+ c.parse_config_files()
+ return c
+ self.c_default = site_and_parse(get_class('default'), self._sitecfg)
+ self.c_temp1 = site_and_parse(get_class('temp1'), self._sitecfg)
+ self.c_temp2 = site_and_parse(get_class('temp2'), self._sitecfg)
+
+ def teardown(self):
+ # Do each removal separately
+ try:
+ shutil.rmtree(self._dir1)
+ except Exception:
+ pass
+ try:
+ shutil.rmtree(self._dir2)
+ except Exception:
+ pass
+ try:
+ os.remove(self._sitecfg)
+ except Exception:
+ pass
+
+ def test_all(self):
+ # Read in all information in the ALL block
+ tsi = self.c_default
+ assert_equal(tsi.get_lib_dirs(), [self._dir1, self._dir2])
+ assert_equal(tsi.get_libraries(), [self._lib1, self._lib2])
+ assert_equal(tsi.get_runtime_lib_dirs(), [self._dir1])
+ extra = tsi.calc_extra_info()
+ assert_equal(extra['extra_compile_args'], ['-I/fake/directory', '-I/path with/spaces', '-Os'])
+
+ def test_temp1(self):
+ # Read in all information in the temp1 block
+ tsi = self.c_temp1
+ assert_equal(tsi.get_lib_dirs(), [self._dir1])
+ assert_equal(tsi.get_libraries(), [self._lib1])
+ assert_equal(tsi.get_runtime_lib_dirs(), [self._dir1])
+
+ def test_temp2(self):
+ # Read in all information in the temp2 block
+ tsi = self.c_temp2
+ assert_equal(tsi.get_lib_dirs(), [self._dir2])
+ assert_equal(tsi.get_libraries(), [self._lib2])
+ # Now from rpath and not runtime_library_dirs
+ assert_equal(tsi.get_runtime_lib_dirs(key='rpath'), [self._dir2])
+ extra = tsi.calc_extra_info()
+ assert_equal(extra['extra_link_args'], ['-Wl,-rpath=' + self._lib2])
+
+ @pytest.mark.skipif(not HAVE_COMPILER, reason="Missing compiler")
+ def test_compile1(self):
+ # Compile source and link the first source
+ c = customized_ccompiler()
+ previousDir = os.getcwd()
+ try:
+ # Change directory to not screw up directories
+ os.chdir(self._dir1)
+ c.compile([os.path.basename(self._src1)], output_dir=self._dir1)
+ # Ensure that the object exists
+ assert_(os.path.isfile(self._src1.replace('.c', '.o')) or
+ os.path.isfile(self._src1.replace('.c', '.obj')))
+ finally:
+ os.chdir(previousDir)
+
+ @pytest.mark.skipif(not HAVE_COMPILER, reason="Missing compiler")
+ @pytest.mark.skipif('msvc' in repr(ccompiler.new_compiler()),
+ reason="Fails with MSVC compiler ")
+ def test_compile2(self):
+ # Compile source and link the second source
+ tsi = self.c_temp2
+ c = customized_ccompiler()
+ extra_link_args = tsi.calc_extra_info()['extra_link_args']
+ previousDir = os.getcwd()
+ try:
+ # Change directory to not screw up directories
+ os.chdir(self._dir2)
+ c.compile([os.path.basename(self._src2)], output_dir=self._dir2,
+ extra_postargs=extra_link_args)
+ # Ensure that the object exists
+ assert_(os.path.isfile(self._src2.replace('.c', '.o')))
+ finally:
+ os.chdir(previousDir)
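Outside this test harness, build code usually reaches the same machinery through numpy.distutils.system_info.get_info(), which picks the matching system_info subclass, lets it read site.cfg-style sections like the template above, and returns a plain dict of build settings; the notfound_action values mirror the docstring of get_class() above. A hedged sketch (what it returns depends entirely on the machine):

    from numpy.distutils.system_info import get_info

    info = get_info('lapack_opt', notfound_action=0)     # {} when nothing is found
    print(info.get('libraries', []), info.get('library_dirs', []))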
diff --git a/contrib/python/numpy/py2/numpy/f2py/setup.cfg b/contrib/python/numpy/py2/numpy/f2py/setup.cfg
new file mode 100644
index 00000000000..14669544cc9
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/setup.cfg
@@ -0,0 +1,3 @@
+[bdist_rpm]
+doc_files = docs/
+ tests/ \ No newline at end of file
diff --git a/contrib/python/numpy/py2/numpy/f2py/setup.py b/contrib/python/numpy/py2/numpy/f2py/setup.py
new file mode 100644
index 00000000000..c0c50ce5474
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/setup.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+"""
+setup.py for installing F2PY
+
+Usage:
+ python setup.py install
+
+Copyright 2001-2005 Pearu Peterson all rights reserved,
+Pearu Peterson <[email protected]>
+Permission to use, modify, and distribute this software is given under the
+terms of the NumPy License.
+
+NO WARRANTY IS EXPRESSED OR IMPLIED. USE AT YOUR OWN RISK.
+$Revision: 1.32 $
+$Date: 2005/01/30 17:22:14 $
+Pearu Peterson
+
+"""
+from __future__ import division, print_function
+
+from numpy.distutils.core import setup
+from numpy.distutils.misc_util import Configuration
+
+
+from __version__ import version
+
+
+def configuration(parent_package='', top_path=None):
+ config = Configuration('f2py', parent_package, top_path)
+ config.add_data_dir('tests')
+ config.add_data_files(
+ 'src/fortranobject.c',
+ 'src/fortranobject.h')
+ return config
+
+
+if __name__ == "__main__":
+
+ config = configuration(top_path='')
+ config = config.todict()
+
+ config['download_url'] = "http://cens.ioc.ee/projects/f2py2e/2.x"\
+ "/F2PY-2-latest.tar.gz"
+ config['classifiers'] = [
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: NumPy License',
+ 'Natural Language :: English',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: C',
+ 'Programming Language :: Fortran',
+ 'Programming Language :: Python',
+ 'Topic :: Scientific/Engineering',
+ 'Topic :: Software Development :: Code Generators',
+ ]
+ setup(version=version,
+ description="F2PY - Fortran to Python Interface Generator",
+ author="Pearu Peterson",
+ author_email="[email protected]",
+ maintainer="Pearu Peterson",
+ maintainer_email="[email protected]",
+ license="BSD",
+ platforms="Unix, Windows (mingw|cygwin), Mac OSX",
+ long_description="""\
+The Fortran to Python Interface Generator, or F2PY for short, is a
+command line tool (f2py) for generating Python C/API modules for
+wrapping Fortran 77/90/95 subroutines, accessing common blocks from
+Python, and calling Python functions from Fortran (call-backs).
+Interfacing subroutines/data from Fortran 90/95 modules is supported.""",
+ url="http://cens.ioc.ee/projects/f2py2e/",
+ keywords=['Fortran', 'f2py'],
+ **config)
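The long_description above summarizes what f2py produces; besides the f2py command line, the same wrapping can be driven programmatically through numpy.f2py.compile. A hedged sketch (it needs a working Fortran compiler, and the module name, source text and the final import from the current directory are illustrative assumptions):

    import numpy.f2py

    src = '''
    subroutine twice(x, y)
      integer, intent(in) :: x
      integer, intent(out) :: y
      y = 2 * x
    end subroutine twice
    '''
    # Writes src to a temporary .f90 file and builds demo.so in the current directory.
    numpy.f2py.compile(src, modulename='demo', extension='.f90', verbose=False)

    import demo                      # works when the current directory is on sys.path
    print(demo.twice(21))            # -> 42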
diff --git a/contrib/python/numpy/py2/numpy/f2py/src/test/Makefile b/contrib/python/numpy/py2/numpy/f2py/src/test/Makefile
new file mode 100644
index 00000000000..0f8869f726f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/src/test/Makefile
@@ -0,0 +1,96 @@
+# -*- makefile -*-
+# File: Makefile-foo
+# Usage:
+# make -f Makefile-foo [MODE=opt|debug]
+# Notes:
+# 1) You must use GNU make; try `gmake ..' if `make' fails.
+# 2) This file is auto-generated with f2py (version 2.264).
+# f2py is a Fortran to Python Interface Generator (FPIG), Second Edition,
+# written by Pearu Peterson <[email protected]>.
+# See http://cens.ioc.ee/projects/f2py2e/
+# Generation date: Wed Sep 13 16:22:55 2000
+# $Revision: 1.2 $
+# $Date: 2000/09/17 16:10:27 $
+
+# Recommendation notes produced by f2py2e/buildmakefile.py:
+# ***
+
+PYINC = -I/numeric/include/python1.5/Numeric -I/numeric/include/python1.5
+INCLUDES = -I..
+LIBS = -L$(shell gcc -v 2>&1 | grep specs | sed -e 's/Reading specs from //g' | sed -e 's/\/specs//g') -lg2c
+LIBS=-L$$ABSOFT/lib -lfio -lf77math -lf90math
+LIBS=-L/numeric/bin -lvast90 -L/usr/lib/gcc-lib/i586-mandrake-linux/2.95.2 -lg2c
+
+# Wrapper generator:
+F2PY = /home/pearu/bin/f2py-cvs
+
+# Fortran compiler: Absoft f95
+FC = f95
+FC = f90
+FOPT =
+FDEBUG =
+FFLAGS = -B108 -YCFRL=1 -YCOM_NAMES=LCS -YCOM_PFX -YCOM_SFX=_ -YEXT_PFX -YEXT_NAMES=LCS
+FFLAGS =
+# C compiler: cc ('gcc 2.x.x' 2.95.2)
+CC = cc
+COPT =
+CDEBUG =
+CFLAGS = -fpic
+
+# Linker: ld ('GNU ld' 2.9.5)
+LD = ld
+LDFLAGS = -shared -s
+SO = .so
+
+ifeq '$(MODE)' 'debug'
+FFLAGS += $(FDEBUG)
+CFLAGS += $(CDEBUG)
+endif
+ifeq '$(MODE)' 'opt'
+FFLAGS += $(FOPT)
+CFLAGS += $(COPT)
+endif
+FFLAGS += $(INCLUDES)
+CFLAGS += $(PYINC) $(INCLUDES)
+
+SRCC = ../fortranobject.c
+SRCF = mod.f90 bar.f foo90.f90 wrap.f
+SRCS = $(SRCC) $(SRCF)
+OBJC = $(filter %.o,$(SRCC:.c=.o) $(SRCC:.cc=.o) $(SRCC:.C=.o))
+OBJF = $(filter %.o,$(SRCF:.f90=.o) $(SRCF:.f=.o) $(SRCF:.F=.o) $(SRCF:.for=.o))
+OBJS = $(OBJC) $(OBJF)
+
+INSTALLNAME = f2py2e-apps
+INSTALLDIRECTORY = /numeric/lib/python1.5/site-packages/$(INSTALLNAME)
+INSTALLDIR = install -d -c
+INSTALLEXEC = install -m 755 -c
+
+all: foo
+
+foo: foomodule$(SO)
+foomodule$(SO) : foomodule.o $(OBJS)
+ $(LD) $(LDFLAGS) -o $@ $< $(OBJS) $(LIBS)
+
+foomodule.o: foomodule.c
+
+
+$(OBJS) : $(SRCS)
+%.o : %.f ; $(FC) -c $(FFLAGS) $<
+%.o : %.f90 ; $(FC) -c $(FFLAGS) $<
+
+test: foomodule$(SO)
+ python -c 'import foo;print foo.__doc__'
+install: foomodule$(SO)
+ $(INSTALLDIR) $(INSTALLDIRECTORY)
+ $(INSTALLEXEC) foomodule$(SO) $(INSTALLDIRECTORY)
+ cd $(INSTALLDIRECTORY) && echo "$(INSTALLNAME)" > ../$(INSTALLNAME).pth
+
+.PHONY: clean distclean debug test install foo
+debug:
+ echo "OBJS=$(OBJS)"
+ echo "SRCS=$(SRCS)"
+clean:
+ $(RM) *.o *.mod core foomodule.{dvi,log} $(OBJS)
+distclean: clean
+ $(RM) *.so *.sl foomodule.{tex,so}
+ $(RM) .f2py_get_compiler_*
diff --git a/contrib/python/numpy/py2/numpy/f2py/src/test/bar.f b/contrib/python/numpy/py2/numpy/f2py/src/test/bar.f
new file mode 100644
index 00000000000..5354ceaf986
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/src/test/bar.f
@@ -0,0 +1,11 @@
+ subroutine bar()
+ integer a
+ real*8 b,c(3)
+ common /foodata/ a,b,c
+ a = 4
+ b = 6.7
+ c(2) = 3.0
+ write(*,*) "bar:a=",a
+ write(*,*) "bar:b=",b
+ write(*,*) "bar:c=",c
+ end
diff --git a/contrib/python/numpy/py2/numpy/f2py/src/test/foo.f b/contrib/python/numpy/py2/numpy/f2py/src/test/foo.f
new file mode 100644
index 00000000000..5354ceaf986
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/src/test/foo.f
@@ -0,0 +1,11 @@
+ subroutine bar()
+ integer a
+ real*8 b,c(3)
+ common /foodata/ a,b,c
+ a = 4
+ b = 6.7
+ c(2) = 3.0
+ write(*,*) "bar:a=",a
+ write(*,*) "bar:b=",b
+ write(*,*) "bar:c=",c
+ end
diff --git a/contrib/python/numpy/py2/numpy/f2py/src/test/foo90.f90 b/contrib/python/numpy/py2/numpy/f2py/src/test/foo90.f90
new file mode 100644
index 00000000000..dbca7e95ba8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/src/test/foo90.f90
@@ -0,0 +1,13 @@
+subroutine foo()
+ integer a
+ real*8 b,c(3)
+ common /foodata/ a,b,c
+ print*, " F: in foo"
+ a = 5
+ b = 6.3
+ c(2) = 9.1
+end subroutine foo
+
+
+
+
diff --git a/contrib/python/numpy/py2/numpy/f2py/src/test/foomodule.c b/contrib/python/numpy/py2/numpy/f2py/src/test/foomodule.c
new file mode 100644
index 00000000000..733fab0bed2
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/src/test/foomodule.c
@@ -0,0 +1,142 @@
+/* File: foomodule.c
+ * Example of FortranObject usage. See also wrap.f foo.f foo90.f90.
+ * Author: Pearu Peterson <[email protected]>.
+ * http://cens.ioc.ee/projects/f2py2e/
+ * $Revision: 1.2 $
+ * $Date: 2000/09/17 16:10:27 $
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "Python.h"
+#include "fortranobject.h"
+
+static PyObject *foo_error;
+
+#if defined(NO_APPEND_FORTRAN)
+#if defined(UPPERCASE_FORTRAN)
+#define F_FUNC(f,F) F
+#else
+#define F_FUNC(f,F) f
+#endif
+#else
+#if defined(UPPERCASE_FORTRAN)
+#define F_FUNC(f,F) F##_
+#else
+#define F_FUNC(f,F) f##_
+#endif
+#endif
+
+/************* foo_bar *************/
+static char doc_foo_bar[] = "\
+Function signature:\n\
+ bar()\n\
+";
+static PyObject *foo_bar(PyObject *capi_self, PyObject *capi_args,
+ PyObject *capi_keywds, void (*f2py_func)()) {
+ PyObject *capi_buildvalue = NULL;
+ static char *capi_kwlist[] = {NULL};
+ if (!PyArg_ParseTupleAndKeywords(capi_args,capi_keywds,\
+ "|:foo.bar",\
+ capi_kwlist))
+ goto capi_fail;
+ (*f2py_func)();
+ capi_buildvalue = Py_BuildValue("");
+ capi_fail:
+ return capi_buildvalue;
+}
+/************ mod_init **************/
+static PyObject *mod_init(PyObject *capi_self, PyObject *capi_args,
+ PyObject *capi_keywds, void (*f2py_func)()) {
+ PyObject *capi_buildvalue = NULL;
+ static char *capi_kwlist[] = {NULL};
+ if (!PyArg_ParseTupleAndKeywords(capi_args,capi_keywds,\
+ "|:mod.init",\
+ capi_kwlist))
+ goto capi_fail;
+ (*f2py_func)();
+ capi_buildvalue = Py_BuildValue("");
+ capi_fail:
+ return capi_buildvalue;
+}
+
+/* F90 module */
+static FortranDataDef f2py_mod_def[] = {
+ {"a",0, {}, NPY_INT},
+ {"b",0, {}, NPY_DOUBLE},
+ {"c",1, {3}, NPY_DOUBLE},
+ {"d",1, {-1}, NPY_DOUBLE},
+ {"init",-1,{},0,NULL,(void *)mod_init},
+ {NULL}
+};
+static void f2py_setup_mod(char *a,char *b,char *c,void (*d)(),char *init) {
+ f2py_mod_def[0].data = a;
+ f2py_mod_def[1].data = b;
+ f2py_mod_def[2].data = c;
+ f2py_mod_def[3].func = d;
+ f2py_mod_def[4].data = init;
+}
+extern void F_FUNC(f2pyinitmod,F2PYINITMOD)();
+ static void f2py_init_mod() {
+ F_FUNC(f2pyinitmod,F2PYINITMOD)(f2py_setup_mod);
+ }
+
+/* COMMON block */
+static FortranDataDef f2py_foodata_def[] = {
+ {"a",0, {}, NPY_INT},
+ {"b",0, {}, NPY_DOUBLE},
+ {"c",1, {3}, NPY_DOUBLE},
+ {NULL}
+};
+static void f2py_setup_foodata(char *a,char *b,char *c) {
+ f2py_foodata_def[0].data = a;
+ f2py_foodata_def[1].data = b;
+ f2py_foodata_def[2].data = c;
+}
+extern void F_FUNC(f2pyinitfoodata,F2PYINITFOODATA)();
+ static void f2py_init_foodata() {
+ F_FUNC(f2pyinitfoodata,F2PYINITFOODATA)(f2py_setup_foodata);
+ }
+
+/* Fortran routines (needs no initialization/setup function) */
+extern void F_FUNC(bar,BAR)();
+ extern void F_FUNC(foo,FOO)();
+ static FortranDataDef f2py_routines_def[] = {
+ {"bar",-1, {}, 0, (char *)F_FUNC(bar,BAR),(void *)foo_bar,doc_foo_bar},
+ {"foo",-1, {}, 0, (char *)F_FUNC(foo,FOO),(void *)foo_bar,doc_foo_bar},
+ {NULL}
+ };
+
+static PyMethodDef foo_module_methods[] = {
+ /*eof method*/
+ {NULL,NULL}
+};
+
+void initfoo() {
+ int i;
+ PyObject *m, *d, *s;
+ import_array();
+
+ m = Py_InitModule("foo", foo_module_methods);
+
+ d = PyModule_GetDict(m);
+ s = PyString_FromString("This module 'foo' demonstrates the usage of fortranobject.");
+ PyDict_SetItemString(d, "__doc__", s);
+
+ /* Fortran objects: */
+ PyDict_SetItemString(d, "mod", PyFortranObject_New(f2py_mod_def,f2py_init_mod));
+ PyDict_SetItemString(d, "foodata", PyFortranObject_New(f2py_foodata_def,f2py_init_foodata));
+ for(i=0;f2py_routines_def[i].name!=NULL;i++)
+ PyDict_SetItemString(d, f2py_routines_def[i].name,
+ PyFortranObject_NewAsAttr(&f2py_routines_def[i]));
+
+ Py_DECREF(s);
+
+ if (PyErr_Occurred())
+ Py_FatalError("can't initialize module foo");
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/python/numpy/py2/numpy/f2py/src/test/wrap.f b/contrib/python/numpy/py2/numpy/f2py/src/test/wrap.f
new file mode 100644
index 00000000000..9414eb9f6f8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/src/test/wrap.f
@@ -0,0 +1,70 @@
+ subroutine f2py_mod_get_dims(f2py_r,f2py_s,f2py_set,f2py_n)
+ use mod
+ external f2py_set
+ logical f2py_ns
+ integer f2py_s(*),f2py_r,f2py_i,f2py_j
+ character*(*) f2py_n
+ if ("d".eq.f2py_n) then
+ f2py_ns = .FALSE.
+ if (allocated(d)) then
+ do f2py_i=1,f2py_r
+ if ((size(d,f2py_r-f2py_i+1).ne.f2py_s(f2py_i)).and.
+ c (f2py_s(f2py_i).ge.0)) then
+ f2py_ns = .TRUE.
+ end if
+ end do
+ if (f2py_ns) then
+ deallocate(d)
+ end if
+ end if
+ if (.not.allocated(d)) then
+ allocate(d(f2py_s(1)))
+ end if
+ if (allocated(d)) then
+ do f2py_i=1,f2py_r
+ f2py_s(f2py_i) = size(d,f2py_r-f2py_i+1)
+ end do
+ call f2py_set(d)
+ end if
+ end if
+ end subroutine f2py_mod_get_dims
+ subroutine f2py_mod_get_dims_d(r,s,set_data)
+ use mod, only: d => d
+ external set_data
+ logical ns
+ integer s(*),r,i,j
+ ns = .FALSE.
+ if (allocated(d)) then
+ do i=1,r
+ if ((size(d,r-i+1).ne.s(i)).and.(s(i).ge.0)) then
+ ns = .TRUE.
+ end if
+ end do
+ if (ns) then
+ deallocate(d)
+ end if
+ end if
+ if (.not.allocated(d).and.(s(1).ge.1)) then
+ allocate(d(s(1)))
+ end if
+ if (allocated(d)) then
+ do i=1,r
+ s(i) = size(d,r-i+1)
+ end do
+ end if
+ call set_data(d,allocated(d))
+ end subroutine f2py_mod_get_dims_d
+
+ subroutine f2pyinitmod(setupfunc)
+ use mod
+ external setupfunc,f2py_mod_get_dims_d,init
+ call setupfunc(a,b,c,f2py_mod_get_dims_d,init)
+ end subroutine f2pyinitmod
+
+ subroutine f2pyinitfoodata(setupfunc)
+ external setupfunc
+ integer a
+ real*8 b,c(3)
+ common /foodata/ a,b,c
+ call setupfunc(a,b,c)
+ end subroutine f2pyinitfoodata
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/__init__.py b/contrib/python/numpy/py2/numpy/f2py/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c b/contrib/python/numpy/py2/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
new file mode 100644
index 00000000000..7f46303b014
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
@@ -0,0 +1,224 @@
+/* File: wrapmodule.c
+ * This file is auto-generated with f2py (version:2_1330).
+ * Hand edited by Pearu.
+ * f2py is a Fortran to Python Interface Generator (FPIG), Second Edition,
+ * written by Pearu Peterson <[email protected]>.
+ * See http://cens.ioc.ee/projects/f2py2e/
+ * Generation date: Fri Oct 21 22:41:12 2005
+ * $Revision:$
+ * $Date:$
+ * Do not edit this file directly unless you know what you are doing!!!
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************** See f2py2e/cfuncs.py: includes ***********************/
+#include "Python.h"
+#include "fortranobject.h"
+#include <math.h>
+
+static PyObject *wrap_error;
+static PyObject *wrap_module;
+
+/************************************ call ************************************/
+static char doc_f2py_rout_wrap_call[] = "\
+Function signature:\n\
+ arr = call(type_num,dims,intent,obj)\n\
+Required arguments:\n"
+" type_num : input int\n"
+" dims : input int-sequence\n"
+" intent : input int\n"
+" obj : input python object\n"
+"Return objects:\n"
+" arr : array";
+static PyObject *f2py_rout_wrap_call(PyObject *capi_self,
+ PyObject *capi_args) {
+ PyObject * volatile capi_buildvalue = NULL;
+ int type_num = 0;
+ npy_intp *dims = NULL;
+ PyObject *dims_capi = Py_None;
+ int rank = 0;
+ int intent = 0;
+ PyArrayObject *capi_arr_tmp = NULL;
+ PyObject *arr_capi = Py_None;
+ int i;
+
+ if (!PyArg_ParseTuple(capi_args,"iOiO|:wrap.call",\
+ &type_num,&dims_capi,&intent,&arr_capi))
+ return NULL;
+ rank = PySequence_Length(dims_capi);
+ dims = malloc(rank*sizeof(npy_intp));
+ for (i=0;i<rank;++i)
+ dims[i] = (npy_intp)PyInt_AsLong(PySequence_GetItem(dims_capi,i));
+
+ capi_arr_tmp = array_from_pyobj(type_num,dims,rank,intent|F2PY_INTENT_OUT,arr_capi);
+ if (capi_arr_tmp == NULL) {
+ free(dims);
+ return NULL;
+ }
+ capi_buildvalue = Py_BuildValue("N",capi_arr_tmp);
+ free(dims);
+ return capi_buildvalue;
+}
+
+static char doc_f2py_rout_wrap_attrs[] = "\
+Function signature:\n\
+ arr = array_attrs(arr)\n\
+Required arguments:\n"
+" arr : input array object\n"
+"Return objects:\n"
+" data : data address in hex\n"
+" nd : int\n"
+" dimensions : tuple\n"
+" strides : tuple\n"
+" base : python object\n"
+" (kind,type,type_num,elsize,alignment) : 4-tuple\n"
+" flags : int\n"
+" itemsize : int\n"
+;
+static PyObject *f2py_rout_wrap_attrs(PyObject *capi_self,
+ PyObject *capi_args) {
+ PyObject *arr_capi = Py_None;
+ PyArrayObject *arr = NULL;
+ PyObject *dimensions = NULL;
+ PyObject *strides = NULL;
+ char s[100];
+ int i;
+ memset(s,0,100*sizeof(char));
+ if (!PyArg_ParseTuple(capi_args,"O!|:wrap.attrs",
+ &PyArray_Type,&arr_capi))
+ return NULL;
+ arr = (PyArrayObject *)arr_capi;
+ sprintf(s,"%p",PyArray_DATA(arr));
+ dimensions = PyTuple_New(PyArray_NDIM(arr));
+ strides = PyTuple_New(PyArray_NDIM(arr));
+ for (i=0;i<PyArray_NDIM(arr);++i) {
+ PyTuple_SetItem(dimensions,i,PyInt_FromLong(PyArray_DIM(arr,i)));
+ PyTuple_SetItem(strides,i,PyInt_FromLong(PyArray_STRIDE(arr,i)));
+ }
+ return Py_BuildValue("siOOO(cciii)ii",s,PyArray_NDIM(arr),
+ dimensions,strides,
+ (PyArray_BASE(arr)==NULL?Py_None:PyArray_BASE(arr)),
+ PyArray_DESCR(arr)->kind,
+ PyArray_DESCR(arr)->type,
+ PyArray_TYPE(arr),
+ PyArray_ITEMSIZE(arr),
+ PyArray_DESCR(arr)->alignment,
+ PyArray_FLAGS(arr),
+ PyArray_ITEMSIZE(arr));
+}
+
+static PyMethodDef f2py_module_methods[] = {
+
+ {"call",f2py_rout_wrap_call,METH_VARARGS,doc_f2py_rout_wrap_call},
+ {"array_attrs",f2py_rout_wrap_attrs,METH_VARARGS,doc_f2py_rout_wrap_attrs},
+ {NULL,NULL}
+};
+
+#if PY_VERSION_HEX >= 0x03000000
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "test_array_from_pyobj_ext",
+ NULL,
+ -1,
+ f2py_module_methods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+#if PY_VERSION_HEX >= 0x03000000
+#define RETVAL m
+PyMODINIT_FUNC PyInit_test_array_from_pyobj_ext(void) {
+#else
+#define RETVAL
+PyMODINIT_FUNC inittest_array_from_pyobj_ext(void) {
+#endif
+ PyObject *m,*d, *s;
+#if PY_VERSION_HEX >= 0x03000000
+ m = wrap_module = PyModule_Create(&moduledef);
+#else
+ m = wrap_module = Py_InitModule("test_array_from_pyobj_ext", f2py_module_methods);
+#endif
+ Py_TYPE(&PyFortran_Type) = &PyType_Type;
+ import_array();
+ if (PyErr_Occurred())
+ Py_FatalError("can't initialize module wrap (failed to import numpy)");
+ d = PyModule_GetDict(m);
+ s = PyString_FromString("This module 'wrap' is auto-generated with f2py (version:2_1330).\nFunctions:\n"
+" arr = call(type_num,dims,intent,obj)\n"
+".");
+ PyDict_SetItemString(d, "__doc__", s);
+ wrap_error = PyErr_NewException ("wrap.error", NULL, NULL);
+ Py_DECREF(s);
+ PyDict_SetItemString(d, "F2PY_INTENT_IN", PyInt_FromLong(F2PY_INTENT_IN));
+ PyDict_SetItemString(d, "F2PY_INTENT_INOUT", PyInt_FromLong(F2PY_INTENT_INOUT));
+ PyDict_SetItemString(d, "F2PY_INTENT_OUT", PyInt_FromLong(F2PY_INTENT_OUT));
+ PyDict_SetItemString(d, "F2PY_INTENT_HIDE", PyInt_FromLong(F2PY_INTENT_HIDE));
+ PyDict_SetItemString(d, "F2PY_INTENT_CACHE", PyInt_FromLong(F2PY_INTENT_CACHE));
+ PyDict_SetItemString(d, "F2PY_INTENT_COPY", PyInt_FromLong(F2PY_INTENT_COPY));
+ PyDict_SetItemString(d, "F2PY_INTENT_C", PyInt_FromLong(F2PY_INTENT_C));
+ PyDict_SetItemString(d, "F2PY_OPTIONAL", PyInt_FromLong(F2PY_OPTIONAL));
+ PyDict_SetItemString(d, "F2PY_INTENT_INPLACE", PyInt_FromLong(F2PY_INTENT_INPLACE));
+ PyDict_SetItemString(d, "NPY_BOOL", PyInt_FromLong(NPY_BOOL));
+ PyDict_SetItemString(d, "NPY_BYTE", PyInt_FromLong(NPY_BYTE));
+ PyDict_SetItemString(d, "NPY_UBYTE", PyInt_FromLong(NPY_UBYTE));
+ PyDict_SetItemString(d, "NPY_SHORT", PyInt_FromLong(NPY_SHORT));
+ PyDict_SetItemString(d, "NPY_USHORT", PyInt_FromLong(NPY_USHORT));
+ PyDict_SetItemString(d, "NPY_INT", PyInt_FromLong(NPY_INT));
+ PyDict_SetItemString(d, "NPY_UINT", PyInt_FromLong(NPY_UINT));
+ PyDict_SetItemString(d, "NPY_INTP", PyInt_FromLong(NPY_INTP));
+ PyDict_SetItemString(d, "NPY_UINTP", PyInt_FromLong(NPY_UINTP));
+ PyDict_SetItemString(d, "NPY_LONG", PyInt_FromLong(NPY_LONG));
+ PyDict_SetItemString(d, "NPY_ULONG", PyInt_FromLong(NPY_ULONG));
+ PyDict_SetItemString(d, "NPY_LONGLONG", PyInt_FromLong(NPY_LONGLONG));
+ PyDict_SetItemString(d, "NPY_ULONGLONG", PyInt_FromLong(NPY_ULONGLONG));
+ PyDict_SetItemString(d, "NPY_FLOAT", PyInt_FromLong(NPY_FLOAT));
+ PyDict_SetItemString(d, "NPY_DOUBLE", PyInt_FromLong(NPY_DOUBLE));
+ PyDict_SetItemString(d, "NPY_LONGDOUBLE", PyInt_FromLong(NPY_LONGDOUBLE));
+ PyDict_SetItemString(d, "NPY_CFLOAT", PyInt_FromLong(NPY_CFLOAT));
+ PyDict_SetItemString(d, "NPY_CDOUBLE", PyInt_FromLong(NPY_CDOUBLE));
+ PyDict_SetItemString(d, "NPY_CLONGDOUBLE", PyInt_FromLong(NPY_CLONGDOUBLE));
+ PyDict_SetItemString(d, "NPY_OBJECT", PyInt_FromLong(NPY_OBJECT));
+ PyDict_SetItemString(d, "NPY_STRING", PyInt_FromLong(NPY_STRING));
+ PyDict_SetItemString(d, "NPY_UNICODE", PyInt_FromLong(NPY_UNICODE));
+ PyDict_SetItemString(d, "NPY_VOID", PyInt_FromLong(NPY_VOID));
+ PyDict_SetItemString(d, "NPY_NTYPES", PyInt_FromLong(NPY_NTYPES));
+ PyDict_SetItemString(d, "NPY_NOTYPE", PyInt_FromLong(NPY_NOTYPE));
+ PyDict_SetItemString(d, "NPY_USERDEF", PyInt_FromLong(NPY_USERDEF));
+
+ PyDict_SetItemString(d, "CONTIGUOUS", PyInt_FromLong(NPY_ARRAY_C_CONTIGUOUS));
+ PyDict_SetItemString(d, "FORTRAN", PyInt_FromLong(NPY_ARRAY_F_CONTIGUOUS));
+ PyDict_SetItemString(d, "OWNDATA", PyInt_FromLong(NPY_ARRAY_OWNDATA));
+ PyDict_SetItemString(d, "FORCECAST", PyInt_FromLong(NPY_ARRAY_FORCECAST));
+ PyDict_SetItemString(d, "ENSURECOPY", PyInt_FromLong(NPY_ARRAY_ENSURECOPY));
+ PyDict_SetItemString(d, "ENSUREARRAY", PyInt_FromLong(NPY_ARRAY_ENSUREARRAY));
+ PyDict_SetItemString(d, "ALIGNED", PyInt_FromLong(NPY_ARRAY_ALIGNED));
+ PyDict_SetItemString(d, "WRITEABLE", PyInt_FromLong(NPY_ARRAY_WRITEABLE));
+ PyDict_SetItemString(d, "UPDATEIFCOPY", PyInt_FromLong(NPY_ARRAY_UPDATEIFCOPY));
+ PyDict_SetItemString(d, "WRITEBACKIFCOPY", PyInt_FromLong(NPY_ARRAY_WRITEBACKIFCOPY));
+
+ PyDict_SetItemString(d, "BEHAVED", PyInt_FromLong(NPY_ARRAY_BEHAVED));
+ PyDict_SetItemString(d, "BEHAVED_NS", PyInt_FromLong(NPY_ARRAY_BEHAVED_NS));
+ PyDict_SetItemString(d, "CARRAY", PyInt_FromLong(NPY_ARRAY_CARRAY));
+ PyDict_SetItemString(d, "FARRAY", PyInt_FromLong(NPY_ARRAY_FARRAY));
+ PyDict_SetItemString(d, "CARRAY_RO", PyInt_FromLong(NPY_ARRAY_CARRAY_RO));
+ PyDict_SetItemString(d, "FARRAY_RO", PyInt_FromLong(NPY_ARRAY_FARRAY_RO));
+ PyDict_SetItemString(d, "DEFAULT", PyInt_FromLong(NPY_ARRAY_DEFAULT));
+ PyDict_SetItemString(d, "UPDATE_ALL", PyInt_FromLong(NPY_ARRAY_UPDATE_ALL));
+
+ if (PyErr_Occurred())
+ Py_FatalError("can't initialize module wrap");
+
+#ifdef F2PY_REPORT_ATEXIT
+ on_exit(f2py_report_on_exit,(void*)"array_from_pyobj.wrap.call");
+#endif
+
+ return RETVAL;
+}
+#ifdef __cplusplus
+}
+#endif
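wrapmodule.c exposes two helpers that test_array_from_pyobj.py (added below) drives directly: call(type_num, dims, intent, obj), which pushes an arbitrary Python object through f2py's array_from_pyobj(), and array_attrs(arr), which returns the ndarray's low-level attributes as a tuple. A hedged sketch of the Python side, assuming the extension has already been built by the test's setup_module (the import name below is how that build step registers it):

    import numpy as np
    import test_array_from_pyobj_ext as wrap   # built on the fly by the test suite

    obj = np.arange(6.0).reshape(2, 3)
    arr = wrap.call(wrap.NPY_DOUBLE, [2, 3], wrap.F2PY_INTENT_IN, obj)
    nd = wrap.array_attrs(arr)[1]              # index 1 is the number of dimensions
    print(nd)                                  # -> 2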
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap
new file mode 100644
index 00000000000..2665f89b52d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap
@@ -0,0 +1 @@
+dict(real=dict(rk="double"))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_free.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_free.f90
new file mode 100644
index 00000000000..b301710f5dd
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_free.f90
@@ -0,0 +1,34 @@
+
+subroutine sum(x, res)
+ implicit none
+ real, intent(in) :: x(:)
+ real, intent(out) :: res
+
+ integer :: i
+
+ !print *, "sum: size(x) = ", size(x)
+
+ res = 0.0
+
+ do i = 1, size(x)
+ res = res + x(i)
+ enddo
+
+end subroutine sum
+
+function fsum(x) result (res)
+ implicit none
+ real, intent(in) :: x(:)
+ real :: res
+
+ integer :: i
+
+ !print *, "fsum: size(x) = ", size(x)
+
+ res = 0.0
+
+ do i = 1, size(x)
+ res = res + x(i)
+ enddo
+
+end function fsum
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_mod.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_mod.f90
new file mode 100644
index 00000000000..cbe6317ed8f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_mod.f90
@@ -0,0 +1,41 @@
+
+module mod
+
+contains
+
+subroutine sum(x, res)
+ implicit none
+ real, intent(in) :: x(:)
+ real, intent(out) :: res
+
+ integer :: i
+
+ !print *, "sum: size(x) = ", size(x)
+
+ res = 0.0
+
+ do i = 1, size(x)
+ res = res + x(i)
+ enddo
+
+end subroutine sum
+
+function fsum(x) result (res)
+ implicit none
+ real, intent(in) :: x(:)
+ real :: res
+
+ integer :: i
+
+ !print *, "fsum: size(x) = ", size(x)
+
+ res = 0.0
+
+ do i = 1, size(x)
+ res = res + x(i)
+ enddo
+
+end function fsum
+
+
+end module mod
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_use.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_use.f90
new file mode 100644
index 00000000000..337465ac540
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/foo_use.f90
@@ -0,0 +1,19 @@
+subroutine sum_with_use(x, res)
+ use precision
+
+ implicit none
+
+ real(kind=rk), intent(in) :: x(:)
+ real(kind=rk), intent(out) :: res
+
+ integer :: i
+
+ !print *, "size(x) = ", size(x)
+
+ res = 0.0
+
+ do i = 1, size(x)
+ res = res + x(i)
+ enddo
+
+ end subroutine
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/precision.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/precision.f90
new file mode 100644
index 00000000000..ed6c70cbbe7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/assumed_shape/precision.f90
@@ -0,0 +1,4 @@
+module precision
+ integer, parameter :: rk = selected_real_kind(8)
+ integer, parameter :: ik = selected_real_kind(4)
+end module
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/common/block.f b/contrib/python/numpy/py2/numpy/f2py/tests/src/common/block.f
new file mode 100644
index 00000000000..7ea7968fe93
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/common/block.f
@@ -0,0 +1,11 @@
+ SUBROUTINE INITCB
+ DOUBLE PRECISION LONG
+ CHARACTER STRING
+ INTEGER OK
+
+ COMMON /BLOCK/ LONG, STRING, OK
+ LONG = 1.0
+ STRING = '2'
+ OK = 3
+ RETURN
+ END
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/kind/foo.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/kind/foo.f90
new file mode 100644
index 00000000000..d3d15cfb20a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/kind/foo.f90
@@ -0,0 +1,20 @@
+
+
+subroutine selectedrealkind(p, r, res)
+ implicit none
+
+ integer, intent(in) :: p, r
+ !f2py integer :: r=0
+ integer, intent(out) :: res
+ res = selected_real_kind(p, r)
+
+end subroutine
+
+subroutine selectedintkind(p, res)
+ implicit none
+
+ integer, intent(in) :: p
+ integer, intent(out) :: res
+ res = selected_int_kind(p)
+
+end subroutine
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo.f b/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo.f
new file mode 100644
index 00000000000..c34742578f8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo.f
@@ -0,0 +1,5 @@
+ subroutine bar11(a)
+cf2py intent(out) a
+ integer a
+ a = 11
+ end
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_fixed.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_fixed.f90
new file mode 100644
index 00000000000..7543a6acb73
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_fixed.f90
@@ -0,0 +1,8 @@
+ module foo_fixed
+ contains
+ subroutine bar12(a)
+!f2py intent(out) a
+ integer a
+ a = 12
+ end subroutine bar12
+ end module foo_fixed
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_free.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_free.f90
new file mode 100644
index 00000000000..c1b641f13ec
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/mixed/foo_free.f90
@@ -0,0 +1,8 @@
+module foo_free
+contains
+ subroutine bar13(a)
+ !f2py intent(out) a
+ integer a
+ a = 13
+ end subroutine bar13
+end module foo_free
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_both.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_both.f90
new file mode 100644
index 00000000000..ac90cedc525
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_both.f90
@@ -0,0 +1,57 @@
+! Check that parameters are correctly intercepted.
+! Constants with comma separations are commonly
+! used, for instance Pi = 3._dp
+subroutine foo(x)
+ implicit none
+ integer, parameter :: sp = selected_real_kind(6)
+ integer, parameter :: dp = selected_real_kind(15)
+ integer, parameter :: ii = selected_int_kind(9)
+ integer, parameter :: il = selected_int_kind(18)
+ real(dp), intent(inout) :: x
+ dimension x(3)
+ real(sp), parameter :: three_s = 3._sp
+ real(dp), parameter :: three_d = 3._dp
+ integer(ii), parameter :: three_i = 3_ii
+ integer(il), parameter :: three_l = 3_il
+ x(1) = x(1) + x(2) * three_s * three_i + x(3) * three_d * three_l
+ x(2) = x(2) * three_s
+ x(3) = x(3) * three_l
+ return
+end subroutine
+
+
+subroutine foo_no(x)
+ implicit none
+ integer, parameter :: sp = selected_real_kind(6)
+ integer, parameter :: dp = selected_real_kind(15)
+ integer, parameter :: ii = selected_int_kind(9)
+ integer, parameter :: il = selected_int_kind(18)
+ real(dp), intent(inout) :: x
+ dimension x(3)
+ real(sp), parameter :: three_s = 3.
+ real(dp), parameter :: three_d = 3.
+ integer(ii), parameter :: three_i = 3
+ integer(il), parameter :: three_l = 3
+ x(1) = x(1) + x(2) * three_s * three_i + x(3) * three_d * three_l
+ x(2) = x(2) * three_s
+ x(3) = x(3) * three_l
+ return
+end subroutine
+
+subroutine foo_sum(x)
+ implicit none
+ integer, parameter :: sp = selected_real_kind(6)
+ integer, parameter :: dp = selected_real_kind(15)
+ integer, parameter :: ii = selected_int_kind(9)
+ integer, parameter :: il = selected_int_kind(18)
+ real(dp), intent(inout) :: x
+ dimension x(3)
+ real(sp), parameter :: three_s = 2._sp + 1._sp
+ real(dp), parameter :: three_d = 1._dp + 2._dp
+ integer(ii), parameter :: three_i = 2_ii + 1_ii
+ integer(il), parameter :: three_l = 1_il + 2_il
+ x(1) = x(1) + x(2) * three_s * three_i + x(3) * three_d * three_l
+ x(2) = x(2) * three_s
+ x(3) = x(3) * three_l
+ return
+end subroutine
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_compound.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_compound.f90
new file mode 100644
index 00000000000..e51f5e9b2fb
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_compound.f90
@@ -0,0 +1,15 @@
+! Check that parameters are correctly intercepted.
+! Constants with comma separations are commonly
+! used, for instance Pi = 3._dp
+subroutine foo_compound_int(x)
+ implicit none
+ integer, parameter :: ii = selected_int_kind(9)
+ integer(ii), intent(inout) :: x
+ dimension x(3)
+ integer(ii), parameter :: three = 3_ii
+ integer(ii), parameter :: two = 2_ii
+ integer(ii), parameter :: six = three * 1_ii * two
+
+ x(1) = x(1) + x(2) + x(3) * six
+ return
+end subroutine
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_integer.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_integer.f90
new file mode 100644
index 00000000000..aaa83d2eb24
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_integer.f90
@@ -0,0 +1,22 @@
+! Check that parameters are correctly intercepted.
+! Constants with comma separations are commonly
+! used, for instance Pi = 3._dp
+subroutine foo_int(x)
+ implicit none
+ integer, parameter :: ii = selected_int_kind(9)
+ integer(ii), intent(inout) :: x
+ dimension x(3)
+ integer(ii), parameter :: three = 3_ii
+ x(1) = x(1) + x(2) + x(3) * three
+ return
+end subroutine
+
+subroutine foo_long(x)
+ implicit none
+ integer, parameter :: ii = selected_int_kind(18)
+ integer(ii), intent(inout) :: x
+ dimension x(3)
+ integer(ii), parameter :: three = 3_ii
+ x(1) = x(1) + x(2) + x(3) * three
+ return
+end subroutine
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_non_compound.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_non_compound.f90
new file mode 100644
index 00000000000..62c9a5b943c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_non_compound.f90
@@ -0,0 +1,23 @@
+! Check that parameters are correctly intercepted.
+! Specifically that types of constants without
+! compound kind specs are correctly inferred
+! adapted Gibbs iteration code from pymc
+! for this test case
+subroutine foo_non_compound_int(x)
+ implicit none
+ integer, parameter :: ii = selected_int_kind(9)
+
+ integer(ii) maxiterates
+ parameter (maxiterates=2)
+
+ integer(ii) maxseries
+ parameter (maxseries=2)
+
+ integer(ii) wasize
+ parameter (wasize=maxiterates*maxseries)
+ integer(ii), intent(inout) :: x
+ dimension x(wasize)
+
+ x(1) = x(1) + x(2) + x(3) + x(4) * wasize
+ return
+end subroutine
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_real.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_real.f90
new file mode 100644
index 00000000000..02ac9dd993b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/parameter/constant_real.f90
@@ -0,0 +1,23 @@
+! Check that parameters are correctly intercepted.
+! Constants with comma separations are commonly
+! used, for instance Pi = 3._dp
+subroutine foo_single(x)
+ implicit none
+ integer, parameter :: rp = selected_real_kind(6)
+ real(rp), intent(inout) :: x
+ dimension x(3)
+ real(rp), parameter :: three = 3._rp
+ x(1) = x(1) + x(2) + x(3) * three
+ return
+end subroutine
+
+subroutine foo_double(x)
+ implicit none
+ integer, parameter :: rp = selected_real_kind(15)
+ real(rp), intent(inout) :: x
+ dimension x(3)
+ real(rp), parameter :: three = 3._rp
+ x(1) = x(1) + x(2) + x(3) * three
+ return
+end subroutine
+
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/regression/inout.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/regression/inout.f90
new file mode 100644
index 00000000000..80cdad90cec
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/regression/inout.f90
@@ -0,0 +1,9 @@
+! Check that intent(in out) translates as intent(inout).
+! The separation seems to be a common usage.
+ subroutine foo(x)
+ implicit none
+ real(4), intent(in out) :: x
+ dimension x(3)
+ x(1) = x(1) + x(2) + x(3)
+ return
+ end
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/size/foo.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/size/foo.f90
new file mode 100644
index 00000000000..5b66f8c430d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/size/foo.f90
@@ -0,0 +1,44 @@
+
+subroutine foo(a, n, m, b)
+ implicit none
+
+ real, intent(in) :: a(n, m)
+ integer, intent(in) :: n, m
+ real, intent(out) :: b(size(a, 1))
+
+ integer :: i
+
+ do i = 1, size(b)
+ b(i) = sum(a(i,:))
+ enddo
+end subroutine
+
+subroutine trans(x,y)
+ implicit none
+ real, intent(in), dimension(:,:) :: x
+ real, intent(out), dimension( size(x,2), size(x,1) ) :: y
+ integer :: N, M, i, j
+ N = size(x,1)
+ M = size(x,2)
+ DO i=1,N
+ do j=1,M
+ y(j,i) = x(i,j)
+ END DO
+ END DO
+end subroutine trans
+
+subroutine flatten(x,y)
+ implicit none
+ real, intent(in), dimension(:,:) :: x
+ real, intent(out), dimension( size(x) ) :: y
+ integer :: N, M, i, j, k
+ N = size(x,1)
+ M = size(x,2)
+ k = 1
+ DO i=1,N
+ do j=1,M
+ y(k) = x(i,j)
+ k = k + 1
+ END DO
+ END DO
+end subroutine flatten
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/src/string/char.f90 b/contrib/python/numpy/py2/numpy/f2py/tests/src/string/char.f90
new file mode 100644
index 00000000000..bb7985ce50f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/src/string/char.f90
@@ -0,0 +1,29 @@
+MODULE char_test
+
+CONTAINS
+
+SUBROUTINE change_strings(strings, n_strs, out_strings)
+ IMPLICIT NONE
+
+ ! Inputs
+ INTEGER, INTENT(IN) :: n_strs
+ CHARACTER, INTENT(IN), DIMENSION(2,n_strs) :: strings
+ CHARACTER, INTENT(OUT), DIMENSION(2,n_strs) :: out_strings
+
+!f2py INTEGER, INTENT(IN) :: n_strs
+!f2py CHARACTER, INTENT(IN), DIMENSION(2,n_strs) :: strings
+!f2py CHARACTER, INTENT(OUT), DIMENSION(2,n_strs) :: strings
+
+ ! Misc.
+ INTEGER*4 :: j
+
+
+ DO j=1, n_strs
+ out_strings(1,j) = strings(1,j)
+ out_strings(2,j) = 'A'
+ END DO
+
+END SUBROUTINE change_strings
+
+END MODULE char_test
+
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_array_from_pyobj.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_array_from_pyobj.py
new file mode 100644
index 00000000000..a80090185d0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_array_from_pyobj.py
@@ -0,0 +1,581 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import copy
+import pytest
+
+from numpy import (
+ array, alltrue, ndarray, zeros, dtype, intp, clongdouble
+ )
+from numpy.testing import assert_, assert_equal
+from numpy.core.multiarray import typeinfo
+from . import util
+
+wrap = None
+
+
+def setup_module():
+ """
+ Build the required testing extension module
+
+ """
+ global wrap
+
+ # Check compiler availability first
+ if not util.has_c_compiler():
+ pytest.skip("No C compiler available")
+
+ if wrap is None:
+ config_code = """
+ config.add_extension('test_array_from_pyobj_ext',
+ sources=['wrapmodule.c', 'fortranobject.c'],
+ define_macros=[])
+ """
+ d = os.path.dirname(__file__)
+ src = [os.path.join(d, 'src', 'array_from_pyobj', 'wrapmodule.c'),
+ os.path.join(d, '..', 'src', 'fortranobject.c'),
+ os.path.join(d, '..', 'src', 'fortranobject.h')]
+ wrap = util.build_module_distutils(src, config_code,
+ 'test_array_from_pyobj_ext')
+
+
+def flags_info(arr):
+ flags = wrap.array_attrs(arr)[6]
+ return flags2names(flags)
+
+
+def flags2names(flags):
+ info = []
+ for flagname in ['CONTIGUOUS', 'FORTRAN', 'OWNDATA', 'ENSURECOPY',
+ 'ENSUREARRAY', 'ALIGNED', 'NOTSWAPPED', 'WRITEABLE',
+ 'WRITEBACKIFCOPY', 'UPDATEIFCOPY', 'BEHAVED', 'BEHAVED_RO',
+ 'CARRAY', 'FARRAY'
+ ]:
+ if abs(flags) & getattr(wrap, flagname, 0):
+ info.append(flagname)
+ return info
+
+
+class Intent(object):
+
+ def __init__(self, intent_list=[]):
+ self.intent_list = intent_list[:]
+ flags = 0
+ for i in intent_list:
+ if i == 'optional':
+ flags |= wrap.F2PY_OPTIONAL
+ else:
+ flags |= getattr(wrap, 'F2PY_INTENT_' + i.upper())
+ self.flags = flags
+
+ def __getattr__(self, name):
+ name = name.lower()
+ if name == 'in_':
+ name = 'in'
+ return self.__class__(self.intent_list + [name])
+
+ def __str__(self):
+ return 'intent(%s)' % (','.join(self.intent_list))
+
+ def __repr__(self):
+ return 'Intent(%r)' % (self.intent_list)
+
+ def is_intent(self, *names):
+ for name in names:
+ if name not in self.intent_list:
+ return False
+ return True
+
+ def is_intent_exact(self, *names):
+ return len(self.intent_list) == len(names) and self.is_intent(*names)
+
+intent = Intent()
+
+_type_names = ['BOOL', 'BYTE', 'UBYTE', 'SHORT', 'USHORT', 'INT', 'UINT',
+ 'LONG', 'ULONG', 'LONGLONG', 'ULONGLONG',
+ 'FLOAT', 'DOUBLE', 'CFLOAT']
+
+_cast_dict = {'BOOL': ['BOOL']}
+_cast_dict['BYTE'] = _cast_dict['BOOL'] + ['BYTE']
+_cast_dict['UBYTE'] = _cast_dict['BOOL'] + ['UBYTE']
+_cast_dict['BYTE'] = ['BYTE']
+_cast_dict['UBYTE'] = ['UBYTE']
+_cast_dict['SHORT'] = _cast_dict['BYTE'] + ['UBYTE', 'SHORT']
+_cast_dict['USHORT'] = _cast_dict['UBYTE'] + ['BYTE', 'USHORT']
+_cast_dict['INT'] = _cast_dict['SHORT'] + ['USHORT', 'INT']
+_cast_dict['UINT'] = _cast_dict['USHORT'] + ['SHORT', 'UINT']
+
+_cast_dict['LONG'] = _cast_dict['INT'] + ['LONG']
+_cast_dict['ULONG'] = _cast_dict['UINT'] + ['ULONG']
+
+_cast_dict['LONGLONG'] = _cast_dict['LONG'] + ['LONGLONG']
+_cast_dict['ULONGLONG'] = _cast_dict['ULONG'] + ['ULONGLONG']
+
+_cast_dict['FLOAT'] = _cast_dict['SHORT'] + ['USHORT', 'FLOAT']
+_cast_dict['DOUBLE'] = _cast_dict['INT'] + ['UINT', 'FLOAT', 'DOUBLE']
+
+_cast_dict['CFLOAT'] = _cast_dict['FLOAT'] + ['CFLOAT']
+
+# 32 bit system malloc typically does not provide the alignment required by
+# 16 byte long double types; this means the inout intent cannot be satisfied
+# and several tests fail, as the alignment flag can be randomly true or false.
+# When numpy gains an aligned allocator the tests could be enabled again.
+if ((intp().dtype.itemsize != 4 or clongdouble().dtype.alignment <= 8) and
+ sys.platform != 'win32'):
+ _type_names.extend(['LONGDOUBLE', 'CDOUBLE', 'CLONGDOUBLE'])
+ _cast_dict['LONGDOUBLE'] = _cast_dict['LONG'] + \
+ ['ULONG', 'FLOAT', 'DOUBLE', 'LONGDOUBLE']
+ _cast_dict['CLONGDOUBLE'] = _cast_dict['LONGDOUBLE'] + \
+ ['CFLOAT', 'CDOUBLE', 'CLONGDOUBLE']
+ _cast_dict['CDOUBLE'] = _cast_dict['DOUBLE'] + ['CFLOAT', 'CDOUBLE']
+
+
+class Type(object):
+ _type_cache = {}
+
+ def __new__(cls, name):
+ if isinstance(name, dtype):
+ dtype0 = name
+ name = None
+ for n, i in typeinfo.items():
+ if not isinstance(i, type) and dtype0.type is i.type:
+ name = n
+ break
+ obj = cls._type_cache.get(name.upper(), None)
+ if obj is not None:
+ return obj
+ obj = object.__new__(cls)
+ obj._init(name)
+ cls._type_cache[name.upper()] = obj
+ return obj
+
+ def _init(self, name):
+ self.NAME = name.upper()
+ info = typeinfo[self.NAME]
+ self.type_num = getattr(wrap, 'NPY_' + self.NAME)
+ assert_equal(self.type_num, info.num)
+ self.dtype = info.type
+ self.elsize = info.bits / 8
+ self.dtypechar = info.char
+
+ def cast_types(self):
+ return [self.__class__(_m) for _m in _cast_dict[self.NAME]]
+
+ def all_types(self):
+ return [self.__class__(_m) for _m in _type_names]
+
+ def smaller_types(self):
+ bits = typeinfo[self.NAME].alignment
+ types = []
+ for name in _type_names:
+ if typeinfo[name].alignment < bits:
+ types.append(Type(name))
+ return types
+
+ def equal_types(self):
+ bits = typeinfo[self.NAME].alignment
+ types = []
+ for name in _type_names:
+ if name == self.NAME:
+ continue
+ if typeinfo[name].alignment == bits:
+ types.append(Type(name))
+ return types
+
+ def larger_types(self):
+ bits = typeinfo[self.NAME].alignment
+ types = []
+ for name in _type_names:
+ if typeinfo[name].alignment > bits:
+ types.append(Type(name))
+ return types
+
+
+class Array(object):
+
+ def __init__(self, typ, dims, intent, obj):
+ self.type = typ
+ self.dims = dims
+ self.intent = intent
+ self.obj_copy = copy.deepcopy(obj)
+ self.obj = obj
+
+ # arr.dtypechar may be different from typ.dtypechar
+ self.arr = wrap.call(typ.type_num, dims, intent.flags, obj)
+
+ assert_(isinstance(self.arr, ndarray), repr(type(self.arr)))
+
+ self.arr_attr = wrap.array_attrs(self.arr)
+
+ if len(dims) > 1:
+ if self.intent.is_intent('c'):
+ assert_(intent.flags & wrap.F2PY_INTENT_C)
+ assert_(not self.arr.flags['FORTRAN'],
+ repr((self.arr.flags, getattr(obj, 'flags', None))))
+ assert_(self.arr.flags['CONTIGUOUS'])
+ assert_(not self.arr_attr[6] & wrap.FORTRAN)
+ else:
+ assert_(not intent.flags & wrap.F2PY_INTENT_C)
+ assert_(self.arr.flags['FORTRAN'])
+ assert_(not self.arr.flags['CONTIGUOUS'])
+ assert_(self.arr_attr[6] & wrap.FORTRAN)
+
+ if obj is None:
+ self.pyarr = None
+ self.pyarr_attr = None
+ return
+
+ if intent.is_intent('cache'):
+ assert_(isinstance(obj, ndarray), repr(type(obj)))
+ self.pyarr = array(obj).reshape(*dims).copy()
+ else:
+ self.pyarr = array(array(obj, dtype=typ.dtypechar).reshape(*dims),
+ order=self.intent.is_intent('c') and 'C' or 'F')
+ assert_(self.pyarr.dtype == typ,
+ repr((self.pyarr.dtype, typ)))
+ assert_(self.pyarr.flags['OWNDATA'], (obj, intent))
+ self.pyarr_attr = wrap.array_attrs(self.pyarr)
+
+ if len(dims) > 1:
+ if self.intent.is_intent('c'):
+ assert_(not self.pyarr.flags['FORTRAN'])
+ assert_(self.pyarr.flags['CONTIGUOUS'])
+ assert_(not self.pyarr_attr[6] & wrap.FORTRAN)
+ else:
+ assert_(self.pyarr.flags['FORTRAN'])
+ assert_(not self.pyarr.flags['CONTIGUOUS'])
+ assert_(self.pyarr_attr[6] & wrap.FORTRAN)
+
+ assert_(self.arr_attr[1] == self.pyarr_attr[1]) # nd
+ assert_(self.arr_attr[2] == self.pyarr_attr[2]) # dimensions
+ if self.arr_attr[1] <= 1:
+ assert_(self.arr_attr[3] == self.pyarr_attr[3],
+ repr((self.arr_attr[3], self.pyarr_attr[3],
+ self.arr.tobytes(), self.pyarr.tobytes()))) # strides
+ assert_(self.arr_attr[5][-2:] == self.pyarr_attr[5][-2:],
+ repr((self.arr_attr[5], self.pyarr_attr[5]))) # descr
+ assert_(self.arr_attr[6] == self.pyarr_attr[6],
+ repr((self.arr_attr[6], self.pyarr_attr[6],
+ flags2names(0 * self.arr_attr[6] - self.pyarr_attr[6]),
+ flags2names(self.arr_attr[6]), intent))) # flags
+
+ if intent.is_intent('cache'):
+ assert_(self.arr_attr[5][3] >= self.type.elsize,
+ repr((self.arr_attr[5][3], self.type.elsize)))
+ else:
+ assert_(self.arr_attr[5][3] == self.type.elsize,
+ repr((self.arr_attr[5][3], self.type.elsize)))
+ assert_(self.arr_equal(self.pyarr, self.arr))
+
+ if isinstance(self.obj, ndarray):
+ if typ.elsize == Type(obj.dtype).elsize:
+ if not intent.is_intent('copy') and self.arr_attr[1] <= 1:
+ assert_(self.has_shared_memory())
+
+ def arr_equal(self, arr1, arr2):
+ if arr1.shape != arr2.shape:
+ return False
+ s = arr1 == arr2
+ return alltrue(s.flatten())
+
+ def __str__(self):
+ return str(self.arr)
+
+ def has_shared_memory(self):
+ """Check that created array shares data with input array.
+ """
+ if self.obj is self.arr:
+ return True
+ if not isinstance(self.obj, ndarray):
+ return False
+ obj_attr = wrap.array_attrs(self.obj)
+ return obj_attr[0] == self.arr_attr[0]
+
+
+class TestIntent(object):
+
+ def test_in_out(self):
+ assert_equal(str(intent.in_.out), 'intent(in,out)')
+ assert_(intent.in_.c.is_intent('c'))
+ assert_(not intent.in_.c.is_intent_exact('c'))
+ assert_(intent.in_.c.is_intent_exact('c', 'in'))
+ assert_(intent.in_.c.is_intent_exact('in', 'c'))
+ assert_(not intent.in_.is_intent('c'))
+
+
+class TestSharedMemory(object):
+ num2seq = [1, 2]
+ num23seq = [[1, 2, 3], [4, 5, 6]]
+
+ @pytest.fixture(autouse=True, scope='class', params=_type_names)
+ def setup_type(self, request):
+ request.cls.type = Type(request.param)
+ request.cls.array = lambda self, dims, intent, obj: \
+ Array(Type(request.param), dims, intent, obj)
+
+ def test_in_from_2seq(self):
+ a = self.array([2], intent.in_, self.num2seq)
+ assert_(not a.has_shared_memory())
+
+ def test_in_from_2casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num2seq, dtype=t.dtype)
+ a = self.array([len(self.num2seq)], intent.in_, obj)
+ if t.elsize == self.type.elsize:
+ assert_(
+ a.has_shared_memory(), repr((self.type.dtype, t.dtype)))
+ else:
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_inout_2seq(self):
+ obj = array(self.num2seq, dtype=self.type.dtype)
+ a = self.array([len(self.num2seq)], intent.inout, obj)
+ assert_(a.has_shared_memory())
+
+ try:
+ a = self.array([2], intent.in_.inout, self.num2seq)
+ except TypeError as msg:
+ if not str(msg).startswith('failed to initialize intent'
+ '(inout|inplace|cache) array'):
+ raise
+ else:
+ raise SystemError('intent(inout) should have failed on sequence')
+
+ def test_f_inout_23seq(self):
+ obj = array(self.num23seq, dtype=self.type.dtype, order='F')
+ shape = (len(self.num23seq), len(self.num23seq[0]))
+ a = self.array(shape, intent.in_.inout, obj)
+ assert_(a.has_shared_memory())
+
+ obj = array(self.num23seq, dtype=self.type.dtype, order='C')
+ shape = (len(self.num23seq), len(self.num23seq[0]))
+ try:
+ a = self.array(shape, intent.in_.inout, obj)
+ except ValueError as msg:
+ if not str(msg).startswith('failed to initialize intent'
+ '(inout) array'):
+ raise
+ else:
+ raise SystemError(
+ 'intent(inout) should have failed on improper array')
+
+ def test_c_inout_23seq(self):
+ obj = array(self.num23seq, dtype=self.type.dtype)
+ shape = (len(self.num23seq), len(self.num23seq[0]))
+ a = self.array(shape, intent.in_.c.inout, obj)
+ assert_(a.has_shared_memory())
+
+ def test_in_copy_from_2casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num2seq, dtype=t.dtype)
+ a = self.array([len(self.num2seq)], intent.in_.copy, obj)
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_c_in_from_23seq(self):
+ a = self.array([len(self.num23seq), len(self.num23seq[0])],
+ intent.in_, self.num23seq)
+ assert_(not a.has_shared_memory())
+
+ def test_in_from_23casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num23seq, dtype=t.dtype)
+ a = self.array([len(self.num23seq), len(self.num23seq[0])],
+ intent.in_, obj)
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_f_in_from_23casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num23seq, dtype=t.dtype, order='F')
+ a = self.array([len(self.num23seq), len(self.num23seq[0])],
+ intent.in_, obj)
+ if t.elsize == self.type.elsize:
+ assert_(a.has_shared_memory(), repr(t.dtype))
+ else:
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_c_in_from_23casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num23seq, dtype=t.dtype)
+ a = self.array([len(self.num23seq), len(self.num23seq[0])],
+ intent.in_.c, obj)
+ if t.elsize == self.type.elsize:
+ assert_(a.has_shared_memory(), repr(t.dtype))
+ else:
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_f_copy_in_from_23casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num23seq, dtype=t.dtype, order='F')
+ a = self.array([len(self.num23seq), len(self.num23seq[0])],
+ intent.in_.copy, obj)
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_c_copy_in_from_23casttype(self):
+ for t in self.type.cast_types():
+ obj = array(self.num23seq, dtype=t.dtype)
+ a = self.array([len(self.num23seq), len(self.num23seq[0])],
+ intent.in_.c.copy, obj)
+ assert_(not a.has_shared_memory(), repr(t.dtype))
+
+ def test_in_cache_from_2casttype(self):
+ for t in self.type.all_types():
+ if t.elsize != self.type.elsize:
+ continue
+ obj = array(self.num2seq, dtype=t.dtype)
+ shape = (len(self.num2seq),)
+ a = self.array(shape, intent.in_.c.cache, obj)
+ assert_(a.has_shared_memory(), repr(t.dtype))
+
+ a = self.array(shape, intent.in_.cache, obj)
+ assert_(a.has_shared_memory(), repr(t.dtype))
+
+ obj = array(self.num2seq, dtype=t.dtype, order='F')
+ a = self.array(shape, intent.in_.c.cache, obj)
+ assert_(a.has_shared_memory(), repr(t.dtype))
+
+ a = self.array(shape, intent.in_.cache, obj)
+ assert_(a.has_shared_memory(), repr(t.dtype))
+
+ try:
+ a = self.array(shape, intent.in_.cache, obj[::-1])
+ except ValueError as msg:
+ if not str(msg).startswith('failed to initialize'
+ ' intent(cache) array'):
+ raise
+ else:
+ raise SystemError(
+ 'intent(cache) should have failed on multisegmented array')
+
+ def test_in_cache_from_2casttype_failure(self):
+ for t in self.type.all_types():
+ if t.elsize >= self.type.elsize:
+ continue
+ obj = array(self.num2seq, dtype=t.dtype)
+ shape = (len(self.num2seq),)
+ try:
+ self.array(shape, intent.in_.cache, obj) # Should fail
+ except ValueError as msg:
+ if not str(msg).startswith('failed to initialize'
+ ' intent(cache) array'):
+ raise
+ else:
+ raise SystemError(
+ 'intent(cache) should have failed on smaller array')
+
+ def test_cache_hidden(self):
+ shape = (2,)
+ a = self.array(shape, intent.cache.hide, None)
+ assert_(a.arr.shape == shape)
+
+ shape = (2, 3)
+ a = self.array(shape, intent.cache.hide, None)
+ assert_(a.arr.shape == shape)
+
+ shape = (-1, 3)
+ try:
+ a = self.array(shape, intent.cache.hide, None)
+ except ValueError as msg:
+ if not str(msg).startswith('failed to create intent'
+ '(cache|hide)|optional array'):
+ raise
+ else:
+ raise SystemError(
+ 'intent(cache) should have failed on undefined dimensions')
+
+ def test_hidden(self):
+ shape = (2,)
+ a = self.array(shape, intent.hide, None)
+ assert_(a.arr.shape == shape)
+ assert_(a.arr_equal(a.arr, zeros(shape, dtype=self.type.dtype)))
+
+ shape = (2, 3)
+ a = self.array(shape, intent.hide, None)
+ assert_(a.arr.shape == shape)
+ assert_(a.arr_equal(a.arr, zeros(shape, dtype=self.type.dtype)))
+ assert_(a.arr.flags['FORTRAN'] and not a.arr.flags['CONTIGUOUS'])
+
+ shape = (2, 3)
+ a = self.array(shape, intent.c.hide, None)
+ assert_(a.arr.shape == shape)
+ assert_(a.arr_equal(a.arr, zeros(shape, dtype=self.type.dtype)))
+ assert_(not a.arr.flags['FORTRAN'] and a.arr.flags['CONTIGUOUS'])
+
+ shape = (-1, 3)
+ try:
+ a = self.array(shape, intent.hide, None)
+ except ValueError as msg:
+ if not str(msg).startswith('failed to create intent'
+ '(cache|hide)|optional array'):
+ raise
+ else:
+ raise SystemError('intent(hide) should have failed'
+ ' on undefined dimensions')
+
+ def test_optional_none(self):
+ shape = (2,)
+ a = self.array(shape, intent.optional, None)
+ assert_(a.arr.shape == shape)
+ assert_(a.arr_equal(a.arr, zeros(shape, dtype=self.type.dtype)))
+
+ shape = (2, 3)
+ a = self.array(shape, intent.optional, None)
+ assert_(a.arr.shape == shape)
+ assert_(a.arr_equal(a.arr, zeros(shape, dtype=self.type.dtype)))
+ assert_(a.arr.flags['FORTRAN'] and not a.arr.flags['CONTIGUOUS'])
+
+ shape = (2, 3)
+ a = self.array(shape, intent.c.optional, None)
+ assert_(a.arr.shape == shape)
+ assert_(a.arr_equal(a.arr, zeros(shape, dtype=self.type.dtype)))
+ assert_(not a.arr.flags['FORTRAN'] and a.arr.flags['CONTIGUOUS'])
+
+ def test_optional_from_2seq(self):
+ obj = self.num2seq
+ shape = (len(obj),)
+ a = self.array(shape, intent.optional, obj)
+ assert_(a.arr.shape == shape)
+ assert_(not a.has_shared_memory())
+
+ def test_optional_from_23seq(self):
+ obj = self.num23seq
+ shape = (len(obj), len(obj[0]))
+ a = self.array(shape, intent.optional, obj)
+ assert_(a.arr.shape == shape)
+ assert_(not a.has_shared_memory())
+
+ a = self.array(shape, intent.optional.c, obj)
+ assert_(a.arr.shape == shape)
+ assert_(not a.has_shared_memory())
+
+ def test_inplace(self):
+ obj = array(self.num23seq, dtype=self.type.dtype)
+ assert_(not obj.flags['FORTRAN'] and obj.flags['CONTIGUOUS'])
+ shape = obj.shape
+ a = self.array(shape, intent.inplace, obj)
+ assert_(obj[1][2] == a.arr[1][2], repr((obj, a.arr)))
+ a.arr[1][2] = 54
+ assert_(obj[1][2] == a.arr[1][2] ==
+ array(54, dtype=self.type.dtype), repr((obj, a.arr)))
+ assert_(a.arr is obj)
+ assert_(obj.flags['FORTRAN']) # obj attributes are changed inplace!
+ assert_(not obj.flags['CONTIGUOUS'])
+
+ def test_inplace_from_casttype(self):
+ for t in self.type.cast_types():
+ if t is self.type:
+ continue
+ obj = array(self.num23seq, dtype=t.dtype)
+ assert_(obj.dtype.type == t.dtype)
+ assert_(obj.dtype.type is not self.type.dtype)
+ assert_(not obj.flags['FORTRAN'] and obj.flags['CONTIGUOUS'])
+ shape = obj.shape
+ a = self.array(shape, intent.inplace, obj)
+ assert_(obj[1][2] == a.arr[1][2], repr((obj, a.arr)))
+ a.arr[1][2] = 54
+ assert_(obj[1][2] == a.arr[1][2] ==
+ array(54, dtype=self.type.dtype), repr((obj, a.arr)))
+ assert_(a.arr is obj)
+ assert_(obj.flags['FORTRAN']) # obj attributes changed inplace!
+ assert_(not obj.flags['CONTIGUOUS'])
+ assert_(obj.dtype.type is self.type.dtype) # obj changed inplace!
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_assumed_shape.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_assumed_shape.py
new file mode 100644
index 00000000000..460afd68db6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_assumed_shape.py
@@ -0,0 +1,33 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import pytest
+
+from numpy.testing import assert_
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestAssumedShapeSumExample(util.F2PyTest):
+ sources = [_path('src', 'assumed_shape', 'foo_free.f90'),
+ _path('src', 'assumed_shape', 'foo_use.f90'),
+ _path('src', 'assumed_shape', 'precision.f90'),
+ _path('src', 'assumed_shape', 'foo_mod.f90'),
+ ]
+
+ @pytest.mark.slow
+ def test_all(self):
+ r = self.module.fsum([1, 2])
+ assert_(r == 3, repr(r))
+ r = self.module.sum([1, 2])
+ assert_(r == 3, repr(r))
+ r = self.module.sum_with_use([1, 2])
+ assert_(r == 3, repr(r))
+
+ r = self.module.mod.sum([1, 2])
+ assert_(r == 3, repr(r))
+ r = self.module.mod.fsum([1, 2])
+ assert_(r == 3, repr(r))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_block_docstring.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_block_docstring.py
new file mode 100644
index 00000000000..4f1678980f8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_block_docstring.py
@@ -0,0 +1,24 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import pytest
+from . import util
+
+from numpy.testing import assert_equal, IS_PYPY
+
+class TestBlockDocString(util.F2PyTest):
+ code = """
+ SUBROUTINE FOO()
+ INTEGER BAR(2, 3)
+
+ COMMON /BLOCK/ BAR
+ RETURN
+ END
+ """
+
+ @pytest.mark.skipif(sys.platform=='win32',
+ reason='Fails with MinGW64 Gfortran (Issue #9673)')
+ @pytest.mark.xfail(IS_PYPY, reason="PyPy does not modify tp_doc")
+ def test_block_docstring(self):
+ expected = "'i'-array(2,3)\n"
+ assert_equal(self.module.block.__doc__, expected)
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_callback.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_callback.py
new file mode 100644
index 00000000000..824ef7b0c5d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_callback.py
@@ -0,0 +1,165 @@
+from __future__ import division, absolute_import, print_function
+
+import math
+import textwrap
+import sys
+import pytest
+
+import numpy as np
+from numpy.testing import assert_, assert_equal
+from . import util
+
+
+class TestF77Callback(util.F2PyTest):
+ code = """
+ subroutine t(fun,a)
+ integer a
+cf2py intent(out) a
+ external fun
+ call fun(a)
+ end
+
+ subroutine func(a)
+cf2py intent(in,out) a
+ integer a
+ a = a + 11
+ end
+
+ subroutine func0(a)
+cf2py intent(out) a
+ integer a
+ a = 11
+ end
+
+ subroutine t2(a)
+cf2py intent(callback) fun
+ integer a
+cf2py intent(out) a
+ external fun
+ call fun(a)
+ end
+
+ subroutine string_callback(callback, a)
+ external callback
+ double precision callback
+ double precision a
+ character*1 r
+cf2py intent(out) a
+ r = 'r'
+ a = callback(r)
+ end
+
+ subroutine string_callback_array(callback, cu, lencu, a)
+ external callback
+ integer callback
+ integer lencu
+ character*8 cu(lencu)
+ integer a
+cf2py intent(out) a
+
+ a = callback(cu, lencu)
+ end
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't,t2'.split(','))
+ def test_all(self, name):
+ self.check_function(name)
+
+ @pytest.mark.slow
+ def test_docstring(self):
+ expected = """
+ a = t(fun,[fun_extra_args])
+
+ Wrapper for ``t``.
+
+ Parameters
+ ----------
+ fun : call-back function
+
+ Other Parameters
+ ----------------
+ fun_extra_args : input tuple, optional
+ Default: ()
+
+ Returns
+ -------
+ a : int
+
+ Notes
+ -----
+ Call-back functions::
+
+ def fun(): return a
+ Return objects:
+ a : int
+ """
+ assert_equal(self.module.t.__doc__, textwrap.dedent(expected).lstrip())
+
+ def check_function(self, name):
+ t = getattr(self.module, name)
+ r = t(lambda: 4)
+ assert_(r == 4, repr(r))
+ r = t(lambda a: 5, fun_extra_args=(6,))
+ assert_(r == 5, repr(r))
+ r = t(lambda a: a, fun_extra_args=(6,))
+ assert_(r == 6, repr(r))
+ r = t(lambda a: 5 + a, fun_extra_args=(7,))
+ assert_(r == 12, repr(r))
+ r = t(lambda a: math.degrees(a), fun_extra_args=(math.pi,))
+ assert_(r == 180, repr(r))
+ r = t(math.degrees, fun_extra_args=(math.pi,))
+ assert_(r == 180, repr(r))
+
+ r = t(self.module.func, fun_extra_args=(6,))
+ assert_(r == 17, repr(r))
+ r = t(self.module.func0)
+ assert_(r == 11, repr(r))
+ r = t(self.module.func0._cpointer)
+ assert_(r == 11, repr(r))
+
+ class A(object):
+
+ def __call__(self):
+ return 7
+
+ def mth(self):
+ return 9
+ a = A()
+ r = t(a)
+ assert_(r == 7, repr(r))
+ r = t(a.mth)
+ assert_(r == 9, repr(r))
+
+ @pytest.mark.skipif(sys.platform=='win32',
+ reason='Fails with MinGW64 Gfortran (Issue #9673)')
+ def test_string_callback(self):
+
+ def callback(code):
+ if code == 'r':
+ return 0
+ else:
+ return 1
+
+ f = getattr(self.module, 'string_callback')
+ r = f(callback)
+ assert_(r == 0, repr(r))
+
+ @pytest.mark.skipif(sys.platform=='win32',
+ reason='Fails with MinGW64 Gfortran (Issue #9673)')
+ def test_string_callback_array(self):
+ # See gh-10027
+ cu = np.zeros((1, 8), 'S1')
+
+ def callback(cu, lencu):
+ if cu.shape != (lencu, 8):
+ return 1
+ if cu.dtype != 'S1':
+ return 2
+ if not np.all(cu == b''):
+ return 3
+ return 0
+
+ f = getattr(self.module, 'string_callback_array')
+ res = f(callback, cu, len(cu))
+ assert_(res == 0, repr(res))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_common.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_common.py
new file mode 100644
index 00000000000..dcb01b0ec73
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_common.py
@@ -0,0 +1,27 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import pytest
+
+import numpy as np
+from . import util
+
+from numpy.testing import assert_array_equal
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+class TestCommonBlock(util.F2PyTest):
+ sources = [_path('src', 'common', 'block.f')]
+
+ @pytest.mark.skipif(sys.platform=='win32',
+ reason='Fails with MinGW64 Gfortran (Issue #9673)')
+ def test_common_block(self):
+ self.module.initcb()
+ assert_array_equal(self.module.block.long_bn,
+ np.array(1.0, dtype=np.float64))
+ assert_array_equal(self.module.block.string_bn,
+ np.array('2', dtype='|S1'))
+ assert_array_equal(self.module.block.ok,
+ np.array(3, dtype=np.int32))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_compile_function.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_compile_function.py
new file mode 100644
index 00000000000..36abf05f9cc
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_compile_function.py
@@ -0,0 +1,125 @@
+"""See https://github.com/numpy/numpy/pull/11937.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys
+import os
+import uuid
+from importlib import import_module
+import pytest
+
+import numpy.f2py
+
+from numpy.testing import assert_equal
+from . import util
+
+
+def setup_module():
+ if sys.platform == 'win32' and sys.version_info[0] < 3:
+ pytest.skip('Fails with MinGW64 Gfortran (Issue #9673)')
+ if not util.has_c_compiler():
+ pytest.skip("Needs C compiler")
+ if not util.has_f77_compiler():
+ pytest.skip('Needs FORTRAN 77 compiler')
+
+
+# extra_args can be a list (since gh-11937) or string.
+# also test absence of extra_args
[email protected](
+ "extra_args", [['--noopt', '--debug'], '--noopt --debug', '']
+ )
+def test_f2py_init_compile(extra_args):
+ # flush through the f2py __init__ compile() function code path as a
+ # crude test for input handling following migration from
+ # exec_command() to subprocess.check_output() in gh-11937
+
+ # the Fortran 77 syntax requires 6 spaces before any commands, but
+ # more space may be added/
+ fsource = """
+ integer function foo()
+ foo = 10 + 5
+ return
+ end
+ """
+ # use various helper functions in util.py to enable robust build /
+ # compile and reimport cycle in test suite
+ moddir = util.get_module_dir()
+ modname = util.get_temp_module_name()
+
+ cwd = os.getcwd()
+ target = os.path.join(moddir, str(uuid.uuid4()) + '.f')
+ # try running compile() with and without a source_fn provided so
+ # that the code path where a temporary file for writing Fortran
+ # source is created is also explored
+ for source_fn in [target, None]:
+ # mimic the path changing behavior used by build_module() in
+ # util.py, but don't actually use build_module() because it has
+ # its own invocation of subprocess that circumvents the
+ # f2py.compile code block under test
+ try:
+ os.chdir(moddir)
+ ret_val = numpy.f2py.compile(
+ fsource,
+ modulename=modname,
+ extra_args=extra_args,
+ source_fn=source_fn
+ )
+ finally:
+ os.chdir(cwd)
+
+ # check for compile success return value
+ assert_equal(ret_val, 0)
+
+ # we are not currently able to import the Python-Fortran
+ # interface module on Windows / Appveyor, even though we do get
+ # successful compilation on that platform with Python 3.x
+ if sys.platform != 'win32':
+ # check for sensible result of Fortran function; that means
+ # we can import the module name in Python and retrieve the
+ # result of the sum operation
+ return_check = import_module(modname)
+ calc_result = return_check.foo()
+ assert_equal(calc_result, 15)
+
+
+def test_f2py_init_compile_failure():
+ # verify an appropriate integer status value returned by
+ # f2py.compile() when invalid Fortran is provided
+ ret_val = numpy.f2py.compile(b"invalid")
+ assert_equal(ret_val, 1)
+
+
+def test_f2py_init_compile_bad_cmd():
+ # verify that usage of invalid command in f2py.compile() returns
+ # status value of 127 for historic consistency with exec_command()
+ # error handling
+
+ # patch the sys Python exe path temporarily to induce an OSError
+ # downstream. NOTE: how bad of an idea is this patching?
+ try:
+ temp = sys.executable
+ sys.executable = 'does not exist'
+
+ # the OSError should take precedence over invalid Fortran
+ ret_val = numpy.f2py.compile(b"invalid")
+ assert_equal(ret_val, 127)
+ finally:
+ sys.executable = temp
+
+
[email protected]('fsource',
+ ['program test_f2py\nend program test_f2py',
+ b'program test_f2py\nend program test_f2py',])
+def test_compile_from_strings(tmpdir, fsource):
+ # Make sure we can compile str and bytes gh-12796
+ cwd = os.getcwd()
+ try:
+ os.chdir(str(tmpdir))
+ ret_val = numpy.f2py.compile(
+ fsource,
+ modulename='test_compile_from_strings',
+ extension='.f90')
+ assert_equal(ret_val, 0)
+ finally:
+ os.chdir(cwd)
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_kind.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_kind.py
new file mode 100644
index 00000000000..1f7762a805f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_kind.py
@@ -0,0 +1,34 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import pytest
+
+from numpy.testing import assert_
+from numpy.f2py.crackfortran import (
+ _selected_int_kind_func as selected_int_kind,
+ _selected_real_kind_func as selected_real_kind
+ )
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestKind(util.F2PyTest):
+ sources = [_path('src', 'kind', 'foo.f90')]
+
+ @pytest.mark.slow
+ def test_all(self):
+ selectedrealkind = self.module.selectedrealkind
+ selectedintkind = self.module.selectedintkind
+
+ for i in range(40):
+ assert_(selectedintkind(i) in [selected_int_kind(i), -1],
+ 'selectedintkind(%s): expected %r but got %r' %
+ (i, selected_int_kind(i), selectedintkind(i)))
+
+ for i in range(20):
+ assert_(selectedrealkind(i) in [selected_real_kind(i), -1],
+ 'selectedrealkind(%s): expected %r but got %r' %
+ (i, selected_real_kind(i), selectedrealkind(i)))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_mixed.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_mixed.py
new file mode 100644
index 00000000000..28268ecc024
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_mixed.py
@@ -0,0 +1,38 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import textwrap
+import pytest
+
+from numpy.testing import assert_, assert_equal
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestMixed(util.F2PyTest):
+ sources = [_path('src', 'mixed', 'foo.f'),
+ _path('src', 'mixed', 'foo_fixed.f90'),
+ _path('src', 'mixed', 'foo_free.f90')]
+
+ @pytest.mark.slow
+ def test_all(self):
+ assert_(self.module.bar11() == 11)
+ assert_(self.module.foo_fixed.bar12() == 12)
+ assert_(self.module.foo_free.bar13() == 13)
+
+ @pytest.mark.slow
+ def test_docstring(self):
+ expected = """
+ a = bar11()
+
+ Wrapper for ``bar11``.
+
+ Returns
+ -------
+ a : int
+ """
+ assert_equal(self.module.bar11.__doc__,
+ textwrap.dedent(expected).lstrip())
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_parameter.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_parameter.py
new file mode 100644
index 00000000000..6a378687ad7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_parameter.py
@@ -0,0 +1,118 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import pytest
+
+import numpy as np
+from numpy.testing import assert_raises, assert_equal
+
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestParameters(util.F2PyTest):
+ # Check that intent(in out) translates as intent(inout)
+ sources = [_path('src', 'parameter', 'constant_real.f90'),
+ _path('src', 'parameter', 'constant_integer.f90'),
+ _path('src', 'parameter', 'constant_both.f90'),
+ _path('src', 'parameter', 'constant_compound.f90'),
+ _path('src', 'parameter', 'constant_non_compound.f90'),
+ ]
+
+ @pytest.mark.slow
+ def test_constant_real_single(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.float32)[::2]
+ assert_raises(ValueError, self.module.foo_single, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.float32)
+ self.module.foo_single(x)
+ assert_equal(x, [0 + 1 + 2*3, 1, 2])
+
+ @pytest.mark.slow
+ def test_constant_real_double(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.float64)[::2]
+ assert_raises(ValueError, self.module.foo_double, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.float64)
+ self.module.foo_double(x)
+ assert_equal(x, [0 + 1 + 2*3, 1, 2])
+
+ @pytest.mark.slow
+ def test_constant_compound_int(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.int32)[::2]
+ assert_raises(ValueError, self.module.foo_compound_int, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.int32)
+ self.module.foo_compound_int(x)
+ assert_equal(x, [0 + 1 + 2*6, 1, 2])
+
+ @pytest.mark.slow
+ def test_constant_non_compound_int(self):
+ # check values
+ x = np.arange(4, dtype=np.int32)
+ self.module.foo_non_compound_int(x)
+ assert_equal(x, [0 + 1 + 2 + 3*4, 1, 2, 3])
+
+ @pytest.mark.slow
+ def test_constant_integer_int(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.int32)[::2]
+ assert_raises(ValueError, self.module.foo_int, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.int32)
+ self.module.foo_int(x)
+ assert_equal(x, [0 + 1 + 2*3, 1, 2])
+
+ @pytest.mark.slow
+ def test_constant_integer_long(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.int64)[::2]
+ assert_raises(ValueError, self.module.foo_long, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.int64)
+ self.module.foo_long(x)
+ assert_equal(x, [0 + 1 + 2*3, 1, 2])
+
+ @pytest.mark.slow
+ def test_constant_both(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.float64)[::2]
+ assert_raises(ValueError, self.module.foo, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.float64)
+ self.module.foo(x)
+ assert_equal(x, [0 + 1*3*3 + 2*3*3, 1*3, 2*3])
+
+ @pytest.mark.slow
+ def test_constant_no(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.float64)[::2]
+ assert_raises(ValueError, self.module.foo_no, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.float64)
+ self.module.foo_no(x)
+ assert_equal(x, [0 + 1*3*3 + 2*3*3, 1*3, 2*3])
+
+ @pytest.mark.slow
+ def test_constant_sum(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.float64)[::2]
+ assert_raises(ValueError, self.module.foo_sum, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.float64)
+ self.module.foo_sum(x)
+ assert_equal(x, [0 + 1*3*3 + 2*3*3, 1*3, 2*3])
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_quoted_character.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_quoted_character.py
new file mode 100644
index 00000000000..c9a1c36f50c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_quoted_character.py
@@ -0,0 +1,35 @@
+"""See https://github.com/numpy/numpy/pull/10676.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys
+from importlib import import_module
+import pytest
+
+from numpy.testing import assert_equal
+from . import util
+
+
+class TestQuotedCharacter(util.F2PyTest):
+ code = """
+ SUBROUTINE FOO(OUT1, OUT2, OUT3, OUT4, OUT5, OUT6)
+ CHARACTER SINGLE, DOUBLE, SEMICOL, EXCLA, OPENPAR, CLOSEPAR
+ PARAMETER (SINGLE="'", DOUBLE='"', SEMICOL=';', EXCLA="!",
+ 1 OPENPAR="(", CLOSEPAR=")")
+ CHARACTER OUT1, OUT2, OUT3, OUT4, OUT5, OUT6
+Cf2py intent(out) OUT1, OUT2, OUT3, OUT4, OUT5, OUT6
+ OUT1 = SINGLE
+ OUT2 = DOUBLE
+ OUT3 = SEMICOL
+ OUT4 = EXCLA
+ OUT5 = OPENPAR
+ OUT6 = CLOSEPAR
+ RETURN
+ END
+ """
+
+ @pytest.mark.skipif(sys.platform=='win32',
+ reason='Fails with MinGW64 Gfortran (Issue #9673)')
+ def test_quoted_character(self):
+ assert_equal(self.module.foo(), (b"'", b'"', b';', b'!', b'(', b')'))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_regression.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_regression.py
new file mode 100644
index 00000000000..3adae635d9a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_regression.py
@@ -0,0 +1,29 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import pytest
+
+import numpy as np
+from numpy.testing import assert_raises, assert_equal
+
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestIntentInOut(util.F2PyTest):
+ # Check that intent(in out) translates as intent(inout)
+ sources = [_path('src', 'regression', 'inout.f90')]
+
+ @pytest.mark.slow
+ def test_inout(self):
+ # non-contiguous should raise error
+ x = np.arange(6, dtype=np.float32)[::2]
+ assert_raises(ValueError, self.module.foo, x)
+
+ # check values with contiguous array
+ x = np.arange(3, dtype=np.float32)
+ self.module.foo(x)
+ assert_equal(x, [3, 1, 2])
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_return_character.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_character.py
new file mode 100644
index 00000000000..fc3a58d36b8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_character.py
@@ -0,0 +1,146 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+from numpy import array
+from numpy.testing import assert_
+from . import util
+
+
+class TestReturnCharacter(util.F2PyTest):
+
+ def check_function(self, t):
+ tname = t.__doc__.split()[0]
+ if tname in ['t0', 't1', 's0', 's1']:
+ assert_(t(23) == b'2')
+ r = t('ab')
+ assert_(r == b'a', repr(r))
+ r = t(array('ab'))
+ assert_(r == b'a', repr(r))
+ r = t(array(77, 'u1'))
+ assert_(r == b'M', repr(r))
+ #assert_(_raises(ValueError, t, array([77,87])))
+ #assert_(_raises(ValueError, t, array(77)))
+ elif tname in ['ts', 'ss']:
+ assert_(t(23) == b'23 ', repr(t(23)))
+ assert_(t('123456789abcdef') == b'123456789a')
+ elif tname in ['t5', 's5']:
+ assert_(t(23) == b'23 ', repr(t(23)))
+ assert_(t('ab') == b'ab ', repr(t('ab')))
+ assert_(t('123456789abcdef') == b'12345')
+ else:
+ raise NotImplementedError
+
+
+class TestF77ReturnCharacter(TestReturnCharacter):
+ code = """
+ function t0(value)
+ character value
+ character t0
+ t0 = value
+ end
+ function t1(value)
+ character*1 value
+ character*1 t1
+ t1 = value
+ end
+ function t5(value)
+ character*5 value
+ character*5 t5
+ t5 = value
+ end
+ function ts(value)
+ character*(*) value
+ character*(*) ts
+ ts = value
+ end
+
+ subroutine s0(t0,value)
+ character value
+ character t0
+cf2py intent(out) t0
+ t0 = value
+ end
+ subroutine s1(t1,value)
+ character*1 value
+ character*1 t1
+cf2py intent(out) t1
+ t1 = value
+ end
+ subroutine s5(t5,value)
+ character*5 value
+ character*5 t5
+cf2py intent(out) t5
+ t5 = value
+ end
+ subroutine ss(ts,value)
+ character*(*) value
+ character*10 ts
+cf2py intent(out) ts
+ ts = value
+ end
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t1,t5,s0,s1,s5,ss'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module, name))
+
+
+class TestF90ReturnCharacter(TestReturnCharacter):
+ suffix = ".f90"
+ code = """
+module f90_return_char
+ contains
+ function t0(value)
+ character :: value
+ character :: t0
+ t0 = value
+ end function t0
+ function t1(value)
+ character(len=1) :: value
+ character(len=1) :: t1
+ t1 = value
+ end function t1
+ function t5(value)
+ character(len=5) :: value
+ character(len=5) :: t5
+ t5 = value
+ end function t5
+ function ts(value)
+ character(len=*) :: value
+ character(len=10) :: ts
+ ts = value
+ end function ts
+
+ subroutine s0(t0,value)
+ character :: value
+ character :: t0
+!f2py intent(out) t0
+ t0 = value
+ end subroutine s0
+ subroutine s1(t1,value)
+ character(len=1) :: value
+ character(len=1) :: t1
+!f2py intent(out) t1
+ t1 = value
+ end subroutine s1
+ subroutine s5(t5,value)
+ character(len=5) :: value
+ character(len=5) :: t5
+!f2py intent(out) t5
+ t5 = value
+ end subroutine s5
+ subroutine ss(ts,value)
+ character(len=*) :: value
+ character(len=10) :: ts
+!f2py intent(out) ts
+ ts = value
+ end subroutine ss
+end module f90_return_char
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t1,t5,ts,s0,s1,s5,ss'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module.f90_return_char, name))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_return_complex.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_complex.py
new file mode 100644
index 00000000000..43c884dfb0d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_complex.py
@@ -0,0 +1,169 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+from numpy import array
+from numpy.compat import long
+from numpy.testing import assert_, assert_raises
+from . import util
+
+
+class TestReturnComplex(util.F2PyTest):
+
+ def check_function(self, t):
+ tname = t.__doc__.split()[0]
+ if tname in ['t0', 't8', 's0', 's8']:
+ err = 1e-5
+ else:
+ err = 0.0
+ assert_(abs(t(234j) - 234.0j) <= err)
+ assert_(abs(t(234.6) - 234.6) <= err)
+ assert_(abs(t(long(234)) - 234.0) <= err)
+ assert_(abs(t(234.6 + 3j) - (234.6 + 3j)) <= err)
+ #assert_( abs(t('234')-234.)<=err)
+ #assert_( abs(t('234.6')-234.6)<=err)
+ assert_(abs(t(-234) + 234.) <= err)
+ assert_(abs(t([234]) - 234.) <= err)
+ assert_(abs(t((234,)) - 234.) <= err)
+ assert_(abs(t(array(234)) - 234.) <= err)
+ assert_(abs(t(array(23 + 4j, 'F')) - (23 + 4j)) <= err)
+ assert_(abs(t(array([234])) - 234.) <= err)
+ assert_(abs(t(array([[234]])) - 234.) <= err)
+ assert_(abs(t(array([234], 'b')) + 22.) <= err)
+ assert_(abs(t(array([234], 'h')) - 234.) <= err)
+ assert_(abs(t(array([234], 'i')) - 234.) <= err)
+ assert_(abs(t(array([234], 'l')) - 234.) <= err)
+ assert_(abs(t(array([234], 'q')) - 234.) <= err)
+ assert_(abs(t(array([234], 'f')) - 234.) <= err)
+ assert_(abs(t(array([234], 'd')) - 234.) <= err)
+ assert_(abs(t(array([234 + 3j], 'F')) - (234 + 3j)) <= err)
+ assert_(abs(t(array([234], 'D')) - 234.) <= err)
+
+ #assert_raises(TypeError, t, array([234], 'a1'))
+ assert_raises(TypeError, t, 'abc')
+
+ assert_raises(IndexError, t, [])
+ assert_raises(IndexError, t, ())
+
+ assert_raises(TypeError, t, t)
+ assert_raises(TypeError, t, {})
+
+ try:
+ r = t(10 ** 400)
+ assert_(repr(r) in ['(inf+0j)', '(Infinity+0j)'], repr(r))
+ except OverflowError:
+ pass
+
+
+class TestF77ReturnComplex(TestReturnComplex):
+ code = """
+ function t0(value)
+ complex value
+ complex t0
+ t0 = value
+ end
+ function t8(value)
+ complex*8 value
+ complex*8 t8
+ t8 = value
+ end
+ function t16(value)
+ complex*16 value
+ complex*16 t16
+ t16 = value
+ end
+ function td(value)
+ double complex value
+ double complex td
+ td = value
+ end
+
+ subroutine s0(t0,value)
+ complex value
+ complex t0
+cf2py intent(out) t0
+ t0 = value
+ end
+ subroutine s8(t8,value)
+ complex*8 value
+ complex*8 t8
+cf2py intent(out) t8
+ t8 = value
+ end
+ subroutine s16(t16,value)
+ complex*16 value
+ complex*16 t16
+cf2py intent(out) t16
+ t16 = value
+ end
+ subroutine sd(td,value)
+ double complex value
+ double complex td
+cf2py intent(out) td
+ td = value
+ end
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t8,t16,td,s0,s8,s16,sd'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module, name))
+
+
+class TestF90ReturnComplex(TestReturnComplex):
+ suffix = ".f90"
+ code = """
+module f90_return_complex
+ contains
+ function t0(value)
+ complex :: value
+ complex :: t0
+ t0 = value
+ end function t0
+ function t8(value)
+ complex(kind=4) :: value
+ complex(kind=4) :: t8
+ t8 = value
+ end function t8
+ function t16(value)
+ complex(kind=8) :: value
+ complex(kind=8) :: t16
+ t16 = value
+ end function t16
+ function td(value)
+ double complex :: value
+ double complex :: td
+ td = value
+ end function td
+
+ subroutine s0(t0,value)
+ complex :: value
+ complex :: t0
+!f2py intent(out) t0
+ t0 = value
+ end subroutine s0
+ subroutine s8(t8,value)
+ complex(kind=4) :: value
+ complex(kind=4) :: t8
+!f2py intent(out) t8
+ t8 = value
+ end subroutine s8
+ subroutine s16(t16,value)
+ complex(kind=8) :: value
+ complex(kind=8) :: t16
+!f2py intent(out) t16
+ t16 = value
+ end subroutine s16
+ subroutine sd(td,value)
+ double complex :: value
+ double complex :: td
+!f2py intent(out) td
+ td = value
+ end subroutine sd
+end module f90_return_complex
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t8,t16,td,s0,s8,s16,sd'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module.f90_return_complex, name))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_return_integer.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_integer.py
new file mode 100644
index 00000000000..22f4acfdf6d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_integer.py
@@ -0,0 +1,181 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+from numpy import array
+from numpy.compat import long
+from numpy.testing import assert_, assert_raises
+from . import util
+
+
+class TestReturnInteger(util.F2PyTest):
+
+ def check_function(self, t):
+ assert_(t(123) == 123, repr(t(123)))
+ assert_(t(123.6) == 123)
+ assert_(t(long(123)) == 123)
+ assert_(t('123') == 123)
+ assert_(t(-123) == -123)
+ assert_(t([123]) == 123)
+ assert_(t((123,)) == 123)
+ assert_(t(array(123)) == 123)
+ assert_(t(array([123])) == 123)
+ assert_(t(array([[123]])) == 123)
+ assert_(t(array([123], 'b')) == 123)
+ assert_(t(array([123], 'h')) == 123)
+ assert_(t(array([123], 'i')) == 123)
+ assert_(t(array([123], 'l')) == 123)
+ assert_(t(array([123], 'B')) == 123)
+ assert_(t(array([123], 'f')) == 123)
+ assert_(t(array([123], 'd')) == 123)
+
+ #assert_raises(ValueError, t, array([123],'S3'))
+ assert_raises(ValueError, t, 'abc')
+
+ assert_raises(IndexError, t, [])
+ assert_raises(IndexError, t, ())
+
+ assert_raises(Exception, t, t)
+ assert_raises(Exception, t, {})
+
+ if t.__doc__.split()[0] in ['t8', 's8']:
+ assert_raises(OverflowError, t, 100000000000000000000000)
+ assert_raises(OverflowError, t, 10000000011111111111111.23)
+
+
+class TestF77ReturnInteger(TestReturnInteger):
+ code = """
+ function t0(value)
+ integer value
+ integer t0
+ t0 = value
+ end
+ function t1(value)
+ integer*1 value
+ integer*1 t1
+ t1 = value
+ end
+ function t2(value)
+ integer*2 value
+ integer*2 t2
+ t2 = value
+ end
+ function t4(value)
+ integer*4 value
+ integer*4 t4
+ t4 = value
+ end
+ function t8(value)
+ integer*8 value
+ integer*8 t8
+ t8 = value
+ end
+
+ subroutine s0(t0,value)
+ integer value
+ integer t0
+cf2py intent(out) t0
+ t0 = value
+ end
+ subroutine s1(t1,value)
+ integer*1 value
+ integer*1 t1
+cf2py intent(out) t1
+ t1 = value
+ end
+ subroutine s2(t2,value)
+ integer*2 value
+ integer*2 t2
+cf2py intent(out) t2
+ t2 = value
+ end
+ subroutine s4(t4,value)
+ integer*4 value
+ integer*4 t4
+cf2py intent(out) t4
+ t4 = value
+ end
+ subroutine s8(t8,value)
+ integer*8 value
+ integer*8 t8
+cf2py intent(out) t8
+ t8 = value
+ end
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name',
+ 't0,t1,t2,t4,t8,s0,s1,s2,s4,s8'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module, name))
+
+
+class TestF90ReturnInteger(TestReturnInteger):
+ suffix = ".f90"
+ code = """
+module f90_return_integer
+ contains
+ function t0(value)
+ integer :: value
+ integer :: t0
+ t0 = value
+ end function t0
+ function t1(value)
+ integer(kind=1) :: value
+ integer(kind=1) :: t1
+ t1 = value
+ end function t1
+ function t2(value)
+ integer(kind=2) :: value
+ integer(kind=2) :: t2
+ t2 = value
+ end function t2
+ function t4(value)
+ integer(kind=4) :: value
+ integer(kind=4) :: t4
+ t4 = value
+ end function t4
+ function t8(value)
+ integer(kind=8) :: value
+ integer(kind=8) :: t8
+ t8 = value
+ end function t8
+
+ subroutine s0(t0,value)
+ integer :: value
+ integer :: t0
+!f2py intent(out) t0
+ t0 = value
+ end subroutine s0
+ subroutine s1(t1,value)
+ integer(kind=1) :: value
+ integer(kind=1) :: t1
+!f2py intent(out) t1
+ t1 = value
+ end subroutine s1
+ subroutine s2(t2,value)
+ integer(kind=2) :: value
+ integer(kind=2) :: t2
+!f2py intent(out) t2
+ t2 = value
+ end subroutine s2
+ subroutine s4(t4,value)
+ integer(kind=4) :: value
+ integer(kind=4) :: t4
+!f2py intent(out) t4
+ t4 = value
+ end subroutine s4
+ subroutine s8(t8,value)
+ integer(kind=8) :: value
+ integer(kind=8) :: t8
+!f2py intent(out) t8
+ t8 = value
+ end subroutine s8
+end module f90_return_integer
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name',
+ 't0,t1,t2,t4,t8,s0,s1,s2,s4,s8'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module.f90_return_integer, name))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_return_logical.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_logical.py
new file mode 100644
index 00000000000..96f215a914f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_logical.py
@@ -0,0 +1,189 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+from numpy import array
+from numpy.compat import long
+from numpy.testing import assert_, assert_raises
+from . import util
+
+
+class TestReturnLogical(util.F2PyTest):
+
+ def check_function(self, t):
+ assert_(t(True) == 1, repr(t(True)))
+ assert_(t(False) == 0, repr(t(False)))
+ assert_(t(0) == 0)
+ assert_(t(None) == 0)
+ assert_(t(0.0) == 0)
+ assert_(t(0j) == 0)
+ assert_(t(1j) == 1)
+ assert_(t(234) == 1)
+ assert_(t(234.6) == 1)
+ assert_(t(long(234)) == 1)
+ assert_(t(234.6 + 3j) == 1)
+ assert_(t('234') == 1)
+ assert_(t('aaa') == 1)
+ assert_(t('') == 0)
+ assert_(t([]) == 0)
+ assert_(t(()) == 0)
+ assert_(t({}) == 0)
+ assert_(t(t) == 1)
+ assert_(t(-234) == 1)
+ assert_(t(10 ** 100) == 1)
+ assert_(t([234]) == 1)
+ assert_(t((234,)) == 1)
+ assert_(t(array(234)) == 1)
+ assert_(t(array([234])) == 1)
+ assert_(t(array([[234]])) == 1)
+ assert_(t(array([234], 'b')) == 1)
+ assert_(t(array([234], 'h')) == 1)
+ assert_(t(array([234], 'i')) == 1)
+ assert_(t(array([234], 'l')) == 1)
+ assert_(t(array([234], 'f')) == 1)
+ assert_(t(array([234], 'd')) == 1)
+ assert_(t(array([234 + 3j], 'F')) == 1)
+ assert_(t(array([234], 'D')) == 1)
+ assert_(t(array(0)) == 0)
+ assert_(t(array([0])) == 0)
+ assert_(t(array([[0]])) == 0)
+ assert_(t(array([0j])) == 0)
+ assert_(t(array([1])) == 1)
+ assert_raises(ValueError, t, array([0, 0]))
+
+
+class TestF77ReturnLogical(TestReturnLogical):
+ code = """
+ function t0(value)
+ logical value
+ logical t0
+ t0 = value
+ end
+ function t1(value)
+ logical*1 value
+ logical*1 t1
+ t1 = value
+ end
+ function t2(value)
+ logical*2 value
+ logical*2 t2
+ t2 = value
+ end
+ function t4(value)
+ logical*4 value
+ logical*4 t4
+ t4 = value
+ end
+c function t8(value)
+c logical*8 value
+c logical*8 t8
+c t8 = value
+c end
+
+ subroutine s0(t0,value)
+ logical value
+ logical t0
+cf2py intent(out) t0
+ t0 = value
+ end
+ subroutine s1(t1,value)
+ logical*1 value
+ logical*1 t1
+cf2py intent(out) t1
+ t1 = value
+ end
+ subroutine s2(t2,value)
+ logical*2 value
+ logical*2 t2
+cf2py intent(out) t2
+ t2 = value
+ end
+ subroutine s4(t4,value)
+ logical*4 value
+ logical*4 t4
+cf2py intent(out) t4
+ t4 = value
+ end
+c subroutine s8(t8,value)
+c logical*8 value
+c logical*8 t8
+cf2py intent(out) t8
+c t8 = value
+c end
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t1,t2,t4,s0,s1,s2,s4'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module, name))
+
+
+class TestF90ReturnLogical(TestReturnLogical):
+ suffix = ".f90"
+ code = """
+module f90_return_logical
+ contains
+ function t0(value)
+ logical :: value
+ logical :: t0
+ t0 = value
+ end function t0
+ function t1(value)
+ logical(kind=1) :: value
+ logical(kind=1) :: t1
+ t1 = value
+ end function t1
+ function t2(value)
+ logical(kind=2) :: value
+ logical(kind=2) :: t2
+ t2 = value
+ end function t2
+ function t4(value)
+ logical(kind=4) :: value
+ logical(kind=4) :: t4
+ t4 = value
+ end function t4
+ function t8(value)
+ logical(kind=8) :: value
+ logical(kind=8) :: t8
+ t8 = value
+ end function t8
+
+ subroutine s0(t0,value)
+ logical :: value
+ logical :: t0
+!f2py intent(out) t0
+ t0 = value
+ end subroutine s0
+ subroutine s1(t1,value)
+ logical(kind=1) :: value
+ logical(kind=1) :: t1
+!f2py intent(out) t1
+ t1 = value
+ end subroutine s1
+ subroutine s2(t2,value)
+ logical(kind=2) :: value
+ logical(kind=2) :: t2
+!f2py intent(out) t2
+ t2 = value
+ end subroutine s2
+ subroutine s4(t4,value)
+ logical(kind=4) :: value
+ logical(kind=4) :: t4
+!f2py intent(out) t4
+ t4 = value
+ end subroutine s4
+ subroutine s8(t8,value)
+ logical(kind=8) :: value
+ logical(kind=8) :: t8
+!f2py intent(out) t8
+ t8 = value
+ end subroutine s8
+end module f90_return_logical
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name',
+ 't0,t1,t2,t4,t8,s0,s1,s2,s4,s8'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module.f90_return_logical, name))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_return_real.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_real.py
new file mode 100644
index 00000000000..315cfe49b9b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_return_real.py
@@ -0,0 +1,210 @@
+from __future__ import division, absolute_import, print_function
+
+import platform
+import pytest
+
+from numpy import array
+from numpy.compat import long
+from numpy.testing import assert_, assert_raises
+from . import util
+
+
+class TestReturnReal(util.F2PyTest):
+
+ def check_function(self, t):
+ if t.__doc__.split()[0] in ['t0', 't4', 's0', 's4']:
+ err = 1e-5
+ else:
+ err = 0.0
+ assert_(abs(t(234) - 234.0) <= err)
+ assert_(abs(t(234.6) - 234.6) <= err)
+ assert_(abs(t(long(234)) - 234.0) <= err)
+ assert_(abs(t('234') - 234) <= err)
+ assert_(abs(t('234.6') - 234.6) <= err)
+ assert_(abs(t(-234) + 234) <= err)
+ assert_(abs(t([234]) - 234) <= err)
+ assert_(abs(t((234,)) - 234.) <= err)
+ assert_(abs(t(array(234)) - 234.) <= err)
+ assert_(abs(t(array([234])) - 234.) <= err)
+ assert_(abs(t(array([[234]])) - 234.) <= err)
+ assert_(abs(t(array([234], 'b')) + 22) <= err)
+ assert_(abs(t(array([234], 'h')) - 234.) <= err)
+ assert_(abs(t(array([234], 'i')) - 234.) <= err)
+ assert_(abs(t(array([234], 'l')) - 234.) <= err)
+ assert_(abs(t(array([234], 'B')) - 234.) <= err)
+ assert_(abs(t(array([234], 'f')) - 234.) <= err)
+ assert_(abs(t(array([234], 'd')) - 234.) <= err)
+ if t.__doc__.split()[0] in ['t0', 't4', 's0', 's4']:
+ assert_(t(1e200) == t(1e300)) # inf
+
+ #assert_raises(ValueError, t, array([234], 'S1'))
+ assert_raises(ValueError, t, 'abc')
+
+ assert_raises(IndexError, t, [])
+ assert_raises(IndexError, t, ())
+
+ assert_raises(Exception, t, t)
+ assert_raises(Exception, t, {})
+
+ try:
+ r = t(10 ** 400)
+ assert_(repr(r) in ['inf', 'Infinity'], repr(r))
+ except OverflowError:
+ pass
+
+
+
[email protected](
+ platform.system() == 'Darwin',
+ reason="Prone to error when run with numpy/f2py/tests on mac os, "
+ "but not when run in isolation")
+class TestCReturnReal(TestReturnReal):
+ suffix = ".pyf"
+ module_name = "c_ext_return_real"
+ code = """
+python module c_ext_return_real
+usercode \'\'\'
+float t4(float value) { return value; }
+void s4(float *t4, float value) { *t4 = value; }
+double t8(double value) { return value; }
+void s8(double *t8, double value) { *t8 = value; }
+\'\'\'
+interface
+ function t4(value)
+ real*4 intent(c) :: t4,value
+ end
+ function t8(value)
+ real*8 intent(c) :: t8,value
+ end
+ subroutine s4(t4,value)
+ intent(c) s4
+ real*4 intent(out) :: t4
+ real*4 intent(c) :: value
+ end
+ subroutine s8(t8,value)
+ intent(c) s8
+ real*8 intent(out) :: t8
+ real*8 intent(c) :: value
+ end
+end interface
+end python module c_ext_return_real
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't4,t8,s4,s8'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module, name))
+
+
+class TestF77ReturnReal(TestReturnReal):
+ code = """
+ function t0(value)
+ real value
+ real t0
+ t0 = value
+ end
+ function t4(value)
+ real*4 value
+ real*4 t4
+ t4 = value
+ end
+ function t8(value)
+ real*8 value
+ real*8 t8
+ t8 = value
+ end
+ function td(value)
+ double precision value
+ double precision td
+ td = value
+ end
+
+ subroutine s0(t0,value)
+ real value
+ real t0
+cf2py intent(out) t0
+ t0 = value
+ end
+ subroutine s4(t4,value)
+ real*4 value
+ real*4 t4
+cf2py intent(out) t4
+ t4 = value
+ end
+ subroutine s8(t8,value)
+ real*8 value
+ real*8 t8
+cf2py intent(out) t8
+ t8 = value
+ end
+ subroutine sd(td,value)
+ double precision value
+ double precision td
+cf2py intent(out) td
+ td = value
+ end
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t4,t8,td,s0,s4,s8,sd'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module, name))
+
+
+class TestF90ReturnReal(TestReturnReal):
+ suffix = ".f90"
+ code = """
+module f90_return_real
+ contains
+ function t0(value)
+ real :: value
+ real :: t0
+ t0 = value
+ end function t0
+ function t4(value)
+ real(kind=4) :: value
+ real(kind=4) :: t4
+ t4 = value
+ end function t4
+ function t8(value)
+ real(kind=8) :: value
+ real(kind=8) :: t8
+ t8 = value
+ end function t8
+ function td(value)
+ double precision :: value
+ double precision :: td
+ td = value
+ end function td
+
+ subroutine s0(t0,value)
+ real :: value
+ real :: t0
+!f2py intent(out) t0
+ t0 = value
+ end subroutine s0
+ subroutine s4(t4,value)
+ real(kind=4) :: value
+ real(kind=4) :: t4
+!f2py intent(out) t4
+ t4 = value
+ end subroutine s4
+ subroutine s8(t8,value)
+ real(kind=8) :: value
+ real(kind=8) :: t8
+!f2py intent(out) t8
+ t8 = value
+ end subroutine s8
+ subroutine sd(td,value)
+ double precision :: value
+ double precision :: td
+!f2py intent(out) td
+ td = value
+ end subroutine sd
+end module f90_return_real
+ """
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('name', 't0,t4,t8,td,s0,s4,s8,sd'.split(','))
+ def test_all(self, name):
+ self.check_function(getattr(self.module.f90_return_real, name))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_semicolon_split.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_semicolon_split.py
new file mode 100644
index 00000000000..bcd18c893fc
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_semicolon_split.py
@@ -0,0 +1,65 @@
+from __future__ import division, absolute_import, print_function
+
+import platform
+import pytest
+
+from . import util
+from numpy.testing import assert_equal
+
[email protected](
+ platform.system() == 'Darwin',
+ reason="Prone to error when run with numpy/f2py/tests on mac os, "
+ "but not when run in isolation")
+class TestMultiline(util.F2PyTest):
+ suffix = ".pyf"
+ module_name = "multiline"
+ code = """
+python module {module}
+ usercode '''
+void foo(int* x) {{
+ char dummy = ';';
+ *x = 42;
+}}
+'''
+ interface
+ subroutine foo(x)
+ intent(c) foo
+ integer intent(out) :: x
+ end subroutine foo
+ end interface
+end python module {module}
+ """.format(module=module_name)
+
+ def test_multiline(self):
+ assert_equal(self.module.foo(), 42)
+
+
[email protected](
+ platform.system() == 'Darwin',
+ reason="Prone to error when run with numpy/f2py/tests on mac os, "
+ "but not when run in isolation")
+class TestCallstatement(util.F2PyTest):
+ suffix = ".pyf"
+ module_name = "callstatement"
+ code = """
+python module {module}
+ usercode '''
+void foo(int* x) {{
+}}
+'''
+ interface
+ subroutine foo(x)
+ intent(c) foo
+ integer intent(out) :: x
+ callprotoargument int*
+ callstatement {{ &
+ ; &
+ x = 42; &
+ }}
+ end subroutine foo
+ end interface
+end python module {module}
+ """.format(module=module_name)
+
+ def test_callstatement(self):
+ assert_equal(self.module.foo(), 42)
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_size.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_size.py
new file mode 100644
index 00000000000..e2af6180489
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_size.py
@@ -0,0 +1,51 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import pytest
+
+from numpy.testing import assert_equal
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestSizeSumExample(util.F2PyTest):
+ sources = [_path('src', 'size', 'foo.f90')]
+
+ @pytest.mark.slow
+ def test_all(self):
+ r = self.module.foo([[]])
+ assert_equal(r, [0], repr(r))
+
+ r = self.module.foo([[1, 2]])
+ assert_equal(r, [3], repr(r))
+
+ r = self.module.foo([[1, 2], [3, 4]])
+ assert_equal(r, [3, 7], repr(r))
+
+ r = self.module.foo([[1, 2], [3, 4], [5, 6]])
+ assert_equal(r, [3, 7, 11], repr(r))
+
+ @pytest.mark.slow
+ def test_transpose(self):
+ r = self.module.trans([[]])
+ assert_equal(r.T, [[]], repr(r))
+
+ r = self.module.trans([[1, 2]])
+ assert_equal(r, [[1], [2]], repr(r))
+
+ r = self.module.trans([[1, 2, 3], [4, 5, 6]])
+ assert_equal(r, [[1, 4], [2, 5], [3, 6]], repr(r))
+
+ @pytest.mark.slow
+ def test_flatten(self):
+ r = self.module.flatten([[]])
+ assert_equal(r, [], repr(r))
+
+ r = self.module.flatten([[1, 2]])
+ assert_equal(r, [1, 2], repr(r))
+
+ r = self.module.flatten([[1, 2, 3], [4, 5, 6]])
+ assert_equal(r, [1, 2, 3, 4, 5, 6], repr(r))
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/test_string.py b/contrib/python/numpy/py2/numpy/f2py/tests/test_string.py
new file mode 100644
index 00000000000..0493c99cf1d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/test_string.py
@@ -0,0 +1,24 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import pytest
+
+from numpy.testing import assert_array_equal
+import numpy as np
+from . import util
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+class TestString(util.F2PyTest):
+ sources = [_path('src', 'string', 'char.f90')]
+
+ @pytest.mark.slow
+ def test_char(self):
+ strings = np.array(['ab', 'cd', 'ef'], dtype='c').T
+ inp, out = self.module.char_test.change_strings(strings, strings.shape[1])
+ assert_array_equal(inp, strings)
+ expected = strings.copy()
+ expected[1, :] = 'AAA'
+ assert_array_equal(out, expected)
diff --git a/contrib/python/numpy/py2/numpy/f2py/tests/util.py b/contrib/python/numpy/py2/numpy/f2py/tests/util.py
new file mode 100644
index 00000000000..5fa5dadd2ac
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/f2py/tests/util.py
@@ -0,0 +1,360 @@
+"""
+Utility functions for
+
+- building and importing modules at test time, using a temporary location
+- detecting if compilers are present
+
+"""
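+# Typical use (see the test modules in this directory): subclass F2PyTest,
+# set either ``code`` (inline Fortran source) or ``sources`` (file paths),
+# and access the compiled extension module as ``self.module`` in the tests.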
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import subprocess
+import tempfile
+import shutil
+import atexit
+import textwrap
+import re
+import pytest
+
+from numpy.compat import asbytes, asstr
+from numpy.testing import temppath
+from importlib import import_module
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import new as md5 # noqa: F401
+
+#
+# Maintaining a temporary module directory
+#
+
+_module_dir = None
+
+
+def _cleanup():
+ global _module_dir
+ if _module_dir is not None:
+ try:
+ sys.path.remove(_module_dir)
+ except ValueError:
+ pass
+ try:
+ shutil.rmtree(_module_dir)
+ except (IOError, OSError):
+ pass
+ _module_dir = None
+
+
+def get_module_dir():
+ global _module_dir
+ if _module_dir is None:
+ _module_dir = tempfile.mkdtemp()
+ atexit.register(_cleanup)
+ if _module_dir not in sys.path:
+ sys.path.insert(0, _module_dir)
+ return _module_dir
+
+
+def get_temp_module_name():
+ # Assume single-threaded, and the module dir usable only by this thread
+ d = get_module_dir()
+ for j in range(5403, 9999999):
+ name = "_test_ext_module_%d" % j
+ fn = os.path.join(d, name)
+ if name not in sys.modules and not os.path.isfile(fn + '.py'):
+ return name
+ raise RuntimeError("Failed to create a temporary module name")
+
+
+def _memoize(func):
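+ # Cache results keyed by repr() of the arguments; exceptions are cached
+ # too and re-raised, so a failed build is not retried for every test.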
+ memo = {}
+
+ def wrapper(*a, **kw):
+ key = repr((a, kw))
+ if key not in memo:
+ try:
+ memo[key] = func(*a, **kw)
+ except Exception as e:
+ memo[key] = e
+ raise
+ ret = memo[key]
+ if isinstance(ret, Exception):
+ raise ret
+ return ret
+ wrapper.__name__ = func.__name__
+ return wrapper
+
+#
+# Building modules
+#
+
+
+@_memoize
+def build_module(source_files, options=[], skip=[], only=[], module_name=None):
+ """
+ Compile and import a f2py module, built from the given files.
+
+ """
+
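+ # Invoke f2py in a subprocess, propagating the current sys.path so that
+ # the in-tree numpy.f2py is used rather than any installed copy.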
+ code = ("import sys; sys.path = %s; import numpy.f2py as f2py2e; "
+ "f2py2e.main()" % repr(sys.path))
+
+ d = get_module_dir()
+
+ # Copy files
+ dst_sources = []
+ for fn in source_files:
+ if not os.path.isfile(fn):
+ raise RuntimeError("%s is not a file" % fn)
+ dst = os.path.join(d, os.path.basename(fn))
+ shutil.copyfile(fn, dst)
+ dst_sources.append(dst)
+
+ fn = os.path.join(os.path.dirname(fn), '.f2py_f2cmap')
+ if os.path.isfile(fn):
+ dst = os.path.join(d, os.path.basename(fn))
+ if not os.path.isfile(dst):
+ shutil.copyfile(fn, dst)
+
+ # Prepare options
+ if module_name is None:
+ module_name = get_temp_module_name()
+ f2py_opts = ['-c', '-m', module_name] + options + dst_sources
+ if skip:
+ f2py_opts += ['skip:'] + skip
+ if only:
+ f2py_opts += ['only:'] + only
+
+ # Build
+ cwd = os.getcwd()
+ try:
+ os.chdir(d)
+ cmd = [sys.executable, '-c', code] + f2py_opts
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)
+ out, err = p.communicate()
+ if p.returncode != 0:
+ raise RuntimeError("Running f2py failed: %s\n%s"
+ % (cmd[4:], asstr(out)))
+ finally:
+ os.chdir(cwd)
+
+ # Partial cleanup
+ for fn in dst_sources:
+ os.unlink(fn)
+
+ # Import
+ return import_module(module_name)
+
+
+@_memoize
+def build_code(source_code, options=[], skip=[], only=[], suffix=None,
+ module_name=None):
+ """
+ Compile and import Fortran code using f2py.
+
+ """
+ if suffix is None:
+ suffix = '.f'
+ with temppath(suffix=suffix) as path:
+ with open(path, 'w') as f:
+ f.write(source_code)
+ return build_module([path], options=options, skip=skip, only=only,
+ module_name=module_name)
+
+#
+# Check if compilers are available at all...
+#
+
+_compiler_status = None
+
+
+def _get_compiler_status():
+ global _compiler_status
+ if _compiler_status is not None:
+ return _compiler_status
+
+ _compiler_status = (False, False, False)
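+ # (has C compiler, has Fortran 77 compiler, has Fortran 90 compiler)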
+
+ # XXX: this is really ugly. But I don't know how to invoke Distutils
+ # in a safer way...
+ code = """
+import os
+import sys
+sys.path = %(syspath)s
+
+def configuration(parent_name='',top_path=None):
+ global config
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('', parent_name, top_path)
+ return config
+
+from numpy.distutils.core import setup
+setup(configuration=configuration)
+
+config_cmd = config.get_config_cmd()
+have_c = config_cmd.try_compile('void foo() {}')
+print('COMPILERS:%%d,%%d,%%d' %% (have_c,
+ config.have_f77c(),
+ config.have_f90c()))
+sys.exit(99)
+"""
+ code = code % dict(syspath=repr(sys.path))
+
+ with temppath(suffix='.py') as script:
+ with open(script, 'w') as f:
+ f.write(code)
+
+ cmd = [sys.executable, script, 'config']
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)
+ out, err = p.communicate()
+
+ m = re.search(br'COMPILERS:(\d+),(\d+),(\d+)', out)
+ if m:
+ _compiler_status = (bool(int(m.group(1))), bool(int(m.group(2))),
+ bool(int(m.group(3))))
+ # Finished
+ return _compiler_status
+
+
+def has_c_compiler():
+ return _get_compiler_status()[0]
+
+
+def has_f77_compiler():
+ return _get_compiler_status()[1]
+
+
+def has_f90_compiler():
+ return _get_compiler_status()[2]
+
+#
+# Building with distutils
+#
+
+
+@_memoize
+def build_module_distutils(source_files, config_code, module_name, **kw):
+ """
+ Build a module via distutils and import it.
+
+ """
+ from numpy.distutils.misc_util import Configuration
+ from numpy.distutils.core import setup
+
+ d = get_module_dir()
+
+ # Copy files
+ dst_sources = []
+ for fn in source_files:
+ if not os.path.isfile(fn):
+ raise RuntimeError("%s is not a file" % fn)
+ dst = os.path.join(d, os.path.basename(fn))
+ shutil.copyfile(fn, dst)
+ dst_sources.append(dst)
+
+ # Build script
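+    # Re-indent the user-supplied config code so it nests inside the generated
+    # configuration() function in the setup script below.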
+ config_code = textwrap.dedent(config_code).replace("\n", "\n ")
+
+ code = """\
+import os
+import sys
+sys.path = %(syspath)s
+
+def configuration(parent_name='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('', parent_name, top_path)
+ %(config_code)s
+ return config
+
+if __name__ == "__main__":
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
+""" % dict(config_code=config_code, syspath=repr(sys.path))
+
+ script = os.path.join(d, get_temp_module_name() + '.py')
+ dst_sources.append(script)
+ f = open(script, 'wb')
+ f.write(asbytes(code))
+ f.close()
+
+ # Build
+ cwd = os.getcwd()
+ try:
+ os.chdir(d)
+ cmd = [sys.executable, script, 'build_ext', '-i']
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)
+ out, err = p.communicate()
+ if p.returncode != 0:
+ raise RuntimeError("Running distutils build failed: %s\n%s"
+ % (cmd[4:], asstr(out)))
+ finally:
+ os.chdir(cwd)
+
+ # Partial cleanup
+ for fn in dst_sources:
+ os.unlink(fn)
+
+ # Import
+ __import__(module_name)
+ return sys.modules[module_name]
+
+#
+# Unittest convenience
+#
+
+
+class F2PyTest(object):
+ code = None
+ sources = None
+ options = []
+ skip = []
+ only = []
+ suffix = '.f'
+ module = None
+ module_name = None
+
+ def setup(self):
+ if sys.platform == 'win32':
+ pytest.skip('Fails with MinGW64 Gfortran (Issue #9673)')
+
+ if self.module is not None:
+ return
+
+ # Check compiler availability first
+ if not has_c_compiler():
+ pytest.skip("No C compiler available")
+
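+        # Collect the source file names (plus the bare suffix for inline code)
+        # so the extension checks below decide which Fortran compilers we need.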
+ codes = []
+ if self.sources:
+ codes.extend(self.sources)
+ if self.code is not None:
+ codes.append(self.suffix)
+
+ needs_f77 = False
+ needs_f90 = False
+ for fn in codes:
+ if fn.endswith('.f'):
+ needs_f77 = True
+ elif fn.endswith('.f90'):
+ needs_f90 = True
+ if needs_f77 and not has_f77_compiler():
+ pytest.skip("No Fortran 77 compiler available")
+ if needs_f90 and not has_f90_compiler():
+ pytest.skip("No Fortran 90 compiler available")
+
+ # Build the module
+ if self.code is not None:
+ self.module = build_code(self.code, options=self.options,
+ skip=self.skip, only=self.only,
+ suffix=self.suffix,
+ module_name=self.module_name)
+
+ if self.sources is not None:
+ self.module = build_module(self.sources, options=self.options,
+ skip=self.skip, only=self.only,
+ module_name=self.module_name)
diff --git a/contrib/python/numpy/py2/numpy/fft/setup.py b/contrib/python/numpy/py2/numpy/fft/setup.py
new file mode 100644
index 00000000000..cd99a82d7b5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/fft/setup.py
@@ -0,0 +1,19 @@
+from __future__ import division, print_function
+
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('fft', parent_package, top_path)
+
+ config.add_data_dir('tests')
+
+ # Configure fftpack_lite
+ config.add_extension('fftpack_lite',
+ sources=['fftpack_litemodule.c', 'fftpack.c']
+ )
+
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/fft/tests/__init__.py b/contrib/python/numpy/py2/numpy/fft/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/fft/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/fft/tests/test_fftpack.py b/contrib/python/numpy/py2/numpy/fft/tests/test_fftpack.py
new file mode 100644
index 00000000000..8d6cd84070b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/fft/tests/test_fftpack.py
@@ -0,0 +1,185 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.random import random
+from numpy.testing import (
+ assert_array_almost_equal, assert_array_equal, assert_raises,
+ )
+import threading
+import sys
+if sys.version_info[0] >= 3:
+ import queue
+else:
+ import Queue as queue
+
+
+def fft1(x):
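+    # Straightforward O(L**2) reference DFT used to validate np.fft results below.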
+ L = len(x)
+ phase = -2j*np.pi*(np.arange(L)/float(L))
+ phase = np.arange(L).reshape(-1, 1) * phase
+ return np.sum(x*np.exp(phase), axis=1)
+
+
+class TestFFTShift(object):
+
+ def test_fft_n(self):
+ assert_raises(ValueError, np.fft.fft, [1, 2, 3], 0)
+
+
+class TestFFT1D(object):
+
+ def test_fft(self):
+ x = random(30) + 1j*random(30)
+ assert_array_almost_equal(fft1(x), np.fft.fft(x))
+ assert_array_almost_equal(fft1(x) / np.sqrt(30),
+ np.fft.fft(x, norm="ortho"))
+
+ def test_ifft(self):
+ x = random(30) + 1j*random(30)
+ assert_array_almost_equal(x, np.fft.ifft(np.fft.fft(x)))
+ assert_array_almost_equal(
+ x, np.fft.ifft(np.fft.fft(x, norm="ortho"), norm="ortho"))
+
+ def test_fft2(self):
+ x = random((30, 20)) + 1j*random((30, 20))
+ assert_array_almost_equal(np.fft.fft(np.fft.fft(x, axis=1), axis=0),
+ np.fft.fft2(x))
+ assert_array_almost_equal(np.fft.fft2(x) / np.sqrt(30 * 20),
+ np.fft.fft2(x, norm="ortho"))
+
+ def test_ifft2(self):
+ x = random((30, 20)) + 1j*random((30, 20))
+ assert_array_almost_equal(np.fft.ifft(np.fft.ifft(x, axis=1), axis=0),
+ np.fft.ifft2(x))
+ assert_array_almost_equal(np.fft.ifft2(x) * np.sqrt(30 * 20),
+ np.fft.ifft2(x, norm="ortho"))
+
+ def test_fftn(self):
+ x = random((30, 20, 10)) + 1j*random((30, 20, 10))
+ assert_array_almost_equal(
+ np.fft.fft(np.fft.fft(np.fft.fft(x, axis=2), axis=1), axis=0),
+ np.fft.fftn(x))
+ assert_array_almost_equal(np.fft.fftn(x) / np.sqrt(30 * 20 * 10),
+ np.fft.fftn(x, norm="ortho"))
+
+ def test_ifftn(self):
+ x = random((30, 20, 10)) + 1j*random((30, 20, 10))
+ assert_array_almost_equal(
+ np.fft.ifft(np.fft.ifft(np.fft.ifft(x, axis=2), axis=1), axis=0),
+ np.fft.ifftn(x))
+ assert_array_almost_equal(np.fft.ifftn(x) * np.sqrt(30 * 20 * 10),
+ np.fft.ifftn(x, norm="ortho"))
+
+ def test_rfft(self):
+ x = random(30)
+ for n in [x.size, 2*x.size]:
+ for norm in [None, 'ortho']:
+ assert_array_almost_equal(
+ np.fft.fft(x, n=n, norm=norm)[:(n//2 + 1)],
+ np.fft.rfft(x, n=n, norm=norm))
+ assert_array_almost_equal(np.fft.rfft(x, n=n) / np.sqrt(n),
+ np.fft.rfft(x, n=n, norm="ortho"))
+
+ def test_irfft(self):
+ x = random(30)
+ assert_array_almost_equal(x, np.fft.irfft(np.fft.rfft(x)))
+ assert_array_almost_equal(
+ x, np.fft.irfft(np.fft.rfft(x, norm="ortho"), norm="ortho"))
+
+ def test_rfft2(self):
+ x = random((30, 20))
+ assert_array_almost_equal(np.fft.fft2(x)[:, :11], np.fft.rfft2(x))
+ assert_array_almost_equal(np.fft.rfft2(x) / np.sqrt(30 * 20),
+ np.fft.rfft2(x, norm="ortho"))
+
+ def test_irfft2(self):
+ x = random((30, 20))
+ assert_array_almost_equal(x, np.fft.irfft2(np.fft.rfft2(x)))
+ assert_array_almost_equal(
+ x, np.fft.irfft2(np.fft.rfft2(x, norm="ortho"), norm="ortho"))
+
+ def test_rfftn(self):
+ x = random((30, 20, 10))
+ assert_array_almost_equal(np.fft.fftn(x)[:, :, :6], np.fft.rfftn(x))
+ assert_array_almost_equal(np.fft.rfftn(x) / np.sqrt(30 * 20 * 10),
+ np.fft.rfftn(x, norm="ortho"))
+
+ def test_irfftn(self):
+ x = random((30, 20, 10))
+ assert_array_almost_equal(x, np.fft.irfftn(np.fft.rfftn(x)))
+ assert_array_almost_equal(
+ x, np.fft.irfftn(np.fft.rfftn(x, norm="ortho"), norm="ortho"))
+
+ def test_hfft(self):
+ x = random(14) + 1j*random(14)
+ x_herm = np.concatenate((random(1), x, random(1)))
+ x = np.concatenate((x_herm, x[::-1].conj()))
+ assert_array_almost_equal(np.fft.fft(x), np.fft.hfft(x_herm))
+ assert_array_almost_equal(np.fft.hfft(x_herm) / np.sqrt(30),
+ np.fft.hfft(x_herm, norm="ortho"))
+
+    def test_ihfft(self):
+ x = random(14) + 1j*random(14)
+ x_herm = np.concatenate((random(1), x, random(1)))
+ x = np.concatenate((x_herm, x[::-1].conj()))
+ assert_array_almost_equal(x_herm, np.fft.ihfft(np.fft.hfft(x_herm)))
+ assert_array_almost_equal(
+ x_herm, np.fft.ihfft(np.fft.hfft(x_herm, norm="ortho"),
+ norm="ortho"))
+
+ def test_all_1d_norm_preserving(self):
+ # verify that round-trip transforms are norm-preserving
+ x = random(30)
+ x_norm = np.linalg.norm(x)
+ n = x.size * 2
+ func_pairs = [(np.fft.fft, np.fft.ifft),
+ (np.fft.rfft, np.fft.irfft),
+ # hfft: order so the first function takes x.size samples
+ # (necessary for comparison to x_norm above)
+ (np.fft.ihfft, np.fft.hfft),
+ ]
+ for forw, back in func_pairs:
+ for n in [x.size, 2*x.size]:
+ for norm in [None, 'ortho']:
+ tmp = forw(x, n=n, norm=norm)
+ tmp = back(tmp, n=n, norm=norm)
+ assert_array_almost_equal(x_norm,
+ np.linalg.norm(tmp))
+
+class TestFFTThreadSafe(object):
+ threads = 16
+ input_shape = (800, 200)
+
+ def _test_mtsame(self, func, *args):
+ def worker(args, q):
+ q.put(func(*args))
+
+ q = queue.Queue()
+ expected = func(*args)
+
+ # Spin off a bunch of threads to call the same function simultaneously
+ t = [threading.Thread(target=worker, args=(args, q))
+ for i in range(self.threads)]
+ [x.start() for x in t]
+
+ [x.join() for x in t]
+ # Make sure all threads returned the correct value
+ for i in range(self.threads):
+ assert_array_equal(q.get(timeout=5), expected,
+ 'Function returned wrong value in multithreaded context')
+
+ def test_fft(self):
+ a = np.ones(self.input_shape) * 1+0j
+ self._test_mtsame(np.fft.fft, a)
+
+ def test_ifft(self):
+ a = np.ones(self.input_shape) * 1+0j
+ self._test_mtsame(np.fft.ifft, a)
+
+ def test_rfft(self):
+ a = np.ones(self.input_shape)
+ self._test_mtsame(np.fft.rfft, a)
+
+ def test_irfft(self):
+ a = np.ones(self.input_shape) * 1+0j
+ self._test_mtsame(np.fft.irfft, a)
diff --git a/contrib/python/numpy/py2/numpy/fft/tests/test_helper.py b/contrib/python/numpy/py2/numpy/fft/tests/test_helper.py
new file mode 100644
index 00000000000..8d315fa0202
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/fft/tests/test_helper.py
@@ -0,0 +1,248 @@
+"""Test functions for fftpack.helper module
+
+Copied from fftpack.helper by Pearu Peterson, October 2005
+
+"""
+from __future__ import division, absolute_import, print_function
+import numpy as np
+from numpy.testing import assert_array_almost_equal, assert_equal
+from numpy import fft, pi
+from numpy.fft.helper import _FFTCache
+
+
+class TestFFTShift(object):
+
+ def test_definition(self):
+ x = [0, 1, 2, 3, 4, -4, -3, -2, -1]
+ y = [-4, -3, -2, -1, 0, 1, 2, 3, 4]
+ assert_array_almost_equal(fft.fftshift(x), y)
+ assert_array_almost_equal(fft.ifftshift(y), x)
+ x = [0, 1, 2, 3, 4, -5, -4, -3, -2, -1]
+ y = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4]
+ assert_array_almost_equal(fft.fftshift(x), y)
+ assert_array_almost_equal(fft.ifftshift(y), x)
+
+ def test_inverse(self):
+ for n in [1, 4, 9, 100, 211]:
+ x = np.random.random((n,))
+ assert_array_almost_equal(fft.ifftshift(fft.fftshift(x)), x)
+
+ def test_axes_keyword(self):
+ freqs = [[0, 1, 2], [3, 4, -4], [-3, -2, -1]]
+ shifted = [[-1, -3, -2], [2, 0, 1], [-4, 3, 4]]
+ assert_array_almost_equal(fft.fftshift(freqs, axes=(0, 1)), shifted)
+ assert_array_almost_equal(fft.fftshift(freqs, axes=0),
+ fft.fftshift(freqs, axes=(0,)))
+ assert_array_almost_equal(fft.ifftshift(shifted, axes=(0, 1)), freqs)
+ assert_array_almost_equal(fft.ifftshift(shifted, axes=0),
+ fft.ifftshift(shifted, axes=(0,)))
+
+ assert_array_almost_equal(fft.fftshift(freqs), shifted)
+ assert_array_almost_equal(fft.ifftshift(shifted), freqs)
+
+ def test_uneven_dims(self):
+ """ Test 2D input, which has uneven dimension sizes """
+ freqs = [
+ [0, 1],
+ [2, 3],
+ [4, 5]
+ ]
+
+ # shift in dimension 0
+ shift_dim0 = [
+ [4, 5],
+ [0, 1],
+ [2, 3]
+ ]
+ assert_array_almost_equal(fft.fftshift(freqs, axes=0), shift_dim0)
+ assert_array_almost_equal(fft.ifftshift(shift_dim0, axes=0), freqs)
+ assert_array_almost_equal(fft.fftshift(freqs, axes=(0,)), shift_dim0)
+ assert_array_almost_equal(fft.ifftshift(shift_dim0, axes=[0]), freqs)
+
+ # shift in dimension 1
+ shift_dim1 = [
+ [1, 0],
+ [3, 2],
+ [5, 4]
+ ]
+ assert_array_almost_equal(fft.fftshift(freqs, axes=1), shift_dim1)
+ assert_array_almost_equal(fft.ifftshift(shift_dim1, axes=1), freqs)
+
+ # shift in both dimensions
+ shift_dim_both = [
+ [5, 4],
+ [1, 0],
+ [3, 2]
+ ]
+ assert_array_almost_equal(fft.fftshift(freqs, axes=(0, 1)), shift_dim_both)
+ assert_array_almost_equal(fft.ifftshift(shift_dim_both, axes=(0, 1)), freqs)
+ assert_array_almost_equal(fft.fftshift(freqs, axes=[0, 1]), shift_dim_both)
+ assert_array_almost_equal(fft.ifftshift(shift_dim_both, axes=[0, 1]), freqs)
+
+ # axes=None (default) shift in all dimensions
+ assert_array_almost_equal(fft.fftshift(freqs, axes=None), shift_dim_both)
+ assert_array_almost_equal(fft.ifftshift(shift_dim_both, axes=None), freqs)
+ assert_array_almost_equal(fft.fftshift(freqs), shift_dim_both)
+ assert_array_almost_equal(fft.ifftshift(shift_dim_both), freqs)
+
+ def test_equal_to_original(self):
+ """ Test that the new (>=v1.15) implementation (see #10073) is equal to the original (<=v1.14) """
+ from numpy.compat import integer_types
+ from numpy.core import asarray, concatenate, arange, take
+
+ def original_fftshift(x, axes=None):
+ """ How fftshift was implemented in v1.14"""
+ tmp = asarray(x)
+ ndim = tmp.ndim
+ if axes is None:
+ axes = list(range(ndim))
+ elif isinstance(axes, integer_types):
+ axes = (axes,)
+ y = tmp
+ for k in axes:
+ n = tmp.shape[k]
+ p2 = (n + 1) // 2
+ mylist = concatenate((arange(p2, n), arange(p2)))
+ y = take(y, mylist, k)
+ return y
+
+ def original_ifftshift(x, axes=None):
+ """ How ifftshift was implemented in v1.14 """
+ tmp = asarray(x)
+ ndim = tmp.ndim
+ if axes is None:
+ axes = list(range(ndim))
+ elif isinstance(axes, integer_types):
+ axes = (axes,)
+ y = tmp
+ for k in axes:
+ n = tmp.shape[k]
+ p2 = n - (n + 1) // 2
+ mylist = concatenate((arange(p2, n), arange(p2)))
+ y = take(y, mylist, k)
+ return y
+
+ # create possible 2d array combinations and try all possible keywords
+ # compare output to original functions
+ for i in range(16):
+ for j in range(16):
+ for axes_keyword in [0, 1, None, (0,), (0, 1)]:
+ inp = np.random.rand(i, j)
+
+ assert_array_almost_equal(fft.fftshift(inp, axes_keyword),
+ original_fftshift(inp, axes_keyword))
+
+ assert_array_almost_equal(fft.ifftshift(inp, axes_keyword),
+ original_ifftshift(inp, axes_keyword))
+
+
+class TestFFTFreq(object):
+
+ def test_definition(self):
+ x = [0, 1, 2, 3, 4, -4, -3, -2, -1]
+ assert_array_almost_equal(9*fft.fftfreq(9), x)
+ assert_array_almost_equal(9*pi*fft.fftfreq(9, pi), x)
+ x = [0, 1, 2, 3, 4, -5, -4, -3, -2, -1]
+ assert_array_almost_equal(10*fft.fftfreq(10), x)
+ assert_array_almost_equal(10*pi*fft.fftfreq(10, pi), x)
+
+
+class TestRFFTFreq(object):
+
+ def test_definition(self):
+ x = [0, 1, 2, 3, 4]
+ assert_array_almost_equal(9*fft.rfftfreq(9), x)
+ assert_array_almost_equal(9*pi*fft.rfftfreq(9, pi), x)
+ x = [0, 1, 2, 3, 4, 5]
+ assert_array_almost_equal(10*fft.rfftfreq(10), x)
+ assert_array_almost_equal(10*pi*fft.rfftfreq(10, pi), x)
+
+
+class TestIRFFTN(object):
+
+ def test_not_last_axis_success(self):
+ ar, ai = np.random.random((2, 16, 8, 32))
+ a = ar + 1j*ai
+
+ axes = (-2,)
+
+ # Should not raise error
+ fft.irfftn(a, axes=axes)
+
+
+class TestFFTCache(object):
+
+ def test_basic_behaviour(self):
+ c = _FFTCache(max_size_in_mb=1, max_item_count=4)
+
+ # Put
+ c.put_twiddle_factors(1, np.ones(2, dtype=np.float32))
+ c.put_twiddle_factors(2, np.zeros(2, dtype=np.float32))
+
+ # Get
+ assert_array_almost_equal(c.pop_twiddle_factors(1),
+ np.ones(2, dtype=np.float32))
+ assert_array_almost_equal(c.pop_twiddle_factors(2),
+ np.zeros(2, dtype=np.float32))
+
+ # Nothing should be left.
+ assert_equal(len(c._dict), 0)
+
+        # Now put everything in twice so each key can be popped once and will
+        # still have one item left afterwards.
+ for _ in range(2):
+ c.put_twiddle_factors(1, np.ones(2, dtype=np.float32))
+ c.put_twiddle_factors(2, np.zeros(2, dtype=np.float32))
+ assert_array_almost_equal(c.pop_twiddle_factors(1),
+ np.ones(2, dtype=np.float32))
+ assert_array_almost_equal(c.pop_twiddle_factors(2),
+ np.zeros(2, dtype=np.float32))
+ assert_equal(len(c._dict), 2)
+
+ def test_automatic_pruning(self):
+ # That's around 2600 single precision samples.
+ c = _FFTCache(max_size_in_mb=0.01, max_item_count=4)
+
+ c.put_twiddle_factors(1, np.ones(200, dtype=np.float32))
+ c.put_twiddle_factors(2, np.ones(200, dtype=np.float32))
+ assert_equal(list(c._dict.keys()), [1, 2])
+
+ # This is larger than the limit but should still be kept.
+ c.put_twiddle_factors(3, np.ones(3000, dtype=np.float32))
+ assert_equal(list(c._dict.keys()), [1, 2, 3])
+ # Add one more.
+ c.put_twiddle_factors(4, np.ones(3000, dtype=np.float32))
+ # The other three should no longer exist.
+ assert_equal(list(c._dict.keys()), [4])
+
+ # Now test the max item count pruning.
+ c = _FFTCache(max_size_in_mb=0.01, max_item_count=2)
+ c.put_twiddle_factors(2, np.empty(2))
+ c.put_twiddle_factors(1, np.empty(2))
+ # Can still be accessed.
+ assert_equal(list(c._dict.keys()), [2, 1])
+
+ c.put_twiddle_factors(3, np.empty(2))
+ # 1 and 3 can still be accessed - c[2] has been touched least recently
+ # and is thus evicted.
+ assert_equal(list(c._dict.keys()), [1, 3])
+
+        # One last test. We will add a single large item that is slightly
+        # bigger than the cache size. Some small items can still be added.
+ c = _FFTCache(max_size_in_mb=0.01, max_item_count=5)
+ c.put_twiddle_factors(1, np.ones(3000, dtype=np.float32))
+ c.put_twiddle_factors(2, np.ones(2, dtype=np.float32))
+ c.put_twiddle_factors(3, np.ones(2, dtype=np.float32))
+ c.put_twiddle_factors(4, np.ones(2, dtype=np.float32))
+ assert_equal(list(c._dict.keys()), [1, 2, 3, 4])
+
+        # One more big item. This time it is six smaller arrays stored under
+        # the same key, so together they count as one big item.
+ for _ in range(6):
+ c.put_twiddle_factors(5, np.ones(500, dtype=np.float32))
+ # '1' no longer in the cache. Rest still in the cache.
+ assert_equal(list(c._dict.keys()), [2, 3, 4, 5])
+
+ # Another big item - should now be the only item in the cache.
+ c.put_twiddle_factors(6, np.ones(4000, dtype=np.float32))
+ assert_equal(list(c._dict.keys()), [6])
diff --git a/contrib/python/numpy/py2/numpy/lib/setup.py b/contrib/python/numpy/py2/numpy/lib/setup.py
new file mode 100644
index 00000000000..d342410b8a8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/setup.py
@@ -0,0 +1,12 @@
+from __future__ import division, print_function
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+
+ config = Configuration('lib', parent_package, top_path)
+ config.add_data_dir('tests')
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/__init__.py b/contrib/python/numpy/py2/numpy/lib/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npy b/contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npy
new file mode 100644
index 00000000000..12936c92d8f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npy
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npz b/contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npz
new file mode 100644
index 00000000000..68a3b53a1df
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/data/py2-objarr.npz
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npy b/contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npy
new file mode 100644
index 00000000000..6776074b421
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npy
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npz b/contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npz
new file mode 100644
index 00000000000..05eac0b76d8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/data/py3-objarr.npz
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/data/python3.npy b/contrib/python/numpy/py2/numpy/lib/tests/data/python3.npy
new file mode 100644
index 00000000000..7c6997dd69e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/data/python3.npy
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/data/win64python2.npy b/contrib/python/numpy/py2/numpy/lib/tests/data/win64python2.npy
new file mode 100644
index 00000000000..d9bc36af739
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/data/win64python2.npy
Binary files differ
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test__datasource.py b/contrib/python/numpy/py2/numpy/lib/tests/test__datasource.py
new file mode 100644
index 00000000000..8eac16b589a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test__datasource.py
@@ -0,0 +1,378 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import pytest
+from tempfile import mkdtemp, mkstemp, NamedTemporaryFile
+from shutil import rmtree
+
+import numpy.lib._datasource as datasource
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_warns
+ )
+
+if sys.version_info[0] >= 3:
+ import urllib.request as urllib_request
+ from urllib.parse import urlparse
+ from urllib.error import URLError
+else:
+ import urllib2 as urllib_request
+ from urlparse import urlparse
+ from urllib2 import URLError
+
+
+def urlopen_stub(url, data=None):
+ '''Stub to replace urlopen for testing.'''
+ if url == valid_httpurl():
+ tmpfile = NamedTemporaryFile(prefix='urltmp_')
+ return tmpfile
+ else:
+ raise URLError('Name or service not known')
+
+# setup and teardown
+old_urlopen = None
+
+
+def setup_module():
+ global old_urlopen
+
+ old_urlopen = urllib_request.urlopen
+ urllib_request.urlopen = urlopen_stub
+
+
+def teardown_module():
+ urllib_request.urlopen = old_urlopen
+
+# A valid website for more robust testing
+http_path = 'http://www.google.com/'
+http_file = 'index.html'
+
+http_fakepath = 'http://fake.abc.web/site/'
+http_fakefile = 'fake.txt'
+
+malicious_files = ['/etc/shadow', '../../shadow',
+ '..\\system.dat', 'c:\\windows\\system.dat']
+
+magic_line = b'three is the magic number'
+
+
+# Utility functions used by many tests
+def valid_textfile(filedir):
+ # Generate and return a valid temporary file.
+ fd, path = mkstemp(suffix='.txt', prefix='dstmp_', dir=filedir, text=True)
+ os.close(fd)
+ return path
+
+
+def invalid_textfile(filedir):
+ # Generate and return an invalid filename.
+ fd, path = mkstemp(suffix='.txt', prefix='dstmp_', dir=filedir)
+ os.close(fd)
+ os.remove(path)
+ return path
+
+
+def valid_httpurl():
+ return http_path+http_file
+
+
+def invalid_httpurl():
+ return http_fakepath+http_fakefile
+
+
+def valid_baseurl():
+ return http_path
+
+
+def invalid_baseurl():
+ return http_fakepath
+
+
+def valid_httpfile():
+ return http_file
+
+
+def invalid_httpfile():
+ return http_fakefile
+
+
+class TestDataSourceOpen(object):
+ def setup(self):
+ self.tmpdir = mkdtemp()
+ self.ds = datasource.DataSource(self.tmpdir)
+
+ def teardown(self):
+ rmtree(self.tmpdir)
+ del self.ds
+
+ def test_ValidHTTP(self):
+ fh = self.ds.open(valid_httpurl())
+ assert_(fh)
+ fh.close()
+
+ def test_InvalidHTTP(self):
+ url = invalid_httpurl()
+ assert_raises(IOError, self.ds.open, url)
+ try:
+ self.ds.open(url)
+ except IOError as e:
+ # Regression test for bug fixed in r4342.
+ assert_(e.errno is None)
+
+ def test_InvalidHTTPCacheURLError(self):
+ assert_raises(URLError, self.ds._cache, invalid_httpurl())
+
+ def test_ValidFile(self):
+ local_file = valid_textfile(self.tmpdir)
+ fh = self.ds.open(local_file)
+ assert_(fh)
+ fh.close()
+
+ def test_InvalidFile(self):
+ invalid_file = invalid_textfile(self.tmpdir)
+ assert_raises(IOError, self.ds.open, invalid_file)
+
+ def test_ValidGzipFile(self):
+ try:
+ import gzip
+ except ImportError:
+ # We don't have the gzip capabilities to test.
+ pytest.skip()
+ # Test datasource's internal file_opener for Gzip files.
+ filepath = os.path.join(self.tmpdir, 'foobar.txt.gz')
+ fp = gzip.open(filepath, 'w')
+ fp.write(magic_line)
+ fp.close()
+ fp = self.ds.open(filepath)
+ result = fp.readline()
+ fp.close()
+ assert_equal(magic_line, result)
+
+ def test_ValidBz2File(self):
+ try:
+ import bz2
+ except ImportError:
+ # We don't have the bz2 capabilities to test.
+ pytest.skip()
+ # Test datasource's internal file_opener for BZip2 files.
+ filepath = os.path.join(self.tmpdir, 'foobar.txt.bz2')
+ fp = bz2.BZ2File(filepath, 'w')
+ fp.write(magic_line)
+ fp.close()
+ fp = self.ds.open(filepath)
+ result = fp.readline()
+ fp.close()
+ assert_equal(magic_line, result)
+
+ @pytest.mark.skipif(sys.version_info[0] >= 3, reason="Python 2 only")
+ def test_Bz2File_text_mode_warning(self):
+ try:
+ import bz2
+ except ImportError:
+ # We don't have the bz2 capabilities to test.
+ pytest.skip()
+ # Test datasource's internal file_opener for BZip2 files.
+ filepath = os.path.join(self.tmpdir, 'foobar.txt.bz2')
+ fp = bz2.BZ2File(filepath, 'w')
+ fp.write(magic_line)
+ fp.close()
+ with assert_warns(RuntimeWarning):
+ fp = self.ds.open(filepath, 'rt')
+ result = fp.readline()
+ fp.close()
+ assert_equal(magic_line, result)
+
+
+class TestDataSourceExists(object):
+ def setup(self):
+ self.tmpdir = mkdtemp()
+ self.ds = datasource.DataSource(self.tmpdir)
+
+ def teardown(self):
+ rmtree(self.tmpdir)
+ del self.ds
+
+ def test_ValidHTTP(self):
+ assert_(self.ds.exists(valid_httpurl()))
+
+ def test_InvalidHTTP(self):
+ assert_equal(self.ds.exists(invalid_httpurl()), False)
+
+ def test_ValidFile(self):
+ # Test valid file in destpath
+ tmpfile = valid_textfile(self.tmpdir)
+ assert_(self.ds.exists(tmpfile))
+ # Test valid local file not in destpath
+ localdir = mkdtemp()
+ tmpfile = valid_textfile(localdir)
+ assert_(self.ds.exists(tmpfile))
+ rmtree(localdir)
+
+ def test_InvalidFile(self):
+ tmpfile = invalid_textfile(self.tmpdir)
+ assert_equal(self.ds.exists(tmpfile), False)
+
+
+class TestDataSourceAbspath(object):
+ def setup(self):
+ self.tmpdir = os.path.abspath(mkdtemp())
+ self.ds = datasource.DataSource(self.tmpdir)
+
+ def teardown(self):
+ rmtree(self.tmpdir)
+ del self.ds
+
+ def test_ValidHTTP(self):
+ scheme, netloc, upath, pms, qry, frg = urlparse(valid_httpurl())
+ local_path = os.path.join(self.tmpdir, netloc,
+ upath.strip(os.sep).strip('/'))
+ assert_equal(local_path, self.ds.abspath(valid_httpurl()))
+
+ def test_ValidFile(self):
+ tmpfile = valid_textfile(self.tmpdir)
+ tmpfilename = os.path.split(tmpfile)[-1]
+ # Test with filename only
+ assert_equal(tmpfile, self.ds.abspath(tmpfilename))
+ # Test filename with complete path
+ assert_equal(tmpfile, self.ds.abspath(tmpfile))
+
+ def test_InvalidHTTP(self):
+ scheme, netloc, upath, pms, qry, frg = urlparse(invalid_httpurl())
+ invalidhttp = os.path.join(self.tmpdir, netloc,
+ upath.strip(os.sep).strip('/'))
+ assert_(invalidhttp != self.ds.abspath(valid_httpurl()))
+
+ def test_InvalidFile(self):
+ invalidfile = valid_textfile(self.tmpdir)
+ tmpfile = valid_textfile(self.tmpdir)
+ tmpfilename = os.path.split(tmpfile)[-1]
+ # Test with filename only
+ assert_(invalidfile != self.ds.abspath(tmpfilename))
+ # Test filename with complete path
+ assert_(invalidfile != self.ds.abspath(tmpfile))
+
+ def test_sandboxing(self):
+ tmpfile = valid_textfile(self.tmpdir)
+ tmpfilename = os.path.split(tmpfile)[-1]
+
+ tmp_path = lambda x: os.path.abspath(self.ds.abspath(x))
+
+ assert_(tmp_path(valid_httpurl()).startswith(self.tmpdir))
+ assert_(tmp_path(invalid_httpurl()).startswith(self.tmpdir))
+ assert_(tmp_path(tmpfile).startswith(self.tmpdir))
+ assert_(tmp_path(tmpfilename).startswith(self.tmpdir))
+ for fn in malicious_files:
+ assert_(tmp_path(http_path+fn).startswith(self.tmpdir))
+ assert_(tmp_path(fn).startswith(self.tmpdir))
+
+ def test_windows_os_sep(self):
+ orig_os_sep = os.sep
+ try:
+ os.sep = '\\'
+ self.test_ValidHTTP()
+ self.test_ValidFile()
+ self.test_InvalidHTTP()
+ self.test_InvalidFile()
+ self.test_sandboxing()
+ finally:
+ os.sep = orig_os_sep
+
+
+class TestRepositoryAbspath(object):
+ def setup(self):
+ self.tmpdir = os.path.abspath(mkdtemp())
+ self.repos = datasource.Repository(valid_baseurl(), self.tmpdir)
+
+ def teardown(self):
+ rmtree(self.tmpdir)
+ del self.repos
+
+ def test_ValidHTTP(self):
+ scheme, netloc, upath, pms, qry, frg = urlparse(valid_httpurl())
+ local_path = os.path.join(self.repos._destpath, netloc,
+ upath.strip(os.sep).strip('/'))
+ filepath = self.repos.abspath(valid_httpfile())
+ assert_equal(local_path, filepath)
+
+ def test_sandboxing(self):
+ tmp_path = lambda x: os.path.abspath(self.repos.abspath(x))
+ assert_(tmp_path(valid_httpfile()).startswith(self.tmpdir))
+ for fn in malicious_files:
+ assert_(tmp_path(http_path+fn).startswith(self.tmpdir))
+ assert_(tmp_path(fn).startswith(self.tmpdir))
+
+ def test_windows_os_sep(self):
+ orig_os_sep = os.sep
+ try:
+ os.sep = '\\'
+ self.test_ValidHTTP()
+ self.test_sandboxing()
+ finally:
+ os.sep = orig_os_sep
+
+
+class TestRepositoryExists(object):
+ def setup(self):
+ self.tmpdir = mkdtemp()
+ self.repos = datasource.Repository(valid_baseurl(), self.tmpdir)
+
+ def teardown(self):
+ rmtree(self.tmpdir)
+ del self.repos
+
+ def test_ValidFile(self):
+ # Create local temp file
+ tmpfile = valid_textfile(self.tmpdir)
+ assert_(self.repos.exists(tmpfile))
+
+ def test_InvalidFile(self):
+ tmpfile = invalid_textfile(self.tmpdir)
+ assert_equal(self.repos.exists(tmpfile), False)
+
+ def test_RemoveHTTPFile(self):
+ assert_(self.repos.exists(valid_httpurl()))
+
+ def test_CachedHTTPFile(self):
+ localfile = valid_httpurl()
+        # Create a locally cached temp file with a URL-based
+ # directory structure. This is similar to what Repository.open
+ # would do.
+ scheme, netloc, upath, pms, qry, frg = urlparse(localfile)
+ local_path = os.path.join(self.repos._destpath, netloc)
+ os.mkdir(local_path, 0o0700)
+ tmpfile = valid_textfile(local_path)
+ assert_(self.repos.exists(tmpfile))
+
+
+class TestOpenFunc(object):
+ def setup(self):
+ self.tmpdir = mkdtemp()
+
+ def teardown(self):
+ rmtree(self.tmpdir)
+
+ def test_DataSourceOpen(self):
+ local_file = valid_textfile(self.tmpdir)
+ # Test case where destpath is passed in
+ fp = datasource.open(local_file, destpath=self.tmpdir)
+ assert_(fp)
+ fp.close()
+ # Test case where default destpath is used
+ fp = datasource.open(local_file)
+ assert_(fp)
+ fp.close()
+
+def test_del_attr_handling():
+ # DataSource __del__ can be called
+ # even if __init__ fails when the
+ # Exception object is caught by the
+ # caller as happens in refguide_check
+ # is_deprecated() function
+
+ ds = datasource.DataSource()
+ # simulate failed __init__ by removing key attribute
+ # produced within __init__ and expected by __del__
+ del ds._istmpdest
+ # should not raise an AttributeError if __del__
+ # gracefully handles failed __init__:
+ ds.__del__()
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test__iotools.py b/contrib/python/numpy/py2/numpy/lib/tests/test__iotools.py
new file mode 100644
index 00000000000..e04fdc8080b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test__iotools.py
@@ -0,0 +1,352 @@
+from __future__ import division, absolute_import, print_function
+
+import time
+from datetime import date
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_allclose, assert_raises,
+ )
+from numpy.lib._iotools import (
+ LineSplitter, NameValidator, StringConverter,
+ has_nested_fields, easy_dtype, flatten_dtype
+ )
+from numpy.compat import unicode
+
+
+class TestLineSplitter(object):
+ "Tests the LineSplitter class."
+
+ def test_no_delimiter(self):
+ "Test LineSplitter w/o delimiter"
+ strg = " 1 2 3 4 5 # test"
+ test = LineSplitter()(strg)
+ assert_equal(test, ['1', '2', '3', '4', '5'])
+ test = LineSplitter('')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '5'])
+
+ def test_space_delimiter(self):
+ "Test space delimiter"
+ strg = " 1 2 3 4 5 # test"
+ test = LineSplitter(' ')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
+ test = LineSplitter(' ')(strg)
+ assert_equal(test, ['1 2 3 4', '5'])
+
+ def test_tab_delimiter(self):
+ "Test tab delimiter"
+ strg = " 1\t 2\t 3\t 4\t 5 6"
+ test = LineSplitter('\t')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '5 6'])
+ strg = " 1 2\t 3 4\t 5 6"
+ test = LineSplitter('\t')(strg)
+ assert_equal(test, ['1 2', '3 4', '5 6'])
+
+ def test_other_delimiter(self):
+ "Test LineSplitter on delimiter"
+ strg = "1,2,3,4,,5"
+ test = LineSplitter(',')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
+ #
+ strg = " 1,2,3,4,,5 # test"
+ test = LineSplitter(',')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
+
+ # gh-11028 bytes comment/delimiters should get encoded
+ strg = b" 1,2,3,4,,5 % test"
+ test = LineSplitter(delimiter=b',', comments=b'%')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
+
+ def test_constant_fixed_width(self):
+ "Test LineSplitter w/ fixed-width fields"
+ strg = " 1 2 3 4 5 # test"
+ test = LineSplitter(3)(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5', ''])
+ #
+ strg = " 1 3 4 5 6# test"
+ test = LineSplitter(20)(strg)
+ assert_equal(test, ['1 3 4 5 6'])
+ #
+ strg = " 1 3 4 5 6# test"
+ test = LineSplitter(30)(strg)
+ assert_equal(test, ['1 3 4 5 6'])
+
+ def test_variable_fixed_width(self):
+ strg = " 1 3 4 5 6# test"
+ test = LineSplitter((3, 6, 6, 3))(strg)
+ assert_equal(test, ['1', '3', '4 5', '6'])
+ #
+ strg = " 1 3 4 5 6# test"
+ test = LineSplitter((6, 6, 9))(strg)
+ assert_equal(test, ['1', '3 4', '5 6'])
+
+# -----------------------------------------------------------------------------
+
+
+class TestNameValidator(object):
+
+ def test_case_sensitivity(self):
+ "Test case sensitivity"
+ names = ['A', 'a', 'b', 'c']
+ test = NameValidator().validate(names)
+ assert_equal(test, ['A', 'a', 'b', 'c'])
+ test = NameValidator(case_sensitive=False).validate(names)
+ assert_equal(test, ['A', 'A_1', 'B', 'C'])
+ test = NameValidator(case_sensitive='upper').validate(names)
+ assert_equal(test, ['A', 'A_1', 'B', 'C'])
+ test = NameValidator(case_sensitive='lower').validate(names)
+ assert_equal(test, ['a', 'a_1', 'b', 'c'])
+
+ # check exceptions
+ assert_raises(ValueError, NameValidator, case_sensitive='foobar')
+
+ def test_excludelist(self):
+ "Test excludelist"
+ names = ['dates', 'data', 'Other Data', 'mask']
+ validator = NameValidator(excludelist=['dates', 'data', 'mask'])
+ test = validator.validate(names)
+ assert_equal(test, ['dates_', 'data_', 'Other_Data', 'mask_'])
+
+ def test_missing_names(self):
+ "Test validate missing names"
+ namelist = ('a', 'b', 'c')
+ validator = NameValidator()
+ assert_equal(validator(namelist), ['a', 'b', 'c'])
+ namelist = ('', 'b', 'c')
+ assert_equal(validator(namelist), ['f0', 'b', 'c'])
+ namelist = ('a', 'b', '')
+ assert_equal(validator(namelist), ['a', 'b', 'f0'])
+ namelist = ('', 'f0', '')
+ assert_equal(validator(namelist), ['f1', 'f0', 'f2'])
+
+ def test_validate_nb_names(self):
+ "Test validate nb names"
+ namelist = ('a', 'b', 'c')
+ validator = NameValidator()
+ assert_equal(validator(namelist, nbfields=1), ('a',))
+ assert_equal(validator(namelist, nbfields=5, defaultfmt="g%i"),
+ ['a', 'b', 'c', 'g0', 'g1'])
+
+ def test_validate_wo_names(self):
+ "Test validate no names"
+ namelist = None
+ validator = NameValidator()
+ assert_(validator(namelist) is None)
+ assert_equal(validator(namelist, nbfields=3), ['f0', 'f1', 'f2'])
+
+# -----------------------------------------------------------------------------
+
+
+def _bytes_to_date(s):
+ return date(*time.strptime(s, "%Y-%m-%d")[:3])
+
+
+class TestStringConverter(object):
+ "Test StringConverter"
+
+ def test_creation(self):
+ "Test creation of a StringConverter"
+ converter = StringConverter(int, -99999)
+ assert_equal(converter._status, 1)
+ assert_equal(converter.default, -99999)
+
+ def test_upgrade(self):
+ "Tests the upgrade method."
+
+ converter = StringConverter()
+ assert_equal(converter._status, 0)
+
+ # test int
+ assert_equal(converter.upgrade('0'), 0)
+ assert_equal(converter._status, 1)
+
+ # On systems where long defaults to 32-bit, the statuses will be
+ # offset by one, so we check for this here.
+ import numpy.core.numeric as nx
+ status_offset = int(nx.dtype(nx.int_).itemsize < nx.dtype(nx.int64).itemsize)
+
+ # test int > 2**32
+ assert_equal(converter.upgrade('17179869184'), 17179869184)
+ assert_equal(converter._status, 1 + status_offset)
+
+ # test float
+ assert_allclose(converter.upgrade('0.'), 0.0)
+ assert_equal(converter._status, 2 + status_offset)
+
+ # test complex
+ assert_equal(converter.upgrade('0j'), complex('0j'))
+ assert_equal(converter._status, 3 + status_offset)
+
+ # test str
+ # note that the longdouble type has been skipped, so the
+ # _status increases by 2. Everything should succeed with
+ # unicode conversion (5).
+ for s in ['a', u'a', b'a']:
+ res = converter.upgrade(s)
+ assert_(type(res) is unicode)
+ assert_equal(res, u'a')
+ assert_equal(converter._status, 5 + status_offset)
+
+ def test_missing(self):
+ "Tests the use of missing values."
+ converter = StringConverter(missing_values=('missing',
+ 'missed'))
+ converter.upgrade('0')
+ assert_equal(converter('0'), 0)
+ assert_equal(converter(''), converter.default)
+ assert_equal(converter('missing'), converter.default)
+ assert_equal(converter('missed'), converter.default)
+ try:
+ converter('miss')
+ except ValueError:
+ pass
+
+    def test_upgrademapper(self):
+        "Tests upgrade_mapper"
+ dateparser = _bytes_to_date
+ StringConverter.upgrade_mapper(dateparser, date(2000, 1, 1))
+ convert = StringConverter(dateparser, date(2000, 1, 1))
+ test = convert('2001-01-01')
+ assert_equal(test, date(2001, 1, 1))
+ test = convert('2009-01-01')
+ assert_equal(test, date(2009, 1, 1))
+ test = convert('')
+ assert_equal(test, date(2000, 1, 1))
+
+ def test_string_to_object(self):
+ "Make sure that string-to-object functions are properly recognized"
+ old_mapper = StringConverter._mapper[:] # copy of list
+ conv = StringConverter(_bytes_to_date)
+ assert_equal(conv._mapper, old_mapper)
+ assert_(hasattr(conv, 'default'))
+
+ def test_keep_default(self):
+ "Make sure we don't lose an explicit default"
+ converter = StringConverter(None, missing_values='',
+ default=-999)
+ converter.upgrade('3.14159265')
+ assert_equal(converter.default, -999)
+ assert_equal(converter.type, np.dtype(float))
+ #
+ converter = StringConverter(
+ None, missing_values='', default=0)
+ converter.upgrade('3.14159265')
+ assert_equal(converter.default, 0)
+ assert_equal(converter.type, np.dtype(float))
+
+ def test_keep_default_zero(self):
+ "Check that we don't lose a default of 0"
+ converter = StringConverter(int, default=0,
+ missing_values="N/A")
+ assert_equal(converter.default, 0)
+
+ def test_keep_missing_values(self):
+ "Check that we're not losing missing values"
+ converter = StringConverter(int, default=0,
+ missing_values="N/A")
+ assert_equal(
+ converter.missing_values, {'', 'N/A'})
+
+ def test_int64_dtype(self):
+ "Check that int64 integer types can be specified"
+ converter = StringConverter(np.int64, default=0)
+ val = "-9223372036854775807"
+ assert_(converter(val) == -9223372036854775807)
+ val = "9223372036854775807"
+ assert_(converter(val) == 9223372036854775807)
+
+ def test_uint64_dtype(self):
+ "Check that uint64 integer types can be specified"
+ converter = StringConverter(np.uint64, default=0)
+ val = "9223372043271415339"
+ assert_(converter(val) == 9223372043271415339)
+
+
+class TestMiscFunctions(object):
+
+ def test_has_nested_dtype(self):
+ "Test has_nested_dtype"
+ ndtype = np.dtype(float)
+ assert_equal(has_nested_fields(ndtype), False)
+ ndtype = np.dtype([('A', '|S3'), ('B', float)])
+ assert_equal(has_nested_fields(ndtype), False)
+ ndtype = np.dtype([('A', int), ('B', [('BA', float), ('BB', '|S1')])])
+ assert_equal(has_nested_fields(ndtype), True)
+
+ def test_easy_dtype(self):
+ "Test ndtype on dtypes"
+ # Simple case
+ ndtype = float
+ assert_equal(easy_dtype(ndtype), np.dtype(float))
+ # As string w/o names
+ ndtype = "i4, f8"
+ assert_equal(easy_dtype(ndtype),
+ np.dtype([('f0', "i4"), ('f1', "f8")]))
+ # As string w/o names but different default format
+ assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"),
+ np.dtype([('field_000', "i4"), ('field_001', "f8")]))
+ # As string w/ names
+ ndtype = "i4, f8"
+ assert_equal(easy_dtype(ndtype, names="a, b"),
+ np.dtype([('a', "i4"), ('b', "f8")]))
+ # As string w/ names (too many)
+ ndtype = "i4, f8"
+ assert_equal(easy_dtype(ndtype, names="a, b, c"),
+ np.dtype([('a', "i4"), ('b', "f8")]))
+ # As string w/ names (not enough)
+ ndtype = "i4, f8"
+ assert_equal(easy_dtype(ndtype, names=", b"),
+ np.dtype([('f0', "i4"), ('b', "f8")]))
+ # ... (with different default format)
+ assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"),
+ np.dtype([('a', "i4"), ('f00', "f8")]))
+ # As list of tuples w/o names
+ ndtype = [('A', int), ('B', float)]
+ assert_equal(easy_dtype(ndtype), np.dtype([('A', int), ('B', float)]))
+ # As list of tuples w/ names
+ assert_equal(easy_dtype(ndtype, names="a,b"),
+ np.dtype([('a', int), ('b', float)]))
+ # As list of tuples w/ not enough names
+ assert_equal(easy_dtype(ndtype, names="a"),
+ np.dtype([('a', int), ('f0', float)]))
+ # As list of tuples w/ too many names
+ assert_equal(easy_dtype(ndtype, names="a,b,c"),
+ np.dtype([('a', int), ('b', float)]))
+ # As list of types w/o names
+ ndtype = (int, float, float)
+ assert_equal(easy_dtype(ndtype),
+ np.dtype([('f0', int), ('f1', float), ('f2', float)]))
+        # As list of types w/ names
+ ndtype = (int, float, float)
+ assert_equal(easy_dtype(ndtype, names="a, b, c"),
+ np.dtype([('a', int), ('b', float), ('c', float)]))
+ # As simple dtype w/ names
+ ndtype = np.dtype(float)
+ assert_equal(easy_dtype(ndtype, names="a, b, c"),
+ np.dtype([(_, float) for _ in ('a', 'b', 'c')]))
+ # As simple dtype w/o names (but multiple fields)
+ ndtype = np.dtype(float)
+ assert_equal(
+ easy_dtype(ndtype, names=['', '', ''], defaultfmt="f%02i"),
+ np.dtype([(_, float) for _ in ('f00', 'f01', 'f02')]))
+
+ def test_flatten_dtype(self):
+ "Testing flatten_dtype"
+ # Standard dtype
+ dt = np.dtype([("a", "f8"), ("b", "f8")])
+ dt_flat = flatten_dtype(dt)
+ assert_equal(dt_flat, [float, float])
+ # Recursive dtype
+ dt = np.dtype([("a", [("aa", '|S1'), ("ab", '|S2')]), ("b", int)])
+ dt_flat = flatten_dtype(dt)
+ assert_equal(dt_flat, [np.dtype('|S1'), np.dtype('|S2'), int])
+ # dtype with shaped fields
+ dt = np.dtype([("a", (float, 2)), ("b", (int, 3))])
+ dt_flat = flatten_dtype(dt)
+ assert_equal(dt_flat, [float, int])
+ dt_flat = flatten_dtype(dt, True)
+ assert_equal(dt_flat, [float] * 2 + [int] * 3)
+ # dtype w/ titles
+ dt = np.dtype([(("a", "A"), "f8"), (("b", "B"), "f8")])
+ dt_flat = flatten_dtype(dt)
+ assert_equal(dt_flat, [float, float])
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test__version.py b/contrib/python/numpy/py2/numpy/lib/tests/test__version.py
new file mode 100644
index 00000000000..8e66a0c0323
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test__version.py
@@ -0,0 +1,66 @@
+"""Tests for the NumpyVersion class.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from numpy.testing import assert_, assert_raises
+from numpy.lib import NumpyVersion
+
+
+def test_main_versions():
+ assert_(NumpyVersion('1.8.0') == '1.8.0')
+ for ver in ['1.9.0', '2.0.0', '1.8.1']:
+ assert_(NumpyVersion('1.8.0') < ver)
+
+ for ver in ['1.7.0', '1.7.1', '0.9.9']:
+ assert_(NumpyVersion('1.8.0') > ver)
+
+
+def test_version_1_point_10():
+ # regression test for gh-2998.
+ assert_(NumpyVersion('1.9.0') < '1.10.0')
+ assert_(NumpyVersion('1.11.0') < '1.11.1')
+ assert_(NumpyVersion('1.11.0') == '1.11.0')
+ assert_(NumpyVersion('1.99.11') < '1.99.12')
+
+
+def test_alpha_beta_rc():
+ assert_(NumpyVersion('1.8.0rc1') == '1.8.0rc1')
+ for ver in ['1.8.0', '1.8.0rc2']:
+ assert_(NumpyVersion('1.8.0rc1') < ver)
+
+ for ver in ['1.8.0a2', '1.8.0b3', '1.7.2rc4']:
+ assert_(NumpyVersion('1.8.0rc1') > ver)
+
+ assert_(NumpyVersion('1.8.0b1') > '1.8.0a2')
+
+
+def test_dev_version():
+ assert_(NumpyVersion('1.9.0.dev-Unknown') < '1.9.0')
+ for ver in ['1.9.0', '1.9.0a1', '1.9.0b2', '1.9.0b2.dev-ffffffff']:
+ assert_(NumpyVersion('1.9.0.dev-f16acvda') < ver)
+
+ assert_(NumpyVersion('1.9.0.dev-f16acvda') == '1.9.0.dev-11111111')
+
+
+def test_dev_a_b_rc_mixed():
+ assert_(NumpyVersion('1.9.0a2.dev-f16acvda') == '1.9.0a2.dev-11111111')
+ assert_(NumpyVersion('1.9.0a2.dev-6acvda54') < '1.9.0a2')
+
+
+def test_dev0_version():
+ assert_(NumpyVersion('1.9.0.dev0+Unknown') < '1.9.0')
+ for ver in ['1.9.0', '1.9.0a1', '1.9.0b2', '1.9.0b2.dev0+ffffffff']:
+ assert_(NumpyVersion('1.9.0.dev0+f16acvda') < ver)
+
+ assert_(NumpyVersion('1.9.0.dev0+f16acvda') == '1.9.0.dev0+11111111')
+
+
+def test_dev0_a_b_rc_mixed():
+ assert_(NumpyVersion('1.9.0a2.dev0+f16acvda') == '1.9.0a2.dev0+11111111')
+ assert_(NumpyVersion('1.9.0a2.dev0+6acvda54') < '1.9.0a2')
+
+
+def test_raises():
+ for ver in ['1.9', '1,9.0', '1.7.x']:
+ assert_raises(ValueError, NumpyVersion, ver)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_arraypad.py b/contrib/python/numpy/py2/numpy/lib/tests/test_arraypad.py
new file mode 100644
index 00000000000..6620db8df31
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_arraypad.py
@@ -0,0 +1,1286 @@
+"""Tests for the array padding functions.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+import numpy as np
+from numpy.testing import (assert_array_equal, assert_raises, assert_allclose,
+ assert_equal)
+from numpy.lib import pad
+from numpy.lib.arraypad import _as_pairs
+
+
+class TestAsPairs(object):
+
+ def test_single_value(self):
+ """Test casting for a single value."""
+ expected = np.array([[3, 3]] * 10)
+ for x in (3, [3], [[3]]):
+ result = _as_pairs(x, 10)
+ assert_equal(result, expected)
+ # Test with dtype=object
+ obj = object()
+ assert_equal(
+ _as_pairs(obj, 10),
+ np.array([[obj, obj]] * 10)
+ )
+
+ def test_two_values(self):
+ """Test proper casting for two different values."""
+ # Broadcasting in the first dimension with numbers
+ expected = np.array([[3, 4]] * 10)
+ for x in ([3, 4], [[3, 4]]):
+ result = _as_pairs(x, 10)
+ assert_equal(result, expected)
+ # and with dtype=object
+ obj = object()
+ assert_equal(
+ _as_pairs(["a", obj], 10),
+ np.array([["a", obj]] * 10)
+ )
+
+ # Broadcasting in the second / last dimension with numbers
+ assert_equal(
+ _as_pairs([[3], [4]], 2),
+ np.array([[3, 3], [4, 4]])
+ )
+ # and with dtype=object
+ assert_equal(
+ _as_pairs([["a"], [obj]], 2),
+ np.array([["a", "a"], [obj, obj]])
+ )
+
+ def test_with_none(self):
+ expected = ((None, None), (None, None), (None, None))
+ assert_equal(
+ _as_pairs(None, 3, as_index=False),
+ expected
+ )
+ assert_equal(
+ _as_pairs(None, 3, as_index=True),
+ expected
+ )
+
+ def test_pass_through(self):
+ """Test if `x` already matching desired output are passed through."""
+ expected = np.arange(12).reshape((6, 2))
+ assert_equal(
+ _as_pairs(expected, 6),
+ expected
+ )
+
+ def test_as_index(self):
+ """Test results if `as_index=True`."""
+ assert_equal(
+ _as_pairs([2.6, 3.3], 10, as_index=True),
+ np.array([[3, 3]] * 10, dtype=np.intp)
+ )
+ assert_equal(
+ _as_pairs([2.6, 4.49], 10, as_index=True),
+ np.array([[3, 4]] * 10, dtype=np.intp)
+ )
+ for x in (-3, [-3], [[-3]], [-3, 4], [3, -4], [[-3, 4]], [[4, -3]],
+ [[1, 2]] * 9 + [[1, -2]]):
+ with pytest.raises(ValueError, match="negative values"):
+ _as_pairs(x, 10, as_index=True)
+
+ def test_exceptions(self):
+ """Ensure faulty usage is discovered."""
+ with pytest.raises(ValueError, match="more dimensions than allowed"):
+ _as_pairs([[[3]]], 10)
+ with pytest.raises(ValueError, match="could not be broadcast"):
+ _as_pairs([[1, 2], [3, 4]], 3)
+ with pytest.raises(ValueError, match="could not be broadcast"):
+ _as_pairs(np.ones((2, 3)), 3)
+
+
+class TestConditionalShortcuts(object):
+ def test_zero_padding_shortcuts(self):
+ test = np.arange(120).reshape(4, 5, 6)
+ pad_amt = [(0, 0) for axis in test.shape]
+ modes = ['constant',
+ 'edge',
+ 'linear_ramp',
+ 'maximum',
+ 'mean',
+ 'median',
+ 'minimum',
+ 'reflect',
+ 'symmetric',
+ 'wrap',
+ ]
+ for mode in modes:
+ assert_array_equal(test, pad(test, pad_amt, mode=mode))
+
+ def test_shallow_statistic_range(self):
+ test = np.arange(120).reshape(4, 5, 6)
+ pad_amt = [(1, 1) for axis in test.shape]
+ modes = ['maximum',
+ 'mean',
+ 'median',
+ 'minimum',
+ ]
+ for mode in modes:
+ assert_array_equal(pad(test, pad_amt, mode='edge'),
+ pad(test, pad_amt, mode=mode, stat_length=1))
+
+ def test_clip_statistic_range(self):
+ test = np.arange(30).reshape(5, 6)
+ pad_amt = [(3, 3) for axis in test.shape]
+ modes = ['maximum',
+ 'mean',
+ 'median',
+ 'minimum',
+ ]
+ for mode in modes:
+ assert_array_equal(pad(test, pad_amt, mode=mode),
+ pad(test, pad_amt, mode=mode, stat_length=30))
+
+
+class TestStatistic(object):
+ def test_check_mean_stat_length(self):
+ a = np.arange(100).astype('f')
+ a = pad(a, ((25, 20), ), 'mean', stat_length=((2, 3), ))
+ b = np.array(
+ [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
+ 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
+ 0.5, 0.5, 0.5, 0.5, 0.5,
+
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.,
+ 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
+ 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
+ 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
+ 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.,
+ 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
+ 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
+ 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
+ 80., 81., 82., 83., 84., 85., 86., 87., 88., 89.,
+ 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.,
+
+ 98., 98., 98., 98., 98., 98., 98., 98., 98., 98.,
+ 98., 98., 98., 98., 98., 98., 98., 98., 98., 98.
+ ])
+ assert_array_equal(a, b)
+
+ def test_check_maximum_1(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'maximum')
+ b = np.array(
+ [99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_maximum_2(self):
+ a = np.arange(100) + 1
+ a = pad(a, (25, 20), 'maximum')
+ b = np.array(
+ [100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100,
+
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_maximum_stat_length(self):
+ a = np.arange(100) + 1
+ a = pad(a, (25, 20), 'maximum', stat_length=10)
+ b = np.array(
+ [10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10,
+
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_minimum_1(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'minimum')
+ b = np.array(
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_minimum_2(self):
+ a = np.arange(100) + 2
+ a = pad(a, (25, 20), 'minimum')
+ b = np.array(
+ [2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2,
+
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
+ 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
+ 92, 93, 94, 95, 96, 97, 98, 99, 100, 101,
+
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_minimum_stat_length(self):
+ a = np.arange(100) + 1
+ a = pad(a, (25, 20), 'minimum', stat_length=10)
+ b = np.array(
+ [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1,
+
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+
+ 91, 91, 91, 91, 91, 91, 91, 91, 91, 91,
+ 91, 91, 91, 91, 91, 91, 91, 91, 91, 91]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_median(self):
+ a = np.arange(100).astype('f')
+ a = pad(a, (25, 20), 'median')
+ b = np.array(
+ [49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5,
+ 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5,
+ 49.5, 49.5, 49.5, 49.5, 49.5,
+
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.,
+ 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
+ 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
+ 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
+ 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.,
+ 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
+ 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
+ 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
+ 80., 81., 82., 83., 84., 85., 86., 87., 88., 89.,
+ 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.,
+
+ 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5,
+ 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_median_01(self):
+ a = np.array([[3, 1, 4], [4, 5, 9], [9, 8, 2]])
+ a = pad(a, 1, 'median')
+ b = np.array(
+ [[4, 4, 5, 4, 4],
+
+ [3, 3, 1, 4, 3],
+ [5, 4, 5, 9, 5],
+ [8, 9, 8, 2, 8],
+
+ [4, 4, 5, 4, 4]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_median_02(self):
+ a = np.array([[3, 1, 4], [4, 5, 9], [9, 8, 2]])
+ a = pad(a.T, 1, 'median').T
+ b = np.array(
+ [[5, 4, 5, 4, 5],
+
+ [3, 3, 1, 4, 3],
+ [5, 4, 5, 9, 5],
+ [8, 9, 8, 2, 8],
+
+ [5, 4, 5, 4, 5]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_median_stat_length(self):
+ a = np.arange(100).astype('f')
+ a[1] = 2.
+ a[97] = 96.
+ a = pad(a, (25, 20), 'median', stat_length=(3, 5))
+ b = np.array(
+ [ 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
+ 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
+ 2., 2., 2., 2., 2.,
+
+ 0., 2., 2., 3., 4., 5., 6., 7., 8., 9.,
+ 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
+ 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
+ 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
+ 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.,
+ 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
+ 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
+ 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
+ 80., 81., 82., 83., 84., 85., 86., 87., 88., 89.,
+ 90., 91., 92., 93., 94., 95., 96., 96., 98., 99.,
+
+ 96., 96., 96., 96., 96., 96., 96., 96., 96., 96.,
+ 96., 96., 96., 96., 96., 96., 96., 96., 96., 96.]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_mean_shape_one(self):
+ a = [[4, 5, 6]]
+ a = pad(a, (5, 7), 'mean', stat_length=2)
+ b = np.array(
+ [[4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6],
+ [4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_mean_2(self):
+ a = np.arange(100).astype('f')
+ a = pad(a, (25, 20), 'mean')
+ b = np.array(
+ [49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5,
+ 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5,
+ 49.5, 49.5, 49.5, 49.5, 49.5,
+
+ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.,
+ 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
+ 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
+ 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
+ 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.,
+ 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
+ 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
+ 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
+ 80., 81., 82., 83., 84., 85., 86., 87., 88., 89.,
+ 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.,
+
+ 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5,
+ 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5, 49.5]
+ )
+ assert_array_equal(a, b)
+
+ @pytest.mark.parametrize("mode", [
+ pytest.param("mean", marks=pytest.mark.xfail(reason="gh-11216")),
+ "median",
+ "minimum",
+ "maximum"
+ ])
+ def test_same_prepend_append(self, mode):
+ """ Test that appended and prepended values are equal """
+ # This test is constructed to trigger floating point rounding errors in
+ # a way that caused gh-11216 for mode=='mean'
+ a = np.array([-1, 2, -1]) + np.array([0, 1e-12, 0], dtype=np.float64)
+ a = np.pad(a, (1, 1), mode)
+ assert_equal(a[0], a[-1])
+
+
+class TestConstant(object):
+ def test_check_constant(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'constant', constant_values=(10, 20))
+ b = np.array(
+ [10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_constant_zeros(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'constant')
+ b = np.array(
+ [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_constant_float(self):
+ # If input array is int, but constant_values are float, the dtype of
+ # the array to be padded is kept
+ arr = np.arange(30).reshape(5, 6)
+ test = pad(arr, (1, 2), mode='constant',
+ constant_values=1.1)
+ expected = np.array(
+ [[ 1, 1, 1, 1, 1, 1, 1, 1, 1],
+
+ [ 1, 0, 1, 2, 3, 4, 5, 1, 1],
+ [ 1, 6, 7, 8, 9, 10, 11, 1, 1],
+ [ 1, 12, 13, 14, 15, 16, 17, 1, 1],
+ [ 1, 18, 19, 20, 21, 22, 23, 1, 1],
+ [ 1, 24, 25, 26, 27, 28, 29, 1, 1],
+
+ [ 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [ 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+ )
+ assert_allclose(test, expected)
+
+ def test_check_constant_float2(self):
+ # If input array is float, and constant_values are float, the dtype of
+ # the array to be padded is kept - here retaining the float constants
+ arr = np.arange(30).reshape(5, 6)
+ arr_float = arr.astype(np.float64)
+ test = pad(arr_float, ((1, 2), (1, 2)), mode='constant',
+ constant_values=1.1)
+ expected = np.array(
+ [[ 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1],
+
+ [ 1.1, 0. , 1. , 2. , 3. , 4. , 5. , 1.1, 1.1],
+ [ 1.1, 6. , 7. , 8. , 9. , 10. , 11. , 1.1, 1.1],
+ [ 1.1, 12. , 13. , 14. , 15. , 16. , 17. , 1.1, 1.1],
+ [ 1.1, 18. , 19. , 20. , 21. , 22. , 23. , 1.1, 1.1],
+ [ 1.1, 24. , 25. , 26. , 27. , 28. , 29. , 1.1, 1.1],
+
+ [ 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1],
+ [ 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]]
+ )
+ assert_allclose(test, expected)
+
+ def test_check_constant_float3(self):
+ a = np.arange(100, dtype=float)
+ a = pad(a, (25, 20), 'constant', constant_values=(-1.1, -1.2))
+ b = np.array(
+ [-1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1,
+ -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1, -1.1,
+ -1.1, -1.1, -1.1, -1.1, -1.1,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2,
+ -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2, -1.2]
+ )
+ assert_allclose(a, b)
+
+ def test_check_constant_odd_pad_amount(self):
+ arr = np.arange(30).reshape(5, 6)
+ test = pad(arr, ((1,), (2,)), mode='constant',
+ constant_values=3)
+ expected = np.array(
+ [[ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
+
+ [ 3, 3, 0, 1, 2, 3, 4, 5, 3, 3],
+ [ 3, 3, 6, 7, 8, 9, 10, 11, 3, 3],
+ [ 3, 3, 12, 13, 14, 15, 16, 17, 3, 3],
+ [ 3, 3, 18, 19, 20, 21, 22, 23, 3, 3],
+ [ 3, 3, 24, 25, 26, 27, 28, 29, 3, 3],
+
+ [ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]
+ )
+ assert_allclose(test, expected)
+
+ def test_check_constant_pad_2d(self):
+ arr = np.arange(4).reshape(2, 2)
+ test = np.lib.pad(arr, ((1, 2), (1, 3)), mode='constant',
+ constant_values=((1, 2), (3, 4)))
+ expected = np.array(
+ [[3, 1, 1, 4, 4, 4],
+ [3, 0, 1, 4, 4, 4],
+ [3, 2, 3, 4, 4, 4],
+ [3, 2, 2, 4, 4, 4],
+ [3, 2, 2, 4, 4, 4]]
+ )
+ assert_allclose(test, expected)
+
+ def test_check_large_integers(self):
+ uint64_max = 2 ** 64 - 1
+ arr = np.full(5, uint64_max, dtype=np.uint64)
+ test = np.pad(arr, 1, mode="constant", constant_values=arr.min())
+ expected = np.full(7, uint64_max, dtype=np.uint64)
+ assert_array_equal(test, expected)
+
+ int64_max = 2 ** 63 - 1
+ arr = np.full(5, int64_max, dtype=np.int64)
+ test = np.pad(arr, 1, mode="constant", constant_values=arr.min())
+ expected = np.full(7, int64_max, dtype=np.int64)
+ assert_array_equal(test, expected)
+
+ def test_check_object_array(self):
+ arr = np.empty(1, dtype=object)
+ obj_a = object()
+ arr[0] = obj_a
+ obj_b = object()
+ obj_c = object()
+ arr = np.pad(arr, pad_width=1, mode='constant',
+ constant_values=(obj_b, obj_c))
+
+ expected = np.empty((3,), dtype=object)
+ expected[0] = obj_b
+ expected[1] = obj_a
+ expected[2] = obj_c
+
+ assert_array_equal(arr, expected)
+
+
+class TestLinearRamp(object):
+ def test_check_simple(self):
+ a = np.arange(100).astype('f')
+ a = pad(a, (25, 20), 'linear_ramp', end_values=(4, 5))
+ b = np.array(
+ [4.00, 3.84, 3.68, 3.52, 3.36, 3.20, 3.04, 2.88, 2.72, 2.56,
+ 2.40, 2.24, 2.08, 1.92, 1.76, 1.60, 1.44, 1.28, 1.12, 0.96,
+ 0.80, 0.64, 0.48, 0.32, 0.16,
+
+ 0.00, 1.00, 2.00, 3.00, 4.00, 5.00, 6.00, 7.00, 8.00, 9.00,
+ 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0,
+ 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
+ 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0,
+ 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0,
+ 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0,
+ 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0,
+ 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0,
+ 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0,
+ 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0,
+
+ 94.3, 89.6, 84.9, 80.2, 75.5, 70.8, 66.1, 61.4, 56.7, 52.0,
+ 47.3, 42.6, 37.9, 33.2, 28.5, 23.8, 19.1, 14.4, 9.7, 5.]
+ )
+ assert_allclose(a, b, rtol=1e-5, atol=1e-5)
+
+ def test_check_2d(self):
+ arr = np.arange(20).reshape(4, 5).astype(np.float64)
+ test = pad(arr, (2, 2), mode='linear_ramp', end_values=(0, 0))
+ expected = np.array(
+ [[0., 0., 0., 0., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0.5, 1., 1.5, 2., 1., 0.],
+ [0., 0., 0., 1., 2., 3., 4., 2., 0.],
+ [0., 2.5, 5., 6., 7., 8., 9., 4.5, 0.],
+ [0., 5., 10., 11., 12., 13., 14., 7., 0.],
+ [0., 7.5, 15., 16., 17., 18., 19., 9.5, 0.],
+ [0., 3.75, 7.5, 8., 8.5, 9., 9.5, 4.75, 0.],
+ [0., 0., 0., 0., 0., 0., 0., 0., 0.]])
+ assert_allclose(test, expected)
+
+ @pytest.mark.xfail(exceptions=(AssertionError,))
+ def test_object_array(self):
+ from fractions import Fraction
+ arr = np.array([Fraction(1, 2), Fraction(-1, 2)])
+ actual = np.pad(arr, (2, 3), mode='linear_ramp', end_values=0)
+
+ # deliberately chosen to have a non-power-of-2 denominator such that
+ # rounding to floats causes a failure.
+ expected = np.array([
+ Fraction( 0, 12),
+ Fraction( 3, 12),
+ Fraction( 6, 12),
+ Fraction(-6, 12),
+ Fraction(-4, 12),
+ Fraction(-2, 12),
+ Fraction(-0, 12),
+ ])
+ assert_equal(actual, expected)
+
+ @pytest.mark.parametrize("dtype", (
+ np.sctypes["uint"]
+ + np.sctypes["int"]
+ + np.sctypes["float"]
+ + np.sctypes["complex"]
+ ))
+ def test_negative_difference(self, dtype):
+ """
+        Check correct behavior of unsigned dtypes if there is a negative
+        difference between the edge to pad and `end_values`. Both cases are
+        checked so the test stays independent of the implementation. All other
+        dtypes are tested too, in case dtype casting interferes with complex dtypes. See gh-14191.
+ """
+ x = np.array([3], dtype=dtype)
+ result = np.pad(x, 3, mode="linear_ramp", end_values=0)
+ expected = np.array([0, 1, 2, 3, 2, 1, 0], dtype=dtype)
+ assert_equal(result, expected)
+
+ x = np.array([0], dtype=dtype)
+ result = np.pad(x, 3, mode="linear_ramp", end_values=3)
+ expected = np.array([3, 2, 1, 0, 1, 2, 3], dtype=dtype)
+ assert_equal(result, expected)
+
+
+
+class TestReflect(object):
+ def test_check_simple(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'reflect')
+ b = np.array(
+ [25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 98, 97, 96, 95, 94, 93, 92, 91, 90, 89,
+ 88, 87, 86, 85, 84, 83, 82, 81, 80, 79]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_odd_method(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'reflect', reflect_type='odd')
+ b = np.array(
+ [-25, -24, -23, -22, -21, -20, -19, -18, -17, -16,
+ -15, -14, -13, -12, -11, -10, -9, -8, -7, -6,
+ -5, -4, -3, -2, -1,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_large_pad(self):
+ a = [[4, 5, 6], [6, 7, 8]]
+ a = pad(a, (5, 7), 'reflect')
+ b = np.array(
+ [[7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7, 8, 7, 6, 7],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_shape(self):
+ a = [[4, 5, 6]]
+ a = pad(a, (5, 7), 'reflect')
+ b = np.array(
+ [[5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5],
+ [5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5, 6, 5, 4, 5]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_01(self):
+ a = pad([1, 2, 3], 2, 'reflect')
+ b = np.array([3, 2, 1, 2, 3, 2, 1])
+ assert_array_equal(a, b)
+
+ def test_check_02(self):
+ a = pad([1, 2, 3], 3, 'reflect')
+ b = np.array([2, 3, 2, 1, 2, 3, 2, 1, 2])
+ assert_array_equal(a, b)
+
+ def test_check_03(self):
+ a = pad([1, 2, 3], 4, 'reflect')
+ b = np.array([1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3])
+ assert_array_equal(a, b)
+
+ def test_check_padding_an_empty_array(self):
+ a = pad(np.zeros((0, 3)), ((0,), (1,)), mode='reflect')
+ b = np.zeros((0, 5))
+ assert_array_equal(a, b)
+
+
+class TestSymmetric(object):
+ def test_check_simple(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'symmetric')
+ b = np.array(
+ [24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
+ 4, 3, 2, 1, 0,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 99, 98, 97, 96, 95, 94, 93, 92, 91, 90,
+ 89, 88, 87, 86, 85, 84, 83, 82, 81, 80]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_odd_method(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'symmetric', reflect_type='odd')
+ b = np.array(
+ [-24, -23, -22, -21, -20, -19, -18, -17, -16, -15,
+ -14, -13, -12, -11, -10, -9, -8, -7, -6, -5,
+ -4, -3, -2, -1, 0,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+ 109, 110, 111, 112, 113, 114, 115, 116, 117, 118]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_large_pad(self):
+ a = [[4, 5, 6], [6, 7, 8]]
+ a = pad(a, (5, 7), 'symmetric')
+ b = np.array(
+ [[5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [7, 8, 8, 7, 6, 6, 7, 8, 8, 7, 6, 6, 7, 8, 8],
+ [7, 8, 8, 7, 6, 6, 7, 8, 8, 7, 6, 6, 7, 8, 8],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [7, 8, 8, 7, 6, 6, 7, 8, 8, 7, 6, 6, 7, 8, 8],
+
+ [7, 8, 8, 7, 6, 6, 7, 8, 8, 7, 6, 6, 7, 8, 8],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [7, 8, 8, 7, 6, 6, 7, 8, 8, 7, 6, 6, 7, 8, 8],
+ [7, 8, 8, 7, 6, 6, 7, 8, 8, 7, 6, 6, 7, 8, 8],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6]]
+ )
+
+ assert_array_equal(a, b)
+
+ def test_check_large_pad_odd(self):
+ a = [[4, 5, 6], [6, 7, 8]]
+ a = pad(a, (5, 7), 'symmetric', reflect_type='odd')
+ b = np.array(
+ [[-3, -2, -2, -1, 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6],
+ [-3, -2, -2, -1, 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6],
+ [-1, 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8],
+ [-1, 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8],
+ [ 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10],
+
+ [ 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10],
+ [ 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12],
+
+ [ 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12],
+ [ 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14, 14],
+ [ 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14, 14],
+ [ 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 16],
+ [ 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 16],
+ [ 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 16, 17, 18, 18],
+ [ 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 16, 17, 18, 18]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_shape(self):
+ a = [[4, 5, 6]]
+ a = pad(a, (5, 7), 'symmetric')
+ b = np.array(
+ [[5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6],
+ [5, 6, 6, 5, 4, 4, 5, 6, 6, 5, 4, 4, 5, 6, 6]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_01(self):
+ a = pad([1, 2, 3], 2, 'symmetric')
+ b = np.array([2, 1, 1, 2, 3, 3, 2])
+ assert_array_equal(a, b)
+
+ def test_check_02(self):
+ a = pad([1, 2, 3], 3, 'symmetric')
+ b = np.array([3, 2, 1, 1, 2, 3, 3, 2, 1])
+ assert_array_equal(a, b)
+
+ def test_check_03(self):
+ a = pad([1, 2, 3], 6, 'symmetric')
+ b = np.array([1, 2, 3, 3, 2, 1, 1, 2, 3, 3, 2, 1, 1, 2, 3])
+ assert_array_equal(a, b)
+
+
+class TestWrap(object):
+ def test_check_simple(self):
+ a = np.arange(100)
+ a = pad(a, (25, 20), 'wrap')
+ b = np.array(
+ [75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+ 95, 96, 97, 98, 99,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_large_pad(self):
+ a = np.arange(12)
+ a = np.reshape(a, (3, 4))
+ a = pad(a, (10, 12), 'wrap')
+ b = np.array(
+ [[10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11],
+ [2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
+ 3, 0, 1, 2, 3, 0, 1, 2, 3],
+ [6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
+ 7, 4, 5, 6, 7, 4, 5, 6, 7],
+ [10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10,
+ 11, 8, 9, 10, 11, 8, 9, 10, 11]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_01(self):
+ a = pad([1, 2, 3], 3, 'wrap')
+ b = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
+ assert_array_equal(a, b)
+
+ def test_check_02(self):
+ a = pad([1, 2, 3], 4, 'wrap')
+ b = np.array([3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1])
+ assert_array_equal(a, b)
+
+ def test_pad_with_zero(self):
+ a = np.ones((3, 5))
+ b = np.pad(a, (0, 5), mode="wrap")
+ assert_array_equal(a, b[:-5, :-5])
+
+
+class TestStatLen(object):
+ def test_check_simple(self):
+ a = np.arange(30)
+ a = np.reshape(a, (6, 5))
+ a = pad(a, ((2, 3), (3, 2)), mode='mean', stat_length=(3,))
+ b = np.array(
+ [[6, 6, 6, 5, 6, 7, 8, 9, 8, 8],
+ [6, 6, 6, 5, 6, 7, 8, 9, 8, 8],
+
+ [1, 1, 1, 0, 1, 2, 3, 4, 3, 3],
+ [6, 6, 6, 5, 6, 7, 8, 9, 8, 8],
+ [11, 11, 11, 10, 11, 12, 13, 14, 13, 13],
+ [16, 16, 16, 15, 16, 17, 18, 19, 18, 18],
+ [21, 21, 21, 20, 21, 22, 23, 24, 23, 23],
+ [26, 26, 26, 25, 26, 27, 28, 29, 28, 28],
+
+ [21, 21, 21, 20, 21, 22, 23, 24, 23, 23],
+ [21, 21, 21, 20, 21, 22, 23, 24, 23, 23],
+ [21, 21, 21, 20, 21, 22, 23, 24, 23, 23]]
+ )
+ assert_array_equal(a, b)
+
+
+class TestEdge(object):
+ def test_check_simple(self):
+ a = np.arange(12)
+ a = np.reshape(a, (4, 3))
+ a = pad(a, ((2, 3), (3, 2)), 'edge')
+ b = np.array(
+ [[0, 0, 0, 0, 1, 2, 2, 2],
+ [0, 0, 0, 0, 1, 2, 2, 2],
+
+ [0, 0, 0, 0, 1, 2, 2, 2],
+ [3, 3, 3, 3, 4, 5, 5, 5],
+ [6, 6, 6, 6, 7, 8, 8, 8],
+ [9, 9, 9, 9, 10, 11, 11, 11],
+
+ [9, 9, 9, 9, 10, 11, 11, 11],
+ [9, 9, 9, 9, 10, 11, 11, 11],
+ [9, 9, 9, 9, 10, 11, 11, 11]]
+ )
+ assert_array_equal(a, b)
+
+ def test_check_width_shape_1_2(self):
+ # Check a pad_width of the form ((1, 2),).
+ # Regression test for issue gh-7808.
+ a = np.array([1, 2, 3])
+ padded = pad(a, ((1, 2),), 'edge')
+ expected = np.array([1, 1, 2, 3, 3, 3])
+ assert_array_equal(padded, expected)
+
+ a = np.array([[1, 2, 3], [4, 5, 6]])
+ padded = pad(a, ((1, 2),), 'edge')
+ expected = pad(a, ((1, 2), (1, 2)), 'edge')
+ assert_array_equal(padded, expected)
+
+ a = np.arange(24).reshape(2, 3, 4)
+ padded = pad(a, ((1, 2),), 'edge')
+ expected = pad(a, ((1, 2), (1, 2), (1, 2)), 'edge')
+ assert_array_equal(padded, expected)
+
+
+class TestZeroPadWidth(object):
+ def test_zero_pad_width(self):
+ arr = np.arange(30)
+ arr = np.reshape(arr, (6, 5))
+ for pad_width in (0, (0, 0), ((0, 0), (0, 0))):
+ assert_array_equal(arr, pad(arr, pad_width, mode='constant'))
+
+
+class TestLegacyVectorFunction(object):
+ def test_legacy_vector_functionality(self):
+ def _padwithtens(vector, pad_width, iaxis, kwargs):
+ vector[:pad_width[0]] = 10
+ vector[-pad_width[1]:] = 10
+ return vector
+
+ a = np.arange(6).reshape(2, 3)
+ a = pad(a, 2, _padwithtens)
+ b = np.array(
+ [[10, 10, 10, 10, 10, 10, 10],
+ [10, 10, 10, 10, 10, 10, 10],
+
+ [10, 10, 0, 1, 2, 10, 10],
+ [10, 10, 3, 4, 5, 10, 10],
+
+ [10, 10, 10, 10, 10, 10, 10],
+ [10, 10, 10, 10, 10, 10, 10]]
+ )
+ assert_array_equal(a, b)
+
+
+class TestNdarrayPadWidth(object):
+ def test_check_simple(self):
+ a = np.arange(12)
+ a = np.reshape(a, (4, 3))
+ a = pad(a, np.array(((2, 3), (3, 2))), 'edge')
+ b = np.array(
+ [[0, 0, 0, 0, 1, 2, 2, 2],
+ [0, 0, 0, 0, 1, 2, 2, 2],
+
+ [0, 0, 0, 0, 1, 2, 2, 2],
+ [3, 3, 3, 3, 4, 5, 5, 5],
+ [6, 6, 6, 6, 7, 8, 8, 8],
+ [9, 9, 9, 9, 10, 11, 11, 11],
+
+ [9, 9, 9, 9, 10, 11, 11, 11],
+ [9, 9, 9, 9, 10, 11, 11, 11],
+ [9, 9, 9, 9, 10, 11, 11, 11]]
+ )
+ assert_array_equal(a, b)
+
+
+class TestUnicodeInput(object):
+ def test_unicode_mode(self):
+ constant_mode = u'constant'
+ a = np.pad([1], 2, mode=constant_mode)
+ b = np.array([0, 0, 1, 0, 0])
+ assert_array_equal(a, b)
+
+
+class TestObjectInput(object):
+ def test_object_input(self):
+ # Regression test for issue gh-11395.
+ a = np.full((4, 3), None)
+ pad_amt = ((2, 3), (3, 2))
+ b = np.full((9, 8), None)
+ modes = ['edge',
+ 'symmetric',
+ 'reflect',
+ 'wrap',
+ ]
+ for mode in modes:
+ assert_array_equal(pad(a, pad_amt, mode=mode), b)
+
+
+class TestValueError1(object):
+ def test_check_simple(self):
+ arr = np.arange(30)
+ arr = np.reshape(arr, (6, 5))
+ kwargs = dict(mode='mean', stat_length=(3, ))
+ assert_raises(ValueError, pad, arr, ((2, 3), (3, 2), (4, 5)),
+ **kwargs)
+
+ def test_check_negative_stat_length(self):
+ arr = np.arange(30)
+ arr = np.reshape(arr, (6, 5))
+ kwargs = dict(mode='mean', stat_length=(-3, ))
+ assert_raises(ValueError, pad, arr, ((2, 3), (3, 2)),
+ **kwargs)
+
+ def test_check_negative_pad_width(self):
+ arr = np.arange(30)
+ arr = np.reshape(arr, (6, 5))
+ kwargs = dict(mode='mean', stat_length=(3, ))
+ assert_raises(ValueError, pad, arr, ((-2, 3), (3, 2)),
+ **kwargs)
+
+ def test_check_empty_array(self):
+ assert_raises(ValueError, pad, [], 4, mode='reflect')
+ assert_raises(ValueError, pad, np.ndarray(0), 4, mode='reflect')
+ assert_raises(ValueError, pad, np.zeros((0, 3)), ((1,), (0,)),
+ mode='reflect')
+
+
+class TestValueError2(object):
+ def test_check_negative_pad_amount(self):
+ arr = np.arange(30)
+ arr = np.reshape(arr, (6, 5))
+ kwargs = dict(mode='mean', stat_length=(3, ))
+ assert_raises(ValueError, pad, arr, ((-2, 3), (3, 2)),
+ **kwargs)
+
+
+class TestValueError3(object):
+ def test_check_kwarg_not_allowed(self):
+ arr = np.arange(30).reshape(5, 6)
+ assert_raises(ValueError, pad, arr, 4, mode='mean',
+ reflect_type='odd')
+
+ def test_mode_not_set(self):
+ arr = np.arange(30).reshape(5, 6)
+ assert_raises(TypeError, pad, arr, 4)
+
+ def test_malformed_pad_amount(self):
+ arr = np.arange(30).reshape(5, 6)
+ assert_raises(ValueError, pad, arr, (4, 5, 6, 7), mode='constant')
+
+ def test_malformed_pad_amount2(self):
+ arr = np.arange(30).reshape(5, 6)
+ assert_raises(ValueError, pad, arr, ((3, 4, 5), (0, 1, 2)),
+ mode='constant')
+
+ def test_pad_too_many_axes(self):
+ arr = np.arange(30).reshape(5, 6)
+
+ # Attempt to pad using a 3D array equivalent
+ bad_shape = (((3,), (4,), (5,)), ((0,), (1,), (2,)))
+ assert_raises(ValueError, pad, arr, bad_shape,
+ mode='constant')
+
+
+class TestTypeError1(object):
+ def test_float(self):
+ arr = np.arange(30)
+ assert_raises(TypeError, pad, arr, ((-2.1, 3), (3, 2)))
+ assert_raises(TypeError, pad, arr, np.array(((-2.1, 3), (3, 2))))
+
+ def test_str(self):
+ arr = np.arange(30)
+ assert_raises(TypeError, pad, arr, 'foo')
+ assert_raises(TypeError, pad, arr, np.array('foo'))
+
+ def test_object(self):
+ class FooBar(object):
+ pass
+ arr = np.arange(30)
+ assert_raises(TypeError, pad, arr, FooBar())
+
+ def test_complex(self):
+ arr = np.arange(30)
+ assert_raises(TypeError, pad, arr, complex(1, -1))
+ assert_raises(TypeError, pad, arr, np.array(complex(1, -1)))
+
+ def test_check_wrong_pad_amount(self):
+ arr = np.arange(30)
+ arr = np.reshape(arr, (6, 5))
+ kwargs = dict(mode='mean', stat_length=(3, ))
+ assert_raises(TypeError, pad, arr, ((2, 3, 4), (3, 2)),
+ **kwargs)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_arraysetops.py b/contrib/python/numpy/py2/numpy/lib/tests/test_arraysetops.py
new file mode 100644
index 00000000000..93d4b279f30
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_arraysetops.py
@@ -0,0 +1,623 @@
+"""Test functions for 1D array set operations.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+
+from numpy.testing import (assert_array_equal, assert_equal,
+ assert_raises, assert_raises_regex)
+from numpy.lib.arraysetops import (
+ ediff1d, intersect1d, setxor1d, union1d, setdiff1d, unique, in1d, isin
+ )
+import pytest
+
+
+
+class TestSetOps(object):
+
+ def test_intersect1d(self):
+ # unique inputs
+ a = np.array([5, 7, 1, 2])
+ b = np.array([2, 4, 3, 1, 5])
+
+ ec = np.array([1, 2, 5])
+ c = intersect1d(a, b, assume_unique=True)
+ assert_array_equal(c, ec)
+
+ # non-unique inputs
+ a = np.array([5, 5, 7, 1, 2])
+ b = np.array([2, 1, 4, 3, 3, 1, 5])
+
+ ed = np.array([1, 2, 5])
+ c = intersect1d(a, b)
+ assert_array_equal(c, ed)
+ assert_array_equal([], intersect1d([], []))
+
+ def test_intersect1d_array_like(self):
+ # See gh-11772
+ class Test(object):
+ def __array__(self):
+ return np.arange(3)
+
+ a = Test()
+ res = intersect1d(a, a)
+ assert_array_equal(res, a)
+ res = intersect1d([1, 2, 3], [1, 2, 3])
+ assert_array_equal(res, [1, 2, 3])
+
+ def test_intersect1d_indices(self):
+ # unique inputs
+ a = np.array([1, 2, 3, 4])
+ b = np.array([2, 1, 4, 6])
+ c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True)
+ ee = np.array([1, 2, 4])
+ assert_array_equal(c, ee)
+ assert_array_equal(a[i1], ee)
+ assert_array_equal(b[i2], ee)
+
+ # non-unique inputs
+ a = np.array([1, 2, 2, 3, 4, 3, 2])
+ b = np.array([1, 8, 4, 2, 2, 3, 2, 3])
+ c, i1, i2 = intersect1d(a, b, return_indices=True)
+ ef = np.array([1, 2, 3, 4])
+ assert_array_equal(c, ef)
+ assert_array_equal(a[i1], ef)
+ assert_array_equal(b[i2], ef)
+
+        # non-1d, unique inputs
+ a = np.array([[2, 4, 5, 6], [7, 8, 1, 15]])
+ b = np.array([[3, 2, 7, 6], [10, 12, 8, 9]])
+ c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True)
+ ui1 = np.unravel_index(i1, a.shape)
+ ui2 = np.unravel_index(i2, b.shape)
+ ea = np.array([2, 6, 7, 8])
+ assert_array_equal(ea, a[ui1])
+ assert_array_equal(ea, b[ui2])
+
+        # non-1d, inputs not assumed to be unique
+ a = np.array([[2, 4, 5, 6, 6], [4, 7, 8, 7, 2]])
+ b = np.array([[3, 2, 7, 7], [10, 12, 8, 7]])
+ c, i1, i2 = intersect1d(a, b, return_indices=True)
+ ui1 = np.unravel_index(i1, a.shape)
+ ui2 = np.unravel_index(i2, b.shape)
+ ea = np.array([2, 7, 8])
+ assert_array_equal(ea, a[ui1])
+ assert_array_equal(ea, b[ui2])
+
+ def test_setxor1d(self):
+ a = np.array([5, 7, 1, 2])
+ b = np.array([2, 4, 3, 1, 5])
+
+ ec = np.array([3, 4, 7])
+ c = setxor1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.array([1, 2, 3])
+ b = np.array([6, 5, 4])
+
+ ec = np.array([1, 2, 3, 4, 5, 6])
+ c = setxor1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.array([1, 8, 2, 3])
+ b = np.array([6, 5, 4, 8])
+
+ ec = np.array([1, 2, 3, 4, 5, 6])
+ c = setxor1d(a, b)
+ assert_array_equal(c, ec)
+
+ assert_array_equal([], setxor1d([], []))
+
+ def test_ediff1d(self):
+ zero_elem = np.array([])
+ one_elem = np.array([1])
+ two_elem = np.array([1, 2])
+
+ assert_array_equal([], ediff1d(zero_elem))
+ assert_array_equal([0], ediff1d(zero_elem, to_begin=0))
+ assert_array_equal([0], ediff1d(zero_elem, to_end=0))
+ assert_array_equal([-1, 0], ediff1d(zero_elem, to_begin=-1, to_end=0))
+ assert_array_equal([], ediff1d(one_elem))
+ assert_array_equal([1], ediff1d(two_elem))
+        assert_array_equal([7, 1, 9], ediff1d(two_elem, to_begin=7, to_end=9))
+        assert_array_equal([5, 6, 1, 7, 8], ediff1d(two_elem, to_begin=[5, 6], to_end=[7, 8]))
+        assert_array_equal([1, 9], ediff1d(two_elem, to_end=9))
+        assert_array_equal([1, 7, 8], ediff1d(two_elem, to_end=[7, 8]))
+        assert_array_equal([7, 1], ediff1d(two_elem, to_begin=7))
+        assert_array_equal([5, 6, 1], ediff1d(two_elem, to_begin=[5, 6]))
+
+ @pytest.mark.parametrize("ary, prepend, append", [
+        # should fail because it tries to cast
+        # np.nan, a standard floating point value,
+        # into an integer array:
+ (np.array([1, 2, 3], dtype=np.int64),
+ None,
+ np.nan),
+        # should fail because it attempts
+        # to downcast to a smaller int type:
+ (np.array([1, 2, 3], dtype=np.int16),
+ np.array([5, 1<<20, 2], dtype=np.int32),
+ None),
+ # should fail because attempting to cast
+ # two special floating point values
+ # to integers (on both sides of ary):
+ (np.array([1., 3., 9.], dtype=np.int8),
+ np.nan,
+ np.nan),
+ ])
+ def test_ediff1d_forbidden_type_casts(self, ary, prepend, append):
+ # verify resolution of gh-11490
+
+ # specifically, raise an appropriate
+ # Exception when attempting to append or
+ # prepend with an incompatible type
+ msg = 'cannot convert'
+ with assert_raises_regex(ValueError, msg):
+ ediff1d(ary=ary,
+ to_end=append,
+ to_begin=prepend)
+
+ @pytest.mark.parametrize("ary,"
+ "prepend,"
+ "append,"
+ "expected", [
+ (np.array([1, 2, 3], dtype=np.int16),
+ 0,
+ None,
+ np.array([0, 1, 1], dtype=np.int16)),
+ (np.array([1, 2, 3], dtype=np.int32),
+ 0,
+ 0,
+ np.array([0, 1, 1, 0], dtype=np.int32)),
+ (np.array([1, 2, 3], dtype=np.int64),
+ 3,
+ -9,
+ np.array([3, 1, 1, -9], dtype=np.int64)),
+ ])
+ def test_ediff1d_scalar_handling(self,
+ ary,
+ prepend,
+ append,
+ expected):
+ # maintain backwards-compatibility
+ # of scalar prepend / append behavior
+ # in ediff1d following fix for gh-11490
+ actual = np.ediff1d(ary=ary,
+ to_end=append,
+ to_begin=prepend)
+ assert_equal(actual, expected)
+
+
+ def test_isin(self):
+        # the tests for in1d cover most of isin's behavior
+        # if in1d is removed, those tests would need to be changed to test
+        # isin instead.
+ def _isin_slow(a, b):
+ b = np.asarray(b).flatten().tolist()
+ return a in b
+ isin_slow = np.vectorize(_isin_slow, otypes=[bool], excluded={1})
+ def assert_isin_equal(a, b):
+ x = isin(a, b)
+ y = isin_slow(a, b)
+ assert_array_equal(x, y)
+
+ #multidimensional arrays in both arguments
+ a = np.arange(24).reshape([2, 3, 4])
+ b = np.array([[10, 20, 30], [0, 1, 3], [11, 22, 33]])
+ assert_isin_equal(a, b)
+
+ #array-likes as both arguments
+ c = [(9, 8), (7, 6)]
+ d = (9, 7)
+ assert_isin_equal(c, d)
+
+ #zero-d array:
+ f = np.array(3)
+ assert_isin_equal(f, b)
+ assert_isin_equal(a, f)
+ assert_isin_equal(f, f)
+
+ #scalar:
+ assert_isin_equal(5, b)
+ assert_isin_equal(a, 6)
+ assert_isin_equal(5, 6)
+
+ #empty array-like:
+ x = []
+ assert_isin_equal(x, b)
+ assert_isin_equal(a, x)
+ assert_isin_equal(x, x)
+
+ def test_in1d(self):
+ # we use two different sizes for the b array here to test the
+ # two different paths in in1d().
+ for mult in (1, 10):
+            # One check without np.array to make sure lists are handled correctly
+ a = [5, 7, 1, 2]
+ b = [2, 4, 3, 1, 5] * mult
+ ec = np.array([True, False, True, True])
+ c = in1d(a, b, assume_unique=True)
+ assert_array_equal(c, ec)
+
+ a[0] = 8
+ ec = np.array([False, False, True, True])
+ c = in1d(a, b, assume_unique=True)
+ assert_array_equal(c, ec)
+
+ a[0], a[3] = 4, 8
+ ec = np.array([True, False, True, False])
+ c = in1d(a, b, assume_unique=True)
+ assert_array_equal(c, ec)
+
+ a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5])
+ b = [2, 3, 4] * mult
+ ec = [False, True, False, True, True, True, True, True, True,
+ False, True, False, False, False]
+ c = in1d(a, b)
+ assert_array_equal(c, ec)
+
+ b = b + [5, 5, 4] * mult
+ ec = [True, True, True, True, True, True, True, True, True, True,
+ True, False, True, True]
+ c = in1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.array([5, 7, 1, 2])
+ b = np.array([2, 4, 3, 1, 5] * mult)
+ ec = np.array([True, False, True, True])
+ c = in1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.array([5, 7, 1, 1, 2])
+ b = np.array([2, 4, 3, 3, 1, 5] * mult)
+ ec = np.array([True, False, True, True, True])
+ c = in1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.array([5, 5])
+ b = np.array([2, 2] * mult)
+ ec = np.array([False, False])
+ c = in1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.array([5])
+ b = np.array([2])
+ ec = np.array([False])
+ c = in1d(a, b)
+ assert_array_equal(c, ec)
+
+ assert_array_equal(in1d([], []), [])
+
+ def test_in1d_char_array(self):
+ a = np.array(['a', 'b', 'c', 'd', 'e', 'c', 'e', 'b'])
+ b = np.array(['a', 'c'])
+
+ ec = np.array([True, False, True, False, False, True, False, False])
+ c = in1d(a, b)
+
+ assert_array_equal(c, ec)
+
+ def test_in1d_invert(self):
+ "Test in1d's invert parameter"
+ # We use two different sizes for the b array here to test the
+ # two different paths in in1d().
+ for mult in (1, 10):
+ a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5])
+ b = [2, 3, 4] * mult
+ assert_array_equal(np.invert(in1d(a, b)), in1d(a, b, invert=True))
+
+ def test_in1d_ravel(self):
+        # Test that in1d ravels its input arrays. This is not documented
+        # behavior, however; the test is to ensure consistency.
+ a = np.arange(6).reshape(2, 3)
+ b = np.arange(3, 9).reshape(3, 2)
+ long_b = np.arange(3, 63).reshape(30, 2)
+ ec = np.array([False, False, False, True, True, True])
+
+ assert_array_equal(in1d(a, b, assume_unique=True), ec)
+ assert_array_equal(in1d(a, b, assume_unique=False), ec)
+ assert_array_equal(in1d(a, long_b, assume_unique=True), ec)
+ assert_array_equal(in1d(a, long_b, assume_unique=False), ec)
+
+ def test_in1d_first_array_is_object(self):
+ ar1 = [None]
+ ar2 = np.array([1]*10)
+ expected = np.array([False])
+ result = np.in1d(ar1, ar2)
+ assert_array_equal(result, expected)
+
+ def test_in1d_second_array_is_object(self):
+ ar1 = 1
+ ar2 = np.array([None]*10)
+ expected = np.array([False])
+ result = np.in1d(ar1, ar2)
+ assert_array_equal(result, expected)
+
+ def test_in1d_both_arrays_are_object(self):
+ ar1 = [None]
+ ar2 = np.array([None]*10)
+ expected = np.array([True])
+ result = np.in1d(ar1, ar2)
+ assert_array_equal(result, expected)
+
+ def test_in1d_both_arrays_have_structured_dtype(self):
+ # Test arrays of a structured data type containing an integer field
+ # and a field of dtype `object` allowing for arbitrary Python objects
+ dt = np.dtype([('field1', int), ('field2', object)])
+ ar1 = np.array([(1, None)], dtype=dt)
+ ar2 = np.array([(1, None)]*10, dtype=dt)
+ expected = np.array([True])
+ result = np.in1d(ar1, ar2)
+ assert_array_equal(result, expected)
+
+ def test_union1d(self):
+ a = np.array([5, 4, 7, 1, 2])
+ b = np.array([2, 4, 3, 3, 2, 1, 5])
+
+ ec = np.array([1, 2, 3, 4, 5, 7])
+ c = union1d(a, b)
+ assert_array_equal(c, ec)
+
+ # Tests gh-10340, arguments to union1d should be
+ # flattened if they are not already 1D
+ x = np.array([[0, 1, 2], [3, 4, 5]])
+ y = np.array([0, 1, 2, 3, 4])
+ ez = np.array([0, 1, 2, 3, 4, 5])
+ z = union1d(x, y)
+ assert_array_equal(z, ez)
+
+ assert_array_equal([], union1d([], []))
+
+ def test_setdiff1d(self):
+ a = np.array([6, 5, 4, 7, 1, 2, 7, 4])
+ b = np.array([2, 4, 3, 3, 2, 1, 5])
+
+ ec = np.array([6, 7])
+ c = setdiff1d(a, b)
+ assert_array_equal(c, ec)
+
+ a = np.arange(21)
+ b = np.arange(19)
+ ec = np.array([19, 20])
+ c = setdiff1d(a, b)
+ assert_array_equal(c, ec)
+
+ assert_array_equal([], setdiff1d([], []))
+ a = np.array((), np.uint32)
+ assert_equal(setdiff1d(a, []).dtype, np.uint32)
+
+ def test_setdiff1d_unique(self):
+ a = np.array([3, 2, 1])
+ b = np.array([7, 5, 2])
+ expected = np.array([3, 1])
+ actual = setdiff1d(a, b, assume_unique=True)
+ assert_equal(actual, expected)
+
+ def test_setdiff1d_char_array(self):
+ a = np.array(['a', 'b', 'c'])
+ b = np.array(['a', 'b', 's'])
+ assert_array_equal(setdiff1d(a, b), np.array(['c']))
+
+ def test_manyways(self):
+ a = np.array([5, 7, 1, 2, 8])
+ b = np.array([9, 8, 2, 4, 3, 1, 5])
+
+ c1 = setxor1d(a, b)
+ aux1 = intersect1d(a, b)
+ aux2 = union1d(a, b)
+ c2 = setdiff1d(aux2, aux1)
+ assert_array_equal(c1, c2)
+
+
+class TestUnique(object):
+
+ def test_unique_1d(self):
+
+ def check_all(a, b, i1, i2, c, dt):
+ base_msg = 'check {0} failed for type {1}'
+
+ msg = base_msg.format('values', dt)
+ v = unique(a)
+ assert_array_equal(v, b, msg)
+
+ msg = base_msg.format('return_index', dt)
+ v, j = unique(a, 1, 0, 0)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j, i1, msg)
+
+ msg = base_msg.format('return_inverse', dt)
+ v, j = unique(a, 0, 1, 0)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j, i2, msg)
+
+ msg = base_msg.format('return_counts', dt)
+ v, j = unique(a, 0, 0, 1)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j, c, msg)
+
+ msg = base_msg.format('return_index and return_inverse', dt)
+ v, j1, j2 = unique(a, 1, 1, 0)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j1, i1, msg)
+ assert_array_equal(j2, i2, msg)
+
+ msg = base_msg.format('return_index and return_counts', dt)
+ v, j1, j2 = unique(a, 1, 0, 1)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j1, i1, msg)
+ assert_array_equal(j2, c, msg)
+
+ msg = base_msg.format('return_inverse and return_counts', dt)
+ v, j1, j2 = unique(a, 0, 1, 1)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j1, i2, msg)
+ assert_array_equal(j2, c, msg)
+
+ msg = base_msg.format(('return_index, return_inverse '
+ 'and return_counts'), dt)
+ v, j1, j2, j3 = unique(a, 1, 1, 1)
+ assert_array_equal(v, b, msg)
+ assert_array_equal(j1, i1, msg)
+ assert_array_equal(j2, i2, msg)
+ assert_array_equal(j3, c, msg)
+
+ a = [5, 7, 1, 2, 1, 5, 7]*10
+ b = [1, 2, 5, 7]
+ i1 = [2, 3, 0, 1]
+ i2 = [2, 3, 0, 1, 0, 2, 3]*10
+ c = np.multiply([2, 1, 2, 2], 10)
+
+ # test for numeric arrays
+ types = []
+ types.extend(np.typecodes['AllInteger'])
+ types.extend(np.typecodes['AllFloat'])
+ types.append('datetime64[D]')
+ types.append('timedelta64[D]')
+ for dt in types:
+ aa = np.array(a, dt)
+ bb = np.array(b, dt)
+ check_all(aa, bb, i1, i2, c, dt)
+
+ # test for object arrays
+ dt = 'O'
+ aa = np.empty(len(a), dt)
+ aa[:] = a
+ bb = np.empty(len(b), dt)
+ bb[:] = b
+ check_all(aa, bb, i1, i2, c, dt)
+
+ # test for structured arrays
+ dt = [('', 'i'), ('', 'i')]
+ aa = np.array(list(zip(a, a)), dt)
+ bb = np.array(list(zip(b, b)), dt)
+ check_all(aa, bb, i1, i2, c, dt)
+
+ # test for ticket #2799
+ aa = [1. + 0.j, 1 - 1.j, 1]
+ assert_array_equal(np.unique(aa), [1. - 1.j, 1. + 0.j])
+
+ # test for ticket #4785
+ a = [(1, 2), (1, 2), (2, 3)]
+ unq = [1, 2, 3]
+ inv = [0, 1, 0, 1, 1, 2]
+ a1 = unique(a)
+ assert_array_equal(a1, unq)
+ a2, a2_inv = unique(a, return_inverse=True)
+ assert_array_equal(a2, unq)
+ assert_array_equal(a2_inv, inv)
+
+ # test for chararrays with return_inverse (gh-5099)
+ a = np.chararray(5)
+ a[...] = ''
+ a2, a2_inv = np.unique(a, return_inverse=True)
+ assert_array_equal(a2_inv, np.zeros(5))
+
+ # test for ticket #9137
+ a = []
+ a1_idx = np.unique(a, return_index=True)[1]
+ a2_inv = np.unique(a, return_inverse=True)[1]
+ a3_idx, a3_inv = np.unique(a, return_index=True, return_inverse=True)[1:]
+ assert_equal(a1_idx.dtype, np.intp)
+ assert_equal(a2_inv.dtype, np.intp)
+ assert_equal(a3_idx.dtype, np.intp)
+ assert_equal(a3_inv.dtype, np.intp)
+
+ def test_unique_axis_errors(self):
+ assert_raises(TypeError, self._run_axis_tests, object)
+ assert_raises(TypeError, self._run_axis_tests,
+ [('a', int), ('b', object)])
+
+ assert_raises(np.AxisError, unique, np.arange(10), axis=2)
+ assert_raises(np.AxisError, unique, np.arange(10), axis=-2)
+
+ def test_unique_axis_list(self):
+ msg = "Unique failed on list of lists"
+ inp = [[0, 1, 0], [0, 1, 0]]
+ inp_arr = np.asarray(inp)
+ assert_array_equal(unique(inp, axis=0), unique(inp_arr, axis=0), msg)
+ assert_array_equal(unique(inp, axis=1), unique(inp_arr, axis=1), msg)
+
+ def test_unique_axis(self):
+ types = []
+ types.extend(np.typecodes['AllInteger'])
+ types.extend(np.typecodes['AllFloat'])
+ types.append('datetime64[D]')
+ types.append('timedelta64[D]')
+ types.append([('a', int), ('b', int)])
+ types.append([('a', int), ('b', float)])
+
+ for dtype in types:
+ self._run_axis_tests(dtype)
+
+ msg = 'Non-bitwise-equal booleans test failed'
+ data = np.arange(10, dtype=np.uint8).reshape(-1, 2).view(bool)
+ result = np.array([[False, True], [True, True]], dtype=bool)
+ assert_array_equal(unique(data, axis=0), result, msg)
+
+ msg = 'Negative zero equality test failed'
+ data = np.array([[-0.0, 0.0], [0.0, -0.0], [-0.0, 0.0], [0.0, -0.0]])
+ result = np.array([[-0.0, 0.0]])
+ assert_array_equal(unique(data, axis=0), result, msg)
+
+ def test_unique_masked(self):
+ # issue 8664
+ x = np.array([64, 0, 1, 2, 3, 63, 63, 0, 0, 0, 1, 2, 0, 63, 0], dtype='uint8')
+ y = np.ma.masked_equal(x, 0)
+
+ v = np.unique(y)
+ v2, i, c = np.unique(y, return_index=True, return_counts=True)
+
+ msg = 'Unique returned different results when asked for index'
+ assert_array_equal(v.data, v2.data, msg)
+ assert_array_equal(v.mask, v2.mask, msg)
+
+ def test_unique_sort_order_with_axis(self):
+ # These tests fail if sorting along axis is done by treating subarrays
+ # as unsigned byte strings. See gh-10495.
+ fmt = "sort order incorrect for integer type '%s'"
+ for dt in 'bhilq':
+ a = np.array([[-1],[0]], dt)
+ b = np.unique(a, axis=0)
+ assert_array_equal(a, b, fmt % dt)
+
+ def _run_axis_tests(self, dtype):
+ data = np.array([[0, 1, 0, 0],
+ [1, 0, 0, 0],
+ [0, 1, 0, 0],
+ [1, 0, 0, 0]]).astype(dtype)
+
+ msg = 'Unique with 1d array and axis=0 failed'
+ result = np.array([0, 1])
+ assert_array_equal(unique(data), result.astype(dtype), msg)
+
+ msg = 'Unique with 2d array and axis=0 failed'
+ result = np.array([[0, 1, 0, 0], [1, 0, 0, 0]])
+ assert_array_equal(unique(data, axis=0), result.astype(dtype), msg)
+
+ msg = 'Unique with 2d array and axis=1 failed'
+ result = np.array([[0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 1, 0]])
+ assert_array_equal(unique(data, axis=1), result.astype(dtype), msg)
+
+ msg = 'Unique with 3d array and axis=2 failed'
+ data3d = np.dstack([data] * 3)
+ result = data3d[..., :1]
+ assert_array_equal(unique(data3d, axis=2), result, msg)
+
+ uniq, idx, inv, cnt = unique(data, axis=0, return_index=True,
+ return_inverse=True, return_counts=True)
+ msg = "Unique's return_index=True failed with axis=0"
+ assert_array_equal(data[idx], uniq, msg)
+ msg = "Unique's return_inverse=True failed with axis=0"
+        assert_array_equal(uniq[inv], data, msg)
+ msg = "Unique's return_counts=True failed with axis=0"
+ assert_array_equal(cnt, np.array([2, 2]), msg)
+
+ uniq, idx, inv, cnt = unique(data, axis=1, return_index=True,
+ return_inverse=True, return_counts=True)
+ msg = "Unique's return_index=True failed with axis=1"
+        assert_array_equal(data[:, idx], uniq, msg)
+ msg = "Unique's return_inverse=True failed with axis=1"
+        assert_array_equal(uniq[:, inv], data, msg)
+ msg = "Unique's return_counts=True failed with axis=1"
+ assert_array_equal(cnt, np.array([2, 1, 1]), msg)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_arrayterator.py b/contrib/python/numpy/py2/numpy/lib/tests/test_arrayterator.py
new file mode 100644
index 00000000000..2ce4456a5b5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_arrayterator.py
@@ -0,0 +1,48 @@
+from __future__ import division, absolute_import, print_function
+
+from operator import mul
+from functools import reduce
+
+import numpy as np
+from numpy.random import randint
+from numpy.lib import Arrayterator
+from numpy.testing import assert_
+
+
+def test():
+ np.random.seed(np.arange(10))
+
+ # Create a random array
+ ndims = randint(5)+1
+ shape = tuple(randint(10)+1 for dim in range(ndims))
+ els = reduce(mul, shape)
+ a = np.arange(els)
+ a.shape = shape
+
+ buf_size = randint(2*els)
+ b = Arrayterator(a, buf_size)
+
+ # Check that each block has at most ``buf_size`` elements
+ for block in b:
+ assert_(len(block.flat) <= (buf_size or els))
+
+ # Check that all elements are iterated correctly
+ assert_(list(b.flat) == list(a.flat))
+
+ # Slice arrayterator
+ start = [randint(dim) for dim in shape]
+ stop = [randint(dim)+1 for dim in shape]
+ step = [randint(dim)+1 for dim in shape]
+ slice_ = tuple(slice(*t) for t in zip(start, stop, step))
+ c = b[slice_]
+ d = a[slice_]
+
+ # Check that each block has at most ``buf_size`` elements
+ for block in c:
+ assert_(len(block.flat) <= (buf_size or els))
+
+ # Check that the arrayterator is sliced correctly
+ assert_(np.all(c.__array__() == d))
+
+ # Check that all elements are iterated correctly
+ assert_(list(c.flat) == list(d.flat))
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_financial.py b/contrib/python/numpy/py2/numpy/lib/tests/test_financial.py
new file mode 100644
index 00000000000..52491504114
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_financial.py
@@ -0,0 +1,340 @@
+from __future__ import division, absolute_import, print_function
+
+from decimal import Decimal
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_almost_equal, assert_allclose, assert_equal, assert_raises
+ )
+
+
+class TestFinancial(object):
+ def test_rate(self):
+ assert_almost_equal(
+ np.rate(10, 0, -3500, 10000),
+ 0.1107, 4)
+
+ def test_rate_decimal(self):
+ rate = np.rate(Decimal('10'), Decimal('0'), Decimal('-3500'), Decimal('10000'))
+ assert_equal(Decimal('0.1106908537142689284704528100'), rate)
+
+ def test_irr(self):
+ v = [-150000, 15000, 25000, 35000, 45000, 60000]
+ assert_almost_equal(np.irr(v), 0.0524, 2)
+ v = [-100, 0, 0, 74]
+ assert_almost_equal(np.irr(v), -0.0955, 2)
+ v = [-100, 39, 59, 55, 20]
+ assert_almost_equal(np.irr(v), 0.28095, 2)
+ v = [-100, 100, 0, -7]
+ assert_almost_equal(np.irr(v), -0.0833, 2)
+ v = [-100, 100, 0, 7]
+ assert_almost_equal(np.irr(v), 0.06206, 2)
+ v = [-5, 10.5, 1, -8, 1]
+ assert_almost_equal(np.irr(v), 0.0886, 2)
+
+ # Test that if there is no solution then np.irr returns nan
+ # Fixes gh-6744
+ v = [-1, -2, -3]
+ assert_equal(np.irr(v), np.nan)
+
+ def test_pv(self):
+ assert_almost_equal(np.pv(0.07, 20, 12000, 0), -127128.17, 2)
+
+ def test_pv_decimal(self):
+ assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0')),
+ Decimal('-127128.1709461939327295222005'))
+
+ def test_fv(self):
+ assert_equal(np.fv(0.075, 20, -2000, 0, 0), 86609.362673042924)
+
+ def test_fv_decimal(self):
+ assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), 0, 0),
+ Decimal('86609.36267304300040536731624'))
+
+ def test_pmt(self):
+ res = np.pmt(0.08 / 12, 5 * 12, 15000)
+ tgt = -304.145914
+ assert_allclose(res, tgt)
+ # Test the edge case where rate == 0.0
+ res = np.pmt(0.0, 5 * 12, 15000)
+ tgt = -250.0
+ assert_allclose(res, tgt)
+ # Test the case where we use broadcast and
+ # the arguments passed in are arrays.
+ res = np.pmt([[0.0, 0.8], [0.3, 0.8]], [12, 3], [2000, 20000])
+ tgt = np.array([[-166.66667, -19311.258], [-626.90814, -19311.258]])
+ assert_allclose(res, tgt)
+
+ def test_pmt_decimal(self):
+ res = np.pmt(Decimal('0.08') / Decimal('12'), 5 * 12, 15000)
+ tgt = Decimal('-304.1459143262052370338701494')
+ assert_equal(res, tgt)
+ # Test the edge case where rate == 0.0
+ res = np.pmt(Decimal('0'), Decimal('60'), Decimal('15000'))
+ tgt = -250
+ assert_equal(res, tgt)
+ # Test the case where we use broadcast and
+ # the arguments passed in are arrays.
+ res = np.pmt([[Decimal('0'), Decimal('0.8')], [Decimal('0.3'), Decimal('0.8')]],
+ [Decimal('12'), Decimal('3')], [Decimal('2000'), Decimal('20000')])
+ tgt = np.array([[Decimal('-166.6666666666666666666666667'), Decimal('-19311.25827814569536423841060')],
+ [Decimal('-626.9081401700757748402586600'), Decimal('-19311.25827814569536423841060')]])
+
+        # Cannot use `assert_allclose` because it uses isfinite under the covers,
+        # which does not support the Decimal type
+ # See issue: https://github.com/numpy/numpy/issues/9954
+ assert_equal(res[0][0], tgt[0][0])
+ assert_equal(res[0][1], tgt[0][1])
+ assert_equal(res[1][0], tgt[1][0])
+ assert_equal(res[1][1], tgt[1][1])
+
+ def test_ppmt(self):
+ assert_equal(np.round(np.ppmt(0.1 / 12, 1, 60, 55000), 2), -710.25)
+
+ def test_ppmt_decimal(self):
+ assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000')),
+ Decimal('-710.2541257864217612489830917'))
+
+    # Two tests showing how Decimal actually gets at a more exact result
+    # .23 / 12 does not come out nicely as a float but does as a Decimal
+ def test_ppmt_special_rate(self):
+ assert_equal(np.round(np.ppmt(0.23 / 12, 1, 60, 10000000000), 8), -90238044.232277036)
+
+ def test_ppmt_special_rate_decimal(self):
+        # When rounded to 8 decimal places like the float-based test above, the result should not
+        # equal the float value, because Decimal retains more precision than the float substitute
+ def raise_error_because_not_equal():
+ assert_equal(
+ round(np.ppmt(Decimal('0.23') / Decimal('12'), 1, 60, Decimal('10000000000')), 8),
+ Decimal('-90238044.232277036'))
+
+ assert_raises(AssertionError, raise_error_because_not_equal)
+ assert_equal(np.ppmt(Decimal('0.23') / Decimal('12'), 1, 60, Decimal('10000000000')),
+ Decimal('-90238044.2322778884413969909'))
+
+ def test_ipmt(self):
+ assert_almost_equal(np.round(np.ipmt(0.1 / 12, 1, 24, 2000), 2), -16.67)
+
+ def test_ipmt_decimal(self):
+ result = np.ipmt(Decimal('0.1') / Decimal('12'), 1, 24, 2000)
+ assert_equal(result.flat[0], Decimal('-16.66666666666666666666666667'))
+
+ def test_nper(self):
+ assert_almost_equal(np.nper(0.075, -2000, 0, 100000.),
+ 21.54, 2)
+
+ def test_nper2(self):
+ assert_almost_equal(np.nper(0.0, -2000, 0, 100000.),
+ 50.0, 1)
+
+ def test_npv(self):
+ assert_almost_equal(
+ np.npv(0.05, [-15000, 1500, 2500, 3500, 4500, 6000]),
+ 122.89, 2)
+
+ def test_npv_decimal(self):
+ assert_equal(
+ np.npv(Decimal('0.05'), [-15000, 1500, 2500, 3500, 4500, 6000]),
+ Decimal('122.894854950942692161628715'))
+
+ def test_mirr(self):
+ val = [-4500, -800, 800, 800, 600, 600, 800, 800, 700, 3000]
+ assert_almost_equal(np.mirr(val, 0.08, 0.055), 0.0666, 4)
+
+ val = [-120000, 39000, 30000, 21000, 37000, 46000]
+ assert_almost_equal(np.mirr(val, 0.10, 0.12), 0.126094, 6)
+
+ val = [100, 200, -50, 300, -200]
+ assert_almost_equal(np.mirr(val, 0.05, 0.06), 0.3428, 4)
+
+ val = [39000, 30000, 21000, 37000, 46000]
+ assert_(np.isnan(np.mirr(val, 0.10, 0.12)))
+
+ def test_mirr_decimal(self):
+ val = [Decimal('-4500'), Decimal('-800'), Decimal('800'), Decimal('800'),
+ Decimal('600'), Decimal('600'), Decimal('800'), Decimal('800'),
+ Decimal('700'), Decimal('3000')]
+ assert_equal(np.mirr(val, Decimal('0.08'), Decimal('0.055')),
+ Decimal('0.066597175031553548874239618'))
+
+ val = [Decimal('-120000'), Decimal('39000'), Decimal('30000'),
+ Decimal('21000'), Decimal('37000'), Decimal('46000')]
+ assert_equal(np.mirr(val, Decimal('0.10'), Decimal('0.12')), Decimal('0.126094130365905145828421880'))
+
+ val = [Decimal('100'), Decimal('200'), Decimal('-50'),
+ Decimal('300'), Decimal('-200')]
+ assert_equal(np.mirr(val, Decimal('0.05'), Decimal('0.06')), Decimal('0.342823387842176663647819868'))
+
+ val = [Decimal('39000'), Decimal('30000'), Decimal('21000'), Decimal('37000'), Decimal('46000')]
+ assert_(np.isnan(np.mirr(val, Decimal('0.10'), Decimal('0.12'))))
+
+ def test_when(self):
+ # begin
+ assert_equal(np.rate(10, 20, -3500, 10000, 1),
+ np.rate(10, 20, -3500, 10000, 'begin'))
+ # end
+ assert_equal(np.rate(10, 20, -3500, 10000),
+ np.rate(10, 20, -3500, 10000, 'end'))
+ assert_equal(np.rate(10, 20, -3500, 10000, 0),
+ np.rate(10, 20, -3500, 10000, 'end'))
+
+ # begin
+ assert_equal(np.pv(0.07, 20, 12000, 0, 1),
+ np.pv(0.07, 20, 12000, 0, 'begin'))
+ # end
+ assert_equal(np.pv(0.07, 20, 12000, 0),
+ np.pv(0.07, 20, 12000, 0, 'end'))
+ assert_equal(np.pv(0.07, 20, 12000, 0, 0),
+ np.pv(0.07, 20, 12000, 0, 'end'))
+
+ # begin
+ assert_equal(np.fv(0.075, 20, -2000, 0, 1),
+ np.fv(0.075, 20, -2000, 0, 'begin'))
+ # end
+ assert_equal(np.fv(0.075, 20, -2000, 0),
+ np.fv(0.075, 20, -2000, 0, 'end'))
+ assert_equal(np.fv(0.075, 20, -2000, 0, 0),
+ np.fv(0.075, 20, -2000, 0, 'end'))
+
+ # begin
+ assert_equal(np.pmt(0.08 / 12, 5 * 12, 15000., 0, 1),
+ np.pmt(0.08 / 12, 5 * 12, 15000., 0, 'begin'))
+ # end
+ assert_equal(np.pmt(0.08 / 12, 5 * 12, 15000., 0),
+ np.pmt(0.08 / 12, 5 * 12, 15000., 0, 'end'))
+ assert_equal(np.pmt(0.08 / 12, 5 * 12, 15000., 0, 0),
+ np.pmt(0.08 / 12, 5 * 12, 15000., 0, 'end'))
+
+ # begin
+ assert_equal(np.ppmt(0.1 / 12, 1, 60, 55000, 0, 1),
+ np.ppmt(0.1 / 12, 1, 60, 55000, 0, 'begin'))
+ # end
+ assert_equal(np.ppmt(0.1 / 12, 1, 60, 55000, 0),
+ np.ppmt(0.1 / 12, 1, 60, 55000, 0, 'end'))
+ assert_equal(np.ppmt(0.1 / 12, 1, 60, 55000, 0, 0),
+ np.ppmt(0.1 / 12, 1, 60, 55000, 0, 'end'))
+
+ # begin
+ assert_equal(np.ipmt(0.1 / 12, 1, 24, 2000, 0, 1),
+ np.ipmt(0.1 / 12, 1, 24, 2000, 0, 'begin'))
+ # end
+ assert_equal(np.ipmt(0.1 / 12, 1, 24, 2000, 0),
+ np.ipmt(0.1 / 12, 1, 24, 2000, 0, 'end'))
+ assert_equal(np.ipmt(0.1 / 12, 1, 24, 2000, 0, 0),
+ np.ipmt(0.1 / 12, 1, 24, 2000, 0, 'end'))
+
+ # begin
+ assert_equal(np.nper(0.075, -2000, 0, 100000., 1),
+ np.nper(0.075, -2000, 0, 100000., 'begin'))
+ # end
+ assert_equal(np.nper(0.075, -2000, 0, 100000.),
+ np.nper(0.075, -2000, 0, 100000., 'end'))
+ assert_equal(np.nper(0.075, -2000, 0, 100000., 0),
+ np.nper(0.075, -2000, 0, 100000., 'end'))
+
+ def test_decimal_with_when(self):
+ """Test that decimals are still supported if the when argument is passed"""
+ # begin
+ assert_equal(np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), Decimal('1')),
+ np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), 'begin'))
+ # end
+ assert_equal(np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000')),
+ np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), 'end'))
+ assert_equal(np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), Decimal('0')),
+ np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), 'end'))
+
+ # begin
+ assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), Decimal('1')),
+ np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), 'begin'))
+ # end
+ assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0')),
+ np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), 'end'))
+ assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), Decimal('0')),
+ np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), 'end'))
+
+ # begin
+ assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), Decimal('1')),
+ np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), 'begin'))
+ # end
+ assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0')),
+ np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), 'end'))
+ assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), Decimal('0')),
+ np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), 'end'))
+
+ # begin
+ assert_equal(np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
+ Decimal('0'), Decimal('1')),
+ np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
+ Decimal('0'), 'begin'))
+ # end
+ assert_equal(np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
+ Decimal('0')),
+ np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
+ Decimal('0'), 'end'))
+ assert_equal(np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
+ Decimal('0'), Decimal('0')),
+ np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
+ Decimal('0'), 'end'))
+
+ # begin
+ assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
+ Decimal('0'), Decimal('1')),
+ np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
+ Decimal('0'), 'begin'))
+ # end
+ assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
+ Decimal('0')),
+ np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
+ Decimal('0'), 'end'))
+ assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
+ Decimal('0'), Decimal('0')),
+ np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
+ Decimal('0'), 'end'))
+
+ # begin
+ assert_equal(np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
+ Decimal('0'), Decimal('1')).flat[0],
+ np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
+ Decimal('0'), 'begin').flat[0])
+ # end
+ assert_equal(np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
+ Decimal('0')).flat[0],
+ np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
+ Decimal('0'), 'end').flat[0])
+ assert_equal(np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
+ Decimal('0'), Decimal('0')).flat[0],
+ np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
+ Decimal('0'), 'end').flat[0])
+
+ def test_broadcast(self):
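+        # Scalar and array arguments should broadcast against each other.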
+ assert_almost_equal(np.nper(0.075, -2000, 0, 100000., [0, 1]),
+ [21.5449442, 20.76156441], 4)
+
+ assert_almost_equal(np.ipmt(0.1 / 12, list(range(5)), 24, 2000),
+ [-17.29165168, -16.66666667, -16.03647345,
+ -15.40102862, -14.76028842], 4)
+
+ assert_almost_equal(np.ppmt(0.1 / 12, list(range(5)), 24, 2000),
+ [-74.998201, -75.62318601, -76.25337923,
+ -76.88882405, -77.52956425], 4)
+
+ assert_almost_equal(np.ppmt(0.1 / 12, list(range(5)), 24, 2000, 0,
+ [0, 0, 1, 'end', 'begin']),
+ [-74.998201, -75.62318601, -75.62318601,
+ -76.88882405, -76.88882405], 4)
+
+ def test_broadcast_decimal(self):
+        # Use assert_almost_equal because precision is covered by the explicit tests above;
+        # this test only ensures that broadcasting with Decimal inputs is not broken.
+ assert_almost_equal(np.ipmt(Decimal('0.1') / Decimal('12'), list(range(5)), Decimal('24'), Decimal('2000')),
+ [Decimal('-17.29165168'), Decimal('-16.66666667'), Decimal('-16.03647345'),
+ Decimal('-15.40102862'), Decimal('-14.76028842')], 4)
+
+ assert_almost_equal(np.ppmt(Decimal('0.1') / Decimal('12'), list(range(5)), Decimal('24'), Decimal('2000')),
+ [Decimal('-74.998201'), Decimal('-75.62318601'), Decimal('-76.25337923'),
+ Decimal('-76.88882405'), Decimal('-77.52956425')], 4)
+
+ assert_almost_equal(np.ppmt(Decimal('0.1') / Decimal('12'), list(range(5)), Decimal('24'), Decimal('2000'),
+ Decimal('0'), [Decimal('0'), Decimal('0'), Decimal('1'), 'end', 'begin']),
+ [Decimal('-74.998201'), Decimal('-75.62318601'), Decimal('-75.62318601'),
+ Decimal('-76.88882405'), Decimal('-76.88882405')], 4)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_format.py b/contrib/python/numpy/py2/numpy/lib/tests/test_format.py
new file mode 100644
index 00000000000..4a3fbdf5714
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_format.py
@@ -0,0 +1,940 @@
+from __future__ import division, absolute_import, print_function
+
+# doctest
+r''' Test the .npy file format.
+
+Set up:
+
+ >>> import sys
+ >>> from io import BytesIO
+ >>> from numpy.lib import format
+ >>>
+ >>> scalars = [
+ ... np.uint8,
+ ... np.int8,
+ ... np.uint16,
+ ... np.int16,
+ ... np.uint32,
+ ... np.int32,
+ ... np.uint64,
+ ... np.int64,
+ ... np.float32,
+ ... np.float64,
+ ... np.complex64,
+ ... np.complex128,
+ ... object,
+ ... ]
+ >>>
+ >>> basic_arrays = []
+ >>>
+ >>> for scalar in scalars:
+ ... for endian in '<>':
+ ... dtype = np.dtype(scalar).newbyteorder(endian)
+ ... basic = np.arange(15).astype(dtype)
+ ... basic_arrays.extend([
+ ... np.array([], dtype=dtype),
+ ... np.array(10, dtype=dtype),
+ ... basic,
+ ... basic.reshape((3,5)),
+ ... basic.reshape((3,5)).T,
+ ... basic.reshape((3,5))[::-1,::2],
+ ... ])
+ ...
+ >>>
+ >>> Pdescr = [
+ ... ('x', 'i4', (2,)),
+ ... ('y', 'f8', (2, 2)),
+ ... ('z', 'u1')]
+ >>>
+ >>>
+ >>> PbufferT = [
+ ... ([3,2], [[6.,4.],[6.,4.]], 8),
+ ... ([4,3], [[7.,5.],[7.,5.]], 9),
+ ... ]
+ >>>
+ >>>
+ >>> Ndescr = [
+ ... ('x', 'i4', (2,)),
+ ... ('Info', [
+ ... ('value', 'c16'),
+ ... ('y2', 'f8'),
+ ... ('Info2', [
+ ... ('name', 'S2'),
+ ... ('value', 'c16', (2,)),
+ ... ('y3', 'f8', (2,)),
+ ... ('z3', 'u4', (2,))]),
+ ... ('name', 'S2'),
+ ... ('z2', 'b1')]),
+ ... ('color', 'S2'),
+ ... ('info', [
+ ... ('Name', 'U8'),
+ ... ('Value', 'c16')]),
+ ... ('y', 'f8', (2, 2)),
+ ... ('z', 'u1')]
+ >>>
+ >>>
+ >>> NbufferT = [
+ ... ([3,2], (6j, 6., ('nn', [6j,4j], [6.,4.], [1,2]), 'NN', True), 'cc', ('NN', 6j), [[6.,4.],[6.,4.]], 8),
+ ... ([4,3], (7j, 7., ('oo', [7j,5j], [7.,5.], [2,1]), 'OO', False), 'dd', ('OO', 7j), [[7.,5.],[7.,5.]], 9),
+ ... ]
+ >>>
+ >>>
+ >>> record_arrays = [
+ ... np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('<')),
+ ... np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('<')),
+ ... np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('>')),
+ ... np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('>')),
+ ... ]
+
+Test the magic string writing.
+
+ >>> format.magic(1, 0)
+ '\x93NUMPY\x01\x00'
+ >>> format.magic(0, 0)
+ '\x93NUMPY\x00\x00'
+ >>> format.magic(255, 255)
+ '\x93NUMPY\xff\xff'
+ >>> format.magic(2, 5)
+ '\x93NUMPY\x02\x05'
+
+Test the magic string reading.
+
+ >>> format.read_magic(BytesIO(format.magic(1, 0)))
+ (1, 0)
+ >>> format.read_magic(BytesIO(format.magic(0, 0)))
+ (0, 0)
+ >>> format.read_magic(BytesIO(format.magic(255, 255)))
+ (255, 255)
+ >>> format.read_magic(BytesIO(format.magic(2, 5)))
+ (2, 5)
+
+Test the header writing.
+
+ >>> for arr in basic_arrays + record_arrays:
+ ... f = BytesIO()
+ ... format.write_array_header_1_0(f, arr) # XXX: arr is not a dict, items gets called on it
+ ... print(repr(f.getvalue()))
+ ...
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<u2', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>u2', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<i2', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>i2', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<u4', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>u4', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<i4', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>i4', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<u8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>u8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<i8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>i8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<f4', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>f4', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<f8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>f8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<c8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>c8', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '<c16', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': '>c16', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (0,)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': ()} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (15,)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (3, 5)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': True, 'shape': (5, 3)} \n"
+ "F\x00{'descr': 'O', 'fortran_order': False, 'shape': (3, 3)} \n"
+ "v\x00{'descr': [('x', '<i4', (2,)), ('y', '<f8', (2, 2)), ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n"
+ "\x16\x02{'descr': [('x', '<i4', (2,)),\n ('Info',\n [('value', '<c16'),\n ('y2', '<f8'),\n ('Info2',\n [('name', '|S2'),\n ('value', '<c16', (2,)),\n ('y3', '<f8', (2,)),\n ('z3', '<u4', (2,))]),\n ('name', '|S2'),\n ('z2', '|b1')]),\n ('color', '|S2'),\n ('info', [('Name', '<U8'), ('Value', '<c16')]),\n ('y', '<f8', (2, 2)),\n ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n"
+ "v\x00{'descr': [('x', '>i4', (2,)), ('y', '>f8', (2, 2)), ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n"
+ "\x16\x02{'descr': [('x', '>i4', (2,)),\n ('Info',\n [('value', '>c16'),\n ('y2', '>f8'),\n ('Info2',\n [('name', '|S2'),\n ('value', '>c16', (2,)),\n ('y3', '>f8', (2,)),\n ('z3', '>u4', (2,))]),\n ('name', '|S2'),\n ('z2', '|b1')]),\n ('color', '|S2'),\n ('info', [('Name', '>U8'), ('Value', '>c16')]),\n ('y', '>f8', (2, 2)),\n ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n"
+'''
+import sys
+import os
+import shutil
+import tempfile
+import warnings
+import pytest
+from io import BytesIO
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_array_equal, assert_raises, assert_raises_regex,
+ )
+from numpy.lib import format
+
+
+tempdir = None
+
+# Module-level setup.
+
+
+def setup_module():
+ global tempdir
+ tempdir = tempfile.mkdtemp()
+
+
+def teardown_module():
+ global tempdir
+ if tempdir is not None and os.path.isdir(tempdir):
+ shutil.rmtree(tempdir)
+ tempdir = None
+
+
+# Generate some basic arrays to test with.
+scalars = [
+ np.uint8,
+ np.int8,
+ np.uint16,
+ np.int16,
+ np.uint32,
+ np.int32,
+ np.uint64,
+ np.int64,
+ np.float32,
+ np.float64,
+ np.complex64,
+ np.complex128,
+ object,
+]
+basic_arrays = []
+for scalar in scalars:
+ for endian in '<>':
+ dtype = np.dtype(scalar).newbyteorder(endian)
+ basic = np.arange(1500).astype(dtype)
+ basic_arrays.extend([
+ # Empty
+ np.array([], dtype=dtype),
+ # Rank-0
+ np.array(10, dtype=dtype),
+ # 1-D
+ basic,
+ # 2-D C-contiguous
+ basic.reshape((30, 50)),
+ # 2-D F-contiguous
+ basic.reshape((30, 50)).T,
+ # 2-D non-contiguous
+ basic.reshape((30, 50))[::-1, ::2],
+ ])
+
+# More complicated record arrays.
+# This is the structure of the table used for plain objects:
+#
+# +-+-+-+
+# |x|y|z|
+# +-+-+-+
+
+# Structure of a plain array description:
+Pdescr = [
+ ('x', 'i4', (2,)),
+ ('y', 'f8', (2, 2)),
+ ('z', 'u1')]
+
+# A plain list of tuples with values for testing:
+PbufferT = [
+ # x y z
+ ([3, 2], [[6., 4.], [6., 4.]], 8),
+ ([4, 3], [[7., 5.], [7., 5.]], 9),
+ ]
+
+
+# This is the structure of the table used for nested objects (DON'T PANIC!):
+#
+# +-+---------------------------------+-----+----------+-+-+
+# |x|Info |color|info |y|z|
+# | +-----+--+----------------+----+--+ +----+-----+ | |
+# | |value|y2|Info2 |name|z2| |Name|Value| | |
+# | | | +----+-----+--+--+ | | | | | | |
+# | | | |name|value|y3|z3| | | | | | | |
+# +-+-----+--+----+-----+--+--+----+--+-----+----+-----+-+-+
+#
+
+# The corresponding nested array description:
+Ndescr = [
+ ('x', 'i4', (2,)),
+ ('Info', [
+ ('value', 'c16'),
+ ('y2', 'f8'),
+ ('Info2', [
+ ('name', 'S2'),
+ ('value', 'c16', (2,)),
+ ('y3', 'f8', (2,)),
+ ('z3', 'u4', (2,))]),
+ ('name', 'S2'),
+ ('z2', 'b1')]),
+ ('color', 'S2'),
+ ('info', [
+ ('Name', 'U8'),
+ ('Value', 'c16')]),
+ ('y', 'f8', (2, 2)),
+ ('z', 'u1')]
+
+NbufferT = [
+ # x Info color info y z
+ # value y2 Info2 name z2 Name Value
+ # name value y3 z3
+ ([3, 2], (6j, 6., ('nn', [6j, 4j], [6., 4.], [1, 2]), 'NN', True),
+ 'cc', ('NN', 6j), [[6., 4.], [6., 4.]], 8),
+ ([4, 3], (7j, 7., ('oo', [7j, 5j], [7., 5.], [2, 1]), 'OO', False),
+ 'dd', ('OO', 7j), [[7., 5.], [7., 5.]], 9),
+ ]
+
+record_arrays = [
+ np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('<')),
+ np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('<')),
+ np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('>')),
+ np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('>')),
+ np.zeros(1, dtype=[('c', ('<f8', (5,)), (2,))])
+]
+
+
+# BytesIO that reads a random number of bytes at a time
+class BytesIOSRandomSize(BytesIO):
+ def read(self, size=None):
+ import random
+ size = random.randint(1, size)
+ return super(BytesIOSRandomSize, self).read(size)
+
+
+def roundtrip(arr):
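+    # Serialize ``arr`` to an in-memory .npy stream and read it back.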
+ f = BytesIO()
+ format.write_array(f, arr)
+ f2 = BytesIO(f.getvalue())
+ arr2 = format.read_array(f2, allow_pickle=True)
+ return arr2
+
+
+def roundtrip_randsize(arr):
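+    # Like roundtrip, but the reader receives the bytes in random-sized chunks.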
+ f = BytesIO()
+ format.write_array(f, arr)
+ f2 = BytesIOSRandomSize(f.getvalue())
+ arr2 = format.read_array(f2)
+ return arr2
+
+
+def roundtrip_truncated(arr):
+ f = BytesIO()
+ format.write_array(f, arr)
+    # BytesIO is one byte short
+ f2 = BytesIO(f.getvalue()[0:-1])
+ arr2 = format.read_array(f2)
+ return arr2
+
+
+def assert_equal_(o1, o2):
+ assert_(o1 == o2)
+
+
+def test_roundtrip():
+ for arr in basic_arrays + record_arrays:
+ arr2 = roundtrip(arr)
+ assert_array_equal(arr, arr2)
+
+
+def test_roundtrip_randsize():
+ for arr in basic_arrays + record_arrays:
+ if arr.dtype != object:
+ arr2 = roundtrip_randsize(arr)
+ assert_array_equal(arr, arr2)
+
+
+def test_roundtrip_truncated():
+ for arr in basic_arrays:
+ if arr.dtype != object:
+ assert_raises(ValueError, roundtrip_truncated, arr)
+
+
+def test_long_str():
+ # check items larger than internal buffer size, gh-4027
+ long_str_arr = np.ones(1, dtype=np.dtype((str, format.BUFFER_SIZE + 1)))
+ long_str_arr2 = roundtrip(long_str_arr)
+ assert_array_equal(long_str_arr, long_str_arr2)
+
+
+def test_memmap_roundtrip():
+ # Fixme: used to crash on windows
+ if not (sys.platform == 'win32' or sys.platform == 'cygwin'):
+ for arr in basic_arrays + record_arrays:
+ if arr.dtype.hasobject:
+ # Skip these since they can't be mmap'ed.
+ continue
+ # Write it out normally and through mmap.
+ nfn = os.path.join(tempdir, 'normal.npy')
+ mfn = os.path.join(tempdir, 'memmap.npy')
+ fp = open(nfn, 'wb')
+ try:
+ format.write_array(fp, arr)
+ finally:
+ fp.close()
+
+ fortran_order = (
+ arr.flags.f_contiguous and not arr.flags.c_contiguous)
+ ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
+ shape=arr.shape, fortran_order=fortran_order)
+ ma[...] = arr
+ del ma
+
+ # Check that both of these files' contents are the same.
+ fp = open(nfn, 'rb')
+ normal_bytes = fp.read()
+ fp.close()
+ fp = open(mfn, 'rb')
+ memmap_bytes = fp.read()
+ fp.close()
+ assert_equal_(normal_bytes, memmap_bytes)
+
+ # Check that reading the file using memmap works.
+ ma = format.open_memmap(nfn, mode='r')
+ del ma
+
+
+def test_compressed_roundtrip():
+ arr = np.random.rand(200, 200)
+ npz_file = os.path.join(tempdir, 'compressed.npz')
+ np.savez_compressed(npz_file, arr=arr)
+ arr1 = np.load(npz_file)['arr']
+ assert_array_equal(arr, arr1)
+
+
+# aligned
+dt1 = np.dtype('i1, i4, i1', align=True)
+# non-aligned, explicit offsets
+dt2 = np.dtype({'names': ['a', 'b'], 'formats': ['i4', 'i4'],
+ 'offsets': [1, 6]})
+# nested struct-in-struct
+dt3 = np.dtype({'names': ['c', 'd'], 'formats': ['i4', dt2]})
+# field with '' name
+dt4 = np.dtype({'names': ['a', '', 'b'], 'formats': ['i4']*3})
+# titles
+dt5 = np.dtype({'names': ['a', 'b'], 'formats': ['i4', 'i4'],
+ 'offsets': [1, 6], 'titles': ['aa', 'bb']})
+
[email protected]("dt", [dt1, dt2, dt3, dt4, dt5])
+def test_load_padded_dtype(dt):
+ arr = np.zeros(3, dt)
+ for i in range(3):
+ arr[i] = i + 5
+ npz_file = os.path.join(tempdir, 'aligned.npz')
+ np.savez(npz_file, arr=arr)
+ arr1 = np.load(npz_file)['arr']
+ assert_array_equal(arr, arr1)
+
+
+def test_python2_python3_interoperability():
+ if sys.version_info[0] >= 3:
+ fname = 'win64python2.npy'
+ else:
+ fname = 'python3.npy'
+ path = os.path.join(os.path.dirname(__file__), 'data', fname)
+ data = np.load(path)
+ assert_array_equal(data, np.ones(2))
+
+def test_pickle_python2_python3():
+ # Test that loading object arrays saved on Python 2 works both on
+ # Python 2 and Python 3 and vice versa
+ data_dir = os.path.join(os.path.dirname(__file__), 'data')
+
+ if sys.version_info[0] >= 3:
+ xrange = range
+ else:
+ import __builtin__
+ xrange = __builtin__.xrange
+
+ expected = np.array([None, xrange, u'\u512a\u826f',
+ b'\xe4\xb8\x8d\xe8\x89\xaf'],
+ dtype=object)
+
+ for fname in ['py2-objarr.npy', 'py2-objarr.npz',
+ 'py3-objarr.npy', 'py3-objarr.npz']:
+ path = os.path.join(data_dir, fname)
+
+ for encoding in ['bytes', 'latin1']:
+ data_f = np.load(path, allow_pickle=True, encoding=encoding)
+ if fname.endswith('.npz'):
+ data = data_f['x']
+ data_f.close()
+ else:
+ data = data_f
+
+ if sys.version_info[0] >= 3:
+ if encoding == 'latin1' and fname.startswith('py2'):
+ assert_(isinstance(data[3], str))
+ assert_array_equal(data[:-1], expected[:-1])
+ # mojibake occurs
+ assert_array_equal(data[-1].encode(encoding), expected[-1])
+ else:
+ assert_(isinstance(data[3], bytes))
+ assert_array_equal(data, expected)
+ else:
+ assert_array_equal(data, expected)
+
+ if sys.version_info[0] >= 3:
+ if fname.startswith('py2'):
+ if fname.endswith('.npz'):
+ data = np.load(path, allow_pickle=True)
+ assert_raises(UnicodeError, data.__getitem__, 'x')
+ data.close()
+ data = np.load(path, allow_pickle=True, fix_imports=False,
+ encoding='latin1')
+ assert_raises(ImportError, data.__getitem__, 'x')
+ data.close()
+ else:
+ assert_raises(UnicodeError, np.load, path,
+ allow_pickle=True)
+ assert_raises(ImportError, np.load, path,
+ allow_pickle=True, fix_imports=False,
+ encoding='latin1')
+
+
+def test_pickle_disallow():
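+    # Loading or saving object arrays must fail when allow_pickle=False.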
+ data_dir = os.path.join(os.path.dirname(__file__), 'data')
+
+ path = os.path.join(data_dir, 'py2-objarr.npy')
+ assert_raises(ValueError, np.load, path,
+ allow_pickle=False, encoding='latin1')
+
+ path = os.path.join(data_dir, 'py2-objarr.npz')
+ f = np.load(path, allow_pickle=False, encoding='latin1')
+ assert_raises(ValueError, f.__getitem__, 'x')
+
+ path = os.path.join(tempdir, 'pickle-disabled.npy')
+ assert_raises(ValueError, np.save, path, np.array([None], dtype=object),
+ allow_pickle=False)
+
[email protected]('dt', [
+    np.dtype(np.dtype([('a', np.int8),
+ ('b', np.int16),
+ ('c', np.int32),
+ ], align=True),
+ (3,)),
+ np.dtype([('x', np.dtype({'names':['a','b'],
+ 'formats':['i1','i1'],
+ 'offsets':[0,4],
+ 'itemsize':8,
+ },
+ (3,)),
+ (4,),
+ )]),
+ np.dtype([('x',
+ ('<f8', (5,)),
+ (2,),
+ )]),
+ np.dtype([('x', np.dtype((
+ np.dtype((
+ np.dtype({'names':['a','b'],
+ 'formats':['i1','i1'],
+ 'offsets':[0,4],
+ 'itemsize':8}),
+ (3,)
+ )),
+ (4,)
+ )))
+ ]),
+ np.dtype([
+ ('a', np.dtype((
+ np.dtype((
+ np.dtype((
+ np.dtype([
+ ('a', int),
+ ('b', np.dtype({'names':['a','b'],
+ 'formats':['i1','i1'],
+ 'offsets':[0,4],
+ 'itemsize':8})),
+ ]),
+ (3,),
+ )),
+ (4,),
+ )),
+ (5,),
+ )))
+ ]),
+ ])
+
+def test_descr_to_dtype(dt):
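+    # descr_to_dtype should reconstruct each dtype above, and arrays using it
+    # should survive a write/read round trip.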
+ dt1 = format.descr_to_dtype(dt.descr)
+ assert_equal_(dt1, dt)
+ arr1 = np.zeros(3, dt)
+ arr2 = roundtrip(arr1)
+ assert_array_equal(arr1, arr2)
+
+def test_version_2_0():
+ f = BytesIO()
+    # the header exceeds the 64 KiB limit of format 1.0's 2-byte length field
+ dt = [(("%d" % i) * 100, float) for i in range(500)]
+ d = np.ones(1000, dtype=dt)
+
+ format.write_array(f, d, version=(2, 0))
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', UserWarning)
+ format.write_array(f, d)
+ assert_(w[0].category is UserWarning)
+
+ # check alignment of data portion
+ f.seek(0)
+ header = f.readline()
+ assert_(len(header) % format.ARRAY_ALIGN == 0)
+
+ f.seek(0)
+ n = format.read_array(f)
+ assert_array_equal(d, n)
+
+ # 1.0 requested but data cannot be saved this way
+ assert_raises(ValueError, format.write_array, f, d, (1, 0))
+
+
+def test_version_2_0_memmap():
+    # the header exceeds the 64 KiB limit of format 1.0's 2-byte length field
+ dt = [(("%d" % i) * 100, float) for i in range(500)]
+ d = np.ones(1000, dtype=dt)
+ tf = tempfile.mktemp('', 'mmap', dir=tempdir)
+
+ # 1.0 requested but data cannot be saved this way
+ assert_raises(ValueError, format.open_memmap, tf, mode='w+', dtype=d.dtype,
+ shape=d.shape, version=(1, 0))
+
+ ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+ shape=d.shape, version=(2, 0))
+ ma[...] = d
+ del ma
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', UserWarning)
+ ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+ shape=d.shape, version=None)
+ assert_(w[0].category is UserWarning)
+ ma[...] = d
+ del ma
+
+ ma = format.open_memmap(tf, mode='r')
+ assert_array_equal(ma, d)
+
+
+def test_write_version():
+ f = BytesIO()
+ arr = np.arange(1)
+ # These should pass.
+ format.write_array(f, arr, version=(1, 0))
+ format.write_array(f, arr)
+
+ format.write_array(f, arr, version=None)
+ format.write_array(f, arr)
+
+ format.write_array(f, arr, version=(2, 0))
+ format.write_array(f, arr)
+
+ # These should all fail.
+ bad_versions = [
+ (1, 1),
+ (0, 0),
+ (0, 1),
+ (2, 2),
+ (255, 255),
+ ]
+ for version in bad_versions:
+ with assert_raises_regex(ValueError,
+ 'we only support format version.*'):
+ format.write_array(f, arr, version=version)
+
+
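+# Magic strings with unexpected version numbers.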
+bad_version_magic = [
+ b'\x93NUMPY\x01\x01',
+ b'\x93NUMPY\x00\x00',
+ b'\x93NUMPY\x00\x01',
+ b'\x93NUMPY\x02\x00',
+ b'\x93NUMPY\x02\x02',
+ b'\x93NUMPY\xff\xff',
+]
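+# Byte strings that are not valid .npy magic at all.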
+malformed_magic = [
+ b'\x92NUMPY\x01\x00',
+ b'\x00NUMPY\x01\x00',
+ b'\x93numpy\x01\x00',
+ b'\x93MATLB\x01\x00',
+ b'\x93NUMPY\x01',
+ b'\x93NUMPY',
+ b'',
+]
+
+def test_read_magic():
+ s1 = BytesIO()
+ s2 = BytesIO()
+
+ arr = np.ones((3, 6), dtype=float)
+
+ format.write_array(s1, arr, version=(1, 0))
+ format.write_array(s2, arr, version=(2, 0))
+
+ s1.seek(0)
+ s2.seek(0)
+
+ version1 = format.read_magic(s1)
+ version2 = format.read_magic(s2)
+
+ assert_(version1 == (1, 0))
+ assert_(version2 == (2, 0))
+
+ assert_(s1.tell() == format.MAGIC_LEN)
+ assert_(s2.tell() == format.MAGIC_LEN)
+
+def test_read_magic_bad_magic():
+ for magic in malformed_magic:
+ f = BytesIO(magic)
+ assert_raises(ValueError, format.read_array, f)
+
+
+def test_read_version_1_0_bad_magic():
+ for magic in bad_version_magic + malformed_magic:
+ f = BytesIO(magic)
+ assert_raises(ValueError, format.read_array, f)
+
+
+def test_bad_magic_args():
+ assert_raises(ValueError, format.magic, -1, 1)
+ assert_raises(ValueError, format.magic, 256, 1)
+ assert_raises(ValueError, format.magic, 1, -1)
+ assert_raises(ValueError, format.magic, 1, 256)
+
+
+def test_large_header():
+ s = BytesIO()
+ d = {'a': 1, 'b': 2}
+ format.write_array_header_1_0(s, d)
+
+ s = BytesIO()
+ d = {'a': 1, 'b': 2, 'c': 'x'*256*256}
+ assert_raises(ValueError, format.write_array_header_1_0, s, d)
+
+
+def test_read_array_header_1_0():
+ s = BytesIO()
+
+ arr = np.ones((3, 6), dtype=float)
+ format.write_array(s, arr, version=(1, 0))
+
+ s.seek(format.MAGIC_LEN)
+ shape, fortran, dtype = format.read_array_header_1_0(s)
+
+ assert_(s.tell() % format.ARRAY_ALIGN == 0)
+ assert_((shape, fortran, dtype) == ((3, 6), False, float))
+
+
+def test_read_array_header_2_0():
+ s = BytesIO()
+
+ arr = np.ones((3, 6), dtype=float)
+ format.write_array(s, arr, version=(2, 0))
+
+ s.seek(format.MAGIC_LEN)
+ shape, fortran, dtype = format.read_array_header_2_0(s)
+
+ assert_(s.tell() % format.ARRAY_ALIGN == 0)
+ assert_((shape, fortran, dtype) == ((3, 6), False, float))
+
+
+def test_bad_header():
+ # header of length less than 2 should fail
+ s = BytesIO()
+ assert_raises(ValueError, format.read_array_header_1_0, s)
+ s = BytesIO(b'1')
+ assert_raises(ValueError, format.read_array_header_1_0, s)
+
+ # header shorter than indicated size should fail
+ s = BytesIO(b'\x01\x00')
+ assert_raises(ValueError, format.read_array_header_1_0, s)
+
+ # headers without the exact keys required should fail
+ d = {"shape": (1, 2),
+ "descr": "x"}
+ s = BytesIO()
+ format.write_array_header_1_0(s, d)
+ assert_raises(ValueError, format.read_array_header_1_0, s)
+
+ d = {"shape": (1, 2),
+ "fortran_order": False,
+ "descr": "x",
+ "extrakey": -1}
+ s = BytesIO()
+ format.write_array_header_1_0(s, d)
+ assert_raises(ValueError, format.read_array_header_1_0, s)
+
+
+def test_large_file_support():
+ if (sys.platform == 'win32' or sys.platform == 'cygwin'):
+ pytest.skip("Unknown if Windows has sparse filesystems")
+ # try creating a large sparse file
+ tf_name = os.path.join(tempdir, 'sparse_file')
+ try:
+ # seek past end would work too, but linux truncate somewhat
+ # increases the chances that we have a sparse filesystem and can
+ # avoid actually writing 5GB
+ import subprocess as sp
+ sp.check_call(["truncate", "-s", "5368709120", tf_name])
+ except Exception:
+ pytest.skip("Could not create 5GB large file")
+ # write a small array to the end
+ with open(tf_name, "wb") as f:
+ f.seek(5368709120)
+ d = np.arange(5)
+ np.save(f, d)
+ # read it back
+ with open(tf_name, "rb") as f:
+ f.seek(5368709120)
+ r = np.load(f)
+ assert_array_equal(r, d)
+
+
[email protected](np.dtype(np.intp).itemsize < 8,
+ reason="test requires 64-bit system")
+def test_large_archive():
+    # Regression test for saving arrays whose dimensions have a product that
+    # doesn't fit in an int32. See gh-7598 for details.
+ try:
+ a = np.empty((2**30, 2), dtype=np.uint8)
+ except MemoryError:
+ pytest.skip("Could not create large file")
+
+ fname = os.path.join(tempdir, "large_archive")
+
+ with open(fname, "wb") as f:
+ np.savez(f, arr=a)
+
+ with open(fname, "rb") as f:
+ new_a = np.load(f)["arr"]
+
+ assert_(a.shape == new_a.shape)
+
+
+def test_empty_npz():
+ # Test for gh-9989
+ fname = os.path.join(tempdir, "nothing.npz")
+ np.savez(fname)
+ np.load(fname)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_function_base.py b/contrib/python/numpy/py2/numpy/lib/tests/test_function_base.py
new file mode 100644
index 00000000000..088ca2baebf
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_function_base.py
@@ -0,0 +1,3141 @@
+from __future__ import division, absolute_import, print_function
+
+import operator
+import warnings
+import sys
+import decimal
+import pytest
+
+import numpy as np
+from numpy import ma
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_almost_equal,
+ assert_array_almost_equal, assert_raises, assert_allclose, IS_PYPY,
+ assert_warns, assert_raises_regex, suppress_warnings, HAS_REFCOUNT,
+ )
+import numpy.lib.function_base as nfb
+from numpy.random import rand
+from numpy.lib import (
+ add_newdoc_ufunc, angle, average, bartlett, blackman, corrcoef, cov,
+ delete, diff, digitize, extract, flipud, gradient, hamming, hanning,
+ i0, insert, interp, kaiser, meshgrid, msort, piecewise, place, rot90,
+ select, setxor1d, sinc, trapz, trim_zeros, unwrap, unique, vectorize
+ )
+
+from numpy.compat import long
+
+
+def get_mat(n):
+ data = np.arange(n)
+ data = np.add.outer(data, data)
+ return data
+
+
+class TestRot90(object):
+ def test_basic(self):
+ assert_raises(ValueError, rot90, np.ones(4))
+ assert_raises(ValueError, rot90, np.ones((2,2,2)), axes=(0,1,2))
+ assert_raises(ValueError, rot90, np.ones((2,2)), axes=(0,2))
+ assert_raises(ValueError, rot90, np.ones((2,2)), axes=(1,1))
+ assert_raises(ValueError, rot90, np.ones((2,2,2)), axes=(-2,1))
+
+ a = [[0, 1, 2],
+ [3, 4, 5]]
+ b1 = [[2, 5],
+ [1, 4],
+ [0, 3]]
+ b2 = [[5, 4, 3],
+ [2, 1, 0]]
+ b3 = [[3, 0],
+ [4, 1],
+ [5, 2]]
+ b4 = [[0, 1, 2],
+ [3, 4, 5]]
+
+ for k in range(-3, 13, 4):
+ assert_equal(rot90(a, k=k), b1)
+ for k in range(-2, 13, 4):
+ assert_equal(rot90(a, k=k), b2)
+ for k in range(-1, 13, 4):
+ assert_equal(rot90(a, k=k), b3)
+ for k in range(0, 13, 4):
+ assert_equal(rot90(a, k=k), b4)
+
+ assert_equal(rot90(rot90(a, axes=(0,1)), axes=(1,0)), a)
+ assert_equal(rot90(a, k=1, axes=(1,0)), rot90(a, k=-1, axes=(0,1)))
+
+ def test_axes(self):
+ a = np.ones((50, 40, 3))
+ assert_equal(rot90(a).shape, (40, 50, 3))
+ assert_equal(rot90(a, axes=(0,2)), rot90(a, axes=(0,-1)))
+ assert_equal(rot90(a, axes=(1,2)), rot90(a, axes=(-2,-1)))
+
+ def test_rotation_axes(self):
+ a = np.arange(8).reshape((2,2,2))
+
+ a_rot90_01 = [[[2, 3],
+ [6, 7]],
+ [[0, 1],
+ [4, 5]]]
+ a_rot90_12 = [[[1, 3],
+ [0, 2]],
+ [[5, 7],
+ [4, 6]]]
+ a_rot90_20 = [[[4, 0],
+ [6, 2]],
+ [[5, 1],
+ [7, 3]]]
+ a_rot90_10 = [[[4, 5],
+ [0, 1]],
+ [[6, 7],
+ [2, 3]]]
+
+ assert_equal(rot90(a, axes=(0, 1)), a_rot90_01)
+ assert_equal(rot90(a, axes=(1, 0)), a_rot90_10)
+ assert_equal(rot90(a, axes=(1, 2)), a_rot90_12)
+
+ for k in range(1,5):
+ assert_equal(rot90(a, k=k, axes=(2, 0)),
+ rot90(a_rot90_20, k=k-1, axes=(2, 0)))
+
+
+class TestFlip(object):
+
+ def test_axes(self):
+ assert_raises(np.AxisError, np.flip, np.ones(4), axis=1)
+ assert_raises(np.AxisError, np.flip, np.ones((4, 4)), axis=2)
+ assert_raises(np.AxisError, np.flip, np.ones((4, 4)), axis=-3)
+ assert_raises(np.AxisError, np.flip, np.ones((4, 4)), axis=(0, 3))
+
+ def test_basic_lr(self):
+ a = get_mat(4)
+ b = a[:, ::-1]
+ assert_equal(np.flip(a, 1), b)
+ a = [[0, 1, 2],
+ [3, 4, 5]]
+ b = [[2, 1, 0],
+ [5, 4, 3]]
+ assert_equal(np.flip(a, 1), b)
+
+ def test_basic_ud(self):
+ a = get_mat(4)
+ b = a[::-1, :]
+ assert_equal(np.flip(a, 0), b)
+ a = [[0, 1, 2],
+ [3, 4, 5]]
+ b = [[3, 4, 5],
+ [0, 1, 2]]
+ assert_equal(np.flip(a, 0), b)
+
+ def test_3d_swap_axis0(self):
+ a = np.array([[[0, 1],
+ [2, 3]],
+ [[4, 5],
+ [6, 7]]])
+
+ b = np.array([[[4, 5],
+ [6, 7]],
+ [[0, 1],
+ [2, 3]]])
+
+ assert_equal(np.flip(a, 0), b)
+
+ def test_3d_swap_axis1(self):
+ a = np.array([[[0, 1],
+ [2, 3]],
+ [[4, 5],
+ [6, 7]]])
+
+ b = np.array([[[2, 3],
+ [0, 1]],
+ [[6, 7],
+ [4, 5]]])
+
+ assert_equal(np.flip(a, 1), b)
+
+ def test_3d_swap_axis2(self):
+ a = np.array([[[0, 1],
+ [2, 3]],
+ [[4, 5],
+ [6, 7]]])
+
+ b = np.array([[[1, 0],
+ [3, 2]],
+ [[5, 4],
+ [7, 6]]])
+
+ assert_equal(np.flip(a, 2), b)
+
+ def test_4d(self):
+ a = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
+ for i in range(a.ndim):
+ assert_equal(np.flip(a, i),
+ np.flipud(a.swapaxes(0, i)).swapaxes(i, 0))
+
+ def test_default_axis(self):
+ a = np.array([[1, 2, 3],
+ [4, 5, 6]])
+ b = np.array([[6, 5, 4],
+ [3, 2, 1]])
+ assert_equal(np.flip(a), b)
+
+ def test_multiple_axes(self):
+ a = np.array([[[0, 1],
+ [2, 3]],
+ [[4, 5],
+ [6, 7]]])
+
+ assert_equal(np.flip(a, axis=()), a)
+
+ b = np.array([[[5, 4],
+ [7, 6]],
+ [[1, 0],
+ [3, 2]]])
+
+ assert_equal(np.flip(a, axis=(0, 2)), b)
+
+ c = np.array([[[3, 2],
+ [1, 0]],
+ [[7, 6],
+ [5, 4]]])
+
+ assert_equal(np.flip(a, axis=(1, 2)), c)
+
+
+class TestAny(object):
+
+ def test_basic(self):
+ y1 = [0, 0, 1, 0]
+ y2 = [0, 0, 0, 0]
+ y3 = [1, 0, 1, 0]
+ assert_(np.any(y1))
+ assert_(np.any(y3))
+ assert_(not np.any(y2))
+
+ def test_nd(self):
+ y1 = [[0, 0, 0], [0, 1, 0], [1, 1, 0]]
+ assert_(np.any(y1))
+ assert_array_equal(np.sometrue(y1, axis=0), [1, 1, 0])
+ assert_array_equal(np.sometrue(y1, axis=1), [0, 1, 1])
+
+
+class TestAll(object):
+
+ def test_basic(self):
+ y1 = [0, 1, 1, 0]
+ y2 = [0, 0, 0, 0]
+ y3 = [1, 1, 1, 1]
+ assert_(not np.all(y1))
+ assert_(np.all(y3))
+ assert_(not np.all(y2))
+ assert_(np.all(~np.array(y2)))
+
+ def test_nd(self):
+ y1 = [[0, 0, 1], [0, 1, 1], [1, 1, 1]]
+ assert_(not np.all(y1))
+ assert_array_equal(np.alltrue(y1, axis=0), [0, 0, 1])
+ assert_array_equal(np.alltrue(y1, axis=1), [0, 0, 1])
+
+
+class TestCopy(object):
+
+ def test_basic(self):
+ a = np.array([[1, 2], [3, 4]])
+ a_copy = np.copy(a)
+ assert_array_equal(a, a_copy)
+ a_copy[0, 0] = 10
+ assert_equal(a[0, 0], 1)
+ assert_equal(a_copy[0, 0], 10)
+
+ def test_order(self):
+ # It turns out that people rely on np.copy() preserving order by
+ # default; changing this broke scikit-learn:
+ # github.com/scikit-learn/scikit-learn/commit/7842748cf777412c506a8c0ed28090711d3a3783 # noqa
+ a = np.array([[1, 2], [3, 4]])
+ assert_(a.flags.c_contiguous)
+ assert_(not a.flags.f_contiguous)
+ a_fort = np.array([[1, 2], [3, 4]], order="F")
+ assert_(not a_fort.flags.c_contiguous)
+ assert_(a_fort.flags.f_contiguous)
+ a_copy = np.copy(a)
+ assert_(a_copy.flags.c_contiguous)
+ assert_(not a_copy.flags.f_contiguous)
+ a_fort_copy = np.copy(a_fort)
+ assert_(not a_fort_copy.flags.c_contiguous)
+ assert_(a_fort_copy.flags.f_contiguous)
+
+
+class TestAverage(object):
+
+ def test_basic(self):
+ y1 = np.array([1, 2, 3])
+ assert_(average(y1, axis=0) == 2.)
+ y2 = np.array([1., 2., 3.])
+ assert_(average(y2, axis=0) == 2.)
+ y3 = [0., 0., 0.]
+ assert_(average(y3, axis=0) == 0.)
+
+ y4 = np.ones((4, 4))
+ y4[0, 1] = 0
+ y4[1, 0] = 2
+ assert_almost_equal(y4.mean(0), average(y4, 0))
+ assert_almost_equal(y4.mean(1), average(y4, 1))
+
+ y5 = rand(5, 5)
+ assert_almost_equal(y5.mean(0), average(y5, 0))
+ assert_almost_equal(y5.mean(1), average(y5, 1))
+
+ def test_weights(self):
+ y = np.arange(10)
+ w = np.arange(10)
+ actual = average(y, weights=w)
+ desired = (np.arange(10) ** 2).sum() * 1. / np.arange(10).sum()
+ assert_almost_equal(actual, desired)
+
+ y1 = np.array([[1, 2, 3], [4, 5, 6]])
+ w0 = [1, 2]
+ actual = average(y1, weights=w0, axis=0)
+ desired = np.array([3., 4., 5.])
+ assert_almost_equal(actual, desired)
+
+ w1 = [0, 0, 1]
+ actual = average(y1, weights=w1, axis=1)
+ desired = np.array([3., 6.])
+ assert_almost_equal(actual, desired)
+
+        # This should raise an error. Can we test for that?
+ # assert_equal(average(y1, weights=w1), 9./2.)
+
+ # 2D Case
+ w2 = [[0, 0, 1], [0, 0, 2]]
+ desired = np.array([3., 6.])
+ assert_array_equal(average(y1, weights=w2, axis=1), desired)
+ assert_equal(average(y1, weights=w2), 5.)
+
+ y3 = rand(5).astype(np.float32)
+ w3 = rand(5).astype(np.float64)
+
+ assert_(np.average(y3, weights=w3).dtype == np.result_type(y3, w3))
+
+ def test_returned(self):
+ y = np.array([[1, 2, 3], [4, 5, 6]])
+
+ # No weights
+ avg, scl = average(y, returned=True)
+ assert_equal(scl, 6.)
+
+ avg, scl = average(y, 0, returned=True)
+ assert_array_equal(scl, np.array([2., 2., 2.]))
+
+ avg, scl = average(y, 1, returned=True)
+ assert_array_equal(scl, np.array([3., 3.]))
+
+ # With weights
+ w0 = [1, 2]
+ avg, scl = average(y, weights=w0, axis=0, returned=True)
+ assert_array_equal(scl, np.array([3., 3., 3.]))
+
+ w1 = [1, 2, 3]
+ avg, scl = average(y, weights=w1, axis=1, returned=True)
+ assert_array_equal(scl, np.array([6., 6.]))
+
+ w2 = [[0, 0, 1], [1, 2, 3]]
+ avg, scl = average(y, weights=w2, axis=1, returned=True)
+ assert_array_equal(scl, np.array([1., 6.]))
+
+ def test_subclasses(self):
+ class subclass(np.ndarray):
+ pass
+ a = np.array([[1,2],[3,4]]).view(subclass)
+ w = np.array([[1,2],[3,4]]).view(subclass)
+
+ assert_equal(type(np.average(a)), subclass)
+ assert_equal(type(np.average(a, weights=w)), subclass)
+
+ def test_upcasting(self):
+ types = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
+ ('f4', 'f4', 'f4'), ('f4', 'f8', 'f8')]
+ for at, wt, rt in types:
+ a = np.array([[1,2],[3,4]], dtype=at)
+ w = np.array([[1,2],[3,4]], dtype=wt)
+ assert_equal(np.average(a, weights=w).dtype, np.dtype(rt))
+
+ def test_object_dtype(self):
+ a = np.array([decimal.Decimal(x) for x in range(10)])
+ w = np.array([decimal.Decimal(1) for _ in range(10)])
+ w /= w.sum()
+ assert_almost_equal(a.mean(0), average(a, weights=w))
+
+class TestSelect(object):
+ choices = [np.array([1, 2, 3]),
+ np.array([4, 5, 6]),
+ np.array([7, 8, 9])]
+ conditions = [np.array([False, False, False]),
+ np.array([False, True, False]),
+ np.array([False, False, True])]
+
+ def _select(self, cond, values, default=0):
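+        # Reference implementation: for each position, collect the value of every
+        # choice whose condition holds there, falling back to ``default`` if none do.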
+ output = []
+ for m in range(len(cond)):
+ output += [V[m] for V, C in zip(values, cond) if C[m]] or [default]
+ return output
+
+ def test_basic(self):
+ choices = self.choices
+ conditions = self.conditions
+ assert_array_equal(select(conditions, choices, default=15),
+ self._select(conditions, choices, default=15))
+
+ assert_equal(len(choices), 3)
+ assert_equal(len(conditions), 3)
+
+ def test_broadcasting(self):
+ conditions = [np.array(True), np.array([False, True, False])]
+ choices = [1, np.arange(12).reshape(4, 3)]
+ assert_array_equal(select(conditions, choices), np.ones((4, 3)))
+ # default can broadcast too:
+ assert_equal(select([True], [0], default=[0]).shape, (1,))
+
+ def test_return_dtype(self):
+ assert_equal(select(self.conditions, self.choices, 1j).dtype,
+ np.complex_)
+        # But the conditions need to be stronger than the scalar default
+        # if it is scalar.
+ choices = [choice.astype(np.int8) for choice in self.choices]
+ assert_equal(select(self.conditions, choices).dtype, np.int8)
+
+ d = np.array([1, 2, 3, np.nan, 5, 7])
+ m = np.isnan(d)
+ assert_equal(select([m], [d]), [0, 0, 0, np.nan, 0, 0])
+
+ def test_deprecated_empty(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("always")
+ assert_equal(select([], [], 3j), 3j)
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("always")
+ assert_warns(DeprecationWarning, select, [], [])
+ warnings.simplefilter("error")
+ assert_raises(DeprecationWarning, select, [], [])
+
+ def test_non_bool_deprecation(self):
+ choices = self.choices
+ conditions = self.conditions[:]
+ with warnings.catch_warnings():
+ warnings.filterwarnings("always")
+ conditions[0] = conditions[0].astype(np.int_)
+ assert_warns(DeprecationWarning, select, conditions, choices)
+ conditions[0] = conditions[0].astype(np.uint8)
+ assert_warns(DeprecationWarning, select, conditions, choices)
+ warnings.filterwarnings("error")
+ assert_raises(DeprecationWarning, select, conditions, choices)
+
+ def test_many_arguments(self):
+ # This used to be limited by NPY_MAXARGS == 32
+ conditions = [np.array([False])] * 100
+ choices = [np.array([1])] * 100
+ select(conditions, choices)
+
+
+class TestInsert(object):
+
+ def test_basic(self):
+ a = [1, 2, 3]
+ assert_equal(insert(a, 0, 1), [1, 1, 2, 3])
+ assert_equal(insert(a, 3, 1), [1, 2, 3, 1])
+ assert_equal(insert(a, [1, 1, 1], [1, 2, 3]), [1, 1, 2, 3, 2, 3])
+ assert_equal(insert(a, 1, [1, 2, 3]), [1, 1, 2, 3, 2, 3])
+ assert_equal(insert(a, [1, -1, 3], 9), [1, 9, 2, 9, 3, 9])
+ assert_equal(insert(a, slice(-1, None, -1), 9), [9, 1, 9, 2, 9, 3])
+ assert_equal(insert(a, [-1, 1, 3], [7, 8, 9]), [1, 8, 2, 7, 3, 9])
+ b = np.array([0, 1], dtype=np.float64)
+ assert_equal(insert(b, 0, b[0]), [0., 0., 1.])
+ assert_equal(insert(b, [], []), b)
+ # Bools will be treated differently in the future:
+ # assert_equal(insert(a, np.array([True]*4), 9), [9, 1, 9, 2, 9, 3, 9])
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', FutureWarning)
+ assert_equal(
+ insert(a, np.array([True] * 4), 9), [1, 9, 9, 9, 9, 2, 3])
+ assert_(w[0].category is FutureWarning)
+
+ def test_multidim(self):
+ a = [[1, 1, 1]]
+ r = [[2, 2, 2],
+ [1, 1, 1]]
+ assert_equal(insert(a, 0, [1]), [1, 1, 1, 1])
+ assert_equal(insert(a, 0, [2, 2, 2], axis=0), r)
+ assert_equal(insert(a, 0, 2, axis=0), r)
+ assert_equal(insert(a, 2, 2, axis=1), [[1, 1, 2, 1]])
+
+ a = np.array([[1, 1], [2, 2], [3, 3]])
+ b = np.arange(1, 4).repeat(3).reshape(3, 3)
+ c = np.concatenate(
+ (a[:, 0:1], np.arange(1, 4).repeat(3).reshape(3, 3).T,
+ a[:, 1:2]), axis=1)
+ assert_equal(insert(a, [1], [[1], [2], [3]], axis=1), b)
+ assert_equal(insert(a, [1], [1, 2, 3], axis=1), c)
+ # scalars behave differently, in this case exactly opposite:
+ assert_equal(insert(a, 1, [1, 2, 3], axis=1), b)
+ assert_equal(insert(a, 1, [[1], [2], [3]], axis=1), c)
+
+ a = np.arange(4).reshape(2, 2)
+ assert_equal(insert(a[:, :1], 1, a[:, 1], axis=1), a)
+ assert_equal(insert(a[:1,:], 1, a[1,:], axis=0), a)
+
+ # negative axis value
+ a = np.arange(24).reshape((2, 3, 4))
+ assert_equal(insert(a, 1, a[:,:, 3], axis=-1),
+ insert(a, 1, a[:,:, 3], axis=2))
+ assert_equal(insert(a, 1, a[:, 2,:], axis=-2),
+ insert(a, 1, a[:, 2,:], axis=1))
+
+ # invalid axis value
+ assert_raises(np.AxisError, insert, a, 1, a[:, 2, :], axis=3)
+ assert_raises(np.AxisError, insert, a, 1, a[:, 2, :], axis=-4)
+
+ # negative axis value
+ a = np.arange(24).reshape((2, 3, 4))
+ assert_equal(insert(a, 1, a[:, :, 3], axis=-1),
+ insert(a, 1, a[:, :, 3], axis=2))
+ assert_equal(insert(a, 1, a[:, 2, :], axis=-2),
+ insert(a, 1, a[:, 2, :], axis=1))
+
+ def test_0d(self):
+ # This is an error in the future
+ a = np.array(1)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', DeprecationWarning)
+ assert_equal(insert(a, [], 2, axis=0), np.array(2))
+ assert_(w[0].category is DeprecationWarning)
+
+ def test_subclass(self):
+ class SubClass(np.ndarray):
+ pass
+ a = np.arange(10).view(SubClass)
+ assert_(isinstance(np.insert(a, 0, [0]), SubClass))
+ assert_(isinstance(np.insert(a, [], []), SubClass))
+ assert_(isinstance(np.insert(a, [0, 1], [1, 2]), SubClass))
+ assert_(isinstance(np.insert(a, slice(1, 2), [1, 2]), SubClass))
+ assert_(isinstance(np.insert(a, slice(1, -2, -1), []), SubClass))
+ # This is an error in the future:
+ a = np.array(1).view(SubClass)
+ assert_(isinstance(np.insert(a, 0, [0]), SubClass))
+
+ def test_index_array_copied(self):
+ x = np.array([1, 1, 1])
+ np.insert([0, 1, 2], x, [3, 4, 5])
+ assert_equal(x, np.array([1, 1, 1]))
+
+ def test_structured_array(self):
+ a = np.array([(1, 'a'), (2, 'b'), (3, 'c')],
+ dtype=[('foo', 'i'), ('bar', 'a1')])
+ val = (4, 'd')
+ b = np.insert(a, 0, val)
+ assert_array_equal(b[0], np.array(val, dtype=b.dtype))
+ val = [(4, 'd')] * 2
+ b = np.insert(a, [0, 2], val)
+ assert_array_equal(b[[0, 3]], np.array(val, dtype=b.dtype))
+
+
+class TestAmax(object):
+
+ def test_basic(self):
+ a = [3, 4, 5, 10, -3, -5, 6.0]
+ assert_equal(np.amax(a), 10.0)
+ b = [[3, 6.0, 9.0],
+ [4, 10.0, 5.0],
+ [8, 3.0, 2.0]]
+ assert_equal(np.amax(b, axis=0), [8.0, 10.0, 9.0])
+ assert_equal(np.amax(b, axis=1), [9.0, 10.0, 8.0])
+
+
+class TestAmin(object):
+
+ def test_basic(self):
+ a = [3, 4, 5, 10, -3, -5, 6.0]
+ assert_equal(np.amin(a), -5.0)
+ b = [[3, 6.0, 9.0],
+ [4, 10.0, 5.0],
+ [8, 3.0, 2.0]]
+ assert_equal(np.amin(b, axis=0), [3.0, 3.0, 2.0])
+ assert_equal(np.amin(b, axis=1), [3.0, 4.0, 2.0])
+
+
+class TestPtp(object):
+
+ def test_basic(self):
+ a = np.array([3, 4, 5, 10, -3, -5, 6.0])
+ assert_equal(a.ptp(axis=0), 15.0)
+ b = np.array([[3, 6.0, 9.0],
+ [4, 10.0, 5.0],
+ [8, 3.0, 2.0]])
+ assert_equal(b.ptp(axis=0), [5.0, 7.0, 7.0])
+ assert_equal(b.ptp(axis=-1), [6.0, 6.0, 6.0])
+
+ assert_equal(b.ptp(axis=0, keepdims=True), [[5.0, 7.0, 7.0]])
+ assert_equal(b.ptp(axis=(0,1), keepdims=True), [[8.0]])
+
+
+class TestCumsum(object):
+
+ def test_basic(self):
+ ba = [1, 2, 10, 11, 6, 5, 4]
+ ba2 = [[1, 2, 3, 4], [5, 6, 7, 9], [10, 3, 4, 5]]
+ for ctype in [np.int8, np.uint8, np.int16, np.uint16, np.int32,
+ np.uint32, np.float32, np.float64, np.complex64,
+ np.complex128]:
+ a = np.array(ba, ctype)
+ a2 = np.array(ba2, ctype)
+
+ tgt = np.array([1, 3, 13, 24, 30, 35, 39], ctype)
+ assert_array_equal(np.cumsum(a, axis=0), tgt)
+
+ tgt = np.array(
+ [[1, 2, 3, 4], [6, 8, 10, 13], [16, 11, 14, 18]], ctype)
+ assert_array_equal(np.cumsum(a2, axis=0), tgt)
+
+ tgt = np.array(
+ [[1, 3, 6, 10], [5, 11, 18, 27], [10, 13, 17, 22]], ctype)
+ assert_array_equal(np.cumsum(a2, axis=1), tgt)
+
+
+class TestProd(object):
+
+ def test_basic(self):
+ ba = [1, 2, 10, 11, 6, 5, 4]
+ ba2 = [[1, 2, 3, 4], [5, 6, 7, 9], [10, 3, 4, 5]]
+ for ctype in [np.int16, np.uint16, np.int32, np.uint32,
+ np.float32, np.float64, np.complex64, np.complex128]:
+ a = np.array(ba, ctype)
+ a2 = np.array(ba2, ctype)
+ if ctype in ['1', 'b']:
+ assert_raises(ArithmeticError, np.prod, a)
+ assert_raises(ArithmeticError, np.prod, a2, 1)
+ else:
+ assert_equal(a.prod(axis=0), 26400)
+ assert_array_equal(a2.prod(axis=0),
+ np.array([50, 36, 84, 180], ctype))
+ assert_array_equal(a2.prod(axis=-1),
+ np.array([24, 1890, 600], ctype))
+
+
+class TestCumprod(object):
+
+ def test_basic(self):
+ ba = [1, 2, 10, 11, 6, 5, 4]
+ ba2 = [[1, 2, 3, 4], [5, 6, 7, 9], [10, 3, 4, 5]]
+ for ctype in [np.int16, np.uint16, np.int32, np.uint32,
+ np.float32, np.float64, np.complex64, np.complex128]:
+ a = np.array(ba, ctype)
+ a2 = np.array(ba2, ctype)
+ if ctype in ['1', 'b']:
+ assert_raises(ArithmeticError, np.cumprod, a)
+ assert_raises(ArithmeticError, np.cumprod, a2, 1)
+ assert_raises(ArithmeticError, np.cumprod, a)
+ else:
+ assert_array_equal(np.cumprod(a, axis=-1),
+ np.array([1, 2, 20, 220,
+ 1320, 6600, 26400], ctype))
+ assert_array_equal(np.cumprod(a2, axis=0),
+ np.array([[1, 2, 3, 4],
+ [5, 12, 21, 36],
+ [50, 36, 84, 180]], ctype))
+ assert_array_equal(np.cumprod(a2, axis=-1),
+ np.array([[1, 2, 6, 24],
+ [5, 30, 210, 1890],
+ [10, 30, 120, 600]], ctype))
+
+
+class TestDiff(object):
+
+ def test_basic(self):
+ x = [1, 4, 6, 7, 12]
+ out = np.array([3, 2, 1, 5])
+ out2 = np.array([-1, -1, 4])
+ out3 = np.array([0, 5])
+ assert_array_equal(diff(x), out)
+ assert_array_equal(diff(x, n=2), out2)
+ assert_array_equal(diff(x, n=3), out3)
+
+ x = [1.1, 2.2, 3.0, -0.2, -0.1]
+ out = np.array([1.1, 0.8, -3.2, 0.1])
+ assert_almost_equal(diff(x), out)
+
+ x = [True, True, False, False]
+ out = np.array([False, True, False])
+ out2 = np.array([True, True])
+ assert_array_equal(diff(x), out)
+ assert_array_equal(diff(x, n=2), out2)
+
+ def test_axis(self):
+ x = np.zeros((10, 20, 30))
+ x[:, 1::2, :] = 1
+ exp = np.ones((10, 19, 30))
+ exp[:, 1::2, :] = -1
+ assert_array_equal(diff(x), np.zeros((10, 20, 29)))
+ assert_array_equal(diff(x, axis=-1), np.zeros((10, 20, 29)))
+ assert_array_equal(diff(x, axis=0), np.zeros((9, 20, 30)))
+ assert_array_equal(diff(x, axis=1), exp)
+ assert_array_equal(diff(x, axis=-2), exp)
+ assert_raises(np.AxisError, diff, x, axis=3)
+ assert_raises(np.AxisError, diff, x, axis=-4)
+
+ def test_nd(self):
+ x = 20 * rand(10, 20, 30)
+ out1 = x[:, :, 1:] - x[:, :, :-1]
+ out2 = out1[:, :, 1:] - out1[:, :, :-1]
+ out3 = x[1:, :, :] - x[:-1, :, :]
+ out4 = out3[1:, :, :] - out3[:-1, :, :]
+ assert_array_equal(diff(x), out1)
+ assert_array_equal(diff(x, n=2), out2)
+ assert_array_equal(diff(x, axis=0), out3)
+ assert_array_equal(diff(x, n=2, axis=0), out4)
+
+ def test_n(self):
+ x = list(range(3))
+ assert_raises(ValueError, diff, x, n=-1)
+ output = [diff(x, n=n) for n in range(1, 5)]
+ expected = [[1, 1], [0], [], []]
+ assert_(diff(x, n=0) is x)
+ for n, (expected, out) in enumerate(zip(expected, output), start=1):
+ assert_(type(out) is np.ndarray)
+ assert_array_equal(out, expected)
+ assert_equal(out.dtype, np.int_)
+ assert_equal(len(out), max(0, len(x) - n))
+
+ def test_times(self):
+ x = np.arange('1066-10-13', '1066-10-16', dtype=np.datetime64)
+ expected = [
+ np.array([1, 1], dtype='timedelta64[D]'),
+ np.array([0], dtype='timedelta64[D]'),
+ ]
+ expected.extend([np.array([], dtype='timedelta64[D]')] * 3)
+ for n, exp in enumerate(expected, start=1):
+ out = diff(x, n=n)
+ assert_array_equal(out, exp)
+ assert_equal(out.dtype, exp.dtype)
+
+ def test_subclass(self):
+ x = ma.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]],
+ mask=[[False, False], [True, False],
+ [False, True], [True, True], [False, False]])
+ out = diff(x)
+ assert_array_equal(out.data, [[1], [1], [1], [1], [1]])
+ assert_array_equal(out.mask, [[False], [True],
+ [True], [True], [False]])
+ assert_(type(out) is type(x))
+
+ out3 = diff(x, n=3)
+ assert_array_equal(out3.data, [[], [], [], [], []])
+ assert_array_equal(out3.mask, [[], [], [], [], []])
+ assert_(type(out3) is type(x))
+
+ def test_prepend(self):
+ x = np.arange(5) + 1
+ assert_array_equal(diff(x, prepend=0), np.ones(5))
+ assert_array_equal(diff(x, prepend=[0]), np.ones(5))
+ assert_array_equal(np.cumsum(np.diff(x, prepend=0)), x)
+ assert_array_equal(diff(x, prepend=[-1, 0]), np.ones(6))
+
+ x = np.arange(4).reshape(2, 2)
+ result = np.diff(x, axis=1, prepend=0)
+ expected = [[0, 1], [2, 1]]
+ assert_array_equal(result, expected)
+ result = np.diff(x, axis=1, prepend=[[0], [0]])
+ assert_array_equal(result, expected)
+
+ result = np.diff(x, axis=0, prepend=0)
+ expected = [[0, 1], [2, 2]]
+ assert_array_equal(result, expected)
+ result = np.diff(x, axis=0, prepend=[[0, 0]])
+ assert_array_equal(result, expected)
+
+ assert_raises(ValueError, np.diff, x, prepend=np.zeros((3,3)))
+
+ assert_raises(np.AxisError, diff, x, prepend=0, axis=3)
+
+ def test_append(self):
+ x = np.arange(5)
+ result = diff(x, append=0)
+ expected = [1, 1, 1, 1, -4]
+ assert_array_equal(result, expected)
+ result = diff(x, append=[0])
+ assert_array_equal(result, expected)
+ result = diff(x, append=[0, 2])
+ expected = expected + [2]
+ assert_array_equal(result, expected)
+
+ x = np.arange(4).reshape(2, 2)
+ result = np.diff(x, axis=1, append=0)
+ expected = [[1, -1], [1, -3]]
+ assert_array_equal(result, expected)
+ result = np.diff(x, axis=1, append=[[0], [0]])
+ assert_array_equal(result, expected)
+
+ result = np.diff(x, axis=0, append=0)
+ expected = [[2, 2], [-2, -3]]
+ assert_array_equal(result, expected)
+ result = np.diff(x, axis=0, append=[[0, 0]])
+ assert_array_equal(result, expected)
+
+ assert_raises(ValueError, np.diff, x, append=np.zeros((3,3)))
+
+ assert_raises(np.AxisError, diff, x, append=0, axis=3)
+
+
+class TestDelete(object):
+
+ def setup(self):
+ self.a = np.arange(5)
+ self.nd_a = np.arange(5).repeat(2).reshape(1, 5, 2)
+
+ def _check_inverse_of_slicing(self, indices):
+ a_del = delete(self.a, indices)
+ nd_a_del = delete(self.nd_a, indices, axis=1)
+ msg = 'Delete failed for obj: %r' % indices
+ # NOTE: The cast should be removed after warning phase for bools
+ if not isinstance(indices, (slice, int, long, np.integer)):
+ indices = np.asarray(indices, dtype=np.intp)
+ indices = indices[(indices >= 0) & (indices < 5)]
+ assert_array_equal(setxor1d(a_del, self.a[indices, ]), self.a,
+ err_msg=msg)
+ xor = setxor1d(nd_a_del[0,:, 0], self.nd_a[0, indices, 0])
+ assert_array_equal(xor, self.nd_a[0,:, 0], err_msg=msg)
+
+ def test_slices(self):
+ lims = [-6, -2, 0, 1, 2, 4, 5]
+ steps = [-3, -1, 1, 3]
+ for start in lims:
+ for stop in lims:
+ for step in steps:
+ s = slice(start, stop, step)
+ self._check_inverse_of_slicing(s)
+
+ def test_fancy(self):
+ # Deprecation/FutureWarning tests should be kept after change.
+ self._check_inverse_of_slicing(np.array([[0, 1], [2, 1]]))
+ with warnings.catch_warnings():
+ warnings.filterwarnings('error', category=DeprecationWarning)
+ assert_raises(DeprecationWarning, delete, self.a, [100])
+ assert_raises(DeprecationWarning, delete, self.a, [-100])
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', category=FutureWarning)
+ self._check_inverse_of_slicing([0, -1, 2, 2])
+ obj = np.array([True, False, False], dtype=bool)
+ self._check_inverse_of_slicing(obj)
+ assert_(w[0].category is FutureWarning)
+ assert_(w[1].category is FutureWarning)
+
+ def test_single(self):
+ self._check_inverse_of_slicing(0)
+ self._check_inverse_of_slicing(-4)
+
+ def test_0d(self):
+ a = np.array(1)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', DeprecationWarning)
+ assert_equal(delete(a, [], axis=0), a)
+ assert_(w[0].category is DeprecationWarning)
+
+ def test_subclass(self):
+ class SubClass(np.ndarray):
+ pass
+ a = self.a.view(SubClass)
+ assert_(isinstance(delete(a, 0), SubClass))
+ assert_(isinstance(delete(a, []), SubClass))
+ assert_(isinstance(delete(a, [0, 1]), SubClass))
+ assert_(isinstance(delete(a, slice(1, 2)), SubClass))
+ assert_(isinstance(delete(a, slice(1, -2)), SubClass))
+
+ def test_array_order_preserve(self):
+ # See gh-7113
+ k = np.arange(10).reshape(2, 5, order='F')
+ m = delete(k, slice(60, None), axis=1)
+
+ # 'k' is Fortran ordered, and 'm' should have the
+ # same ordering as 'k' and NOT become C ordered
+ assert_equal(m.flags.c_contiguous, k.flags.c_contiguous)
+ assert_equal(m.flags.f_contiguous, k.flags.f_contiguous)
+
+
+class TestGradient(object):
+
+ def test_basic(self):
+ v = [[1, 1], [3, 4]]
+ x = np.array(v)
+ dx = [np.array([[2., 3.], [2., 3.]]),
+ np.array([[0., 0.], [1., 1.]])]
+ assert_array_equal(gradient(x), dx)
+ assert_array_equal(gradient(v), dx)
+
+ def test_args(self):
+ dx = np.cumsum(np.ones(5))
+ dx_uneven = [1., 2., 5., 9., 11.]
+ f_2d = np.arange(25).reshape(5, 5)
+
+ # distances must be scalars or have size equal to gradient[axis]
+ gradient(np.arange(5), 3.)
+ gradient(np.arange(5), np.array(3.))
+ gradient(np.arange(5), dx)
+ # dy is set equal to dx because the spacing is a scalar
+ gradient(f_2d, 1.5)
+ gradient(f_2d, np.array(1.5))
+
+ gradient(f_2d, dx_uneven, dx_uneven)
+ # mix of even and uneven spacing, and
+ # mix of scalar and vector spacing
+ gradient(f_2d, dx, 2)
+
+ # 2D but axis specified
+ gradient(f_2d, dx, axis=1)
+
+ # 2d coordinate arguments are not yet allowed
+ assert_raises_regex(ValueError, '.*scalars or 1d',
+ gradient, f_2d, np.stack([dx]*2, axis=-1), 1)
+
+ def test_badargs(self):
+ f_2d = np.arange(25).reshape(5, 5)
+ x = np.cumsum(np.ones(5))
+
+ # wrong sizes
+ assert_raises(ValueError, gradient, f_2d, x, np.ones(2))
+ assert_raises(ValueError, gradient, f_2d, 1, np.ones(2))
+ assert_raises(ValueError, gradient, f_2d, np.ones(2), np.ones(2))
+ # wrong number of arguments
+ assert_raises(TypeError, gradient, f_2d, x)
+ assert_raises(TypeError, gradient, f_2d, x, axis=(0,1))
+ assert_raises(TypeError, gradient, f_2d, x, x, x)
+ assert_raises(TypeError, gradient, f_2d, 1, 1, 1)
+ assert_raises(TypeError, gradient, f_2d, x, x, axis=1)
+ assert_raises(TypeError, gradient, f_2d, 1, 1, axis=1)
+
+ def test_datetime64(self):
+ # Make sure gradient() can handle special types like datetime64
+ x = np.array(
+ ['1910-08-16', '1910-08-11', '1910-08-10', '1910-08-12',
+ '1910-10-12', '1910-12-12', '1912-12-12'],
+ dtype='datetime64[D]')
+ dx = np.array(
+ [-5, -3, 0, 31, 61, 396, 731],
+ dtype='timedelta64[D]')
+ assert_array_equal(gradient(x), dx)
+ assert_(dx.dtype == np.dtype('timedelta64[D]'))
+
+ def test_masked(self):
+ # Make sure that gradient supports subclasses like masked arrays
+ x = np.ma.array([[1, 1], [3, 4]],
+ mask=[[False, False], [False, False]])
+ out = gradient(x)[0]
+ assert_equal(type(out), type(x))
+ # And make sure that the output and input don't have aliased mask
+ # arrays
+ assert_(x.mask is not out.mask)
+ # Also check that edge_order=2 doesn't alter the original mask
+ x2 = np.ma.arange(5)
+ x2[2] = np.ma.masked
+ np.gradient(x2, edge_order=2)
+ assert_array_equal(x2.mask, [False, False, True, False, False])
+
+ def test_second_order_accurate(self):
+ # Testing that the relative numerical error is less than 3% for
+ # this example problem. This corresponds to second-order
+ # accurate finite differences for all interior and boundary
+ # points.
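+ # For interior points np.gradient uses the central difference
+ # (y[i+1] - y[i-1]) / (2 * dx), whose truncation error is O(dx**2); with
+ # edge_order=2 the boundary points use one-sided second-order stencils,
+ # so the result should track the analytical derivative closely here.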
+ x = np.linspace(0, 1, 10)
+ dx = x[1] - x[0]
+ y = 2 * x ** 3 + 4 * x ** 2 + 2 * x
+ analytical = 6 * x ** 2 + 8 * x + 2
+ num_error = np.abs((np.gradient(y, dx, edge_order=2) / analytical) - 1)
+ assert_(np.all(num_error < 0.03) == True)
+
+ # test with unevenly spaced
+ np.random.seed(0)
+ x = np.sort(np.random.random(10))
+ y = 2 * x ** 3 + 4 * x ** 2 + 2 * x
+ analytical = 6 * x ** 2 + 8 * x + 2
+ num_error = np.abs((np.gradient(y, x, edge_order=2) / analytical) - 1)
+ assert_(np.all(num_error < 0.03) == True)
+
+ def test_spacing(self):
+ f = np.array([0, 2., 3., 4., 5., 5.])
+ f = np.tile(f, (6,1)) + f.reshape(-1, 1)
+ x_uneven = np.array([0., 0.5, 1., 3., 5., 7.])
+ x_even = np.arange(6.)
+
+ fdx_even_ord1 = np.tile([2., 1.5, 1., 1., 0.5, 0.], (6,1))
+ fdx_even_ord2 = np.tile([2.5, 1.5, 1., 1., 0.5, -0.5], (6,1))
+ fdx_uneven_ord1 = np.tile([4., 3., 1.7, 0.5, 0.25, 0.], (6,1))
+ fdx_uneven_ord2 = np.tile([5., 3., 1.7, 0.5, 0.25, -0.25], (6,1))
+
+ # evenly spaced
+ for edge_order, exp_res in [(1, fdx_even_ord1), (2, fdx_even_ord2)]:
+ res1 = gradient(f, 1., axis=(0,1), edge_order=edge_order)
+ res2 = gradient(f, x_even, x_even,
+ axis=(0,1), edge_order=edge_order)
+ res3 = gradient(f, x_even, x_even,
+ axis=None, edge_order=edge_order)
+ assert_array_equal(res1, res2)
+ assert_array_equal(res2, res3)
+ assert_almost_equal(res1[0], exp_res.T)
+ assert_almost_equal(res1[1], exp_res)
+
+ res1 = gradient(f, 1., axis=0, edge_order=edge_order)
+ res2 = gradient(f, x_even, axis=0, edge_order=edge_order)
+ assert_(res1.shape == res2.shape)
+ assert_almost_equal(res2, exp_res.T)
+
+ res1 = gradient(f, 1., axis=1, edge_order=edge_order)
+ res2 = gradient(f, x_even, axis=1, edge_order=edge_order)
+ assert_(res1.shape == res2.shape)
+ assert_array_equal(res2, exp_res)
+
+ # unevenly spaced
+ for edge_order, exp_res in [(1, fdx_uneven_ord1), (2, fdx_uneven_ord2)]:
+ res1 = gradient(f, x_uneven, x_uneven,
+ axis=(0,1), edge_order=edge_order)
+ res2 = gradient(f, x_uneven, x_uneven,
+ axis=None, edge_order=edge_order)
+ assert_array_equal(res1, res2)
+ assert_almost_equal(res1[0], exp_res.T)
+ assert_almost_equal(res1[1], exp_res)
+
+ res1 = gradient(f, x_uneven, axis=0, edge_order=edge_order)
+ assert_almost_equal(res1, exp_res.T)
+
+ res1 = gradient(f, x_uneven, axis=1, edge_order=edge_order)
+ assert_almost_equal(res1, exp_res)
+
+ # mixed
+ res1 = gradient(f, x_even, x_uneven, axis=(0,1), edge_order=1)
+ res2 = gradient(f, x_uneven, x_even, axis=(1,0), edge_order=1)
+ assert_array_equal(res1[0], res2[1])
+ assert_array_equal(res1[1], res2[0])
+ assert_almost_equal(res1[0], fdx_even_ord1.T)
+ assert_almost_equal(res1[1], fdx_uneven_ord1)
+
+ res1 = gradient(f, x_even, x_uneven, axis=(0,1), edge_order=2)
+ res2 = gradient(f, x_uneven, x_even, axis=(1,0), edge_order=2)
+ assert_array_equal(res1[0], res2[1])
+ assert_array_equal(res1[1], res2[0])
+ assert_almost_equal(res1[0], fdx_even_ord2.T)
+ assert_almost_equal(res1[1], fdx_uneven_ord2)
+
+ def test_specific_axes(self):
+ # Testing that gradient can work on a given axis only
+ v = [[1, 1], [3, 4]]
+ x = np.array(v)
+ dx = [np.array([[2., 3.], [2., 3.]]),
+ np.array([[0., 0.], [1., 1.]])]
+ assert_array_equal(gradient(x, axis=0), dx[0])
+ assert_array_equal(gradient(x, axis=1), dx[1])
+ assert_array_equal(gradient(x, axis=-1), dx[1])
+ assert_array_equal(gradient(x, axis=(1, 0)), [dx[1], dx[0]])
+
+ # test axis=None which means all axes
+ assert_almost_equal(gradient(x, axis=None), [dx[0], dx[1]])
+ # and is the same as no axis keyword given
+ assert_almost_equal(gradient(x, axis=None), gradient(x))
+
+ # test vararg order
+ assert_array_equal(gradient(x, 2, 3, axis=(1, 0)),
+ [dx[1]/2.0, dx[0]/3.0])
+ # test maximal number of varargs
+ assert_raises(TypeError, gradient, x, 1, 2, axis=1)
+
+ assert_raises(np.AxisError, gradient, x, axis=3)
+ assert_raises(np.AxisError, gradient, x, axis=-3)
+ # assert_raises(TypeError, gradient, x, axis=[1,])
+
+ def test_timedelta64(self):
+ # Make sure gradient() can handle special types like timedelta64
+ x = np.array(
+ [-5, -3, 10, 12, 61, 321, 300],
+ dtype='timedelta64[D]')
+ dx = np.array(
+ [2, 7, 7, 25, 154, 119, -21],
+ dtype='timedelta64[D]')
+ assert_array_equal(gradient(x), dx)
+ assert_(dx.dtype == np.dtype('timedelta64[D]'))
+
+ def test_inexact_dtypes(self):
+ for dt in [np.float16, np.float32, np.float64]:
+ # dtypes should not be promoted differently from the way np.diff promotes them
+ x = np.array([1, 2, 3], dtype=dt)
+ assert_equal(gradient(x).dtype, np.diff(x).dtype)
+
+ def test_values(self):
+ # needs at least 2 points for edge_order ==1
+ gradient(np.arange(2), edge_order=1)
+ # needs at least 3 points for edge_order == 2
+ gradient(np.arange(3), edge_order=2)
+
+ assert_raises(ValueError, gradient, np.arange(0), edge_order=1)
+ assert_raises(ValueError, gradient, np.arange(0), edge_order=2)
+ assert_raises(ValueError, gradient, np.arange(1), edge_order=1)
+ assert_raises(ValueError, gradient, np.arange(1), edge_order=2)
+ assert_raises(ValueError, gradient, np.arange(2), edge_order=2)
+
+
+class TestAngle(object):
+
+ def test_basic(self):
+ x = [1 + 3j, np.sqrt(2) / 2.0 + 1j * np.sqrt(2) / 2,
+ 1, 1j, -1, -1j, 1 - 3j, -1 + 3j]
+ y = angle(x)
+ yo = [
+ np.arctan(3.0 / 1.0),
+ np.arctan(1.0), 0, np.pi / 2, np.pi, -np.pi / 2.0,
+ -np.arctan(3.0 / 1.0), np.pi - np.arctan(3.0 / 1.0)]
+ z = angle(x, deg=1)
+ zo = np.array(yo) * 180 / np.pi
+ assert_array_almost_equal(y, yo, 11)
+ assert_array_almost_equal(z, zo, 11)
+
+ def test_subclass(self):
+ x = np.ma.array([1 + 3j, 1, np.sqrt(2)/2 * (1 + 1j)])
+ x[1] = np.ma.masked
+ expected = np.ma.array([np.arctan(3.0 / 1.0), 0, np.arctan(1.0)])
+ expected[1] = np.ma.masked
+ actual = angle(x)
+ assert_equal(type(actual), type(expected))
+ assert_equal(actual.mask, expected.mask)
+ assert_equal(actual, expected)
+
+
+class TestTrimZeros(object):
+
+ """
+ Only testing with integer arrays.
+
+ """
+
+ def test_basic(self):
+ a = np.array([0, 0, 1, 2, 3, 4, 0])
+ res = trim_zeros(a)
+ assert_array_equal(res, np.array([1, 2, 3, 4]))
+
+ def test_leading_skip(self):
+ a = np.array([0, 0, 1, 0, 2, 3, 4, 0])
+ res = trim_zeros(a)
+ assert_array_equal(res, np.array([1, 0, 2, 3, 4]))
+
+ def test_trailing_skip(self):
+ a = np.array([0, 0, 1, 0, 2, 3, 0, 4, 0])
+ res = trim_zeros(a)
+ assert_array_equal(res, np.array([1, 0, 2, 3, 0, 4]))
+
+
+class TestExtins(object):
+
+ def test_basic(self):
+ a = np.array([1, 3, 2, 1, 2, 3, 3])
+ b = extract(a > 1, a)
+ assert_array_equal(b, [3, 2, 2, 3, 3])
+
+ def test_place(self):
+ # Make sure that non-np.ndarray objects
+ # raise an error instead of doing nothing
+ assert_raises(TypeError, place, [1, 2, 3], [True, False], [0, 1])
+
+ a = np.array([1, 4, 3, 2, 5, 8, 7])
+ place(a, [0, 1, 0, 1, 0, 1, 0], [2, 4, 6])
+ assert_array_equal(a, [1, 2, 3, 4, 5, 6, 7])
+
+ place(a, np.zeros(7), [])
+ assert_array_equal(a, np.arange(1, 8))
+
+ place(a, [1, 0, 1, 0, 1, 0, 1], [8, 9])
+ assert_array_equal(a, [8, 2, 9, 4, 8, 6, 9])
+ assert_raises_regex(ValueError, "Cannot insert from an empty array",
+ lambda: place(a, [0, 0, 0, 0, 0, 1, 0], []))
+
+ # See Issue #6974
+ a = np.array(['12', '34'])
+ place(a, [0, 1], '9')
+ assert_array_equal(a, ['12', '9'])
+
+ def test_both(self):
+ a = rand(10)
+ mask = a > 0.5
+ ac = a.copy()
+ c = extract(mask, a)
+ place(a, mask, 0)
+ place(a, mask, c)
+ assert_array_equal(a, ac)
+
+
+class TestVectorize(object):
+
+ def test_simple(self):
+ def addsubtract(a, b):
+ if a > b:
+ return a - b
+ else:
+ return a + b
+
+ f = vectorize(addsubtract)
+ r = f([0, 3, 6, 9], [1, 3, 5, 7])
+ assert_array_equal(r, [1, 6, 1, 2])
+
+ def test_scalar(self):
+ def addsubtract(a, b):
+ if a > b:
+ return a - b
+ else:
+ return a + b
+
+ f = vectorize(addsubtract)
+ r = f([0, 3, 6, 9], 5)
+ assert_array_equal(r, [5, 8, 1, 4])
+
+ def test_large(self):
+ x = np.linspace(-3, 2, 10000)
+ f = vectorize(lambda x: x)
+ y = f(x)
+ assert_array_equal(y, x)
+
+ def test_ufunc(self):
+ import math
+ f = vectorize(math.cos)
+ args = np.array([0, 0.5 * np.pi, np.pi, 1.5 * np.pi, 2 * np.pi])
+ r1 = f(args)
+ r2 = np.cos(args)
+ assert_array_almost_equal(r1, r2)
+
+ def test_keywords(self):
+
+ def foo(a, b=1):
+ return a + b
+
+ f = vectorize(foo)
+ args = np.array([1, 2, 3])
+ r1 = f(args)
+ r2 = np.array([2, 3, 4])
+ assert_array_equal(r1, r2)
+ r1 = f(args, 2)
+ r2 = np.array([3, 4, 5])
+ assert_array_equal(r1, r2)
+
+ def test_keywords_no_func_code(self):
+ # This needs to test a function that has keywords but
+ # no func_code attribute, since otherwise vectorize will
+ # inspect the func_code.
+ import random
+ try:
+ vectorize(random.randrange) # Should succeed
+ except Exception:
+ raise AssertionError()
+
+ def test_keywords2_ticket_2100(self):
+ # Test kwarg support: enhancement ticket 2100
+
+ def foo(a, b=1):
+ return a + b
+
+ f = vectorize(foo)
+ args = np.array([1, 2, 3])
+ r1 = f(a=args)
+ r2 = np.array([2, 3, 4])
+ assert_array_equal(r1, r2)
+ r1 = f(b=1, a=args)
+ assert_array_equal(r1, r2)
+ r1 = f(args, b=2)
+ r2 = np.array([3, 4, 5])
+ assert_array_equal(r1, r2)
+
+ def test_keywords3_ticket_2100(self):
+ # Test excluded with mixed positional and kwargs: ticket 2100
+ def mypolyval(x, p):
+ _p = list(p)
+ res = _p.pop(0)
+ while _p:
+ res = res * x + _p.pop(0)
+ return res
+
+ vpolyval = np.vectorize(mypolyval, excluded=['p', 1])
+ ans = [3, 6]
+ assert_array_equal(ans, vpolyval(x=[0, 1], p=[1, 2, 3]))
+ assert_array_equal(ans, vpolyval([0, 1], p=[1, 2, 3]))
+ assert_array_equal(ans, vpolyval([0, 1], [1, 2, 3]))
+
+ def test_keywords4_ticket_2100(self):
+ # Test vectorizing a function with no positional args.
+ @vectorize
+ def f(**kw):
+ res = 1.0
+ for _k in kw:
+ res *= kw[_k]
+ return res
+
+ assert_array_equal(f(a=[1, 2], b=[3, 4]), [3, 8])
+
+ def test_keywords5_ticket_2100(self):
+ # Test vectorizing a function that takes only *args (no keyword args).
+ @vectorize
+ def f(*v):
+ return np.prod(v)
+
+ assert_array_equal(f([1, 2], [3, 4]), [3, 8])
+
+ def test_coverage1_ticket_2100(self):
+ def foo():
+ return 1
+
+ f = vectorize(foo)
+ assert_array_equal(f(), 1)
+
+ def test_assigning_docstring(self):
+ def foo(x):
+ """Original documentation"""
+ return x
+
+ f = vectorize(foo)
+ assert_equal(f.__doc__, foo.__doc__)
+
+ doc = "Provided documentation"
+ f = vectorize(foo, doc=doc)
+ assert_equal(f.__doc__, doc)
+
+ def test_UnboundMethod_ticket_1156(self):
+ # Regression test for issue 1156
+ class Foo:
+ b = 2
+
+ def bar(self, a):
+ return a ** self.b
+
+ assert_array_equal(vectorize(Foo().bar)(np.arange(9)),
+ np.arange(9) ** 2)
+ assert_array_equal(vectorize(Foo.bar)(Foo(), np.arange(9)),
+ np.arange(9) ** 2)
+
+ def test_execution_order_ticket_1487(self):
+ # Regression test for dependence on execution order: issue 1487
+ f1 = vectorize(lambda x: x)
+ res1a = f1(np.arange(3))
+ res1b = f1(np.arange(0.1, 3))
+ f2 = vectorize(lambda x: x)
+ res2b = f2(np.arange(0.1, 3))
+ res2a = f2(np.arange(3))
+ assert_equal(res1a, res2a)
+ assert_equal(res1b, res2b)
+
+ def test_string_ticket_1892(self):
+ # Test vectorization over strings: issue 1892.
+ f = np.vectorize(lambda x: x)
+ s = '0123456789' * 10
+ assert_equal(s, f(s))
+
+ def test_cache(self):
+ # Ensure that the vectorized func is called exactly once per argument.
+ _calls = [0]
+
+ @vectorize
+ def f(x):
+ _calls[0] += 1
+ return x ** 2
+
+ f.cache = True
+ x = np.arange(5)
+ assert_array_equal(f(x), x * x)
+ assert_equal(_calls[0], len(x))
+
+ def test_otypes(self):
+ f = np.vectorize(lambda x: x)
+ f.otypes = 'i'
+ x = np.arange(5)
+ assert_array_equal(f(x), x)
+
+ def test_parse_gufunc_signature(self):
+ assert_equal(nfb._parse_gufunc_signature('(x)->()'), ([('x',)], [()]))
+ assert_equal(nfb._parse_gufunc_signature('(x,y)->()'),
+ ([('x', 'y')], [()]))
+ assert_equal(nfb._parse_gufunc_signature('(x),(y)->()'),
+ ([('x',), ('y',)], [()]))
+ assert_equal(nfb._parse_gufunc_signature('(x)->(y)'),
+ ([('x',)], [('y',)]))
+ assert_equal(nfb._parse_gufunc_signature('(x)->(y),()'),
+ ([('x',)], [('y',), ()]))
+ assert_equal(nfb._parse_gufunc_signature('(),(a,b,c),(d)->(d,e)'),
+ ([(), ('a', 'b', 'c'), ('d',)], [('d', 'e')]))
+ with assert_raises(ValueError):
+ nfb._parse_gufunc_signature('(x)(y)->()')
+ with assert_raises(ValueError):
+ nfb._parse_gufunc_signature('(x),(y)->')
+ with assert_raises(ValueError):
+ nfb._parse_gufunc_signature('((x))->(x)')
+
+ def test_signature_simple(self):
+ def addsubtract(a, b):
+ if a > b:
+ return a - b
+ else:
+ return a + b
+
+ f = vectorize(addsubtract, signature='(),()->()')
+ r = f([0, 3, 6, 9], [1, 3, 5, 7])
+ assert_array_equal(r, [1, 6, 1, 2])
+
+ def test_signature_mean_last(self):
+ def mean(a):
+ return a.mean()
+
+ f = vectorize(mean, signature='(n)->()')
+ r = f([[1, 3], [2, 4]])
+ assert_array_equal(r, [2, 3])
+
+ def test_signature_center(self):
+ def center(a):
+ return a - a.mean()
+
+ f = vectorize(center, signature='(n)->(n)')
+ r = f([[1, 3], [2, 4]])
+ assert_array_equal(r, [[-1, 1], [-1, 1]])
+
+ def test_signature_two_outputs(self):
+ f = vectorize(lambda x: (x, x), signature='()->(),()')
+ r = f([1, 2, 3])
+ assert_(isinstance(r, tuple) and len(r) == 2)
+ assert_array_equal(r[0], [1, 2, 3])
+ assert_array_equal(r[1], [1, 2, 3])
+
+ def test_signature_outer(self):
+ f = vectorize(np.outer, signature='(a),(b)->(a,b)')
+ r = f([1, 2], [1, 2, 3])
+ assert_array_equal(r, [[1, 2, 3], [2, 4, 6]])
+
+ r = f([[[1, 2]]], [1, 2, 3])
+ assert_array_equal(r, [[[[1, 2, 3], [2, 4, 6]]]])
+
+ r = f([[1, 0], [2, 0]], [1, 2, 3])
+ assert_array_equal(r, [[[1, 2, 3], [0, 0, 0]],
+ [[2, 4, 6], [0, 0, 0]]])
+
+ r = f([1, 2], [[1, 2, 3], [0, 0, 0]])
+ assert_array_equal(r, [[[1, 2, 3], [2, 4, 6]],
+ [[0, 0, 0], [0, 0, 0]]])
+
+ def test_signature_computed_size(self):
+ f = vectorize(lambda x: x[:-1], signature='(n)->(m)')
+ r = f([1, 2, 3])
+ assert_array_equal(r, [1, 2])
+
+ r = f([[1, 2, 3], [2, 3, 4]])
+ assert_array_equal(r, [[1, 2], [2, 3]])
+
+ def test_signature_excluded(self):
+
+ def foo(a, b=1):
+ return a + b
+
+ f = vectorize(foo, signature='()->()', excluded={'b'})
+ assert_array_equal(f([1, 2, 3]), [2, 3, 4])
+ assert_array_equal(f([1, 2, 3], b=0), [1, 2, 3])
+
+ def test_signature_otypes(self):
+ f = vectorize(lambda x: x, signature='(n)->(n)', otypes=['float64'])
+ r = f([1, 2, 3])
+ assert_equal(r.dtype, np.dtype('float64'))
+ assert_array_equal(r, [1, 2, 3])
+
+ def test_signature_invalid_inputs(self):
+ f = vectorize(operator.add, signature='(n),(n)->(n)')
+ with assert_raises_regex(TypeError, 'wrong number of positional'):
+ f([1, 2])
+ with assert_raises_regex(
+ ValueError, 'does not have enough dimensions'):
+ f(1, 2)
+ with assert_raises_regex(
+ ValueError, 'inconsistent size for core dimension'):
+ f([1, 2], [1, 2, 3])
+
+ f = vectorize(operator.add, signature='()->()')
+ with assert_raises_regex(TypeError, 'wrong number of positional'):
+ f(1, 2)
+
+ def test_signature_invalid_outputs(self):
+
+ f = vectorize(lambda x: x[:-1], signature='(n)->(n)')
+ with assert_raises_regex(
+ ValueError, 'inconsistent size for core dimension'):
+ f([1, 2, 3])
+
+ f = vectorize(lambda x: x, signature='()->(),()')
+ with assert_raises_regex(ValueError, 'wrong number of outputs'):
+ f(1)
+
+ f = vectorize(lambda x: (x, x), signature='()->()')
+ with assert_raises_regex(ValueError, 'wrong number of outputs'):
+ f([1, 2])
+
+ def test_size_zero_output(self):
+ # see issue 5868
+ f = np.vectorize(lambda x: x)
+ x = np.zeros([0, 5], dtype=int)
+ with assert_raises_regex(ValueError, 'otypes'):
+ f(x)
+
+ f.otypes = 'i'
+ assert_array_equal(f(x), x)
+
+ f = np.vectorize(lambda x: x, signature='()->()')
+ with assert_raises_regex(ValueError, 'otypes'):
+ f(x)
+
+ f = np.vectorize(lambda x: x, signature='()->()', otypes='i')
+ assert_array_equal(f(x), x)
+
+ f = np.vectorize(lambda x: x, signature='(n)->(n)', otypes='i')
+ assert_array_equal(f(x), x)
+
+ f = np.vectorize(lambda x: x, signature='(n)->(n)')
+ assert_array_equal(f(x.T), x.T)
+
+ f = np.vectorize(lambda x: [x], signature='()->(n)', otypes='i')
+ with assert_raises_regex(ValueError, 'new output dimensions'):
+ f(x)
+
+
+class TestDigitize(object):
+
+ def test_forward(self):
+ x = np.arange(-6, 5)
+ bins = np.arange(-5, 5)
+ assert_array_equal(digitize(x, bins), np.arange(11))
+
+ def test_reverse(self):
+ x = np.arange(5, -6, -1)
+ bins = np.arange(5, -5, -1)
+ assert_array_equal(digitize(x, bins), np.arange(11))
+
+ def test_random(self):
+ x = rand(10)
+ bin = np.linspace(x.min(), x.max(), 10)
+ assert_(np.all(digitize(x, bin) != 0))
+
+ def test_right_basic(self):
+ x = [1, 5, 4, 10, 8, 11, 0]
+ bins = [1, 5, 10]
+ default_answer = [1, 2, 1, 3, 2, 3, 0]
+ assert_array_equal(digitize(x, bins), default_answer)
+ right_answer = [0, 1, 1, 2, 2, 3, 0]
+ assert_array_equal(digitize(x, bins, True), right_answer)
+
+ def test_right_open(self):
+ x = np.arange(-6, 5)
+ bins = np.arange(-6, 4)
+ assert_array_equal(digitize(x, bins, True), np.arange(11))
+
+ def test_right_open_reverse(self):
+ x = np.arange(5, -6, -1)
+ bins = np.arange(4, -6, -1)
+ assert_array_equal(digitize(x, bins, True), np.arange(11))
+
+ def test_right_open_random(self):
+ x = rand(10)
+ bins = np.linspace(x.min(), x.max(), 10)
+ assert_(np.all(digitize(x, bins, True) != 10))
+
+ def test_monotonic(self):
+ x = [-1, 0, 1, 2]
+ bins = [0, 0, 1]
+ assert_array_equal(digitize(x, bins, False), [0, 2, 3, 3])
+ assert_array_equal(digitize(x, bins, True), [0, 0, 2, 3])
+ bins = [1, 1, 0]
+ assert_array_equal(digitize(x, bins, False), [3, 2, 0, 0])
+ assert_array_equal(digitize(x, bins, True), [3, 3, 2, 0])
+ bins = [1, 1, 1, 1]
+ assert_array_equal(digitize(x, bins, False), [0, 0, 4, 4])
+ assert_array_equal(digitize(x, bins, True), [0, 0, 0, 4])
+ bins = [0, 0, 1, 0]
+ assert_raises(ValueError, digitize, x, bins)
+ bins = [1, 1, 0, 1]
+ assert_raises(ValueError, digitize, x, bins)
+
+ def test_casting_error(self):
+ x = [1, 2, 3 + 1.j]
+ bins = [1, 2, 3]
+ assert_raises(TypeError, digitize, x, bins)
+ x, bins = bins, x
+ assert_raises(TypeError, digitize, x, bins)
+
+ def test_return_type(self):
+ # Functions returning indices should always return base ndarrays
+ class A(np.ndarray):
+ pass
+ a = np.arange(5).view(A)
+ b = np.arange(1, 3).view(A)
+ assert_(not isinstance(digitize(b, a, False), A))
+ assert_(not isinstance(digitize(b, a, True), A))
+
+ def test_large_integers_increasing(self):
+ # gh-11022
+ x = 2**54 # loses precision in a float
+ assert_equal(np.digitize(x, [x - 1, x + 1]), 1)
+
+ @pytest.mark.xfail(
+ reason="gh-11022: np.core.multiarray._monoticity loses precision")
+ def test_large_integers_decreasing(self):
+ # gh-11022
+ x = 2**54 # loses precision in a float
+ assert_equal(np.digitize(x, [x + 1, x - 1]), 1)
+
+
+class TestUnwrap(object):
+
+ def test_simple(self):
+ # check that unwrap removes jumps greater than 2*pi
+ assert_array_equal(unwrap([1, 1 + 2 * np.pi]), [1, 1])
+ # check that unwrap maintains continuity
+ assert_(np.all(diff(unwrap(rand(10) * 100)) < np.pi))
+
+
+class TestFilterwindows(object):
+
+ def test_hanning(self):
+ # check symmetry
+ w = hanning(10)
+ assert_array_almost_equal(w, flipud(w), 7)
+ # check known value
+ assert_almost_equal(np.sum(w, axis=0), 4.500, 4)
+
+ def test_hamming(self):
+ # check symmetry
+ w = hamming(10)
+ assert_array_almost_equal(w, flipud(w), 7)
+ # check known value
+ assert_almost_equal(np.sum(w, axis=0), 4.9400, 4)
+
+ def test_bartlett(self):
+ # check symmetry
+ w = bartlett(10)
+ assert_array_almost_equal(w, flipud(w), 7)
+ # check known value
+ assert_almost_equal(np.sum(w, axis=0), 4.4444, 4)
+
+ def test_blackman(self):
+ # check symmetry
+ w = blackman(10)
+ assert_array_almost_equal(w, flipud(w), 7)
+ # check known value
+ assert_almost_equal(np.sum(w, axis=0), 3.7800, 4)
+
+
+class TestTrapz(object):
+
+ def test_simple(self):
+ x = np.arange(-10, 10, .1)
+ r = trapz(np.exp(-.5 * x ** 2) / np.sqrt(2 * np.pi), dx=0.1)
+ # check integral of normal equals 1
+ assert_almost_equal(r, 1, 7)
+
+ def test_ndim(self):
+ x = np.linspace(0, 1, 3)
+ y = np.linspace(0, 2, 8)
+ z = np.linspace(0, 3, 13)
+
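+ # wx, wy and wz below are the composite trapezoid-rule weights along
+ # each axis: the grid spacing at interior points and half the spacing
+ # at the two endpoints, so (q * w).sum(axis) reproduces trapz on that
+ # axis.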
+ wx = np.ones_like(x) * (x[1] - x[0])
+ wx[0] /= 2
+ wx[-1] /= 2
+ wy = np.ones_like(y) * (y[1] - y[0])
+ wy[0] /= 2
+ wy[-1] /= 2
+ wz = np.ones_like(z) * (z[1] - z[0])
+ wz[0] /= 2
+ wz[-1] /= 2
+
+ q = x[:, None, None] + y[None,:, None] + z[None, None,:]
+
+ qx = (q * wx[:, None, None]).sum(axis=0)
+ qy = (q * wy[None, :, None]).sum(axis=1)
+ qz = (q * wz[None, None, :]).sum(axis=2)
+
+ # n-d `x`
+ r = trapz(q, x=x[:, None, None], axis=0)
+ assert_almost_equal(r, qx)
+ r = trapz(q, x=y[None,:, None], axis=1)
+ assert_almost_equal(r, qy)
+ r = trapz(q, x=z[None, None,:], axis=2)
+ assert_almost_equal(r, qz)
+
+ # 1-d `x`
+ r = trapz(q, x=x, axis=0)
+ assert_almost_equal(r, qx)
+ r = trapz(q, x=y, axis=1)
+ assert_almost_equal(r, qy)
+ r = trapz(q, x=z, axis=2)
+ assert_almost_equal(r, qz)
+
+ def test_masked(self):
+ # Testing that masked arrays behave as if the function is 0 where
+ # masked
+ x = np.arange(5)
+ y = x * x
+ mask = x == 2
+ ym = np.ma.array(y, mask=mask)
+ r = 13.0 # sum(0.5 * (0 + 1) * 1.0 + 0.5 * (9 + 16))
+ assert_almost_equal(trapz(ym, x), r)
+
+ xm = np.ma.array(x, mask=mask)
+ assert_almost_equal(trapz(ym, xm), r)
+
+ xm = np.ma.array(x, mask=mask)
+ assert_almost_equal(trapz(y, xm), r)
+
+
+class TestSinc(object):
+
+ def test_simple(self):
+ assert_(sinc(0) == 1)
+ w = sinc(np.linspace(-1, 1, 100))
+ # check symmetry
+ assert_array_almost_equal(w, flipud(w), 7)
+
+ def test_array_like(self):
+ x = [0, 0.5]
+ y1 = sinc(np.array(x))
+ y2 = sinc(list(x))
+ y3 = sinc(tuple(x))
+ assert_array_equal(y1, y2)
+ assert_array_equal(y1, y3)
+
+
+class TestUnique(object):
+
+ def test_simple(self):
+ x = np.array([4, 3, 2, 1, 1, 2, 3, 4, 0])
+ assert_(np.all(unique(x) == [0, 1, 2, 3, 4]))
+ assert_(unique(np.array([1, 1, 1, 1, 1])) == np.array([1]))
+ x = ['widget', 'ham', 'foo', 'bar', 'foo', 'ham']
+ assert_(np.all(unique(x) == ['bar', 'foo', 'ham', 'widget']))
+ x = np.array([5 + 6j, 1 + 1j, 1 + 10j, 10, 5 + 6j])
+ assert_(np.all(unique(x) == [1 + 1j, 1 + 10j, 5 + 6j, 10]))
+
+
+class TestCheckFinite(object):
+
+ def test_simple(self):
+ a = [1, 2, 3]
+ b = [1, 2, np.inf]
+ c = [1, 2, np.nan]
+ np.lib.asarray_chkfinite(a)
+ assert_raises(ValueError, np.lib.asarray_chkfinite, b)
+ assert_raises(ValueError, np.lib.asarray_chkfinite, c)
+
+ def test_dtype_order(self):
+ # Regression test for missing dtype and order arguments
+ a = [1, 2, 3]
+ a = np.lib.asarray_chkfinite(a, order='F', dtype=np.float64)
+ assert_(a.dtype == np.float64)
+
+
+class TestCorrCoef(object):
+ A = np.array(
+ [[0.15391142, 0.18045767, 0.14197213],
+ [0.70461506, 0.96474128, 0.27906989],
+ [0.9297531, 0.32296769, 0.19267156]])
+ B = np.array(
+ [[0.10377691, 0.5417086, 0.49807457],
+ [0.82872117, 0.77801674, 0.39226705],
+ [0.9314666, 0.66800209, 0.03538394]])
+ res1 = np.array(
+ [[1., 0.9379533, -0.04931983],
+ [0.9379533, 1., 0.30007991],
+ [-0.04931983, 0.30007991, 1.]])
+ res2 = np.array(
+ [[1., 0.9379533, -0.04931983, 0.30151751, 0.66318558, 0.51532523],
+ [0.9379533, 1., 0.30007991, -0.04781421, 0.88157256, 0.78052386],
+ [-0.04931983, 0.30007991, 1., -0.96717111, 0.71483595, 0.83053601],
+ [0.30151751, -0.04781421, -0.96717111, 1., -0.51366032, -0.66173113],
+ [0.66318558, 0.88157256, 0.71483595, -0.51366032, 1., 0.98317823],
+ [0.51532523, 0.78052386, 0.83053601, -0.66173113, 0.98317823, 1.]])
+
+ def test_non_array(self):
+ assert_almost_equal(np.corrcoef([0, 1, 0], [1, 0, 1]),
+ [[1., -1.], [-1., 1.]])
+
+ def test_simple(self):
+ tgt1 = corrcoef(self.A)
+ assert_almost_equal(tgt1, self.res1)
+ assert_(np.all(np.abs(tgt1) <= 1.0))
+
+ tgt2 = corrcoef(self.A, self.B)
+ assert_almost_equal(tgt2, self.res2)
+ assert_(np.all(np.abs(tgt2) <= 1.0))
+
+ def test_ddof(self):
+ # ddof raises DeprecationWarning
+ with suppress_warnings() as sup:
+ warnings.simplefilter("always")
+ assert_warns(DeprecationWarning, corrcoef, self.A, ddof=-1)
+ sup.filter(DeprecationWarning)
+ # ddof has no or negligible effect on the function
+ assert_almost_equal(corrcoef(self.A, ddof=-1), self.res1)
+ assert_almost_equal(corrcoef(self.A, self.B, ddof=-1), self.res2)
+ assert_almost_equal(corrcoef(self.A, ddof=3), self.res1)
+ assert_almost_equal(corrcoef(self.A, self.B, ddof=3), self.res2)
+
+ def test_bias(self):
+ # bias raises DeprecationWarning
+ with suppress_warnings() as sup:
+ warnings.simplefilter("always")
+ assert_warns(DeprecationWarning, corrcoef, self.A, self.B, 1, 0)
+ assert_warns(DeprecationWarning, corrcoef, self.A, bias=0)
+ sup.filter(DeprecationWarning)
+ # bias has no or negligible effect on the function
+ assert_almost_equal(corrcoef(self.A, bias=1), self.res1)
+
+ def test_complex(self):
+ x = np.array([[1, 2, 3], [1j, 2j, 3j]])
+ res = corrcoef(x)
+ tgt = np.array([[1., -1.j], [1.j, 1.]])
+ assert_allclose(res, tgt)
+ assert_(np.all(np.abs(res) <= 1.0))
+
+ def test_xy(self):
+ x = np.array([[1, 2, 3]])
+ y = np.array([[1j, 2j, 3j]])
+ assert_allclose(np.corrcoef(x, y), np.array([[1., -1.j], [1.j, 1.]]))
+
+ def test_empty(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ assert_array_equal(corrcoef(np.array([])), np.nan)
+ assert_array_equal(corrcoef(np.array([]).reshape(0, 2)),
+ np.array([]).reshape(0, 0))
+ assert_array_equal(corrcoef(np.array([]).reshape(2, 0)),
+ np.array([[np.nan, np.nan], [np.nan, np.nan]]))
+
+ def test_extreme(self):
+ x = [[1e-100, 1e100], [1e100, 1e-100]]
+ with np.errstate(all='raise'):
+ c = corrcoef(x)
+ assert_array_almost_equal(c, np.array([[1., -1.], [-1., 1.]]))
+ assert_(np.all(np.abs(c) <= 1.0))
+
+
+class TestCov(object):
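+ # Fixture notes: fweights are integer repeat counts, so covariance of
+ # x2 with fweights=frequencies must match covariance of x2_repeats;
+ # aweights are relative observation weights that also rescale the
+ # normalization. res1, res2 and res3 are the expected covariances of x1
+ # unweighted, frequency-weighted and aweight-weighted, respectively.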
+ x1 = np.array([[0, 2], [1, 1], [2, 0]]).T
+ res1 = np.array([[1., -1.], [-1., 1.]])
+ x2 = np.array([0.0, 1.0, 2.0], ndmin=2)
+ frequencies = np.array([1, 4, 1])
+ x2_repeats = np.array([[0.0], [1.0], [1.0], [1.0], [1.0], [2.0]]).T
+ res2 = np.array([[0.4, -0.4], [-0.4, 0.4]])
+ unit_frequencies = np.ones(3, dtype=np.integer)
+ weights = np.array([1.0, 4.0, 1.0])
+ res3 = np.array([[2. / 3., -2. / 3.], [-2. / 3., 2. / 3.]])
+ unit_weights = np.ones(3)
+ x3 = np.array([0.3942, 0.5969, 0.7730, 0.9918, 0.7964])
+
+ def test_basic(self):
+ assert_allclose(cov(self.x1), self.res1)
+
+ def test_complex(self):
+ x = np.array([[1, 2, 3], [1j, 2j, 3j]])
+ res = np.array([[1., -1.j], [1.j, 1.]])
+ assert_allclose(cov(x), res)
+ assert_allclose(cov(x, aweights=np.ones(3)), res)
+
+ def test_xy(self):
+ x = np.array([[1, 2, 3]])
+ y = np.array([[1j, 2j, 3j]])
+ assert_allclose(cov(x, y), np.array([[1., -1.j], [1.j, 1.]]))
+
+ def test_empty(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ assert_array_equal(cov(np.array([])), np.nan)
+ assert_array_equal(cov(np.array([]).reshape(0, 2)),
+ np.array([]).reshape(0, 0))
+ assert_array_equal(cov(np.array([]).reshape(2, 0)),
+ np.array([[np.nan, np.nan], [np.nan, np.nan]]))
+
+ def test_wrong_ddof(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ assert_array_equal(cov(self.x1, ddof=5),
+ np.array([[np.inf, -np.inf],
+ [-np.inf, np.inf]]))
+
+ def test_1D_rowvar(self):
+ assert_allclose(cov(self.x3), cov(self.x3, rowvar=0))
+ y = np.array([0.0780, 0.3107, 0.2111, 0.0334, 0.8501])
+ assert_allclose(cov(self.x3, y), cov(self.x3, y, rowvar=0))
+
+ def test_1D_variance(self):
+ assert_allclose(cov(self.x3, ddof=1), np.var(self.x3, ddof=1))
+
+ def test_fweights(self):
+ assert_allclose(cov(self.x2, fweights=self.frequencies),
+ cov(self.x2_repeats))
+ assert_allclose(cov(self.x1, fweights=self.frequencies),
+ self.res2)
+ assert_allclose(cov(self.x1, fweights=self.unit_frequencies),
+ self.res1)
+ nonint = self.frequencies + 0.5
+ assert_raises(TypeError, cov, self.x1, fweights=nonint)
+ f = np.ones((2, 3), dtype=np.integer)
+ assert_raises(RuntimeError, cov, self.x1, fweights=f)
+ f = np.ones(2, dtype=np.integer)
+ assert_raises(RuntimeError, cov, self.x1, fweights=f)
+ f = -1 * np.ones(3, dtype=np.integer)
+ assert_raises(ValueError, cov, self.x1, fweights=f)
+
+ def test_aweights(self):
+ assert_allclose(cov(self.x1, aweights=self.weights), self.res3)
+ assert_allclose(cov(self.x1, aweights=3.0 * self.weights),
+ cov(self.x1, aweights=self.weights))
+ assert_allclose(cov(self.x1, aweights=self.unit_weights), self.res1)
+ w = np.ones((2, 3))
+ assert_raises(RuntimeError, cov, self.x1, aweights=w)
+ w = np.ones(2)
+ assert_raises(RuntimeError, cov, self.x1, aweights=w)
+ w = -1.0 * np.ones(3)
+ assert_raises(ValueError, cov, self.x1, aweights=w)
+
+ def test_unit_fweights_and_aweights(self):
+ assert_allclose(cov(self.x2, fweights=self.frequencies,
+ aweights=self.unit_weights),
+ cov(self.x2_repeats))
+ assert_allclose(cov(self.x1, fweights=self.frequencies,
+ aweights=self.unit_weights),
+ self.res2)
+ assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+ aweights=self.unit_weights),
+ self.res1)
+ assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+ aweights=self.weights),
+ self.res3)
+ assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+ aweights=3.0 * self.weights),
+ cov(self.x1, aweights=self.weights))
+ assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+ aweights=self.unit_weights),
+ self.res1)
+
+
+class Test_I0(object):
+
+ def test_simple(self):
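+ # np.i0 is the zeroth-order modified Bessel function of the first
+ # kind, applied elementwise; i0(0) == 1 and it increases for x > 0.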
+ assert_almost_equal(
+ i0(0.5),
+ np.array(1.0634833707413234))
+
+ A = np.array([0.49842636, 0.6969809, 0.22011976, 0.0155549])
+ assert_almost_equal(
+ i0(A),
+ np.array([1.06307822, 1.12518299, 1.01214991, 1.00006049]))
+
+ B = np.array([[0.827002, 0.99959078],
+ [0.89694769, 0.39298162],
+ [0.37954418, 0.05206293],
+ [0.36465447, 0.72446427],
+ [0.48164949, 0.50324519]])
+ assert_almost_equal(
+ i0(B),
+ np.array([[1.17843223, 1.26583466],
+ [1.21147086, 1.03898290],
+ [1.03633899, 1.00067775],
+ [1.03352052, 1.13557954],
+ [1.05884290, 1.06432317]]))
+
+
+class TestKaiser(object):
+
+ def test_simple(self):
+ assert_(np.isfinite(kaiser(1, 1.0)))
+ assert_almost_equal(kaiser(0, 1.0),
+ np.array([]))
+ assert_almost_equal(kaiser(2, 1.0),
+ np.array([0.78984831, 0.78984831]))
+ assert_almost_equal(kaiser(5, 1.0),
+ np.array([0.78984831, 0.94503323, 1.,
+ 0.94503323, 0.78984831]))
+ assert_almost_equal(kaiser(5, 1.56789),
+ np.array([0.58285404, 0.88409679, 1.,
+ 0.88409679, 0.58285404]))
+
+ def test_int_beta(self):
+ kaiser(3, 4)
+
+
+class TestMsort(object):
+
+ def test_simple(self):
+ A = np.array([[0.44567325, 0.79115165, 0.54900530],
+ [0.36844147, 0.37325583, 0.96098397],
+ [0.64864341, 0.52929049, 0.39172155]])
+ assert_almost_equal(
+ msort(A),
+ np.array([[0.36844147, 0.37325583, 0.39172155],
+ [0.44567325, 0.52929049, 0.54900530],
+ [0.64864341, 0.79115165, 0.96098397]]))
+
+
+class TestMeshgrid(object):
+
+ def test_simple(self):
+ [X, Y] = meshgrid([1, 2, 3], [4, 5, 6, 7])
+ assert_array_equal(X, np.array([[1, 2, 3],
+ [1, 2, 3],
+ [1, 2, 3],
+ [1, 2, 3]]))
+ assert_array_equal(Y, np.array([[4, 4, 4],
+ [5, 5, 5],
+ [6, 6, 6],
+ [7, 7, 7]]))
+
+ def test_single_input(self):
+ [X] = meshgrid([1, 2, 3, 4])
+ assert_array_equal(X, np.array([1, 2, 3, 4]))
+
+ def test_no_input(self):
+ args = []
+ assert_array_equal([], meshgrid(*args))
+ assert_array_equal([], meshgrid(*args, copy=False))
+
+ def test_indexing(self):
+ x = [1, 2, 3]
+ y = [4, 5, 6, 7]
+ [X, Y] = meshgrid(x, y, indexing='ij')
+ assert_array_equal(X, np.array([[1, 1, 1, 1],
+ [2, 2, 2, 2],
+ [3, 3, 3, 3]]))
+ assert_array_equal(Y, np.array([[4, 5, 6, 7],
+ [4, 5, 6, 7],
+ [4, 5, 6, 7]]))
+
+ # Test expected shapes:
+ z = [8, 9]
+ assert_(meshgrid(x, y)[0].shape == (4, 3))
+ assert_(meshgrid(x, y, indexing='ij')[0].shape == (3, 4))
+ assert_(meshgrid(x, y, z)[0].shape == (4, 3, 2))
+ assert_(meshgrid(x, y, z, indexing='ij')[0].shape == (3, 4, 2))
+
+ assert_raises(ValueError, meshgrid, x, y, indexing='notvalid')
+
+ def test_sparse(self):
+ [X, Y] = meshgrid([1, 2, 3], [4, 5, 6, 7], sparse=True)
+ assert_array_equal(X, np.array([[1, 2, 3]]))
+ assert_array_equal(Y, np.array([[4], [5], [6], [7]]))
+
+ def test_invalid_arguments(self):
+ # Test that meshgrid complains about invalid arguments
+ # Regression test for issue #4755:
+ # https://github.com/numpy/numpy/issues/4755
+ assert_raises(TypeError, meshgrid,
+ [1, 2, 3], [4, 5, 6, 7], indices='ij')
+
+ def test_return_type(self):
+ # Test for appropriate dtype in returned arrays.
+ # Regression test for issue #5297
+ # https://github.com/numpy/numpy/issues/5297
+ x = np.arange(0, 10, dtype=np.float32)
+ y = np.arange(10, 20, dtype=np.float64)
+
+ X, Y = np.meshgrid(x,y)
+
+ assert_(X.dtype == x.dtype)
+ assert_(Y.dtype == y.dtype)
+
+ # copy
+ X, Y = np.meshgrid(x,y, copy=True)
+
+ assert_(X.dtype == x.dtype)
+ assert_(Y.dtype == y.dtype)
+
+ # sparse
+ X, Y = np.meshgrid(x,y, sparse=True)
+
+ assert_(X.dtype == x.dtype)
+ assert_(Y.dtype == y.dtype)
+
+ def test_writeback(self):
+ # Issue 8561
+ X = np.array([1.1, 2.2])
+ Y = np.array([3.3, 4.4])
+ x, y = np.meshgrid(X, Y, sparse=False, copy=True)
+
+ x[0, :] = 0
+ assert_equal(x[0, :], 0)
+ assert_equal(x[1, :], X)
+
+
+class TestPiecewise(object):
+
+ def test_simple(self):
+ # Condition is single bool list
+ x = piecewise([0, 0], [True, False], [1])
+ assert_array_equal(x, [1, 0])
+
+ # List of conditions: single bool list
+ x = piecewise([0, 0], [[True, False]], [1])
+ assert_array_equal(x, [1, 0])
+
+ # Condition is a single bool array
+ x = piecewise([0, 0], np.array([True, False]), [1])
+ assert_array_equal(x, [1, 0])
+
+ # Condition is single int array
+ x = piecewise([0, 0], np.array([1, 0]), [1])
+ assert_array_equal(x, [1, 0])
+
+ # List of conditions: int array
+ x = piecewise([0, 0], [np.array([1, 0])], [1])
+ assert_array_equal(x, [1, 0])
+
+ x = piecewise([0, 0], [[False, True]], [lambda x:-1])
+ assert_array_equal(x, [0, -1])
+
+ assert_raises_regex(ValueError, '1 or 2 functions are expected',
+ piecewise, [0, 0], [[False, True]], [])
+ assert_raises_regex(ValueError, '1 or 2 functions are expected',
+ piecewise, [0, 0], [[False, True]], [1, 2, 3])
+
+ def test_two_conditions(self):
+ x = piecewise([1, 2], [[True, False], [False, True]], [3, 4])
+ assert_array_equal(x, [3, 4])
+
+ def test_scalar_domains_three_conditions(self):
+ x = piecewise(3, [True, False, False], [4, 2, 0])
+ assert_equal(x, 4)
+
+ def test_default(self):
+ # No value specified for x[1], should be 0
+ x = piecewise([1, 2], [True, False], [2])
+ assert_array_equal(x, [2, 0])
+
+ # Should set x[1] to 3
+ x = piecewise([1, 2], [True, False], [2, 3])
+ assert_array_equal(x, [2, 3])
+
+ def test_0d(self):
+ x = np.array(3)
+ y = piecewise(x, x > 3, [4, 0])
+ assert_(y.ndim == 0)
+ assert_(y == 0)
+
+ x = 5
+ y = piecewise(x, [True, False], [1, 0])
+ assert_(y.ndim == 0)
+ assert_(y == 1)
+
+ # With 3 ranges (this used to fail)
+ y = piecewise(x, [False, False, True], [1, 2, 3])
+ assert_array_equal(y, 3)
+
+ def test_0d_comparison(self):
+ x = 3
+ y = piecewise(x, [x <= 3, x > 3], [4, 0]) # Should succeed.
+ assert_equal(y, 4)
+
+ # With 3 ranges (this used to fail)
+ x = 4
+ y = piecewise(x, [x <= 3, (x > 3) * (x <= 5), x > 5], [1, 2, 3])
+ assert_array_equal(y, 2)
+
+ assert_raises_regex(ValueError, '2 or 3 functions are expected',
+ piecewise, x, [x <= 3, x > 3], [1])
+ assert_raises_regex(ValueError, '2 or 3 functions are expected',
+ piecewise, x, [x <= 3, x > 3], [1, 1, 1, 1])
+
+ def test_0d_0d_condition(self):
+ x = np.array(3)
+ c = np.array(x > 3)
+ y = piecewise(x, [c], [1, 2])
+ assert_equal(y, 2)
+
+ def test_multidimensional_extrafunc(self):
+ x = np.array([[-2.5, -1.5, -0.5],
+ [0.5, 1.5, 2.5]])
+ y = piecewise(x, [x < 0, x >= 2], [-1, 1, 3])
+ assert_array_equal(y, np.array([[-1., -1., -1.],
+ [3., 3., 1.]]))
+
+
+class TestBincount(object):
+
+ def test_simple(self):
+ y = np.bincount(np.arange(4))
+ assert_array_equal(y, np.ones(4))
+
+ def test_simple2(self):
+ y = np.bincount(np.array([1, 5, 2, 4, 1]))
+ assert_array_equal(y, np.array([0, 2, 1, 0, 1, 1]))
+
+ def test_simple_weight(self):
+ x = np.arange(4)
+ w = np.array([0.2, 0.3, 0.5, 0.1])
+ y = np.bincount(x, w)
+ assert_array_equal(y, w)
+
+ def test_simple_weight2(self):
+ x = np.array([1, 2, 4, 5, 2])
+ w = np.array([0.2, 0.3, 0.5, 0.1, 0.2])
+ y = np.bincount(x, w)
+ assert_array_equal(y, np.array([0, 0.2, 0.5, 0, 0.5, 0.1]))
+
+ def test_with_minlength(self):
+ x = np.array([0, 1, 0, 1, 1])
+ y = np.bincount(x, minlength=3)
+ assert_array_equal(y, np.array([2, 3, 0]))
+ x = []
+ y = np.bincount(x, minlength=0)
+ assert_array_equal(y, np.array([]))
+
+ def test_with_minlength_smaller_than_maxvalue(self):
+ x = np.array([0, 1, 1, 2, 2, 3, 3])
+ y = np.bincount(x, minlength=2)
+ assert_array_equal(y, np.array([1, 2, 2, 2]))
+ y = np.bincount(x, minlength=0)
+ assert_array_equal(y, np.array([1, 2, 2, 2]))
+
+ def test_with_minlength_and_weights(self):
+ x = np.array([1, 2, 4, 5, 2])
+ w = np.array([0.2, 0.3, 0.5, 0.1, 0.2])
+ y = np.bincount(x, w, 8)
+ assert_array_equal(y, np.array([0, 0.2, 0.5, 0, 0.5, 0.1, 0, 0]))
+
+ def test_empty(self):
+ x = np.array([], dtype=int)
+ y = np.bincount(x)
+ assert_array_equal(x, y)
+
+ def test_empty_with_minlength(self):
+ x = np.array([], dtype=int)
+ y = np.bincount(x, minlength=5)
+ assert_array_equal(y, np.zeros(5, dtype=int))
+
+ def test_with_incorrect_minlength(self):
+ x = np.array([], dtype=int)
+ assert_raises_regex(TypeError,
+ "'str' object cannot be interpreted",
+ lambda: np.bincount(x, minlength="foobar"))
+ assert_raises_regex(ValueError,
+ "must not be negative",
+ lambda: np.bincount(x, minlength=-1))
+
+ x = np.arange(5)
+ assert_raises_regex(TypeError,
+ "'str' object cannot be interpreted",
+ lambda: np.bincount(x, minlength="foobar"))
+ assert_raises_regex(ValueError,
+ "must not be negative",
+ lambda: np.bincount(x, minlength=-1))
+
+ @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+ def test_dtype_reference_leaks(self):
+ # gh-6805
+ intp_refcount = sys.getrefcount(np.dtype(np.intp))
+ double_refcount = sys.getrefcount(np.dtype(np.double))
+
+ for j in range(10):
+ np.bincount([1, 2, 3])
+ assert_equal(sys.getrefcount(np.dtype(np.intp)), intp_refcount)
+ assert_equal(sys.getrefcount(np.dtype(np.double)), double_refcount)
+
+ for j in range(10):
+ np.bincount([1, 2, 3], [4, 5, 6])
+ assert_equal(sys.getrefcount(np.dtype(np.intp)), intp_refcount)
+ assert_equal(sys.getrefcount(np.dtype(np.double)), double_refcount)
+
+
+class TestInterp(object):
+
+ def test_exceptions(self):
+ assert_raises(ValueError, interp, 0, [], [])
+ assert_raises(ValueError, interp, 0, [0], [1, 2])
+ assert_raises(ValueError, interp, 0, [0, 1], [1, 2], period=0)
+ assert_raises(ValueError, interp, 0, [], [], period=360)
+ assert_raises(ValueError, interp, 0, [0], [1, 2], period=360)
+
+ def test_basic(self):
+ x = np.linspace(0, 1, 5)
+ y = np.linspace(0, 1, 5)
+ x0 = np.linspace(0, 1, 50)
+ assert_almost_equal(np.interp(x0, x, y), x0)
+
+ def test_right_left_behavior(self):
+ # Needs a range of sizes to test different code paths.
+ # size ==1 is special cased, 1 < size < 5 is linear search, and
+ # size >= 5 goes through local search and possibly binary search.
+ for size in range(1, 10):
+ xp = np.arange(size, dtype=np.double)
+ yp = np.ones(size, dtype=np.double)
+ incpts = np.array([-1, 0, size - 1, size], dtype=np.double)
+ decpts = incpts[::-1]
+
+ incres = interp(incpts, xp, yp)
+ decres = interp(decpts, xp, yp)
+ inctgt = np.array([1, 1, 1, 1], dtype=float)
+ dectgt = inctgt[::-1]
+ assert_equal(incres, inctgt)
+ assert_equal(decres, dectgt)
+
+ incres = interp(incpts, xp, yp, left=0)
+ decres = interp(decpts, xp, yp, left=0)
+ inctgt = np.array([0, 1, 1, 1], dtype=float)
+ dectgt = inctgt[::-1]
+ assert_equal(incres, inctgt)
+ assert_equal(decres, dectgt)
+
+ incres = interp(incpts, xp, yp, right=2)
+ decres = interp(decpts, xp, yp, right=2)
+ inctgt = np.array([1, 1, 1, 2], dtype=float)
+ dectgt = inctgt[::-1]
+ assert_equal(incres, inctgt)
+ assert_equal(decres, dectgt)
+
+ incres = interp(incpts, xp, yp, left=0, right=2)
+ decres = interp(decpts, xp, yp, left=0, right=2)
+ inctgt = np.array([0, 1, 1, 2], dtype=float)
+ dectgt = inctgt[::-1]
+ assert_equal(incres, inctgt)
+ assert_equal(decres, dectgt)
+
+ def test_scalar_interpolation_point(self):
+ x = np.linspace(0, 1, 5)
+ y = np.linspace(0, 1, 5)
+ x0 = 0
+ assert_almost_equal(np.interp(x0, x, y), x0)
+ x0 = .3
+ assert_almost_equal(np.interp(x0, x, y), x0)
+ x0 = np.float32(.3)
+ assert_almost_equal(np.interp(x0, x, y), x0)
+ x0 = np.float64(.3)
+ assert_almost_equal(np.interp(x0, x, y), x0)
+ x0 = np.nan
+ assert_almost_equal(np.interp(x0, x, y), x0)
+
+ def test_non_finite_behavior(self):
+ x = [1, 2, 2.5, 3, 4]
+ xp = [1, 2, 3, 4]
+ fp = [1, 2, np.inf, 4]
+ assert_almost_equal(np.interp(x, xp, fp), [1, 2, np.inf, np.inf, 4])
+ fp = [1, 2, np.nan, 4]
+ assert_almost_equal(np.interp(x, xp, fp), [1, 2, np.nan, np.nan, 4])
+
+ def test_complex_interp(self):
+ # test complex interpolation
+ x = np.linspace(0, 1, 5)
+ y = np.linspace(0, 1, 5) + (1 + np.linspace(0, 1, 5))*1.0j
+ x0 = 0.3
+ y0 = x0 + (1+x0)*1.0j
+ assert_almost_equal(np.interp(x0, x, y), y0)
+ # test complex left and right
+ x0 = -1
+ left = 2 + 3.0j
+ assert_almost_equal(np.interp(x0, x, y, left=left), left)
+ x0 = 2.0
+ right = 2 + 3.0j
+ assert_almost_equal(np.interp(x0, x, y, right=right), right)
+ # test complex non finite
+ x = [1, 2, 2.5, 3, 4]
+ xp = [1, 2, 3, 4]
+ fp = [1, 2+1j, np.inf, 4]
+ y = [1, 2+1j, np.inf+0.5j, np.inf, 4]
+ assert_almost_equal(np.interp(x, xp, fp), y)
+ # test complex periodic
+ x = [-180, -170, -185, 185, -10, -5, 0, 365]
+ xp = [190, -190, 350, -350]
+ fp = [5+1.0j, 10+2j, 3+3j, 4+4j]
+ y = [7.5+1.5j, 5.+1.0j, 8.75+1.75j, 6.25+1.25j, 3.+3j, 3.25+3.25j,
+ 3.5+3.5j, 3.75+3.75j]
+ assert_almost_equal(np.interp(x, xp, fp, period=360), y)
+
+ def test_zero_dimensional_interpolation_point(self):
+ x = np.linspace(0, 1, 5)
+ y = np.linspace(0, 1, 5)
+ x0 = np.array(.3)
+ assert_almost_equal(np.interp(x0, x, y), x0)
+
+ xp = np.array([0, 2, 4])
+ fp = np.array([1, -1, 1])
+
+ actual = np.interp(np.array(1), xp, fp)
+ assert_equal(actual, 0)
+ assert_(isinstance(actual, np.float64))
+
+ actual = np.interp(np.array(4.5), xp, fp, period=4)
+ assert_equal(actual, 0.5)
+ assert_(isinstance(actual, np.float64))
+
+ def test_if_len_x_is_small(self):
+ xp = np.arange(0, 10, 0.0001)
+ fp = np.sin(xp)
+ assert_almost_equal(np.interp(np.pi, xp, fp), 0.0)
+
+ def test_period(self):
+ x = [-180, -170, -185, 185, -10, -5, 0, 365]
+ xp = [190, -190, 350, -350]
+ fp = [5, 10, 3, 4]
+ y = [7.5, 5., 8.75, 6.25, 3., 3.25, 3.5, 3.75]
+ assert_almost_equal(np.interp(x, xp, fp, period=360), y)
+ x = np.array(x, order='F').reshape(2, -1)
+ y = np.array(y, order='C').reshape(2, -1)
+ assert_almost_equal(np.interp(x, xp, fp, period=360), y)
+
+
+def compare_results(res, desired):
+ for i in range(len(desired)):
+ assert_array_equal(res[i], desired[i])
+
+
+class TestPercentile(object):
+
+ def test_basic(self):
+ x = np.arange(8) * 0.5
+ assert_equal(np.percentile(x, 0), 0.)
+ assert_equal(np.percentile(x, 100), 3.5)
+ assert_equal(np.percentile(x, 50), 1.75)
+ x[1] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(x, 0), np.nan)
+ assert_equal(np.percentile(x, 0, interpolation='nearest'), np.nan)
+ assert_(w[0].category is RuntimeWarning)
+
+ def test_api(self):
+ d = np.ones(5)
+ np.percentile(d, 5, None, None, False)
+ np.percentile(d, 5, None, None, False, 'linear')
+ o = np.ones((1,))
+ np.percentile(d, 5, None, o, False, 'linear')
+
+ def test_2D(self):
+ x = np.array([[1, 1, 1],
+ [1, 1, 1],
+ [4, 4, 3],
+ [1, 1, 1],
+ [1, 1, 1]])
+ assert_array_equal(np.percentile(x, 50, axis=0), [1, 1, 1])
+
+ def test_linear(self):
+
+ # Test defaults
+ assert_equal(np.percentile(range(10), 50), 4.5)
+
+ # explicitly specify interpolation_method 'linear' (the default)
+ assert_equal(np.percentile(range(10), 50,
+ interpolation='linear'), 4.5)
+
+ def test_lower_higher(self):
+
+ # interpolation_method 'lower'/'higher'
+ assert_equal(np.percentile(range(10), 50,
+ interpolation='lower'), 4)
+ assert_equal(np.percentile(range(10), 50,
+ interpolation='higher'), 5)
+
+ def test_midpoint(self):
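+ # 'midpoint' returns the average of the two bracketing order
+ # statistics: the 51st percentile of range(10) falls between 4 and 5,
+ # giving 4.5.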
+ assert_equal(np.percentile(range(10), 51,
+ interpolation='midpoint'), 4.5)
+ assert_equal(np.percentile(range(11), 51,
+ interpolation='midpoint'), 5.5)
+ assert_equal(np.percentile(range(11), 50,
+ interpolation='midpoint'), 5)
+
+ def test_nearest(self):
+ assert_equal(np.percentile(range(10), 51,
+ interpolation='nearest'), 5)
+ assert_equal(np.percentile(range(10), 49,
+ interpolation='nearest'), 4)
+
+ def test_sequence(self):
+ x = np.arange(8) * 0.5
+ assert_equal(np.percentile(x, [0, 100, 50]), [0, 3.5, 1.75])
+
+ def test_axis(self):
+ x = np.arange(12).reshape(3, 4)
+
+ assert_equal(np.percentile(x, (25, 50, 100)), [2.75, 5.5, 11.0])
+
+ r0 = [[2, 3, 4, 5], [4, 5, 6, 7], [8, 9, 10, 11]]
+ assert_equal(np.percentile(x, (25, 50, 100), axis=0), r0)
+
+ r1 = [[0.75, 1.5, 3], [4.75, 5.5, 7], [8.75, 9.5, 11]]
+ assert_equal(np.percentile(x, (25, 50, 100), axis=1), np.array(r1).T)
+
+ # ensure the q-th axis is always first, as with np.array(old_percentile(..))
+ x = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
+ assert_equal(np.percentile(x, (25, 50)).shape, (2,))
+ assert_equal(np.percentile(x, (25, 50, 75)).shape, (3,))
+ assert_equal(np.percentile(x, (25, 50), axis=0).shape, (2, 4, 5, 6))
+ assert_equal(np.percentile(x, (25, 50), axis=1).shape, (2, 3, 5, 6))
+ assert_equal(np.percentile(x, (25, 50), axis=2).shape, (2, 3, 4, 6))
+ assert_equal(np.percentile(x, (25, 50), axis=3).shape, (2, 3, 4, 5))
+ assert_equal(
+ np.percentile(x, (25, 50, 75), axis=1).shape, (3, 3, 5, 6))
+ assert_equal(np.percentile(x, (25, 50),
+ interpolation="higher").shape, (2,))
+ assert_equal(np.percentile(x, (25, 50, 75),
+ interpolation="higher").shape, (3,))
+ assert_equal(np.percentile(x, (25, 50), axis=0,
+ interpolation="higher").shape, (2, 4, 5, 6))
+ assert_equal(np.percentile(x, (25, 50), axis=1,
+ interpolation="higher").shape, (2, 3, 5, 6))
+ assert_equal(np.percentile(x, (25, 50), axis=2,
+ interpolation="higher").shape, (2, 3, 4, 6))
+ assert_equal(np.percentile(x, (25, 50), axis=3,
+ interpolation="higher").shape, (2, 3, 4, 5))
+ assert_equal(np.percentile(x, (25, 50, 75), axis=1,
+ interpolation="higher").shape, (3, 3, 5, 6))
+
+ def test_scalar_q(self):
+ # test for no empty dimensions for compatibility with old percentile
+ x = np.arange(12).reshape(3, 4)
+ assert_equal(np.percentile(x, 50), 5.5)
+ assert_(np.isscalar(np.percentile(x, 50)))
+ r0 = np.array([4., 5., 6., 7.])
+ assert_equal(np.percentile(x, 50, axis=0), r0)
+ assert_equal(np.percentile(x, 50, axis=0).shape, r0.shape)
+ r1 = np.array([1.5, 5.5, 9.5])
+ assert_almost_equal(np.percentile(x, 50, axis=1), r1)
+ assert_equal(np.percentile(x, 50, axis=1).shape, r1.shape)
+
+ out = np.empty(1)
+ assert_equal(np.percentile(x, 50, out=out), 5.5)
+ assert_equal(out, 5.5)
+ out = np.empty(4)
+ assert_equal(np.percentile(x, 50, axis=0, out=out), r0)
+ assert_equal(out, r0)
+ out = np.empty(3)
+ assert_equal(np.percentile(x, 50, axis=1, out=out), r1)
+ assert_equal(out, r1)
+
+ # test for no empty dimensions for compatibility with old percentile
+ x = np.arange(12).reshape(3, 4)
+ assert_equal(np.percentile(x, 50, interpolation='lower'), 5.)
+ assert_(np.isscalar(np.percentile(x, 50)))
+ r0 = np.array([4., 5., 6., 7.])
+ c0 = np.percentile(x, 50, interpolation='lower', axis=0)
+ assert_equal(c0, r0)
+ assert_equal(c0.shape, r0.shape)
+ r1 = np.array([1., 5., 9.])
+ c1 = np.percentile(x, 50, interpolation='lower', axis=1)
+ assert_almost_equal(c1, r1)
+ assert_equal(c1.shape, r1.shape)
+
+ out = np.empty((), dtype=x.dtype)
+ c = np.percentile(x, 50, interpolation='lower', out=out)
+ assert_equal(c, 5)
+ assert_equal(out, 5)
+ out = np.empty(4, dtype=x.dtype)
+ c = np.percentile(x, 50, interpolation='lower', axis=0, out=out)
+ assert_equal(c, r0)
+ assert_equal(out, r0)
+ out = np.empty(3, dtype=x.dtype)
+ c = np.percentile(x, 50, interpolation='lower', axis=1, out=out)
+ assert_equal(c, r1)
+ assert_equal(out, r1)
+
+ def test_exception(self):
+ assert_raises(ValueError, np.percentile, [1, 2], 56,
+ interpolation='foobar')
+ assert_raises(ValueError, np.percentile, [1], 101)
+ assert_raises(ValueError, np.percentile, [1], -1)
+ assert_raises(ValueError, np.percentile, [1], list(range(50)) + [101])
+ assert_raises(ValueError, np.percentile, [1], list(range(50)) + [-0.1])
+
+ def test_percentile_list(self):
+ assert_equal(np.percentile([1, 2, 3], 0), 1)
+
+ def test_percentile_out(self):
+ x = np.array([1, 2, 3])
+ y = np.zeros((3,))
+ p = (1, 2, 3)
+ np.percentile(x, p, out=y)
+ assert_equal(y, np.percentile(x, p))
+
+ x = np.array([[1, 2, 3],
+ [4, 5, 6]])
+
+ y = np.zeros((3, 3))
+ np.percentile(x, p, axis=0, out=y)
+ assert_equal(y, np.percentile(x, p, axis=0))
+
+ y = np.zeros((3, 2))
+ np.percentile(x, p, axis=1, out=y)
+ assert_equal(y, np.percentile(x, p, axis=1))
+
+ x = np.arange(12).reshape(3, 4)
+ # q.dim > 1, float
+ r0 = np.array([[2., 3., 4., 5.], [4., 5., 6., 7.]])
+ out = np.empty((2, 4))
+ assert_equal(np.percentile(x, (25, 50), axis=0, out=out), r0)
+ assert_equal(out, r0)
+ r1 = np.array([[0.75, 4.75, 8.75], [1.5, 5.5, 9.5]])
+ out = np.empty((2, 3))
+ assert_equal(np.percentile(x, (25, 50), axis=1, out=out), r1)
+ assert_equal(out, r1)
+
+ # q.dim > 1, int
+ r0 = np.array([[0, 1, 2, 3], [4, 5, 6, 7]])
+ out = np.empty((2, 4), dtype=x.dtype)
+ c = np.percentile(x, (25, 50), interpolation='lower', axis=0, out=out)
+ assert_equal(c, r0)
+ assert_equal(out, r0)
+ r1 = np.array([[0, 4, 8], [1, 5, 9]])
+ out = np.empty((2, 3), dtype=x.dtype)
+ c = np.percentile(x, (25, 50), interpolation='lower', axis=1, out=out)
+ assert_equal(c, r1)
+ assert_equal(out, r1)
+
+ def test_percentile_empty_dim(self):
+ # singleton (length-1) dimensions are preserved
+ d = np.arange(11 * 2).reshape(11, 1, 2, 1)
+ assert_array_equal(np.percentile(d, 50, axis=0).shape, (1, 2, 1))
+ assert_array_equal(np.percentile(d, 50, axis=1).shape, (11, 2, 1))
+ assert_array_equal(np.percentile(d, 50, axis=2).shape, (11, 1, 1))
+ assert_array_equal(np.percentile(d, 50, axis=3).shape, (11, 1, 2))
+ assert_array_equal(np.percentile(d, 50, axis=-1).shape, (11, 1, 2))
+ assert_array_equal(np.percentile(d, 50, axis=-2).shape, (11, 1, 1))
+ assert_array_equal(np.percentile(d, 50, axis=-3).shape, (11, 2, 1))
+ assert_array_equal(np.percentile(d, 50, axis=-4).shape, (1, 2, 1))
+
+ assert_array_equal(np.percentile(d, 50, axis=2,
+ interpolation='midpoint').shape,
+ (11, 1, 1))
+ assert_array_equal(np.percentile(d, 50, axis=-2,
+ interpolation='midpoint').shape,
+ (11, 1, 1))
+
+ assert_array_equal(np.array(np.percentile(d, [10, 50], axis=0)).shape,
+ (2, 1, 2, 1))
+ assert_array_equal(np.array(np.percentile(d, [10, 50], axis=1)).shape,
+ (2, 11, 2, 1))
+ assert_array_equal(np.array(np.percentile(d, [10, 50], axis=2)).shape,
+ (2, 11, 1, 1))
+ assert_array_equal(np.array(np.percentile(d, [10, 50], axis=3)).shape,
+ (2, 11, 1, 2))
+
+ def test_percentile_no_overwrite(self):
+ a = np.array([2, 3, 4, 1])
+ np.percentile(a, [50], overwrite_input=False)
+ assert_equal(a, np.array([2, 3, 4, 1]))
+
+ a = np.array([2, 3, 4, 1])
+ np.percentile(a, [50])
+ assert_equal(a, np.array([2, 3, 4, 1]))
+
+ def test_no_p_overwrite(self):
+ p = np.linspace(0., 100., num=5)
+ np.percentile(np.arange(100.), p, interpolation="midpoint")
+ assert_array_equal(p, np.linspace(0., 100., num=5))
+ p = np.linspace(0., 100., num=5).tolist()
+ np.percentile(np.arange(100.), p, interpolation="midpoint")
+ assert_array_equal(p, np.linspace(0., 100., num=5).tolist())
+
+ def test_percentile_overwrite(self):
+ a = np.array([2, 3, 4, 1])
+ b = np.percentile(a, [50], overwrite_input=True)
+ assert_equal(b, np.array([2.5]))
+
+ b = np.percentile([2, 3, 4, 1], [50], overwrite_input=True)
+ assert_equal(b, np.array([2.5]))
+
+ def test_extended_axis(self):
+ o = np.random.normal(size=(71, 23))
+ x = np.dstack([o] * 10)
+ assert_equal(np.percentile(x, 30, axis=(0, 1)), np.percentile(o, 30))
+ x = np.moveaxis(x, -1, 0)
+ assert_equal(np.percentile(x, 30, axis=(-2, -1)), np.percentile(o, 30))
+ x = x.swapaxes(0, 1).copy()
+ assert_equal(np.percentile(x, 30, axis=(0, -1)), np.percentile(o, 30))
+ x = x.swapaxes(0, 1).copy()
+
+ assert_equal(np.percentile(x, [25, 60], axis=(0, 1, 2)),
+ np.percentile(x, [25, 60], axis=None))
+ assert_equal(np.percentile(x, [25, 60], axis=(0,)),
+ np.percentile(x, [25, 60], axis=0))
+
+ d = np.arange(3 * 5 * 7 * 11).reshape((3, 5, 7, 11))
+ np.random.shuffle(d.ravel())
+ assert_equal(np.percentile(d, 25, axis=(0, 1, 2))[0],
+ np.percentile(d[:,:,:, 0].flatten(), 25))
+ assert_equal(np.percentile(d, [10, 90], axis=(0, 1, 3))[:, 1],
+ np.percentile(d[:,:, 1,:].flatten(), [10, 90]))
+ assert_equal(np.percentile(d, 25, axis=(3, 1, -4))[2],
+ np.percentile(d[:,:, 2,:].flatten(), 25))
+ assert_equal(np.percentile(d, 25, axis=(3, 1, 2))[2],
+ np.percentile(d[2,:,:,:].flatten(), 25))
+ assert_equal(np.percentile(d, 25, axis=(3, 2))[2, 1],
+ np.percentile(d[2, 1,:,:].flatten(), 25))
+ assert_equal(np.percentile(d, 25, axis=(1, -2))[2, 1],
+ np.percentile(d[2,:,:, 1].flatten(), 25))
+ assert_equal(np.percentile(d, 25, axis=(1, 3))[2, 2],
+ np.percentile(d[2,:, 2,:].flatten(), 25))
+
+ def test_extended_axis_invalid(self):
+ d = np.ones((3, 5, 7, 11))
+ assert_raises(np.AxisError, np.percentile, d, axis=-5, q=25)
+ assert_raises(np.AxisError, np.percentile, d, axis=(0, -5), q=25)
+ assert_raises(np.AxisError, np.percentile, d, axis=4, q=25)
+ assert_raises(np.AxisError, np.percentile, d, axis=(0, 4), q=25)
+ # each of these refers to the same axis twice
+ assert_raises(ValueError, np.percentile, d, axis=(1, 1), q=25)
+ assert_raises(ValueError, np.percentile, d, axis=(-1, -1), q=25)
+ assert_raises(ValueError, np.percentile, d, axis=(3, -1), q=25)
+
+ def test_keepdims(self):
+ d = np.ones((3, 5, 7, 11))
+ assert_equal(np.percentile(d, 7, axis=None, keepdims=True).shape,
+ (1, 1, 1, 1))
+ assert_equal(np.percentile(d, 7, axis=(0, 1), keepdims=True).shape,
+ (1, 1, 7, 11))
+ assert_equal(np.percentile(d, 7, axis=(0, 3), keepdims=True).shape,
+ (1, 5, 7, 1))
+ assert_equal(np.percentile(d, 7, axis=(1,), keepdims=True).shape,
+ (3, 1, 7, 11))
+ assert_equal(np.percentile(d, 7, (0, 1, 2, 3), keepdims=True).shape,
+ (1, 1, 1, 1))
+ assert_equal(np.percentile(d, 7, axis=(0, 1, 3), keepdims=True).shape,
+ (1, 1, 7, 1))
+
+ assert_equal(np.percentile(d, [1, 7], axis=(0, 1, 3),
+ keepdims=True).shape, (2, 1, 1, 7, 1))
+ assert_equal(np.percentile(d, [1, 7], axis=(0, 3),
+ keepdims=True).shape, (2, 1, 5, 7, 1))
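+ # Informal note: with an array-valued q, the q axis is prepended to
+ # the result, which is why the keepdims shapes above gain a leading
+ # dimension equal to len(q).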
+
+ def test_out(self):
+ o = np.zeros((4,))
+ d = np.ones((3, 4))
+ assert_equal(np.percentile(d, 0, 0, out=o), o)
+ assert_equal(np.percentile(d, 0, 0, interpolation='nearest', out=o), o)
+ o = np.zeros((3,))
+ assert_equal(np.percentile(d, 1, 1, out=o), o)
+ assert_equal(np.percentile(d, 1, 1, interpolation='nearest', out=o), o)
+
+ o = np.zeros(())
+ assert_equal(np.percentile(d, 2, out=o), o)
+ assert_equal(np.percentile(d, 2, interpolation='nearest', out=o), o)
+
+ def test_out_nan(self):
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ o = np.zeros((4,))
+ d = np.ones((3, 4))
+ d[2, 1] = np.nan
+ assert_equal(np.percentile(d, 0, 0, out=o), o)
+ assert_equal(
+ np.percentile(d, 0, 0, interpolation='nearest', out=o), o)
+ o = np.zeros((3,))
+ assert_equal(np.percentile(d, 1, 1, out=o), o)
+ assert_equal(
+ np.percentile(d, 1, 1, interpolation='nearest', out=o), o)
+ o = np.zeros(())
+ assert_equal(np.percentile(d, 1, out=o), o)
+ assert_equal(
+ np.percentile(d, 1, interpolation='nearest', out=o), o)
+
+ def test_nan_behavior(self):
+ a = np.arange(24, dtype=float)
+ a[2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, 0.3), np.nan)
+ assert_equal(np.percentile(a, 0.3, axis=0), np.nan)
+ assert_equal(np.percentile(a, [0.3, 0.6], axis=0),
+ np.array([np.nan] * 2))
+ assert_(w[0].category is RuntimeWarning)
+ assert_(w[1].category is RuntimeWarning)
+ assert_(w[2].category is RuntimeWarning)
+
+ a = np.arange(24, dtype=float).reshape(2, 3, 4)
+ a[1, 2, 3] = np.nan
+ a[1, 1, 2] = np.nan
+
+ # no axis
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, 0.3), np.nan)
+ assert_equal(np.percentile(a, 0.3).ndim, 0)
+ assert_(w[0].category is RuntimeWarning)
+
+ # axis0 zerod
+ b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, 0)
+ b[2, 3] = np.nan
+ b[1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, 0.3, 0), b)
+
+ # axis0 not zerod
+ b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
+ [0.3, 0.6], 0)
+ b[:, 2, 3] = np.nan
+ b[:, 1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, [0.3, 0.6], 0), b)
+
+ # axis1 zerod
+ b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, 1)
+ b[1, 3] = np.nan
+ b[1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, 0.3, 1), b)
+ # axis1 not zerod
+ b = np.percentile(
+ np.arange(24, dtype=float).reshape(2, 3, 4), [0.3, 0.6], 1)
+ b[:, 1, 3] = np.nan
+ b[:, 1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, [0.3, 0.6], 1), b)
+
+ # axis02 zerod
+ b = np.percentile(
+ np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, (0, 2))
+ b[1] = np.nan
+ b[2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, 0.3, (0, 2)), b)
+ # axis02 not zerod
+ b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
+ [0.3, 0.6], (0, 2))
+ b[:, 1] = np.nan
+ b[:, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(a, [0.3, 0.6], (0, 2)), b)
+ # axis02 not zerod with nearest interpolation
+ b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
+ [0.3, 0.6], (0, 2), interpolation='nearest')
+ b[:, 1] = np.nan
+ b[:, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.percentile(
+ a, [0.3, 0.6], (0, 2), interpolation='nearest'), b)
+
+
+class TestQuantile(object):
+ # most of this is already tested by TestPercentile
+
+ def test_basic(self):
+ x = np.arange(8) * 0.5
+ assert_equal(np.quantile(x, 0), 0.)
+ assert_equal(np.quantile(x, 1), 3.5)
+ assert_equal(np.quantile(x, 0.5), 1.75)
+
+ def test_no_p_overwrite(self):
+ # this is worth retesting, because quantile does not make a copy
+ p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
+ p = p0.copy()
+ np.quantile(np.arange(100.), p, interpolation="midpoint")
+ assert_array_equal(p, p0)
+
+ p0 = p0.tolist()
+ p = p.tolist()
+ np.quantile(np.arange(100.), p, interpolation="midpoint")
+ assert_array_equal(p, p0)
+
+
+class TestMedian(object):
+
+ def test_basic(self):
+ a0 = np.array(1)
+ a1 = np.arange(2)
+ a2 = np.arange(6).reshape(2, 3)
+ assert_equal(np.median(a0), 1)
+ assert_allclose(np.median(a1), 0.5)
+ assert_allclose(np.median(a2), 2.5)
+ assert_allclose(np.median(a2, axis=0), [1.5, 2.5, 3.5])
+ assert_equal(np.median(a2, axis=1), [1, 4])
+ assert_allclose(np.median(a2, axis=None), 2.5)
+
+ a = np.array([0.0444502, 0.0463301, 0.141249, 0.0606775])
+ assert_almost_equal((a[1] + a[3]) / 2., np.median(a))
+ a = np.array([0.0463301, 0.0444502, 0.141249])
+ assert_equal(a[0], np.median(a))
+ a = np.array([0.0444502, 0.141249, 0.0463301])
+ assert_equal(a[-1], np.median(a))
+ # check array scalar result
+ assert_equal(np.median(a).ndim, 0)
+ a[1] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a).ndim, 0)
+ assert_(w[0].category is RuntimeWarning)
+
+ def test_axis_keyword(self):
+ a3 = np.array([[2, 3],
+ [0, 1],
+ [6, 7],
+ [4, 5]])
+ for a in [a3, np.random.randint(0, 100, size=(2, 3, 4))]:
+ orig = a.copy()
+ np.median(a, axis=None)
+ for ax in range(a.ndim):
+ np.median(a, axis=ax)
+ assert_array_equal(a, orig)
+
+ assert_allclose(np.median(a3, axis=0), [3, 4])
+ assert_allclose(np.median(a3.T, axis=1), [3, 4])
+ assert_allclose(np.median(a3), 3.5)
+ assert_allclose(np.median(a3, axis=None), 3.5)
+ assert_allclose(np.median(a3.T), 3.5)
+
+ def test_overwrite_keyword(self):
+ a3 = np.array([[2, 3],
+ [0, 1],
+ [6, 7],
+ [4, 5]])
+ a0 = np.array(1)
+ a1 = np.arange(2)
+ a2 = np.arange(6).reshape(2, 3)
+ assert_allclose(np.median(a0.copy(), overwrite_input=True), 1)
+ assert_allclose(np.median(a1.copy(), overwrite_input=True), 0.5)
+ assert_allclose(np.median(a2.copy(), overwrite_input=True), 2.5)
+ assert_allclose(np.median(a2.copy(), overwrite_input=True, axis=0),
+ [1.5, 2.5, 3.5])
+ assert_allclose(
+ np.median(a2.copy(), overwrite_input=True, axis=1), [1, 4])
+ assert_allclose(
+ np.median(a2.copy(), overwrite_input=True, axis=None), 2.5)
+ assert_allclose(
+ np.median(a3.copy(), overwrite_input=True, axis=0), [3, 4])
+ assert_allclose(np.median(a3.T.copy(), overwrite_input=True, axis=1),
+ [3, 4])
+
+ a4 = np.arange(3 * 4 * 5, dtype=np.float32).reshape((3, 4, 5))
+ np.random.shuffle(a4.ravel())
+ assert_allclose(np.median(a4, axis=None),
+ np.median(a4.copy(), axis=None, overwrite_input=True))
+ assert_allclose(np.median(a4, axis=0),
+ np.median(a4.copy(), axis=0, overwrite_input=True))
+ assert_allclose(np.median(a4, axis=1),
+ np.median(a4.copy(), axis=1, overwrite_input=True))
+ assert_allclose(np.median(a4, axis=2),
+ np.median(a4.copy(), axis=2, overwrite_input=True))
+
+ def test_array_like(self):
+ x = [1, 2, 3]
+ assert_almost_equal(np.median(x), 2)
+ x2 = [x]
+ assert_almost_equal(np.median(x2), 2)
+ assert_allclose(np.median(x2, axis=0), x)
+
+ def test_subclass(self):
+ # gh-3846
+ class MySubClass(np.ndarray):
+
+ def __new__(cls, input_array, info=None):
+ obj = np.asarray(input_array).view(cls)
+ obj.info = info
+ return obj
+
+ def mean(self, axis=None, dtype=None, out=None):
+ return -7
+
+ a = MySubClass([1, 2, 3])
+ assert_equal(np.median(a), -7)
+
+ def test_out(self):
+ o = np.zeros((4,))
+ d = np.ones((3, 4))
+ assert_equal(np.median(d, 0, out=o), o)
+ o = np.zeros((3,))
+ assert_equal(np.median(d, 1, out=o), o)
+ o = np.zeros(())
+ assert_equal(np.median(d, out=o), o)
+
+ def test_out_nan(self):
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ o = np.zeros((4,))
+ d = np.ones((3, 4))
+ d[2, 1] = np.nan
+ assert_equal(np.median(d, 0, out=o), o)
+ o = np.zeros((3,))
+ assert_equal(np.median(d, 1, out=o), o)
+ o = np.zeros(())
+ assert_equal(np.median(d, out=o), o)
+
+ def test_nan_behavior(self):
+ a = np.arange(24, dtype=float)
+ a[2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a), np.nan)
+ assert_equal(np.median(a, axis=0), np.nan)
+ assert_(w[0].category is RuntimeWarning)
+ assert_(w[1].category is RuntimeWarning)
+
+ a = np.arange(24, dtype=float).reshape(2, 3, 4)
+ a[1, 2, 3] = np.nan
+ a[1, 1, 2] = np.nan
+
+ # no axis
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a), np.nan)
+ assert_equal(np.median(a).ndim, 0)
+ assert_(w[0].category is RuntimeWarning)
+
+ # axis0
+ b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), 0)
+ b[2, 3] = np.nan
+ b[1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a, 0), b)
+ assert_equal(len(w), 1)
+
+ # axis1
+ b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), 1)
+ b[1, 3] = np.nan
+ b[1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a, 1), b)
+ assert_equal(len(w), 1)
+
+ # axis02
+ b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), (0, 2))
+ b[1] = np.nan
+ b[2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a, (0, 2)), b)
+ assert_equal(len(w), 1)
+
+ def test_empty(self):
+ # empty arrays
+ a = np.array([], dtype=float)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a), np.nan)
+ assert_(w[0].category is RuntimeWarning)
+
+ # multiple dimensions
+ a = np.array([], dtype=float, ndmin=3)
+ # no axis
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a), np.nan)
+ assert_(w[0].category is RuntimeWarning)
+
+ # axis 0 and 1
+ b = np.array([], dtype=float, ndmin=2)
+ assert_equal(np.median(a, axis=0), b)
+ assert_equal(np.median(a, axis=1), b)
+
+ # axis 2
+ b = np.array(np.nan, dtype=float, ndmin=2)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.median(a, axis=2), b)
+ assert_(w[0].category is RuntimeWarning)
+
+ def test_object(self):
+ o = np.arange(7.)
+ assert_(type(np.median(o.astype(object))) is float)
+ o[2] = np.nan
+ assert_(type(np.median(o.astype(object))) is float)
+
+ def test_extended_axis(self):
+ o = np.random.normal(size=(71, 23))
+ x = np.dstack([o] * 10)
+ assert_equal(np.median(x, axis=(0, 1)), np.median(o))
+ x = np.moveaxis(x, -1, 0)
+ assert_equal(np.median(x, axis=(-2, -1)), np.median(o))
+ x = x.swapaxes(0, 1).copy()
+ assert_equal(np.median(x, axis=(0, -1)), np.median(o))
+
+ assert_equal(np.median(x, axis=(0, 1, 2)), np.median(x, axis=None))
+ assert_equal(np.median(x, axis=(0, )), np.median(x, axis=0))
+ assert_equal(np.median(x, axis=(-1, )), np.median(x, axis=-1))
+
+ d = np.arange(3 * 5 * 7 * 11).reshape((3, 5, 7, 11))
+ np.random.shuffle(d.ravel())
+ assert_equal(np.median(d, axis=(0, 1, 2))[0],
+ np.median(d[:,:,:, 0].flatten()))
+ assert_equal(np.median(d, axis=(0, 1, 3))[1],
+ np.median(d[:,:, 1,:].flatten()))
+ assert_equal(np.median(d, axis=(3, 1, -4))[2],
+ np.median(d[:,:, 2,:].flatten()))
+ assert_equal(np.median(d, axis=(3, 1, 2))[2],
+ np.median(d[2,:,:,:].flatten()))
+ assert_equal(np.median(d, axis=(3, 2))[2, 1],
+ np.median(d[2, 1,:,:].flatten()))
+ assert_equal(np.median(d, axis=(1, -2))[2, 1],
+ np.median(d[2,:,:, 1].flatten()))
+ assert_equal(np.median(d, axis=(1, 3))[2, 2],
+ np.median(d[2,:, 2,:].flatten()))
+
+ def test_extended_axis_invalid(self):
+ d = np.ones((3, 5, 7, 11))
+ assert_raises(np.AxisError, np.median, d, axis=-5)
+ assert_raises(np.AxisError, np.median, d, axis=(0, -5))
+ assert_raises(np.AxisError, np.median, d, axis=4)
+ assert_raises(np.AxisError, np.median, d, axis=(0, 4))
+ assert_raises(ValueError, np.median, d, axis=(1, 1))
+
+ def test_keepdims(self):
+ d = np.ones((3, 5, 7, 11))
+ assert_equal(np.median(d, axis=None, keepdims=True).shape,
+ (1, 1, 1, 1))
+ assert_equal(np.median(d, axis=(0, 1), keepdims=True).shape,
+ (1, 1, 7, 11))
+ assert_equal(np.median(d, axis=(0, 3), keepdims=True).shape,
+ (1, 5, 7, 1))
+ assert_equal(np.median(d, axis=(1,), keepdims=True).shape,
+ (3, 1, 7, 11))
+ assert_equal(np.median(d, axis=(0, 1, 2, 3), keepdims=True).shape,
+ (1, 1, 1, 1))
+ assert_equal(np.median(d, axis=(0, 1, 3), keepdims=True).shape,
+ (1, 1, 7, 1))
+
+
+class TestAdd_newdoc_ufunc(object):
+
+ def test_ufunc_arg(self):
+ assert_raises(TypeError, add_newdoc_ufunc, 2, "blah")
+ assert_raises(ValueError, add_newdoc_ufunc, np.add, "blah")
+
+ def test_string_arg(self):
+ assert_raises(TypeError, add_newdoc_ufunc, np.add, 3)
+
+
+class TestAdd_newdoc(object):
+
+ @pytest.mark.skipif(sys.flags.optimize == 2, reason="Python running -OO")
+ @pytest.mark.xfail(IS_PYPY, reason="PyPy does not modify tp_doc")
+ def test_add_doc(self):
+ # test np.add_newdoc
+ tgt = "Current flat index into the array."
+ assert_equal(np.core.flatiter.index.__doc__[:len(tgt)], tgt)
+ assert_(len(np.core.ufunc.identity.__doc__) > 300)
+ assert_(len(np.lib.index_tricks.mgrid.__doc__) > 300)
+
+class TestSortComplex(object):
+
+ @pytest.mark.parametrize("type_in, type_out", [
+ ('l', 'D'),
+ ('h', 'F'),
+ ('H', 'F'),
+ ('b', 'F'),
+ ('B', 'F'),
+ ('g', 'G'),
+ ])
+ def test_sort_real(self, type_in, type_out):
+ # sort_complex() type casting for real input types
+ a = np.array([5, 3, 6, 2, 1], dtype=type_in)
+ actual = np.sort_complex(a)
+ expected = np.sort(a).astype(type_out)
+ assert_equal(actual, expected)
+ assert_equal(actual.dtype, expected.dtype)
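+ # Informal note on the parametrization above: sort_complex promotes
+ # real input to a complex dtype wide enough to hold it, e.g.
+ # 'l' (long) -> 'D' (complex128), the short/byte types -> 'F'
+ # (complex64), and 'g' (long double) -> 'G' (clongdouble).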
+
+ def test_sort_complex(self):
+ # sort_complex() handling of complex input
+ a = np.array([2 + 3j, 1 - 2j, 1 - 3j, 2 + 1j], dtype='D')
+ expected = np.array([1 - 3j, 1 - 2j, 2 + 1j, 2 + 3j], dtype='D')
+ actual = np.sort_complex(a)
+ assert_equal(actual, expected)
+ assert_equal(actual.dtype, expected.dtype)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_histograms.py b/contrib/python/numpy/py2/numpy/lib/tests/test_histograms.py
new file mode 100644
index 00000000000..594c8e782c5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_histograms.py
@@ -0,0 +1,844 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+
+from numpy.lib.histograms import histogram, histogramdd, histogram_bin_edges
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_almost_equal,
+ assert_array_almost_equal, assert_raises, assert_allclose,
+ assert_array_max_ulp, assert_raises_regex, suppress_warnings,
+ )
+import pytest
+
+
+class TestHistogram(object):
+
+ def setup(self):
+ pass
+
+ def teardown(self):
+ pass
+
+ def test_simple(self):
+ n = 100
+ v = np.random.rand(n)
+ (a, b) = histogram(v)
+ # check if the sum of the bins equals the number of samples
+ assert_equal(np.sum(a, axis=0), n)
+ # check that the bin counts are all equal when the data comes from
+ # an evenly spaced (linear) sequence
+ (a, b) = histogram(np.linspace(0, 10, 100))
+ assert_array_equal(a, 10)
+
+ def test_one_bin(self):
+ # Ticket 632
+ hist, edges = histogram([1, 2, 3, 4], [1, 2])
+ assert_array_equal(hist, [2, ])
+ assert_array_equal(edges, [1, 2])
+ assert_raises(ValueError, histogram, [1, 2], bins=0)
+ h, e = histogram([1, 2], bins=1)
+ assert_equal(h, np.array([2]))
+ assert_allclose(e, np.array([1., 2.]))
+
+ def test_normed(self):
+ sup = suppress_warnings()
+ with sup:
+ rec = sup.record(np.VisibleDeprecationWarning, '.*normed.*')
+ # Check that the integral of the density equals 1.
+ n = 100
+ v = np.random.rand(n)
+ a, b = histogram(v, normed=True)
+ area = np.sum(a * np.diff(b))
+ assert_almost_equal(area, 1)
+ assert_equal(len(rec), 1)
+
+ sup = suppress_warnings()
+ with sup:
+ rec = sup.record(np.VisibleDeprecationWarning, '.*normed.*')
+ # Check with non-constant bin widths (buggy but backwards
+ # compatible)
+ v = np.arange(10)
+ bins = [0, 1, 5, 9, 10]
+ a, b = histogram(v, bins, normed=True)
+ area = np.sum(a * np.diff(b))
+ assert_almost_equal(area, 1)
+ assert_equal(len(rec), 1)
+
+ def test_density(self):
+ # Check that the integral of the density equals 1.
+ n = 100
+ v = np.random.rand(n)
+ a, b = histogram(v, density=True)
+ area = np.sum(a * np.diff(b))
+ assert_almost_equal(area, 1)
+
+ # Check with non-constant bin widths
+ v = np.arange(10)
+ bins = [0, 1, 3, 6, 10]
+ a, b = histogram(v, bins, density=True)
+ assert_array_equal(a, .1)
+ assert_equal(np.sum(a * np.diff(b)), 1)
+
+ # Test that passing False works too
+ a, b = histogram(v, bins, density=False)
+ assert_array_equal(a, [1, 2, 3, 4])
+
+ # Variable bin widths are especially useful to deal with
+ # infinities.
+ v = np.arange(10)
+ bins = [0, 1, 3, 6, np.inf]
+ a, b = histogram(v, bins, density=True)
+ assert_array_equal(a, [.1, .1, .1, 0.])
+
+ # Taken from a bug report from N. Becker on the numpy-discussion
+ # mailing list Aug. 6, 2010.
+ counts, dmy = np.histogram(
+ [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True)
+ assert_equal(counts, [.25, 0])
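+ # Informally, density=True divides each count by
+ # (total count * bin width), so the values integrate to 1; a bin of
+ # infinite width therefore gets density 0, as checked above.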
+
+ def test_outliers(self):
+ # Check that outliers are not tallied
+ a = np.arange(10) + .5
+
+ # Lower outliers
+ h, b = histogram(a, range=[0, 9])
+ assert_equal(h.sum(), 9)
+
+ # Upper outliers
+ h, b = histogram(a, range=[1, 10])
+ assert_equal(h.sum(), 9)
+
+ # Normalization
+ h, b = histogram(a, range=[1, 9], density=True)
+ assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15)
+
+ # Weights
+ w = np.arange(10) + .5
+ h, b = histogram(a, range=[1, 9], weights=w, density=True)
+ assert_equal((h * np.diff(b)).sum(), 1)
+
+ h, b = histogram(a, bins=8, range=[1, 9], weights=w)
+ assert_equal(h, w[1:-1])
+
+ def test_arr_weights_mismatch(self):
+ a = np.arange(10) + .5
+ w = np.arange(11) + .5
+ with assert_raises_regex(ValueError, "same shape as"):
+ h, b = histogram(a, range=[1, 9], weights=w, density=True)
+
+
+ def test_type(self):
+ # Check the type of the returned histogram
+ a = np.arange(10) + .5
+ h, b = histogram(a)
+ assert_(np.issubdtype(h.dtype, np.integer))
+
+ h, b = histogram(a, density=True)
+ assert_(np.issubdtype(h.dtype, np.floating))
+
+ h, b = histogram(a, weights=np.ones(10, int))
+ assert_(np.issubdtype(h.dtype, np.integer))
+
+ h, b = histogram(a, weights=np.ones(10, float))
+ assert_(np.issubdtype(h.dtype, np.floating))
+
+ def test_f32_rounding(self):
+ # gh-4799, check that the rounding of the edges works with float32
+ x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32)
+ y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32)
+ counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100)
+ assert_equal(counts_hist.sum(), 3.)
+
+ def test_bool_conversion(self):
+ # gh-12107
+ # Reference integer histogram
+ a = np.array([1, 1, 0], dtype=np.uint8)
+ int_hist, int_edges = np.histogram(a)
+
+ # Should raise a warning on booleans.
+ # Ensure that the histograms are equivalent; we need to suppress
+ # the warnings to get the actual outputs.
+ with suppress_warnings() as sup:
+ rec = sup.record(RuntimeWarning, 'Converting input from .*')
+ hist, edges = np.histogram([True, True, False])
+ # A warning should be issued
+ assert_equal(len(rec), 1)
+ assert_array_equal(hist, int_hist)
+ assert_array_equal(edges, int_edges)
+
+ def test_weights(self):
+ v = np.random.rand(100)
+ w = np.ones(100) * 5
+ a, b = histogram(v)
+ na, nb = histogram(v, density=True)
+ wa, wb = histogram(v, weights=w)
+ nwa, nwb = histogram(v, weights=w, density=True)
+ assert_array_almost_equal(a * 5, wa)
+ assert_array_almost_equal(na, nwa)
+
+ # Check weights are properly applied.
+ v = np.linspace(0, 10, 10)
+ w = np.concatenate((np.zeros(5), np.ones(5)))
+ wa, wb = histogram(v, bins=np.arange(11), weights=w)
+ assert_array_almost_equal(wa, w)
+
+ # Check with integer weights
+ wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1])
+ assert_array_equal(wa, [4, 5, 0, 1])
+ wa, wb = histogram(
+ [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], density=True)
+ assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4)
+
+ # Check weights with non-uniform bin widths
+ a, b = histogram(
+ np.arange(9), [0, 1, 3, 6, 10],
+ weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True)
+ assert_almost_equal(a, [.2, .1, .1, .075])
+
+ def test_exotic_weights(self):
+
+ # Test the use of weights that are not integers or floats, but e.g.
+ # complex numbers or object types.
+
+ # Complex weights
+ values = np.array([1.3, 2.5, 2.3])
+ weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2])
+
+ # Check with custom bins
+ wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+ assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+ # Check with even bins
+ wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+ assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+ # Decimal weights
+ from decimal import Decimal
+ values = np.array([1.3, 2.5, 2.3])
+ weights = np.array([Decimal(1), Decimal(2), Decimal(3)])
+
+ # Check with custom bins
+ wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+ assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+ # Check with even bins
+ wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+ assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+ def test_no_side_effects(self):
+ # This is a regression test that ensures that values passed to
+ # ``histogram`` are unchanged.
+ values = np.array([1.3, 2.5, 2.3])
+ np.histogram(values, range=[-10, 10], bins=100)
+ assert_array_almost_equal(values, [1.3, 2.5, 2.3])
+
+ def test_empty(self):
+ a, b = histogram([], bins=([0, 1]))
+ assert_array_equal(a, np.array([0]))
+ assert_array_equal(b, np.array([0, 1]))
+
+ def test_error_binnum_type(self):
+ # Tests that the right error is raised if the bins argument is a float
+ vals = np.linspace(0.0, 1.0, num=100)
+ histogram(vals, 5)
+ assert_raises(TypeError, histogram, vals, 2.4)
+
+ def test_finite_range(self):
+ # Normal ranges should be fine
+ vals = np.linspace(0.0, 1.0, num=100)
+ histogram(vals, range=[0.25,0.75])
+ assert_raises(ValueError, histogram, vals, range=[np.nan,0.75])
+ assert_raises(ValueError, histogram, vals, range=[0.25,np.inf])
+
+ def test_invalid_range(self):
+ # start of range must be < end of range
+ vals = np.linspace(0.0, 1.0, num=100)
+ with assert_raises_regex(ValueError, "max must be larger than"):
+ np.histogram(vals, range=[0.1, 0.01])
+
+ def test_bin_edge_cases(self):
+ # Ensure that floating-point computations correctly place edge cases.
+ arr = np.array([337, 404, 739, 806, 1007, 1811, 2012])
+ hist, edges = np.histogram(arr, bins=8296, range=(2, 2280))
+ mask = hist > 0
+ left_edges = edges[:-1][mask]
+ right_edges = edges[1:][mask]
+ for x, left, right in zip(arr, left_edges, right_edges):
+ assert_(x >= left)
+ assert_(x < right)
+
+ def test_last_bin_inclusive_range(self):
+ arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
+ hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5))
+ assert_equal(hist[-1], 1)
+
+ def test_bin_array_dims(self):
+ # gracefully handle a bins object with more than 1 dimension
+ vals = np.linspace(0.0, 1.0, num=100)
+ bins = np.array([[0, 0.5], [0.6, 1.0]])
+ with assert_raises_regex(ValueError, "must be 1d"):
+ np.histogram(vals, bins=bins)
+
+ def test_unsigned_monotonicity_check(self):
+ # Ensures a ValueError is raised if the bins are not monotonically
+ # increasing when they contain unsigned values (see #9222)
+ arr = np.array([2])
+ bins = np.array([1, 3, 1], dtype='uint64')
+ with assert_raises(ValueError):
+ hist, edges = np.histogram(arr, bins=bins)
+
+ def test_object_array_of_0d(self):
+ # gh-7864
+ assert_raises(ValueError,
+ histogram, [np.array(0.4) for i in range(10)] + [-np.inf])
+ assert_raises(ValueError,
+ histogram, [np.array(0.4) for i in range(10)] + [np.inf])
+
+ # these should not crash
+ np.histogram([np.array(0.5) for i in range(10)] + [.500000000000001])
+ np.histogram([np.array(0.5) for i in range(10)] + [.5])
+
+ def test_some_nan_values(self):
+ # gh-7503
+ one_nan = np.array([0, 1, np.nan])
+ all_nan = np.array([np.nan, np.nan])
+
+ # the internal comparisons with NaN give warnings
+ sup = suppress_warnings()
+ sup.filter(RuntimeWarning)
+ with sup:
+ # can't infer range with nan
+ assert_raises(ValueError, histogram, one_nan, bins='auto')
+ assert_raises(ValueError, histogram, all_nan, bins='auto')
+
+ # explicit range solves the problem
+ h, b = histogram(one_nan, bins='auto', range=(0, 1))
+ assert_equal(h.sum(), 2) # nan is not counted
+ h, b = histogram(all_nan, bins='auto', range=(0, 1))
+ assert_equal(h.sum(), 0) # nan is not counted
+
+ # as does an explicit set of bins
+ h, b = histogram(one_nan, bins=[0, 1])
+ assert_equal(h.sum(), 2) # nan is not counted
+ h, b = histogram(all_nan, bins=[0, 1])
+ assert_equal(h.sum(), 0) # nan is not counted
+
+ def test_datetime(self):
+ begin = np.datetime64('2000-01-01', 'D')
+ offsets = np.array([0, 0, 1, 1, 2, 3, 5, 10, 20])
+ bins = np.array([0, 2, 7, 20])
+ dates = begin + offsets
+ date_bins = begin + bins
+
+ td = np.dtype('timedelta64[D]')
+
+ # Results should be the same for integer offsets or datetime values.
+ # For now, only explicit bins are supported, since linspace does not
+ # work on datetimes or timedeltas
+ d_count, d_edge = histogram(dates, bins=date_bins)
+ t_count, t_edge = histogram(offsets.astype(td), bins=bins.astype(td))
+ i_count, i_edge = histogram(offsets, bins=bins)
+
+ assert_equal(d_count, i_count)
+ assert_equal(t_count, i_count)
+
+ assert_equal((d_edge - begin).astype(int), i_edge)
+ assert_equal(t_edge.astype(int), i_edge)
+
+ assert_equal(d_edge.dtype, dates.dtype)
+ assert_equal(t_edge.dtype, td)
+
+ def do_signed_overflow_bounds(self, dtype):
+ exponent = 8 * np.dtype(dtype).itemsize - 1
+ arr = np.array([-2**exponent + 4, 2**exponent - 4], dtype=dtype)
+ hist, e = histogram(arr, bins=2)
+ assert_equal(e, [-2**exponent + 4, 0, 2**exponent - 4])
+ assert_equal(hist, [1, 1])
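+ # Informal note: the span (hi - lo) of these bounds does not fit in
+ # the signed dtype, so a naive in-dtype difference would overflow;
+ # this helper checks the computed edges still come out correct.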
+
+ def test_signed_overflow_bounds(self):
+ self.do_signed_overflow_bounds(np.byte)
+ self.do_signed_overflow_bounds(np.short)
+ self.do_signed_overflow_bounds(np.intc)
+ self.do_signed_overflow_bounds(np.int_)
+ self.do_signed_overflow_bounds(np.longlong)
+
+ def do_precision_lower_bound(self, float_small, float_large):
+ eps = np.finfo(float_large).eps
+
+ arr = np.array([1.0], float_small)
+ range = np.array([1.0 + eps, 2.0], float_large)
+
+ # test is looking for behavior when the bounds change between dtypes
+ if range.astype(float_small)[0] != 1:
+ return
+
+ # previously crashed
+ count, x_loc = np.histogram(arr, bins=1, range=range)
+ assert_equal(count, [1])
+
+ # gh-10322 means that the type comes from arr - this may change
+ assert_equal(x_loc.dtype, float_small)
+
+ def do_precision_upper_bound(self, float_small, float_large):
+ eps = np.finfo(float_large).eps
+
+ arr = np.array([1.0], float_small)
+ range = np.array([0.0, 1.0 - eps], float_large)
+
+ # test is looking for behavior when the bounds change between dtypes
+ if range.astype(float_small)[-1] != 1:
+ return
+
+ # previously crashed
+ count, x_loc = np.histogram(arr, bins=1, range=range)
+ assert_equal(count, [1])
+
+ # gh-10322 means that the type comes from arr - this may change
+ assert_equal(x_loc.dtype, float_small)
+
+ def do_precision(self, float_small, float_large):
+ self.do_precision_lower_bound(float_small, float_large)
+ self.do_precision_upper_bound(float_small, float_large)
+
+ def test_precision(self):
+ # not looping results in a useful stack trace upon failure
+ self.do_precision(np.half, np.single)
+ self.do_precision(np.half, np.double)
+ self.do_precision(np.half, np.longdouble)
+ self.do_precision(np.single, np.double)
+ self.do_precision(np.single, np.longdouble)
+ self.do_precision(np.double, np.longdouble)
+
+ def test_histogram_bin_edges(self):
+ hist, e = histogram([1, 2, 3, 4], [1, 2])
+ edges = histogram_bin_edges([1, 2, 3, 4], [1, 2])
+ assert_array_equal(edges, e)
+
+ arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
+ hist, e = histogram(arr, bins=30, range=(-0.5, 5))
+ edges = histogram_bin_edges(arr, bins=30, range=(-0.5, 5))
+ assert_array_equal(edges, e)
+
+ hist, e = histogram(arr, bins='auto', range=(0, 1))
+ edges = histogram_bin_edges(arr, bins='auto', range=(0, 1))
+ assert_array_equal(edges, e)
+
+
+class TestHistogramOptimBinNums(object):
+ """
+ Provide test coverage for the provided estimators of the optimal number
+ of bins
+ """
+
+ def test_empty(self):
+ estimator_list = ['fd', 'scott', 'rice', 'sturges',
+ 'doane', 'sqrt', 'auto', 'stone']
+ # check it can deal with empty data
+ for estimator in estimator_list:
+ a, b = histogram([], bins=estimator)
+ assert_array_equal(a, np.array([0]))
+ assert_array_equal(b, np.array([0, 1]))
+
+ def test_simple(self):
+ """
+ Straightforward testing with a mixture of linspace data (for
+ consistency). All test values have been precomputed and the values
+ shouldn't change
+ """
+ # Some basic sanity checking, with some fixed data.
+ # Checking for the correct number of bins
+ basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
+ 'doane': 8, 'sqrt': 8, 'auto': 7, 'stone': 2},
+ 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
+ 'doane': 12, 'sqrt': 23, 'auto': 10, 'stone': 9},
+ 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
+ 'doane': 17, 'sqrt': 71, 'auto': 17, 'stone': 20}}
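+ # For orientation, a rough sketch of the count-based rules (not the
+ # exact implementation): 'sturges' ~ ceil(log2(n)) + 1 bins,
+ # 'rice' ~ ceil(2 * n**(1/3)) and 'sqrt' ~ ceil(sqrt(n)); e.g.
+ # n=500 gives log2(500) ~ 8.97 -> 10 Sturges bins and
+ # sqrt(500) ~ 22.4 -> 23 sqrt bins, matching the table above.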
+
+ for testlen, expectedResults in basic_test.items():
+ # Create some sort of non-uniform data to test with
+ # (2 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x = np.concatenate((x1, x2))
+ for estimator, numbins in expectedResults.items():
+ a, b = np.histogram(x, estimator)
+ assert_equal(len(a), numbins, err_msg="For the {0} estimator "
+ "with datasize of {1}".format(estimator, testlen))
+
+ def test_small(self):
+ """
+ Smaller datasets have the potential to cause issues with the data
+ adaptive methods, especially the FD method. All bin numbers have been
+ precalculated.
+ """
+ small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1, 'stone': 1},
+ 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
+ 'doane': 1, 'sqrt': 2, 'stone': 1},
+ 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
+ 'doane': 3, 'sqrt': 2, 'stone': 1}}
+
+ for testlen, expectedResults in small_dat.items():
+ testdat = np.arange(testlen)
+ for estimator, expbins in expectedResults.items():
+ a, b = np.histogram(testdat, estimator)
+ assert_equal(len(a), expbins, err_msg="For the {0} estimator "
+ "with datasize of {1}".format(estimator, testlen))
+
+ def test_incorrect_methods(self):
+ """
+ Check that a ValueError is raised when an unknown string is passed in
+ """
+ check_list = ['mad', 'freeman', 'histograms', 'IQR']
+ for estimator in check_list:
+ assert_raises(ValueError, histogram, [1, 2, 3], estimator)
+
+ def test_novariance(self):
+ """
+ Check that methods handle no variance in data
+ Primarily for Scott and FD as the SD and IQR are both 0 in this case
+ """
+ novar_dataset = np.ones(100)
+ novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1, 'auto': 1, 'stone': 1}
+
+ for estimator, numbins in novar_resultdict.items():
+ a, b = np.histogram(novar_dataset, estimator)
+ assert_equal(len(a), numbins, err_msg="{0} estimator, "
+ "No Variance test".format(estimator))
+
+ def test_limited_variance(self):
+ """
+ Check that when the IQR is 0 but the variance is nonzero, the 'auto'
+ estimator returns the Sturges value and not the FD value.
+ """
+ lim_var_data = np.ones(1000)
+ lim_var_data[:3] = 0
+ lim_var_data[-4:] = 100
+
+ edges_auto = histogram_bin_edges(lim_var_data, 'auto')
+ assert_equal(edges_auto, np.linspace(0, 100, 12))
+
+ edges_fd = histogram_bin_edges(lim_var_data, 'fd')
+ assert_equal(edges_fd, np.array([0, 100]))
+
+ edges_sturges = histogram_bin_edges(lim_var_data, 'sturges')
+ assert_equal(edges_sturges, np.linspace(0, 100, 12))
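+ # Informally: the FD width 2*IQR/n**(1/3) collapses to 0 when the
+ # IQR is 0, so 'auto' falls back to the Sturges-based width, which
+ # is what the edge comparisons above verify.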
+
+ def test_outlier(self):
+ """
+ Check the FD, Scott and Doane estimators with outliers.
+
+ The FD estimates a smaller binwidth since it's less affected by
+ outliers. Since the range is so (artificially) large, this means more
+ bins, most of which will be empty, but the data of interest usually is
+ unaffected. The Scott estimator is more affected and returns fewer bins,
+ despite most of the variance being in one area of the data. The Doane
+ estimator lies somewhere between the other two.
+ """
+ xcenter = np.linspace(-10, 10, 50)
+ outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+ outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11, 'stone': 6}
+
+ for estimator, numbins in outlier_resultdict.items():
+ a, b = np.histogram(outlier_dataset, estimator)
+ assert_equal(len(a), numbins)
+
+ def test_scott_vs_stone(self):
+ """Verify that Scott's rule and Stone's rule converge for normally distributed data"""
+
+ def nbins_ratio(seed, size):
+ rng = np.random.RandomState(seed)
+ x = rng.normal(loc=0, scale=2, size=size)
+ a, b = len(np.histogram(x, 'stone')[0]), len(np.histogram(x, 'scott')[0])
+ return a / (a + b)
+
+ ll = [[nbins_ratio(seed, size) for size in np.geomspace(start=10, stop=100, num=4).round().astype(int)]
+ for seed in range(256)]
+
+ # the average difference between the two methods decreases as the dataset size increases.
+ assert_almost_equal(abs(np.mean(ll, axis=0) - 0.5),
+ [0.1065248,
+ 0.0968844,
+ 0.0331818,
+ 0.0178057],
+ decimal=3)
+
+ def test_simple_range(self):
+ """
+ Straightforward testing with a mixture of linspace data (for
+ consistency). Adding in a 3rd mixture that will then be
+ completely ignored. All test values have been precomputed and
+ they shouldn't change.
+ """
+ # some basic sanity checking, with some fixed data.
+ # Checking for the correct number of bins
+ basic_test = {
+ 50: {'fd': 8, 'scott': 8, 'rice': 15,
+ 'sturges': 14, 'auto': 14, 'stone': 8},
+ 500: {'fd': 15, 'scott': 16, 'rice': 32,
+ 'sturges': 20, 'auto': 20, 'stone': 80},
+ 5000: {'fd': 33, 'scott': 33, 'rice': 69,
+ 'sturges': 27, 'auto': 33, 'stone': 80}
+ }
+
+ for testlen, expectedResults in basic_test.items():
+ # create some sort of non-uniform data to test with
+ # (3 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x3 = np.linspace(-100, -50, testlen)
+ x = np.hstack((x1, x2, x3))
+ for estimator, numbins in expectedResults.items():
+ a, b = np.histogram(x, estimator, range = (-20, 20))
+ msg = "For the {0} estimator".format(estimator)
+ msg += " with datasize of {0}".format(testlen)
+ assert_equal(len(a), numbins, err_msg=msg)
+
+ @pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
+ 'stone', 'rice', 'sturges'])
+ def test_signed_integer_data(self, bins):
+ # Regression test for gh-14379.
+ a = np.array([-2, 0, 127], dtype=np.int8)
+ hist, edges = np.histogram(a, bins=bins)
+ hist32, edges32 = np.histogram(a.astype(np.int32), bins=bins)
+ assert_array_equal(hist, hist32)
+ assert_array_equal(edges, edges32)
+
+ def test_simple_weighted(self):
+ """
+ Check that weighted data raises a TypeError
+ """
+ estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+ for estimator in estimator_list:
+ assert_raises(TypeError, histogram, [1, 2, 3],
+ estimator, weights=[1, 2, 3])
+
+
+class TestHistogramdd(object):
+
+ def test_simple(self):
+ x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5],
+ [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
+ H, edges = histogramdd(x, (2, 3, 3),
+ range=[[-1, 1], [0, 3], [0, 3]])
+ answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]],
+ [[0, 1, 0], [0, 0, 1], [0, 0, 1]]])
+ assert_array_equal(H, answer)
+
+ # Check normalization
+ ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
+ H, edges = histogramdd(x, bins=ed, density=True)
+ assert_(np.all(H == answer / 12.))
+
+ # Check that H has the correct shape.
+ H, edges = histogramdd(x, (2, 3, 4),
+ range=[[-1, 1], [0, 3], [0, 4]],
+ density=True)
+ answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
+ [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
+ assert_array_almost_equal(H, answer / 6., 4)
+ # Check that a sequence of arrays is accepted and H has the correct
+ # shape.
+ z = [np.squeeze(y) for y in np.split(x, 3, axis=1)]
+ H, edges = histogramdd(
+ z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]])
+ answer = np.array([[[0, 0], [0, 0], [0, 0]],
+ [[0, 1], [0, 0], [1, 0]],
+ [[0, 1], [0, 0], [0, 0]],
+ [[0, 0], [0, 0], [0, 0]]])
+ assert_array_equal(H, answer)
+
+ Z = np.zeros((5, 5, 5))
+ Z[list(range(5)), list(range(5)), list(range(5))] = 1.
+ H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5)
+ assert_array_equal(H, Z)
+
+ def test_shape_3d(self):
+ # All possible permutations for bins of different lengths in 3D.
+ bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4),
+ (4, 5, 6))
+ r = np.random.rand(10, 3)
+ for b in bins:
+ H, edges = histogramdd(r, b)
+ assert_(H.shape == b)
+
+ def test_shape_4d(self):
+ # All possible permutations for bins of different lengths in 4D.
+ bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4),
+ (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6),
+ (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7),
+ (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5),
+ (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5),
+ (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4))
+
+ r = np.random.rand(10, 4)
+ for b in bins:
+ H, edges = histogramdd(r, b)
+ assert_(H.shape == b)
+
+ def test_weights(self):
+ v = np.random.rand(100, 2)
+ hist, edges = histogramdd(v)
+ n_hist, edges = histogramdd(v, density=True)
+ w_hist, edges = histogramdd(v, weights=np.ones(100))
+ assert_array_equal(w_hist, hist)
+ w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, density=True)
+ assert_array_equal(w_hist, n_hist)
+ w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
+ assert_array_equal(w_hist, 2 * hist)
+
+ def test_identical_samples(self):
+ x = np.zeros((10, 2), int)
+ hist, edges = histogramdd(x, bins=2)
+ assert_array_equal(edges[0], np.array([-0.5, 0., 0.5]))
+
+ def test_empty(self):
+ a, b = histogramdd([[], []], bins=([0, 1], [0, 1]))
+ assert_array_max_ulp(a, np.array([[0.]]))
+ a, b = np.histogramdd([[], [], []], bins=2)
+ assert_array_max_ulp(a, np.zeros((2, 2, 2)))
+
+ def test_bins_errors(self):
+ # There are two ways to specify bins. Check for the right errors
+ # when mixing those.
+ x = np.arange(8).reshape(2, 4)
+ assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
+ assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
+ assert_raises(
+ ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
+ assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
+
+ def test_inf_edges(self):
+ # Test that using +/-inf bin edges works. See #1788.
+ with np.errstate(invalid='ignore'):
+ x = np.arange(6).reshape(3, 2)
+ expected = np.array([[1, 0], [0, 1], [0, 1]])
+ h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]])
+ assert_allclose(h, expected)
+ h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])])
+ assert_allclose(h, expected)
+ h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]])
+ assert_allclose(h, expected)
+
+ def test_rightmost_binedge(self):
+ # Test an event very close to the rightmost bin edge. See GitHub issue #4266
+ x = [0.9999999995]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0000000001]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 0.0)
+ x = [1.0001]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 0.0)
+
+ def test_finite_range(self):
+ vals = np.random.random((100, 3))
+ histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]])
+ assert_raises(ValueError, histogramdd, vals,
+ range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
+ assert_raises(ValueError, histogramdd, vals,
+ range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
+
+ def test_equal_edges(self):
+ """ Test that adjacent entries in an edge array can be equal """
+ x = np.array([0, 1, 2])
+ y = np.array([0, 1, 2])
+ x_edges = np.array([0, 2, 2])
+ y_edges = 1
+ hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+ hist_expected = np.array([
+ [2.],
+ [1.], # x == 2 falls in the final bin
+ ])
+ assert_equal(hist, hist_expected)
+
+ def test_edge_dtype(self):
+ """ Test that if an edge array is input, its type is preserved """
+ x = np.array([0, 10, 20])
+ y = x / 10
+ x_edges = np.array([0, 5, 15, 20])
+ y_edges = x_edges / 10
+ hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+ assert_equal(edges[0].dtype, x_edges.dtype)
+ assert_equal(edges[1].dtype, y_edges.dtype)
+
+ def test_large_integers(self):
+ big = 2**60 # Too large to represent with a full precision float
+
+ x = np.array([0], np.int64)
+ x_edges = np.array([-1, +1], np.int64)
+ y = big + x
+ y_edges = big + x_edges
+
+ hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+ assert_equal(hist[0, 0], 1)
+
+ def test_density_non_uniform_2d(self):
+ # Defines the following grid:
+ #
+ # 0 2 8
+ # 0+-+-----+
+ # + | +
+ # + | +
+ # 6+-+-----+
+ # 8+-+-----+
+ x_edges = np.array([0, 2, 8])
+ y_edges = np.array([0, 6, 8])
+ relative_areas = np.array([
+ [3, 9],
+ [1, 3]])
+
+ # ensure the number of points in each region is proportional to its area
+ x = np.array([1] + [1]*3 + [7]*3 + [7]*9)
+ y = np.array([7] + [1]*3 + [7]*3 + [1]*9)
+
+ # sanity check that the above worked as intended
+ hist, edges = histogramdd((y, x), bins=(y_edges, x_edges))
+ assert_equal(hist, relative_areas)
+
+ # resulting histogram should be uniform, since counts and areas are proportional
+ hist, edges = histogramdd((y, x), bins=(y_edges, x_edges), density=True)
+ assert_equal(hist, 1 / (8*8))
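+ # Worked check of one cell (illustrative): the region with
+ # x in [0, 2) and y in [0, 6) covers 2 * 6 = 12 units of area and
+ # holds 3 of the 16 points, so its density is 3 / (16 * 12) = 1/64
+ # = 1 / (8*8), as asserted above.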
+
+ def test_density_non_uniform_1d(self):
+ # compare to histogram to show the results are the same
+ v = np.arange(10)
+ bins = np.array([0, 1, 3, 6, 10])
+ hist, edges = histogram(v, bins, density=True)
+ hist_dd, edges_dd = histogramdd((v,), (bins,), density=True)
+ assert_equal(hist, hist_dd)
+ assert_equal(edges, edges_dd[0])
+
+ def test_density_via_normed(self):
+ # normed should simply alias to density argument
+ v = np.arange(10)
+ bins = np.array([0, 1, 3, 6, 10])
+ hist, edges = histogram(v, bins, density=True)
+ hist_dd, edges_dd = histogramdd((v,), (bins,), normed=True)
+ assert_equal(hist, hist_dd)
+ assert_equal(edges, edges_dd[0])
+
+ def test_density_normed_redundancy(self):
+ v = np.arange(10)
+ bins = np.array([0, 1, 3, 6, 10])
+ with assert_raises_regex(TypeError, "Cannot specify both"):
+ hist_dd, edges_dd = histogramdd((v,), (bins,),
+ density=True,
+ normed=True)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_index_tricks.py b/contrib/python/numpy/py2/numpy/lib/tests/test_index_tricks.py
new file mode 100644
index 00000000000..3246f68ff41
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_index_tricks.py
@@ -0,0 +1,454 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_almost_equal,
+ assert_array_almost_equal, assert_raises, assert_raises_regex,
+ assert_warns
+ )
+from numpy.lib.index_tricks import (
+ mgrid, ogrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from,
+ index_exp, ndindex, r_, s_, ix_
+ )
+
+
+class TestRavelUnravelIndex(object):
+ def test_basic(self):
+ assert_equal(np.unravel_index(2, (2, 2)), (1, 0))
+
+ # test backwards compatibility with older dims
+ # keyword argument; see Issue #10586
+ with assert_warns(DeprecationWarning):
+ # we should achieve the correct result
+ # AND raise the appropriate warning
+ # when using older "dims" kw argument
+ assert_equal(np.unravel_index(indices=2,
+ dims=(2, 2)),
+ (1, 0))
+
+ # test that new shape argument works properly
+ assert_equal(np.unravel_index(indices=2,
+ shape=(2, 2)),
+ (1, 0))
+
+ # test that an invalid second keyword argument
+ # is properly handled
+ with assert_raises(TypeError):
+ np.unravel_index(indices=2, hape=(2, 2))
+
+ with assert_raises(TypeError):
+ np.unravel_index(2, hape=(2, 2))
+
+ with assert_raises(TypeError):
+ np.unravel_index(254, ims=(17, 94))
+
+ assert_equal(np.ravel_multi_index((1, 0), (2, 2)), 2)
+ assert_equal(np.unravel_index(254, (17, 94)), (2, 66))
+ assert_equal(np.ravel_multi_index((2, 66), (17, 94)), 254)
+ assert_raises(ValueError, np.unravel_index, -1, (2, 2))
+ assert_raises(TypeError, np.unravel_index, 0.5, (2, 2))
+ assert_raises(ValueError, np.unravel_index, 4, (2, 2))
+ assert_raises(ValueError, np.ravel_multi_index, (-3, 1), (2, 2))
+ assert_raises(ValueError, np.ravel_multi_index, (2, 1), (2, 2))
+ assert_raises(ValueError, np.ravel_multi_index, (0, -3), (2, 2))
+ assert_raises(ValueError, np.ravel_multi_index, (0, 2), (2, 2))
+ assert_raises(TypeError, np.ravel_multi_index, (0.1, 0.), (2, 2))
+
+ assert_equal(np.unravel_index((2*3 + 1)*6 + 4, (4, 3, 6)), [2, 1, 4])
+ assert_equal(
+ np.ravel_multi_index([2, 1, 4], (4, 3, 6)), (2*3 + 1)*6 + 4)
+
+ arr = np.array([[3, 6, 6], [4, 5, 1]])
+ assert_equal(np.ravel_multi_index(arr, (7, 6)), [22, 41, 37])
+ assert_equal(
+ np.ravel_multi_index(arr, (7, 6), order='F'), [31, 41, 13])
+ assert_equal(
+ np.ravel_multi_index(arr, (4, 6), mode='clip'), [22, 23, 19])
+ assert_equal(np.ravel_multi_index(arr, (4, 4), mode=('clip', 'wrap')),
+ [12, 13, 13])
+ assert_equal(np.ravel_multi_index((3, 1, 4, 1), (6, 7, 8, 9)), 1621)
+
+ assert_equal(np.unravel_index(np.array([22, 41, 37]), (7, 6)),
+ [[3, 6, 6], [4, 5, 1]])
+ assert_equal(
+ np.unravel_index(np.array([31, 41, 13]), (7, 6), order='F'),
+ [[3, 6, 6], [4, 5, 1]])
+ assert_equal(np.unravel_index(1621, (6, 7, 8, 9)), [3, 1, 4, 1])
+
+ def test_big_indices(self):
+ # ravel_multi_index for big indices (issue #7546)
+ if np.intp == np.int64:
+ arr = ([1, 29], [3, 5], [3, 117], [19, 2],
+ [2379, 1284], [2, 2], [0, 1])
+ assert_equal(
+ np.ravel_multi_index(arr, (41, 7, 120, 36, 2706, 8, 6)),
+ [5627771580, 117259570957])
+
+ # test overflow checking for too big array (issue #7546)
+ dummy_arr = ([0],[0])
+ half_max = np.iinfo(np.intp).max // 2
+ assert_equal(
+ np.ravel_multi_index(dummy_arr, (half_max, 2)), [0])
+ assert_raises(ValueError,
+ np.ravel_multi_index, dummy_arr, (half_max+1, 2))
+ assert_equal(
+ np.ravel_multi_index(dummy_arr, (half_max, 2), order='F'), [0])
+ assert_raises(ValueError,
+ np.ravel_multi_index, dummy_arr, (half_max+1, 2), order='F')
+
+ def test_dtypes(self):
+ # Test with different data types
+ for dtype in [np.int16, np.uint16, np.int32,
+ np.uint32, np.int64, np.uint64]:
+ coords = np.array(
+ [[1, 0, 1, 2, 3, 4], [1, 6, 1, 3, 2, 0]], dtype=dtype)
+ shape = (5, 8)
+ uncoords = 8*coords[0]+coords[1]
+ assert_equal(np.ravel_multi_index(coords, shape), uncoords)
+ assert_equal(coords, np.unravel_index(uncoords, shape))
+ uncoords = coords[0]+5*coords[1]
+ assert_equal(
+ np.ravel_multi_index(coords, shape, order='F'), uncoords)
+ assert_equal(coords, np.unravel_index(uncoords, shape, order='F'))
+
+ coords = np.array(
+ [[1, 0, 1, 2, 3, 4], [1, 6, 1, 3, 2, 0], [1, 3, 1, 0, 9, 5]],
+ dtype=dtype)
+ shape = (5, 8, 10)
+ uncoords = 10*(8*coords[0]+coords[1])+coords[2]
+ assert_equal(np.ravel_multi_index(coords, shape), uncoords)
+ assert_equal(coords, np.unravel_index(uncoords, shape))
+ uncoords = coords[0]+5*(coords[1]+8*coords[2])
+ assert_equal(
+ np.ravel_multi_index(coords, shape, order='F'), uncoords)
+ assert_equal(coords, np.unravel_index(uncoords, shape, order='F'))
+
+ def test_clipmodes(self):
+ # Test clipmodes
+ assert_equal(
+ np.ravel_multi_index([5, 1, -1, 2], (4, 3, 7, 12), mode='wrap'),
+ np.ravel_multi_index([1, 1, 6, 2], (4, 3, 7, 12)))
+ assert_equal(np.ravel_multi_index([5, 1, -1, 2], (4, 3, 7, 12),
+ mode=(
+ 'wrap', 'raise', 'clip', 'raise')),
+ np.ravel_multi_index([1, 1, 0, 2], (4, 3, 7, 12)))
+ assert_raises(
+ ValueError, np.ravel_multi_index, [5, 1, -1, 2], (4, 3, 7, 12))
+
+ def test_writeability(self):
+ # See gh-7269
+ x, y = np.unravel_index([1, 2, 3], (4, 5))
+ assert_(x.flags.writeable)
+ assert_(y.flags.writeable)
+
+ def test_0d(self):
+ # gh-580
+ x = np.unravel_index(0, ())
+ assert_equal(x, ())
+
+ assert_raises_regex(ValueError, "0d array", np.unravel_index, [0], ())
+ assert_raises_regex(
+ ValueError, "out of bounds", np.unravel_index, [1], ())
+
+
+class TestGrid(object):
+ def test_basic(self):
+ a = mgrid[-1:1:10j]
+ b = mgrid[-1:1:0.1]
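+ # a complex step (10j) is a point count, endpoint included;
+ # a real step (0.1) is a spacing, endpoint excluded (20 points)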
+ assert_(a.shape == (10,))
+ assert_(b.shape == (20,))
+ assert_(a[0] == -1)
+ assert_almost_equal(a[-1], 1)
+ assert_(b[0] == -1)
+ assert_almost_equal(b[1]-b[0], 0.1, 11)
+ assert_almost_equal(b[-1], b[0]+19*0.1, 11)
+ assert_almost_equal(a[1]-a[0], 2.0/9.0, 11)
+
+ def test_linspace_equivalence(self):
+ y, st = np.linspace(2, 10, retstep=1)
+ assert_almost_equal(st, 8/49.0)
+ assert_array_almost_equal(y, mgrid[2:10:50j], 13)
+
+ def test_nd(self):
+ c = mgrid[-1:1:10j, -2:2:10j]
+ d = mgrid[-1:1:0.1, -2:2:0.2]
+ assert_(c.shape == (2, 10, 10))
+ assert_(d.shape == (2, 20, 20))
+ assert_array_equal(c[0][0, :], -np.ones(10, 'd'))
+ assert_array_equal(c[1][:, 0], -2*np.ones(10, 'd'))
+ assert_array_almost_equal(c[0][-1, :], np.ones(10, 'd'), 11)
+ assert_array_almost_equal(c[1][:, -1], 2*np.ones(10, 'd'), 11)
+ assert_array_almost_equal(d[0, 1, :] - d[0, 0, :],
+ 0.1*np.ones(20, 'd'), 11)
+ assert_array_almost_equal(d[1, :, 1] - d[1, :, 0],
+ 0.2*np.ones(20, 'd'), 11)
+
+ def test_sparse(self):
+ grid_full = mgrid[-1:1:10j, -2:2:10j]
+ grid_sparse = ogrid[-1:1:10j, -2:2:10j]
+
+ # sparse grids can be made dense by broadcasting
+ grid_broadcast = np.broadcast_arrays(*grid_sparse)
+ for f, b in zip(grid_full, grid_broadcast):
+ assert_equal(f, b)
+
+ @pytest.mark.parametrize("start, stop, step, expected", [
+ (None, 10, 10j, (200, 10)),
+ (-10, 20, None, (1800, 30)),
+ ])
+ def test_mgrid_size_none_handling(self, start, stop, step, expected):
+ # regression test for None value handling in the
+ # start and step values used by mgrid;
+ # internally, this aims to cover previously
+ # unexplored code paths in nd_grid()
+ grid = mgrid[start:stop:step, start:stop:step]
+ # need a smaller grid to explore one of the
+ # untested code paths
+ grid_small = mgrid[start:stop:step]
+ assert_equal(grid.size, expected[0])
+ assert_equal(grid_small.size, expected[1])
+
+
+class TestConcatenator(object):
+ def test_1d(self):
+ assert_array_equal(r_[1, 2, 3, 4, 5, 6], np.array([1, 2, 3, 4, 5, 6]))
+ b = np.ones(5)
+ c = r_[b, 0, 0, b]
+ assert_array_equal(c, [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])
+
+ def test_mixed_type(self):
+ g = r_[10.1, 1:10]
+ assert_(g.dtype == 'f8')
+
+ def test_more_mixed_type(self):
+ g = r_[-10.1, np.array([1]), np.array([2, 3, 4]), 10.0]
+ assert_(g.dtype == 'f8')
+
+ def test_complex_step(self):
+ # Regression test for #12262
+ g = r_[0:36:100j]
+ assert_(g.shape == (100,))
+
+ def test_2d(self):
+ b = np.random.rand(5, 5)
+ c = np.random.rand(5, 5)
+ d = r_['1', b, c] # append columns
+ assert_(d.shape == (5, 10))
+ assert_array_equal(d[:, :5], b)
+ assert_array_equal(d[:, 5:], c)
+ d = r_[b, c]
+ assert_(d.shape == (10, 5))
+ assert_array_equal(d[:5, :], b)
+ assert_array_equal(d[5:, :], c)
+
+ def test_0d(self):
+ assert_equal(r_[0, np.array(1), 2], [0, 1, 2])
+ assert_equal(r_[[0, 1, 2], np.array(3)], [0, 1, 2, 3])
+ assert_equal(r_[np.array(0), [1, 2, 3]], [0, 1, 2, 3])
+
+
+class TestNdenumerate(object):
+ def test_basic(self):
+ a = np.array([[1, 2], [3, 4]])
+ assert_equal(list(ndenumerate(a)),
+ [((0, 0), 1), ((0, 1), 2), ((1, 0), 3), ((1, 1), 4)])
+
+
+class TestIndexExpression(object):
+ def test_regression_1(self):
+ # ticket #1196
+ a = np.arange(2)
+ assert_equal(a[:-1], a[s_[:-1]])
+ assert_equal(a[:-1], a[index_exp[:-1]])
+
+ def test_simple_1(self):
+ a = np.random.rand(4, 5, 6)
+
+ assert_equal(a[:, :3, [1, 2]], a[index_exp[:, :3, [1, 2]]])
+ assert_equal(a[:, :3, [1, 2]], a[s_[:, :3, [1, 2]]])
+
+
+class TestIx_(object):
+ def test_regression_1(self):
+ # Test empty inputs create outputs of indexing type, gh-5804
+ # Test both lists and arrays
+ for func in (range, np.arange):
+ a, = np.ix_(func(0))
+ assert_equal(a.dtype, np.intp)
+
+ def test_shape_and_dtype(self):
+ sizes = (4, 5, 3, 2)
+ # Test both lists and arrays
+ for func in (range, np.arange):
+ arrays = np.ix_(*[func(sz) for sz in sizes])
+ for k, (a, sz) in enumerate(zip(arrays, sizes)):
+ assert_equal(a.shape[k], sz)
+ assert_(all(sh == 1 for j, sh in enumerate(a.shape) if j != k))
+ assert_(np.issubdtype(a.dtype, np.integer))
+
+ def test_bool(self):
+ bool_a = [True, False, True, True]
+ int_a, = np.nonzero(bool_a)
+ assert_equal(np.ix_(bool_a)[0], int_a)
+
+ def test_1d_only(self):
+ idx2d = [[1, 2, 3], [4, 5, 6]]
+ assert_raises(ValueError, np.ix_, idx2d)
+
+ def test_repeated_input(self):
+ length_of_vector = 5
+ x = np.arange(length_of_vector)
+ out = ix_(x, x)
+ assert_equal(out[0].shape, (length_of_vector, 1))
+ assert_equal(out[1].shape, (1, length_of_vector))
+ # check that input shape is not modified
+ assert_equal(x.shape, (length_of_vector,))
+
+
+def test_c_():
+ a = np.c_[np.array([[1, 2, 3]]), 0, 0, np.array([[4, 5, 6]])]
+ assert_equal(a, [[1, 2, 3, 0, 0, 4, 5, 6]])
+
+
+class TestFillDiagonal(object):
+ def test_basic(self):
+ a = np.zeros((3, 3), int)
+ fill_diagonal(a, 5)
+ assert_array_equal(
+ a, np.array([[5, 0, 0],
+ [0, 5, 0],
+ [0, 0, 5]])
+ )
+
+ def test_tall_matrix(self):
+ a = np.zeros((10, 3), int)
+ fill_diagonal(a, 5)
+ assert_array_equal(
+ a, np.array([[5, 0, 0],
+ [0, 5, 0],
+ [0, 0, 5],
+ [0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0]])
+ )
+
+ def test_tall_matrix_wrap(self):
+ a = np.zeros((10, 3), int)
+ fill_diagonal(a, 5, True)
+ assert_array_equal(
+ a, np.array([[5, 0, 0],
+ [0, 5, 0],
+ [0, 0, 5],
+ [0, 0, 0],
+ [5, 0, 0],
+ [0, 5, 0],
+ [0, 0, 5],
+ [0, 0, 0],
+ [5, 0, 0],
+ [0, 5, 0]])
+ )
+
+ def test_wide_matrix(self):
+ a = np.zeros((3, 10), int)
+ fill_diagonal(a, 5)
+ assert_array_equal(
+ a, np.array([[5, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 5, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 5, 0, 0, 0, 0, 0, 0, 0]])
+ )
+
+ def test_operate_4d_array(self):
+ a = np.zeros((3, 3, 3, 3), int)
+ fill_diagonal(a, 4)
+ i = np.array([0, 1, 2])
+ assert_equal(np.where(a != 0), (i, i, i, i))
+
+ def test_low_dim_handling(self):
+ # raise error with low dimensionality
+ a = np.zeros(3, int)
+ with assert_raises_regex(ValueError, "at least 2-d"):
+ fill_diagonal(a, 5)
+
+ def test_hetero_shape_handling(self):
+ # raise error with high dimensionality and
+ # shape mismatch
+ a = np.zeros((3,3,7,3), int)
+ with assert_raises_regex(ValueError, "equal length"):
+ fill_diagonal(a, 2)
+
+
+def test_diag_indices():
+ di = diag_indices(4)
+ a = np.array([[1, 2, 3, 4],
+ [5, 6, 7, 8],
+ [9, 10, 11, 12],
+ [13, 14, 15, 16]])
+ a[di] = 100
+ assert_array_equal(
+ a, np.array([[100, 2, 3, 4],
+ [5, 100, 7, 8],
+ [9, 10, 100, 12],
+ [13, 14, 15, 100]])
+ )
+
+ # Now, we create indices to manipulate a 3-d array:
+ d3 = diag_indices(2, 3)
+
+ # And use it to set the diagonal of a zeros array to 1:
+ a = np.zeros((2, 2, 2), int)
+ a[d3] = 1
+ assert_array_equal(
+ a, np.array([[[1, 0],
+ [0, 0]],
+ [[0, 0],
+ [0, 1]]])
+ )
+
+
+class TestDiagIndicesFrom(object):
+
+ def test_diag_indices_from(self):
+ x = np.random.random((4, 4))
+ r, c = diag_indices_from(x)
+ assert_array_equal(r, np.arange(4))
+ assert_array_equal(c, np.arange(4))
+
+ def test_error_small_input(self):
+ x = np.ones(7)
+ with assert_raises_regex(ValueError, "at least 2-d"):
+ diag_indices_from(x)
+
+ def test_error_shape_mismatch(self):
+ x = np.zeros((3, 3, 2, 3), int)
+ with assert_raises_regex(ValueError, "equal length"):
+ diag_indices_from(x)
+
+
+def test_ndindex():
+ x = list(ndindex(1, 2, 3))
+ expected = [ix for ix, e in ndenumerate(np.zeros((1, 2, 3)))]
+ assert_array_equal(x, expected)
+
+ x = list(ndindex((1, 2, 3)))
+ assert_array_equal(x, expected)
+
+ # Test use of scalars and tuples
+ x = list(ndindex((3,)))
+ assert_array_equal(x, list(ndindex(3)))
+
+ # Make sure size argument is optional
+ x = list(ndindex())
+ assert_equal(x, [()])
+
+ x = list(ndindex(()))
+ assert_equal(x, [()])
+
+ # Make sure 0-sized ndindex works correctly
+ x = list(ndindex(*[0]))
+ assert_equal(x, [])
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_io.py b/contrib/python/numpy/py2/numpy/lib/tests/test_io.py
new file mode 100644
index 00000000000..899e490312e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_io.py
@@ -0,0 +1,2518 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import gzip
+import os
+import threading
+import time
+import warnings
+import io
+import re
+import pytest
+from tempfile import NamedTemporaryFile
+from io import BytesIO, StringIO
+from datetime import datetime
+import locale
+
+import numpy as np
+import numpy.ma as ma
+from numpy.lib._iotools import ConverterError, ConversionWarning
+from numpy.compat import asbytes, bytes, Path
+from numpy.ma.testutils import assert_equal
+from numpy.testing import (
+ assert_warns, assert_, assert_raises_regex, assert_raises,
+ assert_allclose, assert_array_equal, temppath, tempdir, IS_PYPY,
+ HAS_REFCOUNT, suppress_warnings, assert_no_gc_cycles,
+ )
+
+
+class TextIO(BytesIO):
+ """Helper IO class.
+
+ Writes encode strings to bytes if needed; reads return bytes.
+ This makes it easier to emulate files opened in binary mode
+ without needing to explicitly convert strings to bytes when
+ setting up the test data.
+
+ """
+ def __init__(self, s=""):
+ BytesIO.__init__(self, asbytes(s))
+
+ def write(self, s):
+ BytesIO.write(self, asbytes(s))
+
+ def writelines(self, lines):
+ BytesIO.writelines(self, [asbytes(s) for s in lines])
+
+
+MAJVER, MINVER = sys.version_info[:2]
+IS_64BIT = sys.maxsize > 2**32
+try:
+ import bz2
+ HAS_BZ2 = True
+except ImportError:
+ HAS_BZ2 = False
+try:
+ import lzma
+ HAS_LZMA = True
+except ImportError:
+ HAS_LZMA = False
+
+
+def strptime(s, fmt=None):
+ """
+ Helper replacing datetime.strptime, which is only available in the
+ datetime module from Python >= 2.5.
+
+ """
+ if type(s) == bytes:
+ s = s.decode("latin1")
+ return datetime(*time.strptime(s, fmt)[:3])
+
+
+class RoundtripTest(object):
+ def roundtrip(self, save_func, *args, **kwargs):
+ """
+ save_func : callable
+ Function used to save arrays to file.
+ file_on_disk : bool
+ If true, store the file on disk, instead of in a
+ string buffer.
+ save_kwds : dict
+ Parameters passed to `save_func`.
+ load_kwds : dict
+ Parameters passed to `numpy.load`.
+ args : tuple of arrays
+ Arrays stored to file.
+
+ """
+ save_kwds = kwargs.get('save_kwds', {})
+ load_kwds = kwargs.get('load_kwds', {"allow_pickle": True})
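+ # default to allow_pickle=True so the object-array round-trips
+ # (see test_array_object) can be loaded back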
+ file_on_disk = kwargs.get('file_on_disk', False)
+
+ if file_on_disk:
+ target_file = NamedTemporaryFile(delete=False)
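+ # delete=False: the file is reopened by name via np.load below
+ # and removed manually in the finally block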
+ load_file = target_file.name
+ else:
+ target_file = BytesIO()
+ load_file = target_file
+
+ try:
+ arr = args
+
+ save_func(target_file, *arr, **save_kwds)
+ target_file.flush()
+ target_file.seek(0)
+
+ if sys.platform == 'win32' and not isinstance(target_file, BytesIO):
+ target_file.close()
+
+ arr_reloaded = np.load(load_file, **load_kwds)
+
+ self.arr = arr
+ self.arr_reloaded = arr_reloaded
+ finally:
+ if not isinstance(target_file, BytesIO):
+ target_file.close()
+ # holds an open file descriptor so it can't be deleted on win
+ if 'arr_reloaded' in locals():
+ if not isinstance(arr_reloaded, np.lib.npyio.NpzFile):
+ os.remove(target_file.name)
+
+ def check_roundtrips(self, a):
+ self.roundtrip(a)
+ self.roundtrip(a, file_on_disk=True)
+ self.roundtrip(np.asfortranarray(a))
+ self.roundtrip(np.asfortranarray(a), file_on_disk=True)
+ if a.shape[0] > 1:
+ # neither C nor Fortran contiguous for 2D arrays or more
+ self.roundtrip(np.asfortranarray(a)[1:])
+ self.roundtrip(np.asfortranarray(a)[1:], file_on_disk=True)
+
+ def test_array(self):
+ a = np.array([], float)
+ self.check_roundtrips(a)
+
+ a = np.array([[1, 2], [3, 4]], float)
+ self.check_roundtrips(a)
+
+ a = np.array([[1, 2], [3, 4]], int)
+ self.check_roundtrips(a)
+
+ a = np.array([[1 + 5j, 2 + 6j], [3 + 7j, 4 + 8j]], dtype=np.csingle)
+ self.check_roundtrips(a)
+
+ a = np.array([[1 + 5j, 2 + 6j], [3 + 7j, 4 + 8j]], dtype=np.cdouble)
+ self.check_roundtrips(a)
+
+ def test_array_object(self):
+ a = np.array([], object)
+ self.check_roundtrips(a)
+
+ a = np.array([[1, 2], [3, 4]], object)
+ self.check_roundtrips(a)
+
+ def test_1D(self):
+ a = np.array([1, 2, 3, 4], int)
+ self.roundtrip(a)
+
+ @pytest.mark.skipif(sys.platform == 'win32', reason="Fails on Win32")
+ def test_mmap(self):
+ a = np.array([[1, 2.5], [4, 7.3]])
+ self.roundtrip(a, file_on_disk=True, load_kwds={'mmap_mode': 'r'})
+
+ a = np.asfortranarray([[1, 2.5], [4, 7.3]])
+ self.roundtrip(a, file_on_disk=True, load_kwds={'mmap_mode': 'r'})
+
+ def test_record(self):
+ a = np.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')])
+ self.check_roundtrips(a)
+
+ @pytest.mark.slow
+ def test_format_2_0(self):
+ dt = [(("%d" % i) * 100, float) for i in range(500)]
+ a = np.ones(1000, dtype=dt)
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings('always', '', UserWarning)
+ self.check_roundtrips(a)
+
+
+class TestSaveLoad(RoundtripTest):
+ def roundtrip(self, *args, **kwargs):
+ RoundtripTest.roundtrip(self, np.save, *args, **kwargs)
+ assert_equal(self.arr[0], self.arr_reloaded)
+ assert_equal(self.arr[0].dtype, self.arr_reloaded.dtype)
+ assert_equal(self.arr[0].flags.fnc, self.arr_reloaded.flags.fnc)
+
+
+class TestSavezLoad(RoundtripTest):
+ def roundtrip(self, *args, **kwargs):
+ RoundtripTest.roundtrip(self, np.savez, *args, **kwargs)
+ try:
+ for n, arr in enumerate(self.arr):
+ reloaded = self.arr_reloaded['arr_%d' % n]
+ assert_equal(arr, reloaded)
+ assert_equal(arr.dtype, reloaded.dtype)
+ assert_equal(arr.flags.fnc, reloaded.flags.fnc)
+ finally:
+ # delete tempfile, must be done here on windows
+ if self.arr_reloaded.fid:
+ self.arr_reloaded.fid.close()
+ os.remove(self.arr_reloaded.fid.name)
+
+ @pytest.mark.skipif(not IS_64BIT, reason="Needs 64bit platform")
+ @pytest.mark.slow
+ def test_big_arrays(self):
+ L = (1 << 31) + 100000
+ a = np.empty(L, dtype=np.uint8)
+ with temppath(prefix="numpy_test_big_arrays_", suffix=".npz") as tmp:
+ np.savez(tmp, a=a)
+ del a
+ npfile = np.load(tmp)
+ a = npfile['a'] # Should succeed
+ npfile.close()
+ del a # Avoid pyflakes unused variable warning.
+
+ def test_multiple_arrays(self):
+ a = np.array([[1, 2], [3, 4]], float)
+ b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
+ self.roundtrip(a, b)
+
+ def test_named_arrays(self):
+ a = np.array([[1, 2], [3, 4]], float)
+ b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
+ c = BytesIO()
+ np.savez(c, file_a=a, file_b=b)
+ c.seek(0)
+ l = np.load(c)
+ assert_equal(a, l['file_a'])
+ assert_equal(b, l['file_b'])
+
+ def test_BagObj(self):
+ a = np.array([[1, 2], [3, 4]], float)
+ b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
+ c = BytesIO()
+ np.savez(c, file_a=a, file_b=b)
+ c.seek(0)
+ l = np.load(c)
+ assert_equal(sorted(dir(l.f)), ['file_a','file_b'])
+ assert_equal(a, l.f.file_a)
+ assert_equal(b, l.f.file_b)
+
+ def test_savez_filename_clashes(self):
+ # Test that issue #852 is fixed and that savez works
+ # correctly in a multithreaded environment
+
+ def writer(error_list):
+ with temppath(suffix='.npz') as tmp:
+ arr = np.random.randn(500, 500)
+ try:
+ np.savez(tmp, arr=arr)
+ except OSError as err:
+ error_list.append(err)
+
+ errors = []
+ threads = [threading.Thread(target=writer, args=(errors,))
+ for j in range(3)]
+ for t in threads:
+ t.start()
+ for t in threads:
+ t.join()
+
+ if errors:
+ raise AssertionError(errors)
+
+ def test_not_closing_opened_fid(self):
+ # Test that issue #2178 is fixed:
+ # verify that we can still seek on a file passed to np.load
+ with temppath(suffix='.npz') as tmp:
+ with open(tmp, 'wb') as fp:
+ np.savez(fp, data='LOVELY LOAD')
+ with open(tmp, 'rb', 10000) as fp:
+ fp.seek(0)
+ assert_(not fp.closed)
+ np.load(fp)['data']
+ # fp must not get closed by .load
+ assert_(not fp.closed)
+ fp.seek(0)
+ assert_(not fp.closed)
+
+ #FIXME: Is this still true?
+ @pytest.mark.skipif(IS_PYPY, reason="Missing context manager on PyPy")
+ def test_closing_fid(self):
+ # Test that issue #1517 (too many open files) stays fixed.
+ # It might be a "weak" test since it failed to trigger on
+ # e.g. Debian sid of 2012 Jul 05 but was reported to
+ # trigger the failure on Ubuntu 10.04:
+ # http://projects.scipy.org/numpy/ticket/1517#comment:2
+ with temppath(suffix='.npz') as tmp:
+ np.savez(tmp, data='LOVELY LOAD')
+ # We need to check if the garbage collector can properly close
+ # the numpy npz files returned by np.load when their reference
+ # count goes to zero. Python 3 running in debug mode raises a
+ # ResourceWarning when file closing is left to the garbage
+ # collector, so we catch the warnings. Because ResourceWarning
+ # is unknown in Python < 3.x, we take the easy way out and
+ # catch all warnings.
+ with suppress_warnings() as sup:
+ sup.filter(Warning) # TODO: specify exact message
+ for i in range(1, 1025):
+ try:
+ np.load(tmp)["data"]
+ except Exception as e:
+ msg = "Failed to load data from a file: %s" % e
+ raise AssertionError(msg)
+
+ def test_closing_zipfile_after_load(self):
+ # Check that the zipfile owns the file and can close it. The test
+ # needs to pass a file name to np.load. On Windows, a failure would
+ # cause a second error to be raised when the attempt to remove
+ # the open file is made.
+ prefix = 'numpy_test_closing_zipfile_after_load_'
+ with temppath(suffix='.npz', prefix=prefix) as tmp:
+ np.savez(tmp, lab='place holder')
+ data = np.load(tmp)
+ fp = data.zip.fp
+ data.close()
+ assert_(fp.closed)
+
+
+class TestSaveTxt(object):
+ def test_array(self):
+ a = np.array([[1, 2], [3, 4]], float)
+ fmt = "%.18e"
+ c = BytesIO()
+ np.savetxt(c, a, fmt=fmt)
+ c.seek(0)
+ assert_equal(c.readlines(),
+ [asbytes((fmt + ' ' + fmt + '\n') % (1, 2)),
+ asbytes((fmt + ' ' + fmt + '\n') % (3, 4))])
+
+ a = np.array([[1, 2], [3, 4]], int)
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%d')
+ c.seek(0)
+ assert_equal(c.readlines(), [b'1 2\n', b'3 4\n'])
+
+ def test_1D(self):
+ a = np.array([1, 2, 3, 4], int)
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%d')
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(lines, [b'1\n', b'2\n', b'3\n', b'4\n'])
+
+ def test_0D_3D(self):
+ c = BytesIO()
+ assert_raises(ValueError, np.savetxt, c, np.array(1))
+ assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))
+
+ def test_structured(self):
+ a = np.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')])
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%d')
+ c.seek(0)
+ assert_equal(c.readlines(), [b'1 2\n', b'3 4\n'])
+
+ def test_structured_padded(self):
+ # gh-13297
+ a = np.array([(1, 2, 3),(4, 5, 6)], dtype=[
+ ('foo', 'i4'), ('bar', 'i4'), ('baz', 'i4')
+ ])
+ c = BytesIO()
+ np.savetxt(c, a[['foo', 'baz']], fmt='%d')
+ c.seek(0)
+ assert_equal(c.readlines(), [b'1 3\n', b'4 6\n'])
+
+ @pytest.mark.skipif(Path is None, reason="No pathlib.Path")
+ def test_multifield_view(self):
+ a = np.ones(1, dtype=[('x', 'i4'), ('y', 'i4'), ('z', 'f4')])
+ v = a[['x', 'z']]
+ with temppath(suffix='.npy') as path:
+ path = Path(path)
+ np.save(path, v)
+ data = np.load(path)
+ assert_array_equal(data, v)
+
+ def test_delimiter(self):
+ a = np.array([[1., 2.], [3., 4.]])
+ c = BytesIO()
+ np.savetxt(c, a, delimiter=',', fmt='%d')
+ c.seek(0)
+ assert_equal(c.readlines(), [b'1,2\n', b'3,4\n'])
+
+ def test_format(self):
+ a = np.array([(1, 2), (3, 4)])
+ c = BytesIO()
+ # Sequence of formats
+ np.savetxt(c, a, fmt=['%02d', '%3.1f'])
+ c.seek(0)
+ assert_equal(c.readlines(), [b'01 2.0\n', b'03 4.0\n'])
+
+ # A single multiformat string
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%02d : %3.1f')
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(lines, [b'01 : 2.0\n', b'03 : 4.0\n'])
+
+ # Specify delimiter, should be overridden
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%02d : %3.1f', delimiter=',')
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(lines, [b'01 : 2.0\n', b'03 : 4.0\n'])
+
+ # Bad fmt, should raise a ValueError
+ c = BytesIO()
+ assert_raises(ValueError, np.savetxt, c, a, fmt=99)
+
+ def test_header_footer(self):
+ # Test the functionality of the header and footer keyword argument.
+
+ c = BytesIO()
+ a = np.array([(1, 2), (3, 4)], dtype=int)
+ test_header_footer = 'Test header / footer'
+ # Test the header keyword argument
+ np.savetxt(c, a, fmt='%1d', header=test_header_footer)
+ c.seek(0)
+ assert_equal(c.read(),
+ asbytes('# ' + test_header_footer + '\n1 2\n3 4\n'))
+ # Test the footer keyword argument
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%1d', footer=test_header_footer)
+ c.seek(0)
+ assert_equal(c.read(),
+ asbytes('1 2\n3 4\n# ' + test_header_footer + '\n'))
+ # Test the commentstr keyword argument used on the header
+ c = BytesIO()
+ commentstr = '% '
+ np.savetxt(c, a, fmt='%1d',
+ header=test_header_footer, comments=commentstr)
+ c.seek(0)
+ assert_equal(c.read(),
+ asbytes(commentstr + test_header_footer + '\n' + '1 2\n3 4\n'))
+ # Test the commentstr keyword argument used on the footer
+ c = BytesIO()
+ commentstr = '% '
+ np.savetxt(c, a, fmt='%1d',
+ footer=test_header_footer, comments=commentstr)
+ c.seek(0)
+ assert_equal(c.read(),
+ asbytes('1 2\n3 4\n' + commentstr + test_header_footer + '\n'))
+
+ def test_file_roundtrip(self):
+ with temppath() as name:
+ a = np.array([(1, 2), (3, 4)])
+ np.savetxt(name, a)
+ b = np.loadtxt(name)
+ assert_array_equal(a, b)
+
+ def test_complex_arrays(self):
+ ncols = 2
+ nrows = 2
+ a = np.zeros((ncols, nrows), dtype=np.complex128)
+ re = np.pi
+ im = np.e
+ a[:] = re + 1.0j * im
+
+ # One format only
+ c = BytesIO()
+ np.savetxt(c, a, fmt=' %+.3e')
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(
+ lines,
+ [b' ( +3.142e+00+ +2.718e+00j) ( +3.142e+00+ +2.718e+00j)\n',
+ b' ( +3.142e+00+ +2.718e+00j) ( +3.142e+00+ +2.718e+00j)\n'])
+
+ # One format for each real and imaginary part
+ c = BytesIO()
+ np.savetxt(c, a, fmt=' %+.3e' * 2 * ncols)
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(
+ lines,
+ [b' +3.142e+00 +2.718e+00 +3.142e+00 +2.718e+00\n',
+ b' +3.142e+00 +2.718e+00 +3.142e+00 +2.718e+00\n'])
+
+ # One format for each complex number
+ c = BytesIO()
+ np.savetxt(c, a, fmt=['(%.3e%+.3ej)'] * ncols)
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(
+ lines,
+ [b'(3.142e+00+2.718e+00j) (3.142e+00+2.718e+00j)\n',
+ b'(3.142e+00+2.718e+00j) (3.142e+00+2.718e+00j)\n'])
+
+ def test_complex_negative_exponent(self):
+ # Prior to 1.15, some formats generated x+-yj, gh 7895
+ ncols = 2
+ nrows = 2
+ a = np.zeros((ncols, nrows), dtype=np.complex128)
+ re = np.pi
+ im = np.e
+ a[:] = re - 1.0j * im
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%.3e')
+ c.seek(0)
+ lines = c.readlines()
+ assert_equal(
+ lines,
+ [b' (3.142e+00-2.718e+00j) (3.142e+00-2.718e+00j)\n',
+ b' (3.142e+00-2.718e+00j) (3.142e+00-2.718e+00j)\n'])
+
+ def test_custom_writer(self):
+
+ class CustomWriter(list):
+ def write(self, text):
+ self.extend(text.split(b'\n'))
+
+ w = CustomWriter()
+ a = np.array([(1, 2), (3, 4)])
+ np.savetxt(w, a)
+ b = np.loadtxt(w)
+ assert_array_equal(a, b)
+
+ def test_unicode(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ with tempdir() as tmpdir:
+ # set encoding as on windows it may not be unicode even on py3
+ np.savetxt(os.path.join(tmpdir, 'test.csv'), a, fmt=['%s'],
+ encoding='UTF-8')
+
+ def test_unicode_roundtrip(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ # our gz wrapper supports encoding
+ suffixes = ['', '.gz']
+ # the Python 2 stdlib versions do not support encoding
+ if MAJVER > 2:
+ if HAS_BZ2:
+ suffixes.append('.bz2')
+ if HAS_LZMA:
+ suffixes.extend(['.xz', '.lzma'])
+ with tempdir() as tmpdir:
+ for suffix in suffixes:
+ np.savetxt(os.path.join(tmpdir, 'test.csv' + suffix), a,
+ fmt=['%s'], encoding='UTF-16-LE')
+ b = np.loadtxt(os.path.join(tmpdir, 'test.csv' + suffix),
+ encoding='UTF-16-LE', dtype=np.unicode)
+ assert_array_equal(a, b)
+
+ def test_unicode_bytestream(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ s = BytesIO()
+ np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
+ s.seek(0)
+ assert_equal(s.read().decode('UTF-8'), utf8 + '\n')
+
+ def test_unicode_stringstream(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ s = StringIO()
+ np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
+ s.seek(0)
+ assert_equal(s.read(), utf8 + '\n')
+
+
+class LoadTxtBase(object):
+ def check_compressed(self, fopen, suffixes):
+ # Test that we can load data from a compressed file
+ wanted = np.arange(6).reshape((2, 3))
+ linesep = ('\n', '\r\n', '\r')
+ for sep in linesep:
+ data = '0 1 2' + sep + '3 4 5'
+ for suffix in suffixes:
+ with temppath(suffix=suffix) as name:
+ with fopen(name, mode='wt', encoding='UTF-32-LE') as f:
+ f.write(data)
+ res = self.loadfunc(name, encoding='UTF-32-LE')
+ assert_array_equal(res, wanted)
+ with fopen(name, "rt", encoding='UTF-32-LE') as f:
+ res = self.loadfunc(f)
+ assert_array_equal(res, wanted)
+
+ # Python2 .open does not support encoding
+ @pytest.mark.skipif(MAJVER == 2, reason="Needs Python version >= 3")
+ def test_compressed_gzip(self):
+ self.check_compressed(gzip.open, ('.gz',))
+
+ @pytest.mark.skipif(not HAS_BZ2, reason="Needs bz2")
+ @pytest.mark.skipif(MAJVER == 2, reason="Needs Python version >= 3")
+ def test_compressed_bz2(self):
+ self.check_compressed(bz2.open, ('.bz2',))
+
+ @pytest.mark.skipif(not HAS_LZMA, reason="Needs lzma")
+ @pytest.mark.skipif(MAJVER == 2, reason="Needs Python version >= 3")
+ def test_compressed_lzma(self):
+ self.check_compressed(lzma.open, ('.xz', '.lzma'))
+
+ def test_encoding(self):
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write('0.\n1.\n2.'.encode("UTF-16"))
+ x = self.loadfunc(path, encoding="UTF-16")
+ assert_array_equal(x, [0., 1., 2.])
+
+ def test_stringload(self):
+ # umlaute
+ nonascii = b'\xc3\xb6\xc3\xbc\xc3\xb6'.decode("UTF-8")
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write(nonascii.encode("UTF-16"))
+ x = self.loadfunc(path, encoding="UTF-16", dtype=np.unicode)
+ assert_array_equal(x, nonascii)
+
+ def test_binary_decode(self):
+ utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
+ v = self.loadfunc(BytesIO(utf16), dtype=np.unicode, encoding='UTF-16')
+ assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))
+
+ def test_converters_decode(self):
+ # test converters that decode strings
+ c = TextIO()
+ c.write(b'\xcf\x96')
+ c.seek(0)
+ x = self.loadfunc(c, dtype=np.unicode,
+ converters={0: lambda x: x.decode('UTF-8')})
+ a = np.array([b'\xcf\x96'.decode('UTF-8')])
+ assert_array_equal(x, a)
+
+ def test_converters_nodecode(self):
+ # test native string converters enabled by setting an encoding
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ with temppath() as path:
+ with io.open(path, 'wt', encoding='UTF-8') as f:
+ f.write(utf8)
+ x = self.loadfunc(path, dtype=np.unicode,
+ converters={0: lambda x: x + 't'},
+ encoding='UTF-8')
+ a = np.array([utf8 + 't'])
+ assert_array_equal(x, a)
+
+
+class TestLoadTxt(LoadTxtBase):
+ loadfunc = staticmethod(np.loadtxt)
+
+ def setup(self):
+ # lower chunksize for testing
+ self.orig_chunk = np.lib.npyio._loadtxt_chunksize
+ np.lib.npyio._loadtxt_chunksize = 1
+
+ def teardown(self):
+ np.lib.npyio._loadtxt_chunksize = self.orig_chunk
+
+ def test_record(self):
+ c = TextIO()
+ c.write('1 2\n3 4')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=[('x', np.int32), ('y', np.int32)])
+ a = np.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')])
+ assert_array_equal(x, a)
+
+ d = TextIO()
+ d.write('M 64.0 75.0\nF 25.0 60.0')
+ d.seek(0)
+ mydescriptor = {'names': ('gender', 'age', 'weight'),
+ 'formats': ('S1', 'i4', 'f4')}
+ b = np.array([('M', 64.0, 75.0),
+ ('F', 25.0, 60.0)], dtype=mydescriptor)
+ y = np.loadtxt(d, dtype=mydescriptor)
+ assert_array_equal(y, b)
+
+ def test_array(self):
+ c = TextIO()
+ c.write('1 2\n3 4')
+
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int)
+ a = np.array([[1, 2], [3, 4]], int)
+ assert_array_equal(x, a)
+
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float)
+ a = np.array([[1, 2], [3, 4]], float)
+ assert_array_equal(x, a)
+
+ def test_1D(self):
+ c = TextIO()
+ c.write('1\n2\n3\n4\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int)
+ a = np.array([1, 2, 3, 4], int)
+ assert_array_equal(x, a)
+
+ c = TextIO()
+ c.write('1,2,3,4\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',')
+ a = np.array([1, 2, 3, 4], int)
+ assert_array_equal(x, a)
+
+ def test_missing(self):
+ c = TextIO()
+ c.write('1,2,3,,5\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ converters={3: lambda s: int(s or - 999)})
+ a = np.array([1, 2, 3, -999, 5], int)
+ assert_array_equal(x, a)
+
+ def test_converters_with_usecols(self):
+ c = TextIO()
+ c.write('1,2,3,,5\n6,7,8,9,10\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ converters={3: lambda s: int(s or - 999)},
+ usecols=(1, 3,))
+ a = np.array([[2, -999], [7, 9]], int)
+ assert_array_equal(x, a)
+
+ def test_comments_unicode(self):
+ c = TextIO()
+ c.write('# comment\n1,2,3,5\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ comments=u'#')
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ def test_comments_byte(self):
+ c = TextIO()
+ c.write('# comment\n1,2,3,5\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ comments=b'#')
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ def test_comments_multiple(self):
+ c = TextIO()
+ c.write('# comment\n1,2,3\n@ comment2\n4,5,6 // comment3')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ comments=['#', '@', '//'])
+ a = np.array([[1, 2, 3], [4, 5, 6]], int)
+ assert_array_equal(x, a)
+
+ def test_comments_multi_chars(self):
+ c = TextIO()
+ c.write('/* comment\n1,2,3,5\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ comments='/*')
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ # Check that '/*' is not transformed to ['/', '*']
+ c = TextIO()
+ c.write('*/ comment\n1,2,3,5\n')
+ c.seek(0)
+ assert_raises(ValueError, np.loadtxt, c, dtype=int, delimiter=',',
+ comments='/*')
+
+ def test_skiprows(self):
+ c = TextIO()
+ c.write('comment\n1,2,3,5\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ skiprows=1)
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ c = TextIO()
+ c.write('# comment\n1,2,3,5\n')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ skiprows=1)
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ def test_usecols(self):
+ a = np.array([[1, 2], [3, 4]], float)
+ c = BytesIO()
+ np.savetxt(c, a)
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float, usecols=(1,))
+ assert_array_equal(x, a[:, 1])
+
+ a = np.array([[1, 2, 3], [3, 4, 5]], float)
+ c = BytesIO()
+ np.savetxt(c, a)
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float, usecols=(1, 2))
+ assert_array_equal(x, a[:, 1:])
+
+ # Testing with arrays instead of tuples.
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float, usecols=np.array([1, 2]))
+ assert_array_equal(x, a[:, 1:])
+
+ # Testing with an integer instead of a sequence
+ for int_type in [int, np.int8, np.int16,
+ np.int32, np.int64, np.uint8, np.uint16,
+ np.uint32, np.uint64]:
+ to_read = int_type(1)
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float, usecols=to_read)
+ assert_array_equal(x, a[:, 1])
+
+ # Testing with some crazy custom integer type
+ class CrazyInt(object):
+ def __index__(self):
+ return 1
+
+ crazy_int = CrazyInt()
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float, usecols=crazy_int)
+ assert_array_equal(x, a[:, 1])
+
+ c.seek(0)
+ x = np.loadtxt(c, dtype=float, usecols=(crazy_int,))
+ assert_array_equal(x, a[:, 1])
+
+ # Checking with dtypes defined converters.
+ data = '''JOE 70.1 25.3
+ BOB 60.5 27.9
+ '''
+ c = TextIO(data)
+ names = ['stid', 'temp']
+ dtypes = ['S4', 'f8']
+ arr = np.loadtxt(c, usecols=(0, 2), dtype=list(zip(names, dtypes)))
+ assert_equal(arr['stid'], [b"JOE", b"BOB"])
+ assert_equal(arr['temp'], [25.3, 27.9])
+
+ # Testing non-ints in usecols
+ c.seek(0)
+ bogus_idx = 1.5
+ assert_raises_regex(
+ TypeError,
+ '^usecols must be.*%s' % type(bogus_idx),
+ np.loadtxt, c, usecols=bogus_idx
+ )
+
+ assert_raises_regex(
+ TypeError,
+ '^usecols must be.*%s' % type(bogus_idx),
+ np.loadtxt, c, usecols=[0, bogus_idx, 0]
+ )
+
+ def test_fancy_dtype(self):
+ c = TextIO()
+ c.write('1,2,3.0\n4,5,6.0\n')
+ c.seek(0)
+ dt = np.dtype([('x', int), ('y', [('t', int), ('s', float)])])
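+ # nested dtype: column 0 fills 'x', columns 1 and 2 fill the
+ # ('t', 's') sub-structure of 'y'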
+ x = np.loadtxt(c, dtype=dt, delimiter=',')
+ a = np.array([(1, (2, 3.0)), (4, (5, 6.0))], dt)
+ assert_array_equal(x, a)
+
+ def test_shaped_dtype(self):
+ c = TextIO("aaaa 1.0 8.0 1 2 3 4 5 6")
+ dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
+ ('block', int, (2, 3))])
+ x = np.loadtxt(c, dtype=dt)
+ a = np.array([('aaaa', 1.0, 8.0, [[1, 2, 3], [4, 5, 6]])],
+ dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_3d_shaped_dtype(self):
+ c = TextIO("aaaa 1.0 8.0 1 2 3 4 5 6 7 8 9 10 11 12")
+ dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
+ ('block', int, (2, 2, 3))])
+ x = np.loadtxt(c, dtype=dt)
+ a = np.array([('aaaa', 1.0, 8.0,
+ [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])],
+ dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_str_dtype(self):
+ # see gh-8033
+ c = ["str1", "str2"]
+
+ for dt in (str, np.bytes_):
+ a = np.array(["str1", "str2"], dtype=dt)
+ x = np.loadtxt(c, dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_empty_file(self):
+ with suppress_warnings() as sup:
+ sup.filter(message="loadtxt: Empty input file:")
+ c = TextIO()
+ x = np.loadtxt(c)
+ assert_equal(x.shape, (0,))
+ x = np.loadtxt(c, dtype=np.int64)
+ assert_equal(x.shape, (0,))
+ assert_(x.dtype == np.int64)
+
+ def test_unused_converter(self):
+ c = TextIO()
+ c.writelines(['1 21\n', '3 42\n'])
+ c.seek(0)
+ data = np.loadtxt(c, usecols=(1,),
+ converters={0: lambda s: int(s, 16)})
+ assert_array_equal(data, [21, 42])
+
+ c.seek(0)
+ data = np.loadtxt(c, usecols=(1,),
+ converters={1: lambda s: int(s, 16)})
+ assert_array_equal(data, [33, 66])
+
+ def test_dtype_with_object(self):
+ # Test using an explicit dtype with an object
+ data = """ 1; 2001-01-01
+ 2; 2002-01-31 """
+ ndtype = [('idx', int), ('code', object)]
+ func = lambda s: strptime(s.strip(), "%Y-%m-%d")
+ converters = {1: func}
+ test = np.loadtxt(TextIO(data), delimiter=";", dtype=ndtype,
+ converters=converters)
+ control = np.array(
+ [(1, datetime(2001, 1, 1)), (2, datetime(2002, 1, 31))],
+ dtype=ndtype)
+ assert_equal(test, control)
+
+ def test_uint64_type(self):
+ tgt = (9223372043271415339, 9223372043271415853)
+ c = TextIO()
+ c.write("%s %s" % tgt)
+ c.seek(0)
+ res = np.loadtxt(c, dtype=np.uint64)
+ assert_equal(res, tgt)
+
+ def test_int64_type(self):
+ tgt = (-9223372036854775807, 9223372036854775807)
+ c = TextIO()
+ c.write("%s %s" % tgt)
+ c.seek(0)
+ res = np.loadtxt(c, dtype=np.int64)
+ assert_equal(res, tgt)
+
+ def test_from_float_hex(self):
+ # IEEE doubles and floats only, otherwise the float32
+ # conversion may fail.
+ tgt = np.logspace(-10, 10, 5).astype(np.float32)
+ tgt = np.hstack((tgt, -tgt)).astype(float)
+ inp = '\n'.join(map(float.hex, tgt))
+ c = TextIO()
+ c.write(inp)
+ for dt in [float, np.float32]:
+ c.seek(0)
+ res = np.loadtxt(c, dtype=dt)
+ assert_equal(res, tgt, err_msg="%s" % dt)
+
+ def test_from_complex(self):
+ tgt = (complex(1, 1), complex(1, -1))
+ c = TextIO()
+ c.write("%s %s" % tgt)
+ c.seek(0)
+ res = np.loadtxt(c, dtype=complex)
+ assert_equal(res, tgt)
+
+ def test_complex_misformatted(self):
+ # test for backward compatibility
+ # some complex formats used to generate x+-yj
+ a = np.zeros((2, 2), dtype=np.complex128)
+ re = np.pi
+ im = np.e
+ a[:] = re - 1.0j * im
+ c = BytesIO()
+ np.savetxt(c, a, fmt='%.16e')
+ c.seek(0)
+ txt = c.read()
+ c.seek(0)
+ # misformat the sign on the imaginary part, gh 7895
+ txt_bad = txt.replace(b'e+00-', b'e00+-')
+ assert_(txt_bad != txt)
+ c.write(txt_bad)
+ c.seek(0)
+ res = np.loadtxt(c, dtype=complex)
+ assert_equal(res, a)
+
+ def test_universal_newline(self):
+ with temppath() as name:
+ with open(name, 'w') as f:
+ f.write('1 21\r3 42\r')
+ data = np.loadtxt(name)
+ assert_array_equal(data, [[1, 21], [3, 42]])
+
+ def test_empty_field_after_tab(self):
+ c = TextIO()
+ c.write('1 \t2 \t3\tstart \n4\t5\t6\t \n7\t8\t9.5\t')
+ c.seek(0)
+ dt = {'names': ('x', 'y', 'z', 'comment'),
+ 'formats': ('<i4', '<i4', '<f4', '|S8')}
+ x = np.loadtxt(c, dtype=dt, delimiter='\t')
+ a = np.array([b'start ', b' ', b''])
+ assert_array_equal(x['comment'], a)
+
+ def test_structure_unpack(self):
+ txt = TextIO("M 21 72\nF 35 58")
+ dt = {'names': ('a', 'b', 'c'), 'formats': ('|S1', '<i4', '<f4')}
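+ # unpack=True with a structured dtype returns one array per field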
+ a, b, c = np.loadtxt(txt, dtype=dt, unpack=True)
+ assert_(a.dtype.str == '|S1')
+ assert_(b.dtype.str == '<i4')
+ assert_(c.dtype.str == '<f4')
+ assert_array_equal(a, np.array([b'M', b'F']))
+ assert_array_equal(b, np.array([21, 35]))
+ assert_array_equal(c, np.array([72., 58.]))
+
+ def test_ndmin_keyword(self):
+ c = TextIO()
+ c.write('1,2,3\n4,5,6')
+ c.seek(0)
+ assert_raises(ValueError, np.loadtxt, c, ndmin=3)
+ c.seek(0)
+ assert_raises(ValueError, np.loadtxt, c, ndmin=1.5)
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',', ndmin=1)
+ a = np.array([[1, 2, 3], [4, 5, 6]])
+ assert_array_equal(x, a)
+
+ d = TextIO()
+ d.write('0,1,2')
+ d.seek(0)
+ x = np.loadtxt(d, dtype=int, delimiter=',', ndmin=2)
+ assert_(x.shape == (1, 3))
+ d.seek(0)
+ x = np.loadtxt(d, dtype=int, delimiter=',', ndmin=1)
+ assert_(x.shape == (3,))
+ d.seek(0)
+ x = np.loadtxt(d, dtype=int, delimiter=',', ndmin=0)
+ assert_(x.shape == (3,))
+
+ e = TextIO()
+ e.write('0\n1\n2')
+ e.seek(0)
+ x = np.loadtxt(e, dtype=int, delimiter=',', ndmin=2)
+ assert_(x.shape == (3, 1))
+ e.seek(0)
+ x = np.loadtxt(e, dtype=int, delimiter=',', ndmin=1)
+ assert_(x.shape == (3,))
+ e.seek(0)
+ x = np.loadtxt(e, dtype=int, delimiter=',', ndmin=0)
+ assert_(x.shape == (3,))
+
+ # Test ndmin kw with empty file.
+ with suppress_warnings() as sup:
+ sup.filter(message="loadtxt: Empty input file:")
+ f = TextIO()
+ assert_(np.loadtxt(f, ndmin=2).shape == (0, 1,))
+ assert_(np.loadtxt(f, ndmin=1).shape == (0,))
+
+ def test_generator_source(self):
+ def count():
+ for i in range(10):
+ yield "%d" % i
+
+ res = np.loadtxt(count())
+ assert_array_equal(res, np.arange(10))
+
+ def test_bad_line(self):
+ c = TextIO()
+ c.write('1 2 3\n4 5 6\n2 3')
+ c.seek(0)
+
+ # Check for exception and that exception contains line number
+ assert_raises_regex(ValueError, "3", np.loadtxt, c)
+
+ def test_none_as_string(self):
+ # gh-5155, None should work as string when format demands it
+ c = TextIO()
+ c.write('100,foo,200\n300,None,400')
+ c.seek(0)
+ dt = np.dtype([('x', int), ('a', 'S10'), ('y', int)])
+ np.loadtxt(c, delimiter=',', dtype=dt, comments=None) # Should succeed
+
+ @pytest.mark.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968',
+ reason="Wrong preferred encoding")
+ def test_binary_load(self):
+ butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\
+ b"20,2,3,\xc3\x95scar\n\r"
+ sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines()
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write(butf8)
+ with open(path, "rb") as f:
+ x = np.loadtxt(f, encoding="UTF-8", dtype=np.unicode)
+ assert_array_equal(x, sutf8)
+ # test broken latin1 conversion people now rely on
+ with open(path, "rb") as f:
+ x = np.loadtxt(f, encoding="UTF-8", dtype="S")
+ x = [b'5,6,7,\xc3\x95scarscar', b'15,2,3,hello', b'20,2,3,\xc3\x95scar']
+ assert_array_equal(x, np.array(x, dtype="S"))
+
+ def test_max_rows(self):
+ c = TextIO()
+ c.write('1,2,3,5\n4,5,7,8\n2,1,4,5')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ max_rows=1)
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ def test_max_rows_with_skiprows(self):
+ c = TextIO()
+ c.write('comments\n1,2,3,5\n4,5,7,8\n2,1,4,5')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ skiprows=1, max_rows=1)
+ a = np.array([1, 2, 3, 5], int)
+ assert_array_equal(x, a)
+
+ c = TextIO()
+ c.write('comment\n1,2,3,5\n4,5,7,8\n2,1,4,5')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ skiprows=1, max_rows=2)
+ a = np.array([[1, 2, 3, 5], [4, 5, 7, 8]], int)
+ assert_array_equal(x, a)
+
+ def test_max_rows_with_read_continuation(self):
+ c = TextIO()
+ c.write('1,2,3,5\n4,5,7,8\n2,1,4,5')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ max_rows=2)
+ a = np.array([[1, 2, 3, 5], [4, 5, 7, 8]], int)
+ assert_array_equal(x, a)
+ # test continuation
+ x = np.loadtxt(c, dtype=int, delimiter=',')
+ a = np.array([2,1,4,5], int)
+ assert_array_equal(x, a)
+
+ def test_max_rows_larger(self):
+ # test max_rows > number of rows
+ c = TextIO()
+ c.write('comment\n1,2,3,5\n4,5,7,8\n2,1,4,5')
+ c.seek(0)
+ x = np.loadtxt(c, dtype=int, delimiter=',',
+ skiprows=1, max_rows=6)
+ a = np.array([[1, 2, 3, 5], [4, 5, 7, 8], [2, 1, 4, 5]], int)
+ assert_array_equal(x, a)
+
+class Testfromregex(object):
+ def test_record(self):
+ c = TextIO()
+ c.write('1.312 foo\n1.534 bar\n4.444 qux')
+ c.seek(0)
+
+ dt = [('num', np.float64), ('val', 'S3')]
+ x = np.fromregex(c, r"([0-9.]+)\s+(...)", dt)
+ a = np.array([(1.312, 'foo'), (1.534, 'bar'), (4.444, 'qux')],
+ dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_record_2(self):
+ c = TextIO()
+ c.write('1312 foo\n1534 bar\n4444 qux')
+ c.seek(0)
+
+ dt = [('num', np.int32), ('val', 'S3')]
+ x = np.fromregex(c, r"(\d+)\s+(...)", dt)
+ a = np.array([(1312, 'foo'), (1534, 'bar'), (4444, 'qux')],
+ dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_record_3(self):
+ c = TextIO()
+ c.write('1312 foo\n1534 bar\n4444 qux')
+ c.seek(0)
+
+ dt = [('num', np.float64)]
+ x = np.fromregex(c, r"(\d+)\s+...", dt)
+ a = np.array([(1312,), (1534,), (4444,)], dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_record_unicode(self):
+ utf8 = b'\xcf\x96'
+ with temppath() as path:
+ with open(path, 'wb') as f:
+ f.write(b'1.312 foo' + utf8 + b' \n1.534 bar\n4.444 qux')
+
+ dt = [('num', np.float64), ('val', 'U4')]
+ x = np.fromregex(path, r"(?u)([0-9.]+)\s+(\w+)", dt, encoding='UTF-8')
+ a = np.array([(1.312, 'foo' + utf8.decode('UTF-8')), (1.534, 'bar'),
+ (4.444, 'qux')], dtype=dt)
+ assert_array_equal(x, a)
+
+ regexp = re.compile(r"([0-9.]+)\s+(\w+)", re.UNICODE)
+ x = np.fromregex(path, regexp, dt, encoding='UTF-8')
+ assert_array_equal(x, a)
+
+ def test_compiled_bytes(self):
+ regexp = re.compile(b'(\\d)')
+ c = BytesIO(b'123')
+ dt = [('num', np.float64)]
+ a = np.array([1, 2, 3], dtype=dt)
+ x = np.fromregex(c, regexp, dt)
+ assert_array_equal(x, a)
+
+#####--------------------------------------------------------------------------
+
+
+class TestFromTxt(LoadTxtBase):
+ loadfunc = staticmethod(np.genfromtxt)
+
+ def test_record(self):
+ # Test w/ explicit dtype
+ data = TextIO('1 2\n3 4')
+ test = np.ndfromtxt(data, dtype=[('x', np.int32), ('y', np.int32)])
+ control = np.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')])
+ assert_equal(test, control)
+ #
+ data = TextIO('M 64.0 75.0\nF 25.0 60.0')
+ descriptor = {'names': ('gender', 'age', 'weight'),
+ 'formats': ('S1', 'i4', 'f4')}
+ control = np.array([('M', 64.0, 75.0), ('F', 25.0, 60.0)],
+ dtype=descriptor)
+ test = np.ndfromtxt(data, dtype=descriptor)
+ assert_equal(test, control)
+
+ def test_array(self):
+ # Test outputting a standard ndarray
+ data = TextIO('1 2\n3 4')
+ control = np.array([[1, 2], [3, 4]], dtype=int)
+ test = np.ndfromtxt(data, dtype=int)
+ assert_array_equal(test, control)
+ #
+ data.seek(0)
+ control = np.array([[1, 2], [3, 4]], dtype=float)
+ test = np.loadtxt(data, dtype=float)
+ assert_array_equal(test, control)
+
+ def test_1D(self):
+ # Test squeezing to 1D
+ control = np.array([1, 2, 3, 4], int)
+ #
+ data = TextIO('1\n2\n3\n4\n')
+ test = np.ndfromtxt(data, dtype=int)
+ assert_array_equal(test, control)
+ #
+ data = TextIO('1,2,3,4\n')
+ test = np.ndfromtxt(data, dtype=int, delimiter=',')
+ assert_array_equal(test, control)
+
+ def test_comments(self):
+ # Test the stripping of comments
+ control = np.array([1, 2, 3, 5], int)
+ # Comment on its own line
+ data = TextIO('# comment\n1,2,3,5\n')
+ test = np.ndfromtxt(data, dtype=int, delimiter=',', comments='#')
+ assert_equal(test, control)
+ # Comment at the end of a line
+ data = TextIO('1,2,3,5# comment\n')
+ test = np.ndfromtxt(data, dtype=int, delimiter=',', comments='#')
+ assert_equal(test, control)
+
+ def test_skiprows(self):
+ # Test row skipping
+ control = np.array([1, 2, 3, 5], int)
+ kwargs = dict(dtype=int, delimiter=',')
+ #
+ data = TextIO('comment\n1,2,3,5\n')
+ test = np.ndfromtxt(data, skip_header=1, **kwargs)
+ assert_equal(test, control)
+ #
+ data = TextIO('# comment\n1,2,3,5\n')
+ test = np.loadtxt(data, skiprows=1, **kwargs)
+ assert_equal(test, control)
+
+ def test_skip_footer(self):
+ data = ["# %i" % i for i in range(1, 6)]
+ data.append("A, B, C")
+ data.extend(["%i,%3.1f,%03s" % (i, i, i) for i in range(51)])
+ data[-1] = "99,99"
+ kwargs = dict(delimiter=",", names=True, skip_header=5, skip_footer=10)
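+ # the 5 comment lines are skipped, "A, B, C" supplies the names, and
+ # skip_footer drops the last 10 of the 51 data rows (including the
+ # malformed one), leaving 41 rows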
+ test = np.genfromtxt(TextIO("\n".join(data)), **kwargs)
+ ctrl = np.array([("%f" % i, "%f" % i, "%f" % i) for i in range(41)],
+ dtype=[(_, float) for _ in "ABC"])
+ assert_equal(test, ctrl)
+
+ def test_skip_footer_with_invalid(self):
+ with suppress_warnings() as sup:
+ sup.filter(ConversionWarning)
+ basestr = '1 1\n2 2\n3 3\n4 4\n5 \n6 \n7 \n'
+ # Footer too small to get rid of all invalid values
+ assert_raises(ValueError, np.genfromtxt,
+ TextIO(basestr), skip_footer=1)
+ a = np.genfromtxt(
+ TextIO(basestr), skip_footer=1, invalid_raise=False)
+ assert_equal(a, np.array([[1., 1.], [2., 2.], [3., 3.], [4., 4.]]))
+ #
+ a = np.genfromtxt(TextIO(basestr), skip_footer=3)
+ assert_equal(a, np.array([[1., 1.], [2., 2.], [3., 3.], [4., 4.]]))
+ #
+ basestr = '1 1\n2 \n3 3\n4 4\n5 \n6 6\n7 7\n'
+ a = np.genfromtxt(
+ TextIO(basestr), skip_footer=1, invalid_raise=False)
+ assert_equal(a, np.array([[1., 1.], [3., 3.], [4., 4.], [6., 6.]]))
+ a = np.genfromtxt(
+ TextIO(basestr), skip_footer=3, invalid_raise=False)
+ assert_equal(a, np.array([[1., 1.], [3., 3.], [4., 4.]]))
+
+ def test_header(self):
+ # Test retrieving a header
+ data = TextIO('gender age weight\nM 64.0 75.0\nF 25.0 60.0')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, dtype=None, names=True)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ control = {'gender': np.array([b'M', b'F']),
+ 'age': np.array([64.0, 25.0]),
+ 'weight': np.array([75.0, 60.0])}
+ assert_equal(test['gender'], control['gender'])
+ assert_equal(test['age'], control['age'])
+ assert_equal(test['weight'], control['weight'])
+
+ def test_auto_dtype(self):
+ # Test the automatic definition of the output dtype
+ data = TextIO('A 64 75.0 3+4j True\nBCD 25 60.0 5+6j False')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ control = [np.array([b'A', b'BCD']),
+ np.array([64, 25]),
+ np.array([75.0, 60.0]),
+ np.array([3 + 4j, 5 + 6j]),
+ np.array([True, False]), ]
+ assert_equal(test.dtype.names, ['f0', 'f1', 'f2', 'f3', 'f4'])
+ for (i, ctrl) in enumerate(control):
+ assert_equal(test['f%i' % i], ctrl)
+
+ def test_auto_dtype_uniform(self):
+ # Tests whether the output dtype can be uniformized
+ data = TextIO('1 2 3 4\n5 6 7 8\n')
+ test = np.ndfromtxt(data, dtype=None)
+ control = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+ assert_equal(test, control)
+
+ def test_fancy_dtype(self):
+ # Check that a nested dtype isn't MIA
+ data = TextIO('1,2,3.0\n4,5,6.0\n')
+ fancydtype = np.dtype([('x', int), ('y', [('t', int), ('s', float)])])
+ test = np.ndfromtxt(data, dtype=fancydtype, delimiter=',')
+ control = np.array([(1, (2, 3.0)), (4, (5, 6.0))], dtype=fancydtype)
+ assert_equal(test, control)
+
+ def test_names_overwrite(self):
+ # Test overwriting the names of the dtype
+ descriptor = {'names': ('g', 'a', 'w'),
+ 'formats': ('S1', 'i4', 'f4')}
+ data = TextIO(b'M 64.0 75.0\nF 25.0 60.0')
+ names = ('gender', 'age', 'weight')
+ test = np.ndfromtxt(data, dtype=descriptor, names=names)
+ descriptor['names'] = names
+ control = np.array([('M', 64.0, 75.0),
+ ('F', 25.0, 60.0)], dtype=descriptor)
+ assert_equal(test, control)
+
+ def test_commented_header(self):
+ # Check that names can be retrieved even if the line is commented out.
+ data = TextIO("""
+#gender age weight
+M 21 72.100000
+F 35 58.330000
+M 33 21.99
+ """)
+ # The # is part of the first name and should be deleted automatically.
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(data, names=True, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ ctrl = np.array([('M', 21, 72.1), ('F', 35, 58.33), ('M', 33, 21.99)],
+ dtype=[('gender', '|S1'), ('age', int), ('weight', float)])
+ assert_equal(test, ctrl)
+ # Ditto, but we should get rid of the first element
+ data = TextIO(b"""
+# gender age weight
+M 21 72.100000
+F 35 58.330000
+M 33 21.99
+ """)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(data, names=True, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test, ctrl)
+
+ def test_names_and_comments_none(self):
+ # Tests the case when names is True but comments is None (gh-10780)
+ data = TextIO('col1 col2\n 1 2\n 3 4')
+ test = np.genfromtxt(data, dtype=(int, int), comments=None, names=True)
+ control = np.array([(1, 2), (3, 4)], dtype=[('col1', int), ('col2', int)])
+ assert_equal(test, control)
+
+ def test_autonames_and_usecols(self):
+ # Tests names and usecols
+ data = TextIO('A B C D\n aaaa 121 45 9.1')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, usecols=('A', 'C', 'D'),
+ names=True, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ control = np.array(('aaaa', 45, 9.1),
+ dtype=[('A', '|S4'), ('C', int), ('D', float)])
+ assert_equal(test, control)
+
+ def test_converters_with_usecols(self):
+ # Test the combination user-defined converters and usecol
+ data = TextIO('1,2,3,,5\n6,7,8,9,10\n')
+ test = np.ndfromtxt(data, dtype=int, delimiter=',',
+ converters={3: lambda s: int(s or - 999)},
+ usecols=(1, 3,))
+ control = np.array([[2, -999], [7, 9]], int)
+ assert_equal(test, control)
+
+ def test_converters_with_usecols_and_names(self):
+ # Tests names and usecols
+ data = TextIO('A B C D\n aaaa 121 45 9.1')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True,
+ dtype=None,
+ converters={'C': lambda s: 2 * int(s)})
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ control = np.array(('aaaa', 90, 9.1),
+ dtype=[('A', '|S4'), ('C', int), ('D', float)])
+ assert_equal(test, control)
+
+ def test_converters_cornercases(self):
+ # Test the conversion to datetime.
+ converter = {
+ 'date': lambda s: strptime(s, '%Y-%m-%d %H:%M:%SZ')}
+ data = TextIO('2009-02-03 12:00:00Z, 72214.0')
+ test = np.ndfromtxt(data, delimiter=',', dtype=None,
+ names=['date', 'stid'], converters=converter)
+ control = np.array((datetime(2009, 2, 3), 72214.),
+ dtype=[('date', np.object_), ('stid', float)])
+ assert_equal(test, control)
+
+ def test_converters_cornercases2(self):
+ # Test the conversion to datetime64.
+ converter = {
+ 'date': lambda s: np.datetime64(strptime(s, '%Y-%m-%d %H:%M:%SZ'))}
+ data = TextIO('2009-02-03 12:00:00Z, 72214.0')
+ test = np.ndfromtxt(data, delimiter=',', dtype=None,
+ names=['date', 'stid'], converters=converter)
+ control = np.array((datetime(2009, 2, 3), 72214.),
+ dtype=[('date', 'datetime64[us]'), ('stid', float)])
+ assert_equal(test, control)
+
+ def test_unused_converter(self):
+ # Test whether unused converters are forgotten
+ data = TextIO("1 21\n 3 42\n")
+ test = np.ndfromtxt(data, usecols=(1,),
+ converters={0: lambda s: int(s, 16)})
+ assert_equal(test, [21, 42])
+ #
+ data.seek(0)
+ test = np.ndfromtxt(data, usecols=(1,),
+ converters={1: lambda s: int(s, 16)})
+ assert_equal(test, [33, 66])
+
+ def test_invalid_converter(self):
+ strip_rand = lambda x: float((b'r' in x.lower() and x.split()[-1]) or
+ (b'r' not in x.lower() and x.strip() or 0.0))
+ strip_per = lambda x: float((b'%' in x.lower() and x.split()[0]) or
+ (b'%' not in x.lower() and x.strip() or 0.0))
+ s = TextIO("D01N01,10/1/2003 ,1 %,R 75,400,600\r\n"
+ "L24U05,12/5/2003, 2 %,1,300, 150.5\r\n"
+ "D02N03,10/10/2004,R 1,,7,145.55")
+ kwargs = dict(
+ converters={2: strip_per, 3: strip_rand}, delimiter=",",
+ dtype=None)
+ assert_raises(ConverterError, np.genfromtxt, s, **kwargs)
+
+ def test_tricky_converter_bug1666(self):
+ # Test some corner cases
+ s = TextIO('q1,2\nq3,4')
+ cnv = lambda s: float(s[1:])
+ test = np.genfromtxt(s, delimiter=',', converters={0: cnv})
+ control = np.array([[1., 2.], [3., 4.]])
+ assert_equal(test, control)
+
+ def test_dtype_with_converters(self):
+ dstr = "2009; 23; 46"
+ test = np.ndfromtxt(TextIO(dstr,),
+ delimiter=";", dtype=float, converters={0: bytes})
+ control = np.array([('2009', 23., 46)],
+ dtype=[('f0', '|S4'), ('f1', float), ('f2', float)])
+ assert_equal(test, control)
+ test = np.ndfromtxt(TextIO(dstr,),
+ delimiter=";", dtype=float, converters={0: float})
+ control = np.array([2009., 23., 46],)
+ assert_equal(test, control)
+
+ def test_dtype_with_converters_and_usecols(self):
+ dstr = "1,5,-1,1:1\n2,8,-1,1:n\n3,3,-2,m:n\n"
+ dmap = {'1:1':0, '1:n':1, 'm:1':2, 'm:n':3}
+ dtyp = [('e1','i4'),('e2','i4'),('e3','i2'),('n', 'i1')]
+ conv = {0: int, 1: int, 2: int, 3: lambda r: dmap[r.decode()]}
+ test = np.recfromcsv(TextIO(dstr,), dtype=dtyp, delimiter=',',
+ names=None, converters=conv)
+ control = np.rec.array([(1,5,-1,0), (2,8,-1,1), (3,3,-2,3)], dtype=dtyp)
+ assert_equal(test, control)
+ dtyp = [('e1','i4'),('e2','i4'),('n', 'i1')]
+ test = np.recfromcsv(TextIO(dstr,), dtype=dtyp, delimiter=',',
+ usecols=(0,1,3), names=None, converters=conv)
+ control = np.rec.array([(1,5,0), (2,8,1), (3,3,3)], dtype=dtyp)
+ assert_equal(test, control)
+
+ def test_dtype_with_object(self):
+ # Test using an explicit dtype with an object
+ data = """ 1; 2001-01-01
+ 2; 2002-01-31 """
+ ndtype = [('idx', int), ('code', object)]
+ func = lambda s: strptime(s.strip(), "%Y-%m-%d")
+ converters = {1: func}
+ test = np.genfromtxt(TextIO(data), delimiter=";", dtype=ndtype,
+ converters=converters)
+ control = np.array(
+ [(1, datetime(2001, 1, 1)), (2, datetime(2002, 1, 31))],
+ dtype=ndtype)
+ assert_equal(test, control)
+
+ ndtype = [('nest', [('idx', int), ('code', object)])]
+ with assert_raises_regex(NotImplementedError,
+ 'Nested fields.* not supported.*'):
+ test = np.genfromtxt(TextIO(data), delimiter=";",
+ dtype=ndtype, converters=converters)
+
+ # nested but empty fields also aren't supported
+ ndtype = [('idx', int), ('code', object), ('nest', [])]
+ with assert_raises_regex(NotImplementedError,
+ 'Nested fields.* not supported.*'):
+ test = np.genfromtxt(TextIO(data), delimiter=";",
+ dtype=ndtype, converters=converters)
+
+ def test_userconverters_with_explicit_dtype(self):
+ # Test user_converters w/ explicit (standard) dtype
+ data = TextIO('skip,skip,2001-01-01,1.0,skip')
+ test = np.genfromtxt(data, delimiter=",", names=None, dtype=float,
+ usecols=(2, 3), converters={2: bytes})
+ control = np.array([('2001-01-01', 1.)],
+ dtype=[('', '|S10'), ('', float)])
+ assert_equal(test, control)
+
+ def test_utf8_userconverters_with_explicit_dtype(self):
+ utf8 = b'\xcf\x96'
+ with temppath() as path:
+ with open(path, 'wb') as f:
+ f.write(b'skip,skip,2001-01-01' + utf8 + b',1.0,skip')
+ test = np.genfromtxt(path, delimiter=",", names=None, dtype=float,
+ usecols=(2, 3), converters={2: np.unicode},
+ encoding='UTF-8')
+ control = np.array([('2001-01-01' + utf8.decode('UTF-8'), 1.)],
+ dtype=[('', '|U11'), ('', float)])
+ assert_equal(test, control)
+
+ def test_spacedelimiter(self):
+ # Test space delimiter
+ data = TextIO("1 2 3 4 5\n6 7 8 9 10")
+ test = np.ndfromtxt(data)
+ control = np.array([[1., 2., 3., 4., 5.],
+ [6., 7., 8., 9., 10.]])
+ assert_equal(test, control)
+
+ def test_integer_delimiter(self):
+ # Test using an integer for delimiter
+ data = " 1 2 3\n 4 5 67\n890123 4"
+ test = np.genfromtxt(TextIO(data), delimiter=3)
+ control = np.array([[1, 2, 3], [4, 5, 67], [890, 123, 4]])
+ assert_equal(test, control)
+
+ def test_missing(self):
+ data = TextIO('1,2,3,,5\n')
+ test = np.ndfromtxt(data, dtype=int, delimiter=',',
+ converters={3: lambda s: int(s or - 999)})
+ control = np.array([1, 2, 3, -999, 5], int)
+ assert_equal(test, control)
+
+ def test_missing_with_tabs(self):
+ # Test w/ a delimiter tab
+ txt = "1\t2\t3\n\t2\t\n1\t\t3"
+ test = np.genfromtxt(TextIO(txt), delimiter="\t",
+ usemask=True,)
+ ctrl_d = np.array([(1, 2, 3), (np.nan, 2, np.nan), (1, np.nan, 3)],)
+ ctrl_m = np.array([(0, 0, 0), (1, 0, 1), (0, 1, 0)], dtype=bool)
+ assert_equal(test.data, ctrl_d)
+ assert_equal(test.mask, ctrl_m)
+
+ def test_usecols(self):
+ # Test the selection of columns
+ # Select 1 column
+ control = np.array([[1, 2], [3, 4]], float)
+ data = TextIO()
+ np.savetxt(data, control)
+ data.seek(0)
+ test = np.ndfromtxt(data, dtype=float, usecols=(1,))
+ assert_equal(test, control[:, 1])
+ #
+ control = np.array([[1, 2, 3], [3, 4, 5]], float)
+ data = TextIO()
+ np.savetxt(data, control)
+ data.seek(0)
+ test = np.ndfromtxt(data, dtype=float, usecols=(1, 2))
+ assert_equal(test, control[:, 1:])
+ # Testing with arrays instead of tuples.
+ data.seek(0)
+ test = np.ndfromtxt(data, dtype=float, usecols=np.array([1, 2]))
+ assert_equal(test, control[:, 1:])
+
+ def test_usecols_as_css(self):
+ # Test giving usecols with a comma-separated string
+ data = "1 2 3\n4 5 6"
+ test = np.genfromtxt(TextIO(data),
+ names="a, b, c", usecols="a, c")
+ ctrl = np.array([(1, 3), (4, 6)], dtype=[(_, float) for _ in "ac"])
+ assert_equal(test, ctrl)
+
+ def test_usecols_with_structured_dtype(self):
+ # Test usecols with an explicit structured dtype
+ data = TextIO("JOE 70.1 25.3\nBOB 60.5 27.9")
+ names = ['stid', 'temp']
+ dtypes = ['S4', 'f8']
+ test = np.ndfromtxt(
+ data, usecols=(0, 2), dtype=list(zip(names, dtypes)))
+ assert_equal(test['stid'], [b"JOE", b"BOB"])
+ assert_equal(test['temp'], [25.3, 27.9])
+
+ def test_usecols_with_integer(self):
+ # Test usecols with an integer
+ test = np.genfromtxt(TextIO(b"1 2 3\n4 5 6"), usecols=0)
+ assert_equal(test, np.array([1., 4.]))
+
+ def test_usecols_with_named_columns(self):
+ # Test usecols with named columns
+ ctrl = np.array([(1, 3), (4, 6)], dtype=[('a', float), ('c', float)])
+ data = "1 2 3\n4 5 6"
+ kwargs = dict(names="a, b, c")
+ test = np.genfromtxt(TextIO(data), usecols=(0, -1), **kwargs)
+ assert_equal(test, ctrl)
+ test = np.genfromtxt(TextIO(data),
+ usecols=('a', 'c'), **kwargs)
+ assert_equal(test, ctrl)
+
+ def test_empty_file(self):
+ # Test that an empty file raises the proper warning.
+ with suppress_warnings() as sup:
+ sup.filter(message="genfromtxt: Empty input file:")
+ data = TextIO()
+ test = np.genfromtxt(data)
+ assert_equal(test, np.array([]))
+
+ def test_fancy_dtype_alt(self):
+ # Check that a nested dtype isn't MIA
+ data = TextIO('1,2,3.0\n4,5,6.0\n')
+ fancydtype = np.dtype([('x', int), ('y', [('t', int), ('s', float)])])
+ test = np.mafromtxt(data, dtype=fancydtype, delimiter=',')
+ control = ma.array([(1, (2, 3.0)), (4, (5, 6.0))], dtype=fancydtype)
+ assert_equal(test, control)
+
+ def test_shaped_dtype(self):
+ c = TextIO("aaaa 1.0 8.0 1 2 3 4 5 6")
+ dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
+ ('block', int, (2, 3))])
+ x = np.ndfromtxt(c, dtype=dt)
+ a = np.array([('aaaa', 1.0, 8.0, [[1, 2, 3], [4, 5, 6]])],
+ dtype=dt)
+ assert_array_equal(x, a)
+
+ def test_withmissing(self):
+ data = TextIO('A,B\n0,1\n2,N/A')
+ kwargs = dict(delimiter=",", missing_values="N/A", names=True)
+ test = np.mafromtxt(data, dtype=None, **kwargs)
+ control = ma.array([(0, 1), (2, -1)],
+ mask=[(False, False), (False, True)],
+ dtype=[('A', int), ('B', int)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ #
+ data.seek(0)
+ test = np.mafromtxt(data, **kwargs)
+ control = ma.array([(0, 1), (2, -1)],
+ mask=[(False, False), (False, True)],
+ dtype=[('A', float), ('B', float)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ def test_user_missing_values(self):
+ data = "A, B, C\n0, 0., 0j\n1, N/A, 1j\n-9, 2.2, N/A\n3, -99, 3j"
+ basekwargs = dict(dtype=None, delimiter=",", names=True,)
+ mdtype = [('A', int), ('B', float), ('C', complex)]
+ #
+ test = np.mafromtxt(TextIO(data), missing_values="N/A",
+ **basekwargs)
+ control = ma.array([(0, 0.0, 0j), (1, -999, 1j),
+ (-9, 2.2, -999j), (3, -99, 3j)],
+ mask=[(0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 0, 0)],
+ dtype=mdtype)
+ assert_equal(test, control)
+ #
+ basekwargs['dtype'] = mdtype
+ test = np.mafromtxt(TextIO(data),
+ missing_values={0: -9, 1: -99, 2: -999j}, **basekwargs)
+ control = ma.array([(0, 0.0, 0j), (1, -999, 1j),
+ (-9, 2.2, -999j), (3, -99, 3j)],
+ mask=[(0, 0, 0), (0, 1, 0), (1, 0, 1), (0, 1, 0)],
+ dtype=mdtype)
+ assert_equal(test, control)
+ #
+ test = np.mafromtxt(TextIO(data),
+ missing_values={0: -9, 'B': -99, 'C': -999j},
+ **basekwargs)
+ control = ma.array([(0, 0.0, 0j), (1, -999, 1j),
+ (-9, 2.2, -999j), (3, -99, 3j)],
+ mask=[(0, 0, 0), (0, 1, 0), (1, 0, 1), (0, 1, 0)],
+ dtype=mdtype)
+ assert_equal(test, control)
+
+ def test_user_filling_values(self):
+ # Test with missing and filling values
+ ctrl = np.array([(0, 3), (4, -999)], dtype=[('a', int), ('b', int)])
+ data = "N/A, 2, 3\n4, ,???"
+ kwargs = dict(delimiter=",",
+ dtype=int,
+ names="a,b,c",
+ missing_values={0: "N/A", 'b': " ", 2: "???"},
+ filling_values={0: 0, 'b': 0, 2: -999})
+ test = np.genfromtxt(TextIO(data), **kwargs)
+ ctrl = np.array([(0, 2, 3), (4, 0, -999)],
+ dtype=[(_, int) for _ in "abc"])
+ assert_equal(test, ctrl)
+ #
+ test = np.genfromtxt(TextIO(data), usecols=(0, -1), **kwargs)
+ ctrl = np.array([(0, 3), (4, -999)], dtype=[(_, int) for _ in "ac"])
+ assert_equal(test, ctrl)
+
+ data2 = "1,2,*,4\n5,*,7,8\n"
+ test = np.genfromtxt(TextIO(data2), delimiter=',', dtype=int,
+ missing_values="*", filling_values=0)
+ ctrl = np.array([[1, 2, 0, 4], [5, 0, 7, 8]])
+ assert_equal(test, ctrl)
+ test = np.genfromtxt(TextIO(data2), delimiter=',', dtype=int,
+ missing_values="*", filling_values=-1)
+ ctrl = np.array([[1, 2, -1, 4], [5, -1, 7, 8]])
+ assert_equal(test, ctrl)
+
+ def test_withmissing_float(self):
+ data = TextIO('A,B\n0,1.5\n2,-999.00')
+ test = np.mafromtxt(data, dtype=None, delimiter=',',
+ missing_values='-999.0', names=True,)
+ control = ma.array([(0, 1.5), (2, -1.)],
+ mask=[(False, False), (False, True)],
+ dtype=[('A', int), ('B', float)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ def test_with_masked_column_uniform(self):
+ # Test masked column
+ data = TextIO('1 2 3\n4 5 6\n')
+ test = np.genfromtxt(data, dtype=None,
+ missing_values='2,5', usemask=True)
+ control = ma.array([[1, 2, 3], [4, 5, 6]], mask=[[0, 1, 0], [0, 1, 0]])
+ assert_equal(test, control)
+
+ def test_with_masked_column_various(self):
+ # Test masked column
+ data = TextIO('True 2 3\nFalse 5 6\n')
+ test = np.genfromtxt(data, dtype=None,
+ missing_values='2,5', usemask=True)
+ control = ma.array([(1, 2, 3), (0, 5, 6)],
+ mask=[(0, 1, 0), (0, 1, 0)],
+ dtype=[('f0', bool), ('f1', bool), ('f2', int)])
+ assert_equal(test, control)
+
+ def test_invalid_raise(self):
+ # Test invalid raise
+ data = ["1, 1, 1, 1, 1"] * 50
+ for i in range(5):
+ data[10 * i] = "2, 2, 2, 2 2"
+ data.insert(0, "a, b, c, d, e")
+ mdata = TextIO("\n".join(data))
+ #
+ kwargs = dict(delimiter=",", dtype=None, names=True)
+ # XXX: is there a better way to get the return value of the
+ # callable in assert_warns ?
+ ret = {}
+
+ def f(_ret={}):
+ _ret['mtest'] = np.ndfromtxt(mdata, invalid_raise=False, **kwargs)
+ assert_warns(ConversionWarning, f, _ret=ret)
+ mtest = ret['mtest']
+ assert_equal(len(mtest), 45)
+ assert_equal(mtest, np.ones(45, dtype=[(_, int) for _ in 'abcde']))
+ #
+ mdata.seek(0)
+ assert_raises(ValueError, np.ndfromtxt, mdata,
+ delimiter=",", names=True)
+
+ def test_invalid_raise_with_usecols(self):
+ # Test invalid_raise with usecols
+ data = ["1, 1, 1, 1, 1"] * 50
+ for i in range(5):
+ data[10 * i] = "2, 2, 2, 2 2"
+ data.insert(0, "a, b, c, d, e")
+ mdata = TextIO("\n".join(data))
+ kwargs = dict(delimiter=",", dtype=None, names=True,
+ invalid_raise=False)
+ # XXX: is there a better way to get the return value of the
+ # callable in assert_warns ?
+ ret = {}
+
+ def f(_ret={}):
+ _ret['mtest'] = np.ndfromtxt(mdata, usecols=(0, 4), **kwargs)
+ assert_warns(ConversionWarning, f, _ret=ret)
+ mtest = ret['mtest']
+ assert_equal(len(mtest), 45)
+ assert_equal(mtest, np.ones(45, dtype=[(_, int) for _ in 'ae']))
+ #
+ mdata.seek(0)
+ mtest = np.ndfromtxt(mdata, usecols=(0, 1), **kwargs)
+ assert_equal(len(mtest), 50)
+ control = np.ones(50, dtype=[(_, int) for _ in 'ab'])
+ control[[10 * _ for _ in range(5)]] = (2, 2)
+ assert_equal(mtest, control)
+
+ def test_inconsistent_dtype(self):
+ # Test inconsistent dtype
+ data = ["1, 1, 1, 1, -1.1"] * 50
+ mdata = TextIO("\n".join(data))
+
+ converters = {4: lambda x: "(%s)" % x}
+ kwargs = dict(delimiter=",", converters=converters,
+ dtype=[(_, int) for _ in 'abcde'],)
+ assert_raises(ValueError, np.genfromtxt, mdata, **kwargs)
+
+ def test_default_field_format(self):
+ # Test default format
+ data = "0, 1, 2.3\n4, 5, 6.7"
+ mtest = np.ndfromtxt(TextIO(data),
+ delimiter=",", dtype=None, defaultfmt="f%02i")
+ ctrl = np.array([(0, 1, 2.3), (4, 5, 6.7)],
+ dtype=[("f00", int), ("f01", int), ("f02", float)])
+ assert_equal(mtest, ctrl)
+
+ def test_single_dtype_wo_names(self):
+ # Test single dtype w/o names
+ data = "0, 1, 2.3\n4, 5, 6.7"
+ mtest = np.ndfromtxt(TextIO(data),
+ delimiter=",", dtype=float, defaultfmt="f%02i")
+ ctrl = np.array([[0., 1., 2.3], [4., 5., 6.7]], dtype=float)
+ assert_equal(mtest, ctrl)
+
+ def test_single_dtype_w_explicit_names(self):
+ # Test single dtype w explicit names
+ data = "0, 1, 2.3\n4, 5, 6.7"
+ mtest = np.ndfromtxt(TextIO(data),
+ delimiter=",", dtype=float, names="a, b, c")
+ ctrl = np.array([(0., 1., 2.3), (4., 5., 6.7)],
+ dtype=[(_, float) for _ in "abc"])
+ assert_equal(mtest, ctrl)
+
+ def test_single_dtype_w_implicit_names(self):
+ # Test single dtype w implicit names
+ data = "a, b, c\n0, 1, 2.3\n4, 5, 6.7"
+ mtest = np.ndfromtxt(TextIO(data),
+ delimiter=",", dtype=float, names=True)
+ ctrl = np.array([(0., 1., 2.3), (4., 5., 6.7)],
+ dtype=[(_, float) for _ in "abc"])
+ assert_equal(mtest, ctrl)
+
+ def test_easy_structured_dtype(self):
+ # Test easy structured dtype
+ data = "0, 1, 2.3\n4, 5, 6.7"
+ mtest = np.ndfromtxt(TextIO(data), delimiter=",",
+ dtype=(int, float, float), defaultfmt="f_%02i")
+ ctrl = np.array([(0, 1., 2.3), (4, 5., 6.7)],
+ dtype=[("f_00", int), ("f_01", float), ("f_02", float)])
+ assert_equal(mtest, ctrl)
+
+ def test_autostrip(self):
+ # Test autostrip
+ data = "01/01/2003 , 1.3, abcde"
+ kwargs = dict(delimiter=",", dtype=None)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ mtest = np.ndfromtxt(TextIO(data), **kwargs)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ ctrl = np.array([('01/01/2003 ', 1.3, ' abcde')],
+ dtype=[('f0', '|S12'), ('f1', float), ('f2', '|S8')])
+ assert_equal(mtest, ctrl)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ ctrl = np.array([('01/01/2003', 1.3, 'abcde')],
+ dtype=[('f0', '|S10'), ('f1', float), ('f2', '|S5')])
+ assert_equal(mtest, ctrl)
+
+ def test_replace_space(self):
+ # Test the 'replace_space' option
+ txt = "A.A, B (B), C:C\n1, 2, 3.14"
+ # Test default: replace ' ' by '_' and delete non-alphanum chars
+ test = np.genfromtxt(TextIO(txt),
+ delimiter=",", names=True, dtype=None)
+ ctrl_dtype = [("AA", int), ("B_B", int), ("CC", float)]
+ ctrl = np.array((1, 2, 3.14), dtype=ctrl_dtype)
+ assert_equal(test, ctrl)
+ # Test: no replace, no delete
+ test = np.genfromtxt(TextIO(txt),
+ delimiter=",", names=True, dtype=None,
+ replace_space='', deletechars='')
+ ctrl_dtype = [("A.A", int), ("B (B)", int), ("C:C", float)]
+ ctrl = np.array((1, 2, 3.14), dtype=ctrl_dtype)
+ assert_equal(test, ctrl)
+ # Test: no delete (spaces are replaced by _)
+ test = np.genfromtxt(TextIO(txt),
+ delimiter=",", names=True, dtype=None,
+ deletechars='')
+ ctrl_dtype = [("A.A", int), ("B_(B)", int), ("C:C", float)]
+ ctrl = np.array((1, 2, 3.14), dtype=ctrl_dtype)
+ assert_equal(test, ctrl)
+
+ def test_replace_space_known_dtype(self):
+ # Test the 'replace_space' (and related) options when dtype != None
+ txt = "A.A, B (B), C:C\n1, 2, 3"
+ # Test default: replace ' ' by '_' and delete non-alphanum chars
+ test = np.genfromtxt(TextIO(txt),
+ delimiter=",", names=True, dtype=int)
+ ctrl_dtype = [("AA", int), ("B_B", int), ("CC", int)]
+ ctrl = np.array((1, 2, 3), dtype=ctrl_dtype)
+ assert_equal(test, ctrl)
+ # Test: no replace, no delete
+ test = np.genfromtxt(TextIO(txt),
+ delimiter=",", names=True, dtype=int,
+ replace_space='', deletechars='')
+ ctrl_dtype = [("A.A", int), ("B (B)", int), ("C:C", int)]
+ ctrl = np.array((1, 2, 3), dtype=ctrl_dtype)
+ assert_equal(test, ctrl)
+ # Test: no delete (spaces are replaced by _)
+ test = np.genfromtxt(TextIO(txt),
+ delimiter=",", names=True, dtype=int,
+ deletechars='')
+ ctrl_dtype = [("A.A", int), ("B_(B)", int), ("C:C", int)]
+ ctrl = np.array((1, 2, 3), dtype=ctrl_dtype)
+ assert_equal(test, ctrl)
+
+ def test_incomplete_names(self):
+ # Test w/ incomplete names
+ data = "A,,C\n0,1,2\n3,4,5"
+ kwargs = dict(delimiter=",", names=True)
+ # w/ dtype=None
+ ctrl = np.array([(0, 1, 2), (3, 4, 5)],
+ dtype=[(_, int) for _ in ('A', 'f0', 'C')])
+ test = np.ndfromtxt(TextIO(data), dtype=None, **kwargs)
+ assert_equal(test, ctrl)
+ # w/ default dtype
+ ctrl = np.array([(0, 1, 2), (3, 4, 5)],
+ dtype=[(_, float) for _ in ('A', 'f0', 'C')])
+ test = np.ndfromtxt(TextIO(data), **kwargs)
+ assert_equal(test, ctrl)
+
+ def test_names_auto_completion(self):
+ # Make sure that names are properly completed
+ data = "1 2 3\n 4 5 6"
+ test = np.genfromtxt(TextIO(data),
+ dtype=(int, float, int), names="a")
+ ctrl = np.array([(1, 2, 3), (4, 5, 6)],
+ dtype=[('a', int), ('f0', float), ('f1', int)])
+ assert_equal(test, ctrl)
+
+ def test_names_with_usecols_bug1636(self):
+ # Make sure we pick up the right names w/ usecols
+ data = "A,B,C,D,E\n0,1,2,3,4\n0,1,2,3,4\n0,1,2,3,4"
+ ctrl_names = ("A", "C", "E")
+ test = np.genfromtxt(TextIO(data),
+ dtype=(int, int, int), delimiter=",",
+ usecols=(0, 2, 4), names=True)
+ assert_equal(test.dtype.names, ctrl_names)
+ #
+ test = np.genfromtxt(TextIO(data),
+ dtype=(int, int, int), delimiter=",",
+ usecols=("A", "C", "E"), names=True)
+ assert_equal(test.dtype.names, ctrl_names)
+ #
+ test = np.genfromtxt(TextIO(data),
+ dtype=int, delimiter=",",
+ usecols=("A", "C", "E"), names=True)
+ assert_equal(test.dtype.names, ctrl_names)
+
+ def test_fixed_width_names(self):
+ # Test fix-width w/ names
+ data = " A B C\n 0 1 2.3\n 45 67 9."
+ kwargs = dict(delimiter=(5, 5, 4), names=True, dtype=None)
+ ctrl = np.array([(0, 1, 2.3), (45, 67, 9.)],
+ dtype=[('A', int), ('B', int), ('C', float)])
+ test = np.ndfromtxt(TextIO(data), **kwargs)
+ assert_equal(test, ctrl)
+ #
+ kwargs = dict(delimiter=5, names=True, dtype=None)
+ ctrl = np.array([(0, 1, 2.3), (45, 67, 9.)],
+ dtype=[('A', int), ('B', int), ('C', float)])
+ test = np.ndfromtxt(TextIO(data), **kwargs)
+ assert_equal(test, ctrl)
+
+ def test_filling_values(self):
+ # Test missing values
+ data = b"1, 2, 3\n1, , 5\n0, 6, \n"
+ kwargs = dict(delimiter=",", dtype=None, filling_values=-999)
+ ctrl = np.array([[1, 2, 3], [1, -999, 5], [0, 6, -999]], dtype=int)
+ test = np.ndfromtxt(TextIO(data), **kwargs)
+ assert_equal(test, ctrl)
+
+ def test_comments_is_none(self):
+ # Github issue 329 (None was previously being converted to 'None').
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test[1], b'testNonetherestofthedata')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test[1], b' testNonetherestofthedata')
+
+ def test_latin1(self):
+ latin1 = b'\xf6\xfc\xf6'
+ norm = b"norm1,norm2,norm3\n"
+ enc = b"test1,testNonethe" + latin1 + b",test3\n"
+ s = norm + enc + norm
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO(s),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test[1, 0], b"test1")
+ assert_equal(test[1, 1], b"testNonethe" + latin1)
+ assert_equal(test[1, 2], b"test3")
+ test = np.genfromtxt(TextIO(s),
+ dtype=None, comments=None, delimiter=',',
+ encoding='latin1')
+ assert_equal(test[1, 0], u"test1")
+ assert_equal(test[1, 1], u"testNonethe" + latin1.decode('latin1'))
+ assert_equal(test[1, 2], u"test3")
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO(b"0,testNonethe" + latin1),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test['f0'], 0)
+ assert_equal(test['f1'], b"testNonethe" + latin1)
+
+ def test_binary_decode_autodtype(self):
+ utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
+ v = self.loadfunc(BytesIO(utf16), dtype=None, encoding='UTF-16')
+ assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))
+
+ def test_utf8_byte_encoding(self):
+ utf8 = b"\xcf\x96"
+ norm = b"norm1,norm2,norm3\n"
+ enc = b"test1,testNonethe" + utf8 + b",test3\n"
+ s = norm + enc + norm
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO(s),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ ctl = np.array([
+ [b'norm1', b'norm2', b'norm3'],
+ [b'test1', b'testNonethe' + utf8, b'test3'],
+ [b'norm1', b'norm2', b'norm3']])
+ assert_array_equal(test, ctl)
+
+ def test_utf8_file(self):
+ utf8 = b"\xcf\x96"
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write((b"test1,testNonethe" + utf8 + b",test3\n") * 2)
+ test = np.genfromtxt(path, dtype=None, comments=None,
+ delimiter=',', encoding="UTF-8")
+ ctl = np.array([
+ ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"],
+ ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]],
+ dtype=np.unicode)
+ assert_array_equal(test, ctl)
+
+ # test a mixed dtype
+ with open(path, "wb") as f:
+ f.write(b"0,testNonethe" + utf8)
+ test = np.genfromtxt(path, dtype=None, comments=None,
+ delimiter=',', encoding="UTF-8")
+ assert_equal(test['f0'], 0)
+ assert_equal(test['f1'], "testNonethe" + utf8.decode("UTF-8"))
+
+ def test_utf8_file_nodtype_unicode(self):
+ # bytes encoding with non-latin1 -> unicode upcast
+ utf8 = u'\u03d6'
+ latin1 = u'\xf6\xfc\xf6'
+
+ # skip test if cannot encode utf8 test string with preferred
+ # encoding. The preferred encoding is assumed to be the default
+ # encoding of io.open. Will need to change this for PyTest, maybe
+ # using pytest.mark.xfail(raises=***).
+ try:
+ encoding = locale.getpreferredencoding()
+ utf8.encode(encoding)
+ except (UnicodeError, ImportError):
+ pytest.skip('Skipping test_utf8_file_nodtype_unicode, '
+ 'unable to encode utf8 in preferred encoding')
+
+ with temppath() as path:
+ with io.open(path, "wt") as f:
+ f.write(u"norm1,norm2,norm3\n")
+ f.write(u"norm1," + latin1 + u",norm3\n")
+ f.write(u"test1,testNonethe" + utf8 + u",test3\n")
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '',
+ np.VisibleDeprecationWarning)
+ test = np.genfromtxt(path, dtype=None, comments=None,
+ delimiter=',')
+ # Check for warning when encoding not specified.
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ ctl = np.array([
+ ["norm1", "norm2", "norm3"],
+ ["norm1", latin1, "norm3"],
+ ["test1", "testNonethe" + utf8, "test3"]],
+ dtype=np.unicode)
+ assert_array_equal(test, ctl)
+
+ def test_recfromtxt(self):
+ #
+ data = TextIO('A,B\n0,1\n2,3')
+ kwargs = dict(delimiter=",", missing_values="N/A", names=True)
+ test = np.recfromtxt(data, **kwargs)
+ control = np.array([(0, 1), (2, 3)],
+ dtype=[('A', int), ('B', int)])
+ assert_(isinstance(test, np.recarray))
+ assert_equal(test, control)
+ #
+ data = TextIO('A,B\n0,1\n2,N/A')
+ test = np.recfromtxt(data, dtype=None, usemask=True, **kwargs)
+ control = ma.array([(0, 1), (2, -1)],
+ mask=[(False, False), (False, True)],
+ dtype=[('A', int), ('B', int)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ assert_equal(test.A, [0, 2])
+
+ def test_recfromcsv(self):
+ #
+ data = TextIO('A,B\n0,1\n2,3')
+ kwargs = dict(missing_values="N/A", names=True, case_sensitive=True)
+ test = np.recfromcsv(data, dtype=None, **kwargs)
+ control = np.array([(0, 1), (2, 3)],
+ dtype=[('A', int), ('B', int)])
+ assert_(isinstance(test, np.recarray))
+ assert_equal(test, control)
+ #
+ data = TextIO('A,B\n0,1\n2,N/A')
+ test = np.recfromcsv(data, dtype=None, usemask=True, **kwargs)
+ control = ma.array([(0, 1), (2, -1)],
+ mask=[(False, False), (False, True)],
+ dtype=[('A', int), ('B', int)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ assert_equal(test.A, [0, 2])
+ #
+ data = TextIO('A,B\n0,1\n2,3')
+ test = np.recfromcsv(data, missing_values='N/A',)
+ control = np.array([(0, 1), (2, 3)],
+ dtype=[('a', int), ('b', int)])
+ assert_(isinstance(test, np.recarray))
+ assert_equal(test, control)
+ #
+ data = TextIO('A,B\n0,1\n2,3')
+ dtype = [('a', int), ('b', float)]
+ test = np.recfromcsv(data, missing_values='N/A', dtype=dtype)
+ control = np.array([(0, 1), (2, 3)],
+ dtype=dtype)
+ assert_(isinstance(test, np.recarray))
+ assert_equal(test, control)
+
+ #gh-10394
+ data = TextIO('color\n"red"\n"blue"')
+ test = np.recfromcsv(data, converters={0: lambda x: x.strip(b'\"')})
+ control = np.array([('red',), ('blue',)], dtype=[('color', (bytes, 4))])
+ assert_equal(test.dtype, control.dtype)
+ assert_equal(test, control)
+
+ def test_max_rows(self):
+ # Test the `max_rows` keyword argument.
+ data = '1 2\n3 4\n5 6\n7 8\n9 10\n'
+ txt = TextIO(data)
+ a1 = np.genfromtxt(txt, max_rows=3)
+ a2 = np.genfromtxt(txt)
+ assert_equal(a1, [[1, 2], [3, 4], [5, 6]])
+ assert_equal(a2, [[7, 8], [9, 10]])
+
+ # max_rows must be at least 1.
+ assert_raises(ValueError, np.genfromtxt, TextIO(data), max_rows=0)
+
+ # An input with several invalid rows.
+ data = '1 1\n2 2\n0 \n3 3\n4 4\n5 \n6 \n7 \n'
+
+ test = np.genfromtxt(TextIO(data), max_rows=2)
+ control = np.array([[1., 1.], [2., 2.]])
+ assert_equal(test, control)
+
+ # Test keywords conflict
+ assert_raises(ValueError, np.genfromtxt, TextIO(data), skip_footer=1,
+ max_rows=4)
+
+ # Test with invalid value
+ assert_raises(ValueError, np.genfromtxt, TextIO(data), max_rows=4)
+
+ # Test with invalid not raise
+ with suppress_warnings() as sup:
+ sup.filter(ConversionWarning)
+
+ test = np.genfromtxt(TextIO(data), max_rows=4, invalid_raise=False)
+ control = np.array([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])
+ assert_equal(test, control)
+
+ test = np.genfromtxt(TextIO(data), max_rows=5, invalid_raise=False)
+ control = np.array([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])
+ assert_equal(test, control)
+
+ # Structured array with field names.
+ data = 'a b\n#c d\n1 1\n2 2\n#0 \n3 3\n4 4\n5 5\n'
+
+ # Test with header, names and comments
+ txt = TextIO(data)
+ test = np.genfromtxt(txt, skip_header=1, max_rows=3, names=True)
+ control = np.array([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
+ dtype=[('c', '<f8'), ('d', '<f8')])
+ assert_equal(test, control)
+ # To continue reading the same "file", don't use skip_header or
+ # names, and use the previously determined dtype.
+ test = np.genfromtxt(txt, max_rows=None, dtype=test.dtype)
+ control = np.array([(4.0, 4.0), (5.0, 5.0)],
+ dtype=[('c', '<f8'), ('d', '<f8')])
+ assert_equal(test, control)
+
+ def test_gft_using_filename(self):
+ # Test that we can load data from a filename as well as a file
+ # object
+ tgt = np.arange(6).reshape((2, 3))
+ linesep = ('\n', '\r\n', '\r')
+
+ for sep in linesep:
+ data = '0 1 2' + sep + '3 4 5'
+ with temppath() as name:
+ with open(name, 'w') as f:
+ f.write(data)
+ res = np.genfromtxt(name)
+ assert_array_equal(res, tgt)
+
+ def test_gft_from_gzip(self):
+ # Test that we can load data from a gzipped file
+ wanted = np.arange(6).reshape((2, 3))
+ linesep = ('\n', '\r\n', '\r')
+
+ for sep in linesep:
+ data = '0 1 2' + sep + '3 4 5'
+ s = BytesIO()
+ with gzip.GzipFile(fileobj=s, mode='w') as g:
+ g.write(asbytes(data))
+
+ with temppath(suffix='.gz2') as name:
+ with open(name, 'w') as f:
+ f.write(data)
+ assert_array_equal(np.genfromtxt(name), wanted)
+
+ def test_gft_using_generator(self):
+ # genfromtxt (gft) doesn't work with unicode here, so the generator yields bytes.
+ def count():
+ for i in range(10):
+ yield asbytes("%d" % i)
+
+ res = np.genfromtxt(count())
+ assert_array_equal(res, np.arange(10))
+
+ def test_auto_dtype_largeint(self):
+ # Regression test for numpy/numpy#5635 whereby large integers could
+ # cause OverflowErrors.
+
+ # Test the automatic definition of the output dtype
+ #
+ # 2**66 = 73786976294838206464 => should convert to float
+ # 2**34 = 17179869184 => should convert to int64
+ # 2**10 = 1024 => should convert to int (int32 on 32-bit systems,
+ # int64 on 64-bit systems)
+
+ data = TextIO('73786976294838206464 17179869184 1024')
+
+ test = np.ndfromtxt(data, dtype=None)
+
+ assert_equal(test.dtype.names, ['f0', 'f1', 'f2'])
+
+ assert_(test.dtype['f0'] == float)
+ assert_(test.dtype['f1'] == np.int64)
+ assert_(test.dtype['f2'] == np.integer)
+
+ assert_allclose(test['f0'], 73786976294838206464.)
+ assert_equal(test['f1'], 17179869184)
+ assert_equal(test['f2'], 1024)
+
+
[email protected](Path is None, reason="No pathlib.Path")
+class TestPathUsage(object):
+ # Test that pathlib.Path can be used
+ def test_loadtxt(self):
+ with temppath(suffix='.txt') as path:
+ path = Path(path)
+ a = np.array([[1.1, 2], [3, 4]])
+ np.savetxt(path, a)
+ x = np.loadtxt(path)
+ assert_array_equal(x, a)
+
+ def test_save_load(self):
+ # Test that pathlib.Path instances can be used with save.
+ with temppath(suffix='.npy') as path:
+ path = Path(path)
+ a = np.array([[1, 2], [3, 4]], int)
+ np.save(path, a)
+ data = np.load(path)
+ assert_array_equal(data, a)
+
+ def test_save_load_memmap(self):
+ # Test that pathlib.Path instances can be loaded mem-mapped.
+ with temppath(suffix='.npy') as path:
+ path = Path(path)
+ a = np.array([[1, 2], [3, 4]], int)
+ np.save(path, a)
+ data = np.load(path, mmap_mode='r')
+ assert_array_equal(data, a)
+ # close the mem-mapped file
+ del data
+
+ def test_save_load_memmap_readwrite(self):
+ # Test that pathlib.Path instances can be written mem-mapped.
+ with temppath(suffix='.npy') as path:
+ path = Path(path)
+ a = np.array([[1, 2], [3, 4]], int)
+ np.save(path, a)
+ b = np.load(path, mmap_mode='r+')
+ a[0][0] = 5
+ b[0][0] = 5
+ del b # closes the file
+ data = np.load(path)
+ assert_array_equal(data, a)
+
+ def test_savez_load(self):
+ # Test that pathlib.Path instances can be used with savez.
+ with temppath(suffix='.npz') as path:
+ path = Path(path)
+ np.savez(path, lab='place holder')
+ with np.load(path) as data:
+ assert_array_equal(data['lab'], 'place holder')
+
+ def test_savez_compressed_load(self):
+ # Test that pathlib.Path instances can be used with savez.
+ with temppath(suffix='.npz') as path:
+ path = Path(path)
+ np.savez_compressed(path, lab='place holder')
+ data = np.load(path)
+ assert_array_equal(data['lab'], 'place holder')
+ data.close()
+
+ def test_genfromtxt(self):
+ with temppath(suffix='.txt') as path:
+ path = Path(path)
+ a = np.array([(1, 2), (3, 4)])
+ np.savetxt(path, a)
+ data = np.genfromtxt(path)
+ assert_array_equal(a, data)
+
+ def test_ndfromtxt(self):
+ # Test outputting a standard ndarray
+ with temppath(suffix='.txt') as path:
+ path = Path(path)
+ with path.open('w') as f:
+ f.write(u'1 2\n3 4')
+
+ control = np.array([[1, 2], [3, 4]], dtype=int)
+ test = np.ndfromtxt(path, dtype=int)
+ assert_array_equal(test, control)
+
+ def test_mafromtxt(self):
+ # From `test_fancy_dtype_alt` above
+ with temppath(suffix='.txt') as path:
+ path = Path(path)
+ with path.open('w') as f:
+ f.write(u'1,2,3.0\n4,5,6.0\n')
+
+ test = np.mafromtxt(path, delimiter=',')
+ control = ma.array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)])
+ assert_equal(test, control)
+
+ def test_recfromtxt(self):
+ with temppath(suffix='.txt') as path:
+ path = Path(path)
+ with path.open('w') as f:
+ f.write(u'A,B\n0,1\n2,3')
+
+ kwargs = dict(delimiter=",", missing_values="N/A", names=True)
+ test = np.recfromtxt(path, **kwargs)
+ control = np.array([(0, 1), (2, 3)],
+ dtype=[('A', int), ('B', int)])
+ assert_(isinstance(test, np.recarray))
+ assert_equal(test, control)
+
+ def test_recfromcsv(self):
+ with temppath(suffix='.txt') as path:
+ path = Path(path)
+ with path.open('w') as f:
+ f.write(u'A,B\n0,1\n2,3')
+
+ kwargs = dict(missing_values="N/A", names=True, case_sensitive=True)
+ test = np.recfromcsv(path, dtype=None, **kwargs)
+ control = np.array([(0, 1), (2, 3)],
+ dtype=[('A', int), ('B', int)])
+ assert_(isinstance(test, np.recarray))
+ assert_equal(test, control)
+
+
+def test_gzip_load():
+ a = np.random.random((5, 5))
+
+ s = BytesIO()
+ f = gzip.GzipFile(fileobj=s, mode="w")
+
+ np.save(f, a)
+ f.close()
+ s.seek(0)
+
+ f = gzip.GzipFile(fileobj=s, mode="r")
+ assert_array_equal(np.load(f), a)
+
+
+def test_gzip_loadtxt():
+ # Thanks to another Windows quirk, we can't use NamedTemporaryFile: a file
+ # created by it cannot be reopened by another open call. So we first build
+ # the gzipped bytes of the test reference array in memory, write them to a
+ # securely opened temporary file, and then read that file back with loadtxt.
+ s = BytesIO()
+ g = gzip.GzipFile(fileobj=s, mode='w')
+ g.write(b'1 2 3\n')
+ g.close()
+
+ s.seek(0)
+ with temppath(suffix='.gz') as name:
+ with open(name, 'wb') as f:
+ f.write(s.read())
+ res = np.loadtxt(name)
+ s.close()
+
+ assert_array_equal(res, [1, 2, 3])
+
+
+def test_gzip_loadtxt_from_string():
+ s = BytesIO()
+ f = gzip.GzipFile(fileobj=s, mode="w")
+ f.write(b'1 2 3\n')
+ f.close()
+ s.seek(0)
+
+ f = gzip.GzipFile(fileobj=s, mode="r")
+ assert_array_equal(np.loadtxt(f), [1, 2, 3])
+
+
+def test_npzfile_dict():
+ s = BytesIO()
+ x = np.zeros((3, 3))
+ y = np.zeros((3, 3))
+
+ np.savez(s, x=x, y=y)
+ s.seek(0)
+
+ z = np.load(s)
+
+ assert_('x' in z)
+ assert_('y' in z)
+ assert_('x' in z.keys())
+ assert_('y' in z.keys())
+
+ for f, a in z.items():
+ assert_(f in ['x', 'y'])
+ assert_equal(a.shape, (3, 3))
+
+ assert_(len(z.items()) == 2)
+
+ for f in z:
+ assert_(f in ['x', 'y'])
+
+ assert_('x' in z.keys())
+
+
[email protected](not HAS_REFCOUNT, reason="Python lacks refcounts")
+def test_load_refcount():
+ # Check that objects returned by np.load are directly freed based on
+ # their refcount, rather than needing the gc to collect them.
+
+ f = BytesIO()
+ np.savez(f, [1, 2, 3])
+ f.seek(0)
+
+ with assert_no_gc_cycles():
+ np.load(f)
+
+ f.seek(0)
+ dt = [("a", 'u1', 2), ("b", 'u1', 2)]
+ with assert_no_gc_cycles():
+ x = np.loadtxt(TextIO("0 1 2 3"), dtype=dt)
+ assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt))
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_mixins.py b/contrib/python/numpy/py2/numpy/lib/tests/test_mixins.py
new file mode 100644
index 00000000000..3dd5346b695
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_mixins.py
@@ -0,0 +1,224 @@
+from __future__ import division, absolute_import, print_function
+
+import numbers
+import operator
+import sys
+
+import numpy as np
+from numpy.testing import assert_, assert_equal, assert_raises
+
+
+PY2 = sys.version_info.major < 3
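+# Python 2 has no @ operator, so the matmul test below only exercises
+# operator.matmul on Python 3.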
+
+
+# NOTE: This class should be kept as an exact copy of the example from the
+# docstring for NDArrayOperatorsMixin.
+
+class ArrayLike(np.lib.mixins.NDArrayOperatorsMixin):
+ def __init__(self, value):
+ self.value = np.asarray(value)
+
+ # One might also consider adding the built-in list type to this
+ # list, to support operations like np.add(array_like, list)
+ _HANDLED_TYPES = (np.ndarray, numbers.Number)
+
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ out = kwargs.get('out', ())
+ for x in inputs + out:
+ # Only support operations with instances of _HANDLED_TYPES.
+ # Use ArrayLike instead of type(self) for isinstance to
+ # allow subclasses that don't override __array_ufunc__ to
+ # handle ArrayLike objects.
+ if not isinstance(x, self._HANDLED_TYPES + (ArrayLike,)):
+ return NotImplemented
+
+ # Defer to the implementation of the ufunc on unwrapped values.
+ inputs = tuple(x.value if isinstance(x, ArrayLike) else x
+ for x in inputs)
+ if out:
+ kwargs['out'] = tuple(
+ x.value if isinstance(x, ArrayLike) else x
+ for x in out)
+ result = getattr(ufunc, method)(*inputs, **kwargs)
+
+ if type(result) is tuple:
+ # multiple return values
+ return tuple(type(self)(x) for x in result)
+ elif method == 'at':
+ # no return value
+ return None
+ else:
+ # one return value
+ return type(self)(result)
+
+ def __repr__(self):
+ return '%s(%r)' % (type(self).__name__, self.value)
+
+
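+# For illustration: np.add(ArrayLike(1), 2) dispatches to __array_ufunc__
+# above, unwraps the inputs, and returns ArrayLike(3).
+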
+def wrap_array_like(result):
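+ # Wrap a plain result, or each element of a tuple of results, in ArrayLike.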
+ if type(result) is tuple:
+ return tuple(ArrayLike(r) for r in result)
+ else:
+ return ArrayLike(result)
+
+
+def _assert_equal_type_and_value(result, expected, err_msg=None):
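+ # Recursively check that result and expected agree in type, value and dtype.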
+ assert_equal(type(result), type(expected), err_msg=err_msg)
+ if isinstance(result, tuple):
+ assert_equal(len(result), len(expected), err_msg=err_msg)
+ for result_item, expected_item in zip(result, expected):
+ _assert_equal_type_and_value(result_item, expected_item, err_msg)
+ else:
+ assert_equal(result.value, expected.value, err_msg=err_msg)
+ assert_equal(getattr(result.value, 'dtype', None),
+ getattr(expected.value, 'dtype', None), err_msg=err_msg)
+
+
+_ALL_BINARY_OPERATORS = [
+ operator.lt,
+ operator.le,
+ operator.eq,
+ operator.ne,
+ operator.gt,
+ operator.ge,
+ operator.add,
+ operator.sub,
+ operator.mul,
+ operator.truediv,
+ operator.floordiv,
+ # TODO: test div on Python 2 only
+ operator.mod,
+ divmod,
+ pow,
+ operator.lshift,
+ operator.rshift,
+ operator.and_,
+ operator.xor,
+ operator.or_,
+]
+
+
+class TestNDArrayOperatorsMixin(object):
+
+ def test_array_like_add(self):
+
+ def check(result):
+ _assert_equal_type_and_value(result, ArrayLike(0))
+
+ check(ArrayLike(0) + 0)
+ check(0 + ArrayLike(0))
+
+ check(ArrayLike(0) + np.array(0))
+ check(np.array(0) + ArrayLike(0))
+
+ check(ArrayLike(np.array(0)) + 0)
+ check(0 + ArrayLike(np.array(0)))
+
+ check(ArrayLike(np.array(0)) + np.array(0))
+ check(np.array(0) + ArrayLike(np.array(0)))
+
+ def test_inplace(self):
+ array_like = ArrayLike(np.array([0]))
+ array_like += 1
+ _assert_equal_type_and_value(array_like, ArrayLike(np.array([1])))
+
+ array = np.array([0])
+ array += ArrayLike(1)
+ _assert_equal_type_and_value(array, ArrayLike(np.array([1])))
+
+ def test_opt_out(self):
+
+ class OptOut(object):
+ """Object that opts out of __array_ufunc__."""
+ __array_ufunc__ = None
+
+ def __add__(self, other):
+ return self
+
+ def __radd__(self, other):
+ return self
+
+ array_like = ArrayLike(1)
+ opt_out = OptOut()
+
+ # supported operations
+ assert_(array_like + opt_out is opt_out)
+ assert_(opt_out + array_like is opt_out)
+
+ # not supported
+ with assert_raises(TypeError):
+ # don't use the Python default, array_like = array_like + opt_out
+ array_like += opt_out
+ with assert_raises(TypeError):
+ array_like - opt_out
+ with assert_raises(TypeError):
+ opt_out - array_like
+
+ def test_subclass(self):
+
+ class SubArrayLike(ArrayLike):
+ """Should take precedence over ArrayLike."""
+
+ x = ArrayLike(0)
+ y = SubArrayLike(1)
+ _assert_equal_type_and_value(x + y, y)
+ _assert_equal_type_and_value(y + x, y)
+
+ def test_object(self):
+ x = ArrayLike(0)
+ obj = object()
+ with assert_raises(TypeError):
+ x + obj
+ with assert_raises(TypeError):
+ obj + x
+ with assert_raises(TypeError):
+ x += obj
+
+ def test_unary_methods(self):
+ array = np.array([-1, 0, 1, 2])
+ array_like = ArrayLike(array)
+ for op in [operator.neg,
+ operator.pos,
+ abs,
+ operator.invert]:
+ _assert_equal_type_and_value(op(array_like), ArrayLike(op(array)))
+
+ def test_forward_binary_methods(self):
+ array = np.array([-1, 0, 1, 2])
+ array_like = ArrayLike(array)
+ for op in _ALL_BINARY_OPERATORS:
+ expected = wrap_array_like(op(array, 1))
+ actual = op(array_like, 1)
+ err_msg = 'failed for operator {}'.format(op)
+ _assert_equal_type_and_value(expected, actual, err_msg=err_msg)
+
+ def test_reflected_binary_methods(self):
+ for op in _ALL_BINARY_OPERATORS:
+ expected = wrap_array_like(op(2, 1))
+ actual = op(2, ArrayLike(1))
+ err_msg = 'failed for operator {}'.format(op)
+ _assert_equal_type_and_value(expected, actual, err_msg=err_msg)
+
+ def test_matmul(self):
+ array = np.array([1, 2], dtype=np.float64)
+ array_like = ArrayLike(array)
+ expected = ArrayLike(np.float64(5))
+ _assert_equal_type_and_value(expected, np.matmul(array_like, array))
+ if not PY2:
+ _assert_equal_type_and_value(
+ expected, operator.matmul(array_like, array))
+ _assert_equal_type_and_value(
+ expected, operator.matmul(array, array_like))
+
+ def test_ufunc_at(self):
+ array = ArrayLike(np.array([1, 2, 3, 4]))
+ assert_(np.negative.at(array, np.array([0, 1])) is None)
+ _assert_equal_type_and_value(array, ArrayLike([-1, -2, 3, 4]))
+
+ def test_ufunc_two_outputs(self):
+ mantissa, exponent = np.frexp(2 ** -3)
+ expected = (ArrayLike(mantissa), ArrayLike(exponent))
+ _assert_equal_type_and_value(
+ np.frexp(ArrayLike(2 ** -3)), expected)
+ _assert_equal_type_and_value(
+ np.frexp(ArrayLike(np.array(2 ** -3))), expected)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_nanfunctions.py b/contrib/python/numpy/py2/numpy/lib/tests/test_nanfunctions.py
new file mode 100644
index 00000000000..504372faf5f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_nanfunctions.py
@@ -0,0 +1,927 @@
+from __future__ import division, absolute_import, print_function
+
+import warnings
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_almost_equal, assert_no_warnings,
+ assert_raises, assert_array_equal, suppress_warnings
+ )
+
+
+# Test data
+_ndat = np.array([[0.6244, np.nan, 0.2692, 0.0116, np.nan, 0.1170],
+ [0.5351, -0.9403, np.nan, 0.2100, 0.4759, 0.2833],
+ [np.nan, np.nan, np.nan, 0.1042, np.nan, -0.5954],
+ [0.1610, np.nan, np.nan, 0.1859, 0.3146, np.nan]])
+
+
+# Rows of _ndat with nans removed
+_rdat = [np.array([0.6244, 0.2692, 0.0116, 0.1170]),
+ np.array([0.5351, -0.9403, 0.2100, 0.4759, 0.2833]),
+ np.array([0.1042, -0.5954]),
+ np.array([0.1610, 0.1859, 0.3146])]
+
+# Rows of _ndat with nans converted to ones
+_ndat_ones = np.array([[0.6244, 1.0, 0.2692, 0.0116, 1.0, 0.1170],
+ [0.5351, -0.9403, 1.0, 0.2100, 0.4759, 0.2833],
+ [1.0, 1.0, 1.0, 0.1042, 1.0, -0.5954],
+ [0.1610, 1.0, 1.0, 0.1859, 0.3146, 1.0]])
+
+# Rows of _ndat with nans converted to zeros
+_ndat_zeros = np.array([[0.6244, 0.0, 0.2692, 0.0116, 0.0, 0.1170],
+ [0.5351, -0.9403, 0.0, 0.2100, 0.4759, 0.2833],
+ [0.0, 0.0, 0.0, 0.1042, 0.0, -0.5954],
+ [0.1610, 0.0, 0.0, 0.1859, 0.3146, 0.0]])
+
+
+class TestNanFunctions_MinMax(object):
+
+ nanfuncs = [np.nanmin, np.nanmax]
+ stdfuncs = [np.min, np.max]
+
+ def test_mutation(self):
+ # Check that passed array is not modified.
+ ndat = _ndat.copy()
+ for f in self.nanfuncs:
+ f(ndat)
+ assert_equal(ndat, _ndat)
+
+ def test_keepdims(self):
+ mat = np.eye(3)
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ for axis in [None, 0, 1]:
+ tgt = rf(mat, axis=axis, keepdims=True)
+ res = nf(mat, axis=axis, keepdims=True)
+ assert_(res.ndim == tgt.ndim)
+
+ def test_out(self):
+ mat = np.eye(3)
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ resout = np.zeros(3)
+ tgt = rf(mat, axis=1)
+ res = nf(mat, axis=1, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+
+ def test_dtype_from_input(self):
+ codes = 'efdgFDG'
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ for c in codes:
+ mat = np.eye(3, dtype=c)
+ tgt = rf(mat, axis=1).dtype.type
+ res = nf(mat, axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ tgt = rf(mat, axis=None).dtype.type
+ res = nf(mat, axis=None).dtype.type
+ assert_(res is tgt)
+
+ def test_result_values(self):
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ tgt = [rf(d) for d in _rdat]
+ res = nf(_ndat, axis=1)
+ assert_almost_equal(res, tgt)
+
+ def test_allnans(self):
+ mat = np.array([np.nan]*9).reshape(3, 3)
+ for f in self.nanfuncs:
+ for axis in [None, 0, 1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(f(mat, axis=axis)).all())
+ assert_(len(w) == 1, 'no warning raised')
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ # Check scalars
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(f(np.nan)))
+ assert_(len(w) == 1, 'no warning raised')
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+ def test_masked(self):
+ mat = np.ma.fix_invalid(_ndat)
+ msk = mat._mask.copy()
+ for f in [np.nanmin]:
+ res = f(mat, axis=1)
+ tgt = f(_ndat, axis=1)
+ assert_equal(res, tgt)
+ assert_equal(mat._mask, msk)
+ assert_(not np.isinf(mat).any())
+
+ def test_scalar(self):
+ for f in self.nanfuncs:
+ assert_(f(0.) == 0.)
+
+ def test_subclass(self):
+ class MyNDArray(np.ndarray):
+ pass
+
+ # Check that it works and that type and
+ # shape are preserved
+ mine = np.eye(3).view(MyNDArray)
+ for f in self.nanfuncs:
+ res = f(mine, axis=0)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == (3,))
+ res = f(mine, axis=1)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == (3,))
+ res = f(mine)
+ assert_(res.shape == ())
+
+ # check that rows of nan are dealt with for subclasses (#4628)
+ mine[1] = np.nan
+ for f in self.nanfuncs:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(mine, axis=0)
+ assert_(isinstance(res, MyNDArray))
+ assert_(not np.any(np.isnan(res)))
+ assert_(len(w) == 0)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(mine, axis=1)
+ assert_(isinstance(res, MyNDArray))
+ assert_(np.isnan(res[1]) and not np.isnan(res[0])
+ and not np.isnan(res[2]))
+ assert_(len(w) == 1, 'no warning raised')
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(mine)
+ assert_(res.shape == ())
+ assert_(not np.isnan(res))
+ assert_(len(w) == 0)
+
+ def test_object_array(self):
+ arr = np.array([[1.0, 2.0], [np.nan, 4.0], [np.nan, np.nan]], dtype=object)
+ assert_equal(np.nanmin(arr), 1.0)
+ assert_equal(np.nanmin(arr, axis=0), [1.0, 2.0])
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ # assert_equal does not work on object arrays of nan
+ assert_equal(list(np.nanmin(arr, axis=1)), [1.0, 4.0, np.nan])
+ assert_(len(w) == 1, 'no warning raised')
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+
+class TestNanFunctions_ArgminArgmax(object):
+
+ nanfuncs = [np.nanargmin, np.nanargmax]
+
+ def test_mutation(self):
+ # Check that passed array is not modified.
+ ndat = _ndat.copy()
+ for f in self.nanfuncs:
+ f(ndat)
+ assert_equal(ndat, _ndat)
+
+ def test_result_values(self):
+ for f, fcmp in zip(self.nanfuncs, [np.greater, np.less]):
+ for row in _ndat:
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in")
+ ind = f(row)
+ val = row[ind]
+ # comparing with NaN is tricky as the result
+ # is always false except for NaN != NaN
+ assert_(not np.isnan(val))
+ assert_(not fcmp(val, row).any())
+ assert_(not np.equal(val, row[:ind]).any())
+
+ def test_allnans(self):
+ mat = np.array([np.nan]*9).reshape(3, 3)
+ for f in self.nanfuncs:
+ for axis in [None, 0, 1]:
+ assert_raises(ValueError, f, mat, axis=axis)
+ assert_raises(ValueError, f, np.nan)
+
+ def test_empty(self):
+ mat = np.zeros((0, 3))
+ for f in self.nanfuncs:
+ for axis in [0, None]:
+ assert_raises(ValueError, f, mat, axis=axis)
+ for axis in [1]:
+ res = f(mat, axis=axis)
+ assert_equal(res, np.zeros(0))
+
+ def test_scalar(self):
+ for f in self.nanfuncs:
+ assert_(f(0.) == 0.)
+
+ def test_subclass(self):
+ class MyNDArray(np.ndarray):
+ pass
+
+ # Check that it works and that type and
+ # shape are preserved
+ mine = np.eye(3).view(MyNDArray)
+ for f in self.nanfuncs:
+ res = f(mine, axis=0)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == (3,))
+ res = f(mine, axis=1)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == (3,))
+ res = f(mine)
+ assert_(res.shape == ())
+
+
+class TestNanFunctions_IntTypes(object):
+
+ int_types = (np.int8, np.int16, np.int32, np.int64, np.uint8,
+ np.uint16, np.uint32, np.uint64)
+
+ mat = np.array([127, 39, 93, 87, 46])
+
+ def integer_arrays(self):
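+ # Yield the reference vector cast to each supported integer dtype.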
+ for dtype in self.int_types:
+ yield self.mat.astype(dtype)
+
+ def test_nanmin(self):
+ tgt = np.min(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanmin(mat), tgt)
+
+ def test_nanmax(self):
+ tgt = np.max(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanmax(mat), tgt)
+
+ def test_nanargmin(self):
+ tgt = np.argmin(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanargmin(mat), tgt)
+
+ def test_nanargmax(self):
+ tgt = np.argmax(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanargmax(mat), tgt)
+
+ def test_nansum(self):
+ tgt = np.sum(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nansum(mat), tgt)
+
+ def test_nanprod(self):
+ tgt = np.prod(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanprod(mat), tgt)
+
+ def test_nancumsum(self):
+ tgt = np.cumsum(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nancumsum(mat), tgt)
+
+ def test_nancumprod(self):
+ tgt = np.cumprod(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nancumprod(mat), tgt)
+
+ def test_nanmean(self):
+ tgt = np.mean(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanmean(mat), tgt)
+
+ def test_nanvar(self):
+ tgt = np.var(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanvar(mat), tgt)
+
+ tgt = np.var(self.mat, ddof=1)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanvar(mat, ddof=1), tgt)
+
+ def test_nanstd(self):
+ tgt = np.std(self.mat)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanstd(mat), tgt)
+
+ tgt = np.std(self.mat, ddof=1)
+ for mat in self.integer_arrays():
+ assert_equal(np.nanstd(mat, ddof=1), tgt)
+
+
+class SharedNanFunctionsTestsMixin(object):
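+ # Checks shared by the nan-aggregation test classes below; subclasses
+ # define matching `nanfuncs` and `stdfuncs` lists.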
+ def test_mutation(self):
+ # Check that passed array is not modified.
+ ndat = _ndat.copy()
+ for f in self.nanfuncs:
+ f(ndat)
+ assert_equal(ndat, _ndat)
+
+ def test_keepdims(self):
+ mat = np.eye(3)
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ for axis in [None, 0, 1]:
+ tgt = rf(mat, axis=axis, keepdims=True)
+ res = nf(mat, axis=axis, keepdims=True)
+ assert_(res.ndim == tgt.ndim)
+
+ def test_out(self):
+ mat = np.eye(3)
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ resout = np.zeros(3)
+ tgt = rf(mat, axis=1)
+ res = nf(mat, axis=1, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+
+ def test_dtype_from_dtype(self):
+ mat = np.eye(3)
+ codes = 'efdgFDG'
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ for c in codes:
+ with suppress_warnings() as sup:
+ if nf in {np.nanstd, np.nanvar} and c in 'FDG':
+ # Giving the warning is a small bug, see gh-8000
+ sup.filter(np.ComplexWarning)
+ tgt = rf(mat, dtype=np.dtype(c), axis=1).dtype.type
+ res = nf(mat, dtype=np.dtype(c), axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ tgt = rf(mat, dtype=np.dtype(c), axis=None).dtype.type
+ res = nf(mat, dtype=np.dtype(c), axis=None).dtype.type
+ assert_(res is tgt)
+
+ def test_dtype_from_char(self):
+ mat = np.eye(3)
+ codes = 'efdgFDG'
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ for c in codes:
+ with suppress_warnings() as sup:
+ if nf in {np.nanstd, np.nanvar} and c in 'FDG':
+ # Giving the warning is a small bug, see gh-8000
+ sup.filter(np.ComplexWarning)
+ tgt = rf(mat, dtype=c, axis=1).dtype.type
+ res = nf(mat, dtype=c, axis=1).dtype.type
+ assert_(res is tgt)
+ # scalar case
+ tgt = rf(mat, dtype=c, axis=None).dtype.type
+ res = nf(mat, dtype=c, axis=None).dtype.type
+ assert_(res is tgt)
+
+ def test_dtype_from_input(self):
+ codes = 'efdgFDG'
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ for c in codes:
+ mat = np.eye(3, dtype=c)
+ tgt = rf(mat, axis=1).dtype.type
+ res = nf(mat, axis=1).dtype.type
+ assert_(res is tgt, "res %s, tgt %s" % (res, tgt))
+ # scalar case
+ tgt = rf(mat, axis=None).dtype.type
+ res = nf(mat, axis=None).dtype.type
+ assert_(res is tgt)
+
+ def test_result_values(self):
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ tgt = [rf(d) for d in _rdat]
+ res = nf(_ndat, axis=1)
+ assert_almost_equal(res, tgt)
+
+ def test_scalar(self):
+ for f in self.nanfuncs:
+ assert_(f(0.) == 0.)
+
+ def test_subclass(self):
+ class MyNDArray(np.ndarray):
+ pass
+
+ # Check that it works and that type and
+ # shape are preserved
+ array = np.eye(3)
+ mine = array.view(MyNDArray)
+ for f in self.nanfuncs:
+ expected_shape = f(array, axis=0).shape
+ res = f(mine, axis=0)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == expected_shape)
+ expected_shape = f(array, axis=1).shape
+ res = f(mine, axis=1)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == expected_shape)
+ expected_shape = f(array).shape
+ res = f(mine)
+ assert_(isinstance(res, MyNDArray))
+ assert_(res.shape == expected_shape)
+
+
+class TestNanFunctions_SumProd(SharedNanFunctionsTestsMixin):
+
+ nanfuncs = [np.nansum, np.nanprod]
+ stdfuncs = [np.sum, np.prod]
+
+ def test_allnans(self):
+ # Check for FutureWarning
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = np.nansum([np.nan]*3, axis=None)
+ assert_(res == 0, 'result is not 0')
+ assert_(len(w) == 0, 'warning raised')
+ # Check scalar
+ res = np.nansum(np.nan)
+ assert_(res == 0, 'result is not 0')
+ assert_(len(w) == 0, 'warning raised')
+ # Check there is no warning for not all-nan
+ np.nansum([0]*3, axis=None)
+ assert_(len(w) == 0, 'unwanted warning raised')
+
+ def test_empty(self):
+ for f, tgt_value in zip([np.nansum, np.nanprod], [0, 1]):
+ mat = np.zeros((0, 3))
+ tgt = [tgt_value]*3
+ res = f(mat, axis=0)
+ assert_equal(res, tgt)
+ tgt = []
+ res = f(mat, axis=1)
+ assert_equal(res, tgt)
+ tgt = tgt_value
+ res = f(mat, axis=None)
+ assert_equal(res, tgt)
+
+
+class TestNanFunctions_CumSumProd(SharedNanFunctionsTestsMixin):
+
+ nanfuncs = [np.nancumsum, np.nancumprod]
+ stdfuncs = [np.cumsum, np.cumprod]
+
+ def test_allnans(self):
+ for f, tgt_value in zip(self.nanfuncs, [0, 1]):
+ # Unlike other nan-functions, sum/prod/cumsum/cumprod don't warn on all nan input
+ with assert_no_warnings():
+ res = f([np.nan]*3, axis=None)
+ tgt = tgt_value*np.ones((3))
+ assert_(np.array_equal(res, tgt), 'result is not %s * np.ones((3))' % (tgt_value))
+ # Check scalar
+ res = f(np.nan)
+ tgt = tgt_value*np.ones((1))
+ assert_(np.array_equal(res, tgt), 'result is not %s * np.ones((1))' % (tgt_value))
+ # Check there is no warning for not all-nan
+ f([0]*3, axis=None)
+
+ def test_empty(self):
+ for f, tgt_value in zip(self.nanfuncs, [0, 1]):
+ mat = np.zeros((0, 3))
+ tgt = tgt_value*np.ones((0, 3))
+ res = f(mat, axis=0)
+ assert_equal(res, tgt)
+ tgt = mat
+ res = f(mat, axis=1)
+ assert_equal(res, tgt)
+ tgt = np.zeros((0))
+ res = f(mat, axis=None)
+ assert_equal(res, tgt)
+
+ def test_keepdims(self):
+ for f, g in zip(self.nanfuncs, self.stdfuncs):
+ mat = np.eye(3)
+ for axis in [None, 0, 1]:
+ tgt = f(mat, axis=axis, out=None)
+ res = g(mat, axis=axis, out=None)
+ assert_(res.ndim == tgt.ndim)
+
+ for f in self.nanfuncs:
+ d = np.ones((3, 5, 7, 11))
+ # Randomly set some elements to NaN:
+ rs = np.random.RandomState(0)
+ d[rs.rand(*d.shape) < 0.5] = np.nan
+ res = f(d, axis=None)
+ assert_equal(res.shape, (1155,))
+ for axis in np.arange(4):
+ res = f(d, axis=axis)
+ assert_equal(res.shape, (3, 5, 7, 11))
+
+ def test_result_values(self):
+ for axis in (-2, -1, 0, 1, None):
+ tgt = np.cumprod(_ndat_ones, axis=axis)
+ res = np.nancumprod(_ndat, axis=axis)
+ assert_almost_equal(res, tgt)
+ tgt = np.cumsum(_ndat_zeros,axis=axis)
+ res = np.nancumsum(_ndat, axis=axis)
+ assert_almost_equal(res, tgt)
+
+ def test_out(self):
+ mat = np.eye(3)
+ for nf, rf in zip(self.nanfuncs, self.stdfuncs):
+ resout = np.eye(3)
+ for axis in (-2, -1, 0, 1):
+ tgt = rf(mat, axis=axis)
+ res = nf(mat, axis=axis, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+
+
+class TestNanFunctions_MeanVarStd(SharedNanFunctionsTestsMixin):
+
+ nanfuncs = [np.nanmean, np.nanvar, np.nanstd]
+ stdfuncs = [np.mean, np.var, np.std]
+
+ def test_dtype_error(self):
+ for f in self.nanfuncs:
+ for dtype in [np.bool_, np.int_, np.object_]:
+ assert_raises(TypeError, f, _ndat, axis=1, dtype=dtype)
+
+ def test_out_dtype_error(self):
+ for f in self.nanfuncs:
+ for dtype in [np.bool_, np.int_, np.object_]:
+ out = np.empty(_ndat.shape[0], dtype=dtype)
+ assert_raises(TypeError, f, _ndat, axis=1, out=out)
+
+ def test_ddof(self):
+ nanfuncs = [np.nanvar, np.nanstd]
+ stdfuncs = [np.var, np.std]
+ for nf, rf in zip(nanfuncs, stdfuncs):
+ for ddof in [0, 1]:
+ tgt = [rf(d, ddof=ddof) for d in _rdat]
+ res = nf(_ndat, axis=1, ddof=ddof)
+ assert_almost_equal(res, tgt)
+
+ def test_ddof_too_big(self):
+ nanfuncs = [np.nanvar, np.nanstd]
+ stdfuncs = [np.var, np.std]
+ dsize = [len(d) for d in _rdat]
+ for nf, rf in zip(nanfuncs, stdfuncs):
+ for ddof in range(5):
+ with suppress_warnings() as sup:
+ sup.record(RuntimeWarning)
+ sup.filter(np.ComplexWarning)
+ tgt = [ddof >= d for d in dsize]
+ res = nf(_ndat, axis=1, ddof=ddof)
+ assert_equal(np.isnan(res), tgt)
+ if any(tgt):
+ assert_(len(sup.log) == 1)
+ else:
+ assert_(len(sup.log) == 0)
+
+ def test_allnans(self):
+ mat = np.array([np.nan]*9).reshape(3, 3)
+ for f in self.nanfuncs:
+ for axis in [None, 0, 1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(f(mat, axis=axis)).all())
+ assert_(len(w) == 1)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ # Check scalar
+ assert_(np.isnan(f(np.nan)))
+ assert_(len(w) == 2)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+ def test_empty(self):
+ mat = np.zeros((0, 3))
+ for f in self.nanfuncs:
+ for axis in [0, None]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(f(mat, axis=axis)).all())
+ assert_(len(w) == 1)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ for axis in [1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_equal(f(mat, axis=axis), np.zeros([]))
+ assert_(len(w) == 0)
+
+
+class TestNanFunctions_Median(object):
+
+ def test_mutation(self):
+ # Check that passed array is not modified.
+ ndat = _ndat.copy()
+ np.nanmedian(ndat)
+ assert_equal(ndat, _ndat)
+
+ def test_keepdims(self):
+ mat = np.eye(3)
+ for axis in [None, 0, 1]:
+ tgt = np.median(mat, axis=axis, out=None, overwrite_input=False)
+ res = np.nanmedian(mat, axis=axis, out=None, overwrite_input=False)
+ assert_(res.ndim == tgt.ndim)
+
+ d = np.ones((3, 5, 7, 11))
+ # Randomly set some elements to NaN:
+ w = np.random.random((4, 200)) * np.array(d.shape)[:, None]
+ w = w.astype(np.intp)
+ d[tuple(w)] = np.nan
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning)
+ res = np.nanmedian(d, axis=None, keepdims=True)
+ assert_equal(res.shape, (1, 1, 1, 1))
+ res = np.nanmedian(d, axis=(0, 1), keepdims=True)
+ assert_equal(res.shape, (1, 1, 7, 11))
+ res = np.nanmedian(d, axis=(0, 3), keepdims=True)
+ assert_equal(res.shape, (1, 5, 7, 1))
+ res = np.nanmedian(d, axis=(1,), keepdims=True)
+ assert_equal(res.shape, (3, 1, 7, 11))
+ res = np.nanmedian(d, axis=(0, 1, 2, 3), keepdims=True)
+ assert_equal(res.shape, (1, 1, 1, 1))
+ res = np.nanmedian(d, axis=(0, 1, 3), keepdims=True)
+ assert_equal(res.shape, (1, 1, 7, 1))
+
+ def test_out(self):
+ mat = np.random.rand(3, 3)
+ nan_mat = np.insert(mat, [0, 2], np.nan, axis=1)
+ resout = np.zeros(3)
+ tgt = np.median(mat, axis=1)
+ res = np.nanmedian(nan_mat, axis=1, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+ # 0-d output:
+ resout = np.zeros(())
+ tgt = np.median(mat, axis=None)
+ res = np.nanmedian(nan_mat, axis=None, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+ res = np.nanmedian(nan_mat, axis=(0, 1), out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+
+ def test_small_large(self):
+        # Test both the small and large code paths; the current cutoff is 400 elements.
+ for s in [5, 20, 51, 200, 1000]:
+ d = np.random.randn(4, s)
+ # Randomly set some elements to NaN:
+ w = np.random.randint(0, d.size, size=d.size // 5)
+ d.ravel()[w] = np.nan
+            d[:, 0] = 1.  # ensure at least one good value
+ # use normal median without nans to compare
+ tgt = []
+ for x in d:
+ nonan = np.compress(~np.isnan(x), x)
+ tgt.append(np.median(nonan, overwrite_input=True))
+
+ assert_array_equal(np.nanmedian(d, axis=-1), tgt)
+
+ def test_result_values(self):
+ tgt = [np.median(d) for d in _rdat]
+ res = np.nanmedian(_ndat, axis=1)
+ assert_almost_equal(res, tgt)
+
+ def test_allnans(self):
+ mat = np.array([np.nan]*9).reshape(3, 3)
+ for axis in [None, 0, 1]:
+ with suppress_warnings() as sup:
+ sup.record(RuntimeWarning)
+
+ assert_(np.isnan(np.nanmedian(mat, axis=axis)).all())
+ if axis is None:
+ assert_(len(sup.log) == 1)
+ else:
+ assert_(len(sup.log) == 3)
+ # Check scalar
+ assert_(np.isnan(np.nanmedian(np.nan)))
+ if axis is None:
+ assert_(len(sup.log) == 2)
+ else:
+ assert_(len(sup.log) == 4)
+
+ def test_empty(self):
+ mat = np.zeros((0, 3))
+ for axis in [0, None]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(np.nanmedian(mat, axis=axis)).all())
+ assert_(len(w) == 1)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ for axis in [1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_equal(np.nanmedian(mat, axis=axis), np.zeros([]))
+ assert_(len(w) == 0)
+
+ def test_scalar(self):
+ assert_(np.nanmedian(0.) == 0.)
+
+ def test_extended_axis_invalid(self):
+ d = np.ones((3, 5, 7, 11))
+ assert_raises(np.AxisError, np.nanmedian, d, axis=-5)
+ assert_raises(np.AxisError, np.nanmedian, d, axis=(0, -5))
+ assert_raises(np.AxisError, np.nanmedian, d, axis=4)
+ assert_raises(np.AxisError, np.nanmedian, d, axis=(0, 4))
+ assert_raises(ValueError, np.nanmedian, d, axis=(1, 1))
+
+ def test_float_special(self):
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning)
+ for inf in [np.inf, -np.inf]:
+ a = np.array([[inf, np.nan], [np.nan, np.nan]])
+ assert_equal(np.nanmedian(a, axis=0), [inf, np.nan])
+ assert_equal(np.nanmedian(a, axis=1), [inf, np.nan])
+ assert_equal(np.nanmedian(a), inf)
+
+ # minimum fill value check
+ a = np.array([[np.nan, np.nan, inf],
+ [np.nan, np.nan, inf]])
+ assert_equal(np.nanmedian(a), inf)
+ assert_equal(np.nanmedian(a, axis=0), [np.nan, np.nan, inf])
+ assert_equal(np.nanmedian(a, axis=1), inf)
+
+ # no mask path
+ a = np.array([[inf, inf], [inf, inf]])
+ assert_equal(np.nanmedian(a, axis=1), inf)
+
+ a = np.array([[inf, 7, -inf, -9],
+ [-10, np.nan, np.nan, 5],
+ [4, np.nan, np.nan, inf]],
+ dtype=np.float32)
+ if inf > 0:
+ assert_equal(np.nanmedian(a, axis=0), [4., 7., -inf, 5.])
+ assert_equal(np.nanmedian(a), 4.5)
+ else:
+ assert_equal(np.nanmedian(a, axis=0), [-10., 7., -inf, -9.])
+ assert_equal(np.nanmedian(a), -2.5)
+ assert_equal(np.nanmedian(a, axis=-1), [-1., -2.5, inf])
+
+ for i in range(0, 10):
+ for j in range(1, 10):
+ a = np.array([([np.nan] * i) + ([inf] * j)] * 2)
+ assert_equal(np.nanmedian(a), inf)
+ assert_equal(np.nanmedian(a, axis=1), inf)
+ assert_equal(np.nanmedian(a, axis=0),
+ ([np.nan] * i) + [inf] * j)
+
+ a = np.array([([np.nan] * i) + ([-inf] * j)] * 2)
+ assert_equal(np.nanmedian(a), -inf)
+ assert_equal(np.nanmedian(a, axis=1), -inf)
+ assert_equal(np.nanmedian(a, axis=0),
+ ([np.nan] * i) + [-inf] * j)
+
+
+class TestNanFunctions_Percentile(object):
+
+ def test_mutation(self):
+ # Check that passed array is not modified.
+ ndat = _ndat.copy()
+ np.nanpercentile(ndat, 30)
+ assert_equal(ndat, _ndat)
+
+ def test_keepdims(self):
+ mat = np.eye(3)
+ for axis in [None, 0, 1]:
+ tgt = np.percentile(mat, 70, axis=axis, out=None,
+ overwrite_input=False)
+ res = np.nanpercentile(mat, 70, axis=axis, out=None,
+ overwrite_input=False)
+ assert_(res.ndim == tgt.ndim)
+
+ d = np.ones((3, 5, 7, 11))
+ # Randomly set some elements to NaN:
+ w = np.random.random((4, 200)) * np.array(d.shape)[:, None]
+ w = w.astype(np.intp)
+ d[tuple(w)] = np.nan
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning)
+ res = np.nanpercentile(d, 90, axis=None, keepdims=True)
+ assert_equal(res.shape, (1, 1, 1, 1))
+ res = np.nanpercentile(d, 90, axis=(0, 1), keepdims=True)
+ assert_equal(res.shape, (1, 1, 7, 11))
+ res = np.nanpercentile(d, 90, axis=(0, 3), keepdims=True)
+ assert_equal(res.shape, (1, 5, 7, 1))
+ res = np.nanpercentile(d, 90, axis=(1,), keepdims=True)
+ assert_equal(res.shape, (3, 1, 7, 11))
+ res = np.nanpercentile(d, 90, axis=(0, 1, 2, 3), keepdims=True)
+ assert_equal(res.shape, (1, 1, 1, 1))
+ res = np.nanpercentile(d, 90, axis=(0, 1, 3), keepdims=True)
+ assert_equal(res.shape, (1, 1, 7, 1))
+
+ def test_out(self):
+ mat = np.random.rand(3, 3)
+ nan_mat = np.insert(mat, [0, 2], np.nan, axis=1)
+ resout = np.zeros(3)
+ tgt = np.percentile(mat, 42, axis=1)
+ res = np.nanpercentile(nan_mat, 42, axis=1, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+ # 0-d output:
+ resout = np.zeros(())
+ tgt = np.percentile(mat, 42, axis=None)
+ res = np.nanpercentile(nan_mat, 42, axis=None, out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+ res = np.nanpercentile(nan_mat, 42, axis=(0, 1), out=resout)
+ assert_almost_equal(res, resout)
+ assert_almost_equal(res, tgt)
+
+ def test_result_values(self):
+ tgt = [np.percentile(d, 28) for d in _rdat]
+ res = np.nanpercentile(_ndat, 28, axis=1)
+ assert_almost_equal(res, tgt)
+ # Transpose the array to fit the output convention of numpy.percentile
+ tgt = np.transpose([np.percentile(d, (28, 98)) for d in _rdat])
+ res = np.nanpercentile(_ndat, (28, 98), axis=1)
+ assert_almost_equal(res, tgt)
+
+ def test_allnans(self):
+ mat = np.array([np.nan]*9).reshape(3, 3)
+ for axis in [None, 0, 1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(np.nanpercentile(mat, 60, axis=axis)).all())
+ if axis is None:
+ assert_(len(w) == 1)
+ else:
+ assert_(len(w) == 3)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ # Check scalar
+ assert_(np.isnan(np.nanpercentile(np.nan, 60)))
+ if axis is None:
+ assert_(len(w) == 2)
+ else:
+ assert_(len(w) == 4)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+ def test_empty(self):
+ mat = np.zeros((0, 3))
+ for axis in [0, None]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_(np.isnan(np.nanpercentile(mat, 40, axis=axis)).all())
+ assert_(len(w) == 1)
+ assert_(issubclass(w[0].category, RuntimeWarning))
+ for axis in [1]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert_equal(np.nanpercentile(mat, 40, axis=axis), np.zeros([]))
+ assert_(len(w) == 0)
+
+ def test_scalar(self):
+ assert_equal(np.nanpercentile(0., 100), 0.)
+ a = np.arange(6)
+ r = np.nanpercentile(a, 50, axis=0)
+ assert_equal(r, 2.5)
+ assert_(np.isscalar(r))
+
+ def test_extended_axis_invalid(self):
+ d = np.ones((3, 5, 7, 11))
+ assert_raises(np.AxisError, np.nanpercentile, d, q=5, axis=-5)
+ assert_raises(np.AxisError, np.nanpercentile, d, q=5, axis=(0, -5))
+ assert_raises(np.AxisError, np.nanpercentile, d, q=5, axis=4)
+ assert_raises(np.AxisError, np.nanpercentile, d, q=5, axis=(0, 4))
+ assert_raises(ValueError, np.nanpercentile, d, q=5, axis=(1, 1))
+
+ def test_multiple_percentiles(self):
+ perc = [50, 100]
+ mat = np.ones((4, 3))
+ nan_mat = np.nan * mat
+        # For checking consistency in the higher-dimensional case
+ large_mat = np.ones((3, 4, 5))
+ large_mat[:, 0:2:4, :] = 0
+ large_mat[:, :, 3:] *= 2
+ for axis in [None, 0, 1]:
+ for keepdim in [False, True]:
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "All-NaN slice encountered")
+ val = np.percentile(mat, perc, axis=axis, keepdims=keepdim)
+ nan_val = np.nanpercentile(nan_mat, perc, axis=axis,
+ keepdims=keepdim)
+ assert_equal(nan_val.shape, val.shape)
+
+ val = np.percentile(large_mat, perc, axis=axis,
+ keepdims=keepdim)
+ nan_val = np.nanpercentile(large_mat, perc, axis=axis,
+ keepdims=keepdim)
+ assert_equal(nan_val, val)
+
+ megamat = np.ones((3, 4, 5, 6))
+ assert_equal(np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6))
+
+
+class TestNanFunctions_Quantile(object):
+ # most of this is already tested by TestPercentile
+
+ def test_regression(self):
+ ar = np.arange(24).reshape(2, 3, 4).astype(float)
+ ar[0][1] = np.nan
+
+ assert_equal(np.nanquantile(ar, q=0.5), np.nanpercentile(ar, q=50))
+ assert_equal(np.nanquantile(ar, q=0.5, axis=0),
+ np.nanpercentile(ar, q=50, axis=0))
+ assert_equal(np.nanquantile(ar, q=0.5, axis=1),
+ np.nanpercentile(ar, q=50, axis=1))
+ assert_equal(np.nanquantile(ar, q=[0.5], axis=1),
+ np.nanpercentile(ar, q=[50], axis=1))
+ assert_equal(np.nanquantile(ar, q=[0.25, 0.5, 0.75], axis=1),
+ np.nanpercentile(ar, q=[25, 50, 75], axis=1))
+
+ def test_basic(self):
+ x = np.arange(8) * 0.5
+ assert_equal(np.nanquantile(x, 0), 0.)
+ assert_equal(np.nanquantile(x, 1), 3.5)
+ assert_equal(np.nanquantile(x, 0.5), 1.75)
+
+ def test_no_p_overwrite(self):
+ # this is worth retesting, because quantile does not make a copy
+ p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
+ p = p0.copy()
+ np.nanquantile(np.arange(100.), p, interpolation="midpoint")
+ assert_array_equal(p, p0)
+
+ p0 = p0.tolist()
+ p = p.tolist()
+ np.nanquantile(np.arange(100.), p, interpolation="midpoint")
+ assert_array_equal(p, p0)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_packbits.py b/contrib/python/numpy/py2/numpy/lib/tests/test_packbits.py
new file mode 100644
index 00000000000..fde5c37f2e6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_packbits.py
@@ -0,0 +1,268 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_array_equal, assert_equal, assert_raises
+
+
+def test_packbits():
+ # Copied from the docstring.
+ a = [[[1, 0, 1], [0, 1, 0]],
+ [[1, 1, 0], [0, 0, 1]]]
+ for dt in '?bBhHiIlLqQ':
+ arr = np.array(a, dtype=dt)
+ b = np.packbits(arr, axis=-1)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, np.array([[[160], [64]], [[192], [32]]]))
+
+ assert_raises(TypeError, np.packbits, np.array(a, dtype=float))
+
+
+def test_packbits_empty():
+ shapes = [
+ (0,), (10, 20, 0), (10, 0, 20), (0, 10, 20), (20, 0, 0), (0, 20, 0),
+ (0, 0, 20), (0, 0, 0),
+ ]
+ for dt in '?bBhHiIlLqQ':
+ for shape in shapes:
+ a = np.empty(shape, dtype=dt)
+ b = np.packbits(a)
+ assert_equal(b.dtype, np.uint8)
+ assert_equal(b.shape, (0,))
+
+
+def test_packbits_empty_with_axis():
+ # Original shapes and lists of packed shapes for different axes.
+ shapes = [
+ ((0,), [(0,)]),
+ ((10, 20, 0), [(2, 20, 0), (10, 3, 0), (10, 20, 0)]),
+ ((10, 0, 20), [(2, 0, 20), (10, 0, 20), (10, 0, 3)]),
+ ((0, 10, 20), [(0, 10, 20), (0, 2, 20), (0, 10, 3)]),
+ ((20, 0, 0), [(3, 0, 0), (20, 0, 0), (20, 0, 0)]),
+ ((0, 20, 0), [(0, 20, 0), (0, 3, 0), (0, 20, 0)]),
+ ((0, 0, 20), [(0, 0, 20), (0, 0, 20), (0, 0, 3)]),
+ ((0, 0, 0), [(0, 0, 0), (0, 0, 0), (0, 0, 0)]),
+ ]
+ for dt in '?bBhHiIlLqQ':
+ for in_shape, out_shapes in shapes:
+ for ax, out_shape in enumerate(out_shapes):
+ a = np.empty(in_shape, dtype=dt)
+ b = np.packbits(a, axis=ax)
+ assert_equal(b.dtype, np.uint8)
+ assert_equal(b.shape, out_shape)
+
+
+def test_packbits_large():
+ # test data large enough for 16 byte vectorization
+ a = np.array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
+ 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
+ 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
+ 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
+ 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
+ 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
+ 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
+ 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
+ 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
+ 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
+ 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
+ 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
+ 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0])
+ a = a.repeat(3)
+ for dtype in '?bBhHiIlLqQ':
+ arr = np.array(a, dtype=dtype)
+ b = np.packbits(arr, axis=None)
+ assert_equal(b.dtype, np.uint8)
+ r = [252, 127, 192, 3, 254, 7, 252, 0, 7, 31, 240, 0, 28, 1, 255, 252,
+ 113, 248, 3, 255, 192, 28, 15, 192, 28, 126, 0, 224, 127, 255,
+ 227, 142, 7, 31, 142, 63, 28, 126, 56, 227, 240, 0, 227, 128, 63,
+ 224, 14, 56, 252, 112, 56, 255, 241, 248, 3, 240, 56, 224, 112,
+ 63, 255, 255, 199, 224, 14, 0, 31, 143, 192, 3, 255, 199, 0, 1,
+ 255, 224, 1, 255, 252, 126, 63, 0, 1, 192, 252, 14, 63, 0, 15,
+ 199, 252, 113, 255, 3, 128, 56, 252, 14, 7, 0, 113, 255, 255, 142, 56, 227,
+ 129, 248, 227, 129, 199, 31, 128]
+ assert_array_equal(b, r)
+    # equal when the size is a multiple of 8
+ assert_array_equal(np.unpackbits(b)[:-4], a)
+
+ # check last byte of different remainders (16 byte vectorization)
+ b = [np.packbits(arr[:-i], axis=None)[-1] for i in range(1, 16)]
+ assert_array_equal(b, [128, 128, 128, 31, 30, 28, 24, 16, 0, 0, 0, 199,
+ 198, 196, 192])
+
+
+ arr = arr.reshape(36, 25)
+ b = np.packbits(arr, axis=0)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, [[190, 186, 178, 178, 150, 215, 87, 83, 83, 195,
+ 199, 206, 204, 204, 140, 140, 136, 136, 8, 40, 105,
+ 107, 75, 74, 88],
+ [72, 216, 248, 241, 227, 195, 202, 90, 90, 83,
+ 83, 119, 127, 109, 73, 64, 208, 244, 189, 45,
+ 41, 104, 122, 90, 18],
+ [113, 120, 248, 216, 152, 24, 60, 52, 182, 150,
+ 150, 150, 146, 210, 210, 246, 255, 255, 223,
+ 151, 21, 17, 17, 131, 163],
+ [214, 210, 210, 64, 68, 5, 5, 1, 72, 88, 92,
+ 92, 78, 110, 39, 181, 149, 220, 222, 218, 218,
+ 202, 234, 170, 168],
+ [0, 128, 128, 192, 80, 112, 48, 160, 160, 224,
+ 240, 208, 144, 128, 160, 224, 240, 208, 144,
+ 144, 176, 240, 224, 192, 128]])
+
+ b = np.packbits(arr, axis=1)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, [[252, 127, 192, 0],
+ [ 7, 252, 15, 128],
+ [240, 0, 28, 0],
+ [255, 128, 0, 128],
+ [192, 31, 255, 128],
+ [142, 63, 0, 0],
+ [255, 240, 7, 0],
+ [ 7, 224, 14, 0],
+ [126, 0, 224, 0],
+ [255, 255, 199, 0],
+ [ 56, 28, 126, 0],
+ [113, 248, 227, 128],
+ [227, 142, 63, 0],
+ [ 0, 28, 112, 0],
+ [ 15, 248, 3, 128],
+ [ 28, 126, 56, 0],
+ [ 56, 255, 241, 128],
+ [240, 7, 224, 0],
+ [227, 129, 192, 128],
+ [255, 255, 254, 0],
+ [126, 0, 224, 0],
+ [ 3, 241, 248, 0],
+ [ 0, 255, 241, 128],
+ [128, 0, 255, 128],
+ [224, 1, 255, 128],
+ [248, 252, 126, 0],
+ [ 0, 7, 3, 128],
+ [224, 113, 248, 0],
+ [ 0, 252, 127, 128],
+ [142, 63, 224, 0],
+ [224, 14, 63, 0],
+ [ 7, 3, 128, 0],
+ [113, 255, 255, 128],
+ [ 28, 113, 199, 0],
+ [ 7, 227, 142, 0],
+ [ 14, 56, 252, 0]])
+
+ arr = arr.T.copy()
+ b = np.packbits(arr, axis=0)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, [[252, 7, 240, 255, 192, 142, 255, 7, 126, 255,
+ 56, 113, 227, 0, 15, 28, 56, 240, 227, 255,
+ 126, 3, 0, 128, 224, 248, 0, 224, 0, 142, 224,
+ 7, 113, 28, 7, 14],
+ [127, 252, 0, 128, 31, 63, 240, 224, 0, 255,
+ 28, 248, 142, 28, 248, 126, 255, 7, 129, 255,
+ 0, 241, 255, 0, 1, 252, 7, 113, 252, 63, 14,
+ 3, 255, 113, 227, 56],
+ [192, 15, 28, 0, 255, 0, 7, 14, 224, 199, 126,
+ 227, 63, 112, 3, 56, 241, 224, 192, 254, 224,
+ 248, 241, 255, 255, 126, 3, 248, 127, 224, 63,
+ 128, 255, 199, 142, 252],
+ [0, 128, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 0,
+ 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 128,
+ 128, 0, 128, 0, 128, 0, 0, 0, 128, 0, 0, 0]])
+
+ b = np.packbits(arr, axis=1)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, [[190, 72, 113, 214, 0],
+ [186, 216, 120, 210, 128],
+ [178, 248, 248, 210, 128],
+ [178, 241, 216, 64, 192],
+ [150, 227, 152, 68, 80],
+ [215, 195, 24, 5, 112],
+ [ 87, 202, 60, 5, 48],
+ [ 83, 90, 52, 1, 160],
+ [ 83, 90, 182, 72, 160],
+ [195, 83, 150, 88, 224],
+ [199, 83, 150, 92, 240],
+ [206, 119, 150, 92, 208],
+ [204, 127, 146, 78, 144],
+ [204, 109, 210, 110, 128],
+ [140, 73, 210, 39, 160],
+ [140, 64, 246, 181, 224],
+ [136, 208, 255, 149, 240],
+ [136, 244, 255, 220, 208],
+ [ 8, 189, 223, 222, 144],
+ [ 40, 45, 151, 218, 144],
+ [105, 41, 21, 218, 176],
+ [107, 104, 17, 202, 240],
+ [ 75, 122, 17, 234, 224],
+ [ 74, 90, 131, 170, 192],
+ [ 88, 18, 163, 168, 128]])
+
+
+    # result is the same if the input is multiplied by a nonzero value
+ for dtype in 'bBhHiIlLqQ':
+ arr = np.array(a, dtype=dtype)
+ rnd = np.random.randint(low=np.iinfo(dtype).min,
+ high=np.iinfo(dtype).max, size=arr.size,
+ dtype=dtype)
+ rnd[rnd == 0] = 1
+ arr *= rnd.astype(dtype)
+ b = np.packbits(arr, axis=-1)
+ assert_array_equal(np.unpackbits(b)[:-4], a)
+
+ assert_raises(TypeError, np.packbits, np.array(a, dtype=float))
+
+
+def test_packbits_very_large():
+    # test with some larger arrays, gh-8637
+    # the code is covered earlier, but a larger array makes a crash on the bug more likely
+ for s in range(950, 1050):
+ for dt in '?bBhHiIlLqQ':
+ x = np.ones((200, s), dtype=bool)
+ np.packbits(x, axis=1)
+
+
+def test_unpackbits():
+ # Copied from the docstring.
+ a = np.array([[2], [7], [23]], dtype=np.uint8)
+ b = np.unpackbits(a, axis=1)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, np.array([[0, 0, 0, 0, 0, 0, 1, 0],
+ [0, 0, 0, 0, 0, 1, 1, 1],
+ [0, 0, 0, 1, 0, 1, 1, 1]]))
+
+
+def test_unpackbits_empty():
+ a = np.empty((0,), dtype=np.uint8)
+ b = np.unpackbits(a)
+ assert_equal(b.dtype, np.uint8)
+ assert_array_equal(b, np.empty((0,)))
+
+
+def test_unpackbits_empty_with_axis():
+ # Lists of packed shapes for different axes and unpacked shapes.
+ shapes = [
+ ([(0,)], (0,)),
+ ([(2, 24, 0), (16, 3, 0), (16, 24, 0)], (16, 24, 0)),
+ ([(2, 0, 24), (16, 0, 24), (16, 0, 3)], (16, 0, 24)),
+ ([(0, 16, 24), (0, 2, 24), (0, 16, 3)], (0, 16, 24)),
+ ([(3, 0, 0), (24, 0, 0), (24, 0, 0)], (24, 0, 0)),
+ ([(0, 24, 0), (0, 3, 0), (0, 24, 0)], (0, 24, 0)),
+ ([(0, 0, 24), (0, 0, 24), (0, 0, 3)], (0, 0, 24)),
+ ([(0, 0, 0), (0, 0, 0), (0, 0, 0)], (0, 0, 0)),
+ ]
+ for in_shapes, out_shape in shapes:
+ for ax, in_shape in enumerate(in_shapes):
+ a = np.empty(in_shape, dtype=np.uint8)
+ b = np.unpackbits(a, axis=ax)
+ assert_equal(b.dtype, np.uint8)
+ assert_equal(b.shape, out_shape)
+
+
+def test_unpackbits_large():
+    # test all possible byte values via comparison to the already tested packbits
+ d = np.arange(277, dtype=np.uint8)
+ assert_array_equal(np.packbits(np.unpackbits(d)), d)
+ assert_array_equal(np.packbits(np.unpackbits(d[::2])), d[::2])
+ d = np.tile(d, (3, 1))
+ assert_array_equal(np.packbits(np.unpackbits(d, axis=1), axis=1), d)
+ d = d.T.copy()
+ assert_array_equal(np.packbits(np.unpackbits(d, axis=0), axis=0), d)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_polynomial.py b/contrib/python/numpy/py2/numpy/lib/tests/test_polynomial.py
new file mode 100644
index 00000000000..89759bd8394
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_polynomial.py
@@ -0,0 +1,261 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_almost_equal,
+ assert_array_almost_equal, assert_raises, assert_allclose
+ )
+
+
+class TestPolynomial(object):
+ def test_poly1d_str_and_repr(self):
+ p = np.poly1d([1., 2, 3])
+ assert_equal(repr(p), 'poly1d([1., 2., 3.])')
+ assert_equal(str(p),
+ ' 2\n'
+ '1 x + 2 x + 3')
+
+ q = np.poly1d([3., 2, 1])
+ assert_equal(repr(q), 'poly1d([3., 2., 1.])')
+ assert_equal(str(q),
+ ' 2\n'
+ '3 x + 2 x + 1')
+
+ r = np.poly1d([1.89999 + 2j, -3j, -5.12345678, 2 + 1j])
+ assert_equal(str(r),
+ ' 3 2\n'
+ '(1.9 + 2j) x - 3j x - 5.123 x + (2 + 1j)')
+
+ assert_equal(str(np.poly1d([-3, -2, -1])),
+ ' 2\n'
+ '-3 x - 2 x - 1')
+
+ def test_poly1d_resolution(self):
+ p = np.poly1d([1., 2, 3])
+ q = np.poly1d([3., 2, 1])
+ assert_equal(p(0), 3.0)
+ assert_equal(p(5), 38.0)
+ assert_equal(q(0), 1.0)
+ assert_equal(q(5), 86.0)
+
+ def test_poly1d_math(self):
+ # here we use some simple coeffs to make calculations easier
+ p = np.poly1d([1., 2, 4])
+ q = np.poly1d([4., 2, 1])
+ assert_equal(p/q, (np.poly1d([0.25]), np.poly1d([1.5, 3.75])))
+ assert_equal(p.integ(), np.poly1d([1/3, 1., 4., 0.]))
+ assert_equal(p.integ(1), np.poly1d([1/3, 1., 4., 0.]))
+
+ p = np.poly1d([1., 2, 3])
+ q = np.poly1d([3., 2, 1])
+ assert_equal(p * q, np.poly1d([3., 8., 14., 8., 3.]))
+ assert_equal(p + q, np.poly1d([4., 4., 4.]))
+ assert_equal(p - q, np.poly1d([-2., 0., 2.]))
+ assert_equal(p ** 4, np.poly1d([1., 8., 36., 104., 214., 312., 324., 216., 81.]))
+ assert_equal(p(q), np.poly1d([9., 12., 16., 8., 6.]))
+ assert_equal(q(p), np.poly1d([3., 12., 32., 40., 34.]))
+ assert_equal(p.deriv(), np.poly1d([2., 2.]))
+ assert_equal(p.deriv(2), np.poly1d([2.]))
+ assert_equal(np.polydiv(np.poly1d([1, 0, -1]), np.poly1d([1, 1])),
+ (np.poly1d([1., -1.]), np.poly1d([0.])))
+
+ def test_poly1d_misc(self):
+ p = np.poly1d([1., 2, 3])
+ assert_equal(np.asarray(p), np.array([1., 2., 3.]))
+ assert_equal(len(p), 2)
+ assert_equal((p[0], p[1], p[2], p[3]), (3.0, 2.0, 1.0, 0))
+
+ def test_poly1d_variable_arg(self):
+ q = np.poly1d([1., 2, 3], variable='y')
+ assert_equal(str(q),
+ ' 2\n'
+ '1 y + 2 y + 3')
+ q = np.poly1d([1., 2, 3], variable='lambda')
+ assert_equal(str(q),
+ ' 2\n'
+ '1 lambda + 2 lambda + 3')
+
+ def test_poly(self):
+ assert_array_almost_equal(np.poly([3, -np.sqrt(2), np.sqrt(2)]),
+ [1, -3, -2, 6])
+
+        # From the MATLAB docs
+ A = [[1, 2, 3], [4, 5, 6], [7, 8, 0]]
+ assert_array_almost_equal(np.poly(A), [1, -6, -72, -27])
+
+ # Should produce real output for perfect conjugates
+ assert_(np.isrealobj(np.poly([+1.082j, +2.613j, -2.613j, -1.082j])))
+ assert_(np.isrealobj(np.poly([0+1j, -0+-1j, 1+2j,
+ 1-2j, 1.+3.5j, 1-3.5j])))
+ assert_(np.isrealobj(np.poly([1j, -1j, 1+2j, 1-2j, 1+3j, 1-3.j])))
+ assert_(np.isrealobj(np.poly([1j, -1j, 1+2j, 1-2j])))
+ assert_(np.isrealobj(np.poly([1j, -1j, 2j, -2j])))
+ assert_(np.isrealobj(np.poly([1j, -1j])))
+ assert_(np.isrealobj(np.poly([1, -1])))
+
+ assert_(np.iscomplexobj(np.poly([1j, -1.0000001j])))
+
+ np.random.seed(42)
+ a = np.random.randn(100) + 1j*np.random.randn(100)
+ assert_(np.isrealobj(np.poly(np.concatenate((a, np.conjugate(a))))))
+
+ def test_roots(self):
+ assert_array_equal(np.roots([1, 0, 0]), [0, 0])
+
+ def test_str_leading_zeros(self):
+ p = np.poly1d([4, 3, 2, 1])
+ p[3] = 0
+ assert_equal(str(p),
+ " 2\n"
+ "3 x + 2 x + 1")
+
+ p = np.poly1d([1, 2])
+ p[0] = 0
+ p[1] = 0
+ assert_equal(str(p), " \n0")
+
+ def test_polyfit(self):
+ c = np.array([3., 2., 1.])
+ x = np.linspace(0, 2, 7)
+ y = np.polyval(c, x)
+ err = [1, -1, 1, -1, 1, -1, 1]
+ weights = np.arange(8, 1, -1)**2/7.0
+
+ # Check exception when too few points for variance estimate. Note that
+ # the estimate requires the number of data points to exceed
+ # degree + 1
+ assert_raises(ValueError, np.polyfit,
+ [1], [1], deg=0, cov=True)
+
+ # check 1D case
+ m, cov = np.polyfit(x, y+err, 2, cov=True)
+ est = [3.8571, 0.2857, 1.619]
+ assert_almost_equal(est, m, decimal=4)
+ val0 = [[ 1.4694, -2.9388, 0.8163],
+ [-2.9388, 6.3673, -2.1224],
+ [ 0.8163, -2.1224, 1.161 ]]
+ assert_almost_equal(val0, cov, decimal=4)
+
+ m2, cov2 = np.polyfit(x, y+err, 2, w=weights, cov=True)
+ assert_almost_equal([4.8927, -1.0177, 1.7768], m2, decimal=4)
+ val = [[ 4.3964, -5.0052, 0.4878],
+ [-5.0052, 6.8067, -0.9089],
+ [ 0.4878, -0.9089, 0.3337]]
+ assert_almost_equal(val, cov2, decimal=4)
+
+ m3, cov3 = np.polyfit(x, y+err, 2, w=weights, cov="unscaled")
+ assert_almost_equal([4.8927, -1.0177, 1.7768], m3, decimal=4)
+ val = [[ 0.1473, -0.1677, 0.0163],
+ [-0.1677, 0.228 , -0.0304],
+ [ 0.0163, -0.0304, 0.0112]]
+ assert_almost_equal(val, cov3, decimal=4)
+
+ # check 2D (n,1) case
+ y = y[:, np.newaxis]
+ c = c[:, np.newaxis]
+ assert_almost_equal(c, np.polyfit(x, y, 2))
+ # check 2D (n,2) case
+ yy = np.concatenate((y, y), axis=1)
+ cc = np.concatenate((c, c), axis=1)
+ assert_almost_equal(cc, np.polyfit(x, yy, 2))
+
+ m, cov = np.polyfit(x, yy + np.array(err)[:, np.newaxis], 2, cov=True)
+ assert_almost_equal(est, m[:, 0], decimal=4)
+ assert_almost_equal(est, m[:, 1], decimal=4)
+ assert_almost_equal(val0, cov[:, :, 0], decimal=4)
+ assert_almost_equal(val0, cov[:, :, 1], decimal=4)
+
+        # check the order 1 (deg=0) case, where the analytic results are simple
+ np.random.seed(123)
+ y = np.random.normal(size=(4, 10000))
+ mean, cov = np.polyfit(np.zeros(y.shape[0]), y, deg=0, cov=True)
+ # Should get sigma_mean = sigma/sqrt(N) = 1./sqrt(4) = 0.5.
+ assert_allclose(mean.std(), 0.5, atol=0.01)
+ assert_allclose(np.sqrt(cov.mean()), 0.5, atol=0.01)
+ # Without scaling, since reduced chi2 is 1, the result should be the same.
+ mean, cov = np.polyfit(np.zeros(y.shape[0]), y, w=np.ones(y.shape[0]),
+ deg=0, cov="unscaled")
+ assert_allclose(mean.std(), 0.5, atol=0.01)
+ assert_almost_equal(np.sqrt(cov.mean()), 0.5)
+ # If we estimate our errors wrong, no change with scaling:
+ w = np.full(y.shape[0], 1./0.5)
+ mean, cov = np.polyfit(np.zeros(y.shape[0]), y, w=w, deg=0, cov=True)
+ assert_allclose(mean.std(), 0.5, atol=0.01)
+ assert_allclose(np.sqrt(cov.mean()), 0.5, atol=0.01)
+ # But if we do not scale, our estimate for the error in the mean will
+ # differ.
+ mean, cov = np.polyfit(np.zeros(y.shape[0]), y, w=w, deg=0, cov="unscaled")
+ assert_allclose(mean.std(), 0.5, atol=0.01)
+ assert_almost_equal(np.sqrt(cov.mean()), 0.25)
+
+ def test_objects(self):
+ from decimal import Decimal
+ p = np.poly1d([Decimal('4.0'), Decimal('3.0'), Decimal('2.0')])
+ p2 = p * Decimal('1.333333333333333')
+ assert_(p2[1] == Decimal("3.9999999999999990"))
+ p2 = p.deriv()
+ assert_(p2[1] == Decimal('8.0'))
+ p2 = p.integ()
+ assert_(p2[3] == Decimal("1.333333333333333333333333333"))
+ assert_(p2[2] == Decimal('1.5'))
+ assert_(np.issubdtype(p2.coeffs.dtype, np.object_))
+ p = np.poly([Decimal(1), Decimal(2)])
+ assert_equal(np.poly([Decimal(1), Decimal(2)]),
+ [1, Decimal(-3), Decimal(2)])
+
+ def test_complex(self):
+ p = np.poly1d([3j, 2j, 1j])
+ p2 = p.integ()
+ assert_((p2.coeffs == [1j, 1j, 1j, 0]).all())
+ p2 = p.deriv()
+ assert_((p2.coeffs == [6j, 2j]).all())
+
+ def test_integ_coeffs(self):
+ p = np.poly1d([3, 2, 1])
+ p2 = p.integ(3, k=[9, 7, 6])
+ assert_(
+ (p2.coeffs == [1/4./5., 1/3./4., 1/2./3., 9/1./2., 7, 6]).all())
+
+ def test_zero_dims(self):
+ try:
+ np.poly(np.zeros((0, 0)))
+ except ValueError:
+ pass
+
+ def test_poly_int_overflow(self):
+ """
+ Regression test for gh-5096.
+ """
+ v = np.arange(1, 21)
+ assert_almost_equal(np.poly(v), np.poly(np.diag(v)))
+
+ def test_poly_eq(self):
+ p = np.poly1d([1, 2, 3])
+ p2 = np.poly1d([1, 2, 4])
+ assert_equal(p == None, False)
+ assert_equal(p != None, True)
+ assert_equal(p == p, True)
+ assert_equal(p == p2, False)
+ assert_equal(p != p2, True)
+
+ def test_polydiv(self):
+ b = np.poly1d([2, 6, 6, 1])
+ a = np.poly1d([-1j, (1+2j), -(2+1j), 1])
+ q, r = np.polydiv(b, a)
+ assert_equal(q.coeffs.dtype, np.complex128)
+ assert_equal(r.coeffs.dtype, np.complex128)
+ assert_equal(q*a + r, b)
+
+ def test_poly_coeffs_mutable(self):
+ """ Coefficients should be modifiable """
+ p = np.poly1d([1, 2, 3])
+
+ p.coeffs += 1
+ assert_equal(p.coeffs, [2, 3, 4])
+
+ p.coeffs[2] += 10
+ assert_equal(p.coeffs, [2, 3, 14])
+
+ # this never used to be allowed - let's not add features to deprecated
+ # APIs
+ assert_raises(AttributeError, setattr, p, 'coeffs', np.array(1))
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_recfunctions.py b/contrib/python/numpy/py2/numpy/lib/tests/test_recfunctions.py
new file mode 100644
index 00000000000..0c839d486fe
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_recfunctions.py
@@ -0,0 +1,980 @@
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+import numpy as np
+import numpy.ma as ma
+from numpy.ma.mrecords import MaskedRecords
+from numpy.ma.testutils import assert_equal
+from numpy.testing import assert_, assert_raises
+from numpy.lib.recfunctions import (
+ drop_fields, rename_fields, get_fieldstructure, recursive_fill_fields,
+ find_duplicates, merge_arrays, append_fields, stack_arrays, join_by,
+ repack_fields, unstructured_to_structured, structured_to_unstructured,
+ apply_along_fields, require_fields, assign_fields_by_name)
+get_fieldspec = np.lib.recfunctions._get_fieldspec
+get_names = np.lib.recfunctions.get_names
+get_names_flat = np.lib.recfunctions.get_names_flat
+zip_descr = np.lib.recfunctions._zip_descr
+zip_dtype = np.lib.recfunctions._zip_dtype
+
+
+class TestRecFunctions(object):
+ # Misc tests
+
+ def setup(self):
+ x = np.array([1, 2, ])
+ y = np.array([10, 20, 30])
+ z = np.array([('A', 1.), ('B', 2.)],
+ dtype=[('A', '|S3'), ('B', float)])
+ w = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
+ dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
+ self.data = (w, x, y, z)
+
+ def test_zip_descr(self):
+ # Test zip_descr
+ (w, x, y, z) = self.data
+
+ # Std array
+ test = zip_descr((x, x), flatten=True)
+ assert_equal(test,
+ np.dtype([('', int), ('', int)]))
+ test = zip_descr((x, x), flatten=False)
+ assert_equal(test,
+ np.dtype([('', int), ('', int)]))
+
+ # Std & flexible-dtype
+ test = zip_descr((x, z), flatten=True)
+ assert_equal(test,
+ np.dtype([('', int), ('A', '|S3'), ('B', float)]))
+ test = zip_descr((x, z), flatten=False)
+ assert_equal(test,
+ np.dtype([('', int),
+ ('', [('A', '|S3'), ('B', float)])]))
+
+ # Standard & nested dtype
+ test = zip_descr((x, w), flatten=True)
+ assert_equal(test,
+ np.dtype([('', int),
+ ('a', int),
+ ('ba', float), ('bb', int)]))
+ test = zip_descr((x, w), flatten=False)
+ assert_equal(test,
+ np.dtype([('', int),
+ ('', [('a', int),
+ ('b', [('ba', float), ('bb', int)])])]))
+
+ def test_drop_fields(self):
+ # Test drop_fields
+ a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
+ dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
+
+ # A basic field
+ test = drop_fields(a, 'a')
+ control = np.array([((2, 3.0),), ((5, 6.0),)],
+ dtype=[('b', [('ba', float), ('bb', int)])])
+ assert_equal(test, control)
+
+ # Another basic field (but nesting two fields)
+ test = drop_fields(a, 'b')
+ control = np.array([(1,), (4,)], dtype=[('a', int)])
+ assert_equal(test, control)
+
+ # A nested sub-field
+ test = drop_fields(a, ['ba', ])
+ control = np.array([(1, (3.0,)), (4, (6.0,))],
+ dtype=[('a', int), ('b', [('bb', int)])])
+ assert_equal(test, control)
+
+ # All the nested sub-field from a field: zap that field
+ test = drop_fields(a, ['ba', 'bb'])
+ control = np.array([(1,), (4,)], dtype=[('a', int)])
+ assert_equal(test, control)
+
+ test = drop_fields(a, ['a', 'b'])
+ assert_(test is None)
+
+ def test_rename_fields(self):
+ # Test rename fields
+ a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
+ dtype=[('a', int),
+ ('b', [('ba', float), ('bb', (float, 2))])])
+ test = rename_fields(a, {'a': 'A', 'bb': 'BB'})
+ newdtype = [('A', int), ('b', [('ba', float), ('BB', (float, 2))])]
+ control = a.view(newdtype)
+ assert_equal(test.dtype, newdtype)
+ assert_equal(test, control)
+
+ def test_get_names(self):
+ # Test get_names
+ ndtype = np.dtype([('A', '|S3'), ('B', float)])
+ test = get_names(ndtype)
+ assert_equal(test, ('A', 'B'))
+
+ ndtype = np.dtype([('a', int), ('b', [('ba', float), ('bb', int)])])
+ test = get_names(ndtype)
+ assert_equal(test, ('a', ('b', ('ba', 'bb'))))
+
+ ndtype = np.dtype([('a', int), ('b', [])])
+ test = get_names(ndtype)
+ assert_equal(test, ('a', ('b', ())))
+
+ ndtype = np.dtype([])
+ test = get_names(ndtype)
+ assert_equal(test, ())
+
+ def test_get_names_flat(self):
+ # Test get_names_flat
+ ndtype = np.dtype([('A', '|S3'), ('B', float)])
+ test = get_names_flat(ndtype)
+ assert_equal(test, ('A', 'B'))
+
+ ndtype = np.dtype([('a', int), ('b', [('ba', float), ('bb', int)])])
+ test = get_names_flat(ndtype)
+ assert_equal(test, ('a', 'b', 'ba', 'bb'))
+
+ ndtype = np.dtype([('a', int), ('b', [])])
+ test = get_names_flat(ndtype)
+ assert_equal(test, ('a', 'b'))
+
+ ndtype = np.dtype([])
+ test = get_names_flat(ndtype)
+ assert_equal(test, ())
+
+ def test_get_fieldstructure(self):
+ # Test get_fieldstructure
+
+ # No nested fields
+ ndtype = np.dtype([('A', '|S3'), ('B', float)])
+ test = get_fieldstructure(ndtype)
+ assert_equal(test, {'A': [], 'B': []})
+
+ # One 1-nested field
+ ndtype = np.dtype([('A', int), ('B', [('BA', float), ('BB', '|S1')])])
+ test = get_fieldstructure(ndtype)
+ assert_equal(test, {'A': [], 'B': [], 'BA': ['B', ], 'BB': ['B']})
+
+ # One 2-nested fields
+ ndtype = np.dtype([('A', int),
+ ('B', [('BA', int),
+ ('BB', [('BBA', int), ('BBB', int)])])])
+ test = get_fieldstructure(ndtype)
+ control = {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'],
+ 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}
+ assert_equal(test, control)
+
+ # 0 fields
+ ndtype = np.dtype([])
+ test = get_fieldstructure(ndtype)
+ assert_equal(test, {})
+
+ def test_find_duplicates(self):
+ # Test find_duplicates
+ a = ma.array([(2, (2., 'B')), (1, (2., 'B')), (2, (2., 'B')),
+ (1, (1., 'B')), (2, (2., 'B')), (2, (2., 'C'))],
+ mask=[(0, (0, 0)), (0, (0, 0)), (0, (0, 0)),
+ (0, (0, 0)), (1, (0, 0)), (0, (1, 0))],
+ dtype=[('A', int), ('B', [('BA', float), ('BB', '|S1')])])
+
+ test = find_duplicates(a, ignoremask=False, return_index=True)
+ control = [0, 2]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ test = find_duplicates(a, key='A', return_index=True)
+ control = [0, 1, 2, 3, 5]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ test = find_duplicates(a, key='B', return_index=True)
+ control = [0, 1, 2, 4]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ test = find_duplicates(a, key='BA', return_index=True)
+ control = [0, 1, 2, 4]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ test = find_duplicates(a, key='BB', return_index=True)
+ control = [0, 1, 2, 3, 4]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ def test_find_duplicates_ignoremask(self):
+ # Test the ignoremask option of find_duplicates
+ ndtype = [('a', int)]
+ a = ma.array([1, 1, 1, 2, 2, 3, 3],
+ mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
+ test = find_duplicates(a, ignoremask=True, return_index=True)
+ control = [0, 1, 3, 4]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ test = find_duplicates(a, ignoremask=False, return_index=True)
+ control = [0, 1, 2, 3, 4, 6]
+ assert_equal(sorted(test[-1]), control)
+ assert_equal(test[0], a[test[-1]])
+
+ def test_repack_fields(self):
+ dt = np.dtype('u1,f4,i8', align=True)
+ a = np.zeros(2, dtype=dt)
+
+ assert_equal(repack_fields(dt), np.dtype('u1,f4,i8'))
+ assert_equal(repack_fields(a).itemsize, 13)
+ assert_equal(repack_fields(repack_fields(dt), align=True), dt)
+
+ # make sure type is preserved
+ dt = np.dtype((np.record, dt))
+ assert_(repack_fields(dt).type is np.record)
+
+ def test_structured_to_unstructured(self):
+ a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
+ out = structured_to_unstructured(a)
+ assert_equal(out, np.zeros((4,5), dtype='f8'))
+
+ b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
+ dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
+ out = np.mean(structured_to_unstructured(b[['x', 'z']]), axis=-1)
+ assert_equal(out, np.array([ 3. , 5.5, 9. , 11. ]))
+ out = np.mean(structured_to_unstructured(b[['x']]), axis=-1)
+ assert_equal(out, np.array([ 1. , 4. , 7. , 10. ]))
+
+ c = np.arange(20).reshape((4,5))
+ out = unstructured_to_structured(c, a.dtype)
+ want = np.array([( 0, ( 1., 2), [ 3., 4.]),
+ ( 5, ( 6., 7), [ 8., 9.]),
+ (10, (11., 12), [13., 14.]),
+ (15, (16., 17), [18., 19.])],
+ dtype=[('a', 'i4'),
+ ('b', [('f0', 'f4'), ('f1', 'u2')]),
+ ('c', 'f4', (2,))])
+ assert_equal(out, want)
+
+ d = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
+ dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
+ assert_equal(apply_along_fields(np.mean, d),
+ np.array([ 8.0/3, 16.0/3, 26.0/3, 11. ]))
+ assert_equal(apply_along_fields(np.mean, d[['x', 'z']]),
+ np.array([ 3. , 5.5, 9. , 11. ]))
+
+ # check that for uniform field dtypes we get a view, not a copy:
+ d = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
+ dtype=[('x', 'i4'), ('y', 'i4'), ('z', 'i4')])
+ dd = structured_to_unstructured(d)
+ ddd = unstructured_to_structured(dd, d.dtype)
+ assert_(dd.base is d)
+ assert_(ddd.base is d)
+
+ # including uniform fields with subarrays unpacked
+ d = np.array([(1, [2, 3], [[ 4, 5], [ 6, 7]]),
+ (8, [9, 10], [[11, 12], [13, 14]])],
+ dtype=[('x0', 'i4'), ('x1', ('i4', 2)),
+ ('x2', ('i4', (2, 2)))])
+ dd = structured_to_unstructured(d)
+ ddd = unstructured_to_structured(dd, d.dtype)
+ assert_(dd.base is d)
+ assert_(ddd.base is d)
+
+ # test that nested fields with identical names don't break anything
+ point = np.dtype([('x', int), ('y', int)])
+ triangle = np.dtype([('a', point), ('b', point), ('c', point)])
+ arr = np.zeros(10, triangle)
+ res = structured_to_unstructured(arr, dtype=int)
+ assert_equal(res, np.zeros((10, 6), dtype=int))
+
+
+ # test nested combinations of subarrays and structured arrays, gh-13333
+ def subarray(dt, shape):
+ return np.dtype((dt, shape))
+
+ def structured(*dts):
+ return np.dtype([('x{}'.format(i), dt) for i, dt in enumerate(dts)])
+
+ def inspect(dt, dtype=None):
+ arr = np.zeros((), dt)
+ ret = structured_to_unstructured(arr, dtype=dtype)
+ backarr = unstructured_to_structured(ret, dt)
+ return ret.shape, ret.dtype, backarr.dtype
+
+ dt = structured(subarray(structured(np.int32, np.int32), 3))
+ assert_equal(inspect(dt), ((6,), np.int32, dt))
+
+ dt = structured(subarray(subarray(np.int32, 2), 2))
+ assert_equal(inspect(dt), ((4,), np.int32, dt))
+
+ dt = structured(np.int32)
+ assert_equal(inspect(dt), ((1,), np.int32, dt))
+
+ dt = structured(np.int32, subarray(subarray(np.int32, 2), 2))
+ assert_equal(inspect(dt), ((5,), np.int32, dt))
+
+ dt = structured()
+ assert_raises(ValueError, structured_to_unstructured, np.zeros(3, dt))
+
+        # these currently don't work, but we may make them work in the future
+ assert_raises(NotImplementedError, structured_to_unstructured,
+ np.zeros(3, dt), dtype=np.int32)
+ assert_raises(NotImplementedError, unstructured_to_structured,
+ np.zeros((3,0), dtype=np.int32))
+
+ def test_field_assignment_by_name(self):
+ a = np.ones(2, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
+ newdt = [('b', 'f4'), ('c', 'u1')]
+
+ assert_equal(require_fields(a, newdt), np.ones(2, newdt))
+
+ b = np.array([(1,2), (3,4)], dtype=newdt)
+ assign_fields_by_name(a, b, zero_unassigned=False)
+ assert_equal(a, np.array([(1,1,2),(1,3,4)], dtype=a.dtype))
+ assign_fields_by_name(a, b)
+ assert_equal(a, np.array([(0,1,2),(0,3,4)], dtype=a.dtype))
+
+ # test nested fields
+ a = np.ones(2, dtype=[('a', [('b', 'f8'), ('c', 'u1')])])
+ newdt = [('a', [('c', 'u1')])]
+ assert_equal(require_fields(a, newdt), np.ones(2, newdt))
+ b = np.array([((2,),), ((3,),)], dtype=newdt)
+ assign_fields_by_name(a, b, zero_unassigned=False)
+ assert_equal(a, np.array([((1,2),), ((1,3),)], dtype=a.dtype))
+ assign_fields_by_name(a, b)
+ assert_equal(a, np.array([((0,2),), ((0,3),)], dtype=a.dtype))
+
+ # test unstructured code path for 0d arrays
+ a, b = np.array(3), np.array(0)
+ assign_fields_by_name(b, a)
+ assert_equal(b[()], 3)
+
+
+class TestRecursiveFillFields(object):
+ # Test recursive_fill_fields.
+ def test_simple_flexible(self):
+ # Test recursive_fill_fields on flexible-array
+ a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)])
+ b = np.zeros((3,), dtype=a.dtype)
+ test = recursive_fill_fields(a, b)
+ control = np.array([(1, 10.), (2, 20.), (0, 0.)],
+ dtype=[('A', int), ('B', float)])
+ assert_equal(test, control)
+
+ def test_masked_flexible(self):
+ # Test recursive_fill_fields on masked flexible-array
+ a = ma.array([(1, 10.), (2, 20.)], mask=[(0, 1), (1, 0)],
+ dtype=[('A', int), ('B', float)])
+ b = ma.zeros((3,), dtype=a.dtype)
+ test = recursive_fill_fields(a, b)
+ control = ma.array([(1, 10.), (2, 20.), (0, 0.)],
+ mask=[(0, 1), (1, 0), (0, 0)],
+ dtype=[('A', int), ('B', float)])
+ assert_equal(test, control)
+
+
+class TestMergeArrays(object):
+ # Test merge_arrays
+
+ def setup(self):
+ x = np.array([1, 2, ])
+ y = np.array([10, 20, 30])
+ z = np.array(
+ [('A', 1.), ('B', 2.)], dtype=[('A', '|S3'), ('B', float)])
+ w = np.array(
+ [(1, (2, 3.0)), (4, (5, 6.0))],
+ dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
+ self.data = (w, x, y, z)
+
+ def test_solo(self):
+ # Test merge_arrays on a single array.
+ (_, x, _, z) = self.data
+
+ test = merge_arrays(x)
+ control = np.array([(1,), (2,)], dtype=[('f0', int)])
+ assert_equal(test, control)
+ test = merge_arrays((x,))
+ assert_equal(test, control)
+
+ test = merge_arrays(z, flatten=False)
+ assert_equal(test, z)
+ test = merge_arrays(z, flatten=True)
+ assert_equal(test, z)
+
+ def test_solo_w_flatten(self):
+ # Test merge_arrays on a single array w & w/o flattening
+ w = self.data[0]
+ test = merge_arrays(w, flatten=False)
+ assert_equal(test, w)
+
+ test = merge_arrays(w, flatten=True)
+ control = np.array([(1, 2, 3.0), (4, 5, 6.0)],
+ dtype=[('a', int), ('ba', float), ('bb', int)])
+ assert_equal(test, control)
+
+ def test_standard(self):
+ # Test standard & standard
+ # Test merge arrays
+ (_, x, y, _) = self.data
+ test = merge_arrays((x, y), usemask=False)
+ control = np.array([(1, 10), (2, 20), (-1, 30)],
+ dtype=[('f0', int), ('f1', int)])
+ assert_equal(test, control)
+
+ test = merge_arrays((x, y), usemask=True)
+ control = ma.array([(1, 10), (2, 20), (-1, 30)],
+ mask=[(0, 0), (0, 0), (1, 0)],
+ dtype=[('f0', int), ('f1', int)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ def test_flatten(self):
+ # Test standard & flexible
+ (_, x, _, z) = self.data
+ test = merge_arrays((x, z), flatten=True)
+ control = np.array([(1, 'A', 1.), (2, 'B', 2.)],
+ dtype=[('f0', int), ('A', '|S3'), ('B', float)])
+ assert_equal(test, control)
+
+ test = merge_arrays((x, z), flatten=False)
+ control = np.array([(1, ('A', 1.)), (2, ('B', 2.))],
+ dtype=[('f0', int),
+ ('f1', [('A', '|S3'), ('B', float)])])
+ assert_equal(test, control)
+
+ def test_flatten_wflexible(self):
+ # Test flatten standard & nested
+ (w, x, _, _) = self.data
+ test = merge_arrays((x, w), flatten=True)
+ control = np.array([(1, 1, 2, 3.0), (2, 4, 5, 6.0)],
+ dtype=[('f0', int),
+ ('a', int), ('ba', float), ('bb', int)])
+ assert_equal(test, control)
+
+ test = merge_arrays((x, w), flatten=False)
+ controldtype = [('f0', int),
+ ('f1', [('a', int),
+ ('b', [('ba', float), ('bb', int)])])]
+ control = np.array([(1., (1, (2, 3.0))), (2, (4, (5, 6.0)))],
+ dtype=controldtype)
+ assert_equal(test, control)
+
+ def test_wmasked_arrays(self):
+ # Test merge_arrays masked arrays
+ (_, x, _, _) = self.data
+ mx = ma.array([1, 2, 3], mask=[1, 0, 0])
+ test = merge_arrays((x, mx), usemask=True)
+ control = ma.array([(1, 1), (2, 2), (-1, 3)],
+ mask=[(0, 1), (0, 0), (1, 0)],
+ dtype=[('f0', int), ('f1', int)])
+ assert_equal(test, control)
+ test = merge_arrays((x, mx), usemask=True, asrecarray=True)
+ assert_equal(test, control)
+ assert_(isinstance(test, MaskedRecords))
+
+ def test_w_singlefield(self):
+ # Test single field
+ test = merge_arrays((np.array([1, 2]).view([('a', int)]),
+ np.array([10., 20., 30.])),)
+ control = ma.array([(1, 10.), (2, 20.), (-1, 30.)],
+ mask=[(0, 0), (0, 0), (1, 0)],
+ dtype=[('a', int), ('f1', float)])
+ assert_equal(test, control)
+
+ def test_w_shorter_flex(self):
+ # Test merge_arrays w/ a shorter flexndarray.
+ z = self.data[-1]
+
+ # Fixme, this test looks incomplete and broken
+ #test = merge_arrays((z, np.array([10, 20, 30]).view([('C', int)])))
+ #control = np.array([('A', 1., 10), ('B', 2., 20), ('-1', -1, 20)],
+ # dtype=[('A', '|S3'), ('B', float), ('C', int)])
+ #assert_equal(test, control)
+
+ # Hack to avoid pyflakes warnings about unused variables
+ merge_arrays((z, np.array([10, 20, 30]).view([('C', int)])))
+ np.array([('A', 1., 10), ('B', 2., 20), ('-1', -1, 20)],
+ dtype=[('A', '|S3'), ('B', float), ('C', int)])
+
+ def test_singlerecord(self):
+ (_, x, y, z) = self.data
+ test = merge_arrays((x[0], y[0], z[0]), usemask=False)
+ control = np.array([(1, 10, ('A', 1))],
+ dtype=[('f0', int),
+ ('f1', int),
+ ('f2', [('A', '|S3'), ('B', float)])])
+ assert_equal(test, control)
+
+
+class TestAppendFields(object):
+ # Test append_fields
+
+ def setup(self):
+ x = np.array([1, 2, ])
+ y = np.array([10, 20, 30])
+ z = np.array(
+ [('A', 1.), ('B', 2.)], dtype=[('A', '|S3'), ('B', float)])
+ w = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
+ dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
+ self.data = (w, x, y, z)
+
+ def test_append_single(self):
+ # Test simple case
+ (_, x, _, _) = self.data
+ test = append_fields(x, 'A', data=[10, 20, 30])
+ control = ma.array([(1, 10), (2, 20), (-1, 30)],
+ mask=[(0, 0), (0, 0), (1, 0)],
+ dtype=[('f0', int), ('A', int)],)
+ assert_equal(test, control)
+
+ def test_append_double(self):
+ # Test simple case
+ (_, x, _, _) = self.data
+ test = append_fields(x, ('A', 'B'), data=[[10, 20, 30], [100, 200]])
+ control = ma.array([(1, 10, 100), (2, 20, 200), (-1, 30, -1)],
+ mask=[(0, 0, 0), (0, 0, 0), (1, 0, 1)],
+ dtype=[('f0', int), ('A', int), ('B', int)],)
+ assert_equal(test, control)
+
+ def test_append_on_flex(self):
+ # Test append_fields on flexible type arrays
+ z = self.data[-1]
+ test = append_fields(z, 'C', data=[10, 20, 30])
+ control = ma.array([('A', 1., 10), ('B', 2., 20), (-1, -1., 30)],
+ mask=[(0, 0, 0), (0, 0, 0), (1, 1, 0)],
+ dtype=[('A', '|S3'), ('B', float), ('C', int)],)
+ assert_equal(test, control)
+
+ def test_append_on_nested(self):
+ # Test append_fields on nested fields
+ w = self.data[0]
+ test = append_fields(w, 'C', data=[10, 20, 30])
+ control = ma.array([(1, (2, 3.0), 10),
+ (4, (5, 6.0), 20),
+ (-1, (-1, -1.), 30)],
+ mask=[(
+ 0, (0, 0), 0), (0, (0, 0), 0), (1, (1, 1), 0)],
+ dtype=[('a', int),
+ ('b', [('ba', float), ('bb', int)]),
+ ('C', int)],)
+ assert_equal(test, control)
+
+
+class TestStackArrays(object):
+ # Test stack_arrays
+ def setup(self):
+ x = np.array([1, 2, ])
+ y = np.array([10, 20, 30])
+ z = np.array(
+ [('A', 1.), ('B', 2.)], dtype=[('A', '|S3'), ('B', float)])
+ w = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
+ dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
+ self.data = (w, x, y, z)
+
+ def test_solo(self):
+ # Test stack_arrays on single arrays
+ (_, x, _, _) = self.data
+ test = stack_arrays((x,))
+ assert_equal(test, x)
+ assert_(test is x)
+
+ test = stack_arrays(x)
+ assert_equal(test, x)
+ assert_(test is x)
+
+ def test_unnamed_fields(self):
+ # Tests combinations of arrays w/o named fields
+ (_, x, y, _) = self.data
+
+ test = stack_arrays((x, x), usemask=False)
+ control = np.array([1, 2, 1, 2])
+ assert_equal(test, control)
+
+ test = stack_arrays((x, y), usemask=False)
+ control = np.array([1, 2, 10, 20, 30])
+ assert_equal(test, control)
+
+ test = stack_arrays((y, x), usemask=False)
+ control = np.array([10, 20, 30, 1, 2])
+ assert_equal(test, control)
+
+ def test_unnamed_and_named_fields(self):
+ # Test combination of arrays w/ & w/o named fields
+ (_, x, _, z) = self.data
+
+ test = stack_arrays((x, z))
+ control = ma.array([(1, -1, -1), (2, -1, -1),
+ (-1, 'A', 1), (-1, 'B', 2)],
+ mask=[(0, 1, 1), (0, 1, 1),
+ (1, 0, 0), (1, 0, 0)],
+ dtype=[('f0', int), ('A', '|S3'), ('B', float)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ test = stack_arrays((z, x))
+ control = ma.array([('A', 1, -1), ('B', 2, -1),
+ (-1, -1, 1), (-1, -1, 2), ],
+ mask=[(0, 0, 1), (0, 0, 1),
+ (1, 1, 0), (1, 1, 0)],
+ dtype=[('A', '|S3'), ('B', float), ('f2', int)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ test = stack_arrays((z, z, x))
+ control = ma.array([('A', 1, -1), ('B', 2, -1),
+ ('A', 1, -1), ('B', 2, -1),
+ (-1, -1, 1), (-1, -1, 2), ],
+ mask=[(0, 0, 1), (0, 0, 1),
+ (0, 0, 1), (0, 0, 1),
+ (1, 1, 0), (1, 1, 0)],
+ dtype=[('A', '|S3'), ('B', float), ('f2', int)])
+ assert_equal(test, control)
+
+ def test_matching_named_fields(self):
+ # Test combination of arrays w/ matching field names
+ (_, x, _, z) = self.data
+ zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
+ dtype=[('A', '|S3'), ('B', float), ('C', float)])
+ test = stack_arrays((z, zz))
+ control = ma.array([('A', 1, -1), ('B', 2, -1),
+ (
+ 'a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
+ dtype=[('A', '|S3'), ('B', float), ('C', float)],
+ mask=[(0, 0, 1), (0, 0, 1),
+ (0, 0, 0), (0, 0, 0), (0, 0, 0)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ test = stack_arrays((z, zz, x))
+ ndtype = [('A', '|S3'), ('B', float), ('C', float), ('f3', int)]
+ control = ma.array([('A', 1, -1, -1), ('B', 2, -1, -1),
+ ('a', 10., 100., -1), ('b', 20., 200., -1),
+ ('c', 30., 300., -1),
+ (-1, -1, -1, 1), (-1, -1, -1, 2)],
+ dtype=ndtype,
+ mask=[(0, 0, 1, 1), (0, 0, 1, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1),
+ (1, 1, 1, 0), (1, 1, 1, 0)])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ def test_defaults(self):
+ # Test defaults: no exception raised if keys of defaults are not fields.
+ (_, _, _, z) = self.data
+ zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
+ dtype=[('A', '|S3'), ('B', float), ('C', float)])
+ defaults = {'A': '???', 'B': -999., 'C': -9999., 'D': -99999.}
+ test = stack_arrays((z, zz), defaults=defaults)
+ control = ma.array([('A', 1, -9999.), ('B', 2, -9999.),
+ (
+ 'a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
+ dtype=[('A', '|S3'), ('B', float), ('C', float)],
+ mask=[(0, 0, 1), (0, 0, 1),
+ (0, 0, 0), (0, 0, 0), (0, 0, 0)])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ def test_autoconversion(self):
+ # Tests autoconversion
+ adtype = [('A', int), ('B', bool), ('C', float)]
+ a = ma.array([(1, 2, 3)], mask=[(0, 1, 0)], dtype=adtype)
+ bdtype = [('A', int), ('B', float), ('C', float)]
+ b = ma.array([(4, 5, 6)], dtype=bdtype)
+ control = ma.array([(1, 2, 3), (4, 5, 6)], mask=[(0, 1, 0), (0, 0, 0)],
+ dtype=bdtype)
+ test = stack_arrays((a, b), autoconvert=True)
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ with assert_raises(TypeError):
+ stack_arrays((a, b), autoconvert=False)
+
+ def test_checktitles(self):
+ # Test using titles in the field names
+ adtype = [(('a', 'A'), int), (('b', 'B'), bool), (('c', 'C'), float)]
+ a = ma.array([(1, 2, 3)], mask=[(0, 1, 0)], dtype=adtype)
+ bdtype = [(('a', 'A'), int), (('b', 'B'), bool), (('c', 'C'), float)]
+ b = ma.array([(4, 5, 6)], dtype=bdtype)
+ test = stack_arrays((a, b))
+ control = ma.array([(1, 2, 3), (4, 5, 6)], mask=[(0, 1, 0), (0, 0, 0)],
+ dtype=bdtype)
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+
+ def test_subdtype(self):
+ z = np.array([
+ ('A', 1), ('B', 2)
+ ], dtype=[('A', '|S3'), ('B', float, (1,))])
+ zz = np.array([
+ ('a', [10.], 100.), ('b', [20.], 200.), ('c', [30.], 300.)
+ ], dtype=[('A', '|S3'), ('B', float, (1,)), ('C', float)])
+
+ res = stack_arrays((z, zz))
+ expected = ma.array(
+ data=[
+ (b'A', [1.0], 0),
+ (b'B', [2.0], 0),
+ (b'a', [10.0], 100.0),
+ (b'b', [20.0], 200.0),
+ (b'c', [30.0], 300.0)],
+ mask=[
+ (False, [False], True),
+ (False, [False], True),
+ (False, [False], False),
+ (False, [False], False),
+ (False, [False], False)
+ ],
+ dtype=zz.dtype
+ )
+ assert_equal(res.dtype, expected.dtype)
+ assert_equal(res, expected)
+ assert_equal(res.mask, expected.mask)
+
+
+class TestJoinBy(object):
+ def setup(self):
+ self.a = np.array(list(zip(np.arange(10), np.arange(50, 60),
+ np.arange(100, 110))),
+ dtype=[('a', int), ('b', int), ('c', int)])
+ self.b = np.array(list(zip(np.arange(5, 15), np.arange(65, 75),
+ np.arange(100, 110))),
+ dtype=[('a', int), ('b', int), ('d', int)])
+
+ def test_inner_join(self):
+ # Basic test of join_by
+ a, b = self.a, self.b
+
+ test = join_by('a', a, b, jointype='inner')
+ control = np.array([(5, 55, 65, 105, 100), (6, 56, 66, 106, 101),
+ (7, 57, 67, 107, 102), (8, 58, 68, 108, 103),
+ (9, 59, 69, 109, 104)],
+ dtype=[('a', int), ('b1', int), ('b2', int),
+ ('c', int), ('d', int)])
+ assert_equal(test, control)
+
+ def test_join(self):
+ a, b = self.a, self.b
+
+ # Fixme, this test is broken
+ #test = join_by(('a', 'b'), a, b)
+ #control = np.array([(5, 55, 105, 100), (6, 56, 106, 101),
+ # (7, 57, 107, 102), (8, 58, 108, 103),
+ # (9, 59, 109, 104)],
+ # dtype=[('a', int), ('b', int),
+ # ('c', int), ('d', int)])
+ #assert_equal(test, control)
+
+ # Hack to avoid pyflakes unused variable warnings
+ join_by(('a', 'b'), a, b)
+ np.array([(5, 55, 105, 100), (6, 56, 106, 101),
+ (7, 57, 107, 102), (8, 58, 108, 103),
+ (9, 59, 109, 104)],
+ dtype=[('a', int), ('b', int),
+ ('c', int), ('d', int)])
+
+ def test_join_subdtype(self):
+ # tests the bug in https://stackoverflow.com/q/44769632/102441
+ from numpy.lib import recfunctions as rfn
+ foo = np.array([(1,)],
+ dtype=[('key', int)])
+ bar = np.array([(1, np.array([1,2,3]))],
+ dtype=[('key', int), ('value', 'uint16', 3)])
+ res = join_by('key', foo, bar)
+ assert_equal(res, bar.view(ma.MaskedArray))
+
+ def test_outer_join(self):
+ a, b = self.a, self.b
+
+ test = join_by(('a', 'b'), a, b, 'outer')
+ control = ma.array([(0, 50, 100, -1), (1, 51, 101, -1),
+ (2, 52, 102, -1), (3, 53, 103, -1),
+ (4, 54, 104, -1), (5, 55, 105, -1),
+ (5, 65, -1, 100), (6, 56, 106, -1),
+ (6, 66, -1, 101), (7, 57, 107, -1),
+ (7, 67, -1, 102), (8, 58, 108, -1),
+ (8, 68, -1, 103), (9, 59, 109, -1),
+ (9, 69, -1, 104), (10, 70, -1, 105),
+ (11, 71, -1, 106), (12, 72, -1, 107),
+ (13, 73, -1, 108), (14, 74, -1, 109)],
+ mask=[(0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 1, 0), (0, 0, 0, 1),
+ (0, 0, 1, 0), (0, 0, 0, 1),
+ (0, 0, 1, 0), (0, 0, 0, 1),
+ (0, 0, 1, 0), (0, 0, 0, 1),
+ (0, 0, 1, 0), (0, 0, 1, 0),
+ (0, 0, 1, 0), (0, 0, 1, 0),
+ (0, 0, 1, 0), (0, 0, 1, 0)],
+ dtype=[('a', int), ('b', int),
+ ('c', int), ('d', int)])
+ assert_equal(test, control)
+
+ def test_leftouter_join(self):
+ a, b = self.a, self.b
+
+ test = join_by(('a', 'b'), a, b, 'leftouter')
+ control = ma.array([(0, 50, 100, -1), (1, 51, 101, -1),
+ (2, 52, 102, -1), (3, 53, 103, -1),
+ (4, 54, 104, -1), (5, 55, 105, -1),
+ (6, 56, 106, -1), (7, 57, 107, -1),
+ (8, 58, 108, -1), (9, 59, 109, -1)],
+ mask=[(0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1),
+ (0, 0, 0, 1), (0, 0, 0, 1)],
+ dtype=[('a', int), ('b', int), ('c', int), ('d', int)])
+ assert_equal(test, control)
+
+ def test_different_field_order(self):
+ # gh-8940
+ a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'f4'), ('c', 'u1')])
+ b = np.ones(3, dtype=[('c', 'u1'), ('b', 'f4'), ('a', 'i4')])
+ # this should not give a FutureWarning:
+ j = join_by(['c', 'b'], a, b, jointype='inner', usemask=False)
+ assert_equal(j.dtype.names, ['b', 'c', 'a1', 'a2'])
+
+ def test_duplicate_keys(self):
+ a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'f4'), ('c', 'u1')])
+ b = np.ones(3, dtype=[('c', 'u1'), ('b', 'f4'), ('a', 'i4')])
+ assert_raises(ValueError, join_by, ['a', 'b', 'b'], a, b)
+
+ @pytest.mark.xfail(reason="See comment at gh-9343")
+ def test_same_name_different_dtypes_key(self):
+ a_dtype = np.dtype([('key', 'S5'), ('value', '<f4')])
+ b_dtype = np.dtype([('key', 'S10'), ('value', '<f4')])
+ expected_dtype = np.dtype([
+ ('key', 'S10'), ('value1', '<f4'), ('value2', '<f4')])
+
+ a = np.array([('Sarah', 8.0), ('John', 6.0)], dtype=a_dtype)
+ b = np.array([('Sarah', 10.0), ('John', 7.0)], dtype=b_dtype)
+ res = join_by('key', a, b)
+
+ assert_equal(res.dtype, expected_dtype)
+
+ def test_same_name_different_dtypes(self):
+ # gh-9338
+ a_dtype = np.dtype([('key', 'S10'), ('value', '<f4')])
+ b_dtype = np.dtype([('key', 'S10'), ('value', '<f8')])
+ expected_dtype = np.dtype([
+ ('key', '|S10'), ('value1', '<f4'), ('value2', '<f8')])
+
+ a = np.array([('Sarah', 8.0), ('John', 6.0)], dtype=a_dtype)
+ b = np.array([('Sarah', 10.0), ('John', 7.0)], dtype=b_dtype)
+ res = join_by('key', a, b)
+
+ assert_equal(res.dtype, expected_dtype)
+
+ def test_subarray_key(self):
+ a_dtype = np.dtype([('pos', int, 3), ('f', '<f4')])
+ a = np.array([([1, 1, 1], np.pi), ([1, 2, 3], 0.0)], dtype=a_dtype)
+
+ b_dtype = np.dtype([('pos', int, 3), ('g', '<f4')])
+ b = np.array([([1, 1, 1], 3), ([3, 2, 1], 0.0)], dtype=b_dtype)
+
+ expected_dtype = np.dtype([('pos', int, 3), ('f', '<f4'), ('g', '<f4')])
+ expected = np.array([([1, 1, 1], np.pi, 3)], dtype=expected_dtype)
+
+ res = join_by('pos', a, b)
+ assert_equal(res.dtype, expected_dtype)
+ assert_equal(res, expected)
+
+ def test_padded_dtype(self):
+ dt = np.dtype('i1,f4', align=True)
+ dt.names = ('k', 'v')
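+        # Illustrative aside (added): with align=True the f4 field is aligned to
+        # offset 4, so dt.descr contains an unnamed 3-byte void padding entry
+        # between 'k' and 'v', giving it length 3.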
+        assert_equal(len(dt.descr), 3)  # padding field is inserted
+
+ a = np.array([(1, 3), (3, 2)], dt)
+ b = np.array([(1, 1), (2, 2)], dt)
+ res = join_by('k', a, b)
+
+ # no padding fields remain
+ expected_dtype = np.dtype([
+ ('k', 'i1'), ('v1', 'f4'), ('v2', 'f4')
+ ])
+
+ assert_equal(res.dtype, expected_dtype)
+
+
+class TestJoinBy2(object):
+ @classmethod
+ def setup(cls):
+ cls.a = np.array(list(zip(np.arange(10), np.arange(50, 60),
+ np.arange(100, 110))),
+ dtype=[('a', int), ('b', int), ('c', int)])
+ cls.b = np.array(list(zip(np.arange(10), np.arange(65, 75),
+ np.arange(100, 110))),
+ dtype=[('a', int), ('b', int), ('d', int)])
+
+ def test_no_r1postfix(self):
+ # Basic test of join_by no_r1postfix
+ a, b = self.a, self.b
+
+ test = join_by(
+ 'a', a, b, r1postfix='', r2postfix='2', jointype='inner')
+ control = np.array([(0, 50, 65, 100, 100), (1, 51, 66, 101, 101),
+ (2, 52, 67, 102, 102), (3, 53, 68, 103, 103),
+ (4, 54, 69, 104, 104), (5, 55, 70, 105, 105),
+ (6, 56, 71, 106, 106), (7, 57, 72, 107, 107),
+ (8, 58, 73, 108, 108), (9, 59, 74, 109, 109)],
+ dtype=[('a', int), ('b', int), ('b2', int),
+ ('c', int), ('d', int)])
+ assert_equal(test, control)
+
+ def test_no_postfix(self):
+ assert_raises(ValueError, join_by, 'a', self.a, self.b,
+ r1postfix='', r2postfix='')
+
+ def test_no_r2postfix(self):
+ # Basic test of join_by no_r2postfix
+ a, b = self.a, self.b
+
+ test = join_by(
+ 'a', a, b, r1postfix='1', r2postfix='', jointype='inner')
+ control = np.array([(0, 50, 65, 100, 100), (1, 51, 66, 101, 101),
+ (2, 52, 67, 102, 102), (3, 53, 68, 103, 103),
+ (4, 54, 69, 104, 104), (5, 55, 70, 105, 105),
+ (6, 56, 71, 106, 106), (7, 57, 72, 107, 107),
+ (8, 58, 73, 108, 108), (9, 59, 74, 109, 109)],
+ dtype=[('a', int), ('b1', int), ('b', int),
+ ('c', int), ('d', int)])
+ assert_equal(test, control)
+
+ def test_two_keys_two_vars(self):
+ a = np.array(list(zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2),
+ np.arange(50, 60), np.arange(10, 20))),
+ dtype=[('k', int), ('a', int), ('b', int), ('c', int)])
+
+ b = np.array(list(zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2),
+ np.arange(65, 75), np.arange(0, 10))),
+ dtype=[('k', int), ('a', int), ('b', int), ('c', int)])
+
+ control = np.array([(10, 0, 50, 65, 10, 0), (11, 0, 51, 66, 11, 1),
+ (10, 1, 52, 67, 12, 2), (11, 1, 53, 68, 13, 3),
+ (10, 2, 54, 69, 14, 4), (11, 2, 55, 70, 15, 5),
+ (10, 3, 56, 71, 16, 6), (11, 3, 57, 72, 17, 7),
+ (10, 4, 58, 73, 18, 8), (11, 4, 59, 74, 19, 9)],
+ dtype=[('k', int), ('a', int), ('b1', int),
+ ('b2', int), ('c1', int), ('c2', int)])
+ test = join_by(
+ ['a', 'k'], a, b, r1postfix='1', r2postfix='2', jointype='inner')
+ assert_equal(test.dtype, control.dtype)
+ assert_equal(test, control)
+
+class TestAppendFieldsObj(object):
+ """
+ Test append_fields with arrays containing objects
+ """
+ # https://github.com/numpy/numpy/issues/2346
+
+ def setup(self):
+ from datetime import date
+ self.data = dict(obj=date(2000, 1, 1))
+
+ def test_append_to_objects(self):
+ "Test append_fields when the base array contains objects"
+ obj = self.data['obj']
+ x = np.array([(obj, 1.), (obj, 2.)],
+ dtype=[('A', object), ('B', float)])
+ y = np.array([10, 20], dtype=int)
+ test = append_fields(x, 'C', data=y, usemask=False)
+ control = np.array([(obj, 1.0, 10), (obj, 2.0, 20)],
+ dtype=[('A', object), ('B', float), ('C', int)])
+ assert_equal(test, control)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_regression.py b/contrib/python/numpy/py2/numpy/lib/tests/test_regression.py
new file mode 100644
index 00000000000..4c46bc46b5a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_regression.py
@@ -0,0 +1,254 @@
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_array_almost_equal,
+ assert_raises, _assert_valid_refcount,
+ )
+from numpy.compat import unicode
+
+
+class TestRegression(object):
+ def test_poly1d(self):
+ # Ticket #28
+ assert_equal(np.poly1d([1]) - np.poly1d([1, 0]),
+ np.poly1d([-1, 1]))
+
+ def test_cov_parameters(self):
+ # Ticket #91
+ x = np.random.random((3, 3))
+ y = x.copy()
+ np.cov(x, rowvar=1)
+ np.cov(y, rowvar=0)
+ assert_array_equal(x, y)
+
+ def test_mem_digitize(self):
+ # Ticket #95
+ for i in range(100):
+ np.digitize([1, 2, 3, 4], [1, 3])
+ np.digitize([0, 1, 2, 3, 4], [1, 3])
+
+ def test_unique_zero_sized(self):
+ # Ticket #205
+ assert_array_equal([], np.unique(np.array([])))
+
+ def test_mem_vectorise(self):
+ # Ticket #325
+ vt = np.vectorize(lambda *args: args)
+ vt(np.zeros((1, 2, 1)), np.zeros((2, 1, 1)), np.zeros((1, 1, 2)))
+ vt(np.zeros((1, 2, 1)), np.zeros((2, 1, 1)), np.zeros((1,
+ 1, 2)), np.zeros((2, 2)))
+
+ def test_mgrid_single_element(self):
+ # Ticket #339
+ assert_array_equal(np.mgrid[0:0:1j], [0])
+ assert_array_equal(np.mgrid[0:0], [])
+
+ def test_refcount_vectorize(self):
+ # Ticket #378
+ def p(x, y):
+ return 123
+ v = np.vectorize(p)
+ _assert_valid_refcount(v)
+
+ def test_poly1d_nan_roots(self):
+ # Ticket #396
+ p = np.poly1d([np.nan, np.nan, 1], r=0)
+ assert_raises(np.linalg.LinAlgError, getattr, p, "r")
+
+ def test_mem_polymul(self):
+ # Ticket #448
+ np.polymul([], [1.])
+
+ def test_mem_string_concat(self):
+ # Ticket #469
+ x = np.array([])
+ np.append(x, 'asdasd\tasdasd')
+
+ def test_poly_div(self):
+ # Ticket #553
+ u = np.poly1d([1, 2, 3])
+ v = np.poly1d([1, 2, 3, 4, 5])
+ q, r = np.polydiv(u, v)
+ assert_equal(q*v + r, u)
+
+ def test_poly_eq(self):
+ # Ticket #554
+ x = np.poly1d([1, 2, 3])
+ y = np.poly1d([3, 4])
+ assert_(x != y)
+ assert_(x == x)
+
+ def test_polyfit_build(self):
+ # Ticket #628
+ ref = [-1.06123820e-06, 5.70886914e-04, -1.13822012e-01,
+ 9.95368241e+00, -3.14526520e+02]
+ x = [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
+ 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+ 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
+ 170, 171, 172, 173, 174, 175, 176]
+ y = [9.0, 3.0, 7.0, 4.0, 4.0, 8.0, 6.0, 11.0, 9.0, 8.0, 11.0, 5.0,
+ 6.0, 5.0, 9.0, 8.0, 6.0, 10.0, 6.0, 10.0, 7.0, 6.0, 6.0, 6.0,
+ 13.0, 4.0, 9.0, 11.0, 4.0, 5.0, 8.0, 5.0, 7.0, 7.0, 6.0, 12.0,
+ 7.0, 7.0, 9.0, 4.0, 12.0, 6.0, 6.0, 4.0, 3.0, 9.0, 8.0, 8.0,
+ 6.0, 7.0, 9.0, 10.0, 6.0, 8.0, 4.0, 7.0, 7.0, 10.0, 8.0, 8.0,
+ 6.0, 3.0, 8.0, 4.0, 5.0, 7.0, 8.0, 6.0, 6.0, 4.0, 12.0, 9.0,
+ 8.0, 8.0, 8.0, 6.0, 7.0, 4.0, 4.0, 5.0, 7.0]
+ tested = np.polyfit(x, y, 4)
+ assert_array_almost_equal(ref, tested)
+
+ def test_polydiv_type(self):
+ # Make polydiv work for complex types
+ msg = "Wrong type, should be complex"
+ x = np.ones(3, dtype=complex)
+ q, r = np.polydiv(x, x)
+ assert_(q.dtype == complex, msg)
+ msg = "Wrong type, should be float"
+ x = np.ones(3, dtype=int)
+ q, r = np.polydiv(x, x)
+ assert_(q.dtype == float, msg)
+
+ def test_histogramdd_too_many_bins(self):
+ # Ticket 928.
+ assert_raises(ValueError, np.histogramdd, np.ones((1, 10)), bins=2**10)
+
+ def test_polyint_type(self):
+ # Ticket #944
+ msg = "Wrong type, should be complex"
+ x = np.ones(3, dtype=complex)
+ assert_(np.polyint(x).dtype == complex, msg)
+ msg = "Wrong type, should be float"
+ x = np.ones(3, dtype=int)
+ assert_(np.polyint(x).dtype == float, msg)
+
+ def test_ndenumerate_crash(self):
+ # Ticket 1140
+ # Shouldn't crash:
+ list(np.ndenumerate(np.array([[]])))
+
+ def test_asfarray_none(self):
+ # Test for changeset r5065
+ assert_array_equal(np.array([np.nan]), np.asfarray([None]))
+
+ def test_large_fancy_indexing(self):
+ # Large enough to fail on 64-bit.
+ nbits = np.dtype(np.intp).itemsize * 8
+ thesize = int((2**nbits)**(1.0/5.0)+1)
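+        # Illustrative aside (added): thesize**5 just exceeds what a signed intp
+        # can address, so building the (thesize,)*5 fancy-indexing result below
+        # should overflow the index computation and raise ValueError.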
+
+ def dp():
+ n = 3
+ a = np.ones((n,)*5)
+ i = np.random.randint(0, n, size=thesize)
+ a[np.ix_(i, i, i, i, i)] = 0
+
+ def dp2():
+ n = 3
+ a = np.ones((n,)*5)
+ i = np.random.randint(0, n, size=thesize)
+ a[np.ix_(i, i, i, i, i)]
+
+ assert_raises(ValueError, dp)
+ assert_raises(ValueError, dp2)
+
+ def test_void_coercion(self):
+ dt = np.dtype([('a', 'f4'), ('b', 'i4')])
+ x = np.zeros((1,), dt)
+ assert_(np.r_[x, x].dtype == dt)
+
+ def test_who_with_0dim_array(self):
+ # ticket #1243
+ import os
+ import sys
+
+ oldstdout = sys.stdout
+ sys.stdout = open(os.devnull, 'w')
+ try:
+ try:
+ np.who({'foo': np.array(1)})
+ except Exception:
+ raise AssertionError("ticket #1243")
+ finally:
+ sys.stdout.close()
+ sys.stdout = oldstdout
+
+ def test_include_dirs(self):
+ # As a sanity check, just test that get_include
+ # includes something reasonable. Somewhat
+ # related to ticket #1405.
+ include_dirs = [np.get_include()]
+ for path in include_dirs:
+ assert_(isinstance(path, (str, unicode)))
+ assert_(path != '')
+
+ def test_polyder_return_type(self):
+ # Ticket #1249
+ assert_(isinstance(np.polyder(np.poly1d([1]), 0), np.poly1d))
+ assert_(isinstance(np.polyder([1], 0), np.ndarray))
+ assert_(isinstance(np.polyder(np.poly1d([1]), 1), np.poly1d))
+ assert_(isinstance(np.polyder([1], 1), np.ndarray))
+
+ def test_append_fields_dtype_list(self):
+ # Ticket #1676
+ from numpy.lib.recfunctions import append_fields
+
+ base = np.array([1, 2, 3], dtype=np.int32)
+ names = ['a', 'b', 'c']
+ data = np.eye(3).astype(np.int32)
+ dlist = [np.float64, np.int32, np.int32]
+ try:
+ append_fields(base, names, data, dlist)
+ except Exception:
+ raise AssertionError()
+
+ def test_loadtxt_fields_subarrays(self):
+ # For ticket #1936
+ if sys.version_info[0] >= 3:
+ from io import StringIO
+ else:
+ from StringIO import StringIO
+
+ dt = [("a", 'u1', 2), ("b", 'u1', 2)]
+ x = np.loadtxt(StringIO("0 1 2 3"), dtype=dt)
+ assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt))
+
+ dt = [("a", [("a", 'u1', (1, 3)), ("b", 'u1')])]
+ x = np.loadtxt(StringIO("0 1 2 3"), dtype=dt)
+ assert_equal(x, np.array([(((0, 1, 2), 3),)], dtype=dt))
+
+ dt = [("a", 'u1', (2, 2))]
+ x = np.loadtxt(StringIO("0 1 2 3"), dtype=dt)
+ assert_equal(x, np.array([(((0, 1), (2, 3)),)], dtype=dt))
+
+ dt = [("a", 'u1', (2, 3, 2))]
+ x = np.loadtxt(StringIO("0 1 2 3 4 5 6 7 8 9 10 11"), dtype=dt)
+ data = [((((0, 1), (2, 3), (4, 5)), ((6, 7), (8, 9), (10, 11))),)]
+ assert_equal(x, np.array(data, dtype=dt))
+
+ def test_nansum_with_boolean(self):
+ # gh-2978
+ a = np.zeros(2, dtype=bool)
+ try:
+ np.nansum(a)
+ except Exception:
+ raise AssertionError()
+
+ def test_py3_compat(self):
+ # gh-2561
+ # Test if the oldstyle class test is bypassed in python3
+ class C():
+ """Old-style class in python2, normal class in python3"""
+ pass
+
+ out = open(os.devnull, 'w')
+ try:
+ np.info(C(), output=out)
+ except AttributeError:
+ raise AssertionError()
+ finally:
+ out.close()
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_shape_base.py b/contrib/python/numpy/py2/numpy/lib/tests/test_shape_base.py
new file mode 100644
index 00000000000..01ea028bbf6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_shape_base.py
@@ -0,0 +1,708 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+import warnings
+import functools
+import sys
+import pytest
+
+from numpy.lib.shape_base import (
+ apply_along_axis, apply_over_axes, array_split, split, hsplit, dsplit,
+ vsplit, dstack, column_stack, kron, tile, expand_dims, take_along_axis,
+ put_along_axis
+ )
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_raises, assert_warns
+ )
+
+
+IS_64BIT = sys.maxsize > 2**32
+
+
+def _add_keepdims(func):
+ """ hack in keepdims behavior into a function taking an axis """
+ @functools.wraps(func)
+ def wrapped(a, axis, **kwargs):
+ res = func(a, axis=axis, **kwargs)
+ if axis is None:
+ axis = 0 # res is now a scalar, so we can insert this anywhere
+ return np.expand_dims(res, axis=axis)
+ return wrapped
+
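+# Illustrative aside (added, not from the upstream file): for a (3, 4) array a,
+# np.argmax(a, axis=1) returns shape (3,), whereas _add_keepdims(np.argmax)(a, axis=1)
+# returns shape (3, 1), which is the index shape expected by take_along_axis and
+# put_along_axis in the tests below.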
+
+class TestTakeAlongAxis(object):
+ def test_argequivalent(self):
+ """ Test it translates from arg<func> to <func> """
+ from numpy.random import rand
+ a = rand(3, 4, 5)
+
+ funcs = [
+ (np.sort, np.argsort, dict()),
+ (_add_keepdims(np.min), _add_keepdims(np.argmin), dict()),
+ (_add_keepdims(np.max), _add_keepdims(np.argmax), dict()),
+ (np.partition, np.argpartition, dict(kth=2)),
+ ]
+
+ for func, argfunc, kwargs in funcs:
+ for axis in list(range(a.ndim)) + [None]:
+ a_func = func(a, axis=axis, **kwargs)
+ ai_func = argfunc(a, axis=axis, **kwargs)
+ assert_equal(a_func, take_along_axis(a, ai_func, axis=axis))
+
+ def test_invalid(self):
+ """ Test it errors when indices has too few dimensions """
+ a = np.ones((10, 10))
+ ai = np.ones((10, 2), dtype=np.intp)
+
+ # sanity check
+ take_along_axis(a, ai, axis=1)
+
+ # not enough indices
+ assert_raises(ValueError, take_along_axis, a, np.array(1), axis=1)
+ # bool arrays not allowed
+ assert_raises(IndexError, take_along_axis, a, ai.astype(bool), axis=1)
+ # float arrays not allowed
+ assert_raises(IndexError, take_along_axis, a, ai.astype(float), axis=1)
+ # invalid axis
+ assert_raises(np.AxisError, take_along_axis, a, ai, axis=10)
+
+ def test_empty(self):
+ """ Test everything is ok with empty results, even with inserted dims """
+ a = np.ones((3, 4, 5))
+ ai = np.ones((3, 0, 5), dtype=np.intp)
+
+ actual = take_along_axis(a, ai, axis=1)
+ assert_equal(actual.shape, ai.shape)
+
+ def test_broadcast(self):
+ """ Test that non-indexing dimensions are broadcast in both directions """
+ a = np.ones((3, 4, 1))
+ ai = np.ones((1, 2, 5), dtype=np.intp)
+ actual = take_along_axis(a, ai, axis=1)
+ assert_equal(actual.shape, (3, 2, 5))
+
+
+class TestPutAlongAxis(object):
+ def test_replace_max(self):
+ a_base = np.array([[10, 30, 20], [60, 40, 50]])
+
+ for axis in list(range(a_base.ndim)) + [None]:
+ # we mutate this in the loop
+ a = a_base.copy()
+
+ # replace the max with a small value
+ i_max = _add_keepdims(np.argmax)(a, axis=axis)
+ put_along_axis(a, i_max, -99, axis=axis)
+
+            # find the new minimum, which should be where the max used to be
+ i_min = _add_keepdims(np.argmin)(a, axis=axis)
+
+ assert_equal(i_min, i_max)
+
+ def test_broadcast(self):
+ """ Test that non-indexing dimensions are broadcast in both directions """
+ a = np.ones((3, 4, 1))
+ ai = np.arange(10, dtype=np.intp).reshape((1, 2, 5)) % 4
+ put_along_axis(a, ai, 20, axis=1)
+ assert_equal(take_along_axis(a, ai, axis=1), 20)
+
+
+class TestApplyAlongAxis(object):
+ def test_simple(self):
+ a = np.ones((20, 10), 'd')
+ assert_array_equal(
+ apply_along_axis(len, 0, a), len(a)*np.ones(a.shape[1]))
+
+ def test_simple101(self):
+ a = np.ones((10, 101), 'd')
+ assert_array_equal(
+ apply_along_axis(len, 0, a), len(a)*np.ones(a.shape[1]))
+
+ def test_3d(self):
+ a = np.arange(27).reshape((3, 3, 3))
+ assert_array_equal(apply_along_axis(np.sum, 0, a),
+ [[27, 30, 33], [36, 39, 42], [45, 48, 51]])
+
+ def test_preserve_subclass(self):
+ def double(row):
+ return row * 2
+
+ class MyNDArray(np.ndarray):
+ pass
+
+ m = np.array([[0, 1], [2, 3]]).view(MyNDArray)
+ expected = np.array([[0, 2], [4, 6]]).view(MyNDArray)
+
+ result = apply_along_axis(double, 0, m)
+ assert_(isinstance(result, MyNDArray))
+ assert_array_equal(result, expected)
+
+ result = apply_along_axis(double, 1, m)
+ assert_(isinstance(result, MyNDArray))
+ assert_array_equal(result, expected)
+
+ def test_subclass(self):
+ class MinimalSubclass(np.ndarray):
+ data = 1
+
+ def minimal_function(array):
+ return array.data
+
+ a = np.zeros((6, 3)).view(MinimalSubclass)
+
+ assert_array_equal(
+ apply_along_axis(minimal_function, 0, a), np.array([1, 1, 1])
+ )
+
+ def test_scalar_array(self, cls=np.ndarray):
+ a = np.ones((6, 3)).view(cls)
+ res = apply_along_axis(np.sum, 0, a)
+ assert_(isinstance(res, cls))
+ assert_array_equal(res, np.array([6, 6, 6]).view(cls))
+
+ def test_0d_array(self, cls=np.ndarray):
+ def sum_to_0d(x):
+ """ Sum x, returning a 0d array of the same class """
+ assert_equal(x.ndim, 1)
+ return np.squeeze(np.sum(x, keepdims=True))
+ a = np.ones((6, 3)).view(cls)
+ res = apply_along_axis(sum_to_0d, 0, a)
+ assert_(isinstance(res, cls))
+ assert_array_equal(res, np.array([6, 6, 6]).view(cls))
+
+ res = apply_along_axis(sum_to_0d, 1, a)
+ assert_(isinstance(res, cls))
+ assert_array_equal(res, np.array([3, 3, 3, 3, 3, 3]).view(cls))
+
+ def test_axis_insertion(self, cls=np.ndarray):
+ def f1to2(x):
+ """produces an asymmetric non-square matrix from x"""
+ assert_equal(x.ndim, 1)
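+            # Illustrative aside (added): for x of length n, x[::-1] has shape
+            # (n,) and x[1:, None] has shape (n-1, 1), so the product broadcasts
+            # to an (n-1, n) matrix, i.e. asymmetric and non-square.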
+ return (x[::-1] * x[1:,None]).view(cls)
+
+ a2d = np.arange(6*3).reshape((6, 3))
+
+ # 2d insertion along first axis
+ actual = apply_along_axis(f1to2, 0, a2d)
+ expected = np.stack([
+ f1to2(a2d[:,i]) for i in range(a2d.shape[1])
+ ], axis=-1).view(cls)
+ assert_equal(type(actual), type(expected))
+ assert_equal(actual, expected)
+
+ # 2d insertion along last axis
+ actual = apply_along_axis(f1to2, 1, a2d)
+ expected = np.stack([
+ f1to2(a2d[i,:]) for i in range(a2d.shape[0])
+ ], axis=0).view(cls)
+ assert_equal(type(actual), type(expected))
+ assert_equal(actual, expected)
+
+ # 3d insertion along middle axis
+ a3d = np.arange(6*5*3).reshape((6, 5, 3))
+
+ actual = apply_along_axis(f1to2, 1, a3d)
+ expected = np.stack([
+ np.stack([
+ f1to2(a3d[i,:,j]) for i in range(a3d.shape[0])
+ ], axis=0)
+ for j in range(a3d.shape[2])
+ ], axis=-1).view(cls)
+ assert_equal(type(actual), type(expected))
+ assert_equal(actual, expected)
+
+ def test_subclass_preservation(self):
+ class MinimalSubclass(np.ndarray):
+ pass
+ self.test_scalar_array(MinimalSubclass)
+ self.test_0d_array(MinimalSubclass)
+ self.test_axis_insertion(MinimalSubclass)
+
+ def test_axis_insertion_ma(self):
+ def f1to2(x):
+ """produces an asymmetric non-square matrix from x"""
+ assert_equal(x.ndim, 1)
+ res = x[::-1] * x[1:,None]
+ return np.ma.masked_where(res%5==0, res)
+ a = np.arange(6*3).reshape((6, 3))
+ res = apply_along_axis(f1to2, 0, a)
+ assert_(isinstance(res, np.ma.masked_array))
+ assert_equal(res.ndim, 3)
+ assert_array_equal(res[:,:,0].mask, f1to2(a[:,0]).mask)
+ assert_array_equal(res[:,:,1].mask, f1to2(a[:,1]).mask)
+ assert_array_equal(res[:,:,2].mask, f1to2(a[:,2]).mask)
+
+ def test_tuple_func1d(self):
+ def sample_1d(x):
+ return x[1], x[0]
+ res = np.apply_along_axis(sample_1d, 1, np.array([[1, 2], [3, 4]]))
+ assert_array_equal(res, np.array([[2, 1], [4, 3]]))
+
+ def test_empty(self):
+ # can't apply_along_axis when there's no chance to call the function
+ def never_call(x):
+ assert_(False) # should never be reached
+
+ a = np.empty((0, 0))
+ assert_raises(ValueError, np.apply_along_axis, never_call, 0, a)
+ assert_raises(ValueError, np.apply_along_axis, never_call, 1, a)
+
+ # but it's sometimes ok with some non-zero dimensions
+ def empty_to_1(x):
+ assert_(len(x) == 0)
+ return 1
+
+ a = np.empty((10, 0))
+ actual = np.apply_along_axis(empty_to_1, 1, a)
+ assert_equal(actual, np.ones(10))
+ assert_raises(ValueError, np.apply_along_axis, empty_to_1, 0, a)
+
+ def test_with_iterable_object(self):
+ # from issue 5248
+ d = np.array([
+ [{1, 11}, {2, 22}, {3, 33}],
+ [{4, 44}, {5, 55}, {6, 66}]
+ ])
+ actual = np.apply_along_axis(lambda a: set.union(*a), 0, d)
+ expected = np.array([{1, 11, 4, 44}, {2, 22, 5, 55}, {3, 33, 6, 66}])
+
+ assert_equal(actual, expected)
+
+ # issue 8642 - assert_equal doesn't detect this!
+ for i in np.ndindex(actual.shape):
+ assert_equal(type(actual[i]), type(expected[i]))
+
+
+class TestApplyOverAxes(object):
+ def test_simple(self):
+ a = np.arange(24).reshape(2, 3, 4)
+ aoa_a = apply_over_axes(np.sum, a, [0, 2])
+ assert_array_equal(aoa_a, np.array([[[60], [92], [124]]]))
+
+
+class TestExpandDims(object):
+ def test_functionality(self):
+ s = (2, 3, 4, 5)
+ a = np.empty(s)
+ for axis in range(-5, 4):
+ b = expand_dims(a, axis)
+ assert_(b.shape[axis] == 1)
+ assert_(np.squeeze(b).shape == s)
+
+ def test_deprecations(self):
+ # 2017-05-17, 1.13.0
+ s = (2, 3, 4, 5)
+ a = np.empty(s)
+ with warnings.catch_warnings():
+ warnings.simplefilter("always")
+ assert_warns(DeprecationWarning, expand_dims, a, -6)
+ assert_warns(DeprecationWarning, expand_dims, a, 5)
+
+ def test_subclasses(self):
+ a = np.arange(10).reshape((2, 5))
+ a = np.ma.array(a, mask=a%3 == 0)
+
+ expanded = np.expand_dims(a, axis=1)
+ assert_(isinstance(expanded, np.ma.MaskedArray))
+ assert_equal(expanded.shape, (2, 1, 5))
+ assert_equal(expanded.mask.shape, (2, 1, 5))
+
+
+class TestArraySplit(object):
+ def test_integer_0_split(self):
+ a = np.arange(10)
+ assert_raises(ValueError, array_split, a, 0)
+
+ def test_integer_split(self):
+ a = np.arange(10)
+ res = array_split(a, 1)
+ desired = [np.arange(10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 2)
+ desired = [np.arange(5), np.arange(5, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 3)
+ desired = [np.arange(4), np.arange(4, 7), np.arange(7, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 4)
+ desired = [np.arange(3), np.arange(3, 6), np.arange(6, 8),
+ np.arange(8, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 5)
+ desired = [np.arange(2), np.arange(2, 4), np.arange(4, 6),
+ np.arange(6, 8), np.arange(8, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 6)
+ desired = [np.arange(2), np.arange(2, 4), np.arange(4, 6),
+ np.arange(6, 8), np.arange(8, 9), np.arange(9, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 7)
+ desired = [np.arange(2), np.arange(2, 4), np.arange(4, 6),
+ np.arange(6, 7), np.arange(7, 8), np.arange(8, 9),
+ np.arange(9, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 8)
+ desired = [np.arange(2), np.arange(2, 4), np.arange(4, 5),
+ np.arange(5, 6), np.arange(6, 7), np.arange(7, 8),
+ np.arange(8, 9), np.arange(9, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 9)
+ desired = [np.arange(2), np.arange(2, 3), np.arange(3, 4),
+ np.arange(4, 5), np.arange(5, 6), np.arange(6, 7),
+ np.arange(7, 8), np.arange(8, 9), np.arange(9, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 10)
+ desired = [np.arange(1), np.arange(1, 2), np.arange(2, 3),
+ np.arange(3, 4), np.arange(4, 5), np.arange(5, 6),
+ np.arange(6, 7), np.arange(7, 8), np.arange(8, 9),
+ np.arange(9, 10)]
+ compare_results(res, desired)
+
+ res = array_split(a, 11)
+ desired = [np.arange(1), np.arange(1, 2), np.arange(2, 3),
+ np.arange(3, 4), np.arange(4, 5), np.arange(5, 6),
+ np.arange(6, 7), np.arange(7, 8), np.arange(8, 9),
+ np.arange(9, 10), np.array([])]
+ compare_results(res, desired)
+
+ def test_integer_split_2D_rows(self):
+ a = np.array([np.arange(10), np.arange(10)])
+ res = array_split(a, 3, axis=0)
+ tgt = [np.array([np.arange(10)]), np.array([np.arange(10)]),
+ np.zeros((0, 10))]
+ compare_results(res, tgt)
+ assert_(a.dtype.type is res[-1].dtype.type)
+
+ # Same thing for manual splits:
+ res = array_split(a, [0, 1, 2], axis=0)
+ tgt = [np.zeros((0, 10)), np.array([np.arange(10)]),
+ np.array([np.arange(10)])]
+ compare_results(res, tgt)
+ assert_(a.dtype.type is res[-1].dtype.type)
+
+ def test_integer_split_2D_cols(self):
+ a = np.array([np.arange(10), np.arange(10)])
+ res = array_split(a, 3, axis=-1)
+ desired = [np.array([np.arange(4), np.arange(4)]),
+ np.array([np.arange(4, 7), np.arange(4, 7)]),
+ np.array([np.arange(7, 10), np.arange(7, 10)])]
+ compare_results(res, desired)
+
+ def test_integer_split_2D_default(self):
+ """ This will fail if we change default axis
+ """
+ a = np.array([np.arange(10), np.arange(10)])
+ res = array_split(a, 3)
+ tgt = [np.array([np.arange(10)]), np.array([np.arange(10)]),
+ np.zeros((0, 10))]
+ compare_results(res, tgt)
+ assert_(a.dtype.type is res[-1].dtype.type)
+ # perhaps should check higher dimensions
+
+ @pytest.mark.skipif(not IS_64BIT, reason="Needs 64bit platform")
+ def test_integer_split_2D_rows_greater_max_int32(self):
+ a = np.broadcast_to([0], (1 << 32, 2))
+ res = array_split(a, 4)
+ chunk = np.broadcast_to([0], (1 << 30, 2))
+ tgt = [chunk] * 4
+ for i in range(len(tgt)):
+ assert_equal(res[i].shape, tgt[i].shape)
+
+ def test_index_split_simple(self):
+ a = np.arange(10)
+ indices = [1, 5, 7]
+ res = array_split(a, indices, axis=-1)
+ desired = [np.arange(0, 1), np.arange(1, 5), np.arange(5, 7),
+ np.arange(7, 10)]
+ compare_results(res, desired)
+
+ def test_index_split_low_bound(self):
+ a = np.arange(10)
+ indices = [0, 5, 7]
+ res = array_split(a, indices, axis=-1)
+ desired = [np.array([]), np.arange(0, 5), np.arange(5, 7),
+ np.arange(7, 10)]
+ compare_results(res, desired)
+
+ def test_index_split_high_bound(self):
+ a = np.arange(10)
+ indices = [0, 5, 7, 10, 12]
+ res = array_split(a, indices, axis=-1)
+ desired = [np.array([]), np.arange(0, 5), np.arange(5, 7),
+ np.arange(7, 10), np.array([]), np.array([])]
+ compare_results(res, desired)
+
+
+class TestSplit(object):
+ # The split function is essentially the same as array_split,
+    # except that it tests whether splitting will result in an
+ # equal split. Only test for this case.
+
+ def test_equal_split(self):
+ a = np.arange(10)
+ res = split(a, 2)
+ desired = [np.arange(5), np.arange(5, 10)]
+ compare_results(res, desired)
+
+ def test_unequal_split(self):
+ a = np.arange(10)
+ assert_raises(ValueError, split, a, 3)
+
+
+class TestColumnStack(object):
+ def test_non_iterable(self):
+ assert_raises(TypeError, column_stack, 1)
+
+ def test_1D_arrays(self):
+ # example from docstring
+ a = np.array((1, 2, 3))
+ b = np.array((2, 3, 4))
+ expected = np.array([[1, 2],
+ [2, 3],
+ [3, 4]])
+ actual = np.column_stack((a, b))
+ assert_equal(actual, expected)
+
+ def test_2D_arrays(self):
+ # same as hstack 2D docstring example
+ a = np.array([[1], [2], [3]])
+ b = np.array([[2], [3], [4]])
+ expected = np.array([[1, 2],
+ [2, 3],
+ [3, 4]])
+ actual = np.column_stack((a, b))
+ assert_equal(actual, expected)
+
+ def test_generator(self):
+ with assert_warns(FutureWarning):
+ column_stack((np.arange(3) for _ in range(2)))
+
+
+class TestDstack(object):
+ def test_non_iterable(self):
+ assert_raises(TypeError, dstack, 1)
+
+ def test_0D_array(self):
+ a = np.array(1)
+ b = np.array(2)
+ res = dstack([a, b])
+ desired = np.array([[[1, 2]]])
+ assert_array_equal(res, desired)
+
+ def test_1D_array(self):
+ a = np.array([1])
+ b = np.array([2])
+ res = dstack([a, b])
+ desired = np.array([[[1, 2]]])
+ assert_array_equal(res, desired)
+
+ def test_2D_array(self):
+ a = np.array([[1], [2]])
+ b = np.array([[1], [2]])
+ res = dstack([a, b])
+        desired = np.array([[[1, 1]], [[2, 2]]])
+ assert_array_equal(res, desired)
+
+ def test_2D_array2(self):
+ a = np.array([1, 2])
+ b = np.array([1, 2])
+ res = dstack([a, b])
+ desired = np.array([[[1, 1], [2, 2]]])
+ assert_array_equal(res, desired)
+
+ def test_generator(self):
+ with assert_warns(FutureWarning):
+ dstack((np.arange(3) for _ in range(2)))
+
+
+# array_split has more comprehensive tests of splitting.
+# Only do simple tests on hsplit, vsplit, and dsplit.
+class TestHsplit(object):
+ """Only testing for integer splits.
+
+ """
+ def test_non_iterable(self):
+ assert_raises(ValueError, hsplit, 1, 1)
+
+ def test_0D_array(self):
+ a = np.array(1)
+ try:
+ hsplit(a, 2)
+ assert_(0)
+ except ValueError:
+ pass
+
+ def test_1D_array(self):
+ a = np.array([1, 2, 3, 4])
+ res = hsplit(a, 2)
+ desired = [np.array([1, 2]), np.array([3, 4])]
+ compare_results(res, desired)
+
+ def test_2D_array(self):
+ a = np.array([[1, 2, 3, 4],
+ [1, 2, 3, 4]])
+ res = hsplit(a, 2)
+ desired = [np.array([[1, 2], [1, 2]]), np.array([[3, 4], [3, 4]])]
+ compare_results(res, desired)
+
+
+class TestVsplit(object):
+ """Only testing for integer splits.
+
+ """
+ def test_non_iterable(self):
+ assert_raises(ValueError, vsplit, 1, 1)
+
+ def test_0D_array(self):
+ a = np.array(1)
+ assert_raises(ValueError, vsplit, a, 2)
+
+ def test_1D_array(self):
+ a = np.array([1, 2, 3, 4])
+ try:
+ vsplit(a, 2)
+ assert_(0)
+ except ValueError:
+ pass
+
+ def test_2D_array(self):
+ a = np.array([[1, 2, 3, 4],
+ [1, 2, 3, 4]])
+ res = vsplit(a, 2)
+ desired = [np.array([[1, 2, 3, 4]]), np.array([[1, 2, 3, 4]])]
+ compare_results(res, desired)
+
+
+class TestDsplit(object):
+ # Only testing for integer splits.
+ def test_non_iterable(self):
+ assert_raises(ValueError, dsplit, 1, 1)
+
+ def test_0D_array(self):
+ a = np.array(1)
+ assert_raises(ValueError, dsplit, a, 2)
+
+ def test_1D_array(self):
+ a = np.array([1, 2, 3, 4])
+ assert_raises(ValueError, dsplit, a, 2)
+
+ def test_2D_array(self):
+ a = np.array([[1, 2, 3, 4],
+ [1, 2, 3, 4]])
+ try:
+ dsplit(a, 2)
+ assert_(0)
+ except ValueError:
+ pass
+
+ def test_3D_array(self):
+ a = np.array([[[1, 2, 3, 4],
+ [1, 2, 3, 4]],
+ [[1, 2, 3, 4],
+ [1, 2, 3, 4]]])
+ res = dsplit(a, 2)
+ desired = [np.array([[[1, 2], [1, 2]], [[1, 2], [1, 2]]]),
+ np.array([[[3, 4], [3, 4]], [[3, 4], [3, 4]]])]
+ compare_results(res, desired)
+
+
+class TestSqueeze(object):
+ def test_basic(self):
+ from numpy.random import rand
+
+ a = rand(20, 10, 10, 1, 1)
+ b = rand(20, 1, 10, 1, 20)
+ c = rand(1, 1, 20, 10)
+ assert_array_equal(np.squeeze(a), np.reshape(a, (20, 10, 10)))
+ assert_array_equal(np.squeeze(b), np.reshape(b, (20, 10, 20)))
+ assert_array_equal(np.squeeze(c), np.reshape(c, (20, 10)))
+
+ # Squeezing to 0-dim should still give an ndarray
+ a = [[[1.5]]]
+ res = np.squeeze(a)
+ assert_equal(res, 1.5)
+ assert_equal(res.ndim, 0)
+ assert_equal(type(res), np.ndarray)
+
+
+class TestKron(object):
+ def test_return_type(self):
+ class myarray(np.ndarray):
+ __array_priority__ = 0.0
+
+ a = np.ones([2, 2])
+ ma = myarray(a.shape, a.dtype, a.data)
+ assert_equal(type(kron(a, a)), np.ndarray)
+ assert_equal(type(kron(ma, ma)), myarray)
+ assert_equal(type(kron(a, ma)), np.ndarray)
+ assert_equal(type(kron(ma, a)), myarray)
+
+
+class TestTile(object):
+ def test_basic(self):
+ a = np.array([0, 1, 2])
+ b = [[1, 2], [3, 4]]
+ assert_equal(tile(a, 2), [0, 1, 2, 0, 1, 2])
+ assert_equal(tile(a, (2, 2)), [[0, 1, 2, 0, 1, 2], [0, 1, 2, 0, 1, 2]])
+ assert_equal(tile(a, (1, 2)), [[0, 1, 2, 0, 1, 2]])
+ assert_equal(tile(b, 2), [[1, 2, 1, 2], [3, 4, 3, 4]])
+ assert_equal(tile(b, (2, 1)), [[1, 2], [3, 4], [1, 2], [3, 4]])
+ assert_equal(tile(b, (2, 2)), [[1, 2, 1, 2], [3, 4, 3, 4],
+ [1, 2, 1, 2], [3, 4, 3, 4]])
+
+ def test_tile_one_repetition_on_array_gh4679(self):
+ a = np.arange(5)
+ b = tile(a, 1)
+ b += 2
+ assert_equal(a, np.arange(5))
+
+ def test_empty(self):
+ a = np.array([[[]]])
+ b = np.array([[], []])
+ c = tile(b, 2).shape
+ d = tile(a, (3, 2, 5)).shape
+ assert_equal(c, (2, 0))
+ assert_equal(d, (3, 2, 0))
+
+ def test_kroncompare(self):
+ from numpy.random import randint
+
+ reps = [(2,), (1, 2), (2, 1), (2, 2), (2, 3, 2), (3, 2)]
+ shape = [(3,), (2, 3), (3, 4, 3), (3, 2, 3), (4, 3, 2, 4), (2, 2)]
+ for s in shape:
+ b = randint(0, 10, size=s)
+ for r in reps:
+ a = np.ones(r, b.dtype)
+ large = tile(b, r)
+ klarge = kron(a, b)
+ assert_equal(large, klarge)
+
+
+class TestMayShareMemory(object):
+ def test_basic(self):
+ d = np.ones((50, 60))
+ d2 = np.ones((30, 60, 6))
+ assert_(np.may_share_memory(d, d))
+ assert_(np.may_share_memory(d, d[::-1]))
+ assert_(np.may_share_memory(d, d[::2]))
+ assert_(np.may_share_memory(d, d[1:, ::-1]))
+
+ assert_(not np.may_share_memory(d[::-1], d2))
+ assert_(not np.may_share_memory(d[::2], d2))
+ assert_(not np.may_share_memory(d[1:, ::-1], d2))
+ assert_(np.may_share_memory(d2[1:, ::-1], d2))
+
+
+# Utility
+def compare_results(res, desired):
+ for i in range(len(desired)):
+ assert_array_equal(res[i], desired[i])
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_stride_tricks.py b/contrib/python/numpy/py2/numpy/lib/tests/test_stride_tricks.py
new file mode 100644
index 00000000000..b2bd7da3ef4
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_stride_tricks.py
@@ -0,0 +1,445 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.core._rational_tests import rational
+from numpy.testing import (
+ assert_equal, assert_array_equal, assert_raises, assert_,
+ assert_raises_regex
+ )
+from numpy.lib.stride_tricks import (
+ as_strided, broadcast_arrays, _broadcast_shape, broadcast_to
+ )
+
+def assert_shapes_correct(input_shapes, expected_shape):
+ # Broadcast a list of arrays with the given input shapes and check the
+ # common output shape.
+
+ inarrays = [np.zeros(s) for s in input_shapes]
+ outarrays = broadcast_arrays(*inarrays)
+ outshapes = [a.shape for a in outarrays]
+ expected = [expected_shape] * len(inarrays)
+ assert_equal(outshapes, expected)
+
+
+def assert_incompatible_shapes_raise(input_shapes):
+ # Broadcast a list of arrays with the given (incompatible) input shapes
+ # and check that they raise a ValueError.
+
+ inarrays = [np.zeros(s) for s in input_shapes]
+ assert_raises(ValueError, broadcast_arrays, *inarrays)
+
+
+def assert_same_as_ufunc(shape0, shape1, transposed=False, flipped=False):
+ # Broadcast two shapes against each other and check that the data layout
+ # is the same as if a ufunc did the broadcasting.
+
+ x0 = np.zeros(shape0, dtype=int)
+ # Note that multiply.reduce's identity element is 1.0, so when shape1==(),
+ # this gives the desired n==1.
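+    # (Illustrative aside, added: np.multiply.reduce(()) evaluates to the
+    # multiply identity 1.0, so an empty shape1 still yields a one-element x1.)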
+ n = int(np.multiply.reduce(shape1))
+ x1 = np.arange(n).reshape(shape1)
+ if transposed:
+ x0 = x0.T
+ x1 = x1.T
+ if flipped:
+ x0 = x0[::-1]
+ x1 = x1[::-1]
+ # Use the add ufunc to do the broadcasting. Since we're adding 0s to x1, the
+ # result should be exactly the same as the broadcasted view of x1.
+ y = x0 + x1
+ b0, b1 = broadcast_arrays(x0, x1)
+ assert_array_equal(y, b1)
+
+
+def test_same():
+ x = np.arange(10)
+ y = np.arange(10)
+ bx, by = broadcast_arrays(x, y)
+ assert_array_equal(x, bx)
+ assert_array_equal(y, by)
+
+def test_broadcast_kwargs():
+ # ensure that a TypeError is appropriately raised when
+ # np.broadcast_arrays() is called with any keyword
+ # argument other than 'subok'
+ x = np.arange(10)
+ y = np.arange(10)
+
+ with assert_raises_regex(TypeError,
+ r'broadcast_arrays\(\) got an unexpected keyword*'):
+ broadcast_arrays(x, y, dtype='float64')
+
+
+def test_one_off():
+ x = np.array([[1, 2, 3]])
+ y = np.array([[1], [2], [3]])
+ bx, by = broadcast_arrays(x, y)
+ bx0 = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
+ by0 = bx0.T
+ assert_array_equal(bx0, bx)
+ assert_array_equal(by0, by)
+
+
+def test_same_input_shapes():
+ # Check that the final shape is just the input shape.
+
+ data = [
+ (),
+ (1,),
+ (3,),
+ (0, 1),
+ (0, 3),
+ (1, 0),
+ (3, 0),
+ (1, 3),
+ (3, 1),
+ (3, 3),
+ ]
+ for shape in data:
+ input_shapes = [shape]
+ # Single input.
+ assert_shapes_correct(input_shapes, shape)
+ # Double input.
+ input_shapes2 = [shape, shape]
+ assert_shapes_correct(input_shapes2, shape)
+ # Triple input.
+ input_shapes3 = [shape, shape, shape]
+ assert_shapes_correct(input_shapes3, shape)
+
+
+def test_two_compatible_by_ones_input_shapes():
+ # Check that two different input shapes of the same length, but some have
+ # ones, broadcast to the correct shape.
+
+ data = [
+ [[(1,), (3,)], (3,)],
+ [[(1, 3), (3, 3)], (3, 3)],
+ [[(3, 1), (3, 3)], (3, 3)],
+ [[(1, 3), (3, 1)], (3, 3)],
+ [[(1, 1), (3, 3)], (3, 3)],
+ [[(1, 1), (1, 3)], (1, 3)],
+ [[(1, 1), (3, 1)], (3, 1)],
+ [[(1, 0), (0, 0)], (0, 0)],
+ [[(0, 1), (0, 0)], (0, 0)],
+ [[(1, 0), (0, 1)], (0, 0)],
+ [[(1, 1), (0, 0)], (0, 0)],
+ [[(1, 1), (1, 0)], (1, 0)],
+ [[(1, 1), (0, 1)], (0, 1)],
+ ]
+ for input_shapes, expected_shape in data:
+ assert_shapes_correct(input_shapes, expected_shape)
+ # Reverse the input shapes since broadcasting should be symmetric.
+ assert_shapes_correct(input_shapes[::-1], expected_shape)
+
+
+def test_two_compatible_by_prepending_ones_input_shapes():
+ # Check that two different input shapes (of different lengths) broadcast
+ # to the correct shape.
+
+ data = [
+ [[(), (3,)], (3,)],
+ [[(3,), (3, 3)], (3, 3)],
+ [[(3,), (3, 1)], (3, 3)],
+ [[(1,), (3, 3)], (3, 3)],
+ [[(), (3, 3)], (3, 3)],
+ [[(1, 1), (3,)], (1, 3)],
+ [[(1,), (3, 1)], (3, 1)],
+ [[(1,), (1, 3)], (1, 3)],
+ [[(), (1, 3)], (1, 3)],
+ [[(), (3, 1)], (3, 1)],
+ [[(), (0,)], (0,)],
+ [[(0,), (0, 0)], (0, 0)],
+ [[(0,), (0, 1)], (0, 0)],
+ [[(1,), (0, 0)], (0, 0)],
+ [[(), (0, 0)], (0, 0)],
+ [[(1, 1), (0,)], (1, 0)],
+ [[(1,), (0, 1)], (0, 1)],
+ [[(1,), (1, 0)], (1, 0)],
+ [[(), (1, 0)], (1, 0)],
+ [[(), (0, 1)], (0, 1)],
+ ]
+ for input_shapes, expected_shape in data:
+ assert_shapes_correct(input_shapes, expected_shape)
+ # Reverse the input shapes since broadcasting should be symmetric.
+ assert_shapes_correct(input_shapes[::-1], expected_shape)
+
+
+def test_incompatible_shapes_raise_valueerror():
+ # Check that a ValueError is raised for incompatible shapes.
+
+ data = [
+ [(3,), (4,)],
+ [(2, 3), (2,)],
+ [(3,), (3,), (4,)],
+ [(1, 3, 4), (2, 3, 3)],
+ ]
+ for input_shapes in data:
+ assert_incompatible_shapes_raise(input_shapes)
+ # Reverse the input shapes since broadcasting should be symmetric.
+ assert_incompatible_shapes_raise(input_shapes[::-1])
+
+
+def test_same_as_ufunc():
+ # Check that the data layout is the same as if a ufunc did the operation.
+
+ data = [
+ [[(1,), (3,)], (3,)],
+ [[(1, 3), (3, 3)], (3, 3)],
+ [[(3, 1), (3, 3)], (3, 3)],
+ [[(1, 3), (3, 1)], (3, 3)],
+ [[(1, 1), (3, 3)], (3, 3)],
+ [[(1, 1), (1, 3)], (1, 3)],
+ [[(1, 1), (3, 1)], (3, 1)],
+ [[(1, 0), (0, 0)], (0, 0)],
+ [[(0, 1), (0, 0)], (0, 0)],
+ [[(1, 0), (0, 1)], (0, 0)],
+ [[(1, 1), (0, 0)], (0, 0)],
+ [[(1, 1), (1, 0)], (1, 0)],
+ [[(1, 1), (0, 1)], (0, 1)],
+ [[(), (3,)], (3,)],
+ [[(3,), (3, 3)], (3, 3)],
+ [[(3,), (3, 1)], (3, 3)],
+ [[(1,), (3, 3)], (3, 3)],
+ [[(), (3, 3)], (3, 3)],
+ [[(1, 1), (3,)], (1, 3)],
+ [[(1,), (3, 1)], (3, 1)],
+ [[(1,), (1, 3)], (1, 3)],
+ [[(), (1, 3)], (1, 3)],
+ [[(), (3, 1)], (3, 1)],
+ [[(), (0,)], (0,)],
+ [[(0,), (0, 0)], (0, 0)],
+ [[(0,), (0, 1)], (0, 0)],
+ [[(1,), (0, 0)], (0, 0)],
+ [[(), (0, 0)], (0, 0)],
+ [[(1, 1), (0,)], (1, 0)],
+ [[(1,), (0, 1)], (0, 1)],
+ [[(1,), (1, 0)], (1, 0)],
+ [[(), (1, 0)], (1, 0)],
+ [[(), (0, 1)], (0, 1)],
+ ]
+ for input_shapes, expected_shape in data:
+        assert_same_as_ufunc(input_shapes[0], input_shapes[1])
+ # Reverse the input shapes since broadcasting should be symmetric.
+ assert_same_as_ufunc(input_shapes[1], input_shapes[0])
+ # Try them transposed, too.
+ assert_same_as_ufunc(input_shapes[0], input_shapes[1], True)
+ # ... and flipped for non-rank-0 inputs in order to test negative
+ # strides.
+ if () not in input_shapes:
+ assert_same_as_ufunc(input_shapes[0], input_shapes[1], False, True)
+ assert_same_as_ufunc(input_shapes[0], input_shapes[1], True, True)
+
+
+def test_broadcast_to_succeeds():
+ data = [
+ [np.array(0), (0,), np.array(0)],
+ [np.array(0), (1,), np.zeros(1)],
+ [np.array(0), (3,), np.zeros(3)],
+ [np.ones(1), (1,), np.ones(1)],
+ [np.ones(1), (2,), np.ones(2)],
+ [np.ones(1), (1, 2, 3), np.ones((1, 2, 3))],
+ [np.arange(3), (3,), np.arange(3)],
+ [np.arange(3), (1, 3), np.arange(3).reshape(1, -1)],
+ [np.arange(3), (2, 3), np.array([[0, 1, 2], [0, 1, 2]])],
+ # test if shape is not a tuple
+ [np.ones(0), 0, np.ones(0)],
+ [np.ones(1), 1, np.ones(1)],
+ [np.ones(1), 2, np.ones(2)],
+ # these cases with size 0 are strange, but they reproduce the behavior
+ # of broadcasting with ufuncs (see test_same_as_ufunc above)
+ [np.ones(1), (0,), np.ones(0)],
+ [np.ones((1, 2)), (0, 2), np.ones((0, 2))],
+ [np.ones((2, 1)), (2, 0), np.ones((2, 0))],
+ ]
+ for input_array, shape, expected in data:
+ actual = broadcast_to(input_array, shape)
+ assert_array_equal(expected, actual)
+
+
+def test_broadcast_to_raises():
+ data = [
+ [(0,), ()],
+ [(1,), ()],
+ [(3,), ()],
+ [(3,), (1,)],
+ [(3,), (2,)],
+ [(3,), (4,)],
+ [(1, 2), (2, 1)],
+ [(1, 1), (1,)],
+ [(1,), -1],
+ [(1,), (-1,)],
+ [(1, 2), (-1, 2)],
+ ]
+ for orig_shape, target_shape in data:
+ arr = np.zeros(orig_shape)
+ assert_raises(ValueError, lambda: broadcast_to(arr, target_shape))
+
+
+def test_broadcast_shape():
+    # broadcast_shape is already exercised indirectly by broadcast_arrays
+ assert_equal(_broadcast_shape(), ())
+ assert_equal(_broadcast_shape([1, 2]), (2,))
+ assert_equal(_broadcast_shape(np.ones((1, 1))), (1, 1))
+ assert_equal(_broadcast_shape(np.ones((1, 1)), np.ones((3, 4))), (3, 4))
+ assert_equal(_broadcast_shape(*([np.ones((1, 2))] * 32)), (1, 2))
+ assert_equal(_broadcast_shape(*([np.ones((1, 2))] * 100)), (1, 2))
+
+ # regression tests for gh-5862
+ assert_equal(_broadcast_shape(*([np.ones(2)] * 32 + [1])), (2,))
+ bad_args = [np.ones(2)] * 32 + [np.ones(3)] * 32
+ assert_raises(ValueError, lambda: _broadcast_shape(*bad_args))
+
+
+def test_as_strided():
+ a = np.array([None])
+ a_view = as_strided(a)
+ expected = np.array([None])
+    assert_array_equal(a_view, expected)
+
+ a = np.array([1, 2, 3, 4])
+ a_view = as_strided(a, shape=(2,), strides=(2 * a.itemsize,))
+ expected = np.array([1, 3])
+ assert_array_equal(a_view, expected)
+
+ a = np.array([1, 2, 3, 4])
+ a_view = as_strided(a, shape=(3, 4), strides=(0, 1 * a.itemsize))
+ expected = np.array([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]])
+ assert_array_equal(a_view, expected)
+
+ # Regression test for gh-5081
+ dt = np.dtype([('num', 'i4'), ('obj', 'O')])
+ a = np.empty((4,), dtype=dt)
+ a['num'] = np.arange(1, 5)
+ a_view = as_strided(a, shape=(3, 4), strides=(0, a.itemsize))
+ expected_num = [[1, 2, 3, 4]] * 3
+ expected_obj = [[None]*4]*3
+ assert_equal(a_view.dtype, dt)
+ assert_array_equal(expected_num, a_view['num'])
+ assert_array_equal(expected_obj, a_view['obj'])
+
+ # Make sure that void types without fields are kept unchanged
+ a = np.empty((4,), dtype='V4')
+ a_view = as_strided(a, shape=(3, 4), strides=(0, a.itemsize))
+ assert_equal(a.dtype, a_view.dtype)
+
+ # Make sure that the only type that could fail is properly handled
+ dt = np.dtype({'names': [''], 'formats': ['V4']})
+ a = np.empty((4,), dtype=dt)
+ a_view = as_strided(a, shape=(3, 4), strides=(0, a.itemsize))
+ assert_equal(a.dtype, a_view.dtype)
+
+ # Custom dtypes should not be lost (gh-9161)
+ r = [rational(i) for i in range(4)]
+ a = np.array(r, dtype=rational)
+ a_view = as_strided(a, shape=(3, 4), strides=(0, a.itemsize))
+ assert_equal(a.dtype, a_view.dtype)
+ assert_array_equal([r] * 3, a_view)
+
+def test_as_strided_writeable():
+ arr = np.ones(10)
+ view = as_strided(arr, writeable=False)
+ assert_(not view.flags.writeable)
+
+ # Check that writeable also is fine:
+ view = as_strided(arr, writeable=True)
+ assert_(view.flags.writeable)
+ view[...] = 3
+ assert_array_equal(arr, np.full_like(arr, 3))
+
+ # Test that things do not break down for readonly:
+ arr.flags.writeable = False
+ view = as_strided(arr, writeable=False)
+ view = as_strided(arr, writeable=True)
+ assert_(not view.flags.writeable)
+
+
+class VerySimpleSubClass(np.ndarray):
+ def __new__(cls, *args, **kwargs):
+ kwargs['subok'] = True
+ return np.array(*args, **kwargs).view(cls)
+
+
+class SimpleSubClass(VerySimpleSubClass):
+ def __new__(cls, *args, **kwargs):
+ kwargs['subok'] = True
+ self = np.array(*args, **kwargs).view(cls)
+ self.info = 'simple'
+ return self
+
+ def __array_finalize__(self, obj):
+ self.info = getattr(obj, 'info', '') + ' finalized'
+
+
+def test_subclasses():
+ # test that subclass is preserved only if subok=True
+ a = VerySimpleSubClass([1, 2, 3, 4])
+ assert_(type(a) is VerySimpleSubClass)
+ a_view = as_strided(a, shape=(2,), strides=(2 * a.itemsize,))
+ assert_(type(a_view) is np.ndarray)
+ a_view = as_strided(a, shape=(2,), strides=(2 * a.itemsize,), subok=True)
+ assert_(type(a_view) is VerySimpleSubClass)
+ # test that if a subclass has __array_finalize__, it is used
+ a = SimpleSubClass([1, 2, 3, 4])
+ a_view = as_strided(a, shape=(2,), strides=(2 * a.itemsize,), subok=True)
+ assert_(type(a_view) is SimpleSubClass)
+ assert_(a_view.info == 'simple finalized')
+
+ # similar tests for broadcast_arrays
+ b = np.arange(len(a)).reshape(-1, 1)
+ a_view, b_view = broadcast_arrays(a, b)
+ assert_(type(a_view) is np.ndarray)
+ assert_(type(b_view) is np.ndarray)
+ assert_(a_view.shape == b_view.shape)
+ a_view, b_view = broadcast_arrays(a, b, subok=True)
+ assert_(type(a_view) is SimpleSubClass)
+ assert_(a_view.info == 'simple finalized')
+ assert_(type(b_view) is np.ndarray)
+ assert_(a_view.shape == b_view.shape)
+
+ # and for broadcast_to
+ shape = (2, 4)
+ a_view = broadcast_to(a, shape)
+ assert_(type(a_view) is np.ndarray)
+ assert_(a_view.shape == shape)
+ a_view = broadcast_to(a, shape, subok=True)
+ assert_(type(a_view) is SimpleSubClass)
+ assert_(a_view.info == 'simple finalized')
+ assert_(a_view.shape == shape)
+
+
+def test_writeable():
+ # broadcast_to should return a readonly array
+ original = np.array([1, 2, 3])
+ result = broadcast_to(original, (2, 3))
+ assert_equal(result.flags.writeable, False)
+ assert_raises(ValueError, result.__setitem__, slice(None), 0)
+
+ # but the result of broadcast_arrays needs to be writeable (for now), to
+ # preserve backwards compatibility
+ for results in [broadcast_arrays(original),
+ broadcast_arrays(0, original)]:
+ for result in results:
+ assert_equal(result.flags.writeable, True)
+ # keep readonly input readonly
+ original.flags.writeable = False
+ _, result = broadcast_arrays(0, original)
+ assert_equal(result.flags.writeable, False)
+
+ # regression test for GH6491
+ shape = (2,)
+ strides = [0]
+ tricky_array = as_strided(np.array(0), shape, strides)
+ other = np.zeros((1,))
+ first, second = broadcast_arrays(tricky_array, other)
+ assert_(first.shape == second.shape)
+
+
+def test_reference_types():
+ input_array = np.array('a', dtype=object)
+ expected = np.array(['a'] * 3, dtype=object)
+ actual = broadcast_to(input_array, (3,))
+ assert_array_equal(expected, actual)
+
+ actual, _ = broadcast_arrays(input_array, np.ones(3))
+ assert_array_equal(expected, actual)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_twodim_base.py b/contrib/python/numpy/py2/numpy/lib/tests/test_twodim_base.py
new file mode 100644
index 00000000000..fe1348d286e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_twodim_base.py
@@ -0,0 +1,534 @@
+"""Test functions for matrix module
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from numpy.testing import (
+ assert_equal, assert_array_equal, assert_array_max_ulp,
+ assert_array_almost_equal, assert_raises, assert_
+ )
+
+from numpy import (
+ arange, add, fliplr, flipud, zeros, ones, eye, array, diag, histogram2d,
+ tri, mask_indices, triu_indices, triu_indices_from, tril_indices,
+ tril_indices_from, vander,
+ )
+
+import numpy as np
+
+
+from numpy.core.tests.test_overrides import requires_array_function
+
+
+def get_mat(n):
+ data = arange(n)
+ data = add.outer(data, data)
+ return data
+
+
+class TestEye(object):
+ def test_basic(self):
+ assert_equal(eye(4),
+ array([[1, 0, 0, 0],
+ [0, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1]]))
+
+ assert_equal(eye(4, dtype='f'),
+ array([[1, 0, 0, 0],
+ [0, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1]], 'f'))
+
+ assert_equal(eye(3) == 1,
+ eye(3, dtype=bool))
+
+ def test_diag(self):
+ assert_equal(eye(4, k=1),
+ array([[0, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1],
+ [0, 0, 0, 0]]))
+
+ assert_equal(eye(4, k=-1),
+ array([[0, 0, 0, 0],
+ [1, 0, 0, 0],
+ [0, 1, 0, 0],
+ [0, 0, 1, 0]]))
+
+ def test_2d(self):
+ assert_equal(eye(4, 3),
+ array([[1, 0, 0],
+ [0, 1, 0],
+ [0, 0, 1],
+ [0, 0, 0]]))
+
+ assert_equal(eye(3, 4),
+ array([[1, 0, 0, 0],
+ [0, 1, 0, 0],
+ [0, 0, 1, 0]]))
+
+ def test_diag2d(self):
+ assert_equal(eye(3, 4, k=2),
+ array([[0, 0, 1, 0],
+ [0, 0, 0, 1],
+ [0, 0, 0, 0]]))
+
+ assert_equal(eye(4, 3, k=-2),
+ array([[0, 0, 0],
+ [0, 0, 0],
+ [1, 0, 0],
+ [0, 1, 0]]))
+
+ def test_eye_bounds(self):
+ assert_equal(eye(2, 2, 1), [[0, 1], [0, 0]])
+ assert_equal(eye(2, 2, -1), [[0, 0], [1, 0]])
+ assert_equal(eye(2, 2, 2), [[0, 0], [0, 0]])
+ assert_equal(eye(2, 2, -2), [[0, 0], [0, 0]])
+ assert_equal(eye(3, 2, 2), [[0, 0], [0, 0], [0, 0]])
+ assert_equal(eye(3, 2, 1), [[0, 1], [0, 0], [0, 0]])
+ assert_equal(eye(3, 2, -1), [[0, 0], [1, 0], [0, 1]])
+ assert_equal(eye(3, 2, -2), [[0, 0], [0, 0], [1, 0]])
+ assert_equal(eye(3, 2, -3), [[0, 0], [0, 0], [0, 0]])
+
+ def test_strings(self):
+ assert_equal(eye(2, 2, dtype='S3'),
+ [[b'1', b''], [b'', b'1']])
+
+ def test_bool(self):
+ assert_equal(eye(2, 2, dtype=bool), [[True, False], [False, True]])
+
+ def test_order(self):
+ mat_c = eye(4, 3, k=-1)
+ mat_f = eye(4, 3, k=-1, order='F')
+ assert_equal(mat_c, mat_f)
+ assert mat_c.flags.c_contiguous
+ assert not mat_c.flags.f_contiguous
+ assert not mat_f.flags.c_contiguous
+ assert mat_f.flags.f_contiguous
+
+
+class TestDiag(object):
+ def test_vector(self):
+ vals = (100 * arange(5)).astype('l')
+ b = zeros((5, 5))
+ for k in range(5):
+ b[k, k] = vals[k]
+ assert_equal(diag(vals), b)
+ b = zeros((7, 7))
+ c = b.copy()
+ for k in range(5):
+ b[k, k + 2] = vals[k]
+ c[k + 2, k] = vals[k]
+ assert_equal(diag(vals, k=2), b)
+ assert_equal(diag(vals, k=-2), c)
+
+ def test_matrix(self, vals=None):
+ if vals is None:
+ vals = (100 * get_mat(5) + 1).astype('l')
+ b = zeros((5,))
+ for k in range(5):
+ b[k] = vals[k, k]
+ assert_equal(diag(vals), b)
+ b = b * 0
+ for k in range(3):
+ b[k] = vals[k, k + 2]
+ assert_equal(diag(vals, 2), b[:3])
+ for k in range(3):
+ b[k] = vals[k + 2, k]
+ assert_equal(diag(vals, -2), b[:3])
+
+ def test_fortran_order(self):
+ vals = array((100 * get_mat(5) + 1), order='F', dtype='l')
+ self.test_matrix(vals)
+
+ def test_diag_bounds(self):
+ A = [[1, 2], [3, 4], [5, 6]]
+ assert_equal(diag(A, k=2), [])
+ assert_equal(diag(A, k=1), [2])
+ assert_equal(diag(A, k=0), [1, 4])
+ assert_equal(diag(A, k=-1), [3, 6])
+ assert_equal(diag(A, k=-2), [5])
+ assert_equal(diag(A, k=-3), [])
+
+ def test_failure(self):
+ assert_raises(ValueError, diag, [[[1]]])
+
+
+class TestFliplr(object):
+ def test_basic(self):
+ assert_raises(ValueError, fliplr, ones(4))
+ a = get_mat(4)
+ b = a[:, ::-1]
+ assert_equal(fliplr(a), b)
+ a = [[0, 1, 2],
+ [3, 4, 5]]
+ b = [[2, 1, 0],
+ [5, 4, 3]]
+ assert_equal(fliplr(a), b)
+
+
+class TestFlipud(object):
+ def test_basic(self):
+ a = get_mat(4)
+ b = a[::-1, :]
+ assert_equal(flipud(a), b)
+ a = [[0, 1, 2],
+ [3, 4, 5]]
+ b = [[3, 4, 5],
+ [0, 1, 2]]
+ assert_equal(flipud(a), b)
+
+
+class TestHistogram2d(object):
+ def test_simple(self):
+ x = array(
+ [0.41702200, 0.72032449, 1.1437481e-4, 0.302332573, 0.146755891])
+ y = array(
+ [0.09233859, 0.18626021, 0.34556073, 0.39676747, 0.53881673])
+ xedges = np.linspace(0, 1, 10)
+ yedges = np.linspace(0, 1, 10)
+ H = histogram2d(x, y, (xedges, yedges))[0]
+ answer = array(
+ [[0, 0, 0, 1, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 1, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [1, 0, 1, 0, 0, 0, 0, 0, 0],
+ [0, 1, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0]])
+ assert_array_equal(H.T, answer)
+ H = histogram2d(x, y, xedges)[0]
+ assert_array_equal(H.T, answer)
+ H, xedges, yedges = histogram2d(list(range(10)), list(range(10)))
+ assert_array_equal(H, eye(10, 10))
+ assert_array_equal(xedges, np.linspace(0, 9, 11))
+ assert_array_equal(yedges, np.linspace(0, 9, 11))
+
+ def test_asym(self):
+ x = array([1, 1, 2, 3, 4, 4, 4, 5])
+ y = array([1, 3, 2, 0, 1, 2, 3, 4])
+ H, xed, yed = histogram2d(
+ x, y, (6, 5), range=[[0, 6], [0, 5]], density=True)
+ answer = array(
+ [[0., 0, 0, 0, 0],
+ [0, 1, 0, 1, 0],
+ [0, 0, 1, 0, 0],
+ [1, 0, 0, 0, 0],
+ [0, 1, 1, 1, 0],
+ [0, 0, 0, 0, 1]])
+ assert_array_almost_equal(H, answer/8., 3)
+ assert_array_equal(xed, np.linspace(0, 6, 7))
+ assert_array_equal(yed, np.linspace(0, 5, 6))
+
+ def test_density(self):
+ x = array([1, 2, 3, 1, 2, 3, 1, 2, 3])
+ y = array([1, 1, 1, 2, 2, 2, 3, 3, 3])
+ H, xed, yed = histogram2d(
+ x, y, [[1, 2, 3, 5], [1, 2, 3, 5]], density=True)
+ answer = array([[1, 1, .5],
+ [1, 1, .5],
+ [.5, .5, .25]])/9.
+ assert_array_almost_equal(H, answer, 3)
+
+ def test_all_outliers(self):
+ r = np.random.rand(100) + 1. + 1e6 # histogramdd rounds by decimal=6
+ H, xed, yed = histogram2d(r, r, (4, 5), range=([0, 1], [0, 1]))
+ assert_array_equal(H, 0)
+
+ def test_empty(self):
+ a, edge1, edge2 = histogram2d([], [], bins=([0, 1], [0, 1]))
+ assert_array_max_ulp(a, array([[0.]]))
+
+ a, edge1, edge2 = histogram2d([], [], bins=4)
+ assert_array_max_ulp(a, np.zeros((4, 4)))
+
+ def test_binparameter_combination(self):
+ x = array(
+ [0, 0.09207008, 0.64575234, 0.12875982, 0.47390599,
+ 0.59944483, 1])
+ y = array(
+ [0, 0.14344267, 0.48988575, 0.30558665, 0.44700682,
+ 0.15886423, 1])
+ edges = (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
+ H, xe, ye = histogram2d(x, y, (edges, 4))
+ answer = array(
+ [[2., 0., 0., 0.],
+ [0., 1., 0., 0.],
+ [0., 0., 0., 0.],
+ [0., 0., 0., 0.],
+ [0., 1., 0., 0.],
+ [1., 0., 0., 0.],
+ [0., 1., 0., 0.],
+ [0., 0., 0., 0.],
+ [0., 0., 0., 0.],
+ [0., 0., 0., 1.]])
+ assert_array_equal(H, answer)
+ assert_array_equal(ye, array([0., 0.25, 0.5, 0.75, 1]))
+ H, xe, ye = histogram2d(x, y, (4, edges))
+ answer = array(
+ [[1., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
+ [0., 1., 0., 0., 1., 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])
+ assert_array_equal(H, answer)
+ assert_array_equal(xe, array([0., 0.25, 0.5, 0.75, 1]))
+
+ @requires_array_function
+ def test_dispatch(self):
+ class ShouldDispatch(object):
+ def __array_function__(self, function, types, args, kwargs):
+ return types, args, kwargs
+
+ xy = [1, 2]
+ s_d = ShouldDispatch()
+ r = histogram2d(s_d, xy)
+ # Cannot use assert_equal since that dispatches...
+ assert_(r == ((ShouldDispatch,), (s_d, xy), {}))
+ r = histogram2d(xy, s_d)
+ assert_(r == ((ShouldDispatch,), (xy, s_d), {}))
+ r = histogram2d(xy, xy, bins=s_d)
+        assert_(r == ((ShouldDispatch,), (xy, xy), dict(bins=s_d)))
+ r = histogram2d(xy, xy, bins=[s_d, 5])
+        assert_(r == ((ShouldDispatch,), (xy, xy), dict(bins=[s_d, 5])))
+ assert_raises(Exception, histogram2d, xy, xy, bins=[s_d])
+ r = histogram2d(xy, xy, weights=s_d)
+        assert_(r == ((ShouldDispatch,), (xy, xy), dict(weights=s_d)))
+
+
+class TestTri(object):
+ def test_dtype(self):
+ out = array([[1, 0, 0],
+ [1, 1, 0],
+ [1, 1, 1]])
+ assert_array_equal(tri(3), out)
+ assert_array_equal(tri(3, dtype=bool), out.astype(bool))
+
+
+def test_tril_triu_ndim2():
+ for dtype in np.typecodes['AllFloat'] + np.typecodes['AllInteger']:
+ a = np.ones((2, 2), dtype=dtype)
+ b = np.tril(a)
+ c = np.triu(a)
+ assert_array_equal(b, [[1, 0], [1, 1]])
+ assert_array_equal(c, b.T)
+ # should return the same dtype as the original array
+ assert_equal(b.dtype, a.dtype)
+ assert_equal(c.dtype, a.dtype)
+
+
+def test_tril_triu_ndim3():
+ for dtype in np.typecodes['AllFloat'] + np.typecodes['AllInteger']:
+ a = np.array([
+ [[1, 1], [1, 1]],
+ [[1, 1], [1, 0]],
+ [[1, 1], [0, 0]],
+ ], dtype=dtype)
+ a_tril_desired = np.array([
+ [[1, 0], [1, 1]],
+ [[1, 0], [1, 0]],
+ [[1, 0], [0, 0]],
+ ], dtype=dtype)
+ a_triu_desired = np.array([
+ [[1, 1], [0, 1]],
+ [[1, 1], [0, 0]],
+ [[1, 1], [0, 0]],
+ ], dtype=dtype)
+ a_triu_observed = np.triu(a)
+ a_tril_observed = np.tril(a)
+ assert_array_equal(a_triu_observed, a_triu_desired)
+ assert_array_equal(a_tril_observed, a_tril_desired)
+ assert_equal(a_triu_observed.dtype, a.dtype)
+ assert_equal(a_tril_observed.dtype, a.dtype)
+
+
+def test_tril_triu_with_inf():
+ # Issue 4859
+ arr = np.array([[1, 1, np.inf],
+ [1, 1, 1],
+ [np.inf, 1, 1]])
+ out_tril = np.array([[1, 0, 0],
+ [1, 1, 0],
+ [np.inf, 1, 1]])
+ out_triu = out_tril.T
+ assert_array_equal(np.triu(arr), out_triu)
+ assert_array_equal(np.tril(arr), out_tril)
+
+
+def test_tril_triu_dtype():
+ # Issue 4916
+ # tril and triu should return the same dtype as input
+ for c in np.typecodes['All']:
+ if c == 'V':
+ continue
+ arr = np.zeros((3, 3), dtype=c)
+ assert_equal(np.triu(arr).dtype, arr.dtype)
+ assert_equal(np.tril(arr).dtype, arr.dtype)
+
+ # check special cases
+ arr = np.array([['2001-01-01T12:00', '2002-02-03T13:56'],
+ ['2004-01-01T12:00', '2003-01-03T13:45']],
+ dtype='datetime64')
+ assert_equal(np.triu(arr).dtype, arr.dtype)
+ assert_equal(np.tril(arr).dtype, arr.dtype)
+
+ arr = np.zeros((3,3), dtype='f4,f4')
+ assert_equal(np.triu(arr).dtype, arr.dtype)
+ assert_equal(np.tril(arr).dtype, arr.dtype)
+
+
+def test_mask_indices():
+ # simple test without offset
+ iu = mask_indices(3, np.triu)
+ a = np.arange(9).reshape(3, 3)
+ assert_array_equal(a[iu], array([0, 1, 2, 4, 5, 8]))
+ # Now with an offset
+ iu1 = mask_indices(3, np.triu, 1)
+ assert_array_equal(a[iu1], array([1, 2, 5]))
+
+
+def test_tril_indices():
+ # indices without and with offset
+ il1 = tril_indices(4)
+ il2 = tril_indices(4, k=2)
+ il3 = tril_indices(4, m=5)
+ il4 = tril_indices(4, k=2, m=5)
+
+ a = np.array([[1, 2, 3, 4],
+ [5, 6, 7, 8],
+ [9, 10, 11, 12],
+ [13, 14, 15, 16]])
+ b = np.arange(1, 21).reshape(4, 5)
+
+ # indexing:
+ assert_array_equal(a[il1],
+ array([1, 5, 6, 9, 10, 11, 13, 14, 15, 16]))
+ assert_array_equal(b[il3],
+ array([1, 6, 7, 11, 12, 13, 16, 17, 18, 19]))
+
+ # And for assigning values:
+ a[il1] = -1
+ assert_array_equal(a,
+ array([[-1, 2, 3, 4],
+ [-1, -1, 7, 8],
+ [-1, -1, -1, 12],
+ [-1, -1, -1, -1]]))
+ b[il3] = -1
+ assert_array_equal(b,
+ array([[-1, 2, 3, 4, 5],
+ [-1, -1, 8, 9, 10],
+ [-1, -1, -1, 14, 15],
+ [-1, -1, -1, -1, 20]]))
+ # These cover almost the whole array (two diagonals right of the main one):
+ a[il2] = -10
+ assert_array_equal(a,
+ array([[-10, -10, -10, 4],
+ [-10, -10, -10, -10],
+ [-10, -10, -10, -10],
+ [-10, -10, -10, -10]]))
+ b[il4] = -10
+ assert_array_equal(b,
+ array([[-10, -10, -10, 4, 5],
+ [-10, -10, -10, -10, 10],
+ [-10, -10, -10, -10, -10],
+ [-10, -10, -10, -10, -10]]))
+
+
+class TestTriuIndices(object):
+ def test_triu_indices(self):
+ iu1 = triu_indices(4)
+ iu2 = triu_indices(4, k=2)
+ iu3 = triu_indices(4, m=5)
+ iu4 = triu_indices(4, k=2, m=5)
+
+ a = np.array([[1, 2, 3, 4],
+ [5, 6, 7, 8],
+ [9, 10, 11, 12],
+ [13, 14, 15, 16]])
+ b = np.arange(1, 21).reshape(4, 5)
+
+ # Both for indexing:
+ assert_array_equal(a[iu1],
+ array([1, 2, 3, 4, 6, 7, 8, 11, 12, 16]))
+ assert_array_equal(b[iu3],
+ array([1, 2, 3, 4, 5, 7, 8, 9,
+ 10, 13, 14, 15, 19, 20]))
+
+ # And for assigning values:
+ a[iu1] = -1
+ assert_array_equal(a,
+ array([[-1, -1, -1, -1],
+ [5, -1, -1, -1],
+ [9, 10, -1, -1],
+ [13, 14, 15, -1]]))
+ b[iu3] = -1
+ assert_array_equal(b,
+ array([[-1, -1, -1, -1, -1],
+ [6, -1, -1, -1, -1],
+ [11, 12, -1, -1, -1],
+ [16, 17, 18, -1, -1]]))
+
+ # These cover almost the whole array (two diagonals right of the
+ # main one):
+ a[iu2] = -10
+ assert_array_equal(a,
+ array([[-1, -1, -10, -10],
+ [5, -1, -1, -10],
+ [9, 10, -1, -1],
+ [13, 14, 15, -1]]))
+ b[iu4] = -10
+ assert_array_equal(b,
+ array([[-1, -1, -10, -10, -10],
+ [6, -1, -1, -10, -10],
+ [11, 12, -1, -1, -10],
+ [16, 17, 18, -1, -1]]))
+
+
+class TestTrilIndicesFrom(object):
+ def test_exceptions(self):
+ assert_raises(ValueError, tril_indices_from, np.ones((2,)))
+ assert_raises(ValueError, tril_indices_from, np.ones((2, 2, 2)))
+ # assert_raises(ValueError, tril_indices_from, np.ones((2, 3)))
+
+
+class TestTriuIndicesFrom(object):
+ def test_exceptions(self):
+ assert_raises(ValueError, triu_indices_from, np.ones((2,)))
+ assert_raises(ValueError, triu_indices_from, np.ones((2, 2, 2)))
+ # assert_raises(ValueError, triu_indices_from, np.ones((2, 3)))
+
+
+class TestVander(object):
+ def test_basic(self):
+ c = np.array([0, 1, -2, 3])
+ v = vander(c)
+ powers = np.array([[0, 0, 0, 0, 1],
+ [1, 1, 1, 1, 1],
+ [16, -8, 4, -2, 1],
+ [81, 27, 9, 3, 1]])
+ # Check default value of N:
+ assert_array_equal(v, powers[:, 1:])
+ # Check a range of N values, including 0 and 5 (greater than default)
+ m = powers.shape[1]
+ for n in range(6):
+ v = vander(c, N=n)
+ assert_array_equal(v, powers[:, m-n:m])
+
+ def test_dtypes(self):
+ c = array([11, -12, 13], dtype=np.int8)
+ v = vander(c)
+ expected = np.array([[121, 11, 1],
+ [144, -12, 1],
+ [169, 13, 1]])
+ assert_array_equal(v, expected)
+
+ c = array([1.0+1j, 1.0-1j])
+ v = vander(c, N=3)
+ expected = np.array([[2j, 1+1j, 1],
+ [-2j, 1-1j, 1]])
+ # The data is floating point, but the values are small integers,
+ # so assert_array_equal *should* be safe here (rather than, say,
+ # assert_array_almost_equal).
+ assert_array_equal(v, expected)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_type_check.py b/contrib/python/numpy/py2/numpy/lib/tests/test_type_check.py
new file mode 100644
index 00000000000..2982ca31a3c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_type_check.py
@@ -0,0 +1,442 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.compat import long
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_raises
+ )
+from numpy.lib.type_check import (
+ common_type, mintypecode, isreal, iscomplex, isposinf, isneginf,
+ nan_to_num, isrealobj, iscomplexobj, asfarray, real_if_close
+ )
+
+
+def assert_all(x):
+ assert_(np.all(x), x)
+
+
+class TestCommonType(object):
+ def test_basic(self):
+ ai32 = np.array([[1, 2], [3, 4]], dtype=np.int32)
+ af16 = np.array([[1, 2], [3, 4]], dtype=np.float16)
+ af32 = np.array([[1, 2], [3, 4]], dtype=np.float32)
+ af64 = np.array([[1, 2], [3, 4]], dtype=np.float64)
+ acs = np.array([[1+5j, 2+6j], [3+7j, 4+8j]], dtype=np.csingle)
+ acd = np.array([[1+5j, 2+6j], [3+7j, 4+8j]], dtype=np.cdouble)
+ assert_(common_type(ai32) == np.float64)
+ assert_(common_type(af16) == np.float16)
+ assert_(common_type(af32) == np.float32)
+ assert_(common_type(af64) == np.float64)
+ assert_(common_type(acs) == np.csingle)
+ assert_(common_type(acd) == np.cdouble)
+
+
+class TestMintypecode(object):
+
+ def test_default_1(self):
+ for itype in '1bcsuwil':
+ assert_equal(mintypecode(itype), 'd')
+ assert_equal(mintypecode('f'), 'f')
+ assert_equal(mintypecode('d'), 'd')
+ assert_equal(mintypecode('F'), 'F')
+ assert_equal(mintypecode('D'), 'D')
+
+ def test_default_2(self):
+ for itype in '1bcsuwil':
+ assert_equal(mintypecode(itype+'f'), 'f')
+ assert_equal(mintypecode(itype+'d'), 'd')
+ assert_equal(mintypecode(itype+'F'), 'F')
+ assert_equal(mintypecode(itype+'D'), 'D')
+ assert_equal(mintypecode('ff'), 'f')
+ assert_equal(mintypecode('fd'), 'd')
+ assert_equal(mintypecode('fF'), 'F')
+ assert_equal(mintypecode('fD'), 'D')
+ assert_equal(mintypecode('df'), 'd')
+ assert_equal(mintypecode('dd'), 'd')
+ #assert_equal(mintypecode('dF',savespace=1),'F')
+ assert_equal(mintypecode('dF'), 'D')
+ assert_equal(mintypecode('dD'), 'D')
+ assert_equal(mintypecode('Ff'), 'F')
+ #assert_equal(mintypecode('Fd',savespace=1),'F')
+ assert_equal(mintypecode('Fd'), 'D')
+ assert_equal(mintypecode('FF'), 'F')
+ assert_equal(mintypecode('FD'), 'D')
+ assert_equal(mintypecode('Df'), 'D')
+ assert_equal(mintypecode('Dd'), 'D')
+ assert_equal(mintypecode('DF'), 'D')
+ assert_equal(mintypecode('DD'), 'D')
+
+ def test_default_3(self):
+ assert_equal(mintypecode('fdF'), 'D')
+ #assert_equal(mintypecode('fdF',savespace=1),'F')
+ assert_equal(mintypecode('fdD'), 'D')
+ assert_equal(mintypecode('fFD'), 'D')
+ assert_equal(mintypecode('dFD'), 'D')
+
+ assert_equal(mintypecode('ifd'), 'd')
+ assert_equal(mintypecode('ifF'), 'F')
+ assert_equal(mintypecode('ifD'), 'D')
+ assert_equal(mintypecode('idF'), 'D')
+ #assert_equal(mintypecode('idF',savespace=1),'F')
+ assert_equal(mintypecode('idD'), 'D')
+
+
+class TestIsscalar(object):
+
+ def test_basic(self):
+ assert_(np.isscalar(3))
+ assert_(not np.isscalar([3]))
+ assert_(not np.isscalar((3,)))
+ assert_(np.isscalar(3j))
+ assert_(np.isscalar(long(10)))
+ assert_(np.isscalar(4.0))
+
+
+class TestReal(object):
+
+ def test_real(self):
+ y = np.random.rand(10,)
+ assert_array_equal(y, np.real(y))
+
+ y = np.array(1)
+ out = np.real(y)
+ assert_array_equal(y, out)
+ assert_(isinstance(out, np.ndarray))
+
+ y = 1
+ out = np.real(y)
+ assert_equal(y, out)
+ assert_(not isinstance(out, np.ndarray))
+
+ def test_cmplx(self):
+ y = np.random.rand(10,)+1j*np.random.rand(10,)
+ assert_array_equal(y.real, np.real(y))
+
+ y = np.array(1 + 1j)
+ out = np.real(y)
+ assert_array_equal(y.real, out)
+ assert_(isinstance(out, np.ndarray))
+
+ y = 1 + 1j
+ out = np.real(y)
+ assert_equal(1.0, out)
+ assert_(not isinstance(out, np.ndarray))
+
+
+class TestImag(object):
+
+ def test_real(self):
+ y = np.random.rand(10,)
+ assert_array_equal(0, np.imag(y))
+
+ y = np.array(1)
+ out = np.imag(y)
+ assert_array_equal(0, out)
+ assert_(isinstance(out, np.ndarray))
+
+ y = 1
+ out = np.imag(y)
+ assert_equal(0, out)
+ assert_(not isinstance(out, np.ndarray))
+
+ def test_cmplx(self):
+ y = np.random.rand(10,)+1j*np.random.rand(10,)
+ assert_array_equal(y.imag, np.imag(y))
+
+ y = np.array(1 + 1j)
+ out = np.imag(y)
+ assert_array_equal(y.imag, out)
+ assert_(isinstance(out, np.ndarray))
+
+ y = 1 + 1j
+ out = np.imag(y)
+ assert_equal(1.0, out)
+ assert_(not isinstance(out, np.ndarray))
+
+
+class TestIscomplex(object):
+
+ def test_fail(self):
+ z = np.array([-1, 0, 1])
+ res = iscomplex(z)
+ assert_(not np.sometrue(res, axis=0))
+
+ def test_pass(self):
+ z = np.array([-1j, 1, 0])
+ res = iscomplex(z)
+ assert_array_equal(res, [1, 0, 0])
+
+
+class TestIsreal(object):
+
+ def test_pass(self):
+ z = np.array([-1, 0, 1j])
+ res = isreal(z)
+ assert_array_equal(res, [1, 1, 0])
+
+ def test_fail(self):
+ z = np.array([-1j, 1, 0])
+ res = isreal(z)
+ assert_array_equal(res, [0, 1, 1])
+
+
+class TestIscomplexobj(object):
+
+ def test_basic(self):
+ z = np.array([-1, 0, 1])
+ assert_(not iscomplexobj(z))
+ z = np.array([-1j, 0, -1])
+ assert_(iscomplexobj(z))
+
+ def test_scalar(self):
+ assert_(not iscomplexobj(1.0))
+ assert_(iscomplexobj(1+0j))
+
+ def test_list(self):
+ assert_(iscomplexobj([3, 1+0j, True]))
+ assert_(not iscomplexobj([3, 1, True]))
+
+ def test_duck(self):
+ class DummyComplexArray:
+ @property
+ def dtype(self):
+ return np.dtype(complex)
+ dummy = DummyComplexArray()
+ assert_(iscomplexobj(dummy))
+
+ def test_pandas_duck(self):
+ # This tests a custom np.dtype duck-typed class, such as used by pandas
+ # (pandas.core.dtypes)
+ class PdComplex(np.complex128):
+ pass
+ class PdDtype(object):
+ name = 'category'
+ names = None
+ type = PdComplex
+ kind = 'c'
+ str = '<c16'
+ base = np.dtype('complex128')
+ class DummyPd:
+ @property
+ def dtype(self):
+ return PdDtype
+ dummy = DummyPd()
+ assert_(iscomplexobj(dummy))
+
+ def test_custom_dtype_duck(self):
+ class MyArray(list):
+ @property
+ def dtype(self):
+ return complex
+
+ a = MyArray([1+0j, 2+0j, 3+0j])
+ assert_(iscomplexobj(a))
+
+
+class TestIsrealobj(object):
+ def test_basic(self):
+ z = np.array([-1, 0, 1])
+ assert_(isrealobj(z))
+ z = np.array([-1j, 0, -1])
+ assert_(not isrealobj(z))
+
+
+class TestIsnan(object):
+
+ def test_goodvalues(self):
+ z = np.array((-1., 0., 1.))
+ res = np.isnan(z) == 0
+ assert_all(np.all(res, axis=0))
+
+ def test_posinf(self):
+ with np.errstate(divide='ignore'):
+ assert_all(np.isnan(np.array((1.,))/0.) == 0)
+
+ def test_neginf(self):
+ with np.errstate(divide='ignore'):
+ assert_all(np.isnan(np.array((-1.,))/0.) == 0)
+
+ def test_ind(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isnan(np.array((0.,))/0.) == 1)
+
+ def test_integer(self):
+ assert_all(np.isnan(1) == 0)
+
+ def test_complex(self):
+ assert_all(np.isnan(1+1j) == 0)
+
+ def test_complex1(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isnan(np.array(0+0j)/0.) == 1)
+
+
+class TestIsfinite(object):
+    # FIXME: this is in the wrong place; isfinite is now a ufunc
+
+ def test_goodvalues(self):
+ z = np.array((-1., 0., 1.))
+ res = np.isfinite(z) == 1
+ assert_all(np.all(res, axis=0))
+
+ def test_posinf(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isfinite(np.array((1.,))/0.) == 0)
+
+ def test_neginf(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isfinite(np.array((-1.,))/0.) == 0)
+
+ def test_ind(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isfinite(np.array((0.,))/0.) == 0)
+
+ def test_integer(self):
+ assert_all(np.isfinite(1) == 1)
+
+ def test_complex(self):
+ assert_all(np.isfinite(1+1j) == 1)
+
+ def test_complex1(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isfinite(np.array(1+1j)/0.) == 0)
+
+
+class TestIsinf(object):
+    # FIXME: this is in the wrong place; isinf is now a ufunc
+
+ def test_goodvalues(self):
+ z = np.array((-1., 0., 1.))
+ res = np.isinf(z) == 0
+ assert_all(np.all(res, axis=0))
+
+ def test_posinf(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isinf(np.array((1.,))/0.) == 1)
+
+ def test_posinf_scalar(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isinf(np.array(1.,)/0.) == 1)
+
+ def test_neginf(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isinf(np.array((-1.,))/0.) == 1)
+
+ def test_neginf_scalar(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isinf(np.array(-1.)/0.) == 1)
+
+ def test_ind(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_all(np.isinf(np.array((0.,))/0.) == 0)
+
+
+class TestIsposinf(object):
+
+ def test_generic(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ vals = isposinf(np.array((-1., 0, 1))/0.)
+ assert_(vals[0] == 0)
+ assert_(vals[1] == 0)
+ assert_(vals[2] == 1)
+
+
+class TestIsneginf(object):
+
+ def test_generic(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ vals = isneginf(np.array((-1., 0, 1))/0.)
+ assert_(vals[0] == 1)
+ assert_(vals[1] == 0)
+ assert_(vals[2] == 0)
+
+
+class TestNanToNum(object):
+
+ def test_generic(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ vals = nan_to_num(np.array((-1., 0, 1))/0.)
+        assert_all(vals[0] < -1e10)
+        assert_all(np.isfinite(vals[0]))
+ assert_(vals[1] == 0)
+        assert_all(vals[2] > 1e10)
+        assert_all(np.isfinite(vals[2]))
+ assert_equal(type(vals), np.ndarray)
+
+ # perform the same test but in-place
+ with np.errstate(divide='ignore', invalid='ignore'):
+ vals = np.array((-1., 0, 1))/0.
+ result = nan_to_num(vals, copy=False)
+
+ assert_(result is vals)
+        assert_all(vals[0] < -1e10)
+        assert_all(np.isfinite(vals[0]))
+ assert_(vals[1] == 0)
+        assert_all(vals[2] > 1e10)
+        assert_all(np.isfinite(vals[2]))
+ assert_equal(type(vals), np.ndarray)
+
+ def test_array(self):
+ vals = nan_to_num([1])
+ assert_array_equal(vals, np.array([1], int))
+ assert_equal(type(vals), np.ndarray)
+
+ def test_integer(self):
+ vals = nan_to_num(1)
+ assert_all(vals == 1)
+ assert_equal(type(vals), np.int_)
+
+ def test_float(self):
+ vals = nan_to_num(1.0)
+ assert_all(vals == 1.0)
+ assert_equal(type(vals), np.float_)
+
+ def test_complex_good(self):
+ vals = nan_to_num(1+1j)
+ assert_all(vals == 1+1j)
+ assert_equal(type(vals), np.complex_)
+
+ def test_complex_bad(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ v = 1 + 1j
+ v += np.array(0+1.j)/0.
+ vals = nan_to_num(v)
+ # !! This is actually (unexpectedly) zero
+ assert_all(np.isfinite(vals))
+ assert_equal(type(vals), np.complex_)
+
+ def test_complex_bad2(self):
+ with np.errstate(divide='ignore', invalid='ignore'):
+ v = 1 + 1j
+ v += np.array(-1+1.j)/0.
+ vals = nan_to_num(v)
+ assert_all(np.isfinite(vals))
+ assert_equal(type(vals), np.complex_)
+ # Fixme
+ #assert_all(vals.imag > 1e10) and assert_all(np.isfinite(vals))
+ # !! This is actually (unexpectedly) positive
+ # !! inf. Comment out for now, and see if it
+ # !! changes
+ #assert_all(vals.real < -1e10) and assert_all(np.isfinite(vals))
+
+
+class TestRealIfClose(object):
+
+ def test_basic(self):
+ a = np.random.rand(10)
+ b = real_if_close(a+1e-15j)
+ assert_all(isrealobj(b))
+ assert_array_equal(a, b)
+ b = real_if_close(a+1e-7j)
+ assert_all(iscomplexobj(b))
+ b = real_if_close(a+1e-7j, tol=1e-6)
+ assert_all(isrealobj(b))
+
+
+class TestArrayConversion(object):
+
+ def test_asfarray(self):
+ a = asfarray(np.array([1, 2, 3]))
+ assert_equal(a.__class__, np.ndarray)
+ assert_(np.issubdtype(a.dtype, np.floating))
+
+ # previously this would infer dtypes from arrays, unlike every single
+ # other numpy function
+ assert_raises(TypeError,
+ asfarray, np.array([1, 2, 3]), dtype=np.array(1.0))
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_ufunclike.py b/contrib/python/numpy/py2/numpy/lib/tests/test_ufunclike.py
new file mode 100644
index 00000000000..0f06876a1b2
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_ufunclike.py
@@ -0,0 +1,106 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+import numpy.core as nx
+import numpy.lib.ufunclike as ufl
+from numpy.testing import (
+ assert_, assert_equal, assert_array_equal, assert_warns, assert_raises
+)
+
+
+class TestUfunclike(object):
+
+ def test_isposinf(self):
+ a = nx.array([nx.inf, -nx.inf, nx.nan, 0.0, 3.0, -3.0])
+ out = nx.zeros(a.shape, bool)
+ tgt = nx.array([True, False, False, False, False, False])
+
+ res = ufl.isposinf(a)
+ assert_equal(res, tgt)
+ res = ufl.isposinf(a, out)
+ assert_equal(res, tgt)
+ assert_equal(out, tgt)
+
+ a = a.astype(np.complex)
+ with assert_raises(TypeError):
+ ufl.isposinf(a)
+
+ def test_isneginf(self):
+ a = nx.array([nx.inf, -nx.inf, nx.nan, 0.0, 3.0, -3.0])
+ out = nx.zeros(a.shape, bool)
+ tgt = nx.array([False, True, False, False, False, False])
+
+ res = ufl.isneginf(a)
+ assert_equal(res, tgt)
+ res = ufl.isneginf(a, out)
+ assert_equal(res, tgt)
+ assert_equal(out, tgt)
+
+ a = a.astype(np.complex)
+ with assert_raises(TypeError):
+ ufl.isneginf(a)
+
+ def test_fix(self):
+ a = nx.array([[1.0, 1.1, 1.5, 1.8], [-1.0, -1.1, -1.5, -1.8]])
+ out = nx.zeros(a.shape, float)
+ tgt = nx.array([[1., 1., 1., 1.], [-1., -1., -1., -1.]])
+
+ res = ufl.fix(a)
+ assert_equal(res, tgt)
+ res = ufl.fix(a, out)
+ assert_equal(res, tgt)
+ assert_equal(out, tgt)
+ assert_equal(ufl.fix(3.14), 3)
+
+ def test_fix_with_subclass(self):
+ class MyArray(nx.ndarray):
+ def __new__(cls, data, metadata=None):
+ res = nx.array(data, copy=True).view(cls)
+ res.metadata = metadata
+ return res
+
+ def __array_wrap__(self, obj, context=None):
+ if isinstance(obj, MyArray):
+ obj.metadata = self.metadata
+ return obj
+
+ def __array_finalize__(self, obj):
+ self.metadata = getattr(obj, 'metadata', None)
+ return self
+
+ a = nx.array([1.1, -1.1])
+ m = MyArray(a, metadata='foo')
+ f = ufl.fix(m)
+ assert_array_equal(f, nx.array([1, -1]))
+ assert_(isinstance(f, MyArray))
+ assert_equal(f.metadata, 'foo')
+
+ # check 0d arrays don't decay to scalars
+ m0d = m[0,...]
+ m0d.metadata = 'bar'
+ f0d = ufl.fix(m0d)
+ assert_(isinstance(f0d, MyArray))
+ assert_equal(f0d.metadata, 'bar')
+
+ def test_deprecated(self):
+ # NumPy 1.13.0, 2017-04-26
+ assert_warns(DeprecationWarning, ufl.fix, [1, 2], y=nx.empty(2))
+ assert_warns(DeprecationWarning, ufl.isposinf, [1, 2], y=nx.empty(2))
+ assert_warns(DeprecationWarning, ufl.isneginf, [1, 2], y=nx.empty(2))
+
+ def test_scalar(self):
+ x = np.inf
+ actual = np.isposinf(x)
+ expected = np.True_
+ assert_equal(actual, expected)
+ assert_equal(type(actual), type(expected))
+
+ x = -3.4
+ actual = np.fix(x)
+ expected = np.float64(-3.0)
+ assert_equal(actual, expected)
+ assert_equal(type(actual), type(expected))
+
+ out = np.array(0.0)
+ actual = np.fix(x, out=out)
+ assert_(actual is out)
diff --git a/contrib/python/numpy/py2/numpy/lib/tests/test_utils.py b/contrib/python/numpy/py2/numpy/lib/tests/test_utils.py
new file mode 100644
index 00000000000..2723f344070
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/lib/tests/test_utils.py
@@ -0,0 +1,91 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import pytest
+
+from numpy.core import arange
+from numpy.testing import assert_, assert_equal, assert_raises_regex
+from numpy.lib import deprecate
+import numpy.lib.utils as utils
+
+if sys.version_info[0] >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
+
+@pytest.mark.skipif(sys.flags.optimize == 2, reason="Python running -OO")
+def test_lookfor():
+ out = StringIO()
+ utils.lookfor('eigenvalue', module='numpy', output=out,
+ import_modules=False)
+ out = out.getvalue()
+ assert_('numpy.linalg.eig' in out)
+
+
+@deprecate
+def old_func(self, x):
+ return x
+
+
+@deprecate(message="Rather use new_func2")
+def old_func2(self, x):
+ return x
+
+
+def old_func3(self, x):
+ return x
+new_func3 = deprecate(old_func3, old_name="old_func3", new_name="new_func3")
+
+
+def test_deprecate_decorator():
+ assert_('deprecated' in old_func.__doc__)
+
+
+def test_deprecate_decorator_message():
+ assert_('Rather use new_func2' in old_func2.__doc__)
+
+
+def test_deprecate_fn():
+ assert_('old_func3' in new_func3.__doc__)
+ assert_('new_func3' in new_func3.__doc__)
+
+
+def test_safe_eval_nameconstant():
+ # Test if safe_eval supports Python 3.4 _ast.NameConstant
+ utils.safe_eval('None')
+
+
+class TestByteBounds(object):
+
+ def test_byte_bounds(self):
+ # pointer difference matches size * itemsize
+ # due to contiguity
+ a = arange(12).reshape(3, 4)
+ low, high = utils.byte_bounds(a)
+ assert_equal(high - low, a.size * a.itemsize)
+
+ def test_unusual_order_positive_stride(self):
+ a = arange(12).reshape(3, 4)
+ b = a.T
+ low, high = utils.byte_bounds(b)
+ assert_equal(high - low, b.size * b.itemsize)
+
+ def test_unusual_order_negative_stride(self):
+ a = arange(12).reshape(3, 4)
+ b = a.T[::-1]
+ low, high = utils.byte_bounds(b)
+ assert_equal(high - low, b.size * b.itemsize)
+
+ def test_strided(self):
+ a = arange(12)
+ b = a[::2]
+ low, high = utils.byte_bounds(b)
+        # with a stride of 2 every other element is skipped, but byte_bounds
+        # still spans the gaps between addressed elements; only the gap after
+        # the last element is excluded, hence size * 2 * itemsize - itemsize
+ assert_equal(high - low, b.size * 2 * b.itemsize - b.itemsize)
+
+
+def test_assert_raises_regex_context_manager():
+ with assert_raises_regex(ValueError, 'no deprecation warning'):
+ raise ValueError('no deprecation warning')
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/clapack_scrub.py b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/clapack_scrub.py
new file mode 100644
index 00000000000..e72a39e64e9
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/clapack_scrub.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+from __future__ import division, absolute_import, print_function
+
+import sys, os
+import re
+from plex import Scanner, Str, Lexicon, Opt, Bol, State, AnyChar, TEXT, IGNORE
+from plex.traditional import re as Re
+
+PY2 = sys.version_info < (3, 0)
+
+if PY2:
+ from io import BytesIO as UStringIO
+else:
+ from io import StringIO as UStringIO
+
+class MyScanner(Scanner):
+ def __init__(self, info, name='<default>'):
+ Scanner.__init__(self, self.lexicon, info, name)
+
+ def begin(self, state_name):
+ Scanner.begin(self, state_name)
+
+def sep_seq(sequence, sep):
+ pat = Str(sequence[0])
+ for s in sequence[1:]:
+ pat += sep + Str(s)
+ return pat
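+
+# For example (illustrative): sep_seq(['(', 'ftnlen', ')'], S) builds a pattern
+# matching "(", "ftnlen" and ")" separated by optional whitespace, as used in
+# the lexicon below.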
+
+def runScanner(data, scanner_class, lexicon=None):
+ info = UStringIO(data)
+ outfo = UStringIO()
+ if lexicon is not None:
+ scanner = scanner_class(lexicon, info)
+ else:
+ scanner = scanner_class(info)
+ while True:
+ value, text = scanner.read()
+ if value is None:
+ break
+ elif value is IGNORE:
+ pass
+ else:
+ outfo.write(value)
+ return outfo.getvalue(), scanner
+
+class LenSubsScanner(MyScanner):
+ """Following clapack, we remove ftnlen arguments, which f2c puts after
+ a char * argument to hold the length of the passed string. This is just
+ a nuisance in C.
+ """
+ def __init__(self, info, name='<ftnlen>'):
+ MyScanner.__init__(self, info, name)
+ self.paren_count = 0
+
+ def beginArgs(self, text):
+ if self.paren_count == 0:
+ self.begin('args')
+ self.paren_count += 1
+ return text
+
+ def endArgs(self, text):
+ self.paren_count -= 1
+ if self.paren_count == 0:
+ self.begin('')
+ return text
+
+ digits = Re('[0-9]+')
+ iofun = Re(r'\([^;]*;')
+ decl = Re(r'\([^)]*\)[,;'+'\n]')
+ any = Re('[.]*')
+ S = Re('[ \t\n]*')
+ cS = Str(',') + S
+ len_ = Re('[a-z][a-z0-9]*_len')
+
+ iofunctions = Str("s_cat", "s_copy", "s_stop", "s_cmp",
+ "i_len", "do_fio", "do_lio") + iofun
+
+ # Routines to not scrub the ftnlen argument from
+ keep_ftnlen = (Str('ilaenv_') | Str('iparmq_') | Str('s_rnge')) + Str('(')
+
+ lexicon = Lexicon([
+ (iofunctions, TEXT),
+ (keep_ftnlen, beginArgs),
+ State('args', [
+ (Str(')'), endArgs),
+ (Str('('), beginArgs),
+ (AnyChar, TEXT),
+ ]),
+ (cS+Re(r'[1-9][0-9]*L'), IGNORE),
+ (cS+Str('ftnlen')+Opt(S+len_), IGNORE),
+ (cS+sep_seq(['(', 'ftnlen', ')'], S)+S+digits, IGNORE),
+ (Bol+Str('ftnlen ')+len_+Str(';\n'), IGNORE),
+ (cS+len_, TEXT),
+ (AnyChar, TEXT),
+ ])
+
+def scrubFtnlen(source):
+ return runScanner(source, LenSubsScanner)[0]
+
+def cleanSource(source):
+ # remove whitespace at end of lines
+ source = re.sub(r'[\t ]+\n', '\n', source)
+ # remove comments like .. Scalar Arguments ..
+ source = re.sub(r'(?m)^[\t ]*/\* *\.\. .*?\n', '', source)
+ # collapse blanks of more than two in-a-row to two
+ source = re.sub(r'\n\n\n\n+', r'\n\n\n', source)
+ return source
+
+class LineQueue(object):
+ def __init__(self):
+ object.__init__(self)
+ self._queue = []
+
+ def add(self, line):
+ self._queue.append(line)
+
+ def clear(self):
+ self._queue = []
+
+ def flushTo(self, other_queue):
+ for line in self._queue:
+ other_queue.add(line)
+ self.clear()
+
+ def getValue(self):
+ q = LineQueue()
+ self.flushTo(q)
+ s = ''.join(q._queue)
+ self.clear()
+ return s
+
+class CommentQueue(LineQueue):
+ def __init__(self):
+ LineQueue.__init__(self)
+
+ def add(self, line):
+ if line.strip() == '':
+ LineQueue.add(self, '\n')
+ else:
+ line = ' ' + line[2:-3].rstrip() + '\n'
+ LineQueue.add(self, line)
+
+ def flushTo(self, other_queue):
+ if len(self._queue) == 0:
+ pass
+ elif len(self._queue) == 1:
+ other_queue.add('/*' + self._queue[0][2:].rstrip() + ' */\n')
+ else:
+ other_queue.add('/*\n')
+ LineQueue.flushTo(self, other_queue)
+ other_queue.add('*/\n')
+ self.clear()
+
+# This really seems to be about 4x longer than it needs to be
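+# cleanComments is a line-driven state machine: SourceLines passes code
+# through, HaveCommentLines gathers a run of /* ... */ lines, and
+# HaveBlankLines tracks blanks between comment runs; each handler returns
+# the next state, and gathered comment runs are re-emitted as one block.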
+def cleanComments(source):
+ lines = LineQueue()
+ comments = CommentQueue()
+ def isCommentLine(line):
+ return line.startswith('/*') and line.endswith('*/\n')
+
+ blanks = LineQueue()
+ def isBlank(line):
+ return line.strip() == ''
+
+ def SourceLines(line):
+ if isCommentLine(line):
+ comments.add(line)
+ return HaveCommentLines
+ else:
+ lines.add(line)
+ return SourceLines
+ def HaveCommentLines(line):
+ if isBlank(line):
+ blanks.add('\n')
+ return HaveBlankLines
+ elif isCommentLine(line):
+ comments.add(line)
+ return HaveCommentLines
+ else:
+ comments.flushTo(lines)
+ lines.add(line)
+ return SourceLines
+ def HaveBlankLines(line):
+ if isBlank(line):
+ blanks.add('\n')
+ return HaveBlankLines
+ elif isCommentLine(line):
+ blanks.flushTo(comments)
+ comments.add(line)
+ return HaveCommentLines
+ else:
+ comments.flushTo(lines)
+ blanks.flushTo(lines)
+ lines.add(line)
+ return SourceLines
+
+ state = SourceLines
+ for line in UStringIO(source):
+ state = state(line)
+ comments.flushTo(lines)
+ return lines.getValue()
+
+def removeHeader(source):
+ lines = LineQueue()
+
+ def LookingForHeader(line):
+ m = re.match(r'/\*[^\n]*-- translated', line)
+ if m:
+ return InHeader
+ else:
+ lines.add(line)
+ return LookingForHeader
+ def InHeader(line):
+ if line.startswith('*/'):
+ return OutOfHeader
+ else:
+ return InHeader
+ def OutOfHeader(line):
+ if line.startswith('#include "f2c.h"'):
+ pass
+ else:
+ lines.add(line)
+ return OutOfHeader
+
+ state = LookingForHeader
+ for line in UStringIO(source):
+ state = state(line)
+ return lines.getValue()
+
+def removeSubroutinePrototypes(source):
+ expression = re.compile(
+ r'/[*] Subroutine [*]/^\s*(?:(?:inline|static)\s+){0,2}(?!else|typedef|return)\w+\s+\*?\s*(\w+)\s*\([^0]+\)\s*;?'
+ )
+ lines = LineQueue()
+ for line in UStringIO(source):
+ if not expression.match(line):
+ lines.add(line)
+
+ return lines.getValue()
+
+def removeBuiltinFunctions(source):
+ lines = LineQueue()
+ def LookingForBuiltinFunctions(line):
+ if line.strip() == '/* Builtin functions */':
+ return InBuiltInFunctions
+ else:
+ lines.add(line)
+ return LookingForBuiltinFunctions
+
+ def InBuiltInFunctions(line):
+ if line.strip() == '':
+ return LookingForBuiltinFunctions
+ else:
+ return InBuiltInFunctions
+
+ state = LookingForBuiltinFunctions
+ for line in UStringIO(source):
+ state = state(line)
+ return lines.getValue()
+
+def replaceDlamch(source):
+ """Replace dlamch_ calls with appropriate macros"""
+ def repl(m):
+ s = m.group(1)
+ return dict(E='EPSILON', P='PRECISION', S='SAFEMINIMUM',
+ B='BASE')[s[0]]
+ source = re.sub(r'dlamch_\("(.*?)"\)', repl, source)
+ source = re.sub(r'^\s+extern.*? dlamch_.*?;$(?m)', '', source)
+ return source
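+
+# For instance (illustrative), a call such as dlamch_("Epsilon") in the f2c
+# output becomes the EPSILON macro, and likewise for "Precision",
+# "Safe minimum" and "Base"; the macros are defined in the preamble of the
+# generated f2c_*.c sources.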
+
+# do it
+
+def scrubSource(source, nsteps=None, verbose=False):
+ steps = [
+ ('scrubbing ftnlen', scrubFtnlen),
+ ('remove header', removeHeader),
+ ('clean source', cleanSource),
+ ('clean comments', cleanComments),
+ ('replace dlamch_() calls', replaceDlamch),
+ ('remove prototypes', removeSubroutinePrototypes),
+ ('remove builtin function prototypes', removeBuiltinFunctions),
+ ]
+
+ if nsteps is not None:
+ steps = steps[:nsteps]
+
+ for msg, step in steps:
+ if verbose:
+ print(msg)
+ source = step(source)
+
+ return source
+
+if __name__ == '__main__':
+ filename = sys.argv[1]
+ outfilename = os.path.join(sys.argv[2], os.path.basename(filename))
+ fo = open(filename, 'r')
+ source = fo.read()
+ fo.close()
+
+ if len(sys.argv) > 3:
+ nsteps = int(sys.argv[3])
+ else:
+ nsteps = None
+
+    source = scrubSource(source, nsteps, verbose=True)
+
+ writefo = open(outfilename, 'w')
+ writefo.write(source)
+ writefo.close()
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.c
new file mode 100644
index 00000000000..1114bef3b1c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.c
@@ -0,0 +1,764 @@
+/*
+ Functions here are copied from the source code for libf2c.
+
+ Typically each function there is in its own file.
+
+ We don't link against libf2c directly, because we can't guarantee
+ it is available, and shipping a static library isn't portable.
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "f2c.h"
+
+
+extern void s_wsfe(cilist *f) {;}
+extern void e_wsfe(void) {;}
+extern void do_fio(integer *c, char *s, ftnlen l) {;}
+
+/* You'll want this if you redo the f2c_*.c files with the -C option
+ * to f2c for checking array subscripts. (It's not suggested you do that
+ * for production use, of course.) */
+extern int
+s_rnge(char *var, int index, char *routine, int lineno)
+{
+ fprintf(stderr, "array index out-of-bounds for %s[%d] in routine %s:%d\n",
+ var, index, routine, lineno);
+ fflush(stderr);
+ abort();
+}
+
+#ifdef KR_headers
+extern float sqrtf();
+double f__cabsf(real, imag) float real, imag;
+#else
+#undef abs
+
+double f__cabsf(float real, float imag)
+#endif
+{
+float temp;
+
+if(real < 0.0f)
+ real = -real;
+if(imag < 0.0f)
+ imag = -imag;
+if(imag > real){
+ temp = real;
+ real = imag;
+ imag = temp;
+}
+if((imag+real) == real)
+ return((float)real);
+
+temp = imag/real;
+temp = real*sqrtf(1.0 + temp*temp); /*overflow!!*/
+return(temp);
+}
+
+
+#ifdef KR_headers
+extern double sqrt();
+double f__cabs(real, imag) double real, imag;
+#else
+#undef abs
+
+double f__cabs(double real, double imag)
+#endif
+{
+double temp;
+
+if(real < 0)
+ real = -real;
+if(imag < 0)
+ imag = -imag;
+if(imag > real){
+ temp = real;
+ real = imag;
+ imag = temp;
+}
+if((imag+real) == real)
+ return((double)real);
+
+temp = imag/real;
+temp = real*sqrt(1.0 + temp*temp); /*overflow!!*/
+return(temp);
+}
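+
+/* Note on f__cabsf/f__cabs above: the larger of |real| and |imag| is factored
+   out so the modulus is computed as max * sqrt(1 + (min/max)^2); squaring the
+   raw components directly could overflow even when the result is finite. */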
+
+ VOID
+#ifdef KR_headers
+r_cnjg(r, z) complex *r, *z;
+#else
+r_cnjg(complex *r, complex *z)
+#endif
+{
+r->r = z->r;
+r->i = - z->i;
+}
+
+ VOID
+#ifdef KR_headers
+d_cnjg(r, z) doublecomplex *r, *z;
+#else
+d_cnjg(doublecomplex *r, doublecomplex *z)
+#endif
+{
+r->r = z->r;
+r->i = - z->i;
+}
+
+
+#ifdef KR_headers
+float r_imag(z) complex *z;
+#else
+float r_imag(complex *z)
+#endif
+{
+return(z->i);
+}
+
+#ifdef KR_headers
+double d_imag(z) doublecomplex *z;
+#else
+double d_imag(doublecomplex *z)
+#endif
+{
+return(z->i);
+}
+
+
+#define log10e 0.43429448190325182765
+
+#ifdef KR_headers
+float logf();
+float r_lg10(x) real *x;
+#else
+#undef abs
+
+float r_lg10(real *x)
+#endif
+{
+return( log10e * logf(*x) );
+}
+
+#ifdef KR_headers
+double log();
+double d_lg10(x) doublereal *x;
+#else
+#undef abs
+
+double d_lg10(doublereal *x)
+#endif
+{
+return( log10e * log(*x) );
+}
+
+#ifdef KR_headers
+double r_sign(a,b) real *a, *b;
+#else
+double r_sign(real *a, real *b)
+#endif
+{
+float x;
+x = (*a >= 0.0f ? *a : - *a);
+return( *b >= 0.0f ? x : -x);
+}
+
+#ifdef KR_headers
+double d_sign(a,b) doublereal *a, *b;
+#else
+double d_sign(doublereal *a, doublereal *b)
+#endif
+{
+double x;
+x = (*a >= 0 ? *a : - *a);
+return( *b >= 0 ? x : -x);
+}
+
+
+#ifdef KR_headers
+double floor();
+integer i_dnnt(x) doublereal *x;
+#else
+#undef abs
+
+integer i_dnnt(doublereal *x)
+#endif
+{
+return( (*x)>=0 ?
+ floor(*x + .5) : -floor(.5 - *x) );
+}
+
+
+#ifdef KR_headers
+double floor();
+integer i_nint(x) real *x;
+#else
+#undef abs
+integer i_nint(real *x)
+#endif
+{
+return (integer)(*x >= 0 ? floor(*x + .5) : -floor(.5 - *x));
+}
+
+#ifdef KR_headers
+double pow();
+double pow_dd(ap, bp) doublereal *ap, *bp;
+#else
+#undef abs
+
+double pow_dd(doublereal *ap, doublereal *bp)
+#endif
+{
+return(pow(*ap, *bp) );
+}
+
+
+#ifdef KR_headers
+double pow_ri(ap, bp) real *ap; integer *bp;
+#else
+double pow_ri(real *ap, integer *bp)
+#endif
+{
+float pow, x;
+integer n;
+unsigned long u;
+
+pow = 1;
+x = *ap;
+n = *bp;
+
+if(n != 0)
+ {
+ if(n < 0)
+ {
+ n = -n;
+ x = 1.0f/x;
+ }
+ for(u = n; ; )
+ {
+ if(u & 01)
+ pow *= x;
+ if(u >>= 1)
+ x *= x;
+ else
+ break;
+ }
+ }
+return(pow);
+}
+
+#ifdef KR_headers
+double pow_di(ap, bp) doublereal *ap; integer *bp;
+#else
+double pow_di(doublereal *ap, integer *bp)
+#endif
+{
+double pow, x;
+integer n;
+unsigned long u;
+
+pow = 1;
+x = *ap;
+n = *bp;
+
+if(n != 0)
+ {
+ if(n < 0)
+ {
+ n = -n;
+ x = 1/x;
+ }
+ for(u = n; ; )
+ {
+ if(u & 01)
+ pow *= x;
+ if(u >>= 1)
+ x *= x;
+ else
+ break;
+ }
+ }
+return(pow);
+}
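+
+/* pow_ri and pow_di (and pow_zi below) use square-and-multiply exponentiation:
+   the exponent is scanned bit by bit (u & 01) while x is repeatedly squared,
+   so x**n costs O(log n) multiplications instead of n - 1. */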
+
+#ifdef KR_headers
+VOID pow_zi(p, a, b) /* p = a**b */
+ doublecomplex *p, *a; integer *b;
+#else
+extern void z_div(doublecomplex*, doublecomplex*, doublecomplex*);
+void pow_zi(doublecomplex *p, doublecomplex *a, integer *b) /* p = a**b */
+#endif
+{
+ integer n;
+ unsigned long u;
+ double t;
+ doublecomplex q, x;
+ static doublecomplex one = {1.0, 0.0};
+
+ n = *b;
+ q.r = 1;
+ q.i = 0;
+
+ if(n == 0)
+ goto done;
+ if(n < 0)
+ {
+ n = -n;
+ z_div(&x, &one, a);
+ }
+ else
+ {
+ x.r = a->r;
+ x.i = a->i;
+ }
+
+ for(u = n; ; )
+ {
+ if(u & 01)
+ {
+ t = q.r * x.r - q.i * x.i;
+ q.i = q.r * x.i + q.i * x.r;
+ q.r = t;
+ }
+ if(u >>= 1)
+ {
+ t = x.r * x.r - x.i * x.i;
+ x.i = 2 * x.r * x.i;
+ x.r = t;
+ }
+ else
+ break;
+ }
+ done:
+ p->i = q.i;
+ p->r = q.r;
+ }
+
+#ifdef KR_headers
+VOID pow_ci(p, a, b) /* p = a**b */
+ complex *p, *a; integer *b;
+#else
+extern void pow_zi(doublecomplex*, doublecomplex*, integer*);
+void pow_ci(complex *p, complex *a, integer *b) /* p = a**b */
+#endif
+{
+doublecomplex p1, a1;
+
+a1.r = a->r;
+a1.i = a->i;
+
+pow_zi(&p1, &a1, b);
+
+p->r = p1.r;
+p->i = p1.i;
+}
+
+/* Unless compiled with -DNO_OVERWRITE, this variant of s_cat allows the
+ * target of a concatenation to appear on its right-hand side (contrary
+ * to the Fortran 77 Standard, but in accordance with Fortran 90).
+ */
+#define NO_OVERWRITE
+
+
+#ifndef NO_OVERWRITE
+
+#undef abs
+#ifdef KR_headers
+ extern char *F77_aloc();
+ extern void free();
+ extern void exit_();
+#else
+
+ extern char *F77_aloc(ftnlen, char*);
+#endif
+
+#endif /* NO_OVERWRITE */
+
+ VOID
+#ifdef KR_headers
+s_cat(lp, rpp, rnp, np, ll) char *lp, *rpp[]; ftnlen rnp[], *np, ll;
+#else
+s_cat(char *lp, char *rpp[], ftnlen rnp[], ftnlen *np, ftnlen ll)
+#endif
+{
+ ftnlen i, nc;
+ char *rp;
+ ftnlen n = *np;
+#ifndef NO_OVERWRITE
+ ftnlen L, m;
+ char *lp0, *lp1;
+
+ lp0 = 0;
+ lp1 = lp;
+ L = ll;
+ i = 0;
+ while(i < n) {
+ rp = rpp[i];
+ m = rnp[i++];
+ if (rp >= lp1 || rp + m <= lp) {
+ if ((L -= m) <= 0) {
+ n = i;
+ break;
+ }
+ lp1 += m;
+ continue;
+ }
+ lp0 = lp;
+ lp = lp1 = F77_aloc(L = ll, "s_cat");
+ break;
+ }
+ lp1 = lp;
+#endif /* NO_OVERWRITE */
+ for(i = 0 ; i < n ; ++i) {
+ nc = ll;
+ if(rnp[i] < nc)
+ nc = rnp[i];
+ ll -= nc;
+ rp = rpp[i];
+ while(--nc >= 0)
+ *lp++ = *rp++;
+ }
+ while(--ll >= 0)
+ *lp++ = ' ';
+#ifndef NO_OVERWRITE
+ if (lp0) {
+ memmove(lp0, lp1, L);
+ free(lp1);
+ }
+#endif
+ }
+
+
+/* compare two strings */
+
+#ifdef KR_headers
+integer s_cmp(a0, b0, la, lb) char *a0, *b0; ftnlen la, lb;
+#else
+integer s_cmp(char *a0, char *b0, ftnlen la, ftnlen lb)
+#endif
+{
+register unsigned char *a, *aend, *b, *bend;
+a = (unsigned char *)a0;
+b = (unsigned char *)b0;
+aend = a + la;
+bend = b + lb;
+
+if(la <= lb)
+ {
+ while(a < aend)
+ if(*a != *b)
+ return( *a - *b );
+ else
+ { ++a; ++b; }
+
+ while(b < bend)
+ if(*b != ' ')
+ return( ' ' - *b );
+ else ++b;
+ }
+
+else
+ {
+ while(b < bend)
+ if(*a == *b)
+ { ++a; ++b; }
+ else
+ return( *a - *b );
+ while(a < aend)
+ if(*a != ' ')
+ return(*a - ' ');
+ else ++a;
+ }
+return(0);
+}
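+
+/* Note: s_cmp implements Fortran-style comparison -- the shorter operand is
+   treated as if blank-padded to the longer length, so trailing spaces do not
+   affect the result. */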
+/* Unless compiled with -DNO_OVERWRITE, this variant of s_copy allows the
+ * target of an assignment to appear on its right-hand side (contrary
+ * to the Fortran 77 Standard, but in accordance with Fortran 90),
+ * as in a(2:5) = a(4:7) .
+ */
+
+
+
+/* assign strings: a = b */
+
+#ifdef KR_headers
+VOID s_copy(a, b, la, lb) register char *a, *b; ftnlen la, lb;
+#else
+void s_copy(register char *a, register char *b, ftnlen la, ftnlen lb)
+#endif
+{
+ register char *aend, *bend;
+
+ aend = a + la;
+
+ if(la <= lb)
+#ifndef NO_OVERWRITE
+ if (a <= b || a >= b + la)
+#endif
+ while(a < aend)
+ *a++ = *b++;
+#ifndef NO_OVERWRITE
+ else
+ for(b += la; a < aend; )
+ *--aend = *--b;
+#endif
+
+ else {
+ bend = b + lb;
+#ifndef NO_OVERWRITE
+ if (a <= b || a >= bend)
+#endif
+ while(b < bend)
+ *a++ = *b++;
+#ifndef NO_OVERWRITE
+ else {
+ a += lb;
+ while(b < bend)
+ *--a = *--bend;
+ a += lb;
+ }
+#endif
+ while(a < aend)
+ *a++ = ' ';
+ }
+ }
+
+
+#ifdef KR_headers
+double f__cabsf();
+double c_abs(z) complex *z;
+#else
+double f__cabsf(float, float);
+double c_abs(complex *z)
+#endif
+{
+return( f__cabsf( z->r, z->i ) );
+}
+
+#ifdef KR_headers
+double f__cabs();
+double z_abs(z) doublecomplex *z;
+#else
+double f__cabs(double, double);
+double z_abs(doublecomplex *z)
+#endif
+{
+return( f__cabs( z->r, z->i ) );
+}
+
+
+#ifdef KR_headers
+extern void sig_die();
+VOID c_div(c, a, b) complex *a, *b, *c;
+#else
+extern void sig_die(char*, int);
+void c_div(complex *c, complex *a, complex *b)
+#endif
+{
+float ratio, den;
+float abr, abi;
+
+if( (abr = b->r) < 0.f)
+ abr = - abr;
+if( (abi = b->i) < 0.f)
+ abi = - abi;
+if( abr <= abi )
+ {
+	/*Let IEEE Infinities handle this ;( */
+ /*if(abi == 0)
+ sig_die("complex division by zero", 1);*/
+ ratio = b->r / b->i ;
+ den = b->i * (1 + ratio*ratio);
+ c->r = (a->r*ratio + a->i) / den;
+ c->i = (a->i*ratio - a->r) / den;
+ }
+
+else
+ {
+ ratio = b->i / b->r ;
+ den = b->r * (1.f + ratio*ratio);
+ c->r = (a->r + a->i*ratio) / den;
+ c->i = (a->i - a->r*ratio) / den;
+ }
+
+}
+
+#ifdef KR_headers
+extern void sig_die();
+VOID z_div(c, a, b) doublecomplex *a, *b, *c;
+#else
+extern void sig_die(char*, int);
+void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b)
+#endif
+{
+double ratio, den;
+double abr, abi;
+
+if( (abr = b->r) < 0.)
+ abr = - abr;
+if( (abi = b->i) < 0.)
+ abi = - abi;
+if( abr <= abi )
+ {
+	/*Let IEEE Infinities handle this ;( */
+ /*if(abi == 0)
+ sig_die("complex division by zero", 1);*/
+ ratio = b->r / b->i ;
+ den = b->i * (1 + ratio*ratio);
+ c->r = (a->r*ratio + a->i) / den;
+ c->i = (a->i*ratio - a->r) / den;
+ }
+
+else
+ {
+ ratio = b->i / b->r ;
+ den = b->r * (1 + ratio*ratio);
+ c->r = (a->r + a->i*ratio) / den;
+ c->i = (a->i - a->r*ratio) / den;
+ }
+
+}
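+
+/* c_div and z_div use the scaled ("Smith") form of complex division: the
+   divisor component of larger magnitude is factored out via a ratio, so the
+   squared modulus of b is never formed directly and intermediate
+   overflow/underflow is avoided. */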
+
+
+#ifdef KR_headers
+float sqrtf(), f__cabsf();
+VOID c_sqrt(r, z) complex *r, *z;
+#else
+#undef abs
+
+extern double f__cabsf(float, float);
+void c_sqrt(complex *r, complex *z)
+#endif
+{
+float mag;
+
+if( (mag = f__cabsf(z->r, z->i)) == 0.f)
+ r->r = r->i = 0.f;
+else if(z->r > 0.0f)
+ {
+ r->r = sqrtf(0.5f * (mag + z->r) );
+ r->i = z->i / r->r / 2.0f;
+ }
+else
+ {
+ r->i = sqrtf(0.5f * (mag - z->r) );
+ if(z->i < 0.0f)
+ r->i = - r->i;
+ r->r = z->i / r->i / 2.0f;
+ }
+}
+
+
+#ifdef KR_headers
+double sqrt(), f__cabs();
+VOID z_sqrt(r, z) doublecomplex *r, *z;
+#else
+#undef abs
+
+extern double f__cabs(double, double);
+void z_sqrt(doublecomplex *r, doublecomplex *z)
+#endif
+{
+double mag;
+
+if( (mag = f__cabs(z->r, z->i)) == 0.)
+ r->r = r->i = 0.;
+else if(z->r > 0)
+ {
+ r->r = sqrt(0.5 * (mag + z->r) );
+ r->i = z->i / r->r / 2;
+ }
+else
+ {
+ r->i = sqrt(0.5 * (mag - z->r) );
+ if(z->i < 0)
+ r->i = - r->i;
+ r->r = z->i / r->i / 2;
+ }
+}
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef KR_headers
+integer pow_ii(ap, bp) integer *ap, *bp;
+#else
+integer pow_ii(integer *ap, integer *bp)
+#endif
+{
+ integer pow, x, n;
+ unsigned long u;
+
+ x = *ap;
+ n = *bp;
+
+ if (n <= 0) {
+ if (n == 0 || x == 1)
+ return 1;
+ if (x != -1)
+ return x == 0 ? 1/x : 0;
+ n = -n;
+ }
+ u = n;
+ for(pow = 1; ; )
+ {
+ if(u & 01)
+ pow *= x;
+ if(u >>= 1)
+ x *= x;
+ else
+ break;
+ }
+ return(pow);
+ }
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef KR_headers
+extern void f_exit();
+VOID s_stop(s, n) char *s; ftnlen n;
+#else
+#undef abs
+#undef min
+#undef max
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+void f_exit(void);
+
+int s_stop(char *s, ftnlen n)
+#endif
+{
+int i;
+
+if(n > 0)
+ {
+ fprintf(stderr, "STOP ");
+ for(i = 0; i<n ; ++i)
+ putc(*s++, stderr);
+ fprintf(stderr, " statement executed\n");
+ }
+#ifdef NO_ONEXIT
+f_exit();
+#endif
+exit(0);
+
+/* We cannot avoid (useless) compiler diagnostics here: */
+/* some compilers complain if there is no return statement, */
+/* and others complain that this one cannot be reached. */
+
+return 0; /* NOT REACHED */
+}
+#ifdef __cplusplus
+}
+#endif
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.h b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.h
new file mode 100644
index 00000000000..80f1a12b19d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c.h
@@ -0,0 +1,388 @@
+/* f2c.h -- Standard Fortran to C header file */
+
+/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed."
+
+ - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
+
+#ifndef F2C_INCLUDE
+#define F2C_INCLUDE
+
+#include <math.h>
+
+typedef int integer;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+#ifdef f2c_i2
+/* for -i2 */
+typedef short flag;
+typedef short ftnlen;
+typedef short ftnint;
+#else
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+#endif
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+ ftnint ciunit;
+ flag ciend;
+ char *cifmt;
+ ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{ flag icierr;
+ char *iciunit;
+ flag iciend;
+ char *icifmt;
+ ftnint icirlen;
+ ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{ flag oerr;
+ ftnint ounit;
+ char *ofnm;
+ ftnlen ofnmlen;
+ char *osta;
+ char *oacc;
+ char *ofm;
+ ftnint orl;
+ char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{ flag cerr;
+ ftnint cunit;
+ char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{ flag aerr;
+ ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{ flag inerr;
+ ftnint inunit;
+ char *infile;
+ ftnlen infilen;
+ ftnint *inex; /*parameters in standard's order*/
+ ftnint *inopen;
+ ftnint *innum;
+ ftnint *innamed;
+ char *inname;
+ ftnlen innamlen;
+ char *inacc;
+ ftnlen inacclen;
+ char *inseq;
+ ftnlen inseqlen;
+ char *indir;
+ ftnlen indirlen;
+ char *infmt;
+ ftnlen infmtlen;
+ char *inform;
+ ftnint informlen;
+ char *inunf;
+ ftnlen inunflen;
+ ftnint *inrecl;
+ ftnint *innrec;
+ char *inblank;
+ ftnlen inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype { /* for multiple entry points */
+ shortint h;
+ integer i;
+ real r;
+ doublereal d;
+ complex c;
+ doublecomplex z;
+ };
+
+typedef union Multitype Multitype;
+
+typedef long Long; /* No longer used; formerly in Namelist */
+
+struct Vardesc { /* for Namelist */
+ char *name;
+ char *addr;
+ ftnlen *dims;
+ int type;
+ };
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+ char *name;
+ Vardesc **vars;
+ int nvars;
+ };
+typedef struct Namelist Namelist;
+
+#ifndef abs
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#endif
+#define dabs(x) (doublereal)abs(x)
+#ifndef min
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#endif
+#ifndef max
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+#endif
+#define dmin(a,b) (doublereal)min(a,b)
+#define dmax(a,b) (doublereal)max(a,b)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef int /* Unknown procedure type */ (*U_fp)(...);
+typedef shortint (*J_fp)(...);
+typedef integer (*I_fp)(...);
+typedef real (*R_fp)(...);
+typedef doublereal (*D_fp)(...), (*E_fp)(...);
+typedef /* Complex */ VOID (*C_fp)(...);
+typedef /* Double Complex */ VOID (*Z_fp)(...);
+typedef logical (*L_fp)(...);
+typedef shortlogical (*K_fp)(...);
+typedef /* Character */ VOID (*H_fp)(...);
+typedef /* Subroutine */ int (*S_fp)(...);
+#else
+typedef int /* Unknown procedure type */ (*U_fp)(void);
+typedef shortint (*J_fp)(void);
+typedef integer (*I_fp)(void);
+typedef real (*R_fp)(void);
+typedef doublereal (*D_fp)(void), (*E_fp)(void);
+typedef /* Complex */ VOID (*C_fp)(void);
+typedef /* Double Complex */ VOID (*Z_fp)(void);
+typedef logical (*L_fp)(void);
+typedef shortlogical (*K_fp)(void);
+typedef /* Character */ VOID (*H_fp)(void);
+typedef /* Subroutine */ int (*S_fp)(void);
+#endif
+/* E_fp is for real functions when -R is not specified */
+typedef VOID C_f; /* complex function */
+typedef VOID H_f; /* character function */
+typedef VOID Z_f; /* double complex function */
+typedef doublereal E_f; /* real function with -R not specified */
+
+/* undef any lower-case symbols that your C compiler predefines, e.g.: */
+
+#ifndef Skip_f2c_Undefs
+#undef cray
+#undef gcos
+#undef mc68010
+#undef mc68020
+#undef mips
+#undef pdp11
+#undef sgi
+#undef sparc
+#undef sun
+#undef sun2
+#undef sun3
+#undef sun4
+#undef u370
+#undef u3b
+#undef u3b2
+#undef u3b5
+#undef unix
+#undef vax
+#endif
+
+/* https://anonscm.debian.org/cgit/collab-maint/libf2c2.git/tree/f2ch.add */
+
+/* If you are using a C++ compiler, append the following to f2c.h
+ for compiling libF77 and libI77. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int abort_(void);
+extern double c_abs(complex *);
+extern void c_cos(complex *, complex *);
+extern void c_div(complex *, complex *, complex *);
+extern void c_exp(complex *, complex *);
+extern void c_log(complex *, complex *);
+extern void c_sin(complex *, complex *);
+extern void c_sqrt(complex *, complex *);
+extern double d_abs(double *);
+extern double d_acos(double *);
+extern double d_asin(double *);
+extern double d_atan(double *);
+extern double d_atn2(double *, double *);
+extern void d_cnjg(doublecomplex *, doublecomplex *);
+extern double d_cos(double *);
+extern double d_cosh(double *);
+extern double d_dim(double *, double *);
+extern double d_exp(double *);
+extern double d_imag(doublecomplex *);
+extern double d_int(double *);
+extern double d_lg10(double *);
+extern double d_log(double *);
+extern double d_mod(double *, double *);
+extern double d_nint(double *);
+extern double d_prod(float *, float *);
+extern double d_sign(double *, double *);
+extern double d_sin(double *);
+extern double d_sinh(double *);
+extern double d_sqrt(double *);
+extern double d_tan(double *);
+extern double d_tanh(double *);
+extern double derf_(double *);
+extern double derfc_(double *);
+extern void do_fio(ftnint *, char *, ftnlen);
+extern integer do_lio(ftnint *, ftnint *, char *, ftnlen);
+extern integer do_uio(ftnint *, char *, ftnlen);
+extern integer e_rdfe(void);
+extern integer e_rdue(void);
+extern integer e_rsfe(void);
+extern integer e_rsfi(void);
+extern integer e_rsle(void);
+extern integer e_rsli(void);
+extern integer e_rsue(void);
+extern integer e_wdfe(void);
+extern integer e_wdue(void);
+extern void e_wsfe(void);
+extern integer e_wsfi(void);
+extern integer e_wsle(void);
+extern integer e_wsli(void);
+extern integer e_wsue(void);
+extern int ef1asc_(ftnint *, ftnlen *, ftnint *, ftnlen *);
+extern integer ef1cmc_(ftnint *, ftnlen *, ftnint *, ftnlen *);
+
+extern double erf_(float *);
+extern double erfc_(float *);
+extern integer f_back(alist *);
+extern integer f_clos(cllist *);
+extern integer f_end(alist *);
+extern void f_exit(void);
+extern integer f_inqu(inlist *);
+extern integer f_open(olist *);
+extern integer f_rew(alist *);
+extern int flush_(void);
+extern void getarg_(integer *, char *, ftnlen);
+extern void getenv_(char *, char *, ftnlen, ftnlen);
+extern short h_abs(short *);
+extern short h_dim(short *, short *);
+extern short h_dnnt(double *);
+extern short h_indx(char *, char *, ftnlen, ftnlen);
+extern short h_len(char *, ftnlen);
+extern short h_mod(short *, short *);
+extern short h_nint(float *);
+extern short h_sign(short *, short *);
+extern short hl_ge(char *, char *, ftnlen, ftnlen);
+extern short hl_gt(char *, char *, ftnlen, ftnlen);
+extern short hl_le(char *, char *, ftnlen, ftnlen);
+extern short hl_lt(char *, char *, ftnlen, ftnlen);
+extern integer i_abs(integer *);
+extern integer i_dim(integer *, integer *);
+extern integer i_dnnt(double *);
+extern integer i_indx(char *, char *, ftnlen, ftnlen);
+extern integer i_len(char *, ftnlen);
+extern integer i_mod(integer *, integer *);
+extern integer i_nint(float *);
+extern integer i_sign(integer *, integer *);
+extern integer iargc_(void);
+extern ftnlen l_ge(char *, char *, ftnlen, ftnlen);
+extern ftnlen l_gt(char *, char *, ftnlen, ftnlen);
+extern ftnlen l_le(char *, char *, ftnlen, ftnlen);
+extern ftnlen l_lt(char *, char *, ftnlen, ftnlen);
+extern void pow_ci(complex *, complex *, integer *);
+extern double pow_dd(double *, double *);
+extern double pow_di(double *, integer *);
+extern short pow_hh(short *, shortint *);
+extern integer pow_ii(integer *, integer *);
+extern double pow_ri(float *, integer *);
+extern void pow_zi(doublecomplex *, doublecomplex *, integer *);
+extern void pow_zz(doublecomplex *, doublecomplex *, doublecomplex *);
+extern double r_abs(float *);
+extern double r_acos(float *);
+extern double r_asin(float *);
+extern double r_atan(float *);
+extern double r_atn2(float *, float *);
+extern void r_cnjg(complex *, complex *);
+extern double r_cos(float *);
+extern double r_cosh(float *);
+extern double r_dim(float *, float *);
+extern double r_exp(float *);
+extern float r_imag(complex *);
+extern double r_int(float *);
+extern float r_lg10(real *);
+extern double r_log(float *);
+extern double r_mod(float *, float *);
+extern double r_nint(float *);
+extern double r_sign(float *, float *);
+extern double r_sin(float *);
+extern double r_sinh(float *);
+extern double r_sqrt(float *);
+extern double r_tan(float *);
+extern double r_tanh(float *);
+extern void s_cat(char *, char **, integer *, integer *, ftnlen);
+extern integer s_cmp(char *, char *, ftnlen, ftnlen);
+extern void s_copy(char *, char *, ftnlen, ftnlen);
+extern int s_paus(char *, ftnlen);
+extern integer s_rdfe(cilist *);
+extern integer s_rdue(cilist *);
+extern integer s_rnge(char *, integer, char *, integer);
+extern integer s_rsfe(cilist *);
+extern integer s_rsfi(icilist *);
+extern integer s_rsle(cilist *);
+extern integer s_rsli(icilist *);
+extern integer s_rsne(cilist *);
+extern integer s_rsni(icilist *);
+extern integer s_rsue(cilist *);
+extern int s_stop(char *, ftnlen);
+extern integer s_wdfe(cilist *);
+extern integer s_wdue(cilist *);
+extern void s_wsfe( cilist *);
+extern integer s_wsfi(icilist *);
+extern integer s_wsle(cilist *);
+extern integer s_wsli(icilist *);
+extern integer s_wsne(cilist *);
+extern integer s_wsni(icilist *);
+extern integer s_wsue(cilist *);
+extern void sig_die(char *, int);
+extern integer signal_(integer *, void (*)(int));
+extern integer system_(char *, ftnlen);
+extern double z_abs(doublecomplex *);
+extern void z_cos(doublecomplex *, doublecomplex *);
+extern void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+extern void z_exp(doublecomplex *, doublecomplex *);
+extern void z_log(doublecomplex *, doublecomplex *);
+extern void z_sin(doublecomplex *, doublecomplex *);
+extern void z_sqrt(doublecomplex *, doublecomplex *);
+
+#ifdef __cplusplus
+ }
+#endif
+
+#endif
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_blas.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_blas.c
new file mode 100644
index 00000000000..3af506b7124
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_blas.c
@@ -0,0 +1,21615 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static complex c_b21 = {1.f,0.f};
+static doublecomplex c_b1078 = {1.,0.};
+
+/* Subroutine */ int caxpy_(integer *n, complex *ca, complex *cx, integer *
+ incx, complex *cy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ extern doublereal scabs1_(complex *);
+
+
+/*
+ Purpose
+ =======
+
+ CAXPY constant times a vector plus a vector.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
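+
+/* In plain terms: cy[i] += ca * cx[i] for i = 0..n-1, walking cx with stride
+   incx and cy with stride incy (negative strides start from the far end). */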
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (scabs1_(ca) == 0.f) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = iy;
+ i__4 = ix;
+ q__2.r = ca->r * cx[i__4].r - ca->i * cx[i__4].i, q__2.i = ca->r * cx[
+ i__4].i + ca->i * cx[i__4].r;
+ q__1.r = cy[i__3].r + q__2.r, q__1.i = cy[i__3].i + q__2.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ i__4 = i__;
+ q__2.r = ca->r * cx[i__4].r - ca->i * cx[i__4].i, q__2.i = ca->r * cx[
+ i__4].i + ca->i * cx[i__4].r;
+ q__1.r = cy[i__3].r + q__2.r, q__1.i = cy[i__3].i + q__2.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+/* L30: */
+ }
+ return 0;
+} /* caxpy_ */
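+
+/*
+  Editor's note -- illustrative sketch only, not part of the generated f2c
+  translation.  A minimal example of calling caxpy_ from C, assuming the
+  f2c.h types used throughout this file (integer, and complex with .r/.i
+  float members); the helper name example_caxpy_usage is hypothetical.
+  Every argument is passed by pointer, Fortran-style.  ccopy_ below follows
+  the same calling convention.  Wrapped in #if 0 so it is never compiled.
+*/
+#if 0
+static void example_caxpy_usage(void)
+{
+    complex ca    = {2.f, 0.f};                          /* scalar a = 2 */
+    complex cx[3] = {{1.f,0.f}, {0.f,1.f}, {1.f,1.f}};   /* x            */
+    complex cy[3] = {{1.f,0.f}, {1.f,0.f}, {1.f,0.f}};   /* y            */
+    integer n = 3, inc = 1;
+
+    caxpy_(&n, &ca, cx, &inc, cy, &inc);    /* y := a*x + y */
+    /* cy is now {3+0i, 1+2i, 3+2i} */
+}
+#endif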
+
+/* Subroutine */ int ccopy_(integer *n, complex *cx, integer *incx, complex *
+ cy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+
+
+/*
+ Purpose
+ =======
+
+ CCOPY copies a vector x to a vector y.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = ix;
+ cy[i__2].r = cx[i__3].r, cy[i__2].i = cx[i__3].i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ cy[i__2].r = cx[i__3].r, cy[i__2].i = cx[i__3].i;
+/* L30: */
+ }
+ return 0;
+} /* ccopy_ */
+
+/* Complex */ VOID cdotc_(complex * ret_val, integer *n, complex *cx, integer
+ *incx, complex *cy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static complex ctemp;
+
+
+/*
+ Purpose
+ =======
+
+    CDOTC forms the dot product of two vectors, conjugating the first
+    vector.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ ctemp.r = 0.f, ctemp.i = 0.f;
+ ret_val->r = 0.f, ret_val->i = 0.f;
+ if (*n <= 0) {
+ return ;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r_cnjg(&q__3, &cx[ix]);
+ i__2 = iy;
+ q__2.r = q__3.r * cy[i__2].r - q__3.i * cy[i__2].i, q__2.i = q__3.r *
+ cy[i__2].i + q__3.i * cy[i__2].r;
+ q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
+ ctemp.r = q__1.r, ctemp.i = q__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ ret_val->r = ctemp.r, ret_val->i = ctemp.i;
+ return ;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r_cnjg(&q__3, &cx[i__]);
+ i__2 = i__;
+ q__2.r = q__3.r * cy[i__2].r - q__3.i * cy[i__2].i, q__2.i = q__3.r *
+ cy[i__2].i + q__3.i * cy[i__2].r;
+ q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
+ ctemp.r = q__1.r, ctemp.i = q__1.i;
+/* L30: */
+ }
+ ret_val->r = ctemp.r, ret_val->i = ctemp.i;
+ return ;
+} /* cdotc_ */
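+
+/*
+  Editor's note -- illustrative sketch only, not part of the generated f2c
+  translation.  cdotc_ is a complex-valued function in Fortran terms, so the
+  f2c translation returns the result through the extra first argument rather
+  than as a C return value.  The name example_cdotc_usage is hypothetical;
+  types come from f2c.h as above.  cdotu_ below is called the same way, it
+  simply skips the conjugation of x.  Wrapped in #if 0 so it is never
+  compiled.
+*/
+#if 0
+static void example_cdotc_usage(void)
+{
+    complex cx[2] = {{0.f,1.f}, {1.f,0.f}};   /* x = (i, 1) */
+    complex cy[2] = {{0.f,1.f}, {2.f,0.f}};   /* y = (i, 2) */
+    complex dot;
+    integer n = 2, inc = 1;
+
+    cdotc_(&dot, &n, cx, &inc, cy, &inc);     /* dot := conjg(x)' * y */
+    /* dot = conj(i)*i + conj(1)*2 = 1 + 2 = 3 + 0i */
+}
+#endif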
+
+/* Complex */ VOID cdotu_(complex * ret_val, integer *n, complex *cx, integer
+ *incx, complex *cy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static complex ctemp;
+
+
+/*
+ Purpose
+ =======
+
+ CDOTU forms the dot product of two vectors.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ ctemp.r = 0.f, ctemp.i = 0.f;
+ ret_val->r = 0.f, ret_val->i = 0.f;
+ if (*n <= 0) {
+ return ;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ i__3 = iy;
+ q__2.r = cx[i__2].r * cy[i__3].r - cx[i__2].i * cy[i__3].i, q__2.i =
+ cx[i__2].r * cy[i__3].i + cx[i__2].i * cy[i__3].r;
+ q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
+ ctemp.r = q__1.r, ctemp.i = q__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ ret_val->r = ctemp.r, ret_val->i = ctemp.i;
+ return ;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = cx[i__2].r * cy[i__3].r - cx[i__2].i * cy[i__3].i, q__2.i =
+ cx[i__2].r * cy[i__3].i + cx[i__2].i * cy[i__3].r;
+ q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
+ ctemp.r = q__1.r, ctemp.i = q__1.i;
+/* L30: */
+ }
+ ret_val->r = ctemp.r, ret_val->i = ctemp.i;
+ return ;
+} /* cdotu_ */
+
+/* Subroutine */ int cgemm_(char *transa, char *transb, integer *m, integer *
+ n, integer *k, complex *alpha, complex *a, integer *lda, complex *b,
+ integer *ldb, complex *beta, complex *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5, i__6;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static logical nota, notb;
+ static complex temp;
+ static logical conja, conjb;
+ static integer ncola;
+ extern logical lsame_(char *, char *);
+ static integer nrowa, nrowb;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CGEMM performs one of the matrix-matrix operations
+
+ C := alpha*op( A )*op( B ) + beta*C,
+
+ where op( X ) is one of
+
+ op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ),
+
+ alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n', op( A ) = A.
+
+ TRANSA = 'T' or 't', op( A ) = A'.
+
+ TRANSA = 'C' or 'c', op( A ) = conjg( A' ).
+
+ Unchanged on exit.
+
+ TRANSB - CHARACTER*1.
+ On entry, TRANSB specifies the form of op( B ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSB = 'N' or 'n', op( B ) = B.
+
+ TRANSB = 'T' or 't', op( B ) = B'.
+
+ TRANSB = 'C' or 'c', op( B ) = conjg( B' ).
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix
+ op( A ) and of the matrix C. M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix
+ op( B ) and the number of columns of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry, K specifies the number of columns of the matrix
+ op( A ) and the number of rows of the matrix op( B ). K must
+ be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANSA = 'N' or 'n', and is m otherwise.
+ Before entry with TRANSA = 'N' or 'n', the leading m by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by m part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANSA = 'N' or 'n' then
+ LDA must be at least max( 1, m ), otherwise LDA must be at
+ least max( 1, k ).
+ Unchanged on exit.
+
+ B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is
+ n when TRANSB = 'N' or 'n', and is k otherwise.
+ Before entry with TRANSB = 'N' or 'n', the leading k by n
+ part of the array B must contain the matrix B, otherwise
+ the leading n by k part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANSB = 'N' or 'n' then
+ LDB must be at least max( 1, k ), otherwise LDB must be at
+ least max( 1, n ).
+ Unchanged on exit.
+
+ BETA - COMPLEX .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then C need not be set on input.
+ Unchanged on exit.
+
+ C - COMPLEX array of DIMENSION ( LDC, n ).
+ Before entry, the leading m by n part of the array C must
+ contain the matrix C, except when beta is zero, in which
+ case C need not be set on entry.
+ On exit, the array C is overwritten by the m by n matrix
+ ( alpha*op( A )*op( B ) + beta*C ).
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Set NOTA and NOTB as true if A and B respectively are not
+ conjugated or transposed, set CONJA and CONJB as true if A and
+ B respectively are to be transposed but not conjugated and set
+ NROWA, NCOLA and NROWB as the number of rows and columns of A
+ and the number of rows of B respectively.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ nota = lsame_(transa, "N");
+ notb = lsame_(transb, "N");
+ conja = lsame_(transa, "C");
+ conjb = lsame_(transb, "C");
+ if (nota) {
+ nrowa = *m;
+ ncola = *k;
+ } else {
+ nrowa = *k;
+ ncola = *m;
+ }
+ if (notb) {
+ nrowb = *k;
+ } else {
+ nrowb = *n;
+ }
+
+/* Test the input parameters. */
+
+ info = 0;
+ if (! nota && ! conja && ! lsame_(transa, "T")) {
+ info = 1;
+ } else if (! notb && ! conjb && ! lsame_(transb, "T")) {
+ info = 2;
+ } else if (*m < 0) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*k < 0) {
+ info = 5;
+ } else if (*lda < max(1,nrowa)) {
+ info = 8;
+ } else if (*ldb < max(1,nrowb)) {
+ info = 10;
+ } else if (*ldc < max(1,*m)) {
+ info = 13;
+ }
+ if (info != 0) {
+ xerbla_("CGEMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || (alpha->r == 0.f && alpha->i == 0.f || *k == 0)
+ && (beta->r == 1.f && beta->i == 0.f)) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0.f && alpha->i == 0.f) {
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4].i,
+ q__1.i = beta->r * c__[i__4].i + beta->i * c__[
+ i__4].r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (notb) {
+ if (nota) {
+
+/* Form C := alpha*A*B + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L50: */
+ }
+ } else if (beta->r != 1.f || beta->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__1.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L60: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = l + j * b_dim1;
+ if (b[i__3].r != 0.f || b[i__3].i != 0.f) {
+ i__3 = l + j * b_dim1;
+ q__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3].i,
+ q__1.i = alpha->r * b[i__3].i + alpha->i * b[
+ i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ q__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ q__1.r = c__[i__5].r + q__2.r, q__1.i = c__[i__5]
+ .i + q__2.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+/* L90: */
+ }
+ } else if (conja) {
+
+/* Form C := alpha*conjg( A' )*B + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * b_dim1;
+ q__2.r = q__3.r * b[i__4].r - q__3.i * b[i__4].i,
+ q__2.i = q__3.r * b[i__4].i + q__3.i * b[i__4]
+ .r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+ }
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L110: */
+ }
+/* L120: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + i__ * a_dim1;
+ i__5 = l + j * b_dim1;
+ q__2.r = a[i__4].r * b[i__5].r - a[i__4].i * b[i__5]
+ .i, q__2.i = a[i__4].r * b[i__5].i + a[i__4]
+ .i * b[i__5].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L130: */
+ }
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L140: */
+ }
+/* L150: */
+ }
+ }
+ } else if (nota) {
+ if (conjb) {
+
+/* Form C := alpha*A*conjg( B' ) + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L160: */
+ }
+ } else if (beta->r != 1.f || beta->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__1.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L170: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * b_dim1;
+ if (b[i__3].r != 0.f || b[i__3].i != 0.f) {
+ r_cnjg(&q__2, &b[j + l * b_dim1]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i,
+ q__1.i = alpha->r * q__2.i + alpha->i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ q__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ q__1.r = c__[i__5].r + q__2.r, q__1.i = c__[i__5]
+ .i + q__2.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L180: */
+ }
+ }
+/* L190: */
+ }
+/* L200: */
+ }
+ } else {
+
+/* Form C := alpha*A*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L210: */
+ }
+ } else if (beta->r != 1.f || beta->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__1.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L220: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * b_dim1;
+ if (b[i__3].r != 0.f || b[i__3].i != 0.f) {
+ i__3 = j + l * b_dim1;
+ q__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3].i,
+ q__1.i = alpha->r * b[i__3].i + alpha->i * b[
+ i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ q__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ q__1.r = c__[i__5].r + q__2.r, q__1.i = c__[i__5]
+ .i + q__2.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+/* L250: */
+ }
+ }
+ } else if (conja) {
+ if (conjb) {
+
+/* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ r_cnjg(&q__4, &b[j + l * b_dim1]);
+ q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, q__2.i =
+ q__3.r * q__4.i + q__3.i * q__4.r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L260: */
+ }
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L270: */
+ }
+/* L280: */
+ }
+ } else {
+
+/* Form C := alpha*conjg( A' )*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ i__4 = j + l * b_dim1;
+ q__2.r = q__3.r * b[i__4].r - q__3.i * b[i__4].i,
+ q__2.i = q__3.r * b[i__4].i + q__3.i * b[i__4]
+ .r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L290: */
+ }
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L300: */
+ }
+/* L310: */
+ }
+ }
+ } else {
+ if (conjb) {
+
+/* Form C := alpha*A'*conjg( B' ) + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + i__ * a_dim1;
+ r_cnjg(&q__3, &b[j + l * b_dim1]);
+ q__2.r = a[i__4].r * q__3.r - a[i__4].i * q__3.i,
+ q__2.i = a[i__4].r * q__3.i + a[i__4].i *
+ q__3.r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L320: */
+ }
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L330: */
+ }
+/* L340: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + i__ * a_dim1;
+ i__5 = j + l * b_dim1;
+ q__2.r = a[i__4].r * b[i__5].r - a[i__4].i * b[i__5]
+ .i, q__2.i = a[i__4].r * b[i__5].i + a[i__4]
+ .i * b[i__5].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L350: */
+ }
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, q__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L360: */
+ }
+/* L370: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CGEMM . */
+
+} /* cgemm_ */
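+
+/*
+  Editor's note -- illustrative sketch only, not part of the generated f2c
+  translation.  A minimal call of cgemm_ on 2x2 matrices, assuming the f2c.h
+  types used in this file; example_cgemm_usage is a hypothetical name.  As in
+  Fortran, matrices are stored column by column and LDA/LDB/LDC give the
+  column stride.  Wrapped in #if 0 so it is never compiled.
+*/
+#if 0
+static void example_cgemm_usage(void)
+{
+    /* Column-major 2x2 matrices: a is the identity, so c ends up equal to b. */
+    complex a[4] = {{1.f,0.f}, {0.f,0.f},    /* column 1 of A */
+                    {0.f,0.f}, {1.f,0.f}};   /* column 2 of A */
+    complex b[4] = {{1.f,2.f}, {3.f,4.f},
+                    {5.f,6.f}, {7.f,8.f}};
+    complex c[4];
+    complex one = {1.f,0.f}, zero = {0.f,0.f};
+    integer n2 = 2;
+
+    /* C := 1*A*B + 0*C  (no transposition of either operand) */
+    cgemm_("N", "N", &n2, &n2, &n2, &one, a, &n2, b, &n2, &zero, c, &n2);
+}
+#endif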
+
+/* Subroutine */ int cgemv_(char *trans, integer *m, integer *n, complex *
+ alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
+ beta, complex *y, integer *incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static complex temp;
+ static integer lenx, leny;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj;
+
+
+/*
+ Purpose
+ =======
+
+ CGEMV performs one of the matrix-vector operations
+
+ y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or
+
+ y := alpha*conjg( A' )*x + beta*y,
+
+ where alpha and beta are scalars, x and y are vectors and A is an
+ m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
+
+ TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
+
+ TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ X - COMPLEX array of DIMENSION at least
+ ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+ Before entry, the incremented array X must contain the
+ vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - COMPLEX .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - COMPLEX array of DIMENSION at least
+ ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+ Before entry with BETA non-zero, the incremented array Y
+ must contain the vector y. On exit, Y is overwritten by the
+ updated vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! lsame_(trans, "C")
+ ) {
+ info = 1;
+ } else if (*m < 0) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*lda < max(1,*m)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ } else if (*incy == 0) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("CGEMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r
+ == 1.f && beta->i == 0.f)) {
+ return 0;
+ }
+
+ noconj = lsame_(trans, "T");
+
+/*
+ Set LENX and LENY, the lengths of the vectors x and y, and set
+ up the start points in X and Y.
+*/
+
+ if (lsame_(trans, "N")) {
+ lenx = *n;
+ leny = *m;
+ } else {
+ lenx = *m;
+ leny = *n;
+ }
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (lenx - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (leny - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+
+ First form y := beta*y.
+*/
+
+ if (beta->r != 1.f || beta->i != 0.f) {
+ if (*incy == 1) {
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ y[i__2].r = 0.f, y[i__2].i = 0.f;
+/* L10: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ y[i__2].r = 0.f, y[i__2].i = 0.f;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = iy;
+ q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (alpha->r == 0.f && alpha->i == 0.f) {
+ return 0;
+ }
+ if (lsame_(trans, "N")) {
+
+/* Form y := alpha*A*x + y. */
+
+ jx = kx;
+ if (*incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+ i__2 = jx;
+ q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__2.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i +
+ q__2.i;
+ y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+/* L50: */
+ }
+ }
+ jx += *incx;
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+ i__2 = jx;
+ q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ q__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ iy = ky;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = iy;
+ i__4 = iy;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__2.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i +
+ q__2.i;
+ y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+ iy += *incy;
+/* L70: */
+ }
+ }
+ jx += *incx;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. */
+
+ jy = ky;
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp.r = 0.f, temp.i = 0.f;
+ if (noconj) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__;
+ q__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[i__4]
+ .i, q__2.i = a[i__3].r * x[i__4].i + a[i__3]
+ .i * x[i__4].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+ }
+ } else {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i,
+ q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3]
+ .r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+ }
+ }
+ i__2 = jy;
+ i__3 = jy;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i =
+ alpha->r * temp.i + alpha->i * temp.r;
+ q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ jy += *incy;
+/* L110: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp.r = 0.f, temp.i = 0.f;
+ ix = kx;
+ if (noconj) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = ix;
+ q__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[i__4]
+ .i, q__2.i = a[i__3].r * x[i__4].i + a[i__3]
+ .i * x[i__4].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix += *incx;
+/* L120: */
+ }
+ } else {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i,
+ q__2.i = q__3.r * x[i__3].i + q__3.i * x[i__3]
+ .r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix += *incx;
+/* L130: */
+ }
+ }
+ i__2 = jy;
+ i__3 = jy;
+ q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i =
+ alpha->r * temp.i + alpha->i * temp.r;
+ q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ jy += *incy;
+/* L140: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CGEMV . */
+
+} /* cgemv_ */
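+
+/*
+  Editor's note -- illustrative sketch only, not part of the generated f2c
+  translation.  A minimal call of cgemv_ with TRANS = 'N', assuming the f2c.h
+  types used in this file; example_cgemv_usage is a hypothetical name.
+  Wrapped in #if 0 so it is never compiled.
+*/
+#if 0
+static void example_cgemv_usage(void)
+{
+    /* A = [[1,3],[2,4]] stored column-major; y := 1*A*x + 0*y with x = (1,1). */
+    complex a[4] = {{1.f,0.f}, {2.f,0.f},    /* column 1 */
+                    {3.f,0.f}, {4.f,0.f}};   /* column 2 */
+    complex x[2] = {{1.f,0.f}, {1.f,0.f}};
+    complex y[2];
+    complex one = {1.f,0.f}, zero = {0.f,0.f};
+    integer n2 = 2, inc = 1;
+
+    cgemv_("N", &n2, &n2, &one, a, &n2, x, &inc, &zero, y, &inc);
+    /* y = (4, 6) */
+}
+#endif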
+
+/* Subroutine */ int cgerc_(integer *m, integer *n, complex *alpha, complex *
+ x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jy, kx, info;
+ static complex temp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CGERC performs the rank 1 operation
+
+ A := alpha*x*conjg( y' ) + A,
+
+ where alpha is a scalar, x is an m element vector, y is an n element
+ vector and A is an m by n matrix.
+
+ Arguments
+ ==========
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - COMPLEX array of dimension at least
+ ( 1 + ( m - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the m
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients. On exit, A is
+ overwritten by the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (*m < 0) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("CGERC ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
+ return 0;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (*incy > 0) {
+ jy = 1;
+ } else {
+ jy = 1 - (*n - 1) * *incy;
+ }
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0.f || y[i__2].i != 0.f) {
+ r_cnjg(&q__2, &y[jy]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i =
+ alpha->r * q__2.i + alpha->i * q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ q__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, q__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ q__1.r = a[i__4].r + q__2.r, q__1.i = a[i__4].i + q__2.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L10: */
+ }
+ }
+ jy += *incy;
+/* L20: */
+ }
+ } else {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*m - 1) * *incx;
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0.f || y[i__2].i != 0.f) {
+ r_cnjg(&q__2, &y[jy]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i =
+ alpha->r * q__2.i + alpha->i * q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ q__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, q__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ q__1.r = a[i__4].r + q__2.r, q__1.i = a[i__4].i + q__2.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ ix += *incx;
+/* L30: */
+ }
+ }
+ jy += *incy;
+/* L40: */
+ }
+ }
+
+ return 0;
+
+/* End of CGERC . */
+
+} /* cgerc_ */
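+
+/*
+  Editor's note -- illustrative sketch only, not part of the generated f2c
+  translation.  A minimal rank-1 update with cgerc_, assuming the f2c.h types
+  used in this file; example_cgerc_usage is a hypothetical name.  cgeru_
+  below takes the same arguments but does not conjugate y.  Wrapped in #if 0
+  so it is never compiled.
+*/
+#if 0
+static void example_cgerc_usage(void)
+{
+    /* A := 1*x*conjg(y') + A, starting from A = 0, so a(i,j) = x_i * conj(y_j). */
+    complex a[4] = {{0.f,0.f}, {0.f,0.f}, {0.f,0.f}, {0.f,0.f}};
+    complex x[2] = {{1.f,0.f}, {0.f,1.f}};   /* x = (1, i) */
+    complex y[2] = {{1.f,0.f}, {0.f,1.f}};   /* y = (1, i) */
+    complex one = {1.f,0.f};
+    integer n2 = 2, inc = 1;
+
+    cgerc_(&n2, &n2, &one, x, &inc, y, &inc, a, &n2);
+    /* e.g. a(2,2) = i * conj(i) = 1 + 0i */
+}
+#endif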
+
+/* Subroutine */ int cgeru_(integer *m, integer *n, complex *alpha, complex *
+ x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jy, kx, info;
+ static complex temp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CGERU performs the rank 1 operation
+
+ A := alpha*x*y' + A,
+
+ where alpha is a scalar, x is an m element vector, y is an n element
+ vector and A is an m by n matrix.
+
+ Arguments
+ ==========
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - COMPLEX array of dimension at least
+ ( 1 + ( m - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the m
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients. On exit, A is
+ overwritten by the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (*m < 0) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("CGERU ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
+ return 0;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (*incy > 0) {
+ jy = 1;
+ } else {
+ jy = 1 - (*n - 1) * *incy;
+ }
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0.f || y[i__2].i != 0.f) {
+ i__2 = jy;
+ q__1.r = alpha->r * y[i__2].r - alpha->i * y[i__2].i, q__1.i =
+ alpha->r * y[i__2].i + alpha->i * y[i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ q__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, q__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ q__1.r = a[i__4].r + q__2.r, q__1.i = a[i__4].i + q__2.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L10: */
+ }
+ }
+ jy += *incy;
+/* L20: */
+ }
+ } else {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*m - 1) * *incx;
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0.f || y[i__2].i != 0.f) {
+ i__2 = jy;
+ q__1.r = alpha->r * y[i__2].r - alpha->i * y[i__2].i, q__1.i =
+ alpha->r * y[i__2].i + alpha->i * y[i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ q__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, q__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ q__1.r = a[i__4].r + q__2.r, q__1.i = a[i__4].i + q__2.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ ix += *incx;
+/* L30: */
+ }
+ }
+ jy += *incy;
+/* L40: */
+ }
+ }
+
+ return 0;
+
+/* End of CGERU . */
+
+} /* cgeru_ */
+
+/* Subroutine */ int chemv_(char *uplo, integer *n, complex *alpha, complex *
+ a, integer *lda, complex *x, integer *incx, complex *beta, complex *y,
+ integer *incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static complex temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CHEMV performs the matrix-vector operation
+
+ y := alpha*A*x + beta*y,
+
+ where alpha and beta are scalars, x and y are n element vectors and
+ A is an n by n hermitian matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of A is not referenced.
+ Note that the imaginary parts of the diagonal elements need
+ not be set and are assumed to be zero.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - COMPLEX .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y. On exit, Y is overwritten by the updated
+ vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*lda < max(1,*n)) {
+ info = 5;
+ } else if (*incx == 0) {
+ info = 7;
+ } else if (*incy == 0) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("CHEMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f &&
+ beta->i == 0.f)) {
+ return 0;
+ }
+
+/* Set up the start points in X and Y. */
+
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+
+ First form y := beta*y.
+*/
+
+ if (beta->r != 1.f || beta->i != 0.f) {
+ if (*incy == 1) {
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ y[i__2].r = 0.f, y[i__2].i = 0.f;
+/* L10: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (beta->r == 0.f && beta->i == 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ y[i__2].r = 0.f, y[i__2].i = 0.f;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = iy;
+ q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (alpha->r == 0.f && alpha->i == 0.f) {
+ return 0;
+ }
+ if (lsame_(uplo, "U")) {
+
+/* Form y when A is stored in upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ temp2.r = 0.f, temp2.i = 0.f;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+ y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+ q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+ q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+ temp2.r = q__1.r, temp2.i = q__1.i;
+/* L50: */
+ }
+ i__2 = j;
+ i__3 = j;
+ i__4 = j + j * a_dim1;
+ r__1 = a[i__4].r;
+ q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
+ q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
+ q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+/* L60: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ temp2.r = 0.f, temp2.i = 0.f;
+ ix = kx;
+ iy = ky;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = iy;
+ i__4 = iy;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+ y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+ q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+ q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L70: */
+ }
+ i__2 = jy;
+ i__3 = jy;
+ i__4 = j + j * a_dim1;
+ r__1 = a[i__4].r;
+ q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
+ q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
+ q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y when A is stored in lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ temp2.r = 0.f, temp2.i = 0.f;
+ i__2 = j;
+ i__3 = j;
+ i__4 = j + j * a_dim1;
+ r__1 = a[i__4].r;
+ q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
+ q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+ y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+ q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+ q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+ temp2.r = q__1.r, temp2.i = q__1.i;
+/* L90: */
+ }
+ i__2 = j;
+ i__3 = j;
+ q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+/* L100: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ temp2.r = 0.f, temp2.i = 0.f;
+ i__2 = jy;
+ i__3 = jy;
+ i__4 = j + j * a_dim1;
+ r__1 = a[i__4].r;
+ q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
+ q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ iy += *incy;
+ i__3 = iy;
+ i__4 = iy;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
+ y[i__3].r = q__1.r, y[i__3].i = q__1.i;
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
+ q__3.r * x[i__3].i + q__3.i * x[i__3].r;
+ q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+ temp2.r = q__1.r, temp2.i = q__1.i;
+/* L110: */
+ }
+ i__2 = jy;
+ i__3 = jy;
+ q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
+ y[i__2].r = q__1.r, y[i__2].i = q__1.i;
+ jx += *incx;
+ jy += *incy;
+/* L120: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CHEMV . */
+
+} /* chemv_ */
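+
+/*
+  Editor's note -- illustrative sketch only, not part of the generated f2c
+  translation.  A minimal call of chemv_ with UPLO = 'U', assuming the f2c.h
+  types used in this file; example_chemv_usage is a hypothetical name.  Only
+  the upper triangle of the column-major array is referenced, so the entry
+  below the diagonal may be left unset.  Wrapped in #if 0 so it is never
+  compiled.
+*/
+#if 0
+static void example_chemv_usage(void)
+{
+    /* Hermitian A = [[2, i], [-i, 3]]; only 2, i and 3 need to be stored. */
+    complex a[4] = {{2.f,0.f}, {0.f,0.f},    /* column 1: a(1,1); a(2,1) unused */
+                    {0.f,1.f}, {3.f,0.f}};   /* column 2: a(1,2) = i, a(2,2) = 3 */
+    complex x[2] = {{1.f,0.f}, {1.f,0.f}};
+    complex y[2];
+    complex one = {1.f,0.f}, zero = {0.f,0.f};
+    integer n2 = 2, inc = 1;
+
+    chemv_("U", &n2, &one, a, &n2, x, &inc, &zero, y, &inc);
+    /* y = (2 + i, 3 - i) */
+}
+#endif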
+
+/* Subroutine */ int cher2_(char *uplo, integer *n, complex *alpha, complex *
+ x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+ real r__1;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static complex temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CHER2 performs the hermitian rank 2 operation
+
+ A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+
+ where alpha is a scalar, x and y are n element vectors and A is an n
+ by n hermitian matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of A is not referenced. On exit, the
+ upper triangular part of the array A is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of A is not referenced. On exit, the
+ lower triangular part of the array A is overwritten by the
+ lower triangular part of the updated matrix.
+ Note that the imaginary parts of the diagonal elements need
+ not be set, they are assumed to be zero, and on exit they
+ are set to zero.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*n)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("CHER2 ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
+ return 0;
+ }
+
+/*
+ Set up the start points in X and Y if the increments are not both
+ unity.
+*/
+
+ if (*incx != 1 || *incy != 1) {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+ jx = kx;
+ jy = ky;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+*/
+
+ if (lsame_(uplo, "U")) {
+
+/* Form A when A is stored in the upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ i__3 = j;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f || (y[i__3].r != 0.f
+ || y[i__3].i != 0.f)) {
+ r_cnjg(&q__2, &y[j]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i =
+ alpha->r * q__2.i + alpha->i * q__2.r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ i__2 = j;
+ q__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ q__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ r_cnjg(&q__1, &q__2);
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ q__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ q__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ q__2.r = a[i__4].r + q__3.r, q__2.i = a[i__4].i +
+ q__3.i;
+ i__6 = i__;
+ q__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ q__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L10: */
+ }
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = j;
+ q__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ q__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = j;
+ q__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ q__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__1 = a[i__3].r + q__1.r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ i__3 = jy;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f || (y[i__3].r != 0.f
+ || y[i__3].i != 0.f)) {
+ r_cnjg(&q__2, &y[jy]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i =
+ alpha->r * q__2.i + alpha->i * q__2.r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ i__2 = jx;
+ q__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ q__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ r_cnjg(&q__1, &q__2);
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ ix = kx;
+ iy = ky;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ q__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ q__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ q__2.r = a[i__4].r + q__3.r, q__2.i = a[i__4].i +
+ q__3.i;
+ i__6 = iy;
+ q__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ q__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L30: */
+ }
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = jx;
+ q__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ q__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = jy;
+ q__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ q__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__1 = a[i__3].r + q__1.r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ }
+ jx += *incx;
+ jy += *incy;
+/* L40: */
+ }
+ }
+ } else {
+
+/* Form A when A is stored in the lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ i__3 = j;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f || (y[i__3].r != 0.f
+ || y[i__3].i != 0.f)) {
+ r_cnjg(&q__2, &y[j]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i =
+ alpha->r * q__2.i + alpha->i * q__2.r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ i__2 = j;
+ q__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ q__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ r_cnjg(&q__1, &q__2);
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = j;
+ q__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ q__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = j;
+ q__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ q__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__1 = a[i__3].r + q__1.r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ q__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ q__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ q__2.r = a[i__4].r + q__3.r, q__2.i = a[i__4].i +
+ q__3.i;
+ i__6 = i__;
+ q__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ q__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L50: */
+ }
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ i__3 = jy;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f || (y[i__3].r != 0.f
+ || y[i__3].i != 0.f)) {
+ r_cnjg(&q__2, &y[jy]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i =
+ alpha->r * q__2.i + alpha->i * q__2.r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ i__2 = jx;
+ q__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ q__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ r_cnjg(&q__1, &q__2);
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = jx;
+ q__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ q__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = jy;
+ q__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ q__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__1 = a[i__3].r + q__1.r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ iy += *incy;
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ q__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ q__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ q__2.r = a[i__4].r + q__3.r, q__2.i = a[i__4].i +
+ q__3.i;
+ i__6 = iy;
+ q__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ q__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L70: */
+ }
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ }
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CHER2 . */
+
+} /* cher2_ */
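+
+/*
+   Illustrative usage sketch (not part of the translated reference BLAS
+   source): a minimal call of cher2_ from C.  The argument order
+   (uplo, n, alpha, x, incx, y, incy, a, lda) is inferred from the
+   parameter names used in the body above and from the standard CHER2
+   interface; integer/complex are the f2c typedefs this file is compiled
+   with.  The block is guarded with "#if 0" so it is never built.
+*/
+#if 0
+static void cher2_usage_sketch(void)
+{
+    integer n = 2, inc = 1, lda = 2;
+    complex alpha = {1.f, 0.f};
+    complex x[2] = {{1.f, 0.f}, {0.f, 1.f}};
+    complex y[2] = {{2.f, 0.f}, {1.f, -1.f}};
+    /* 2 by 2 hermitian matrix stored column-major; only the upper
+       triangle is referenced and overwritten with
+       A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A. */
+    complex a[4] = {{4.f, 0.f}, {0.f, 0.f},
+                    {1.f, 1.f}, {3.f, 0.f}};
+
+    cher2_("U", &n, &alpha, x, &inc, y, &inc, a, &lda);
+}
+#endif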
+
+/* Subroutine */ int cher2k_(char *uplo, char *trans, integer *n, integer *k,
+ complex *alpha, complex *a, integer *lda, complex *b, integer *ldb,
+ real *beta, complex *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5, i__6, i__7;
+ real r__1;
+ complex q__1, q__2, q__3, q__4, q__5, q__6;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static complex temp1, temp2;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CHER2K performs one of the hermitian rank 2k operations
+
+ C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C,
+
+ or
+
+ C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C,
+
+ where alpha and beta are scalars with beta real, C is an n by n
+ hermitian matrix and A and B are n by k matrices in the first case
+ and k by n matrices in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) +
+ conjg( alpha )*B*conjg( A' ) +
+ beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*conjg( A' )*B +
+ conjg( alpha )*conjg( B' )*A +
+ beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrices A and B, and on entry with
+ TRANS = 'C' or 'c', K specifies the number of rows of the
+ matrices A and B. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array B must contain the matrix B, otherwise
+ the leading k by n part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDB must be at least max( 1, n ), otherwise LDB must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - COMPLEX array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+ Note that the imaginary parts of the diagonal elements need
+ not be set; they are assumed to be zero, and on exit they
+ are set to zero.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1.
+ Ed Anderson, Cray Research Inc.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldb < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldc < max(1,*n)) {
+ info = 12;
+ }
+ if (info != 0) {
+ xerbla_("CHER2K", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f || *k == 0) && *beta ==
+ 1.f) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0.f && alpha->i == 0.f) {
+ if (upper) {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L30: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/*
+ Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) +
+ C.
+*/
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L90: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L100: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ i__4 = j + l * b_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f || (b[i__4].r !=
+ 0.f || b[i__4].i != 0.f)) {
+ r_cnjg(&q__2, &b[j + l * b_dim1]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i,
+ q__1.i = alpha->r * q__2.i + alpha->i *
+ q__2.r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ i__3 = j + l * a_dim1;
+ q__2.r = alpha->r * a[i__3].r - alpha->i * a[i__3].i,
+ q__2.i = alpha->r * a[i__3].i + alpha->i * a[
+ i__3].r;
+ r_cnjg(&q__1, &q__2);
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__3.r = a[i__6].r * temp1.r - a[i__6].i *
+ temp1.i, q__3.i = a[i__6].r * temp1.i + a[
+ i__6].i * temp1.r;
+ q__2.r = c__[i__5].r + q__3.r, q__2.i = c__[i__5]
+ .i + q__3.i;
+ i__7 = i__ + l * b_dim1;
+ q__4.r = b[i__7].r * temp2.r - b[i__7].i *
+ temp2.i, q__4.i = b[i__7].r * temp2.i + b[
+ i__7].i * temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i +
+ q__4.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L110: */
+ }
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = j + l * a_dim1;
+ q__2.r = a[i__5].r * temp1.r - a[i__5].i * temp1.i,
+ q__2.i = a[i__5].r * temp1.i + a[i__5].i *
+ temp1.r;
+ i__6 = j + l * b_dim1;
+ q__3.r = b[i__6].r * temp2.r - b[i__6].i * temp2.i,
+ q__3.i = b[i__6].r * temp2.i + b[i__6].i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__1 = c__[i__4].r + q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L140: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L150: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ i__4 = j + l * b_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f || (b[i__4].r !=
+ 0.f || b[i__4].i != 0.f)) {
+ r_cnjg(&q__2, &b[j + l * b_dim1]);
+ q__1.r = alpha->r * q__2.r - alpha->i * q__2.i,
+ q__1.i = alpha->r * q__2.i + alpha->i *
+ q__2.r;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ i__3 = j + l * a_dim1;
+ q__2.r = alpha->r * a[i__3].r - alpha->i * a[i__3].i,
+ q__2.i = alpha->r * a[i__3].i + alpha->i * a[
+ i__3].r;
+ r_cnjg(&q__1, &q__2);
+ temp2.r = q__1.r, temp2.i = q__1.i;
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__3.r = a[i__6].r * temp1.r - a[i__6].i *
+ temp1.i, q__3.i = a[i__6].r * temp1.i + a[
+ i__6].i * temp1.r;
+ q__2.r = c__[i__5].r + q__3.r, q__2.i = c__[i__5]
+ .i + q__3.i;
+ i__7 = i__ + l * b_dim1;
+ q__4.r = b[i__7].r * temp2.r - b[i__7].i *
+ temp2.i, q__4.i = b[i__7].r * temp2.i + b[
+ i__7].i * temp2.r;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i +
+ q__4.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L160: */
+ }
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = j + l * a_dim1;
+ q__2.r = a[i__5].r * temp1.r - a[i__5].i * temp1.i,
+ q__2.i = a[i__5].r * temp1.i + a[i__5].i *
+ temp1.r;
+ i__6 = j + l * b_dim1;
+ q__3.r = b[i__6].r * temp2.r - b[i__6].i * temp2.i,
+ q__3.i = b[i__6].r * temp2.i + b[i__6].i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__1 = c__[i__4].r + q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/*
+ Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A +
+ C.
+*/
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp1.r = 0.f, temp1.i = 0.f;
+ temp2.r = 0.f, temp2.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * b_dim1;
+ q__2.r = q__3.r * b[i__4].r - q__3.i * b[i__4].i,
+ q__2.i = q__3.r * b[i__4].i + q__3.i * b[i__4]
+ .r;
+ q__1.r = temp1.r + q__2.r, q__1.i = temp1.i + q__2.i;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ r_cnjg(&q__3, &b[l + i__ * b_dim1]);
+ i__4 = l + j * a_dim1;
+ q__2.r = q__3.r * a[i__4].r - q__3.i * a[i__4].i,
+ q__2.i = q__3.r * a[i__4].i + q__3.i * a[i__4]
+ .r;
+ q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+ temp2.r = q__1.r, temp2.i = q__1.i;
+/* L190: */
+ }
+ if (i__ == j) {
+ if (*beta == 0.f) {
+ i__3 = j + j * c_dim1;
+ q__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ r_cnjg(&q__4, alpha);
+ q__3.r = q__4.r * temp2.r - q__4.i * temp2.i,
+ q__3.i = q__4.r * temp2.i + q__4.i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ r__1 = q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ } else {
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ q__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ r_cnjg(&q__4, alpha);
+ q__3.r = q__4.r * temp2.r - q__4.i * temp2.i,
+ q__3.i = q__4.r * temp2.i + q__4.i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ r__1 = *beta * c__[i__4].r + q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ }
+ } else {
+ if (*beta == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ r_cnjg(&q__4, alpha);
+ q__3.r = q__4.r * temp2.r - q__4.i * temp2.i,
+ q__3.i = q__4.r * temp2.i + q__4.i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = *beta * c__[i__4].r, q__3.i = *beta *
+ c__[i__4].i;
+ q__4.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__4.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ q__2.r = q__3.r + q__4.r, q__2.i = q__3.i +
+ q__4.i;
+ r_cnjg(&q__6, alpha);
+ q__5.r = q__6.r * temp2.r - q__6.i * temp2.i,
+ q__5.i = q__6.r * temp2.i + q__6.i *
+ temp2.r;
+ q__1.r = q__2.r + q__5.r, q__1.i = q__2.i +
+ q__5.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+ }
+/* L200: */
+ }
+/* L210: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ temp1.r = 0.f, temp1.i = 0.f;
+ temp2.r = 0.f, temp2.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * b_dim1;
+ q__2.r = q__3.r * b[i__4].r - q__3.i * b[i__4].i,
+ q__2.i = q__3.r * b[i__4].i + q__3.i * b[i__4]
+ .r;
+ q__1.r = temp1.r + q__2.r, q__1.i = temp1.i + q__2.i;
+ temp1.r = q__1.r, temp1.i = q__1.i;
+ r_cnjg(&q__3, &b[l + i__ * b_dim1]);
+ i__4 = l + j * a_dim1;
+ q__2.r = q__3.r * a[i__4].r - q__3.i * a[i__4].i,
+ q__2.i = q__3.r * a[i__4].i + q__3.i * a[i__4]
+ .r;
+ q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
+ temp2.r = q__1.r, temp2.i = q__1.i;
+/* L220: */
+ }
+ if (i__ == j) {
+ if (*beta == 0.f) {
+ i__3 = j + j * c_dim1;
+ q__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ r_cnjg(&q__4, alpha);
+ q__3.r = q__4.r * temp2.r - q__4.i * temp2.i,
+ q__3.i = q__4.r * temp2.i + q__4.i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ r__1 = q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ } else {
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ q__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ r_cnjg(&q__4, alpha);
+ q__3.r = q__4.r * temp2.r - q__4.i * temp2.i,
+ q__3.i = q__4.r * temp2.i + q__4.i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ r__1 = *beta * c__[i__4].r + q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ }
+ } else {
+ if (*beta == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ r_cnjg(&q__4, alpha);
+ q__3.r = q__4.r * temp2.r - q__4.i * temp2.i,
+ q__3.i = q__4.r * temp2.i + q__4.i *
+ temp2.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = *beta * c__[i__4].r, q__3.i = *beta *
+ c__[i__4].i;
+ q__4.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ q__4.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ q__2.r = q__3.r + q__4.r, q__2.i = q__3.i +
+ q__4.i;
+ r_cnjg(&q__6, alpha);
+ q__5.r = q__6.r * temp2.r - q__6.i * temp2.i,
+ q__5.i = q__6.r * temp2.i + q__6.i *
+ temp2.r;
+ q__1.r = q__2.r + q__5.r, q__1.i = q__2.i +
+ q__5.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CHER2K. */
+
+} /* cher2k_ */
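+
+/*
+   Illustrative usage sketch (not part of the translated reference BLAS
+   source): a minimal call of cher2k_ with TRANS = 'N', i.e.
+   C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C with
+   A and B n by k.  integer/real/complex are the f2c typedefs this file is
+   compiled with; the block is guarded with "#if 0" so it is never built.
+*/
+#if 0
+static void cher2k_usage_sketch(void)
+{
+    integer n = 2, k = 3, lda = 2, ldb = 2, ldc = 2;
+    complex alpha = {1.f, 0.f};
+    real beta = 1.f;
+    /* A and B are 2 by 3, stored column-major. */
+    complex a[6] = {{1.f, 0.f}, {0.f, 1.f}, {2.f, 0.f},
+                    {1.f, -1.f}, {0.f, 0.f}, {3.f, 0.f}};
+    complex b[6] = {{2.f, 0.f}, {1.f, 0.f}, {0.f, 1.f},
+                    {1.f, 0.f}, {2.f, -1.f}, {0.f, 0.f}};
+    /* C is 2 by 2 hermitian; with UPLO = 'L' only its lower triangle is
+       referenced and updated (beta = 1, so C must be set on entry). */
+    complex c[4] = {{5.f, 0.f}, {1.f, 1.f},
+                    {0.f, 0.f}, {4.f, 0.f}};
+
+    cher2k_("L", "N", &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+#endif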
+
+/* Subroutine */ int cherk_(char *uplo, char *trans, integer *n, integer *k,
+ real *alpha, complex *a, integer *lda, real *beta, complex *c__,
+ integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5,
+ i__6;
+ real r__1;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static complex temp;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static real rtemp;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ CHERK performs one of the hermitian rank k operations
+
+ C := alpha*A*conjg( A' ) + beta*C,
+
+ or
+
+ C := alpha*conjg( A' )*A + beta*C,
+
+ where alpha and beta are real scalars, C is an n by n hermitian
+ matrix and A is an n by k matrix in the first case and a k by n
+ matrix in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrix A, and on entry with
+ TRANS = 'C' or 'c', K specifies the number of rows of the
+ matrix A. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - COMPLEX array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+ Note that the imaginary parts of the diagonal elements need
+ not be set; they are assumed to be zero, and on exit they
+ are set to zero.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1.
+ Ed Anderson, Cray Research Inc.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldc < max(1,*n)) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("CHERK ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.f) {
+ if (upper) {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L30: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/* Form C := alpha*A*conjg( A' ) + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L90: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L100: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f) {
+ r_cnjg(&q__2, &a[j + l * a_dim1]);
+ q__1.r = *alpha * q__2.r, q__1.i = *alpha * q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ q__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ q__1.r = c__[i__5].r + q__2.r, q__1.i = c__[i__5]
+ .i + q__2.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L110: */
+ }
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = i__ + l * a_dim1;
+ q__1.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__1.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ r__1 = c__[i__4].r + q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L140: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ q__1.r = *beta * c__[i__4].r, q__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L150: */
+ }
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f) {
+ r_cnjg(&q__2, &a[j + l * a_dim1]);
+ q__1.r = *alpha * q__2.r, q__1.i = *alpha * q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = j + l * a_dim1;
+ q__1.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__1.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ r__1 = c__[i__4].r + q__1.r;
+ c__[i__3].r = r__1, c__[i__3].i = 0.f;
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ q__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ q__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ q__1.r = c__[i__5].r + q__2.r, q__1.i = c__[i__5]
+ .i + q__2.i;
+ c__[i__4].r = q__1.r, c__[i__4].i = q__1.i;
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/* Form C := alpha*conjg( A' )*A + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * a_dim1;
+ q__2.r = q__3.r * a[i__4].r - q__3.i * a[i__4].i,
+ q__2.i = q__3.r * a[i__4].i + q__3.i * a[i__4]
+ .r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L190: */
+ }
+ if (*beta == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = *alpha * temp.r, q__1.i = *alpha * temp.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = *alpha * temp.r, q__2.i = *alpha * temp.i;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = *beta * c__[i__4].r, q__3.i = *beta * c__[
+ i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L200: */
+ }
+ rtemp = 0.f;
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ r_cnjg(&q__3, &a[l + j * a_dim1]);
+ i__3 = l + j * a_dim1;
+ q__2.r = q__3.r * a[i__3].r - q__3.i * a[i__3].i, q__2.i =
+ q__3.r * a[i__3].i + q__3.i * a[i__3].r;
+ q__1.r = rtemp + q__2.r, q__1.i = q__2.i;
+ rtemp = q__1.r;
+/* L210: */
+ }
+ if (*beta == 0.f) {
+ i__2 = j + j * c_dim1;
+ r__1 = *alpha * rtemp;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *alpha * rtemp + *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ }
+/* L220: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ rtemp = 0.f;
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ r_cnjg(&q__3, &a[l + j * a_dim1]);
+ i__3 = l + j * a_dim1;
+ q__2.r = q__3.r * a[i__3].r - q__3.i * a[i__3].i, q__2.i =
+ q__3.r * a[i__3].i + q__3.i * a[i__3].r;
+ q__1.r = rtemp + q__2.r, q__1.i = q__2.i;
+ rtemp = q__1.r;
+/* L230: */
+ }
+ if (*beta == 0.f) {
+ i__2 = j + j * c_dim1;
+ r__1 = *alpha * rtemp;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ r__1 = *alpha * rtemp + *beta * c__[i__3].r;
+ c__[i__2].r = r__1, c__[i__2].i = 0.f;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ temp.r = 0.f, temp.i = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ r_cnjg(&q__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * a_dim1;
+ q__2.r = q__3.r * a[i__4].r - q__3.i * a[i__4].i,
+ q__2.i = q__3.r * a[i__4].i + q__3.i * a[i__4]
+ .r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L240: */
+ }
+ if (*beta == 0.f) {
+ i__3 = i__ + j * c_dim1;
+ q__1.r = *alpha * temp.r, q__1.i = *alpha * temp.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ q__2.r = *alpha * temp.r, q__2.i = *alpha * temp.i;
+ i__4 = i__ + j * c_dim1;
+ q__3.r = *beta * c__[i__4].r, q__3.i = *beta * c__[
+ i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+ }
+/* L250: */
+ }
+/* L260: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CHERK . */
+
+} /* cherk_ */
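+
+/*
+   Illustrative usage sketch (not part of the translated reference BLAS
+   source): forming C := alpha*conjg( A' )*A + beta*C with cherk_ and
+   TRANS = 'C', so A is k by n and C is n by n hermitian.  With beta = 0
+   the array C need not be set on entry.  Guarded with "#if 0" so it is
+   never built; the typedefs are the f2c ones used throughout this file.
+*/
+#if 0
+static void cherk_usage_sketch(void)
+{
+    integer n = 2, k = 3, lda = 3, ldc = 2;
+    real alpha = 1.f, beta = 0.f;
+    /* A is 3 by 2, stored column-major; LDA >= max( 1, k ). */
+    complex a[6] = {{1.f, 0.f}, {0.f, 1.f}, {2.f, 0.f},
+                    {1.f, -1.f}, {0.f, 0.f}, {3.f, 0.f}};
+    complex c[4];
+
+    cherk_("U", "C", &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+}
+#endif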
+
+/* Subroutine */ int cscal_(integer *n, complex *ca, complex *cx, integer *
+ incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, nincx;
+
+
+/*
+ Purpose
+ =======
+
+ CSCAL scales a vector by a constant.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0 || *incx <= 0) {
+ return 0;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ nincx = *n * *incx;
+ i__1 = nincx;
+ i__2 = *incx;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ i__3 = i__;
+ i__4 = i__;
+ q__1.r = ca->r * cx[i__4].r - ca->i * cx[i__4].i, q__1.i = ca->r * cx[
+ i__4].i + ca->i * cx[i__4].r;
+ cx[i__3].r = q__1.r, cx[i__3].i = q__1.i;
+/* L10: */
+ }
+ return 0;
+
+/* code for increment equal to 1 */
+
+L20:
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__1 = i__;
+ i__3 = i__;
+ q__1.r = ca->r * cx[i__3].r - ca->i * cx[i__3].i, q__1.i = ca->r * cx[
+ i__3].i + ca->i * cx[i__3].r;
+ cx[i__1].r = q__1.r, cx[i__1].i = q__1.i;
+/* L30: */
+ }
+ return 0;
+} /* cscal_ */
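+
+/*
+   Illustrative usage sketch (not part of the translated reference BLAS
+   source): scaling a complex vector in place with cscal_ (csscal_ further
+   below is the variant taking a real scale factor).  Guarded with "#if 0"
+   so it is never built.
+*/
+#if 0
+static void cscal_usage_sketch(void)
+{
+    integer n = 3, incx = 1;
+    complex ca = {0.f, 1.f};    /* multiply every element by i */
+    complex cx[3] = {{1.f, 0.f}, {2.f, -1.f}, {0.f, 3.f}};
+
+    /* cx := ca * cx */
+    cscal_(&n, &ca, cx, &incx);
+}
+#endif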
+
+/* Subroutine */ int csrot_(integer *n, complex *cx, integer *incx, complex *
+ cy, integer *incy, real *c__, real *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static complex ctemp;
+
+
+/*
+ Purpose
+ =======
+
+ CSROT applies a plane rotation, where the cos and sin (c and s) are real
+ and the vectors cx and cy are complex.
+ jack dongarra, linpack, 3/11/78.
+
+ Arguments
+ ==========
+
+ N (input) INTEGER
+ On entry, N specifies the order of the vectors cx and cy.
+ N must be at least zero.
+ Unchanged on exit.
+
+ CX (input) COMPLEX array, dimension at least
+ ( 1 + ( N - 1 )*abs( INCX ) ).
+ Before entry, the incremented array CX must contain the n
+ element vector cx. On exit, CX is overwritten by the updated
+ vector cx.
+
+ INCX (input) INTEGER
+ On entry, INCX specifies the increment for the elements of
+ CX. INCX must not be zero.
+ Unchanged on exit.
+
+ CY (input) COMPLEX array, dimension at least
+ ( 1 + ( N - 1 )*abs( INCY ) ).
+ Before entry, the incremented array CY must contain the n
+ element vector cy. On exit, CY is overwritten by the updated
+ vector cy.
+
+ INCY (input) INTEGER
+ On entry, INCY specifies the increment for the elements of
+ CY. INCY must not be zero.
+ Unchanged on exit.
+
+ C (input) REAL
+ On entry, C specifies the cosine, cos.
+ Unchanged on exit.
+
+ S (input) REAL
+ On entry, S specifies the sine, sin.
+ Unchanged on exit.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ q__3.r = *s * cy[i__3].r, q__3.i = *s * cy[i__3].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ ctemp.r = q__1.r, ctemp.i = q__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ i__4 = ix;
+ q__3.r = *s * cx[i__4].r, q__3.i = *s * cx[i__4].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = ix;
+ cx[i__2].r = ctemp.r, cx[i__2].i = ctemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ q__3.r = *s * cy[i__3].r, q__3.i = *s * cy[i__3].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ ctemp.r = q__1.r, ctemp.i = q__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ i__4 = i__;
+ q__3.r = *s * cx[i__4].r, q__3.i = *s * cx[i__4].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = i__;
+ cx[i__2].r = ctemp.r, cx[i__2].i = ctemp.i;
+/* L30: */
+ }
+ return 0;
+} /* csrot_ */
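+
+/*
+   Illustrative usage sketch (not part of the translated reference BLAS
+   source): applying a real Givens rotation to a pair of complex vectors
+   with csrot_.  After the call, cx := c*cx + s*cy and cy := c*cy - s*cx
+   (using the old cx), element by element.  Guarded with "#if 0" so it is
+   never built.
+*/
+#if 0
+static void csrot_usage_sketch(void)
+{
+    integer n = 2, incx = 1, incy = 1;
+    real c = 0.8f, s = 0.6f;    /* c*c + s*s == 1 */
+    complex cx[2] = {{1.f, 0.f}, {0.f, 1.f}};
+    complex cy[2] = {{2.f, 0.f}, {1.f, -1.f}};
+
+    csrot_(&n, cx, &incx, cy, &incy, &c, &s);
+}
+#endif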
+
+/* Subroutine */ int csscal_(integer *n, real *sa, complex *cx, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ real r__1, r__2;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, nincx;
+
+
+/*
+ Purpose
+ =======
+
+ CSSCAL scales a complex vector by a real constant.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0 || *incx <= 0) {
+ return 0;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ nincx = *n * *incx;
+ i__1 = nincx;
+ i__2 = *incx;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ i__3 = i__;
+ i__4 = i__;
+ r__1 = *sa * cx[i__4].r;
+ r__2 = *sa * r_imag(&cx[i__]);
+ q__1.r = r__1, q__1.i = r__2;
+ cx[i__3].r = q__1.r, cx[i__3].i = q__1.i;
+/* L10: */
+ }
+ return 0;
+
+/* code for increment equal to 1 */
+
+L20:
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__1 = i__;
+ i__3 = i__;
+ r__1 = *sa * cx[i__3].r;
+ r__2 = *sa * r_imag(&cx[i__]);
+ q__1.r = r__1, q__1.i = r__2;
+ cx[i__1].r = q__1.r, cx[i__1].i = q__1.i;
+/* L30: */
+ }
+ return 0;
+} /* csscal_ */
+
+/* Subroutine */ int cswap_(integer *n, complex *cx, integer *incx, complex *
+ cy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static complex ctemp;
+
+
+/*
+ Purpose
+ =======
+
+ CSWAP interchanges two vectors.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ ctemp.r = cx[i__2].r, ctemp.i = cx[i__2].i;
+ i__2 = ix;
+ i__3 = iy;
+ cx[i__2].r = cy[i__3].r, cx[i__2].i = cy[i__3].i;
+ i__2 = iy;
+ cy[i__2].r = ctemp.r, cy[i__2].i = ctemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ ctemp.r = cx[i__2].r, ctemp.i = cx[i__2].i;
+ i__2 = i__;
+ i__3 = i__;
+ cx[i__2].r = cy[i__3].r, cx[i__2].i = cy[i__3].i;
+ i__2 = i__;
+ cy[i__2].r = ctemp.r, cy[i__2].i = ctemp.i;
+/* L30: */
+ }
+ return 0;
+} /* cswap_ */
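+
+/*
+   Illustrative usage sketch (not part of the translated reference BLAS
+   source): exchanging two complex vectors with cswap_.  A negative INCX
+   makes the first vector be traversed from its last element, as in the
+   reference BLAS.  Guarded with "#if 0" so it is never built.
+*/
+#if 0
+static void cswap_usage_sketch(void)
+{
+    integer n = 2, incx = -1, incy = 1;
+    complex cx[2] = {{1.f, 0.f}, {0.f, 1.f}};
+    complex cy[2] = {{2.f, 0.f}, {1.f, -1.f}};
+
+    cswap_(&n, cx, &incx, cy, &incy);
+}
+#endif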
+
+/* Subroutine */ int ctrmm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, complex *alpha, complex *a, integer *lda,
+ complex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5,
+ i__6;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static complex temp;
+ extern logical lsame_(char *, char *);
+ static logical lside;
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ CTRMM performs one of the matrix-matrix operations
+
+ B := alpha*op( A )*B, or B := alpha*B*op( A )
+
+ where alpha is a scalar, B is an m by n matrix, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ).
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) multiplies B from
+ the left or right as follows:
+
+ SIDE = 'L' or 'l' B := alpha*op( A )*B.
+
+ SIDE = 'R' or 'r' B := alpha*B*op( A ).
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = conjg( A' ).
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - COMPLEX array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the matrix B, and on exit is overwritten by the
+ transformed matrix.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ noconj = lsame_(transa, "T");
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("CTRMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0.f && alpha->i == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ b[i__3].r = 0.f, b[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*A*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + j * b_dim1;
+ if (b[i__3].r != 0.f || b[i__3].i != 0.f) {
+ i__3 = k + j * b_dim1;
+ q__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3]
+ .i, q__1.i = alpha->r * b[i__3].i +
+ alpha->i * b[i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = k - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * a_dim1;
+ q__2.r = temp.r * a[i__6].r - temp.i * a[i__6]
+ .i, q__2.i = temp.r * a[i__6].i +
+ temp.i * a[i__6].r;
+ q__1.r = b[i__5].r + q__2.r, q__1.i = b[i__5]
+ .i + q__2.i;
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L30: */
+ }
+ if (nounit) {
+ i__3 = k + k * a_dim1;
+ q__1.r = temp.r * a[i__3].r - temp.i * a[i__3]
+ .i, q__1.i = temp.r * a[i__3].i +
+ temp.i * a[i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__3 = k + j * b_dim1;
+ b[i__3].r = temp.r, b[i__3].i = temp.i;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (k = *m; k >= 1; --k) {
+ i__2 = k + j * b_dim1;
+ if (b[i__2].r != 0.f || b[i__2].i != 0.f) {
+ i__2 = k + j * b_dim1;
+ q__1.r = alpha->r * b[i__2].r - alpha->i * b[i__2]
+ .i, q__1.i = alpha->r * b[i__2].i +
+ alpha->i * b[i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = k + j * b_dim1;
+ b[i__2].r = temp.r, b[i__2].i = temp.i;
+ if (nounit) {
+ i__2 = k + j * b_dim1;
+ i__3 = k + j * b_dim1;
+ i__4 = k + k * a_dim1;
+ q__1.r = b[i__3].r * a[i__4].r - b[i__3].i *
+ a[i__4].i, q__1.i = b[i__3].r * a[
+ i__4].i + b[i__3].i * a[i__4].r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+ }
+ i__2 = *m;
+ for (i__ = k + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5]
+ .i, q__2.i = temp.r * a[i__5].i +
+ temp.i * a[i__5].r;
+ q__1.r = b[i__4].r + q__2.r, q__1.i = b[i__4]
+ .i + q__2.i;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L60: */
+ }
+ }
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*A'*B or B := alpha*conjg( A' )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ i__2 = i__ + j * b_dim1;
+ temp.r = b[i__2].r, temp.i = b[i__2].i;
+ if (noconj) {
+ if (nounit) {
+ i__2 = i__ + i__ * a_dim1;
+ q__1.r = temp.r * a[i__2].r - temp.i * a[i__2]
+ .i, q__1.i = temp.r * a[i__2].i +
+ temp.i * a[i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = i__ - 1;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + i__ * a_dim1;
+ i__4 = k + j * b_dim1;
+ q__2.r = a[i__3].r * b[i__4].r - a[i__3].i *
+ b[i__4].i, q__2.i = a[i__3].r * b[
+ i__4].i + a[i__3].i * b[i__4].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+ }
+ } else {
+ if (nounit) {
+ r_cnjg(&q__2, &a[i__ + i__ * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = i__ - 1;
+ for (k = 1; k <= i__2; ++k) {
+ r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+ i__3 = k + j * b_dim1;
+ q__2.r = q__3.r * b[i__3].r - q__3.i * b[i__3]
+ .i, q__2.i = q__3.r * b[i__3].i +
+ q__3.i * b[i__3].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+ }
+ }
+ i__2 = i__ + j * b_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L110: */
+ }
+/* L120: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ temp.r = b[i__3].r, temp.i = b[i__3].i;
+ if (noconj) {
+ if (nounit) {
+ i__3 = i__ + i__ * a_dim1;
+ q__1.r = temp.r * a[i__3].r - temp.i * a[i__3]
+ .i, q__1.i = temp.r * a[i__3].i +
+ temp.i * a[i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__3 = *m;
+ for (k = i__ + 1; k <= i__3; ++k) {
+ i__4 = k + i__ * a_dim1;
+ i__5 = k + j * b_dim1;
+ q__2.r = a[i__4].r * b[i__5].r - a[i__4].i *
+ b[i__5].i, q__2.i = a[i__4].r * b[
+ i__5].i + a[i__4].i * b[i__5].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L130: */
+ }
+ } else {
+ if (nounit) {
+ r_cnjg(&q__2, &a[i__ + i__ * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__3 = *m;
+ for (k = i__ + 1; k <= i__3; ++k) {
+ r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+ i__4 = k + j * b_dim1;
+ q__2.r = q__3.r * b[i__4].r - q__3.i * b[i__4]
+ .i, q__2.i = q__3.r * b[i__4].i +
+ q__3.i * b[i__4].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L140: */
+ }
+ }
+ i__3 = i__ + j * b_dim1;
+ q__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ q__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L150: */
+ }
+/* L160: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*A. */
+
+ if (upper) {
+ for (j = *n; j >= 1; --j) {
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ q__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ q__1.i = temp.r * a[i__1].i + temp.i * a[i__1]
+ .r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * b_dim1;
+ i__3 = i__ + j * b_dim1;
+ q__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ q__1.i = temp.r * b[i__3].i + temp.i * b[i__3]
+ .r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L170: */
+ }
+ i__1 = j - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k + j * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+ i__2 = k + j * a_dim1;
+ q__1.r = alpha->r * a[i__2].r - alpha->i * a[i__2]
+ .i, q__1.i = alpha->r * a[i__2].i +
+ alpha->i * a[i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * b_dim1;
+ q__2.r = temp.r * b[i__5].r - temp.i * b[i__5]
+ .i, q__2.i = temp.r * b[i__5].i +
+ temp.i * b[i__5].r;
+ q__1.r = b[i__4].r + q__2.r, q__1.i = b[i__4]
+ .i + q__2.i;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L180: */
+ }
+ }
+/* L190: */
+ }
+/* L200: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ q__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ q__1.i = temp.r * a[i__2].i + temp.i * a[i__2]
+ .r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ q__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ q__1.i = temp.r * b[i__4].i + temp.i * b[i__4]
+ .r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L210: */
+ }
+ i__2 = *n;
+ for (k = j + 1; k <= i__2; ++k) {
+ i__3 = k + j * a_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f) {
+ i__3 = k + j * a_dim1;
+ q__1.r = alpha->r * a[i__3].r - alpha->i * a[i__3]
+ .i, q__1.i = alpha->r * a[i__3].i +
+ alpha->i * a[i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * b_dim1;
+ q__2.r = temp.r * b[i__6].r - temp.i * b[i__6]
+ .i, q__2.i = temp.r * b[i__6].i +
+ temp.i * b[i__6].r;
+ q__1.r = b[i__5].r + q__2.r, q__1.i = b[i__5]
+ .i + q__2.i;
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L220: */
+ }
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*B*A' or B := alpha*B*conjg( A' ). */
+
+ if (upper) {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k - 1;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = j + k * a_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f) {
+ if (noconj) {
+ i__3 = j + k * a_dim1;
+ q__1.r = alpha->r * a[i__3].r - alpha->i * a[
+ i__3].i, q__1.i = alpha->r * a[i__3]
+ .i + alpha->i * a[i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ } else {
+ r_cnjg(&q__2, &a[j + k * a_dim1]);
+ q__1.r = alpha->r * q__2.r - alpha->i *
+ q__2.i, q__1.i = alpha->r * q__2.i +
+ alpha->i * q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * b_dim1;
+ q__2.r = temp.r * b[i__6].r - temp.i * b[i__6]
+ .i, q__2.i = temp.r * b[i__6].i +
+ temp.i * b[i__6].r;
+ q__1.r = b[i__5].r + q__2.r, q__1.i = b[i__5]
+ .i + q__2.i;
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ if (noconj) {
+ i__2 = k + k * a_dim1;
+ q__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ q__1.i = temp.r * a[i__2].i + temp.i * a[
+ i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ } else {
+ r_cnjg(&q__2, &a[k + k * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ if (temp.r != 1.f || temp.i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + k * b_dim1;
+ i__4 = i__ + k * b_dim1;
+ q__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ q__1.i = temp.r * b[i__4].i + temp.i * b[
+ i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L270: */
+ }
+ }
+/* L280: */
+ }
+ } else {
+ for (k = *n; k >= 1; --k) {
+ i__1 = *n;
+ for (j = k + 1; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+ if (noconj) {
+ i__2 = j + k * a_dim1;
+ q__1.r = alpha->r * a[i__2].r - alpha->i * a[
+ i__2].i, q__1.i = alpha->r * a[i__2]
+ .i + alpha->i * a[i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ } else {
+ r_cnjg(&q__2, &a[j + k * a_dim1]);
+ q__1.r = alpha->r * q__2.r - alpha->i *
+ q__2.i, q__1.i = alpha->r * q__2.i +
+ alpha->i * q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * b_dim1;
+ q__2.r = temp.r * b[i__5].r - temp.i * b[i__5]
+ .i, q__2.i = temp.r * b[i__5].i +
+ temp.i * b[i__5].r;
+ q__1.r = b[i__4].r + q__2.r, q__1.i = b[i__4]
+ .i + q__2.i;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L290: */
+ }
+ }
+/* L300: */
+ }
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ if (noconj) {
+ i__1 = k + k * a_dim1;
+ q__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ q__1.i = temp.r * a[i__1].i + temp.i * a[
+ i__1].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ } else {
+ r_cnjg(&q__2, &a[k + k * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ if (temp.r != 1.f || temp.i != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + k * b_dim1;
+ i__3 = i__ + k * b_dim1;
+ q__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ q__1.i = temp.r * b[i__3].i + temp.i * b[
+ i__3].r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L310: */
+ }
+ }
+/* L320: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CTRMM . */
+
+} /* ctrmm_ */
+
+/* Subroutine */ int ctrmv_(char *uplo, char *trans, char *diag, integer *n,
+ complex *a, integer *lda, complex *x, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, ix, jx, kx, info;
+ static complex temp;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ CTRMV performs one of the matrix-vector operations
+
+ x := A*x, or x := A'*x, or x := conjg( A' )*x,
+
+ where x is an n element vector and A is an n by n unit, or non-unit,
+ upper or lower triangular matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' x := A*x.
+
+ TRANS = 'T' or 't' x := A'*x.
+
+ TRANS = 'C' or 'c' x := conjg( A' )*x.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit
+ triangular as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x. On exit, X is overwritten with the
+             transformed vector x.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*lda < max(1,*n)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ }
+ if (info != 0) {
+ xerbla_("CTRMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ noconj = lsame_(trans, "T");
+ nounit = lsame_(diag, "N");
+
+/*
+ Set up the start point in X if the increment is not unity. This
+ will be ( N - 1 )*INCX too small for descending loops.
+*/
+
+ if (*incx <= 0) {
+ kx = 1 - (*n - 1) * *incx;
+ } else if (*incx != 1) {
+ kx = 1;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (lsame_(trans, "N")) {
+
+/* Form x := A*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ q__1.r = x[i__4].r + q__2.r, q__1.i = x[i__4].i +
+ q__2.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+/* L10: */
+ }
+ if (nounit) {
+ i__2 = j;
+ i__3 = j;
+ i__4 = j + j * a_dim1;
+ q__1.r = x[i__3].r * a[i__4].r - x[i__3].i * a[
+ i__4].i, q__1.i = x[i__3].r * a[i__4].i +
+ x[i__3].i * a[i__4].r;
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+ }
+ }
+/* L20: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ ix = kx;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = ix;
+ i__4 = ix;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ q__1.r = x[i__4].r + q__2.r, q__1.i = x[i__4].i +
+ q__2.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ ix += *incx;
+/* L30: */
+ }
+ if (nounit) {
+ i__2 = jx;
+ i__3 = jx;
+ i__4 = j + j * a_dim1;
+ q__1.r = x[i__3].r * a[i__4].r - x[i__3].i * a[
+ i__4].i, q__1.i = x[i__3].r * a[i__4].i +
+ x[i__3].i * a[i__4].r;
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+ }
+ }
+ jx += *incx;
+/* L40: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = i__;
+ i__3 = i__;
+ i__4 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
+ q__2.i = temp.r * a[i__4].i + temp.i * a[
+ i__4].r;
+ q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i +
+ q__2.i;
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+/* L50: */
+ }
+ if (nounit) {
+ i__1 = j;
+ i__2 = j;
+ i__3 = j + j * a_dim1;
+ q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
+ i__3].i, q__1.i = x[i__2].r * a[i__3].i +
+ x[i__2].i * a[i__3].r;
+ x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+ }
+ }
+/* L60: */
+ }
+ } else {
+ kx += (*n - 1) * *incx;
+ jx = kx;
+ for (j = *n; j >= 1; --j) {
+ i__1 = jx;
+ if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ ix = kx;
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = ix;
+ i__3 = ix;
+ i__4 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
+ q__2.i = temp.r * a[i__4].i + temp.i * a[
+ i__4].r;
+ q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i +
+ q__2.i;
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+ ix -= *incx;
+/* L70: */
+ }
+ if (nounit) {
+ i__1 = jx;
+ i__2 = jx;
+ i__3 = j + j * a_dim1;
+ q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
+ i__3].i, q__1.i = x[i__2].r * a[i__3].i +
+ x[i__2].i * a[i__3].r;
+ x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+ }
+ }
+ jx -= *incx;
+/* L80: */
+ }
+ }
+ }
+ } else {
+
+/* Form x := A'*x or x := conjg( A' )*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ if (noconj) {
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ q__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ q__1.i = temp.r * a[i__1].i + temp.i * a[
+ i__1].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ i__1 = i__ + j * a_dim1;
+ i__2 = i__;
+ q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+ i__2].i, q__2.i = a[i__1].r * x[i__2].i +
+ a[i__1].i * x[i__2].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+ }
+ } else {
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__1 = i__;
+ q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i,
+ q__2.i = q__3.r * x[i__1].i + q__3.i * x[
+ i__1].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+ }
+ }
+ i__1 = j;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+/* L110: */
+ }
+ } else {
+ jx = kx + (*n - 1) * *incx;
+ for (j = *n; j >= 1; --j) {
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ ix = jx;
+ if (noconj) {
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ q__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ q__1.i = temp.r * a[i__1].i + temp.i * a[
+ i__1].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ i__1 = i__ + j * a_dim1;
+ i__2 = ix;
+ q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+ i__2].i, q__2.i = a[i__1].r * x[i__2].i +
+ a[i__1].i * x[i__2].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L120: */
+ }
+ } else {
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__1 = ix;
+ q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i,
+ q__2.i = q__3.r * x[i__1].i + q__3.i * x[
+ i__1].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L130: */
+ }
+ }
+ i__1 = jx;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+ jx -= *incx;
+/* L140: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ if (noconj) {
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ q__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ q__1.i = temp.r * a[i__2].i + temp.i * a[
+ i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__;
+ q__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, q__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L150: */
+ }
+ } else {
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i,
+ q__2.i = q__3.r * x[i__3].i + q__3.i * x[
+ i__3].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L160: */
+ }
+ }
+ i__2 = j;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+/* L170: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ ix = jx;
+ if (noconj) {
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ q__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ q__1.i = temp.r * a[i__2].i + temp.i * a[
+ i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ i__3 = i__ + j * a_dim1;
+ i__4 = ix;
+ q__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, q__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L180: */
+ }
+ } else {
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ q__1.r = temp.r * q__2.r - temp.i * q__2.i,
+ q__1.i = temp.r * q__2.i + temp.i *
+ q__2.r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i,
+ q__2.i = q__3.r * x[i__3].i + q__3.i * x[
+ i__3].r;
+ q__1.r = temp.r + q__2.r, q__1.i = temp.i +
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L190: */
+ }
+ }
+ i__2 = jx;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+ jx += *incx;
+/* L200: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CTRMV . */
+
+} /* ctrmv_ */
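+
+/*
+   A minimal usage sketch for ctrmv_ (illustration only, not part of the
+   reference BLAS source). It assumes the f2c-style "integer" and "complex"
+   types from the accompanying header and a 2 x 2 upper triangular matrix
+   A = [1 2; 0 3] stored column-major; the values are hypothetical:
+
+       integer n = 2, lda = 2, incx = 1;
+       complex a[4] = {{1.f,0.f}, {0.f,0.f}, {2.f,0.f}, {3.f,0.f}};
+       complex x[2] = {{1.f,0.f}, {1.f,0.f}};
+       ctrmv_("U", "N", "N", &n, a, &lda, x, &incx);
+
+   On exit x holds A*x = (3, 3); the strictly lower element a[1] is never
+   referenced.
+*/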
+
+/* Subroutine */ int ctrsm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, complex *alpha, complex *a, integer *lda,
+ complex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5,
+ i__6, i__7;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static complex temp;
+ extern logical lsame_(char *, char *);
+ static logical lside;
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ CTRSM solves one of the matrix equations
+
+ op( A )*X = alpha*B, or X*op( A ) = alpha*B,
+
+ where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ).
+
+ The matrix X is overwritten on B.
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) appears on the left
+ or right of X as follows:
+
+ SIDE = 'L' or 'l' op( A )*X = alpha*B.
+
+ SIDE = 'R' or 'r' X*op( A ) = alpha*B.
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = conjg( A' ).
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX .
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - COMPLEX array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the right-hand side matrix B, and on exit is
+ overwritten by the solution matrix X.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ noconj = lsame_(transa, "T");
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("CTRSM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0.f && alpha->i == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ b[i__3].r = 0.f, b[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*inv( A )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (alpha->r != 1.f || alpha->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ q__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, q__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L30: */
+ }
+ }
+ for (k = *m; k >= 1; --k) {
+ i__2 = k + j * b_dim1;
+ if (b[i__2].r != 0.f || b[i__2].i != 0.f) {
+ if (nounit) {
+ i__2 = k + j * b_dim1;
+ c_div(&q__1, &b[k + j * b_dim1], &a[k + k *
+ a_dim1]);
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+ }
+ i__2 = k - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = k + j * b_dim1;
+ i__6 = i__ + k * a_dim1;
+ q__2.r = b[i__5].r * a[i__6].r - b[i__5].i *
+ a[i__6].i, q__2.i = b[i__5].r * a[
+ i__6].i + b[i__5].i * a[i__6].r;
+ q__1.r = b[i__4].r - q__2.r, q__1.i = b[i__4]
+ .i - q__2.i;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L40: */
+ }
+ }
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (alpha->r != 1.f || alpha->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ q__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, q__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L70: */
+ }
+ }
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + j * b_dim1;
+ if (b[i__3].r != 0.f || b[i__3].i != 0.f) {
+ if (nounit) {
+ i__3 = k + j * b_dim1;
+ c_div(&q__1, &b[k + j * b_dim1], &a[k + k *
+ a_dim1]);
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+ }
+ i__3 = *m;
+ for (i__ = k + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = k + j * b_dim1;
+ i__7 = i__ + k * a_dim1;
+ q__2.r = b[i__6].r * a[i__7].r - b[i__6].i *
+ a[i__7].i, q__2.i = b[i__6].r * a[
+ i__7].i + b[i__6].i * a[i__7].r;
+ q__1.r = b[i__5].r - q__2.r, q__1.i = b[i__5]
+ .i - q__2.i;
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L80: */
+ }
+ }
+/* L90: */
+ }
+/* L100: */
+ }
+ }
+ } else {
+
+/*
+ Form B := alpha*inv( A' )*B
+ or B := alpha*inv( conjg( A' ) )*B.
+*/
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ q__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3].i,
+ q__1.i = alpha->r * b[i__3].i + alpha->i * b[
+ i__3].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ if (noconj) {
+ i__3 = i__ - 1;
+ for (k = 1; k <= i__3; ++k) {
+ i__4 = k + i__ * a_dim1;
+ i__5 = k + j * b_dim1;
+ q__2.r = a[i__4].r * b[i__5].r - a[i__4].i *
+ b[i__5].i, q__2.i = a[i__4].r * b[
+ i__5].i + a[i__4].i * b[i__5].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L110: */
+ }
+ if (nounit) {
+ c_div(&q__1, &temp, &a[i__ + i__ * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ } else {
+ i__3 = i__ - 1;
+ for (k = 1; k <= i__3; ++k) {
+ r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+ i__4 = k + j * b_dim1;
+ q__2.r = q__3.r * b[i__4].r - q__3.i * b[i__4]
+ .i, q__2.i = q__3.r * b[i__4].i +
+ q__3.i * b[i__4].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L120: */
+ }
+ if (nounit) {
+ r_cnjg(&q__2, &a[i__ + i__ * a_dim1]);
+ c_div(&q__1, &temp, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ i__3 = i__ + j * b_dim1;
+ b[i__3].r = temp.r, b[i__3].i = temp.i;
+/* L130: */
+ }
+/* L140: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ i__2 = i__ + j * b_dim1;
+ q__1.r = alpha->r * b[i__2].r - alpha->i * b[i__2].i,
+ q__1.i = alpha->r * b[i__2].i + alpha->i * b[
+ i__2].r;
+ temp.r = q__1.r, temp.i = q__1.i;
+ if (noconj) {
+ i__2 = *m;
+ for (k = i__ + 1; k <= i__2; ++k) {
+ i__3 = k + i__ * a_dim1;
+ i__4 = k + j * b_dim1;
+ q__2.r = a[i__3].r * b[i__4].r - a[i__3].i *
+ b[i__4].i, q__2.i = a[i__3].r * b[
+ i__4].i + a[i__3].i * b[i__4].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L150: */
+ }
+ if (nounit) {
+ c_div(&q__1, &temp, &a[i__ + i__ * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ } else {
+ i__2 = *m;
+ for (k = i__ + 1; k <= i__2; ++k) {
+ r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+ i__3 = k + j * b_dim1;
+ q__2.r = q__3.r * b[i__3].r - q__3.i * b[i__3]
+ .i, q__2.i = q__3.r * b[i__3].i +
+ q__3.i * b[i__3].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L160: */
+ }
+ if (nounit) {
+ r_cnjg(&q__2, &a[i__ + i__ * a_dim1]);
+ c_div(&q__1, &temp, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ i__2 = i__ + j * b_dim1;
+ b[i__2].r = temp.r, b[i__2].i = temp.i;
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*inv( A ). */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (alpha->r != 1.f || alpha->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ q__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, q__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L190: */
+ }
+ }
+ i__2 = j - 1;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + j * a_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f) {
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = k + j * a_dim1;
+ i__7 = i__ + k * b_dim1;
+ q__2.r = a[i__6].r * b[i__7].r - a[i__6].i *
+ b[i__7].i, q__2.i = a[i__6].r * b[
+ i__7].i + a[i__6].i * b[i__7].r;
+ q__1.r = b[i__5].r - q__2.r, q__1.i = b[i__5]
+ .i - q__2.i;
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L200: */
+ }
+ }
+/* L210: */
+ }
+ if (nounit) {
+ c_div(&q__1, &c_b21, &a[j + j * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ q__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ q__1.i = temp.r * b[i__4].i + temp.i * b[
+ i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L220: */
+ }
+ }
+/* L230: */
+ }
+ } else {
+ for (j = *n; j >= 1; --j) {
+ if (alpha->r != 1.f || alpha->i != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * b_dim1;
+ i__3 = i__ + j * b_dim1;
+ q__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3]
+ .i, q__1.i = alpha->r * b[i__3].i +
+ alpha->i * b[i__3].r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L240: */
+ }
+ }
+ i__1 = *n;
+ for (k = j + 1; k <= i__1; ++k) {
+ i__2 = k + j * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = k + j * a_dim1;
+ i__6 = i__ + k * b_dim1;
+ q__2.r = a[i__5].r * b[i__6].r - a[i__5].i *
+ b[i__6].i, q__2.i = a[i__5].r * b[
+ i__6].i + a[i__5].i * b[i__6].r;
+ q__1.r = b[i__4].r - q__2.r, q__1.i = b[i__4]
+ .i - q__2.i;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ if (nounit) {
+ c_div(&q__1, &c_b21, &a[j + j * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * b_dim1;
+ i__3 = i__ + j * b_dim1;
+ q__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ q__1.i = temp.r * b[i__3].i + temp.i * b[
+ i__3].r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L270: */
+ }
+ }
+/* L280: */
+ }
+ }
+ } else {
+
+/*
+ Form B := alpha*B*inv( A' )
+ or B := alpha*B*inv( conjg( A' ) ).
+*/
+
+ if (upper) {
+ for (k = *n; k >= 1; --k) {
+ if (nounit) {
+ if (noconj) {
+ c_div(&q__1, &c_b21, &a[k + k * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ } else {
+ r_cnjg(&q__2, &a[k + k * a_dim1]);
+ c_div(&q__1, &c_b21, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + k * b_dim1;
+ i__3 = i__ + k * b_dim1;
+ q__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ q__1.i = temp.r * b[i__3].i + temp.i * b[
+ i__3].r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L290: */
+ }
+ }
+ i__1 = k - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+ if (noconj) {
+ i__2 = j + k * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ } else {
+ r_cnjg(&q__1, &a[j + k * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * b_dim1;
+ q__2.r = temp.r * b[i__5].r - temp.i * b[i__5]
+ .i, q__2.i = temp.r * b[i__5].i +
+ temp.i * b[i__5].r;
+ q__1.r = b[i__4].r - q__2.r, q__1.i = b[i__4]
+ .i - q__2.i;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L300: */
+ }
+ }
+/* L310: */
+ }
+ if (alpha->r != 1.f || alpha->i != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + k * b_dim1;
+ i__3 = i__ + k * b_dim1;
+ q__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3]
+ .i, q__1.i = alpha->r * b[i__3].i +
+ alpha->i * b[i__3].r;
+ b[i__2].r = q__1.r, b[i__2].i = q__1.i;
+/* L320: */
+ }
+ }
+/* L330: */
+ }
+ } else {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ if (nounit) {
+ if (noconj) {
+ c_div(&q__1, &c_b21, &a[k + k * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ } else {
+ r_cnjg(&q__2, &a[k + k * a_dim1]);
+ c_div(&q__1, &c_b21, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + k * b_dim1;
+ i__4 = i__ + k * b_dim1;
+ q__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ q__1.i = temp.r * b[i__4].i + temp.i * b[
+ i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L340: */
+ }
+ }
+ i__2 = *n;
+ for (j = k + 1; j <= i__2; ++j) {
+ i__3 = j + k * a_dim1;
+ if (a[i__3].r != 0.f || a[i__3].i != 0.f) {
+ if (noconj) {
+ i__3 = j + k * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ } else {
+ r_cnjg(&q__1, &a[j + k * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * b_dim1;
+ q__2.r = temp.r * b[i__6].r - temp.i * b[i__6]
+ .i, q__2.i = temp.r * b[i__6].i +
+ temp.i * b[i__6].r;
+ q__1.r = b[i__5].r - q__2.r, q__1.i = b[i__5]
+ .i - q__2.i;
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L350: */
+ }
+ }
+/* L360: */
+ }
+ if (alpha->r != 1.f || alpha->i != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + k * b_dim1;
+ i__4 = i__ + k * b_dim1;
+ q__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, q__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L370: */
+ }
+ }
+/* L380: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CTRSM . */
+
+} /* ctrsm_ */
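+
+/*
+   A minimal usage sketch for ctrsm_ (illustration only, not from the
+   reference BLAS source). It assumes the f2c-style "integer" and "complex"
+   types and solves A*X = B from the left for a 2 x 2 upper triangular
+   A = [2 1; 0 4] stored column-major, with one right-hand side:
+
+       integer m = 2, n = 1, lda = 2, ldb = 2;
+       complex alpha = {1.f, 0.f};
+       complex a[4] = {{2.f,0.f}, {0.f,0.f}, {1.f,0.f}, {4.f,0.f}};
+       complex b[2] = {{3.f,0.f}, {8.f,0.f}};
+       ctrsm_("L", "U", "N", "N", &m, &n, &alpha, a, &lda, b, &ldb);
+
+   On exit b is overwritten with X = (0.5, 2), since 4*x2 = 8 and
+   2*x1 + 1*x2 = 3.
+*/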
+
+/* Subroutine */ int ctrsv_(char *uplo, char *trans, char *diag, integer *n,
+ complex *a, integer *lda, complex *x, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, ix, jx, kx, info;
+ static complex temp;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ CTRSV solves one of the systems of equations
+
+ A*x = b, or A'*x = b, or conjg( A' )*x = b,
+
+ where b and x are n element vectors and A is an n by n unit, or
+ non-unit, upper or lower triangular matrix.
+
+ No test for singularity or near-singularity is included in this
+ routine. Such tests must be performed before calling this routine.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the equations to be solved as
+ follows:
+
+ TRANS = 'N' or 'n' A*x = b.
+
+ TRANS = 'T' or 't' A'*x = b.
+
+ TRANS = 'C' or 'c' conjg( A' )*x = b.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit
+ triangular as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ A - COMPLEX array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - COMPLEX array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element right-hand side vector b. On exit, X is overwritten
+ with the solution vector x.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*lda < max(1,*n)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ }
+ if (info != 0) {
+ xerbla_("CTRSV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ noconj = lsame_(trans, "T");
+ nounit = lsame_(diag, "N");
+
+/*
+ Set up the start point in X if the increment is not unity. This
+ will be ( N - 1 )*INCX too small for descending loops.
+*/
+
+ if (*incx <= 0) {
+ kx = 1 - (*n - 1) * *incx;
+ } else if (*incx != 1) {
+ kx = 1;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (lsame_(trans, "N")) {
+
+/* Form x := inv( A )*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
+ if (nounit) {
+ i__1 = j;
+ c_div(&q__1, &x[j], &a[j + j * a_dim1]);
+ x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+ }
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ i__1 = i__;
+ i__2 = i__;
+ i__3 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
+ q__2.i = temp.r * a[i__3].i + temp.i * a[
+ i__3].r;
+ q__1.r = x[i__2].r - q__2.r, q__1.i = x[i__2].i -
+ q__2.i;
+ x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else {
+ jx = kx + (*n - 1) * *incx;
+ for (j = *n; j >= 1; --j) {
+ i__1 = jx;
+ if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
+ if (nounit) {
+ i__1 = jx;
+ c_div(&q__1, &x[jx], &a[j + j * a_dim1]);
+ x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+ }
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ ix = jx;
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ i__1 = ix;
+ i__2 = ix;
+ i__3 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
+ q__2.i = temp.r * a[i__3].i + temp.i * a[
+ i__3].r;
+ q__1.r = x[i__2].r - q__2.r, q__1.i = x[i__2].i -
+ q__2.i;
+ x[i__1].r = q__1.r, x[i__1].i = q__1.i;
+/* L30: */
+ }
+ }
+ jx -= *incx;
+/* L40: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+ if (nounit) {
+ i__2 = j;
+ c_div(&q__1, &x[j], &a[j + j * a_dim1]);
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+ }
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ q__1.r = x[i__4].r - q__2.r, q__1.i = x[i__4].i -
+ q__2.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
+ if (nounit) {
+ i__2 = jx;
+ c_div(&q__1, &x[jx], &a[j + j * a_dim1]);
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+ }
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ ix = jx;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ i__3 = ix;
+ i__4 = ix;
+ i__5 = i__ + j * a_dim1;
+ q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ q__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ q__1.r = x[i__4].r - q__2.r, q__1.i = x[i__4].i -
+ q__2.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+/* L70: */
+ }
+ }
+ jx += *incx;
+/* L80: */
+ }
+ }
+ }
+ } else {
+
+/* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ if (noconj) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__;
+ q__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, q__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L90: */
+ }
+ if (nounit) {
+ c_div(&q__1, &temp, &a[j + j * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ } else {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i,
+ q__2.i = q__3.r * x[i__3].i + q__3.i * x[
+ i__3].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L100: */
+ }
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ c_div(&q__1, &temp, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ i__2 = j;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+/* L110: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ ix = kx;
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ if (noconj) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = ix;
+ q__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, q__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix += *incx;
+/* L120: */
+ }
+ if (nounit) {
+ c_div(&q__1, &temp, &a[j + j * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ } else {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i,
+ q__2.i = q__3.r * x[i__3].i + q__3.i * x[
+ i__3].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix += *incx;
+/* L130: */
+ }
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ c_div(&q__1, &temp, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ i__2 = jx;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+ jx += *incx;
+/* L140: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ if (noconj) {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__;
+ q__2.r = a[i__2].r * x[i__3].r - a[i__2].i * x[
+ i__3].i, q__2.i = a[i__2].r * x[i__3].i +
+ a[i__2].i * x[i__3].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L150: */
+ }
+ if (nounit) {
+ c_div(&q__1, &temp, &a[j + j * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ } else {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__2 = i__;
+ q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i,
+ q__2.i = q__3.r * x[i__2].i + q__3.i * x[
+ i__2].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+/* L160: */
+ }
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ c_div(&q__1, &temp, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ i__1 = j;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+/* L170: */
+ }
+ } else {
+ kx += (*n - 1) * *incx;
+ jx = kx;
+ for (j = *n; j >= 1; --j) {
+ ix = kx;
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ if (noconj) {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = ix;
+ q__2.r = a[i__2].r * x[i__3].r - a[i__2].i * x[
+ i__3].i, q__2.i = a[i__2].r * x[i__3].i +
+ a[i__2].i * x[i__3].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix -= *incx;
+/* L180: */
+ }
+ if (nounit) {
+ c_div(&q__1, &temp, &a[j + j * a_dim1]);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ } else {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ r_cnjg(&q__3, &a[i__ + j * a_dim1]);
+ i__2 = ix;
+ q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i,
+ q__2.i = q__3.r * x[i__2].i + q__3.i * x[
+ i__2].r;
+ q__1.r = temp.r - q__2.r, q__1.i = temp.i -
+ q__2.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ ix -= *incx;
+/* L190: */
+ }
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ c_div(&q__1, &temp, &q__2);
+ temp.r = q__1.r, temp.i = q__1.i;
+ }
+ }
+ i__1 = jx;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+ jx -= *incx;
+/* L200: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CTRSV . */
+
+} /* ctrsv_ */
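+
+/*
+   A minimal usage sketch for ctrsv_ (illustration only, not from the
+   reference BLAS source), assuming the f2c-style "integer" and "complex"
+   types and a 2 x 2 upper triangular A = [2 1; 0 4] stored column-major:
+
+       integer n = 2, lda = 2, incx = 1;
+       complex a[4] = {{2.f,0.f}, {0.f,0.f}, {1.f,0.f}, {4.f,0.f}};
+       complex x[2] = {{3.f,0.f}, {8.f,0.f}};
+       ctrsv_("U", "N", "N", &n, a, &lda, x, &incx);
+
+   On exit x holds the solution (0.5, 2) of A*x = b; no singularity check
+   is performed by the routine itself.
+*/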
+
+/* Subroutine */ int daxpy_(integer *n, doublereal *da, doublereal *dx,
+ integer *incx, doublereal *dy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+
+
+/*
+ Purpose
+ =======
+
+    DAXPY computes a constant times a vector plus a vector.
+    It uses unrolled loops for increments equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dy;
+ --dx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*da == 0.) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dy[iy] += *da * dx[ix];
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 4;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dy[i__] += *da * dx[i__];
+/* L30: */
+ }
+ if (*n < 4) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 4) {
+ dy[i__] += *da * dx[i__];
+ dy[i__ + 1] += *da * dx[i__ + 1];
+ dy[i__ + 2] += *da * dx[i__ + 2];
+ dy[i__ + 3] += *da * dx[i__ + 3];
+/* L50: */
+ }
+ return 0;
+} /* daxpy_ */
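+
+/*
+   A minimal usage sketch for daxpy_ (illustration only; the f2c-style
+   "integer" and "doublereal" types are assumed):
+
+       integer n = 3, incx = 1, incy = 1;
+       doublereal da = 2.;
+       doublereal dx[3] = {1., 2., 3.};
+       doublereal dy[3] = {10., 10., 10.};
+       daxpy_(&n, &da, dx, &incx, dy, &incy);
+
+   On exit dy holds {12., 14., 16.}, i.e. dy := dy + da*dx.
+*/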
+
+doublereal dcabs1_(doublecomplex *z__)
+{
+ /* System generated locals */
+ doublereal ret_val, d__1, d__2;
+
+/*
+ Purpose
+ =======
+
+    DCABS1 computes |Re(z)| + |Im(z)| of a double complex number z.
+
+ =====================================================================
+*/
+
+
+ ret_val = (d__1 = z__->r, abs(d__1)) + (d__2 = d_imag(z__), abs(d__2));
+ return ret_val;
+} /* dcabs1_ */
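+
+/*
+   A minimal usage sketch for dcabs1_ (illustration only; the f2c-style
+   "doublereal" and "doublecomplex" types are assumed):
+
+       doublecomplex z = {3., -4.};
+       doublereal v = dcabs1_(&z);
+
+   Here v == 7., i.e. |Re(z)| + |Im(z)|, not the Euclidean modulus 5.
+*/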
+
+/* Subroutine */ int dcopy_(integer *n, doublereal *dx, integer *incx,
+ doublereal *dy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+
+
+/*
+ Purpose
+ =======
+
+ DCOPY copies a vector, x, to a vector, y.
+    It uses unrolled loops for increments equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dy;
+ --dx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dy[iy] = dx[ix];
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 7;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dy[i__] = dx[i__];
+/* L30: */
+ }
+ if (*n < 7) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 7) {
+ dy[i__] = dx[i__];
+ dy[i__ + 1] = dx[i__ + 1];
+ dy[i__ + 2] = dx[i__ + 2];
+ dy[i__ + 3] = dx[i__ + 3];
+ dy[i__ + 4] = dx[i__ + 4];
+ dy[i__ + 5] = dx[i__ + 5];
+ dy[i__ + 6] = dx[i__ + 6];
+/* L50: */
+ }
+ return 0;
+} /* dcopy_ */
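+
+/*
+   A minimal usage sketch for dcopy_ (illustration only; the f2c-style
+   "integer" and "doublereal" types are assumed):
+
+       integer n = 3, incx = 1, incy = 1;
+       doublereal dx[3] = {1., 2., 3.};
+       doublereal dy[3];
+       dcopy_(&n, dx, &incx, dy, &incy);
+
+   On exit dy holds {1., 2., 3.}.
+*/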
+
+doublereal ddot_(integer *n, doublereal *dx, integer *incx, doublereal *dy,
+ integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal ret_val;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+ static doublereal dtemp;
+
+
+/*
+ Purpose
+ =======
+
+ DDOT forms the dot product of two vectors.
+    It uses unrolled loops for increments equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dy;
+ --dx;
+
+ /* Function Body */
+ ret_val = 0.;
+ dtemp = 0.;
+ if (*n <= 0) {
+ return ret_val;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dtemp += dx[ix] * dy[iy];
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ ret_val = dtemp;
+ return ret_val;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 5;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dtemp += dx[i__] * dy[i__];
+/* L30: */
+ }
+ if (*n < 5) {
+ goto L60;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 5) {
+ dtemp = dtemp + dx[i__] * dy[i__] + dx[i__ + 1] * dy[i__ + 1] + dx[
+ i__ + 2] * dy[i__ + 2] + dx[i__ + 3] * dy[i__ + 3] + dx[i__ +
+ 4] * dy[i__ + 4];
+/* L50: */
+ }
+L60:
+ ret_val = dtemp;
+ return ret_val;
+} /* ddot_ */
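+
+/*
+   A minimal usage sketch for ddot_ (illustration only; the f2c-style
+   "integer" and "doublereal" types are assumed):
+
+       integer n = 3, incx = 1, incy = 1;
+       doublereal dx[3] = {1., 2., 3.};
+       doublereal dy[3] = {4., 5., 6.};
+       doublereal d = ddot_(&n, dx, &incx, dy, &incy);
+
+   Here d == 32. (= 1*4 + 2*5 + 3*6).
+*/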
+
+/* Subroutine */ int dgemm_(char *transa, char *transb, integer *m, integer *
+ n, integer *k, doublereal *alpha, doublereal *a, integer *lda,
+ doublereal *b, integer *ldb, doublereal *beta, doublereal *c__,
+ integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static logical nota, notb;
+ static doublereal temp;
+ static integer ncola;
+ extern logical lsame_(char *, char *);
+ static integer nrowa, nrowb;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DGEMM performs one of the matrix-matrix operations
+
+ C := alpha*op( A )*op( B ) + beta*C,
+
+ where op( X ) is one of
+
+ op( X ) = X or op( X ) = X',
+
+ alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n', op( A ) = A.
+
+ TRANSA = 'T' or 't', op( A ) = A'.
+
+ TRANSA = 'C' or 'c', op( A ) = A'.
+
+ Unchanged on exit.
+
+ TRANSB - CHARACTER*1.
+ On entry, TRANSB specifies the form of op( B ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSB = 'N' or 'n', op( B ) = B.
+
+ TRANSB = 'T' or 't', op( B ) = B'.
+
+ TRANSB = 'C' or 'c', op( B ) = B'.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix
+ op( A ) and of the matrix C. M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix
+ op( B ) and the number of columns of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry, K specifies the number of columns of the matrix
+ op( A ) and the number of rows of the matrix op( B ). K must
+ be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANSA = 'N' or 'n', and is m otherwise.
+ Before entry with TRANSA = 'N' or 'n', the leading m by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by m part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANSA = 'N' or 'n' then
+ LDA must be at least max( 1, m ), otherwise LDA must be at
+ least max( 1, k ).
+ Unchanged on exit.
+
+ B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
+ n when TRANSB = 'N' or 'n', and is k otherwise.
+ Before entry with TRANSB = 'N' or 'n', the leading k by n
+ part of the array B must contain the matrix B, otherwise
+ the leading n by k part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANSB = 'N' or 'n' then
+ LDB must be at least max( 1, k ), otherwise LDB must be at
+ least max( 1, n ).
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION.
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then C need not be set on input.
+ Unchanged on exit.
+
+ C - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+ Before entry, the leading m by n part of the array C must
+ contain the matrix C, except when beta is zero, in which
+ case C need not be set on entry.
+ On exit, the array C is overwritten by the m by n matrix
+ ( alpha*op( A )*op( B ) + beta*C ).
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Set NOTA and NOTB as true if A and B respectively are not
+ transposed and set NROWA, NCOLA and NROWB as the number of rows
+ and columns of A and the number of rows of B respectively.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ nota = lsame_(transa, "N");
+ notb = lsame_(transb, "N");
+ if (nota) {
+ nrowa = *m;
+ ncola = *k;
+ } else {
+ nrowa = *k;
+ ncola = *m;
+ }
+ if (notb) {
+ nrowb = *k;
+ } else {
+ nrowb = *n;
+ }
+
+/* Test the input parameters. */
+
+ info = 0;
+ if (! nota && ! lsame_(transa, "C") && ! lsame_(
+ transa, "T")) {
+ info = 1;
+ } else if (! notb && ! lsame_(transb, "C") && !
+ lsame_(transb, "T")) {
+ info = 2;
+ } else if (*m < 0) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*k < 0) {
+ info = 5;
+ } else if (*lda < max(1,nrowa)) {
+ info = 8;
+ } else if (*ldb < max(1,nrowb)) {
+ info = 10;
+ } else if (*ldc < max(1,*m)) {
+ info = 13;
+ }
+ if (info != 0) {
+ xerbla_("DGEMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
+ return 0;
+ }
+
+/* And if alpha.eq.zero. */
+
+ if (*alpha == 0.) {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (notb) {
+ if (nota) {
+
+/* Form C := alpha*A*B + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L50: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L60: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (b[l + j * b_dim1] != 0.) {
+ temp = *alpha * b[l + j * b_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+/* L90: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+/* L100: */
+ }
+ if (*beta == 0.) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L110: */
+ }
+/* L120: */
+ }
+ }
+ } else {
+ if (nota) {
+
+/* Form C := alpha*A*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L130: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L140: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (b[j + l * b_dim1] != 0.) {
+ temp = *alpha * b[j + l * b_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L150: */
+ }
+ }
+/* L160: */
+ }
+/* L170: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * b[j + l * b_dim1];
+/* L180: */
+ }
+ if (*beta == 0.) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L190: */
+ }
+/* L200: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DGEMM . */
+
+} /* dgemm_ */
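+
+/*
+    A minimal usage sketch (values and variable names here are illustrative,
+    not taken from the library): every argument is passed by pointer and
+    matrices are stored column-major, as in the Fortran original.  Computing
+    C := A*B for 2 by 2 matrices looks like
+
+        doublereal a[4] = {1., 2., 3., 4.};    A = [1 3; 2 4], column-major
+        doublereal b[4] = {5., 6., 7., 8.};    B = [5 7; 6 8], column-major
+        doublereal c[4];
+        integer two = 2;
+        doublereal one = 1., zero = 0.;
+        dgemm_("N", "N", &two, &two, &two, &one, a, &two, b, &two,
+               &zero, c, &two);
+
+    on return c holds {23., 34., 31., 46.}, i.e. A*B in column-major order.
+*/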
+
+/* Subroutine */ int dgemv_(char *trans, integer *m, integer *n, doublereal *
+ alpha, doublereal *a, integer *lda, doublereal *x, integer *incx,
+ doublereal *beta, doublereal *y, integer *incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static doublereal temp;
+ static integer lenx, leny;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DGEMV performs one of the matrix-vector operations
+
+ y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
+
+ where alpha and beta are scalars, x and y are vectors and A is an
+ m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
+
+ TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
+
+ TRANS = 'C' or 'c' y := alpha*A'*x + beta*y.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ X - DOUBLE PRECISION array of DIMENSION at least
+ ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+ Before entry, the incremented array X must contain the
+ vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION.
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - DOUBLE PRECISION array of DIMENSION at least
+ ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+ Before entry with BETA non-zero, the incremented array Y
+ must contain the vector y. On exit, Y is overwritten by the
+ updated vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! lsame_(trans, "C")
+ ) {
+ info = 1;
+ } else if (*m < 0) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*lda < max(1,*m)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ } else if (*incy == 0) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("DGEMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) {
+ return 0;
+ }
+
+/*
+ Set LENX and LENY, the lengths of the vectors x and y, and set
+ up the start points in X and Y.
+*/
+
+ if (lsame_(trans, "N")) {
+ lenx = *n;
+ leny = *m;
+ } else {
+ lenx = *m;
+ leny = *n;
+ }
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (lenx - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (leny - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+
+ First form y := beta*y.
+*/
+
+ if (*beta != 1.) {
+ if (*incy == 1) {
+ if (*beta == 0.) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = 0.;
+/* L10: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = *beta * y[i__];
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (*beta == 0.) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = 0.;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = *beta * y[iy];
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (*alpha == 0.) {
+ return 0;
+ }
+ if (lsame_(trans, "N")) {
+
+/* Form y := alpha*A*x + y. */
+
+ jx = kx;
+ if (*incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.) {
+ temp = *alpha * x[jx];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[i__] += temp * a[i__ + j * a_dim1];
+/* L50: */
+ }
+ }
+ jx += *incx;
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.) {
+ temp = *alpha * x[jx];
+ iy = ky;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[iy] += temp * a[i__ + j * a_dim1];
+ iy += *incy;
+/* L70: */
+ }
+ }
+ jx += *incx;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y := alpha*A'*x + y. */
+
+ jy = ky;
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = 0.;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+ }
+ y[jy] += *alpha * temp;
+ jy += *incy;
+/* L100: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = 0.;
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp += a[i__ + j * a_dim1] * x[ix];
+ ix += *incx;
+/* L110: */
+ }
+ y[jy] += *alpha * temp;
+ jy += *incy;
+/* L120: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DGEMV . */
+
+} /* dgemv_ */
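+
+/*
+    A minimal usage sketch (illustrative values): y := A*x with the same
+    column-major, pass-by-pointer convention as above.
+
+        doublereal a[4] = {1., 2., 3., 4.};    A = [1 3; 2 4]
+        doublereal x[2] = {1., 1.}, y[2];
+        integer two = 2, ione = 1;
+        doublereal one = 1., zero = 0.;
+        dgemv_("N", &two, &two, &one, a, &two, x, &ione, &zero, y, &ione);
+
+    on return y holds {4., 6.}.
+*/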
+
+/* Subroutine */ int dger_(integer *m, integer *n, doublereal *alpha,
+ doublereal *x, integer *incx, doublereal *y, integer *incy,
+ doublereal *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jy, kx, info;
+ static doublereal temp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DGER performs the rank 1 operation
+
+ A := alpha*x*y' + A,
+
+ where alpha is a scalar, x is an m element vector, y is an n element
+ vector and A is an m by n matrix.
+
+ Arguments
+ ==========
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( m - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the m
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients. On exit, A is
+ overwritten by the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (*m < 0) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("DGER ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || *alpha == 0.) {
+ return 0;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (*incy > 0) {
+ jy = 1;
+ } else {
+ jy = 1 - (*n - 1) * *incy;
+ }
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (y[jy] != 0.) {
+ temp = *alpha * y[jy];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] += x[i__] * temp;
+/* L10: */
+ }
+ }
+ jy += *incy;
+/* L20: */
+ }
+ } else {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*m - 1) * *incx;
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (y[jy] != 0.) {
+ temp = *alpha * y[jy];
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] += x[ix] * temp;
+ ix += *incx;
+/* L30: */
+ }
+ }
+ jy += *incy;
+/* L40: */
+ }
+ }
+
+ return 0;
+
+/* End of DGER . */
+
+} /* dger_ */
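+
+/*
+    A minimal usage sketch (illustrative values): the rank-1 update
+    A := x*y' + A applied to a 2 by 2 zero matrix.  With x = (1,2)' and
+    y = (3,4)' the result is [3 4; 6 8], stored column-major as
+    {3., 6., 4., 8.}.
+
+        doublereal x[2] = {1., 2.}, y[2] = {3., 4.};
+        doublereal a[4] = {0., 0., 0., 0.};
+        integer two = 2, ione = 1;
+        doublereal one = 1.;
+        dger_(&two, &two, &one, x, &ione, y, &ione, a, &two);
+*/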
+
+doublereal dnrm2_(integer *n, doublereal *x, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ doublereal ret_val, d__1;
+
+ /* Local variables */
+ static integer ix;
+ static doublereal ssq, norm, scale, absxi;
+
+
+/*
+ Purpose
+ =======
+
+ DNRM2 returns the euclidean norm of a vector via the function
+ name, so that
+
+ DNRM2 := sqrt( x'*x )
+
+ Further Details
+ ===============
+
+ -- This version written on 25-October-1982.
+ Modified on 14-October-1993 to inline the call to DLASSQ.
+ Sven Hammarling, Nag Ltd.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n < 1 || *incx < 1) {
+ norm = 0.;
+ } else if (*n == 1) {
+ norm = abs(x[1]);
+ } else {
+ scale = 0.;
+ ssq = 1.;
+/*
+ The following loop is equivalent to this call to the LAPACK
+ auxiliary routine:
+ CALL DLASSQ( N, X, INCX, SCALE, SSQ )
+*/
+
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ if (x[ix] != 0.) {
+ absxi = (d__1 = x[ix], abs(d__1));
+ if (scale < absxi) {
+/* Computing 2nd power */
+ d__1 = scale / absxi;
+ ssq = ssq * (d__1 * d__1) + 1.;
+ scale = absxi;
+ } else {
+/* Computing 2nd power */
+ d__1 = absxi / scale;
+ ssq += d__1 * d__1;
+ }
+ }
+/* L10: */
+ }
+ norm = scale * sqrt(ssq);
+ }
+
+ ret_val = norm;
+ return ret_val;
+
+/* End of DNRM2. */
+
+} /* dnrm2_ */
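+
+/*
+    A minimal usage sketch (illustrative values): the scaled accumulation
+    above guards against overflow and underflow of the squares, but the
+    result is simply the Euclidean norm.  For x = (3, 4) with unit stride:
+
+        doublereal x[2] = {3., 4.};
+        integer n = 2, ione = 1;
+        doublereal nrm = dnrm2_(&n, x, &ione);    nrm == 5.0
+*/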
+
+/* Subroutine */ int drot_(integer *n, doublereal *dx, integer *incx,
+ doublereal *dy, integer *incy, doublereal *c__, doublereal *s)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static doublereal dtemp;
+
+
+/*
+ Purpose
+ =======
+
+ DROT applies a plane rotation.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dy;
+ --dx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dtemp = *c__ * dx[ix] + *s * dy[iy];
+ dy[iy] = *c__ * dy[iy] - *s * dx[ix];
+ dx[ix] = dtemp;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dtemp = *c__ * dx[i__] + *s * dy[i__];
+ dy[i__] = *c__ * dy[i__] - *s * dx[i__];
+ dx[i__] = dtemp;
+/* L30: */
+ }
+ return 0;
+} /* drot_ */
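+
+/*
+    A minimal usage sketch (illustrative values): with c = 0 and s = 1 the
+    rotation maps each pair (dx[i], dy[i]) to (dy[i], -dx[i]).
+
+        doublereal dx[2] = {1., 2.}, dy[2] = {3., 4.};
+        doublereal c = 0., s = 1.;
+        integer n = 2, ione = 1;
+        drot_(&n, dx, &ione, dy, &ione, &c, &s);
+
+    afterwards dx == {3., 4.} and dy == {-1., -2.}.
+*/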
+
+/* Subroutine */ int dscal_(integer *n, doublereal *da, doublereal *dx,
+ integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+
+ /* Local variables */
+ static integer i__, m, mp1, nincx;
+
+
+/*
+ Purpose
+ =======
+
+ DSCAL scales a vector by a constant.
+ uses unrolled loops for increment equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dx;
+
+ /* Function Body */
+ if (*n <= 0 || *incx <= 0) {
+ return 0;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ nincx = *n * *incx;
+ i__1 = nincx;
+ i__2 = *incx;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ dx[i__] = *da * dx[i__];
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for increment equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 5;
+ if (m == 0) {
+ goto L40;
+ }
+ i__2 = m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ dx[i__] = *da * dx[i__];
+/* L30: */
+ }
+ if (*n < 5) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__2 = *n;
+ for (i__ = mp1; i__ <= i__2; i__ += 5) {
+ dx[i__] = *da * dx[i__];
+ dx[i__ + 1] = *da * dx[i__ + 1];
+ dx[i__ + 2] = *da * dx[i__ + 2];
+ dx[i__ + 3] = *da * dx[i__ + 3];
+ dx[i__ + 4] = *da * dx[i__ + 4];
+/* L50: */
+ }
+ return 0;
+} /* dscal_ */
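+
+/*
+    A minimal usage sketch (illustrative values): scale a vector in place.
+
+        doublereal x[3] = {1., 2., 3.};
+        doublereal da = 2.;
+        integer n = 3, ione = 1;
+        dscal_(&n, &da, x, &ione);    x becomes {2., 4., 6.}
+*/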
+
+/* Subroutine */ int dswap_(integer *n, doublereal *dx, integer *incx,
+ doublereal *dy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+ static doublereal dtemp;
+
+
+/*
+ Purpose
+ =======
+
+    DSWAP interchanges two vectors.
+    uses unrolled loops for increments equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dy;
+ --dx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dtemp = dx[ix];
+ dx[ix] = dy[iy];
+ dy[iy] = dtemp;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 3;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dtemp = dx[i__];
+ dx[i__] = dy[i__];
+ dy[i__] = dtemp;
+/* L30: */
+ }
+ if (*n < 3) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 3) {
+ dtemp = dx[i__];
+ dx[i__] = dy[i__];
+ dy[i__] = dtemp;
+ dtemp = dx[i__ + 1];
+ dx[i__ + 1] = dy[i__ + 1];
+ dy[i__ + 1] = dtemp;
+ dtemp = dx[i__ + 2];
+ dx[i__ + 2] = dy[i__ + 2];
+ dy[i__ + 2] = dtemp;
+/* L50: */
+ }
+ return 0;
+} /* dswap_ */
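+
+/*
+    A minimal usage sketch (illustrative values): exchange the contents of
+    two vectors of equal length.
+
+        doublereal x[2] = {1., 2.}, y[2] = {3., 4.};
+        integer n = 2, ione = 1;
+        dswap_(&n, x, &ione, y, &ione);    x == {3., 4.}, y == {1., 2.}
+*/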
+
+/* Subroutine */ int dsymv_(char *uplo, integer *n, doublereal *alpha,
+ doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal
+ *beta, doublereal *y, integer *incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static doublereal temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DSYMV performs the matrix-vector operation
+
+ y := alpha*A*x + beta*y,
+
+ where alpha and beta are scalars, x and y are n element vectors and
+ A is an n by n symmetric matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of A is not referenced.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION.
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y. On exit, Y is overwritten by the updated
+ vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*lda < max(1,*n)) {
+ info = 5;
+ } else if (*incx == 0) {
+ info = 7;
+ } else if (*incy == 0) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("DSYMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || *alpha == 0. && *beta == 1.) {
+ return 0;
+ }
+
+/* Set up the start points in X and Y. */
+
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+
+ First form y := beta*y.
+*/
+
+ if (*beta != 1.) {
+ if (*incy == 1) {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = 0.;
+/* L10: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = *beta * y[i__];
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = 0.;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = *beta * y[iy];
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (*alpha == 0.) {
+ return 0;
+ }
+ if (lsame_(uplo, "U")) {
+
+/* Form y when A is stored in upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[j];
+ temp2 = 0.;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[i__] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[i__];
+/* L50: */
+ }
+ y[j] = y[j] + temp1 * a[j + j * a_dim1] + *alpha * temp2;
+/* L60: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[jx];
+ temp2 = 0.;
+ ix = kx;
+ iy = ky;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[iy] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[ix];
+ ix += *incx;
+ iy += *incy;
+/* L70: */
+ }
+ y[jy] = y[jy] + temp1 * a[j + j * a_dim1] + *alpha * temp2;
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y when A is stored in lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[j];
+ temp2 = 0.;
+ y[j] += temp1 * a[j + j * a_dim1];
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ y[i__] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+ }
+ y[j] += *alpha * temp2;
+/* L100: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[jx];
+ temp2 = 0.;
+ y[jy] += temp1 * a[j + j * a_dim1];
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ iy += *incy;
+ y[iy] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[ix];
+/* L110: */
+ }
+ y[jy] += *alpha * temp2;
+ jx += *incx;
+ jy += *incy;
+/* L120: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DSYMV . */
+
+} /* dsymv_ */
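+
+/*
+    A minimal usage sketch (illustrative values): y := A*x for the symmetric
+    matrix A = [2 1; 1 2] supplied through its upper triangle only; the
+    strictly lower entry of the array is never read.
+
+        doublereal a[4] = {2., 0., 1., 2.};    a[1] is not referenced
+        doublereal x[2] = {1., 1.}, y[2];
+        integer two = 2, ione = 1;
+        doublereal one = 1., zero = 0.;
+        dsymv_("U", &two, &one, a, &two, x, &ione, &zero, y, &ione);
+
+    on return y holds {3., 3.}.
+*/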
+
+/* Subroutine */ int dsyr2_(char *uplo, integer *n, doublereal *alpha,
+ doublereal *x, integer *incx, doublereal *y, integer *incy,
+ doublereal *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static doublereal temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DSYR2 performs the symmetric rank 2 operation
+
+ A := alpha*x*y' + alpha*y*x' + A,
+
+ where alpha is a scalar, x and y are n element vectors and A is an n
+ by n symmetric matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of A is not referenced. On exit, the
+ upper triangular part of the array A is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of A is not referenced. On exit, the
+ lower triangular part of the array A is overwritten by the
+ lower triangular part of the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*n)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("DSYR2 ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || *alpha == 0.) {
+ return 0;
+ }
+
+/*
+ Set up the start points in X and Y if the increments are not both
+ unity.
+*/
+
+ if (*incx != 1 || *incy != 1) {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+ jx = kx;
+ jy = ky;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+*/
+
+ if (lsame_(uplo, "U")) {
+
+/* Form A when A is stored in the upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[j] != 0. || y[j] != 0.) {
+ temp1 = *alpha * y[j];
+ temp2 = *alpha * x[j];
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[i__] *
+ temp1 + y[i__] * temp2;
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0. || y[jy] != 0.) {
+ temp1 = *alpha * y[jy];
+ temp2 = *alpha * x[jx];
+ ix = kx;
+ iy = ky;
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[ix] *
+ temp1 + y[iy] * temp2;
+ ix += *incx;
+ iy += *incy;
+/* L30: */
+ }
+ }
+ jx += *incx;
+ jy += *incy;
+/* L40: */
+ }
+ }
+ } else {
+
+/* Form A when A is stored in the lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[j] != 0. || y[j] != 0.) {
+ temp1 = *alpha * y[j];
+ temp2 = *alpha * x[j];
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[i__] *
+ temp1 + y[i__] * temp2;
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0. || y[jy] != 0.) {
+ temp1 = *alpha * y[jy];
+ temp2 = *alpha * x[jx];
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[ix] *
+ temp1 + y[iy] * temp2;
+ ix += *incx;
+ iy += *incy;
+/* L70: */
+ }
+ }
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DSYR2 . */
+
+} /* dsyr2_ */
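+
+/*
+    A minimal usage sketch (illustrative values): with x = (1,0)',
+    y = (0,1)' and alpha = 1, the update x*y' + y*x' adds 1 to the
+    off-diagonal entry; only the triangle selected by UPLO is written.
+
+        doublereal x[2] = {1., 0.}, y[2] = {0., 1.};
+        doublereal a[4] = {0., 0., 0., 0.};
+        integer two = 2, ione = 1;
+        doublereal one = 1.;
+        dsyr2_("U", &two, &one, x, &ione, y, &ione, a, &two);
+
+    afterwards a[2] (the (1,2) entry) is 1.; a[1] is left untouched.
+*/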
+
+/* Subroutine */ int dsyr2k_(char *uplo, char *trans, integer *n, integer *k,
+ doublereal *alpha, doublereal *a, integer *lda, doublereal *b,
+ integer *ldb, doublereal *beta, doublereal *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static doublereal temp1, temp2;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DSYR2K performs one of the symmetric rank 2k operations
+
+ C := alpha*A*B' + alpha*B*A' + beta*C,
+
+ or
+
+ C := alpha*A'*B + alpha*B'*A + beta*C,
+
+ where alpha and beta are scalars, C is an n by n symmetric matrix
+ and A and B are n by k matrices in the first case and k by n
+ matrices in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' +
+ beta*C.
+
+ TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A +
+ beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A +
+ beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrices A and B, and on entry with
+ TRANS = 'T' or 't' or 'C' or 'c', K specifies the number
+ of rows of the matrices A and B. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array B must contain the matrix B, otherwise
+ the leading k by n part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDB must be at least max( 1, n ), otherwise LDB must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION.
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldb < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldc < max(1,*n)) {
+ info = 12;
+ }
+ if (info != 0) {
+ xerbla_("DSYR2K", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.) {
+ if (upper) {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/* Form C := alpha*A*B' + alpha*B*A' + C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L90: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L100: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0. || b[j + l * b_dim1] != 0.) {
+ temp1 = *alpha * b[j + l * b_dim1];
+ temp2 = *alpha * a[j + l * a_dim1];
+ i__3 = j;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] = c__[i__ + j * c_dim1] + a[
+ i__ + l * a_dim1] * temp1 + b[i__ + l *
+ b_dim1] * temp2;
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L140: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L150: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0. || b[j + l * b_dim1] != 0.) {
+ temp1 = *alpha * b[j + l * b_dim1];
+ temp2 = *alpha * a[j + l * a_dim1];
+ i__3 = *n;
+ for (i__ = j; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] = c__[i__ + j * c_dim1] + a[
+ i__ + l * a_dim1] * temp1 + b[i__ + l *
+ b_dim1] * temp2;
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/* Form C := alpha*A'*B + alpha*B'*A + C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp1 = 0.;
+ temp2 = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp1 += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+ temp2 += b[l + i__ * b_dim1] * a[l + j * a_dim1];
+/* L190: */
+ }
+ if (*beta == 0.) {
+ c__[i__ + j * c_dim1] = *alpha * temp1 + *alpha *
+ temp2;
+ } else {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1]
+ + *alpha * temp1 + *alpha * temp2;
+ }
+/* L200: */
+ }
+/* L210: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ temp1 = 0.;
+ temp2 = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp1 += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+ temp2 += b[l + i__ * b_dim1] * a[l + j * a_dim1];
+/* L220: */
+ }
+ if (*beta == 0.) {
+ c__[i__ + j * c_dim1] = *alpha * temp1 + *alpha *
+ temp2;
+ } else {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1]
+ + *alpha * temp1 + *alpha * temp2;
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DSYR2K. */
+
+} /* dsyr2k_ */
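+
+/*
+    A minimal usage sketch (illustrative values): with n = 2, k = 1,
+    A = (1,2)' and B = (3,4)', the update A*B' + B*A' is the symmetric
+    matrix [6 10; 10 16]; only the UPLO triangle of C is written.
+
+        doublereal a[2] = {1., 2.}, b[2] = {3., 4.};
+        doublereal c[4] = {0., 0., 0., 0.};
+        integer n = 2, k = 1;
+        doublereal one = 1., zero = 0.;
+        dsyr2k_("U", "N", &n, &k, &one, a, &n, b, &n, &zero, c, &n);
+
+    afterwards c[0] == 6., c[2] == 10., c[3] == 16.
+*/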
+
+/* Subroutine */ int dsyrk_(char *uplo, char *trans, integer *n, integer *k,
+ doublereal *alpha, doublereal *a, integer *lda, doublereal *beta,
+ doublereal *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static doublereal temp;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ DSYRK performs one of the symmetric rank k operations
+
+ C := alpha*A*A' + beta*C,
+
+ or
+
+ C := alpha*A'*A + beta*C,
+
+ where alpha and beta are scalars, C is an n by n symmetric matrix
+ and A is an n by k matrix in the first case and a k by n matrix
+ in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*A' + beta*C.
+
+ TRANS = 'T' or 't' C := alpha*A'*A + beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*A'*A + beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrix A, and on entry with
+ TRANS = 'T' or 't' or 'C' or 'c', K specifies the number
+ of rows of the matrix A. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION.
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldc < max(1,*n)) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("DSYRK ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.) {
+ if (upper) {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/* Form C := alpha*A*A' + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L90: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L100: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0.) {
+ temp = *alpha * a[j + l * a_dim1];
+ i__3 = j;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.;
+/* L140: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L150: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0.) {
+ temp = *alpha * a[j + l * a_dim1];
+ i__3 = *n;
+ for (i__ = j; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/* Form C := alpha*A'*A + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * a[l + j * a_dim1];
+/* L190: */
+ }
+ if (*beta == 0.) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L200: */
+ }
+/* L210: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ temp = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * a[l + j * a_dim1];
+/* L220: */
+ }
+ if (*beta == 0.) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DSYRK . */
+
+} /* dsyrk_ */
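+
+/*
+    A minimal usage sketch (illustrative values): with n = 2, k = 1 and
+    A = (1,2)', the product A*A' is [1 2; 2 4]; only the UPLO triangle of
+    C is written.
+
+        doublereal a[2] = {1., 2.};
+        doublereal c[4] = {0., 0., 0., 0.};
+        integer n = 2, k = 1;
+        doublereal one = 1., zero = 0.;
+        dsyrk_("U", "N", &n, &k, &one, a, &n, &zero, c, &n);
+
+    afterwards c[0] == 1., c[2] == 2., c[3] == 4.
+*/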
+
+/* Subroutine */ int dtrmm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
+ lda, doublereal *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static doublereal temp;
+ static logical lside;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ Purpose
+ =======
+
+ DTRMM performs one of the matrix-matrix operations
+
+ B := alpha*op( A )*B, or B := alpha*B*op( A ),
+
+ where alpha is a scalar, B is an m by n matrix, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A'.
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) multiplies B from
+ the left or right as follows:
+
+ SIDE = 'L' or 'l' B := alpha*op( A )*B.
+
+ SIDE = 'R' or 'r' B := alpha*B*op( A ).
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = A'.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the matrix B, and on exit is overwritten by the
+ transformed matrix.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("DTRMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*A*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ if (b[k + j * b_dim1] != 0.) {
+ temp = *alpha * b[k + j * b_dim1];
+ i__3 = k - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] += temp * a[i__ + k *
+ a_dim1];
+/* L30: */
+ }
+ if (nounit) {
+ temp *= a[k + k * a_dim1];
+ }
+ b[k + j * b_dim1] = temp;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (k = *m; k >= 1; --k) {
+ if (b[k + j * b_dim1] != 0.) {
+ temp = *alpha * b[k + j * b_dim1];
+ b[k + j * b_dim1] = temp;
+ if (nounit) {
+ b[k + j * b_dim1] *= a[k + k * a_dim1];
+ }
+ i__2 = *m;
+ for (i__ = k + 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] += temp * a[i__ + k *
+ a_dim1];
+/* L60: */
+ }
+ }
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*A'*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ temp = b[i__ + j * b_dim1];
+ if (nounit) {
+ temp *= a[i__ + i__ * a_dim1];
+ }
+ i__2 = i__ - 1;
+ for (k = 1; k <= i__2; ++k) {
+ temp += a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L90: */
+ }
+ b[i__ + j * b_dim1] = *alpha * temp;
+/* L100: */
+ }
+/* L110: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = b[i__ + j * b_dim1];
+ if (nounit) {
+ temp *= a[i__ + i__ * a_dim1];
+ }
+ i__3 = *m;
+ for (k = i__ + 1; k <= i__3; ++k) {
+ temp += a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L120: */
+ }
+ b[i__ + j * b_dim1] = *alpha * temp;
+/* L130: */
+ }
+/* L140: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*A. */
+
+ if (upper) {
+ for (j = *n; j >= 1; --j) {
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L150: */
+ }
+ i__1 = j - 1;
+ for (k = 1; k <= i__1; ++k) {
+ if (a[k + j * a_dim1] != 0.) {
+ temp = *alpha * a[k + j * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L190: */
+ }
+ i__2 = *n;
+ for (k = j + 1; k <= i__2; ++k) {
+ if (a[k + j * a_dim1] != 0.) {
+ temp = *alpha * a[k + j * a_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L200: */
+ }
+ }
+/* L210: */
+ }
+/* L220: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*B*A'. */
+
+ if (upper) {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k - 1;
+ for (j = 1; j <= i__2; ++j) {
+ if (a[j + k * a_dim1] != 0.) {
+ temp = *alpha * a[j + k * a_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[k + k * a_dim1];
+ }
+ if (temp != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ } else {
+ for (k = *n; k >= 1; --k) {
+ i__1 = *n;
+ for (j = k + 1; j <= i__1; ++j) {
+ if (a[j + k * a_dim1] != 0.) {
+ temp = *alpha * a[j + k * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L270: */
+ }
+ }
+/* L280: */
+ }
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[k + k * a_dim1];
+ }
+ if (temp != 1.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L290: */
+ }
+ }
+/* L300: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DTRMM . */
+
+} /* dtrmm_ */
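+
+/*
+    A minimal usage sketch (illustrative values): B := A*B with the upper
+    triangular A = [1 2; 0 3] (the strictly lower entry of the array is not
+    read) and B = [1 2; 3 4]; the product is [7 10; 9 12].
+
+        doublereal a[4] = {1., 0., 2., 3.};
+        doublereal b[4] = {1., 3., 2., 4.};
+        integer two = 2;
+        doublereal one = 1.;
+        dtrmm_("L", "U", "N", "N", &two, &two, &one, a, &two, b, &two);
+
+    on return b holds {7., 9., 10., 12.}.
+*/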
+
+/* Subroutine */ int dtrmv_(char *uplo, char *trans, char *diag, integer *n,
+ doublereal *a, integer *lda, doublereal *x, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jx, kx, info;
+ static doublereal temp;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ Purpose
+ =======
+
+ DTRMV performs one of the matrix-vector operations
+
+ x := A*x, or x := A'*x,
+
+ where x is an n element vector and A is an n by n unit, or non-unit,
+ upper or lower triangular matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' x := A*x.
+
+ TRANS = 'T' or 't' x := A'*x.
+
+ TRANS = 'C' or 'c' x := A'*x.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit
+ triangular as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - DOUBLE PRECISION array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+             Before entry, the incremented array X must contain the n
+             element vector x. On exit, X is overwritten with the
+             transformed vector x.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*lda < max(1,*n)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ }
+ if (info != 0) {
+ xerbla_("DTRMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ nounit = lsame_(diag, "N");
+
+/*
+ Set up the start point in X if the increment is not unity. This
+ will be ( N - 1 )*INCX too small for descending loops.
+*/
+
+ if (*incx <= 0) {
+ kx = 1 - (*n - 1) * *incx;
+ } else if (*incx != 1) {
+ kx = 1;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (lsame_(trans, "N")) {
+
+/* Form x := A*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[j] != 0.) {
+ temp = x[j];
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ x[i__] += temp * a[i__ + j * a_dim1];
+/* L10: */
+ }
+ if (nounit) {
+ x[j] *= a[j + j * a_dim1];
+ }
+ }
+/* L20: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.) {
+ temp = x[jx];
+ ix = kx;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ x[ix] += temp * a[i__ + j * a_dim1];
+ ix += *incx;
+/* L30: */
+ }
+ if (nounit) {
+ x[jx] *= a[j + j * a_dim1];
+ }
+ }
+ jx += *incx;
+/* L40: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ if (x[j] != 0.) {
+ temp = x[j];
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ x[i__] += temp * a[i__ + j * a_dim1];
+/* L50: */
+ }
+ if (nounit) {
+ x[j] *= a[j + j * a_dim1];
+ }
+ }
+/* L60: */
+ }
+ } else {
+ kx += (*n - 1) * *incx;
+ jx = kx;
+ for (j = *n; j >= 1; --j) {
+ if (x[jx] != 0.) {
+ temp = x[jx];
+ ix = kx;
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ x[ix] += temp * a[i__ + j * a_dim1];
+ ix -= *incx;
+/* L70: */
+ }
+ if (nounit) {
+ x[jx] *= a[j + j * a_dim1];
+ }
+ }
+ jx -= *incx;
+/* L80: */
+ }
+ }
+ }
+ } else {
+
+/* Form x := A'*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ temp = x[j];
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ temp += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+ }
+ x[j] = temp;
+/* L100: */
+ }
+ } else {
+ jx = kx + (*n - 1) * *incx;
+ for (j = *n; j >= 1; --j) {
+ temp = x[jx];
+ ix = jx;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ temp += a[i__ + j * a_dim1] * x[ix];
+/* L110: */
+ }
+ x[jx] = temp;
+ jx -= *incx;
+/* L120: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = x[j];
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ temp += a[i__ + j * a_dim1] * x[i__];
+/* L130: */
+ }
+ x[j] = temp;
+/* L140: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = x[jx];
+ ix = jx;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ temp += a[i__ + j * a_dim1] * x[ix];
+/* L150: */
+ }
+ x[jx] = temp;
+ jx += *incx;
+/* L160: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DTRMV . */
+
+} /* dtrmv_ */
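+
+/*
+   Editorial usage sketch (not part of the original f2c output): a minimal
+   call of dtrmv_, assuming the f2c typedefs (integer, doublereal) relied on
+   throughout this file are in scope.  The helper name is illustrative only.
+*/
+static void example_dtrmv_usage(void)
+{
+    /* A = [[1, 3], [0, 2]] stored column-major; for UPLO = 'U' the strictly
+       lower part (a[1]) is not referenced. */
+    integer n = 2, lda = 2, incx = 1;
+    doublereal a[4] = {1., 0., 3., 2.};
+    doublereal x[2] = {1., 1.};
+
+    dtrmv_("U", "N", "N", &n, a, &lda, x, &incx);  /* x := A*x = {4., 2.} */
+}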
+
+/* Subroutine */ int dtrsm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
+ lda, doublereal *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static doublereal temp;
+ static logical lside;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ Purpose
+ =======
+
+ DTRSM solves one of the matrix equations
+
+ op( A )*X = alpha*B, or X*op( A ) = alpha*B,
+
+ where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A'.
+
+ The matrix X is overwritten on B.
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) appears on the left
+ or right of X as follows:
+
+ SIDE = 'L' or 'l' op( A )*X = alpha*B.
+
+ SIDE = 'R' or 'r' X*op( A ) = alpha*B.
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = A'.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION.
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the right-hand side matrix B, and on exit is
+ overwritten by the solution matrix X.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("DTRSM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*inv( A )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*alpha != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L30: */
+ }
+ }
+ for (k = *m; k >= 1; --k) {
+ if (b[k + j * b_dim1] != 0.) {
+ if (nounit) {
+ b[k + j * b_dim1] /= a[k + k * a_dim1];
+ }
+ i__2 = k - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] -= b[k + j * b_dim1] * a[
+ i__ + k * a_dim1];
+/* L40: */
+ }
+ }
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*alpha != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L70: */
+ }
+ }
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ if (b[k + j * b_dim1] != 0.) {
+ if (nounit) {
+ b[k + j * b_dim1] /= a[k + k * a_dim1];
+ }
+ i__3 = *m;
+ for (i__ = k + 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] -= b[k + j * b_dim1] * a[
+ i__ + k * a_dim1];
+/* L80: */
+ }
+ }
+/* L90: */
+ }
+/* L100: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*inv( A' )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = *alpha * b[i__ + j * b_dim1];
+ i__3 = i__ - 1;
+ for (k = 1; k <= i__3; ++k) {
+ temp -= a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L110: */
+ }
+ if (nounit) {
+ temp /= a[i__ + i__ * a_dim1];
+ }
+ b[i__ + j * b_dim1] = temp;
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ temp = *alpha * b[i__ + j * b_dim1];
+ i__2 = *m;
+ for (k = i__ + 1; k <= i__2; ++k) {
+ temp -= a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L140: */
+ }
+ if (nounit) {
+ temp /= a[i__ + i__ * a_dim1];
+ }
+ b[i__ + j * b_dim1] = temp;
+/* L150: */
+ }
+/* L160: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*inv( A ). */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*alpha != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L170: */
+ }
+ }
+ i__2 = j - 1;
+ for (k = 1; k <= i__2; ++k) {
+ if (a[k + j * a_dim1] != 0.) {
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] -= a[k + j * a_dim1] * b[
+ i__ + k * b_dim1];
+/* L180: */
+ }
+ }
+/* L190: */
+ }
+ if (nounit) {
+ temp = 1. / a[j + j * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L200: */
+ }
+ }
+/* L210: */
+ }
+ } else {
+ for (j = *n; j >= 1; --j) {
+ if (*alpha != 1.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L220: */
+ }
+ }
+ i__1 = *n;
+ for (k = j + 1; k <= i__1; ++k) {
+ if (a[k + j * a_dim1] != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] -= a[k + j * a_dim1] * b[
+ i__ + k * b_dim1];
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ if (nounit) {
+ temp = 1. / a[j + j * a_dim1];
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*B*inv( A' ). */
+
+ if (upper) {
+ for (k = *n; k >= 1; --k) {
+ if (nounit) {
+ temp = 1. / a[k + k * a_dim1];
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L270: */
+ }
+ }
+ i__1 = k - 1;
+ for (j = 1; j <= i__1; ++j) {
+ if (a[j + k * a_dim1] != 0.) {
+ temp = a[j + k * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] -= temp * b[i__ + k *
+ b_dim1];
+/* L280: */
+ }
+ }
+/* L290: */
+ }
+ if (*alpha != 1.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + k * b_dim1] = *alpha * b[i__ + k * b_dim1]
+ ;
+/* L300: */
+ }
+ }
+/* L310: */
+ }
+ } else {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ if (nounit) {
+ temp = 1. / a[k + k * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L320: */
+ }
+ }
+ i__2 = *n;
+ for (j = k + 1; j <= i__2; ++j) {
+ if (a[j + k * a_dim1] != 0.) {
+ temp = a[j + k * a_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] -= temp * b[i__ + k *
+ b_dim1];
+/* L330: */
+ }
+ }
+/* L340: */
+ }
+ if (*alpha != 1.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + k * b_dim1] = *alpha * b[i__ + k * b_dim1]
+ ;
+/* L350: */
+ }
+ }
+/* L360: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DTRSM . */
+
+} /* dtrsm_ */
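+
+/*
+   Editorial usage sketch (not part of the original f2c output): solving an
+   upper triangular system with dtrsm_, assuming the usual f2c typedefs are
+   in scope.  The helper below is purely illustrative.
+*/
+static void example_dtrsm_usage(void)
+{
+    /* Solve A*X = B, with A = [[2, 1], [0, 4]] (column-major) and a single
+       right-hand side B = {4, 8}; B is overwritten with X = {1, 2}. */
+    integer m = 2, n = 1, lda = 2, ldb = 2;
+    doublereal alpha = 1.;
+    doublereal a[4] = {2., 0., 1., 4.};
+    doublereal b[2] = {4., 8.};
+
+    dtrsm_("L", "U", "N", "N", &m, &n, &alpha, a, &lda, b, &ldb);
+}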
+
+doublereal dzasum_(integer *n, doublecomplex *zx, integer *incx)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal ret_val;
+
+ /* Local variables */
+ static integer i__, ix;
+ static doublereal stemp;
+ extern doublereal dcabs1_(doublecomplex *);
+
+
+/*
+ Purpose
+ =======
+
+    DZASUM takes the sum of |Re(zx(i))| + |Im(zx(i))| over the elements of
+    a double complex vector and returns a double precision result.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zx;
+
+ /* Function Body */
+ ret_val = 0.;
+ stemp = 0.;
+ if (*n <= 0 || *incx <= 0) {
+ return ret_val;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp += dcabs1_(&zx[ix]);
+ ix += *incx;
+/* L10: */
+ }
+ ret_val = stemp;
+ return ret_val;
+
+/* code for increment equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp += dcabs1_(&zx[i__]);
+/* L30: */
+ }
+ ret_val = stemp;
+ return ret_val;
+} /* dzasum_ */
+
+doublereal dznrm2_(integer *n, doublecomplex *x, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ doublereal ret_val, d__1;
+
+ /* Local variables */
+ static integer ix;
+ static doublereal ssq, temp, norm, scale;
+
+
+/*
+ Purpose
+ =======
+
+ DZNRM2 returns the euclidean norm of a vector via the function
+ name, so that
+
+ DZNRM2 := sqrt( conjg( x' )*x )
+
+ Further Details
+ ===============
+
+ -- This version written on 25-October-1982.
+ Modified on 14-October-1993 to inline the call to ZLASSQ.
+ Sven Hammarling, Nag Ltd.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n < 1 || *incx < 1) {
+ norm = 0.;
+ } else {
+ scale = 0.;
+ ssq = 1.;
+/*
+ The following loop is equivalent to this call to the LAPACK
+ auxiliary routine:
+ CALL ZLASSQ( N, X, INCX, SCALE, SSQ )
+*/
+
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ i__3 = ix;
+ if (x[i__3].r != 0.) {
+ i__3 = ix;
+ temp = (d__1 = x[i__3].r, abs(d__1));
+ if (scale < temp) {
+/* Computing 2nd power */
+ d__1 = scale / temp;
+ ssq = ssq * (d__1 * d__1) + 1.;
+ scale = temp;
+ } else {
+/* Computing 2nd power */
+ d__1 = temp / scale;
+ ssq += d__1 * d__1;
+ }
+ }
+ if (d_imag(&x[ix]) != 0.) {
+ temp = (d__1 = d_imag(&x[ix]), abs(d__1));
+ if (scale < temp) {
+/* Computing 2nd power */
+ d__1 = scale / temp;
+ ssq = ssq * (d__1 * d__1) + 1.;
+ scale = temp;
+ } else {
+/* Computing 2nd power */
+ d__1 = temp / scale;
+ ssq += d__1 * d__1;
+ }
+ }
+/* L10: */
+ }
+ norm = scale * sqrt(ssq);
+ }
+
+ ret_val = norm;
+ return ret_val;
+
+/* End of DZNRM2. */
+
+} /* dznrm2_ */
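+
+/*
+   Editorial usage sketch (not part of the original f2c output): computing a
+   complex 2-norm with dznrm2_.  It assumes the f2c doublecomplex struct with
+   members r and i (as used elsewhere in this file) plus the integer and
+   doublereal typedefs are in scope.
+*/
+static void example_dznrm2_usage(void)
+{
+    integer n = 2, incx = 1;
+    doublecomplex zx[2] = {{3., 0.}, {0., 4.}};  /* 3 and 4i */
+    doublereal nrm;
+
+    nrm = dznrm2_(&n, zx, &incx);  /* sqrt(9 + 16) = 5.0 */
+    (void)nrm;
+}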
+
+integer icamax_(integer *n, complex *cx, integer *incx)
+{
+ /* System generated locals */
+ integer ret_val, i__1;
+
+ /* Local variables */
+ static integer i__, ix;
+ static real smax;
+ extern doublereal scabs1_(complex *);
+
+
+/*
+ Purpose
+ =======
+
+    ICAMAX finds the index of the first element having the largest value of
+    |Re(.)| + |Im(.)|.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cx;
+
+ /* Function Body */
+ ret_val = 0;
+ if (*n < 1 || *incx <= 0) {
+ return ret_val;
+ }
+ ret_val = 1;
+ if (*n == 1) {
+ return ret_val;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ smax = scabs1_(&cx[1]);
+ ix += *incx;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if (scabs1_(&cx[ix]) <= smax) {
+ goto L5;
+ }
+ ret_val = i__;
+ smax = scabs1_(&cx[ix]);
+L5:
+ ix += *incx;
+/* L10: */
+ }
+ return ret_val;
+
+/* code for increment equal to 1 */
+
+L20:
+ smax = scabs1_(&cx[1]);
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if (scabs1_(&cx[i__]) <= smax) {
+ goto L30;
+ }
+ ret_val = i__;
+ smax = scabs1_(&cx[i__]);
+L30:
+ ;
+ }
+ return ret_val;
+} /* icamax_ */
+
+integer idamax_(integer *n, doublereal *dx, integer *incx)
+{
+ /* System generated locals */
+ integer ret_val, i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, ix;
+ static doublereal dmax__;
+
+
+/*
+ Purpose
+ =======
+
+    IDAMAX finds the index of the first element having the maximum
+    absolute value.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --dx;
+
+ /* Function Body */
+ ret_val = 0;
+ if (*n < 1 || *incx <= 0) {
+ return ret_val;
+ }
+ ret_val = 1;
+ if (*n == 1) {
+ return ret_val;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ dmax__ = abs(dx[1]);
+ ix += *incx;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if ((d__1 = dx[ix], abs(d__1)) <= dmax__) {
+ goto L5;
+ }
+ ret_val = i__;
+ dmax__ = (d__1 = dx[ix], abs(d__1));
+L5:
+ ix += *incx;
+/* L10: */
+ }
+ return ret_val;
+
+/* code for increment equal to 1 */
+
+L20:
+ dmax__ = abs(dx[1]);
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if ((d__1 = dx[i__], abs(d__1)) <= dmax__) {
+ goto L30;
+ }
+ ret_val = i__;
+ dmax__ = (d__1 = dx[i__], abs(d__1));
+L30:
+ ;
+ }
+ return ret_val;
+} /* idamax_ */
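+
+/*
+   Editorial usage sketch (not part of the original f2c output): idamax_
+   returns a 1-based Fortran-style index.  The f2c integer and doublereal
+   typedefs used throughout this file are assumed to be in scope.
+*/
+static void example_idamax_usage(void)
+{
+    integer n = 3, incx = 1, imax;
+    doublereal dx[3] = {1., -7., 3.};
+
+    imax = idamax_(&n, dx, &incx);  /* imax == 2, since |-7| is largest */
+    (void)imax;
+}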
+
+integer isamax_(integer *n, real *sx, integer *incx)
+{
+ /* System generated locals */
+ integer ret_val, i__1;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, ix;
+ static real smax;
+
+
+/*
+ Purpose
+ =======
+
+    ISAMAX finds the index of the first element having the maximum
+    absolute value.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sx;
+
+ /* Function Body */
+ ret_val = 0;
+ if (*n < 1 || *incx <= 0) {
+ return ret_val;
+ }
+ ret_val = 1;
+ if (*n == 1) {
+ return ret_val;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ smax = dabs(sx[1]);
+ ix += *incx;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if ((r__1 = sx[ix], dabs(r__1)) <= smax) {
+ goto L5;
+ }
+ ret_val = i__;
+ smax = (r__1 = sx[ix], dabs(r__1));
+L5:
+ ix += *incx;
+/* L10: */
+ }
+ return ret_val;
+
+/* code for increment equal to 1 */
+
+L20:
+ smax = dabs(sx[1]);
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if ((r__1 = sx[i__], dabs(r__1)) <= smax) {
+ goto L30;
+ }
+ ret_val = i__;
+ smax = (r__1 = sx[i__], dabs(r__1));
+L30:
+ ;
+ }
+ return ret_val;
+} /* isamax_ */
+
+integer izamax_(integer *n, doublecomplex *zx, integer *incx)
+{
+ /* System generated locals */
+ integer ret_val, i__1;
+
+ /* Local variables */
+ static integer i__, ix;
+ static doublereal smax;
+ extern doublereal dcabs1_(doublecomplex *);
+
+
+/*
+ Purpose
+ =======
+
+    IZAMAX finds the index of the first element having the largest value of
+    |Re(.)| + |Im(.)|.
+
+ Further Details
+ ===============
+
+ jack dongarra, 1/15/85.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zx;
+
+ /* Function Body */
+ ret_val = 0;
+ if (*n < 1 || *incx <= 0) {
+ return ret_val;
+ }
+ ret_val = 1;
+ if (*n == 1) {
+ return ret_val;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ smax = dcabs1_(&zx[1]);
+ ix += *incx;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if (dcabs1_(&zx[ix]) <= smax) {
+ goto L5;
+ }
+ ret_val = i__;
+ smax = dcabs1_(&zx[ix]);
+L5:
+ ix += *incx;
+/* L10: */
+ }
+ return ret_val;
+
+/* code for increment equal to 1 */
+
+L20:
+ smax = dcabs1_(&zx[1]);
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ if (dcabs1_(&zx[i__]) <= smax) {
+ goto L30;
+ }
+ ret_val = i__;
+ smax = dcabs1_(&zx[i__]);
+L30:
+ ;
+ }
+ return ret_val;
+} /* izamax_ */
+
+/* Subroutine */ int saxpy_(integer *n, real *sa, real *sx, integer *incx,
+ real *sy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+
+
+/*
+ Purpose
+ =======
+
+    SAXPY computes a constant times a vector plus a vector (y := sa*x + y).
+    It uses an unrolled loop for increments equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sy;
+ --sx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*sa == 0.f) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ sy[iy] += *sa * sx[ix];
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 4;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ sy[i__] += *sa * sx[i__];
+/* L30: */
+ }
+ if (*n < 4) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 4) {
+ sy[i__] += *sa * sx[i__];
+ sy[i__ + 1] += *sa * sx[i__ + 1];
+ sy[i__ + 2] += *sa * sx[i__ + 2];
+ sy[i__ + 3] += *sa * sx[i__ + 3];
+/* L50: */
+ }
+ return 0;
+} /* saxpy_ */
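+
+/*
+   Editorial usage sketch (not part of the original f2c output): saxpy_
+   computes y := sa*x + y in single precision.  The f2c integer and real
+   typedefs used throughout this file are assumed to be in scope.
+*/
+static void example_saxpy_usage(void)
+{
+    integer n = 3, incx = 1, incy = 1;
+    real sa = 2.f;
+    real sx[3] = {1.f, 2.f, 3.f};
+    real sy[3] = {1.f, 1.f, 1.f};
+
+    saxpy_(&n, &sa, sx, &incx, sy, &incy);  /* sy becomes {3, 5, 7} */
+}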
+
+doublereal scabs1_(complex *z__)
+{
+ /* System generated locals */
+ real ret_val, r__1, r__2;
+
+
+/*
+ Purpose
+ =======
+
+    SCABS1 computes |Re(z)| + |Im(z)| for a complex number z.
+
+ =====================================================================
+*/
+
+ ret_val = (r__1 = z__->r, dabs(r__1)) + (r__2 = r_imag(z__), dabs(r__2));
+ return ret_val;
+} /* scabs1_ */
+
+doublereal scasum_(integer *n, complex *cx, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ real ret_val, r__1, r__2;
+
+ /* Local variables */
+ static integer i__, nincx;
+ static real stemp;
+
+
+/*
+ Purpose
+ =======
+
+    SCASUM takes the sum of |Re(cx(i))| + |Im(cx(i))| over the elements of
+    a complex vector and returns a single precision result.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --cx;
+
+ /* Function Body */
+ ret_val = 0.f;
+ stemp = 0.f;
+ if (*n <= 0 || *incx <= 0) {
+ return ret_val;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ nincx = *n * *incx;
+ i__1 = nincx;
+ i__2 = *incx;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ i__3 = i__;
+ stemp = stemp + (r__1 = cx[i__3].r, dabs(r__1)) + (r__2 = r_imag(&cx[
+ i__]), dabs(r__2));
+/* L10: */
+ }
+ ret_val = stemp;
+ return ret_val;
+
+/* code for increment equal to 1 */
+
+L20:
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__1 = i__;
+ stemp = stemp + (r__1 = cx[i__1].r, dabs(r__1)) + (r__2 = r_imag(&cx[
+ i__]), dabs(r__2));
+/* L30: */
+ }
+ ret_val = stemp;
+ return ret_val;
+} /* scasum_ */
+
+doublereal scnrm2_(integer *n, complex *x, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ real ret_val, r__1;
+
+ /* Local variables */
+ static integer ix;
+ static real ssq, temp, norm, scale;
+
+
+/*
+ Purpose
+ =======
+
+ SCNRM2 returns the euclidean norm of a vector via the function
+ name, so that
+
+ SCNRM2 := sqrt( conjg( x' )*x )
+
+ Further Details
+ ===============
+
+ -- This version written on 25-October-1982.
+ Modified on 14-October-1993 to inline the call to CLASSQ.
+ Sven Hammarling, Nag Ltd.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n < 1 || *incx < 1) {
+ norm = 0.f;
+ } else {
+ scale = 0.f;
+ ssq = 1.f;
+/*
+ The following loop is equivalent to this call to the LAPACK
+ auxiliary routine:
+ CALL CLASSQ( N, X, INCX, SCALE, SSQ )
+*/
+
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ i__3 = ix;
+ if (x[i__3].r != 0.f) {
+ i__3 = ix;
+ temp = (r__1 = x[i__3].r, dabs(r__1));
+ if (scale < temp) {
+/* Computing 2nd power */
+ r__1 = scale / temp;
+ ssq = ssq * (r__1 * r__1) + 1.f;
+ scale = temp;
+ } else {
+/* Computing 2nd power */
+ r__1 = temp / scale;
+ ssq += r__1 * r__1;
+ }
+ }
+ if (r_imag(&x[ix]) != 0.f) {
+ temp = (r__1 = r_imag(&x[ix]), dabs(r__1));
+ if (scale < temp) {
+/* Computing 2nd power */
+ r__1 = scale / temp;
+ ssq = ssq * (r__1 * r__1) + 1.f;
+ scale = temp;
+ } else {
+/* Computing 2nd power */
+ r__1 = temp / scale;
+ ssq += r__1 * r__1;
+ }
+ }
+/* L10: */
+ }
+ norm = scale * sqrt(ssq);
+ }
+
+ ret_val = norm;
+ return ret_val;
+
+/* End of SCNRM2. */
+
+} /* scnrm2_ */
+
+/* Subroutine */ int scopy_(integer *n, real *sx, integer *incx, real *sy,
+ integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+
+
+/*
+ Purpose
+ =======
+
+ SCOPY copies a vector, x, to a vector, y.
+    It uses unrolled loops for increments equal to 1.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sy;
+ --sx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ sy[iy] = sx[ix];
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 7;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ sy[i__] = sx[i__];
+/* L30: */
+ }
+ if (*n < 7) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 7) {
+ sy[i__] = sx[i__];
+ sy[i__ + 1] = sx[i__ + 1];
+ sy[i__ + 2] = sx[i__ + 2];
+ sy[i__ + 3] = sx[i__ + 3];
+ sy[i__ + 4] = sx[i__ + 4];
+ sy[i__ + 5] = sx[i__ + 5];
+ sy[i__ + 6] = sx[i__ + 6];
+/* L50: */
+ }
+ return 0;
+} /* scopy_ */
+
+doublereal sdot_(integer *n, real *sx, integer *incx, real *sy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+ real ret_val;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+ static real stemp;
+
+
+/*
+ Purpose
+ =======
+
+ SDOT forms the dot product of two vectors.
+    It uses unrolled loops for increments equal to one.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sy;
+ --sx;
+
+ /* Function Body */
+ stemp = 0.f;
+ ret_val = 0.f;
+ if (*n <= 0) {
+ return ret_val;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp += sx[ix] * sy[iy];
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ ret_val = stemp;
+ return ret_val;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 5;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp += sx[i__] * sy[i__];
+/* L30: */
+ }
+ if (*n < 5) {
+ goto L60;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 5) {
+ stemp = stemp + sx[i__] * sy[i__] + sx[i__ + 1] * sy[i__ + 1] + sx[
+ i__ + 2] * sy[i__ + 2] + sx[i__ + 3] * sy[i__ + 3] + sx[i__ +
+ 4] * sy[i__ + 4];
+/* L50: */
+ }
+L60:
+ ret_val = stemp;
+ return ret_val;
+} /* sdot_ */
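+
+/*
+   Editorial usage sketch (not part of the original f2c output): sdot_ is a
+   single precision dot product that, per f2c convention, returns doublereal.
+   The f2c integer / real / doublereal typedefs are assumed to be in scope.
+*/
+static void example_sdot_usage(void)
+{
+    integer n = 3, incx = 1, incy = 1;
+    real sx[3] = {1.f, 2.f, 3.f};
+    real sy[3] = {4.f, 5.f, 6.f};
+    doublereal d;
+
+    d = sdot_(&n, sx, &incx, sy, &incy);  /* 1*4 + 2*5 + 3*6 = 32 */
+    (void)d;
+}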
+
+/* Subroutine */ int sgemm_(char *transa, char *transb, integer *m, integer *
+ n, integer *k, real *alpha, real *a, integer *lda, real *b, integer *
+ ldb, real *beta, real *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static logical nota, notb;
+ static real temp;
+ static integer ncola;
+ extern logical lsame_(char *, char *);
+ static integer nrowa, nrowb;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SGEMM performs one of the matrix-matrix operations
+
+ C := alpha*op( A )*op( B ) + beta*C,
+
+ where op( X ) is one of
+
+ op( X ) = X or op( X ) = X',
+
+ alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n', op( A ) = A.
+
+ TRANSA = 'T' or 't', op( A ) = A'.
+
+ TRANSA = 'C' or 'c', op( A ) = A'.
+
+ Unchanged on exit.
+
+ TRANSB - CHARACTER*1.
+ On entry, TRANSB specifies the form of op( B ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSB = 'N' or 'n', op( B ) = B.
+
+ TRANSB = 'T' or 't', op( B ) = B'.
+
+ TRANSB = 'C' or 'c', op( B ) = B'.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix
+ op( A ) and of the matrix C. M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix
+ op( B ) and the number of columns of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry, K specifies the number of columns of the matrix
+ op( A ) and the number of rows of the matrix op( B ). K must
+ be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANSA = 'N' or 'n', and is m otherwise.
+ Before entry with TRANSA = 'N' or 'n', the leading m by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by m part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANSA = 'N' or 'n' then
+ LDA must be at least max( 1, m ), otherwise LDA must be at
+ least max( 1, k ).
+ Unchanged on exit.
+
+ B - REAL array of DIMENSION ( LDB, kb ), where kb is
+ n when TRANSB = 'N' or 'n', and is k otherwise.
+ Before entry with TRANSB = 'N' or 'n', the leading k by n
+ part of the array B must contain the matrix B, otherwise
+ the leading n by k part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANSB = 'N' or 'n' then
+ LDB must be at least max( 1, k ), otherwise LDB must be at
+ least max( 1, n ).
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then C need not be set on input.
+ Unchanged on exit.
+
+ C - REAL array of DIMENSION ( LDC, n ).
+ Before entry, the leading m by n part of the array C must
+ contain the matrix C, except when beta is zero, in which
+ case C need not be set on entry.
+ On exit, the array C is overwritten by the m by n matrix
+ ( alpha*op( A )*op( B ) + beta*C ).
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Set NOTA and NOTB as true if A and B respectively are not
+ transposed and set NROWA, NCOLA and NROWB as the number of rows
+ and columns of A and the number of rows of B respectively.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ nota = lsame_(transa, "N");
+ notb = lsame_(transb, "N");
+ if (nota) {
+ nrowa = *m;
+ ncola = *k;
+ } else {
+ nrowa = *k;
+ ncola = *m;
+ }
+ if (notb) {
+ nrowb = *k;
+ } else {
+ nrowb = *n;
+ }
+
+/* Test the input parameters. */
+
+ info = 0;
+ if (! nota && ! lsame_(transa, "C") && ! lsame_(
+ transa, "T")) {
+ info = 1;
+ } else if (! notb && ! lsame_(transb, "C") && !
+ lsame_(transb, "T")) {
+ info = 2;
+ } else if (*m < 0) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*k < 0) {
+ info = 5;
+ } else if (*lda < max(1,nrowa)) {
+ info = 8;
+ } else if (*ldb < max(1,nrowb)) {
+ info = 10;
+ } else if (*ldc < max(1,*m)) {
+ info = 13;
+ }
+ if (info != 0) {
+ xerbla_("SGEMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) {
+ return 0;
+ }
+
+/* And if alpha.eq.zero. */
+
+ if (*alpha == 0.f) {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (notb) {
+ if (nota) {
+
+/* Form C := alpha*A*B + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L50: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L60: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (b[l + j * b_dim1] != 0.f) {
+ temp = *alpha * b[l + j * b_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+/* L90: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+/* L100: */
+ }
+ if (*beta == 0.f) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L110: */
+ }
+/* L120: */
+ }
+ }
+ } else {
+ if (nota) {
+
+/* Form C := alpha*A*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L130: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L140: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (b[j + l * b_dim1] != 0.f) {
+ temp = *alpha * b[j + l * b_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L150: */
+ }
+ }
+/* L160: */
+ }
+/* L170: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * b[j + l * b_dim1];
+/* L180: */
+ }
+ if (*beta == 0.f) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L190: */
+ }
+/* L200: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SGEMM . */
+
+} /* sgemm_ */
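+
+/*
+   Editorial usage sketch (not part of the original f2c output): a 2x2
+   matrix-matrix product with sgemm_.  All matrices are column-major, as in
+   the Fortran original; the f2c integer / real typedefs are assumed.
+*/
+static void example_sgemm_usage(void)
+{
+    /* A = [[1, 2], [3, 4]], B = [[5, 6], [7, 8]]; with alpha = 1, beta = 0
+       the result is C = A*B = [[19, 22], [43, 50]], stored column-major as
+       {19, 43, 22, 50}. */
+    integer m = 2, n = 2, k = 2, lda = 2, ldb = 2, ldc = 2;
+    real alpha = 1.f, beta = 0.f;
+    real a[4] = {1.f, 3.f, 2.f, 4.f};
+    real b[4] = {5.f, 7.f, 6.f, 8.f};
+    real c[4] = {0.f, 0.f, 0.f, 0.f};
+
+    sgemm_("N", "N", &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}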
+
+/* Subroutine */ int sgemv_(char *trans, integer *m, integer *n, real *alpha,
+ real *a, integer *lda, real *x, integer *incx, real *beta, real *y,
+ integer *incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static real temp;
+ static integer lenx, leny;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SGEMV performs one of the matrix-vector operations
+
+ y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
+
+ where alpha and beta are scalars, x and y are vectors and A is an
+ m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
+
+ TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
+
+ TRANS = 'C' or 'c' y := alpha*A'*x + beta*y.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ X - REAL array of DIMENSION at least
+ ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+ Before entry, the incremented array X must contain the
+ vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - REAL array of DIMENSION at least
+ ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+ Before entry with BETA non-zero, the incremented array Y
+ must contain the vector y. On exit, Y is overwritten by the
+ updated vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! lsame_(trans, "C")
+ ) {
+ info = 1;
+ } else if (*m < 0) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*lda < max(1,*m)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ } else if (*incy == 0) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("SGEMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) {
+ return 0;
+ }
+
+/*
+ Set LENX and LENY, the lengths of the vectors x and y, and set
+ up the start points in X and Y.
+*/
+
+ if (lsame_(trans, "N")) {
+ lenx = *n;
+ leny = *m;
+ } else {
+ lenx = *m;
+ leny = *n;
+ }
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (lenx - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (leny - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+
+ First form y := beta*y.
+*/
+
+ if (*beta != 1.f) {
+ if (*incy == 1) {
+ if (*beta == 0.f) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = 0.f;
+/* L10: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = *beta * y[i__];
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (*beta == 0.f) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = 0.f;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = *beta * y[iy];
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (*alpha == 0.f) {
+ return 0;
+ }
+ if (lsame_(trans, "N")) {
+
+/* Form y := alpha*A*x + y. */
+
+ jx = kx;
+ if (*incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.f) {
+ temp = *alpha * x[jx];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[i__] += temp * a[i__ + j * a_dim1];
+/* L50: */
+ }
+ }
+ jx += *incx;
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.f) {
+ temp = *alpha * x[jx];
+ iy = ky;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[iy] += temp * a[i__ + j * a_dim1];
+ iy += *incy;
+/* L70: */
+ }
+ }
+ jx += *incx;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y := alpha*A'*x + y. */
+
+ jy = ky;
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = 0.f;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+ }
+ y[jy] += *alpha * temp;
+ jy += *incy;
+/* L100: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = 0.f;
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp += a[i__ + j * a_dim1] * x[ix];
+ ix += *incx;
+/* L110: */
+ }
+ y[jy] += *alpha * temp;
+ jy += *incy;
+/* L120: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SGEMV . */
+
+} /* sgemv_ */
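+
+/*
+   Editorial usage sketch (not part of the original f2c output): y :=
+   alpha*A*x + beta*y with sgemv_, column-major storage, f2c typedefs
+   assumed to be in scope.
+*/
+static void example_sgemv_usage(void)
+{
+    /* A = [[1, 2], [3, 4]]; with alpha = 1, beta = 0 and x = {1, 1} the
+       result is y = {3, 7}. */
+    integer m = 2, n = 2, lda = 2, incx = 1, incy = 1;
+    real alpha = 1.f, beta = 0.f;
+    real a[4] = {1.f, 3.f, 2.f, 4.f};
+    real x[2] = {1.f, 1.f};
+    real y[2] = {0.f, 0.f};
+
+    sgemv_("N", &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+}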
+
+/* Subroutine */ int sger_(integer *m, integer *n, real *alpha, real *x,
+ integer *incx, real *y, integer *incy, real *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jy, kx, info;
+ static real temp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SGER performs the rank 1 operation
+
+ A := alpha*x*y' + A,
+
+ where alpha is a scalar, x is an m element vector, y is an n element
+ vector and A is an m by n matrix.
+
+ Arguments
+ ==========
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - REAL array of dimension at least
+ ( 1 + ( m - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the m
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - REAL array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients. On exit, A is
+ overwritten by the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (*m < 0) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("SGER ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || *alpha == 0.f) {
+ return 0;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (*incy > 0) {
+ jy = 1;
+ } else {
+ jy = 1 - (*n - 1) * *incy;
+ }
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (y[jy] != 0.f) {
+ temp = *alpha * y[jy];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] += x[i__] * temp;
+/* L10: */
+ }
+ }
+ jy += *incy;
+/* L20: */
+ }
+ } else {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*m - 1) * *incx;
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (y[jy] != 0.f) {
+ temp = *alpha * y[jy];
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] += x[ix] * temp;
+ ix += *incx;
+/* L30: */
+ }
+ }
+ jy += *incy;
+/* L40: */
+ }
+ }
+
+ return 0;
+
+/* End of SGER . */
+
+} /* sger_ */
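+
+/*
+   Editorial usage sketch (not part of the original f2c output): a rank-1
+   update A := alpha*x*y' + A with sger_, f2c typedefs assumed to be in
+   scope.
+*/
+static void example_sger_usage(void)
+{
+    /* Starting from A = 0, the update with x = {1, 2} and y = {3, 4} gives
+       A = [[3, 4], [6, 8]], stored column-major as {3, 6, 4, 8}. */
+    integer m = 2, n = 2, lda = 2, incx = 1, incy = 1;
+    real alpha = 1.f;
+    real x[2] = {1.f, 2.f};
+    real y[2] = {3.f, 4.f};
+    real a[4] = {0.f, 0.f, 0.f, 0.f};
+
+    sger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+}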
+
+doublereal snrm2_(integer *n, real *x, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ real ret_val, r__1;
+
+ /* Local variables */
+ static integer ix;
+ static real ssq, norm, scale, absxi;
+
+
+/*
+ Purpose
+ =======
+
+ SNRM2 returns the euclidean norm of a vector via the function
+ name, so that
+
+ SNRM2 := sqrt( x'*x ).
+
+ Further Details
+ ===============
+
+ -- This version written on 25-October-1982.
+ Modified on 14-October-1993 to inline the call to SLASSQ.
+ Sven Hammarling, Nag Ltd.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n < 1 || *incx < 1) {
+ norm = 0.f;
+ } else if (*n == 1) {
+ norm = dabs(x[1]);
+ } else {
+ scale = 0.f;
+ ssq = 1.f;
+/*
+ The following loop is equivalent to this call to the LAPACK
+ auxiliary routine:
+ CALL SLASSQ( N, X, INCX, SCALE, SSQ )
+*/
+
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ if (x[ix] != 0.f) {
+ absxi = (r__1 = x[ix], dabs(r__1));
+ if (scale < absxi) {
+/* Computing 2nd power */
+ r__1 = scale / absxi;
+ ssq = ssq * (r__1 * r__1) + 1.f;
+ scale = absxi;
+ } else {
+/* Computing 2nd power */
+ r__1 = absxi / scale;
+ ssq += r__1 * r__1;
+ }
+ }
+/* L10: */
+ }
+ norm = scale * sqrt(ssq);
+ }
+
+ ret_val = norm;
+ return ret_val;
+
+/* End of SNRM2. */
+
+} /* snrm2_ */
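+
+/*
+   Editorial usage sketch (not part of the original f2c output): snrm2_
+   returns the 2-norm of a single precision vector (as doublereal, per f2c
+   convention).  The f2c typedefs are assumed to be in scope.
+*/
+static void example_snrm2_usage(void)
+{
+    integer n = 2, incx = 1;
+    real x[2] = {3.f, 4.f};
+    doublereal nrm;
+
+    nrm = snrm2_(&n, x, &incx);  /* sqrt(9 + 16) = 5.0 */
+    (void)nrm;
+}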
+
+/* Subroutine */ int srot_(integer *n, real *sx, integer *incx, real *sy,
+ integer *incy, real *c__, real *s)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static real stemp;
+
+
+/*
+ Purpose
+ =======
+
+    SROT applies a plane rotation.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sy;
+ --sx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp = *c__ * sx[ix] + *s * sy[iy];
+ sy[iy] = *c__ * sy[iy] - *s * sx[ix];
+ sx[ix] = stemp;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp = *c__ * sx[i__] + *s * sy[i__];
+ sy[i__] = *c__ * sy[i__] - *s * sx[i__];
+ sx[i__] = stemp;
+/* L30: */
+ }
+ return 0;
+} /* srot_ */
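+
+/*
+   Editorial usage sketch (not part of the original f2c output): srot_
+   applies the rotation (c, s) to the vector pair (x, y).  With c = 0 and
+   s = 1 it maps x -> y and y -> -x.  The f2c typedefs are assumed.
+*/
+static void example_srot_usage(void)
+{
+    integer n = 2, incx = 1, incy = 1;
+    real c = 0.f, s = 1.f;
+    real sx[2] = {1.f, 2.f};
+    real sy[2] = {3.f, 4.f};
+
+    srot_(&n, sx, &incx, sy, &incy, &c, &s);  /* sx = {3, 4}, sy = {-1, -2} */
+}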
+
+/* Subroutine */ int sscal_(integer *n, real *sa, real *sx, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+
+ /* Local variables */
+ static integer i__, m, mp1, nincx;
+
+
+/*
+ Purpose
+ =======
+
+    SSCAL scales a vector by a constant.
+    It uses unrolled loops for increments equal to 1.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sx;
+
+ /* Function Body */
+ if (*n <= 0 || *incx <= 0) {
+ return 0;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ nincx = *n * *incx;
+ i__1 = nincx;
+ i__2 = *incx;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ sx[i__] = *sa * sx[i__];
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for increment equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 5;
+ if (m == 0) {
+ goto L40;
+ }
+ i__2 = m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ sx[i__] = *sa * sx[i__];
+/* L30: */
+ }
+ if (*n < 5) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__2 = *n;
+ for (i__ = mp1; i__ <= i__2; i__ += 5) {
+ sx[i__] = *sa * sx[i__];
+ sx[i__ + 1] = *sa * sx[i__ + 1];
+ sx[i__ + 2] = *sa * sx[i__ + 2];
+ sx[i__ + 3] = *sa * sx[i__ + 3];
+ sx[i__ + 4] = *sa * sx[i__ + 4];
+/* L50: */
+ }
+ return 0;
+} /* sscal_ */
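+
+/*
+   Editorial usage sketch (not part of the original f2c output): sscal_
+   scales a vector in place.  The f2c integer / real typedefs are assumed
+   to be in scope.
+*/
+static void example_sscal_usage(void)
+{
+    integer n = 3, incx = 1;
+    real sa = 2.f;
+    real sx[3] = {1.f, 2.f, 3.f};
+
+    sscal_(&n, &sa, sx, &incx);  /* sx becomes {2, 4, 6} */
+}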
+
+/* Subroutine */ int sswap_(integer *n, real *sx, integer *incx, real *sy,
+ integer *incy)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, m, ix, iy, mp1;
+ static real stemp;
+
+
+/*
+ Purpose
+ =======
+
+    SSWAP interchanges two vectors.
+    It uses unrolled loops for increments equal to 1.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --sy;
+ --sx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp = sx[ix];
+ sx[ix] = sy[iy];
+ sy[iy] = stemp;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/*
+ code for both increments equal to 1
+
+
+ clean-up loop
+*/
+
+L20:
+ m = *n % 3;
+ if (m == 0) {
+ goto L40;
+ }
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ stemp = sx[i__];
+ sx[i__] = sy[i__];
+ sy[i__] = stemp;
+/* L30: */
+ }
+ if (*n < 3) {
+ return 0;
+ }
+L40:
+ mp1 = m + 1;
+ i__1 = *n;
+ for (i__ = mp1; i__ <= i__1; i__ += 3) {
+ stemp = sx[i__];
+ sx[i__] = sy[i__];
+ sy[i__] = stemp;
+ stemp = sx[i__ + 1];
+ sx[i__ + 1] = sy[i__ + 1];
+ sy[i__ + 1] = stemp;
+ stemp = sx[i__ + 2];
+ sx[i__ + 2] = sy[i__ + 2];
+ sy[i__ + 2] = stemp;
+/* L50: */
+ }
+ return 0;
+} /* sswap_ */
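+
+/*
+   Editorial usage sketch (not part of the original f2c output): sswap_
+   exchanges two vectors element-wise.  The f2c typedefs are assumed to be
+   in scope.
+*/
+static void example_sswap_usage(void)
+{
+    integer n = 2, incx = 1, incy = 1;
+    real sx[2] = {1.f, 2.f};
+    real sy[2] = {3.f, 4.f};
+
+    sswap_(&n, sx, &incx, sy, &incy);  /* sx = {3, 4}, sy = {1, 2} */
+}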
+
+/* Subroutine */ int ssymv_(char *uplo, integer *n, real *alpha, real *a,
+ integer *lda, real *x, integer *incx, real *beta, real *y, integer *
+ incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static real temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SSYMV performs the matrix-vector operation
+
+ y := alpha*A*x + beta*y,
+
+ where alpha and beta are scalars, x and y are n element vectors and
+ A is an n by n symmetric matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of A is not referenced.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - REAL array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - REAL array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y. On exit, Y is overwritten by the updated
+ vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*lda < max(1,*n)) {
+ info = 5;
+ } else if (*incx == 0) {
+ info = 7;
+ } else if (*incy == 0) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("SSYMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || *alpha == 0.f && *beta == 1.f) {
+ return 0;
+ }
+
+/* Set up the start points in X and Y. */
+
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+
+ First form y := beta*y.
+*/
+
+ if (*beta != 1.f) {
+ if (*incy == 1) {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = 0.f;
+/* L10: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[i__] = *beta * y[i__];
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = 0.f;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ y[iy] = *beta * y[iy];
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (*alpha == 0.f) {
+ return 0;
+ }
+ if (lsame_(uplo, "U")) {
+
+/* Form y when A is stored in upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[j];
+ temp2 = 0.f;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[i__] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[i__];
+/* L50: */
+ }
+ y[j] = y[j] + temp1 * a[j + j * a_dim1] + *alpha * temp2;
+/* L60: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[jx];
+ temp2 = 0.f;
+ ix = kx;
+ iy = ky;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ y[iy] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[ix];
+ ix += *incx;
+ iy += *incy;
+/* L70: */
+ }
+ y[jy] = y[jy] + temp1 * a[j + j * a_dim1] + *alpha * temp2;
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y when A is stored in lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[j];
+ temp2 = 0.f;
+ y[j] += temp1 * a[j + j * a_dim1];
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ y[i__] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+ }
+ y[j] += *alpha * temp2;
+/* L100: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp1 = *alpha * x[jx];
+ temp2 = 0.f;
+ y[jy] += temp1 * a[j + j * a_dim1];
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ iy += *incy;
+ y[iy] += temp1 * a[i__ + j * a_dim1];
+ temp2 += a[i__ + j * a_dim1] * x[ix];
+/* L110: */
+ }
+ y[jy] += *alpha * temp2;
+ jx += *incx;
+ jy += *incy;
+/* L120: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SSYMV . */
+
+} /* ssymv_ */
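+
+/*
+   Minimal usage sketch for ssymv_, computing y := alpha*A*x + beta*y for a
+   2 by 2 symmetric matrix stored column-major with its upper triangle
+   referenced (usual f2c typedefs integer and real assumed):
+
+       integer n = 2, lda = 2, incx = 1, incy = 1;
+       real alpha = 1.f, beta = 0.f;
+       real a[4] = { 4.f, 0.f, 1.f, 3.f };   a(1,1)=4, a(1,2)=1, a(2,2)=3
+       real x[2] = { 1.f, 2.f };
+       real y[2];
+       ssymv_("U", &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+       on return y holds { 6.f, 7.f }
+*/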
+
+/* Subroutine */ int ssyr2_(char *uplo, integer *n, real *alpha, real *x,
+ integer *incx, real *y, integer *incy, real *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static real temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SSYR2 performs the symmetric rank 2 operation
+
+ A := alpha*x*y' + alpha*y*x' + A,
+
+ where alpha is a scalar, x and y are n element vectors and A is an n
+ by n symmetric matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - REAL array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - REAL array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of A is not referenced. On exit, the
+ upper triangular part of the array A is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of A is not referenced. On exit, the
+ lower triangular part of the array A is overwritten by the
+ lower triangular part of the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*n)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("SSYR2 ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || *alpha == 0.f) {
+ return 0;
+ }
+
+/*
+ Set up the start points in X and Y if the increments are not both
+ unity.
+*/
+
+ if (*incx != 1 || *incy != 1) {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+ jx = kx;
+ jy = ky;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+*/
+
+ if (lsame_(uplo, "U")) {
+
+/* Form A when A is stored in the upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[j] != 0.f || y[j] != 0.f) {
+ temp1 = *alpha * y[j];
+ temp2 = *alpha * x[j];
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[i__] *
+ temp1 + y[i__] * temp2;
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.f || y[jy] != 0.f) {
+ temp1 = *alpha * y[jy];
+ temp2 = *alpha * x[jx];
+ ix = kx;
+ iy = ky;
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[ix] *
+ temp1 + y[iy] * temp2;
+ ix += *incx;
+ iy += *incy;
+/* L30: */
+ }
+ }
+ jx += *incx;
+ jy += *incy;
+/* L40: */
+ }
+ }
+ } else {
+
+/* Form A when A is stored in the lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[j] != 0.f || y[j] != 0.f) {
+ temp1 = *alpha * y[j];
+ temp2 = *alpha * x[j];
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[i__] *
+ temp1 + y[i__] * temp2;
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.f || y[jy] != 0.f) {
+ temp1 = *alpha * y[jy];
+ temp2 = *alpha * x[jx];
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + j * a_dim1] + x[ix] *
+ temp1 + y[iy] * temp2;
+ ix += *incx;
+ iy += *incy;
+/* L70: */
+ }
+ }
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SSYR2 . */
+
+} /* ssyr2_ */
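+
+/*
+   Minimal usage sketch for ssyr2_, applying A := alpha*x*y' + alpha*y*x' + A
+   to the upper triangle of a 2 by 2 symmetric matrix (usual f2c typedefs
+   assumed):
+
+       integer n = 2, lda = 2, incx = 1, incy = 1;
+       real alpha = 1.f;
+       real x[2] = { 1.f, 0.f };
+       real y[2] = { 0.f, 1.f };
+       real a[4] = { 1.f, 0.f, 0.f, 1.f };   upper triangle of the identity
+       ssyr2_("U", &n, &alpha, x, &incx, y, &incy, a, &lda);
+       on return a(1,2) = 1.f; the diagonal entries are unchanged
+*/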
+
+/* Subroutine */ int ssyr2k_(char *uplo, char *trans, integer *n, integer *k,
+ real *alpha, real *a, integer *lda, real *b, integer *ldb, real *beta,
+ real *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static real temp1, temp2;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SSYR2K performs one of the symmetric rank 2k operations
+
+ C := alpha*A*B' + alpha*B*A' + beta*C,
+
+ or
+
+ C := alpha*A'*B + alpha*B'*A + beta*C,
+
+ where alpha and beta are scalars, C is an n by n symmetric matrix
+ and A and B are n by k matrices in the first case and k by n
+ matrices in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' +
+ beta*C.
+
+ TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A +
+ beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A +
+ beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrices A and B, and on entry with
+ TRANS = 'T' or 't' or 'C' or 'c', K specifies the number
+ of rows of the matrices A and B. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ B - REAL array of DIMENSION ( LDB, kb ), where kb is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array B must contain the matrix B, otherwise
+ the leading k by n part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDB must be at least max( 1, n ), otherwise LDB must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - REAL array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldb < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldc < max(1,*n)) {
+ info = 12;
+ }
+ if (info != 0) {
+ xerbla_("SSYR2K", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.f) {
+ if (upper) {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/* Form C := alpha*A*B' + alpha*B*A' + C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L90: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L100: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0.f || b[j + l * b_dim1] != 0.f)
+ {
+ temp1 = *alpha * b[j + l * b_dim1];
+ temp2 = *alpha * a[j + l * a_dim1];
+ i__3 = j;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] = c__[i__ + j * c_dim1] + a[
+ i__ + l * a_dim1] * temp1 + b[i__ + l *
+ b_dim1] * temp2;
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L140: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L150: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0.f || b[j + l * b_dim1] != 0.f)
+ {
+ temp1 = *alpha * b[j + l * b_dim1];
+ temp2 = *alpha * a[j + l * a_dim1];
+ i__3 = *n;
+ for (i__ = j; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] = c__[i__ + j * c_dim1] + a[
+ i__ + l * a_dim1] * temp1 + b[i__ + l *
+ b_dim1] * temp2;
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/* Form C := alpha*A'*B + alpha*B'*A + C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp1 = 0.f;
+ temp2 = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp1 += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+ temp2 += b[l + i__ * b_dim1] * a[l + j * a_dim1];
+/* L190: */
+ }
+ if (*beta == 0.f) {
+ c__[i__ + j * c_dim1] = *alpha * temp1 + *alpha *
+ temp2;
+ } else {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1]
+ + *alpha * temp1 + *alpha * temp2;
+ }
+/* L200: */
+ }
+/* L210: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ temp1 = 0.f;
+ temp2 = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp1 += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+ temp2 += b[l + i__ * b_dim1] * a[l + j * a_dim1];
+/* L220: */
+ }
+ if (*beta == 0.f) {
+ c__[i__ + j * c_dim1] = *alpha * temp1 + *alpha *
+ temp2;
+ } else {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1]
+ + *alpha * temp1 + *alpha * temp2;
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SSYR2K. */
+
+} /* ssyr2k_ */
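+
+/*
+   Minimal usage sketch for ssyr2k_ with TRANS = 'N', forming the upper
+   triangle of C := alpha*A*B' + alpha*B*A' + beta*C for n = 2, k = 1
+   (usual f2c typedefs assumed):
+
+       integer n = 2, k = 1, lda = 2, ldb = 2, ldc = 2;
+       real alpha = 1.f, beta = 0.f;
+       real a[2] = { 1.f, 2.f };             A is 2 by 1
+       real b[2] = { 3.f, 4.f };             B is 2 by 1
+       real c[4] = { 0.f, 0.f, 0.f, 0.f };
+       ssyr2k_("U", "N", &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+       on return c(1,1)=6, c(1,2)=10, c(2,2)=16
+*/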
+
+/* Subroutine */ int ssyrk_(char *uplo, char *trans, integer *n, integer *k,
+ real *alpha, real *a, integer *lda, real *beta, real *c__, integer *
+ ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static real temp;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ SSYRK performs one of the symmetric rank k operations
+
+ C := alpha*A*A' + beta*C,
+
+ or
+
+ C := alpha*A'*A + beta*C,
+
+ where alpha and beta are scalars, C is an n by n symmetric matrix
+ and A is an n by k matrix in the first case and a k by n matrix
+ in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*A' + beta*C.
+
+ TRANS = 'T' or 't' C := alpha*A'*A + beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*A'*A + beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrix A, and on entry with
+ TRANS = 'T' or 't' or 'C' or 'c', K specifies the number
+ of rows of the matrix A. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - REAL .
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - REAL array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the symmetric matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the symmetric matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldc < max(1,*n)) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("SSYRK ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.f) {
+ if (upper) {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/* Form C := alpha*A*A' + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L90: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L100: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0.f) {
+ temp = *alpha * a[j + l * a_dim1];
+ i__3 = j;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.f) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = 0.f;
+/* L140: */
+ }
+ } else if (*beta != 1.f) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L150: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ if (a[j + l * a_dim1] != 0.f) {
+ temp = *alpha * a[j + l * a_dim1];
+ i__3 = *n;
+ for (i__ = j; i__ <= i__3; ++i__) {
+ c__[i__ + j * c_dim1] += temp * a[i__ + l *
+ a_dim1];
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/* Form C := alpha*A'*A + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * a[l + j * a_dim1];
+/* L190: */
+ }
+ if (*beta == 0.f) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L200: */
+ }
+/* L210: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ temp = 0.f;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ temp += a[l + i__ * a_dim1] * a[l + j * a_dim1];
+/* L220: */
+ }
+ if (*beta == 0.f) {
+ c__[i__ + j * c_dim1] = *alpha * temp;
+ } else {
+ c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+ i__ + j * c_dim1];
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SSYRK . */
+
+} /* ssyrk_ */
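+
+/*
+   Minimal usage sketch for ssyrk_ with TRANS = 'N', forming the upper
+   triangle of C := alpha*A*A' + beta*C for n = 2, k = 1 (usual f2c
+   typedefs assumed):
+
+       integer n = 2, k = 1, lda = 2, ldc = 2;
+       real alpha = 1.f, beta = 0.f;
+       real a[2] = { 1.f, 2.f };             A is 2 by 1
+       real c[4] = { 0.f, 0.f, 0.f, 0.f };
+       ssyrk_("U", "N", &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+       on return c(1,1)=1, c(1,2)=2, c(2,2)=4
+*/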
+
+/* Subroutine */ int strmm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, real *alpha, real *a, integer *lda, real *b,
+ integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static real temp;
+ static logical lside;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ Purpose
+ =======
+
+ STRMM performs one of the matrix-matrix operations
+
+ B := alpha*op( A )*B, or B := alpha*B*op( A ),
+
+ where alpha is a scalar, B is an m by n matrix, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A'.
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) multiplies B from
+ the left or right as follows:
+
+ SIDE = 'L' or 'l' B := alpha*op( A )*B.
+
+ SIDE = 'R' or 'r' B := alpha*B*op( A ).
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = A'.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - REAL array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the matrix B, and on exit is overwritten by the
+ transformed matrix.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("STRMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*A*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ if (b[k + j * b_dim1] != 0.f) {
+ temp = *alpha * b[k + j * b_dim1];
+ i__3 = k - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] += temp * a[i__ + k *
+ a_dim1];
+/* L30: */
+ }
+ if (nounit) {
+ temp *= a[k + k * a_dim1];
+ }
+ b[k + j * b_dim1] = temp;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (k = *m; k >= 1; --k) {
+ if (b[k + j * b_dim1] != 0.f) {
+ temp = *alpha * b[k + j * b_dim1];
+ b[k + j * b_dim1] = temp;
+ if (nounit) {
+ b[k + j * b_dim1] *= a[k + k * a_dim1];
+ }
+ i__2 = *m;
+ for (i__ = k + 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] += temp * a[i__ + k *
+ a_dim1];
+/* L60: */
+ }
+ }
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*A'*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ temp = b[i__ + j * b_dim1];
+ if (nounit) {
+ temp *= a[i__ + i__ * a_dim1];
+ }
+ i__2 = i__ - 1;
+ for (k = 1; k <= i__2; ++k) {
+ temp += a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L90: */
+ }
+ b[i__ + j * b_dim1] = *alpha * temp;
+/* L100: */
+ }
+/* L110: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = b[i__ + j * b_dim1];
+ if (nounit) {
+ temp *= a[i__ + i__ * a_dim1];
+ }
+ i__3 = *m;
+ for (k = i__ + 1; k <= i__3; ++k) {
+ temp += a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L120: */
+ }
+ b[i__ + j * b_dim1] = *alpha * temp;
+/* L130: */
+ }
+/* L140: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*A. */
+
+ if (upper) {
+ for (j = *n; j >= 1; --j) {
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L150: */
+ }
+ i__1 = j - 1;
+ for (k = 1; k <= i__1; ++k) {
+ if (a[k + j * a_dim1] != 0.f) {
+ temp = *alpha * a[k + j * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L190: */
+ }
+ i__2 = *n;
+ for (k = j + 1; k <= i__2; ++k) {
+ if (a[k + j * a_dim1] != 0.f) {
+ temp = *alpha * a[k + j * a_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L200: */
+ }
+ }
+/* L210: */
+ }
+/* L220: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*B*A'. */
+
+ if (upper) {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k - 1;
+ for (j = 1; j <= i__2; ++j) {
+ if (a[j + k * a_dim1] != 0.f) {
+ temp = *alpha * a[j + k * a_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[k + k * a_dim1];
+ }
+ if (temp != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ } else {
+ for (k = *n; k >= 1; --k) {
+ i__1 = *n;
+ for (j = k + 1; j <= i__1; ++j) {
+ if (a[j + k * a_dim1] != 0.f) {
+ temp = *alpha * a[j + k * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] += temp * b[i__ + k *
+ b_dim1];
+/* L270: */
+ }
+ }
+/* L280: */
+ }
+ temp = *alpha;
+ if (nounit) {
+ temp *= a[k + k * a_dim1];
+ }
+ if (temp != 1.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L290: */
+ }
+ }
+/* L300: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of STRMM . */
+
+} /* strmm_ */
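+
+/*
+   Minimal usage sketch for strmm_ with SIDE = 'L', computing B := alpha*A*B
+   for an upper triangular, non-unit A (usual f2c typedefs assumed):
+
+       integer m = 2, n = 1, lda = 2, ldb = 2;
+       real alpha = 1.f;
+       real a[4] = { 2.f, 0.f, 1.f, 3.f };   a(1,1)=2, a(1,2)=1, a(2,2)=3
+       real b[2] = { 1.f, 1.f };             B is 2 by 1
+       strmm_("L", "U", "N", "N", &m, &n, &alpha, a, &lda, b, &ldb);
+       on return b holds { 3.f, 3.f }
+*/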
+
+/* Subroutine */ int strmv_(char *uplo, char *trans, char *diag, integer *n,
+ real *a, integer *lda, real *x, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jx, kx, info;
+ static real temp;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ Purpose
+ =======
+
+ STRMV performs one of the matrix-vector operations
+
+ x := A*x, or x := A'*x,
+
+ where x is an n element vector and A is an n by n unit, or non-unit,
+ upper or lower triangular matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' x := A*x.
+
+ TRANS = 'T' or 't' x := A'*x.
+
+ TRANS = 'C' or 'c' x := A'*x.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit
+ triangular as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - REAL array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x. On exit, X is overwritten with the
+           transformed vector x.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*lda < max(1,*n)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ }
+ if (info != 0) {
+ xerbla_("STRMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ nounit = lsame_(diag, "N");
+
+/*
+ Set up the start point in X if the increment is not unity. This
+ will be ( N - 1 )*INCX too small for descending loops.
+*/
+
+ if (*incx <= 0) {
+ kx = 1 - (*n - 1) * *incx;
+ } else if (*incx != 1) {
+ kx = 1;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (lsame_(trans, "N")) {
+
+/* Form x := A*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[j] != 0.f) {
+ temp = x[j];
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ x[i__] += temp * a[i__ + j * a_dim1];
+/* L10: */
+ }
+ if (nounit) {
+ x[j] *= a[j + j * a_dim1];
+ }
+ }
+/* L20: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (x[jx] != 0.f) {
+ temp = x[jx];
+ ix = kx;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ x[ix] += temp * a[i__ + j * a_dim1];
+ ix += *incx;
+/* L30: */
+ }
+ if (nounit) {
+ x[jx] *= a[j + j * a_dim1];
+ }
+ }
+ jx += *incx;
+/* L40: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ if (x[j] != 0.f) {
+ temp = x[j];
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ x[i__] += temp * a[i__ + j * a_dim1];
+/* L50: */
+ }
+ if (nounit) {
+ x[j] *= a[j + j * a_dim1];
+ }
+ }
+/* L60: */
+ }
+ } else {
+ kx += (*n - 1) * *incx;
+ jx = kx;
+ for (j = *n; j >= 1; --j) {
+ if (x[jx] != 0.f) {
+ temp = x[jx];
+ ix = kx;
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ x[ix] += temp * a[i__ + j * a_dim1];
+ ix -= *incx;
+/* L70: */
+ }
+ if (nounit) {
+ x[jx] *= a[j + j * a_dim1];
+ }
+ }
+ jx -= *incx;
+/* L80: */
+ }
+ }
+ }
+ } else {
+
+/* Form x := A'*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ temp = x[j];
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ temp += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+ }
+ x[j] = temp;
+/* L100: */
+ }
+ } else {
+ jx = kx + (*n - 1) * *incx;
+ for (j = *n; j >= 1; --j) {
+ temp = x[jx];
+ ix = jx;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ temp += a[i__ + j * a_dim1] * x[ix];
+/* L110: */
+ }
+ x[jx] = temp;
+ jx -= *incx;
+/* L120: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = x[j];
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ temp += a[i__ + j * a_dim1] * x[i__];
+/* L130: */
+ }
+ x[j] = temp;
+/* L140: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = x[jx];
+ ix = jx;
+ if (nounit) {
+ temp *= a[j + j * a_dim1];
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ temp += a[i__ + j * a_dim1] * x[ix];
+/* L150: */
+ }
+ x[jx] = temp;
+ jx += *incx;
+/* L160: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of STRMV . */
+
+} /* strmv_ */
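+
+/*
+   Minimal usage sketch for strmv_, computing x := A*x for an upper
+   triangular, non-unit A (usual f2c typedefs assumed):
+
+       integer n = 2, lda = 2, incx = 1;
+       real a[4] = { 2.f, 0.f, 1.f, 3.f };   a(1,1)=2, a(1,2)=1, a(2,2)=3
+       real x[2] = { 1.f, 1.f };
+       strmv_("U", "N", "N", &n, a, &lda, x, &incx);
+       on return x holds { 3.f, 3.f }
+*/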
+
+/* Subroutine */ int strsm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, real *alpha, real *a, integer *lda, real *b,
+ integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static real temp;
+ static logical lside;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ Purpose
+ =======
+
+ STRSM solves one of the matrix equations
+
+ op( A )*X = alpha*B, or X*op( A ) = alpha*B,
+
+ where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A'.
+
+ The matrix X is overwritten on B.
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) appears on the left
+ or right of X as follows:
+
+ SIDE = 'L' or 'l' op( A )*X = alpha*B.
+
+ SIDE = 'R' or 'r' X*op( A ) = alpha*B.
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = A'.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - REAL .
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - REAL array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - REAL array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the right-hand side matrix B, and on exit is
+ overwritten by the solution matrix X.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("STRSM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.f) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*inv( A )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*alpha != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L30: */
+ }
+ }
+ for (k = *m; k >= 1; --k) {
+ if (b[k + j * b_dim1] != 0.f) {
+ if (nounit) {
+ b[k + j * b_dim1] /= a[k + k * a_dim1];
+ }
+ i__2 = k - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] -= b[k + j * b_dim1] * a[
+ i__ + k * a_dim1];
+/* L40: */
+ }
+ }
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*alpha != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L70: */
+ }
+ }
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ if (b[k + j * b_dim1] != 0.f) {
+ if (nounit) {
+ b[k + j * b_dim1] /= a[k + k * a_dim1];
+ }
+ i__3 = *m;
+ for (i__ = k + 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] -= b[k + j * b_dim1] * a[
+ i__ + k * a_dim1];
+/* L80: */
+ }
+ }
+/* L90: */
+ }
+/* L100: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*inv( A' )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = *alpha * b[i__ + j * b_dim1];
+ i__3 = i__ - 1;
+ for (k = 1; k <= i__3; ++k) {
+ temp -= a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L110: */
+ }
+ if (nounit) {
+ temp /= a[i__ + i__ * a_dim1];
+ }
+ b[i__ + j * b_dim1] = temp;
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ temp = *alpha * b[i__ + j * b_dim1];
+ i__2 = *m;
+ for (k = i__ + 1; k <= i__2; ++k) {
+ temp -= a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L140: */
+ }
+ if (nounit) {
+ temp /= a[i__ + i__ * a_dim1];
+ }
+ b[i__ + j * b_dim1] = temp;
+/* L150: */
+ }
+/* L160: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*inv( A ). */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*alpha != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L170: */
+ }
+ }
+ i__2 = j - 1;
+ for (k = 1; k <= i__2; ++k) {
+ if (a[k + j * a_dim1] != 0.f) {
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] -= a[k + j * a_dim1] * b[
+ i__ + k * b_dim1];
+/* L180: */
+ }
+ }
+/* L190: */
+ }
+ if (nounit) {
+ temp = 1.f / a[j + j * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L200: */
+ }
+ }
+/* L210: */
+ }
+ } else {
+ for (j = *n; j >= 1; --j) {
+ if (*alpha != 1.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+ ;
+/* L220: */
+ }
+ }
+ i__1 = *n;
+ for (k = j + 1; k <= i__1; ++k) {
+ if (a[k + j * a_dim1] != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] -= a[k + j * a_dim1] * b[
+ i__ + k * b_dim1];
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ if (nounit) {
+ temp = 1.f / a[j + j * a_dim1];
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*B*inv( A' ). */
+
+ if (upper) {
+ for (k = *n; k >= 1; --k) {
+ if (nounit) {
+ temp = 1.f / a[k + k * a_dim1];
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L270: */
+ }
+ }
+ i__1 = k - 1;
+ for (j = 1; j <= i__1; ++j) {
+ if (a[j + k * a_dim1] != 0.f) {
+ temp = a[j + k * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] -= temp * b[i__ + k *
+ b_dim1];
+/* L280: */
+ }
+ }
+/* L290: */
+ }
+ if (*alpha != 1.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b[i__ + k * b_dim1] = *alpha * b[i__ + k * b_dim1]
+ ;
+/* L300: */
+ }
+ }
+/* L310: */
+ }
+ } else {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ if (nounit) {
+ temp = 1.f / a[k + k * a_dim1];
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L320: */
+ }
+ }
+ i__2 = *n;
+ for (j = k + 1; j <= i__2; ++j) {
+ if (a[j + k * a_dim1] != 0.f) {
+ temp = a[j + k * a_dim1];
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ b[i__ + j * b_dim1] -= temp * b[i__ + k *
+ b_dim1];
+/* L330: */
+ }
+ }
+/* L340: */
+ }
+ if (*alpha != 1.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + k * b_dim1] = *alpha * b[i__ + k * b_dim1]
+ ;
+/* L350: */
+ }
+ }
+/* L360: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of STRSM . */
+
+} /* strsm_ */
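+
+/*
+   Minimal usage sketch for strsm_ with SIDE = 'L', solving A*X = alpha*B
+   for an upper triangular, non-unit A and overwriting B with the solution
+   X (usual f2c typedefs assumed):
+
+       integer m = 2, n = 1, lda = 2, ldb = 2;
+       real alpha = 1.f;
+       real a[4] = { 2.f, 0.f, 1.f, 4.f };   a(1,1)=2, a(1,2)=1, a(2,2)=4
+       real b[2] = { 3.f, 4.f };             right-hand side, 2 by 1
+       strsm_("L", "U", "N", "N", &m, &n, &alpha, a, &lda, b, &ldb);
+       on return b holds the solution { 1.f, 1.f }
+*/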
+
+/* Subroutine */ int zaxpy_(integer *n, doublecomplex *za, doublecomplex *zx,
+ integer *incx, doublecomplex *zy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ extern doublereal dcabs1_(doublecomplex *);
+
+
+/*
+ Purpose
+ =======
+
+    ZAXPY computes a constant times a vector plus a vector: zy := za*zx + zy.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zy;
+ --zx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (dcabs1_(za) == 0.) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = iy;
+ i__4 = ix;
+ z__2.r = za->r * zx[i__4].r - za->i * zx[i__4].i, z__2.i = za->r * zx[
+ i__4].i + za->i * zx[i__4].r;
+ z__1.r = zy[i__3].r + z__2.r, z__1.i = zy[i__3].i + z__2.i;
+ zy[i__2].r = z__1.r, zy[i__2].i = z__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ i__4 = i__;
+ z__2.r = za->r * zx[i__4].r - za->i * zx[i__4].i, z__2.i = za->r * zx[
+ i__4].i + za->i * zx[i__4].r;
+ z__1.r = zy[i__3].r + z__2.r, z__1.i = zy[i__3].i + z__2.i;
+ zy[i__2].r = z__1.r, zy[i__2].i = z__1.i;
+/* L30: */
+ }
+ return 0;
+} /* zaxpy_ */
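+
+/*
+   Minimal usage sketch for zaxpy_, computing zy := za*zx + zy (usual f2c
+   typedefs assumed; doublecomplex is a struct of two doublereal members
+   r and i):
+
+       integer n = 1, incx = 1, incy = 1;
+       doublecomplex za = { 0., 1. };          za = i
+       doublecomplex zx[1] = { { 1., 2. } };   zx[0] = 1 + 2i
+       doublecomplex zy[1] = { { 3., 4. } };   zy[0] = 3 + 4i
+       zaxpy_(&n, &za, zx, &incx, zy, &incy);
+       on return zy[0] = 1 + 5i
+*/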
+
+/* Subroutine */ int zcopy_(integer *n, doublecomplex *zx, integer *incx,
+ doublecomplex *zy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+
+
+/*
+ Purpose
+ =======
+
+ ZCOPY copies a vector, x, to a vector, y.
+
+ Further Details
+ ===============
+
+ jack dongarra, linpack, 4/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zy;
+ --zx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = ix;
+ zy[i__2].r = zx[i__3].r, zy[i__2].i = zx[i__3].i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ zy[i__2].r = zx[i__3].r, zy[i__2].i = zx[i__3].i;
+/* L30: */
+ }
+ return 0;
+} /* zcopy_ */
+
+/* Double Complex */ VOID zdotc_(doublecomplex * ret_val, integer *n,
+ doublecomplex *zx, integer *incx, doublecomplex *zy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static doublecomplex ztemp;
+
+
+/*
+ Purpose
+ =======
+
+    ZDOTC forms the dot product of two vectors, conjugating the first vector.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zy;
+ --zx;
+
+ /* Function Body */
+ ztemp.r = 0., ztemp.i = 0.;
+ ret_val->r = 0., ret_val->i = 0.;
+ if (*n <= 0) {
+ return ;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d_cnjg(&z__3, &zx[ix]);
+ i__2 = iy;
+ z__2.r = z__3.r * zy[i__2].r - z__3.i * zy[i__2].i, z__2.i = z__3.r *
+ zy[i__2].i + z__3.i * zy[i__2].r;
+ z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i;
+ ztemp.r = z__1.r, ztemp.i = z__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ ret_val->r = ztemp.r, ret_val->i = ztemp.i;
+ return ;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d_cnjg(&z__3, &zx[i__]);
+ i__2 = i__;
+ z__2.r = z__3.r * zy[i__2].r - z__3.i * zy[i__2].i, z__2.i = z__3.r *
+ zy[i__2].i + z__3.i * zy[i__2].r;
+ z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i;
+ ztemp.r = z__1.r, ztemp.i = z__1.i;
+/* L30: */
+ }
+ ret_val->r = ztemp.r, ret_val->i = ztemp.i;
+ return ;
+} /* zdotc_ */
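+
+/*
+   Usage sketch for zdotc_, which returns conjg(zx')*zy through its first
+   argument (the f2c convention for COMPLEX*16 valued functions).
+   Illustrative values only:
+
+       integer n = 2, inc = 1;
+       doublecomplex zx[2] = {{1., 1.}, {2., 0.}};
+       doublecomplex zy[2] = {{1., 1.}, {0., 1.}};
+       doublecomplex res;
+       zdotc_(&res, &n, zx, &inc, zy, &inc);
+
+   Here res = conjg(1+1i)*(1+1i) + conjg(2)*(1i) = 2 + 2i.
+*/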
+
+/* Double Complex */ VOID zdotu_(doublecomplex * ret_val, integer *n,
+ doublecomplex *zx, integer *incx, doublecomplex *zy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static doublecomplex ztemp;
+
+
+/*
+ Purpose
+ =======
+
+       ZDOTU forms the dot product of two vectors without conjugation.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zy;
+ --zx;
+
+ /* Function Body */
+ ztemp.r = 0., ztemp.i = 0.;
+ ret_val->r = 0., ret_val->i = 0.;
+ if (*n <= 0) {
+ return ;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments
+ not equal to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ i__3 = iy;
+ z__2.r = zx[i__2].r * zy[i__3].r - zx[i__2].i * zy[i__3].i, z__2.i =
+ zx[i__2].r * zy[i__3].i + zx[i__2].i * zy[i__3].r;
+ z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i;
+ ztemp.r = z__1.r, ztemp.i = z__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ ret_val->r = ztemp.r, ret_val->i = ztemp.i;
+ return ;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = zx[i__2].r * zy[i__3].r - zx[i__2].i * zy[i__3].i, z__2.i =
+ zx[i__2].r * zy[i__3].i + zx[i__2].i * zy[i__3].r;
+ z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i;
+ ztemp.r = z__1.r, ztemp.i = z__1.i;
+/* L30: */
+ }
+ ret_val->r = ztemp.r, ret_val->i = ztemp.i;
+ return ;
+} /* zdotu_ */
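+
+/*
+   Usage sketch for zdotu_, the unconjugated counterpart of zdotc_
+   (illustrative values, same calling convention):
+
+       integer n = 2, inc = 1;
+       doublecomplex zx[2] = {{1., 1.}, {2., 0.}};
+       doublecomplex zy[2] = {{1., 1.}, {0., 1.}};
+       doublecomplex res;
+       zdotu_(&res, &n, zx, &inc, zy, &inc);
+
+   Here res = (1+1i)*(1+1i) + 2*(1i) = 0 + 4i.
+*/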
+
+/* Subroutine */ int zdrot_(integer *n, doublecomplex *cx, integer *incx,
+ doublecomplex *cy, integer *incy, doublereal *c__, doublereal *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static doublecomplex ctemp;
+
+
+/*
+ Purpose
+ =======
+
+ Applies a plane rotation, where the cos and sin (c and s) are real
+ and the vectors cx and cy are complex.
+ jack dongarra, linpack, 3/11/78.
+
+ Arguments
+ ==========
+
+ N (input) INTEGER
+ On entry, N specifies the order of the vectors cx and cy.
+ N must be at least zero.
+ Unchanged on exit.
+
+ CX (input) COMPLEX*16 array, dimension at least
+ ( 1 + ( N - 1 )*abs( INCX ) ).
+ Before entry, the incremented array CX must contain the n
+ element vector cx. On exit, CX is overwritten by the updated
+ vector cx.
+
+ INCX (input) INTEGER
+ On entry, INCX specifies the increment for the elements of
+ CX. INCX must not be zero.
+ Unchanged on exit.
+
+ CY (input) COMPLEX*16 array, dimension at least
+ ( 1 + ( N - 1 )*abs( INCY ) ).
+ Before entry, the incremented array CY must contain the n
+ element vector cy. On exit, CY is overwritten by the updated
+ vector cy.
+
+ INCY (input) INTEGER
+ On entry, INCY specifies the increment for the elements of
+ CY. INCY must not be zero.
+ Unchanged on exit.
+
+ C (input) DOUBLE PRECISION
+ On entry, C specifies the cosine, cos.
+ Unchanged on exit.
+
+ S (input) DOUBLE PRECISION
+ On entry, S specifies the sine, sin.
+ Unchanged on exit.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ z__3.r = *s * cy[i__3].r, z__3.i = *s * cy[i__3].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ ctemp.r = z__1.r, ctemp.i = z__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ i__4 = ix;
+ z__3.r = *s * cx[i__4].r, z__3.i = *s * cx[i__4].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = ix;
+ cx[i__2].r = ctemp.r, cx[i__2].i = ctemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ z__3.r = *s * cy[i__3].r, z__3.i = *s * cy[i__3].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ ctemp.r = z__1.r, ctemp.i = z__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ i__4 = i__;
+ z__3.r = *s * cx[i__4].r, z__3.i = *s * cx[i__4].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = i__;
+ cx[i__2].r = ctemp.r, cx[i__2].i = ctemp.i;
+/* L30: */
+ }
+ return 0;
+} /* zdrot_ */
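+
+/*
+   Usage sketch for zdrot_ (cx := c*cx + s*cy, cy := c*cy - s*cx).  With
+   c = 0 and s = 1 the rotation exchanges the vectors up to sign, which
+   makes the effect easy to check (illustrative values only):
+
+       integer n = 2, inc = 1;
+       doublereal c = 0., s = 1.;
+       doublecomplex cx[2] = {{1., 0.}, {0., 1.}};
+       doublecomplex cy[2] = {{5., 0.}, {6., 0.}};
+       zdrot_(&n, cx, &inc, cy, &inc, &c, &s);
+
+   Afterwards cx holds {5, 6} and cy holds {-1, -1i}.
+*/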
+
+/* Subroutine */ int zdscal_(integer *n, doublereal *da, doublecomplex *zx,
+ integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, ix;
+
+
+/*
+ Purpose
+ =======
+
+ ZDSCAL scales a vector by a constant.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zx;
+
+ /* Function Body */
+ if (*n <= 0 || *incx <= 0) {
+ return 0;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ z__2.r = *da, z__2.i = 0.;
+ i__3 = ix;
+ z__1.r = z__2.r * zx[i__3].r - z__2.i * zx[i__3].i, z__1.i = z__2.r *
+ zx[i__3].i + z__2.i * zx[i__3].r;
+ zx[i__2].r = z__1.r, zx[i__2].i = z__1.i;
+ ix += *incx;
+/* L10: */
+ }
+ return 0;
+
+/* code for increment equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ z__2.r = *da, z__2.i = 0.;
+ i__3 = i__;
+ z__1.r = z__2.r * zx[i__3].r - z__2.i * zx[i__3].i, z__1.i = z__2.r *
+ zx[i__3].i + z__2.i * zx[i__3].r;
+ zx[i__2].r = z__1.r, zx[i__2].i = z__1.i;
+/* L30: */
+ }
+ return 0;
+} /* zdscal_ */
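+
+/*
+   Usage sketch for zdscal_ (zx := da*zx with a real scalar da), using
+   illustrative values:
+
+       integer n = 2, inc = 1;
+       doublereal da = 0.5;
+       doublecomplex zx[2] = {{2., 4.}, {6., 8.}};
+       zdscal_(&n, &da, zx, &inc);
+
+   Afterwards zx holds {1+2i, 3+4i}.  Note the early return above when
+   incx <= 0.
+*/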
+
+/* Subroutine */ int zgemm_(char *transa, char *transb, integer *m, integer *
+ n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda,
+ doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *
+ c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5, i__6;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static logical nota, notb;
+ static doublecomplex temp;
+ static logical conja, conjb;
+ static integer ncola;
+ extern logical lsame_(char *, char *);
+ static integer nrowa, nrowb;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZGEMM performs one of the matrix-matrix operations
+
+ C := alpha*op( A )*op( B ) + beta*C,
+
+ where op( X ) is one of
+
+ op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ),
+
+ alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n', op( A ) = A.
+
+ TRANSA = 'T' or 't', op( A ) = A'.
+
+ TRANSA = 'C' or 'c', op( A ) = conjg( A' ).
+
+ Unchanged on exit.
+
+ TRANSB - CHARACTER*1.
+ On entry, TRANSB specifies the form of op( B ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSB = 'N' or 'n', op( B ) = B.
+
+ TRANSB = 'T' or 't', op( B ) = B'.
+
+ TRANSB = 'C' or 'c', op( B ) = conjg( B' ).
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix
+ op( A ) and of the matrix C. M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix
+ op( B ) and the number of columns of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry, K specifies the number of columns of the matrix
+ op( A ) and the number of rows of the matrix op( B ). K must
+ be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANSA = 'N' or 'n', and is m otherwise.
+ Before entry with TRANSA = 'N' or 'n', the leading m by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by m part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANSA = 'N' or 'n' then
+ LDA must be at least max( 1, m ), otherwise LDA must be at
+ least max( 1, k ).
+ Unchanged on exit.
+
+ B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is
+ n when TRANSB = 'N' or 'n', and is k otherwise.
+ Before entry with TRANSB = 'N' or 'n', the leading k by n
+ part of the array B must contain the matrix B, otherwise
+ the leading n by k part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANSB = 'N' or 'n' then
+ LDB must be at least max( 1, k ), otherwise LDB must be at
+ least max( 1, n ).
+ Unchanged on exit.
+
+ BETA - COMPLEX*16 .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then C need not be set on input.
+ Unchanged on exit.
+
+ C - COMPLEX*16 array of DIMENSION ( LDC, n ).
+ Before entry, the leading m by n part of the array C must
+ contain the matrix C, except when beta is zero, in which
+ case C need not be set on entry.
+ On exit, the array C is overwritten by the m by n matrix
+ ( alpha*op( A )*op( B ) + beta*C ).
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Set NOTA and NOTB as true if A and B respectively are not
+ conjugated or transposed, set CONJA and CONJB as true if A and
+ B respectively are to be transposed but not conjugated and set
+ NROWA, NCOLA and NROWB as the number of rows and columns of A
+ and the number of rows of B respectively.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ nota = lsame_(transa, "N");
+ notb = lsame_(transb, "N");
+ conja = lsame_(transa, "C");
+ conjb = lsame_(transb, "C");
+ if (nota) {
+ nrowa = *m;
+ ncola = *k;
+ } else {
+ nrowa = *k;
+ ncola = *m;
+ }
+ if (notb) {
+ nrowb = *k;
+ } else {
+ nrowb = *n;
+ }
+
+/* Test the input parameters. */
+
+ info = 0;
+ if (! nota && ! conja && ! lsame_(transa, "T")) {
+ info = 1;
+ } else if (! notb && ! conjb && ! lsame_(transb, "T")) {
+ info = 2;
+ } else if (*m < 0) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*k < 0) {
+ info = 5;
+ } else if (*lda < max(1,nrowa)) {
+ info = 8;
+ } else if (*ldb < max(1,nrowb)) {
+ info = 10;
+ } else if (*ldc < max(1,*m)) {
+ info = 13;
+ }
+ if (info != 0) {
+ xerbla_("ZGEMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || (alpha->r == 0. && alpha->i == 0. || *k == 0) &&
+ (beta->r == 1. && beta->i == 0.)) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0. && alpha->i == 0.) {
+ if (beta->r == 0. && beta->i == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4].i,
+ z__1.i = beta->r * c__[i__4].i + beta->i * c__[
+ i__4].r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (notb) {
+ if (nota) {
+
+/* Form C := alpha*A*B + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (beta->r == 0. && beta->i == 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L50: */
+ }
+ } else if (beta->r != 1. || beta->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__1.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L60: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = l + j * b_dim1;
+ if (b[i__3].r != 0. || b[i__3].i != 0.) {
+ i__3 = l + j * b_dim1;
+ z__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3].i,
+ z__1.i = alpha->r * b[i__3].i + alpha->i * b[
+ i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ z__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ z__1.r = c__[i__5].r + z__2.r, z__1.i = c__[i__5]
+ .i + z__2.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+/* L90: */
+ }
+ } else if (conja) {
+
+/* Form C := alpha*conjg( A' )*B + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * b_dim1;
+ z__2.r = z__3.r * b[i__4].r - z__3.i * b[i__4].i,
+ z__2.i = z__3.r * b[i__4].i + z__3.i * b[i__4]
+ .r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+ }
+ if (beta->r == 0. && beta->i == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L110: */
+ }
+/* L120: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + i__ * a_dim1;
+ i__5 = l + j * b_dim1;
+ z__2.r = a[i__4].r * b[i__5].r - a[i__4].i * b[i__5]
+ .i, z__2.i = a[i__4].r * b[i__5].i + a[i__4]
+ .i * b[i__5].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L130: */
+ }
+ if (beta->r == 0. && beta->i == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L140: */
+ }
+/* L150: */
+ }
+ }
+ } else if (nota) {
+ if (conjb) {
+
+/* Form C := alpha*A*conjg( B' ) + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (beta->r == 0. && beta->i == 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L160: */
+ }
+ } else if (beta->r != 1. || beta->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__1.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L170: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * b_dim1;
+ if (b[i__3].r != 0. || b[i__3].i != 0.) {
+ d_cnjg(&z__2, &b[j + l * b_dim1]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i,
+ z__1.i = alpha->r * z__2.i + alpha->i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ z__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ z__1.r = c__[i__5].r + z__2.r, z__1.i = c__[i__5]
+ .i + z__2.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L180: */
+ }
+ }
+/* L190: */
+ }
+/* L200: */
+ }
+ } else {
+
+/* Form C := alpha*A*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (beta->r == 0. && beta->i == 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L210: */
+ }
+ } else if (beta->r != 1. || beta->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__1.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L220: */
+ }
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * b_dim1;
+ if (b[i__3].r != 0. || b[i__3].i != 0.) {
+ i__3 = j + l * b_dim1;
+ z__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3].i,
+ z__1.i = alpha->r * b[i__3].i + alpha->i * b[
+ i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ z__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ z__1.r = c__[i__5].r + z__2.r, z__1.i = c__[i__5]
+ .i + z__2.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+/* L250: */
+ }
+ }
+ } else if (conja) {
+ if (conjb) {
+
+/* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ d_cnjg(&z__4, &b[j + l * b_dim1]);
+ z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, z__2.i =
+ z__3.r * z__4.i + z__3.i * z__4.r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L260: */
+ }
+ if (beta->r == 0. && beta->i == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L270: */
+ }
+/* L280: */
+ }
+ } else {
+
+/* Form C := alpha*conjg( A' )*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ i__4 = j + l * b_dim1;
+ z__2.r = z__3.r * b[i__4].r - z__3.i * b[i__4].i,
+ z__2.i = z__3.r * b[i__4].i + z__3.i * b[i__4]
+ .r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L290: */
+ }
+ if (beta->r == 0. && beta->i == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L300: */
+ }
+/* L310: */
+ }
+ }
+ } else {
+ if (conjb) {
+
+/* Form C := alpha*A'*conjg( B' ) + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + i__ * a_dim1;
+ d_cnjg(&z__3, &b[j + l * b_dim1]);
+ z__2.r = a[i__4].r * z__3.r - a[i__4].i * z__3.i,
+ z__2.i = a[i__4].r * z__3.i + a[i__4].i *
+ z__3.r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L320: */
+ }
+ if (beta->r == 0. && beta->i == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L330: */
+ }
+/* L340: */
+ }
+ } else {
+
+/* Form C := alpha*A'*B' + beta*C */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + i__ * a_dim1;
+ i__5 = j + l * b_dim1;
+ z__2.r = a[i__4].r * b[i__5].r - a[i__4].i * b[i__5]
+ .i, z__2.i = a[i__4].r * b[i__5].i + a[i__4]
+ .i * b[i__5].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L350: */
+ }
+ if (beta->r == 0. && beta->i == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__2.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = beta->r * c__[i__4].r - beta->i * c__[i__4]
+ .i, z__3.i = beta->r * c__[i__4].i + beta->i *
+ c__[i__4].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L360: */
+ }
+/* L370: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZGEMM . */
+
+} /* zgemm_ */
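+
+/*
+   Usage sketch for zgemm_ (C := alpha*op(A)*op(B) + beta*C).  All matrices
+   are stored column-major, as in Fortran.  With A the 2x2 identity,
+   alpha = 1 and beta = 0, C simply receives a copy of B (illustrative
+   values only, f2c typedefs assumed):
+
+       integer two = 2;
+       doublecomplex one = {1., 0.}, zero = {0., 0.};
+       doublecomplex a[4] = {{1.,0.}, {0.,0.}, {0.,0.}, {1.,0.}};
+       doublecomplex b[4] = {{1.,0.}, {2.,0.}, {3.,0.}, {4.,0.}};
+       doublecomplex c[4];
+       zgemm_("N", "N", &two, &two, &two, &one, a, &two, b, &two,
+              &zero, c, &two);
+
+   Because beta is zero, c need not be initialised; on return its columns
+   are {1, 2} and {3, 4}.  Passing "C" for transa or transb selects the
+   conjugate-transposed branches above.
+*/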
+
+/* Subroutine */ int zgemv_(char *trans, integer *m, integer *n,
+ doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *
+ x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *
+ incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static doublecomplex temp;
+ static integer lenx, leny;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj;
+
+
+/*
+ Purpose
+ =======
+
+ ZGEMV performs one of the matrix-vector operations
+
+ y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or
+
+ y := alpha*conjg( A' )*x + beta*y,
+
+ where alpha and beta are scalars, x and y are vectors and A is an
+ m by n matrix.
+
+ Arguments
+ ==========
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
+
+ TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
+
+ TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of DIMENSION at least
+ ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+ Before entry, the incremented array X must contain the
+ vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - COMPLEX*16 .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - COMPLEX*16 array of DIMENSION at least
+ ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+ and at least
+ ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+ Before entry with BETA non-zero, the incremented array Y
+ must contain the vector y. On exit, Y is overwritten by the
+ updated vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! lsame_(trans, "C")
+ ) {
+ info = 1;
+ } else if (*m < 0) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*lda < max(1,*m)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ } else if (*incy == 0) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("ZGEMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r ==
+ 1. && beta->i == 0.)) {
+ return 0;
+ }
+
+ noconj = lsame_(trans, "T");
+
+/*
+ Set LENX and LENY, the lengths of the vectors x and y, and set
+ up the start points in X and Y.
+*/
+
+ if (lsame_(trans, "N")) {
+ lenx = *n;
+ leny = *m;
+ } else {
+ lenx = *m;
+ leny = *n;
+ }
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (lenx - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (leny - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+
+ First form y := beta*y.
+*/
+
+ if (beta->r != 1. || beta->i != 0.) {
+ if (*incy == 1) {
+ if (beta->r == 0. && beta->i == 0.) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ y[i__2].r = 0., y[i__2].i = 0.;
+/* L10: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (beta->r == 0. && beta->i == 0.) {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ y[i__2].r = 0., y[i__2].i = 0.;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = leny;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = iy;
+ z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (alpha->r == 0. && alpha->i == 0.) {
+ return 0;
+ }
+ if (lsame_(trans, "N")) {
+
+/* Form y := alpha*A*x + y. */
+
+ jx = kx;
+ if (*incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0. || x[i__2].i != 0.) {
+ i__2 = jx;
+ z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__2.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i +
+ z__2.i;
+ y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+/* L50: */
+ }
+ }
+ jx += *incx;
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0. || x[i__2].i != 0.) {
+ i__2 = jx;
+ z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ z__1.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ iy = ky;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = iy;
+ i__4 = iy;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__2.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i +
+ z__2.i;
+ y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+ iy += *incy;
+/* L70: */
+ }
+ }
+ jx += *incx;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. */
+
+ jy = ky;
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp.r = 0., temp.i = 0.;
+ if (noconj) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__;
+ z__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[i__4]
+ .i, z__2.i = a[i__3].r * x[i__4].i + a[i__3]
+ .i * x[i__4].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L90: */
+ }
+ } else {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i,
+ z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3]
+ .r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+ }
+ }
+ i__2 = jy;
+ i__3 = jy;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i, z__2.i =
+ alpha->r * temp.i + alpha->i * temp.r;
+ z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ jy += *incy;
+/* L110: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp.r = 0., temp.i = 0.;
+ ix = kx;
+ if (noconj) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = ix;
+ z__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[i__4]
+ .i, z__2.i = a[i__3].r * x[i__4].i + a[i__3]
+ .i * x[i__4].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix += *incx;
+/* L120: */
+ }
+ } else {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i,
+ z__2.i = z__3.r * x[i__3].i + z__3.i * x[i__3]
+ .r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix += *incx;
+/* L130: */
+ }
+ }
+ i__2 = jy;
+ i__3 = jy;
+ z__2.r = alpha->r * temp.r - alpha->i * temp.i, z__2.i =
+ alpha->r * temp.i + alpha->i * temp.r;
+ z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ jy += *incy;
+/* L140: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZGEMV . */
+
+} /* zgemv_ */
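+
+/*
+   Usage sketch for zgemv_ (y := alpha*A*x + beta*y with TRANS = 'N').
+   A is stored column-major; here A = [1 2; 3 4], x = (1, 1), alpha = 1
+   and beta = 0 (illustrative values only):
+
+       integer m = 2, n = 2, inc = 1;
+       doublecomplex one = {1., 0.}, zero = {0., 0.};
+       doublecomplex a[4] = {{1.,0.}, {3.,0.}, {2.,0.}, {4.,0.}};
+       doublecomplex x[2] = {{1.,0.}, {1.,0.}};
+       doublecomplex y[2];
+       zgemv_("N", &m, &n, &one, a, &m, x, &inc, &zero, y, &inc);
+
+   On return y holds {3, 7}.  With TRANS = 'C' the routine accumulates
+   conjg(A')*x instead, via the branch above where noconj is false.
+*/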
+
+/* Subroutine */ int zgerc_(integer *m, integer *n, doublecomplex *alpha,
+ doublecomplex *x, integer *incx, doublecomplex *y, integer *incy,
+ doublecomplex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jy, kx, info;
+ static doublecomplex temp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZGERC performs the rank 1 operation
+
+ A := alpha*x*conjg( y' ) + A,
+
+ where alpha is a scalar, x is an m element vector, y is an n element
+ vector and A is an m by n matrix.
+
+ Arguments
+ ==========
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of dimension at least
+ ( 1 + ( m - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the m
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients. On exit, A is
+ overwritten by the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (*m < 0) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("ZGERC ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0.) {
+ return 0;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (*incy > 0) {
+ jy = 1;
+ } else {
+ jy = 1 - (*n - 1) * *incy;
+ }
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0. || y[i__2].i != 0.) {
+ d_cnjg(&z__2, &y[jy]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i =
+ alpha->r * z__2.i + alpha->i * z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ z__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, z__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ z__1.r = a[i__4].r + z__2.r, z__1.i = a[i__4].i + z__2.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L10: */
+ }
+ }
+ jy += *incy;
+/* L20: */
+ }
+ } else {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*m - 1) * *incx;
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0. || y[i__2].i != 0.) {
+ d_cnjg(&z__2, &y[jy]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i =
+ alpha->r * z__2.i + alpha->i * z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ z__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, z__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ z__1.r = a[i__4].r + z__2.r, z__1.i = a[i__4].i + z__2.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ ix += *incx;
+/* L30: */
+ }
+ }
+ jy += *incy;
+/* L40: */
+ }
+ }
+
+ return 0;
+
+/* End of ZGERC . */
+
+} /* zgerc_ */
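+
+/*
+   Usage sketch for zgerc_ (A := alpha*x*conjg(y') + A).  Starting from
+   A = 0 this builds the outer product of x with conjg(y) (illustrative
+   values only):
+
+       integer m = 2, n = 2, inc = 1;
+       doublecomplex one = {1., 0.};
+       doublecomplex x[2] = {{1., 0.}, {0., 1.}};
+       doublecomplex y[2] = {{1., 0.}, {0., 1.}};
+       doublecomplex a[4] = {{0.,0.}, {0.,0.}, {0.,0.}, {0.,0.}};
+       zgerc_(&m, &n, &one, x, &inc, y, &inc, a, &m);
+
+   On return a(i,j) = x(i)*conjg(y(j)), so the columns of a are {1, 1i}
+   and {-1i, 1}.
+*/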
+
+/* Subroutine */ int zgeru_(integer *m, integer *n, doublecomplex *alpha,
+ doublecomplex *x, integer *incx, doublecomplex *y, integer *incy,
+ doublecomplex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j, ix, jy, kx, info;
+ static doublecomplex temp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZGERU performs the rank 1 operation
+
+ A := alpha*x*y' + A,
+
+ where alpha is a scalar, x is an m element vector, y is an n element
+ vector and A is an m by n matrix.
+
+ Arguments
+ ==========
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of the matrix A.
+ M must be at least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of dimension at least
+ ( 1 + ( m - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the m
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry, the leading m by n part of the array A must
+ contain the matrix of coefficients. On exit, A is
+ overwritten by the updated matrix.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (*m < 0) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("ZGERU ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0.) {
+ return 0;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (*incy > 0) {
+ jy = 1;
+ } else {
+ jy = 1 - (*n - 1) * *incy;
+ }
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0. || y[i__2].i != 0.) {
+ i__2 = jy;
+ z__1.r = alpha->r * y[i__2].r - alpha->i * y[i__2].i, z__1.i =
+ alpha->r * y[i__2].i + alpha->i * y[i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ z__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, z__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ z__1.r = a[i__4].r + z__2.r, z__1.i = a[i__4].i + z__2.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L10: */
+ }
+ }
+ jy += *incy;
+/* L20: */
+ }
+ } else {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*m - 1) * *incx;
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jy;
+ if (y[i__2].r != 0. || y[i__2].i != 0.) {
+ i__2 = jy;
+ z__1.r = alpha->r * y[i__2].r - alpha->i * y[i__2].i, z__1.i =
+ alpha->r * y[i__2].i + alpha->i * y[i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix = kx;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ z__2.r = x[i__5].r * temp.r - x[i__5].i * temp.i, z__2.i =
+ x[i__5].r * temp.i + x[i__5].i * temp.r;
+ z__1.r = a[i__4].r + z__2.r, z__1.i = a[i__4].i + z__2.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ ix += *incx;
+/* L30: */
+ }
+ }
+ jy += *incy;
+/* L40: */
+ }
+ }
+
+ return 0;
+
+/* End of ZGERU . */
+
+} /* zgeru_ */
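+
+/*
+   Usage sketch for zgeru_, the unconjugated rank 1 update
+   A := alpha*x*y' + A, using the same illustrative data as for zgerc_:
+
+       integer m = 2, n = 2, inc = 1;
+       doublecomplex one = {1., 0.};
+       doublecomplex x[2] = {{1., 0.}, {0., 1.}};
+       doublecomplex y[2] = {{1., 0.}, {0., 1.}};
+       doublecomplex a[4] = {{0.,0.}, {0.,0.}, {0.,0.}, {0.,0.}};
+       zgeru_(&m, &n, &one, x, &inc, y, &inc, a, &m);
+
+   On return a(i,j) = x(i)*y(j), so the columns of a are {1, 1i} and
+   {1i, -1}.
+*/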
+
+/* Subroutine */ int zhemv_(char *uplo, integer *n, doublecomplex *alpha,
+ doublecomplex *a, integer *lda, doublecomplex *x, integer *incx,
+ doublecomplex *beta, doublecomplex *y, integer *incy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static doublecomplex temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZHEMV performs the matrix-vector operation
+
+ y := alpha*A*x + beta*y,
+
+ where alpha and beta are scalars, x and y are n element vectors and
+ A is an n by n hermitian matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of A is not referenced.
+ Note that the imaginary parts of the diagonal elements need
+ not be set and are assumed to be zero.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ BETA - COMPLEX*16 .
+ On entry, BETA specifies the scalar beta. When BETA is
+ supplied as zero then Y need not be set on input.
+ Unchanged on exit.
+
+ Y - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y. On exit, Y is overwritten by the updated
+ vector y.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --y;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*lda < max(1,*n)) {
+ info = 5;
+ } else if (*incx == 0) {
+ info = 7;
+ } else if (*incy == 0) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("ZHEMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == 1. &&
+ beta->i == 0.)) {
+ return 0;
+ }
+
+/* Set up the start points in X and Y. */
+
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+
+ First form y := beta*y.
+*/
+
+ if (beta->r != 1. || beta->i != 0.) {
+ if (*incy == 1) {
+ if (beta->r == 0. && beta->i == 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ y[i__2].r = 0., y[i__2].i = 0.;
+/* L10: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+/* L20: */
+ }
+ }
+ } else {
+ iy = ky;
+ if (beta->r == 0. && beta->i == 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ y[i__2].r = 0., y[i__2].i = 0.;
+ iy += *incy;
+/* L30: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = iy;
+ i__3 = iy;
+ z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
+ z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
+ .r;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ iy += *incy;
+/* L40: */
+ }
+ }
+ }
+ }
+ if (alpha->r == 0. && alpha->i == 0.) {
+ return 0;
+ }
+ if (lsame_(uplo, "U")) {
+
+/* Form y when A is stored in upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ temp2.r = 0., temp2.i = 0.;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+ y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+ z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+ z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+ temp2.r = z__1.r, temp2.i = z__1.i;
+/* L50: */
+ }
+ i__2 = j;
+ i__3 = j;
+ i__4 = j + j * a_dim1;
+ d__1 = a[i__4].r;
+ z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
+ z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
+ z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+/* L60: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ temp2.r = 0., temp2.i = 0.;
+ ix = kx;
+ iy = ky;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = iy;
+ i__4 = iy;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+ y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+ z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+ z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L70: */
+ }
+ i__2 = jy;
+ i__3 = jy;
+ i__4 = j + j * a_dim1;
+ d__1 = a[i__4].r;
+ z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
+ z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i;
+ z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form y when A is stored in lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ temp2.r = 0., temp2.i = 0.;
+ i__2 = j;
+ i__3 = j;
+ i__4 = j + j * a_dim1;
+ d__1 = a[i__4].r;
+ z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
+ z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+ y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+ z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+ z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+ temp2.r = z__1.r, temp2.i = z__1.i;
+/* L90: */
+ }
+ i__2 = j;
+ i__3 = j;
+ z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+/* L100: */
+ }
+ } else {
+ jx = kx;
+ jy = ky;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
+ alpha->r * x[i__2].i + alpha->i * x[i__2].r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ temp2.r = 0., temp2.i = 0.;
+ i__2 = jy;
+ i__3 = jy;
+ i__4 = j + j * a_dim1;
+ d__1 = a[i__4].r;
+ z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
+ z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ iy += *incy;
+ i__3 = iy;
+ i__4 = iy;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
+ z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
+ .r;
+ z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
+ y[i__3].r = z__1.r, y[i__3].i = z__1.i;
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i =
+ z__3.r * x[i__3].i + z__3.i * x[i__3].r;
+ z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+ temp2.r = z__1.r, temp2.i = z__1.i;
+/* L110: */
+ }
+ i__2 = jy;
+ i__3 = jy;
+ z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i =
+ alpha->r * temp2.i + alpha->i * temp2.r;
+ z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
+ y[i__2].r = z__1.r, y[i__2].i = z__1.i;
+ jx += *incx;
+ jy += *incy;
+/* L120: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZHEMV . */
+
+} /* zhemv_ */
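+
+/*
+   Usage sketch for zhemv_ (y := alpha*A*x + beta*y, A hermitian).  Only
+   the triangle selected by UPLO is read; here the upper triangle of
+   A = [2 1i; -1i 3] is supplied and the strictly lower entry is never
+   referenced (illustrative values only):
+
+       integer n = 2, inc = 1;
+       doublecomplex one = {1., 0.}, zero = {0., 0.};
+       doublecomplex a[4] = {{2.,0.}, {0.,0.}, {0.,1.}, {3.,0.}};
+       doublecomplex x[2] = {{1.,0.}, {1.,0.}};
+       doublecomplex y[2];
+       zhemv_("U", &n, &one, a, &n, x, &inc, &zero, y, &inc);
+
+   On return y holds {2+1i, 3-1i}.
+*/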
+
+/* Subroutine */ int zher2_(char *uplo, integer *n, doublecomplex *alpha,
+ doublecomplex *x, integer *incx, doublecomplex *y, integer *incy,
+ doublecomplex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+ doublereal d__1;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__, j, ix, iy, jx, jy, kx, ky, info;
+ static doublecomplex temp1, temp2;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZHER2 performs the hermitian rank 2 operation
+
+ A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+
+ where alpha is a scalar, x and y are n element vectors and A is an n
+ by n hermitian matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array A is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of A
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of A
+ is to be referenced.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x.
+ Unchanged on exit.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Y - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCY ) ).
+ Before entry, the incremented array Y must contain the n
+ element vector y.
+ Unchanged on exit.
+
+ INCY - INTEGER.
+ On entry, INCY specifies the increment for the elements of
+ Y. INCY must not be zero.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of A is not referenced. On exit, the
+ upper triangular part of the array A is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of A is not referenced. On exit, the
+ lower triangular part of the array A is overwritten by the
+ lower triangular part of the updated matrix.
+ Note that the imaginary parts of the diagonal elements need
+ not be set, they are assumed to be zero, and on exit they
+ are set to zero.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --x;
+ --y;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (*n < 0) {
+ info = 2;
+ } else if (*incx == 0) {
+ info = 5;
+ } else if (*incy == 0) {
+ info = 7;
+ } else if (*lda < max(1,*n)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("ZHER2 ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || alpha->r == 0. && alpha->i == 0.) {
+ return 0;
+ }
+
+/*
+ Set up the start points in X and Y if the increments are not both
+ unity.
+*/
+
+ if (*incx != 1 || *incy != 1) {
+ if (*incx > 0) {
+ kx = 1;
+ } else {
+ kx = 1 - (*n - 1) * *incx;
+ }
+ if (*incy > 0) {
+ ky = 1;
+ } else {
+ ky = 1 - (*n - 1) * *incy;
+ }
+ jx = kx;
+ jy = ky;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through the triangular part
+ of A.
+*/
+
+ if (lsame_(uplo, "U")) {
+
+/* Form A when A is stored in the upper triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ i__3 = j;
+ if (x[i__2].r != 0. || x[i__2].i != 0. || (y[i__3].r != 0. ||
+ y[i__3].i != 0.)) {
+ d_cnjg(&z__2, &y[j]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i =
+ alpha->r * z__2.i + alpha->i * z__2.r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ i__2 = j;
+ z__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ z__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ d_cnjg(&z__1, &z__2);
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ z__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ z__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ z__2.r = a[i__4].r + z__3.r, z__2.i = a[i__4].i +
+ z__3.i;
+ i__6 = i__;
+ z__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ z__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L10: */
+ }
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = j;
+ z__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ z__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = j;
+ z__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ z__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ d__1 = a[i__3].r + z__1.r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ i__3 = jy;
+ if (x[i__2].r != 0. || x[i__2].i != 0. || (y[i__3].r != 0. ||
+ y[i__3].i != 0.)) {
+ d_cnjg(&z__2, &y[jy]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i =
+ alpha->r * z__2.i + alpha->i * z__2.r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ i__2 = jx;
+ z__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ z__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ d_cnjg(&z__1, &z__2);
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ ix = kx;
+ iy = ky;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ z__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ z__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ z__2.r = a[i__4].r + z__3.r, z__2.i = a[i__4].i +
+ z__3.i;
+ i__6 = iy;
+ z__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ z__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ ix += *incx;
+ iy += *incy;
+/* L30: */
+ }
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = jx;
+ z__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ z__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = jy;
+ z__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ z__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ d__1 = a[i__3].r + z__1.r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ }
+ jx += *incx;
+ jy += *incy;
+/* L40: */
+ }
+ }
+ } else {
+
+/* Form A when A is stored in the lower triangle. */
+
+ if (*incx == 1 && *incy == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ i__3 = j;
+ if (x[i__2].r != 0. || x[i__2].i != 0. || (y[i__3].r != 0. ||
+ y[i__3].i != 0.)) {
+ d_cnjg(&z__2, &y[j]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i =
+ alpha->r * z__2.i + alpha->i * z__2.r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ i__2 = j;
+ z__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ z__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ d_cnjg(&z__1, &z__2);
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = j;
+ z__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ z__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = j;
+ z__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ z__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ d__1 = a[i__3].r + z__1.r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = i__;
+ z__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ z__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ z__2.r = a[i__4].r + z__3.r, z__2.i = a[i__4].i +
+ z__3.i;
+ i__6 = i__;
+ z__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ z__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L50: */
+ }
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ i__3 = jy;
+ if (x[i__2].r != 0. || x[i__2].i != 0. || (y[i__3].r != 0. ||
+ y[i__3].i != 0.)) {
+ d_cnjg(&z__2, &y[jy]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i =
+ alpha->r * z__2.i + alpha->i * z__2.r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ i__2 = jx;
+ z__2.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i,
+ z__2.i = alpha->r * x[i__2].i + alpha->i * x[i__2]
+ .r;
+ d_cnjg(&z__1, &z__2);
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ i__4 = jx;
+ z__2.r = x[i__4].r * temp1.r - x[i__4].i * temp1.i,
+ z__2.i = x[i__4].r * temp1.i + x[i__4].i *
+ temp1.r;
+ i__5 = jy;
+ z__3.r = y[i__5].r * temp2.r - y[i__5].i * temp2.i,
+ z__3.i = y[i__5].r * temp2.i + y[i__5].i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ d__1 = a[i__3].r + z__1.r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ ix = jx;
+ iy = jy;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ iy += *incy;
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ i__5 = ix;
+ z__3.r = x[i__5].r * temp1.r - x[i__5].i * temp1.i,
+ z__3.i = x[i__5].r * temp1.i + x[i__5].i *
+ temp1.r;
+ z__2.r = a[i__4].r + z__3.r, z__2.i = a[i__4].i +
+ z__3.i;
+ i__6 = iy;
+ z__4.r = y[i__6].r * temp2.r - y[i__6].i * temp2.i,
+ z__4.i = y[i__6].r * temp2.i + y[i__6].i *
+ temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L70: */
+ }
+ } else {
+ i__2 = j + j * a_dim1;
+ i__3 = j + j * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ }
+ jx += *incx;
+ jy += *incy;
+/* L80: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZHER2 . */
+
+} /* zher2_ */
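+
+/*
+   Illustrative sketch, not part of the original f2c translation and kept
+   inside "#if 0" so it is never compiled: a hypothetical call to zher2_
+   performing the rank-2 update A := alpha*x*conjg( y' ) +
+   conjg( alpha )*y*conjg( x' ) + A on a 2 by 2 hermitian matrix stored
+   column-major with only the upper triangle referenced.  All names and
+   values below are made up for the example.
+*/
+#if 0
+static void example_zher2(void)
+{
+    integer n = 2, inc = 1, lda = 2;
+    doublecomplex alpha = {1., 0.};
+    doublecomplex x[2] = {{1., 0.}, {0., 1.}};
+    doublecomplex y[2] = {{2., 0.}, {1., -1.}};
+    /* Upper triangle of A, column-major; A(2,1) is not referenced. */
+    doublecomplex a[4] = {{4., 0.}, {0., 0.}, {1., 1.}, {3., 0.}};
+
+    /* Every argument is passed by reference, as in the Fortran original. */
+    zher2_("U", &n, &alpha, x, &inc, y, &inc, a, &lda);
+}
+#endif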
+
+/* Subroutine */ int zher2k_(char *uplo, char *trans, integer *n, integer *k,
+ doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *
+ b, integer *ldb, doublereal *beta, doublecomplex *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5, i__6, i__7;
+ doublereal d__1;
+ doublecomplex z__1, z__2, z__3, z__4, z__5, z__6;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static doublecomplex temp1, temp2;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZHER2K performs one of the hermitian rank 2k operations
+
+ C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C,
+
+ or
+
+ C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C,
+
+ where alpha and beta are scalars with beta real, C is an n by n
+ hermitian matrix and A and B are n by k matrices in the first case
+ and k by n matrices in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) +
+ conjg( alpha )*B*conjg( A' ) +
+ beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*conjg( A' )*B +
+ conjg( alpha )*conjg( B' )*A +
+ beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrices A and B, and on entry with
+ TRANS = 'C' or 'c', K specifies the number of rows of the
+ matrices A and B. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array B must contain the matrix B, otherwise
+ the leading k by n part of the array B must contain the
+ matrix B.
+ Unchanged on exit.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDB must be at least max( 1, n ), otherwise LDB must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION .
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - COMPLEX*16 array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+ Note that the imaginary parts of the diagonal elements need
+ not be set, they are assumed to be zero, and on exit they
+ are set to zero.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1.
+ Ed Anderson, Cray Research Inc.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldb < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldc < max(1,*n)) {
+ info = 12;
+ }
+ if (info != 0) {
+ xerbla_("ZHER2K", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (alpha->r == 0. && alpha->i == 0. || *k == 0) && *beta ==
+ 1.) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0. && alpha->i == 0.) {
+ if (upper) {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L30: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/*
+ Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) +
+ C.
+*/
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L90: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L100: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ i__4 = j + l * b_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0. || (b[i__4].r !=
+ 0. || b[i__4].i != 0.)) {
+ d_cnjg(&z__2, &b[j + l * b_dim1]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i,
+ z__1.i = alpha->r * z__2.i + alpha->i *
+ z__2.r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ i__3 = j + l * a_dim1;
+ z__2.r = alpha->r * a[i__3].r - alpha->i * a[i__3].i,
+ z__2.i = alpha->r * a[i__3].i + alpha->i * a[
+ i__3].r;
+ d_cnjg(&z__1, &z__2);
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__3.r = a[i__6].r * temp1.r - a[i__6].i *
+ temp1.i, z__3.i = a[i__6].r * temp1.i + a[
+ i__6].i * temp1.r;
+ z__2.r = c__[i__5].r + z__3.r, z__2.i = c__[i__5]
+ .i + z__3.i;
+ i__7 = i__ + l * b_dim1;
+ z__4.r = b[i__7].r * temp2.r - b[i__7].i *
+ temp2.i, z__4.i = b[i__7].r * temp2.i + b[
+ i__7].i * temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i +
+ z__4.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L110: */
+ }
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = j + l * a_dim1;
+ z__2.r = a[i__5].r * temp1.r - a[i__5].i * temp1.i,
+ z__2.i = a[i__5].r * temp1.i + a[i__5].i *
+ temp1.r;
+ i__6 = j + l * b_dim1;
+ z__3.r = b[i__6].r * temp2.r - b[i__6].i * temp2.i,
+ z__3.i = b[i__6].r * temp2.i + b[i__6].i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ d__1 = c__[i__4].r + z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L140: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L150: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ i__4 = j + l * b_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0. || (b[i__4].r !=
+ 0. || b[i__4].i != 0.)) {
+ d_cnjg(&z__2, &b[j + l * b_dim1]);
+ z__1.r = alpha->r * z__2.r - alpha->i * z__2.i,
+ z__1.i = alpha->r * z__2.i + alpha->i *
+ z__2.r;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ i__3 = j + l * a_dim1;
+ z__2.r = alpha->r * a[i__3].r - alpha->i * a[i__3].i,
+ z__2.i = alpha->r * a[i__3].i + alpha->i * a[
+ i__3].r;
+ d_cnjg(&z__1, &z__2);
+ temp2.r = z__1.r, temp2.i = z__1.i;
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__3.r = a[i__6].r * temp1.r - a[i__6].i *
+ temp1.i, z__3.i = a[i__6].r * temp1.i + a[
+ i__6].i * temp1.r;
+ z__2.r = c__[i__5].r + z__3.r, z__2.i = c__[i__5]
+ .i + z__3.i;
+ i__7 = i__ + l * b_dim1;
+ z__4.r = b[i__7].r * temp2.r - b[i__7].i *
+ temp2.i, z__4.i = b[i__7].r * temp2.i + b[
+ i__7].i * temp2.r;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i +
+ z__4.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L160: */
+ }
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = j + l * a_dim1;
+ z__2.r = a[i__5].r * temp1.r - a[i__5].i * temp1.i,
+ z__2.i = a[i__5].r * temp1.i + a[i__5].i *
+ temp1.r;
+ i__6 = j + l * b_dim1;
+ z__3.r = b[i__6].r * temp2.r - b[i__6].i * temp2.i,
+ z__3.i = b[i__6].r * temp2.i + b[i__6].i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ d__1 = c__[i__4].r + z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/*
+ Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A +
+ C.
+*/
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp1.r = 0., temp1.i = 0.;
+ temp2.r = 0., temp2.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * b_dim1;
+ z__2.r = z__3.r * b[i__4].r - z__3.i * b[i__4].i,
+ z__2.i = z__3.r * b[i__4].i + z__3.i * b[i__4]
+ .r;
+ z__1.r = temp1.r + z__2.r, z__1.i = temp1.i + z__2.i;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ d_cnjg(&z__3, &b[l + i__ * b_dim1]);
+ i__4 = l + j * a_dim1;
+ z__2.r = z__3.r * a[i__4].r - z__3.i * a[i__4].i,
+ z__2.i = z__3.r * a[i__4].i + z__3.i * a[i__4]
+ .r;
+ z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+ temp2.r = z__1.r, temp2.i = z__1.i;
+/* L190: */
+ }
+ if (i__ == j) {
+ if (*beta == 0.) {
+ i__3 = j + j * c_dim1;
+ z__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ d_cnjg(&z__4, alpha);
+ z__3.r = z__4.r * temp2.r - z__4.i * temp2.i,
+ z__3.i = z__4.r * temp2.i + z__4.i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ d__1 = z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ } else {
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ z__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ d_cnjg(&z__4, alpha);
+ z__3.r = z__4.r * temp2.r - z__4.i * temp2.i,
+ z__3.i = z__4.r * temp2.i + z__4.i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ d__1 = *beta * c__[i__4].r + z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ }
+ } else {
+ if (*beta == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ d_cnjg(&z__4, alpha);
+ z__3.r = z__4.r * temp2.r - z__4.i * temp2.i,
+ z__3.i = z__4.r * temp2.i + z__4.i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = *beta * c__[i__4].r, z__3.i = *beta *
+ c__[i__4].i;
+ z__4.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__4.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ z__2.r = z__3.r + z__4.r, z__2.i = z__3.i +
+ z__4.i;
+ d_cnjg(&z__6, alpha);
+ z__5.r = z__6.r * temp2.r - z__6.i * temp2.i,
+ z__5.i = z__6.r * temp2.i + z__6.i *
+ temp2.r;
+ z__1.r = z__2.r + z__5.r, z__1.i = z__2.i +
+ z__5.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+ }
+/* L200: */
+ }
+/* L210: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ temp1.r = 0., temp1.i = 0.;
+ temp2.r = 0., temp2.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * b_dim1;
+ z__2.r = z__3.r * b[i__4].r - z__3.i * b[i__4].i,
+ z__2.i = z__3.r * b[i__4].i + z__3.i * b[i__4]
+ .r;
+ z__1.r = temp1.r + z__2.r, z__1.i = temp1.i + z__2.i;
+ temp1.r = z__1.r, temp1.i = z__1.i;
+ d_cnjg(&z__3, &b[l + i__ * b_dim1]);
+ i__4 = l + j * a_dim1;
+ z__2.r = z__3.r * a[i__4].r - z__3.i * a[i__4].i,
+ z__2.i = z__3.r * a[i__4].i + z__3.i * a[i__4]
+ .r;
+ z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
+ temp2.r = z__1.r, temp2.i = z__1.i;
+/* L220: */
+ }
+ if (i__ == j) {
+ if (*beta == 0.) {
+ i__3 = j + j * c_dim1;
+ z__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ d_cnjg(&z__4, alpha);
+ z__3.r = z__4.r * temp2.r - z__4.i * temp2.i,
+ z__3.i = z__4.r * temp2.i + z__4.i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ d__1 = z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ } else {
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ z__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ d_cnjg(&z__4, alpha);
+ z__3.r = z__4.r * temp2.r - z__4.i * temp2.i,
+ z__3.i = z__4.r * temp2.i + z__4.i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ d__1 = *beta * c__[i__4].r + z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ }
+ } else {
+ if (*beta == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__2.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ d_cnjg(&z__4, alpha);
+ z__3.r = z__4.r * temp2.r - z__4.i * temp2.i,
+ z__3.i = z__4.r * temp2.i + z__4.i *
+ temp2.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = *beta * c__[i__4].r, z__3.i = *beta *
+ c__[i__4].i;
+ z__4.r = alpha->r * temp1.r - alpha->i * temp1.i,
+ z__4.i = alpha->r * temp1.i + alpha->i *
+ temp1.r;
+ z__2.r = z__3.r + z__4.r, z__2.i = z__3.i +
+ z__4.i;
+ d_cnjg(&z__6, alpha);
+ z__5.r = z__6.r * temp2.r - z__6.i * temp2.i,
+ z__5.i = z__6.r * temp2.i + z__6.i *
+ temp2.r;
+ z__1.r = z__2.r + z__5.r, z__1.i = z__2.i +
+ z__5.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZHER2K. */
+
+} /* zher2k_ */
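+
+/*
+   Illustrative sketch, not part of the original translation and disabled
+   with "#if 0": a hypothetical zher2k_ call computing
+   C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C with
+   TRANS = 'N', so A and B are n by k and BETA is a real scalar.  The
+   array contents are placeholders.
+*/
+#if 0
+static void example_zher2k(void)
+{
+    integer n = 2, k = 1, lda = 2, ldb = 2, ldc = 2;
+    doublecomplex alpha = {1., 0.};
+    doublereal beta = 1.;                      /* BETA is real for ZHER2K */
+    doublecomplex a[2] = {{1., 0.}, {0., 2.}}; /* n by k, column-major    */
+    doublecomplex b[2] = {{0., 1.}, {1., 0.}};
+    /* Upper triangle of C, column-major; C(2,1) is not referenced. */
+    doublecomplex c[4] = {{5., 0.}, {0., 0.}, {2., -1.}, {6., 0.}};
+
+    zher2k_("U", "N", &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+#endif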
+
+/* Subroutine */ int zherk_(char *uplo, char *trans, integer *n, integer *k,
+ doublereal *alpha, doublecomplex *a, integer *lda, doublereal *beta,
+ doublecomplex *c__, integer *ldc)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5,
+ i__6;
+ doublereal d__1;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, l, info;
+ static doublecomplex temp;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static doublereal rtemp;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ Purpose
+ =======
+
+ ZHERK performs one of the hermitian rank k operations
+
+ C := alpha*A*conjg( A' ) + beta*C,
+
+ or
+
+ C := alpha*conjg( A' )*A + beta*C,
+
+ where alpha and beta are real scalars, C is an n by n hermitian
+ matrix and A is an n by k matrix in the first case and a k by n
+ matrix in the second case.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the upper or lower
+ triangular part of the array C is to be referenced as
+ follows:
+
+ UPLO = 'U' or 'u' Only the upper triangular part of C
+ is to be referenced.
+
+ UPLO = 'L' or 'l' Only the lower triangular part of C
+ is to be referenced.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C.
+
+ TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix C. N must be
+ at least zero.
+ Unchanged on exit.
+
+ K - INTEGER.
+ On entry with TRANS = 'N' or 'n', K specifies the number
+ of columns of the matrix A, and on entry with
+ TRANS = 'C' or 'c', K specifies the number of rows of the
+ matrix A. K must be at least zero.
+ Unchanged on exit.
+
+ ALPHA - DOUBLE PRECISION .
+ On entry, ALPHA specifies the scalar alpha.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is
+ k when TRANS = 'N' or 'n', and is n otherwise.
+ Before entry with TRANS = 'N' or 'n', the leading n by k
+ part of the array A must contain the matrix A, otherwise
+ the leading k by n part of the array A must contain the
+ matrix A.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When TRANS = 'N' or 'n'
+ then LDA must be at least max( 1, n ), otherwise LDA must
+ be at least max( 1, k ).
+ Unchanged on exit.
+
+ BETA - DOUBLE PRECISION.
+ On entry, BETA specifies the scalar beta.
+ Unchanged on exit.
+
+ C - COMPLEX*16 array of DIMENSION ( LDC, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array C must contain the upper
+ triangular part of the hermitian matrix and the strictly
+ lower triangular part of C is not referenced. On exit, the
+ upper triangular part of the array C is overwritten by the
+ upper triangular part of the updated matrix.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array C must contain the lower
+ triangular part of the hermitian matrix and the strictly
+ upper triangular part of C is not referenced. On exit, the
+ lower triangular part of the array C is overwritten by the
+ lower triangular part of the updated matrix.
+ Note that the imaginary parts of the diagonal elements need
+ not be set, they are assumed to be zero, and on exit they
+ are set to zero.
+
+ LDC - INTEGER.
+ On entry, LDC specifies the first dimension of C as declared
+ in the calling (sub) program. LDC must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1.
+ Ed Anderson, Cray Research Inc.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+
+ /* Function Body */
+ if (lsame_(trans, "N")) {
+ nrowa = *n;
+ } else {
+ nrowa = *k;
+ }
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! upper && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ info = 2;
+ } else if (*n < 0) {
+ info = 3;
+ } else if (*k < 0) {
+ info = 4;
+ } else if (*lda < max(1,nrowa)) {
+ info = 7;
+ } else if (*ldc < max(1,*n)) {
+ info = 10;
+ }
+ if (info != 0) {
+ xerbla_("ZHERK ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (*alpha == 0.) {
+ if (upper) {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L30: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+/* L40: */
+ }
+ }
+ } else {
+ if (*beta == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lsame_(trans, "N")) {
+
+/* Form C := alpha*A*conjg( A' ) + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L90: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L100: */
+ }
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0.) {
+ d_cnjg(&z__2, &a[j + l * a_dim1]);
+ z__1.r = *alpha * z__2.r, z__1.i = *alpha * z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ z__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ z__1.r = c__[i__5].r + z__2.r, z__1.i = c__[i__5]
+ .i + z__2.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L110: */
+ }
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = i__ + l * a_dim1;
+ z__1.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__1.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ d__1 = c__[i__4].r + z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ }
+/* L120: */
+ }
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*beta == 0.) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L140: */
+ }
+ } else if (*beta != 1.) {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ z__1.r = *beta * c__[i__4].r, z__1.i = *beta * c__[
+ i__4].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L150: */
+ }
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ }
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = j + l * a_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0.) {
+ d_cnjg(&z__2, &a[j + l * a_dim1]);
+ z__1.r = *alpha * z__2.r, z__1.i = *alpha * z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = j + j * c_dim1;
+ i__4 = j + j * c_dim1;
+ i__5 = j + l * a_dim1;
+ z__1.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__1.i = temp.r * a[i__5].i + temp.i * a[i__5]
+ .r;
+ d__1 = c__[i__4].r + z__1.r;
+ c__[i__3].r = d__1, c__[i__3].i = 0.;
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * c_dim1;
+ i__6 = i__ + l * a_dim1;
+ z__2.r = temp.r * a[i__6].r - temp.i * a[i__6].i,
+ z__2.i = temp.r * a[i__6].i + temp.i * a[
+ i__6].r;
+ z__1.r = c__[i__5].r + z__2.r, z__1.i = c__[i__5]
+ .i + z__2.i;
+ c__[i__4].r = z__1.r, c__[i__4].i = z__1.i;
+/* L160: */
+ }
+ }
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ } else {
+
+/* Form C := alpha*conjg( A' )*A + beta*C. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * a_dim1;
+ z__2.r = z__3.r * a[i__4].r - z__3.i * a[i__4].i,
+ z__2.i = z__3.r * a[i__4].i + z__3.i * a[i__4]
+ .r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L190: */
+ }
+ if (*beta == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = *alpha * temp.r, z__1.i = *alpha * temp.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = *alpha * temp.r, z__2.i = *alpha * temp.i;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = *beta * c__[i__4].r, z__3.i = *beta * c__[
+ i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L200: */
+ }
+ rtemp = 0.;
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ d_cnjg(&z__3, &a[l + j * a_dim1]);
+ i__3 = l + j * a_dim1;
+ z__2.r = z__3.r * a[i__3].r - z__3.i * a[i__3].i, z__2.i =
+ z__3.r * a[i__3].i + z__3.i * a[i__3].r;
+ z__1.r = rtemp + z__2.r, z__1.i = z__2.i;
+ rtemp = z__1.r;
+/* L210: */
+ }
+ if (*beta == 0.) {
+ i__2 = j + j * c_dim1;
+ d__1 = *alpha * rtemp;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *alpha * rtemp + *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ }
+/* L220: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ rtemp = 0.;
+ i__2 = *k;
+ for (l = 1; l <= i__2; ++l) {
+ d_cnjg(&z__3, &a[l + j * a_dim1]);
+ i__3 = l + j * a_dim1;
+ z__2.r = z__3.r * a[i__3].r - z__3.i * a[i__3].i, z__2.i =
+ z__3.r * a[i__3].i + z__3.i * a[i__3].r;
+ z__1.r = rtemp + z__2.r, z__1.i = z__2.i;
+ rtemp = z__1.r;
+/* L230: */
+ }
+ if (*beta == 0.) {
+ i__2 = j + j * c_dim1;
+ d__1 = *alpha * rtemp;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ } else {
+ i__2 = j + j * c_dim1;
+ i__3 = j + j * c_dim1;
+ d__1 = *alpha * rtemp + *beta * c__[i__3].r;
+ c__[i__2].r = d__1, c__[i__2].i = 0.;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ temp.r = 0., temp.i = 0.;
+ i__3 = *k;
+ for (l = 1; l <= i__3; ++l) {
+ d_cnjg(&z__3, &a[l + i__ * a_dim1]);
+ i__4 = l + j * a_dim1;
+ z__2.r = z__3.r * a[i__4].r - z__3.i * a[i__4].i,
+ z__2.i = z__3.r * a[i__4].i + z__3.i * a[i__4]
+ .r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L240: */
+ }
+ if (*beta == 0.) {
+ i__3 = i__ + j * c_dim1;
+ z__1.r = *alpha * temp.r, z__1.i = *alpha * temp.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ } else {
+ i__3 = i__ + j * c_dim1;
+ z__2.r = *alpha * temp.r, z__2.i = *alpha * temp.i;
+ i__4 = i__ + j * c_dim1;
+ z__3.r = *beta * c__[i__4].r, z__3.i = *beta * c__[
+ i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+ }
+/* L250: */
+ }
+/* L260: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZHERK . */
+
+} /* zherk_ */
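+
+/*
+   Illustrative sketch, not part of the original translation and disabled
+   with "#if 0": a hypothetical zherk_ call computing
+   C := alpha*conjg( A' )*A + beta*C with TRANS = 'C', so A is k by n and
+   both ALPHA and BETA are real scalars.  Values are placeholders; with
+   BETA = 0 the routine overwrites C, so C need not be set on entry.
+*/
+#if 0
+static void example_zherk(void)
+{
+    integer n = 2, k = 3, lda = 3, ldc = 2;
+    doublereal alpha = 1., beta = 0.;
+    doublecomplex a[6];                 /* k by n, column-major            */
+    doublecomplex c[4];                 /* n by n, upper triangle updated  */
+    int i;
+
+    for (i = 0; i < 6; ++i) { a[i].r = i + 1.; a[i].i = -(i + 1.); }
+
+    zherk_("U", "C", &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+}
+#endif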
+
+/* Subroutine */ int zscal_(integer *n, doublecomplex *za, doublecomplex *zx,
+ integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, ix;
+
+
+/*
+ Purpose
+ =======
+
+ ZSCAL scales a vector by a constant.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 3/93 to return if incx .le. 0.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zx;
+
+ /* Function Body */
+ if (*n <= 0 || *incx <= 0) {
+ return 0;
+ }
+ if (*incx == 1) {
+ goto L20;
+ }
+
+/* code for increment not equal to 1 */
+
+ ix = 1;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ i__3 = ix;
+ z__1.r = za->r * zx[i__3].r - za->i * zx[i__3].i, z__1.i = za->r * zx[
+ i__3].i + za->i * zx[i__3].r;
+ zx[i__2].r = z__1.r, zx[i__2].i = z__1.i;
+ ix += *incx;
+/* L10: */
+ }
+ return 0;
+
+/* code for increment equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ i__3 = i__;
+ z__1.r = za->r * zx[i__3].r - za->i * zx[i__3].i, z__1.i = za->r * zx[
+ i__3].i + za->i * zx[i__3].r;
+ zx[i__2].r = z__1.r, zx[i__2].i = z__1.i;
+/* L30: */
+ }
+ return 0;
+} /* zscal_ */
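+
+/*
+   Illustrative sketch, not part of the original translation and disabled
+   with "#if 0": a hypothetical zscal_ call multiplying every element of a
+   length-3 vector by the complex constant 2+i through the unit-stride
+   path (INCX = 1).
+*/
+#if 0
+static void example_zscal(void)
+{
+    integer n = 3, incx = 1;
+    doublecomplex za = {2., 1.};
+    doublecomplex zx[3] = {{1., 0.}, {0., 1.}, {-1., 2.}};
+
+    /* Afterwards zx[0] = 2+i and zx[1] = (2+i)*i = -1+2i. */
+    zscal_(&n, &za, zx, &incx);
+}
+#endif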
+
+/* Subroutine */ int zswap_(integer *n, doublecomplex *zx, integer *incx,
+ doublecomplex *zy, integer *incy)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static doublecomplex ztemp;
+
+
+/*
+ Purpose
+ =======
+
+ ZSWAP interchanges two vectors.
+
+ Further Details
+ ===============
+
+ jack dongarra, 3/11/78.
+ modified 12/3/93, array(1) declarations changed to array(*)
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ --zy;
+ --zx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/*
+ code for unequal increments or equal increments not equal
+ to 1
+*/
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ ztemp.r = zx[i__2].r, ztemp.i = zx[i__2].i;
+ i__2 = ix;
+ i__3 = iy;
+ zx[i__2].r = zy[i__3].r, zx[i__2].i = zy[i__3].i;
+ i__2 = iy;
+ zy[i__2].r = ztemp.r, zy[i__2].i = ztemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* code for both increments equal to 1 */
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ ztemp.r = zx[i__2].r, ztemp.i = zx[i__2].i;
+ i__2 = i__;
+ i__3 = i__;
+ zx[i__2].r = zy[i__3].r, zx[i__2].i = zy[i__3].i;
+ i__2 = i__;
+ zy[i__2].r = ztemp.r, zy[i__2].i = ztemp.i;
+/* L30: */
+ }
+ return 0;
+} /* zswap_ */
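+
+/*
+   Illustrative sketch, not part of the original translation and disabled
+   with "#if 0": a hypothetical zswap_ call exchanging two length-2
+   vectors, one of them traversed backwards via a negative increment.
+*/
+#if 0
+static void example_zswap(void)
+{
+    integer n = 2, incx = 1, incy = -1;
+    doublecomplex zx[2] = {{1., 1.}, {2., 2.}};
+    doublecomplex zy[2] = {{3., 3.}, {4., 4.}};
+
+    /* Because INCY is negative, zx ends up holding the old zy elements in
+       reverse order, and zy the old zx elements in reverse order. */
+    zswap_(&n, zx, &incx, zy, &incy);
+}
+#endif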
+
+/* Subroutine */ int ztrmm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, doublecomplex *alpha, doublecomplex *a,
+ integer *lda, doublecomplex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5,
+ i__6;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static doublecomplex temp;
+ static logical lside;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ ZTRMM performs one of the matrix-matrix operations
+
+ B := alpha*op( A )*B, or B := alpha*B*op( A )
+
+ where alpha is a scalar, B is an m by n matrix, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ).
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) multiplies B from
+ the left or right as follows:
+
+ SIDE = 'L' or 'l' B := alpha*op( A )*B.
+
+ SIDE = 'R' or 'r' B := alpha*B*op( A ).
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = conjg( A' ).
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - COMPLEX*16 array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the matrix B, and on exit is overwritten by the
+ transformed matrix.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ noconj = lsame_(transa, "T");
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("ZTRMM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0. && alpha->i == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ b[i__3].r = 0., b[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*A*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + j * b_dim1;
+ if (b[i__3].r != 0. || b[i__3].i != 0.) {
+ i__3 = k + j * b_dim1;
+ z__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3]
+ .i, z__1.i = alpha->r * b[i__3].i +
+ alpha->i * b[i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = k - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * a_dim1;
+ z__2.r = temp.r * a[i__6].r - temp.i * a[i__6]
+ .i, z__2.i = temp.r * a[i__6].i +
+ temp.i * a[i__6].r;
+ z__1.r = b[i__5].r + z__2.r, z__1.i = b[i__5]
+ .i + z__2.i;
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L30: */
+ }
+ if (nounit) {
+ i__3 = k + k * a_dim1;
+ z__1.r = temp.r * a[i__3].r - temp.i * a[i__3]
+ .i, z__1.i = temp.r * a[i__3].i +
+ temp.i * a[i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__3 = k + j * b_dim1;
+ b[i__3].r = temp.r, b[i__3].i = temp.i;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (k = *m; k >= 1; --k) {
+ i__2 = k + j * b_dim1;
+ if (b[i__2].r != 0. || b[i__2].i != 0.) {
+ i__2 = k + j * b_dim1;
+ z__1.r = alpha->r * b[i__2].r - alpha->i * b[i__2]
+ .i, z__1.i = alpha->r * b[i__2].i +
+ alpha->i * b[i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = k + j * b_dim1;
+ b[i__2].r = temp.r, b[i__2].i = temp.i;
+ if (nounit) {
+ i__2 = k + j * b_dim1;
+ i__3 = k + j * b_dim1;
+ i__4 = k + k * a_dim1;
+ z__1.r = b[i__3].r * a[i__4].r - b[i__3].i *
+ a[i__4].i, z__1.i = b[i__3].r * a[
+ i__4].i + b[i__3].i * a[i__4].r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+ }
+ i__2 = *m;
+ for (i__ = k + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5]
+ .i, z__2.i = temp.r * a[i__5].i +
+ temp.i * a[i__5].r;
+ z__1.r = b[i__4].r + z__2.r, z__1.i = b[i__4]
+ .i + z__2.i;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L60: */
+ }
+ }
+/* L70: */
+ }
+/* L80: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*A'*B or B := alpha*conjg( A' )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ i__2 = i__ + j * b_dim1;
+ temp.r = b[i__2].r, temp.i = b[i__2].i;
+ if (noconj) {
+ if (nounit) {
+ i__2 = i__ + i__ * a_dim1;
+ z__1.r = temp.r * a[i__2].r - temp.i * a[i__2]
+ .i, z__1.i = temp.r * a[i__2].i +
+ temp.i * a[i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = i__ - 1;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + i__ * a_dim1;
+ i__4 = k + j * b_dim1;
+ z__2.r = a[i__3].r * b[i__4].r - a[i__3].i *
+ b[i__4].i, z__2.i = a[i__3].r * b[
+ i__4].i + a[i__3].i * b[i__4].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L90: */
+ }
+ } else {
+ if (nounit) {
+ d_cnjg(&z__2, &a[i__ + i__ * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = i__ - 1;
+ for (k = 1; k <= i__2; ++k) {
+ d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+ i__3 = k + j * b_dim1;
+ z__2.r = z__3.r * b[i__3].r - z__3.i * b[i__3]
+ .i, z__2.i = z__3.r * b[i__3].i +
+ z__3.i * b[i__3].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+ }
+ }
+ i__2 = i__ + j * b_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L110: */
+ }
+/* L120: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ temp.r = b[i__3].r, temp.i = b[i__3].i;
+ if (noconj) {
+ if (nounit) {
+ i__3 = i__ + i__ * a_dim1;
+ z__1.r = temp.r * a[i__3].r - temp.i * a[i__3]
+ .i, z__1.i = temp.r * a[i__3].i +
+ temp.i * a[i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__3 = *m;
+ for (k = i__ + 1; k <= i__3; ++k) {
+ i__4 = k + i__ * a_dim1;
+ i__5 = k + j * b_dim1;
+ z__2.r = a[i__4].r * b[i__5].r - a[i__4].i *
+ b[i__5].i, z__2.i = a[i__4].r * b[
+ i__5].i + a[i__4].i * b[i__5].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L130: */
+ }
+ } else {
+ if (nounit) {
+ d_cnjg(&z__2, &a[i__ + i__ * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__3 = *m;
+ for (k = i__ + 1; k <= i__3; ++k) {
+ d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+ i__4 = k + j * b_dim1;
+ z__2.r = z__3.r * b[i__4].r - z__3.i * b[i__4]
+ .i, z__2.i = z__3.r * b[i__4].i +
+ z__3.i * b[i__4].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L140: */
+ }
+ }
+ i__3 = i__ + j * b_dim1;
+ z__1.r = alpha->r * temp.r - alpha->i * temp.i,
+ z__1.i = alpha->r * temp.i + alpha->i *
+ temp.r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L150: */
+ }
+/* L160: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*A. */
+
+ if (upper) {
+ for (j = *n; j >= 1; --j) {
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ z__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ z__1.i = temp.r * a[i__1].i + temp.i * a[i__1]
+ .r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * b_dim1;
+ i__3 = i__ + j * b_dim1;
+ z__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ z__1.i = temp.r * b[i__3].i + temp.i * b[i__3]
+ .r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L170: */
+ }
+ i__1 = j - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k + j * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+ i__2 = k + j * a_dim1;
+ z__1.r = alpha->r * a[i__2].r - alpha->i * a[i__2]
+ .i, z__1.i = alpha->r * a[i__2].i +
+ alpha->i * a[i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * b_dim1;
+ z__2.r = temp.r * b[i__5].r - temp.i * b[i__5]
+ .i, z__2.i = temp.r * b[i__5].i +
+ temp.i * b[i__5].r;
+ z__1.r = b[i__4].r + z__2.r, z__1.i = b[i__4]
+ .i + z__2.i;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L180: */
+ }
+ }
+/* L190: */
+ }
+/* L200: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ z__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ z__1.i = temp.r * a[i__2].i + temp.i * a[i__2]
+ .r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ z__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ z__1.i = temp.r * b[i__4].i + temp.i * b[i__4]
+ .r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L210: */
+ }
+ i__2 = *n;
+ for (k = j + 1; k <= i__2; ++k) {
+ i__3 = k + j * a_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0.) {
+ i__3 = k + j * a_dim1;
+ z__1.r = alpha->r * a[i__3].r - alpha->i * a[i__3]
+ .i, z__1.i = alpha->r * a[i__3].i +
+ alpha->i * a[i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * b_dim1;
+ z__2.r = temp.r * b[i__6].r - temp.i * b[i__6]
+ .i, z__2.i = temp.r * b[i__6].i +
+ temp.i * b[i__6].r;
+ z__1.r = b[i__5].r + z__2.r, z__1.i = b[i__5]
+ .i + z__2.i;
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L220: */
+ }
+ }
+/* L230: */
+ }
+/* L240: */
+ }
+ }
+ } else {
+
+/* Form B := alpha*B*A' or B := alpha*B*conjg( A' ). */
+
+ if (upper) {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k - 1;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = j + k * a_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0.) {
+ if (noconj) {
+ i__3 = j + k * a_dim1;
+ z__1.r = alpha->r * a[i__3].r - alpha->i * a[
+ i__3].i, z__1.i = alpha->r * a[i__3]
+ .i + alpha->i * a[i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ } else {
+ d_cnjg(&z__2, &a[j + k * a_dim1]);
+ z__1.r = alpha->r * z__2.r - alpha->i *
+ z__2.i, z__1.i = alpha->r * z__2.i +
+ alpha->i * z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * b_dim1;
+ z__2.r = temp.r * b[i__6].r - temp.i * b[i__6]
+ .i, z__2.i = temp.r * b[i__6].i +
+ temp.i * b[i__6].r;
+ z__1.r = b[i__5].r + z__2.r, z__1.i = b[i__5]
+ .i + z__2.i;
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ if (noconj) {
+ i__2 = k + k * a_dim1;
+ z__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ z__1.i = temp.r * a[i__2].i + temp.i * a[
+ i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ } else {
+ d_cnjg(&z__2, &a[k + k * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ if (temp.r != 1. || temp.i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + k * b_dim1;
+ i__4 = i__ + k * b_dim1;
+ z__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ z__1.i = temp.r * b[i__4].i + temp.i * b[
+ i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L270: */
+ }
+ }
+/* L280: */
+ }
+ } else {
+ for (k = *n; k >= 1; --k) {
+ i__1 = *n;
+ for (j = k + 1; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+ if (noconj) {
+ i__2 = j + k * a_dim1;
+ z__1.r = alpha->r * a[i__2].r - alpha->i * a[
+ i__2].i, z__1.i = alpha->r * a[i__2]
+ .i + alpha->i * a[i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ } else {
+ d_cnjg(&z__2, &a[j + k * a_dim1]);
+ z__1.r = alpha->r * z__2.r - alpha->i *
+ z__2.i, z__1.i = alpha->r * z__2.i +
+ alpha->i * z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * b_dim1;
+ z__2.r = temp.r * b[i__5].r - temp.i * b[i__5]
+ .i, z__2.i = temp.r * b[i__5].i +
+ temp.i * b[i__5].r;
+ z__1.r = b[i__4].r + z__2.r, z__1.i = b[i__4]
+ .i + z__2.i;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L290: */
+ }
+ }
+/* L300: */
+ }
+ temp.r = alpha->r, temp.i = alpha->i;
+ if (nounit) {
+ if (noconj) {
+ i__1 = k + k * a_dim1;
+ z__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ z__1.i = temp.r * a[i__1].i + temp.i * a[
+ i__1].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ } else {
+ d_cnjg(&z__2, &a[k + k * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ if (temp.r != 1. || temp.i != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + k * b_dim1;
+ i__3 = i__ + k * b_dim1;
+ z__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ z__1.i = temp.r * b[i__3].i + temp.i * b[
+ i__3].r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L310: */
+ }
+ }
+/* L320: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZTRMM . */
+
+} /* ztrmm_ */
+
+/* Subroutine */ int ztrmv_(char *uplo, char *trans, char *diag, integer *n,
+ doublecomplex *a, integer *lda, doublecomplex *x, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, ix, jx, kx, info;
+ static doublecomplex temp;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ ZTRMV performs one of the matrix-vector operations
+
+ x := A*x, or x := A'*x, or x := conjg( A' )*x,
+
+ where x is an n element vector and A is an n by n unit, or non-unit,
+ upper or lower triangular matrix.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the operation to be performed as
+ follows:
+
+ TRANS = 'N' or 'n' x := A*x.
+
+ TRANS = 'T' or 't' x := A'*x.
+
+ TRANS = 'C' or 'c' x := conjg( A' )*x.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit
+ triangular as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element vector x. On exit, X is overwritten with the
+ tranformed vector x.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*lda < max(1,*n)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ }
+ if (info != 0) {
+ xerbla_("ZTRMV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ noconj = lsame_(trans, "T");
+ nounit = lsame_(diag, "N");
+
+/*
+ Set up the start point in X if the increment is not unity. This
+ will be ( N - 1 )*INCX too small for descending loops.
+*/
+
+ if (*incx <= 0) {
+ kx = 1 - (*n - 1) * *incx;
+ } else if (*incx != 1) {
+ kx = 1;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (lsame_(trans, "N")) {
+
+/* Form x := A*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ if (x[i__2].r != 0. || x[i__2].i != 0.) {
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ z__1.r = x[i__4].r + z__2.r, z__1.i = x[i__4].i +
+ z__2.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+/* L10: */
+ }
+ if (nounit) {
+ i__2 = j;
+ i__3 = j;
+ i__4 = j + j * a_dim1;
+ z__1.r = x[i__3].r * a[i__4].r - x[i__3].i * a[
+ i__4].i, z__1.i = x[i__3].r * a[i__4].i +
+ x[i__3].i * a[i__4].r;
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+ }
+ }
+/* L20: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0. || x[i__2].i != 0.) {
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ ix = kx;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = ix;
+ i__4 = ix;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ z__1.r = x[i__4].r + z__2.r, z__1.i = x[i__4].i +
+ z__2.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ ix += *incx;
+/* L30: */
+ }
+ if (nounit) {
+ i__2 = jx;
+ i__3 = jx;
+ i__4 = j + j * a_dim1;
+ z__1.r = x[i__3].r * a[i__4].r - x[i__3].i * a[
+ i__4].i, z__1.i = x[i__3].r * a[i__4].i +
+ x[i__3].i * a[i__4].r;
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+ }
+ }
+ jx += *incx;
+/* L40: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ if (x[i__1].r != 0. || x[i__1].i != 0.) {
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = i__;
+ i__3 = i__;
+ i__4 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
+ z__2.i = temp.r * a[i__4].i + temp.i * a[
+ i__4].r;
+ z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i +
+ z__2.i;
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+/* L50: */
+ }
+ if (nounit) {
+ i__1 = j;
+ i__2 = j;
+ i__3 = j + j * a_dim1;
+ z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
+ i__3].i, z__1.i = x[i__2].r * a[i__3].i +
+ x[i__2].i * a[i__3].r;
+ x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+ }
+ }
+/* L60: */
+ }
+ } else {
+ kx += (*n - 1) * *incx;
+ jx = kx;
+ for (j = *n; j >= 1; --j) {
+ i__1 = jx;
+ if (x[i__1].r != 0. || x[i__1].i != 0.) {
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ ix = kx;
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = ix;
+ i__3 = ix;
+ i__4 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
+ z__2.i = temp.r * a[i__4].i + temp.i * a[
+ i__4].r;
+ z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i +
+ z__2.i;
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+ ix -= *incx;
+/* L70: */
+ }
+ if (nounit) {
+ i__1 = jx;
+ i__2 = jx;
+ i__3 = j + j * a_dim1;
+ z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
+ i__3].i, z__1.i = x[i__2].r * a[i__3].i +
+ x[i__2].i * a[i__3].r;
+ x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+ }
+ }
+ jx -= *incx;
+/* L80: */
+ }
+ }
+ }
+ } else {
+
+/* Form x := A'*x or x := conjg( A' )*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ if (noconj) {
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ z__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ z__1.i = temp.r * a[i__1].i + temp.i * a[
+ i__1].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ i__1 = i__ + j * a_dim1;
+ i__2 = i__;
+ z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+ i__2].i, z__2.i = a[i__1].r * x[i__2].i +
+ a[i__1].i * x[i__2].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L90: */
+ }
+ } else {
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__1 = i__;
+ z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i,
+ z__2.i = z__3.r * x[i__1].i + z__3.i * x[
+ i__1].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+ }
+ }
+ i__1 = j;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+/* L110: */
+ }
+ } else {
+ jx = kx + (*n - 1) * *incx;
+ for (j = *n; j >= 1; --j) {
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ ix = jx;
+ if (noconj) {
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ z__1.r = temp.r * a[i__1].r - temp.i * a[i__1].i,
+ z__1.i = temp.r * a[i__1].i + temp.i * a[
+ i__1].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ i__1 = i__ + j * a_dim1;
+ i__2 = ix;
+ z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
+ i__2].i, z__2.i = a[i__1].r * x[i__2].i +
+ a[i__1].i * x[i__2].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L120: */
+ }
+ } else {
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__1 = ix;
+ z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i,
+ z__2.i = z__3.r * x[i__1].i + z__3.i * x[
+ i__1].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L130: */
+ }
+ }
+ i__1 = jx;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+ jx -= *incx;
+/* L140: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ if (noconj) {
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ z__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ z__1.i = temp.r * a[i__2].i + temp.i * a[
+ i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__;
+ z__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, z__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L150: */
+ }
+ } else {
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i,
+ z__2.i = z__3.r * x[i__3].i + z__3.i * x[
+ i__3].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L160: */
+ }
+ }
+ i__2 = j;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+/* L170: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ ix = jx;
+ if (noconj) {
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ z__1.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
+ z__1.i = temp.r * a[i__2].i + temp.i * a[
+ i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ i__3 = i__ + j * a_dim1;
+ i__4 = ix;
+ z__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, z__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L180: */
+ }
+ } else {
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z__1.r = temp.r * z__2.r - temp.i * z__2.i,
+ z__1.i = temp.r * z__2.i + temp.i *
+ z__2.r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i,
+ z__2.i = z__3.r * x[i__3].i + z__3.i * x[
+ i__3].r;
+ z__1.r = temp.r + z__2.r, z__1.i = temp.i +
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L190: */
+ }
+ }
+ i__2 = jx;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+ jx += *incx;
+/* L200: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZTRMV . */
+
+} /* ztrmv_ */
+
+/* Subroutine */ int ztrsm_(char *side, char *uplo, char *transa, char *diag,
+ integer *m, integer *n, doublecomplex *alpha, doublecomplex *a,
+ integer *lda, doublecomplex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5,
+ i__6, i__7;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, k, info;
+ static doublecomplex temp;
+ static logical lside;
+ extern logical lsame_(char *, char *);
+ static integer nrowa;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ ZTRSM solves one of the matrix equations
+
+ op( A )*X = alpha*B, or X*op( A ) = alpha*B,
+
+ where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+ non-unit, upper or lower triangular matrix and op( A ) is one of
+
+ op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ).
+
+ The matrix X is overwritten on B.
+
+ Arguments
+ ==========
+
+ SIDE - CHARACTER*1.
+ On entry, SIDE specifies whether op( A ) appears on the left
+ or right of X as follows:
+
+ SIDE = 'L' or 'l' op( A )*X = alpha*B.
+
+ SIDE = 'R' or 'r' X*op( A ) = alpha*B.
+
+ Unchanged on exit.
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix A is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANSA - CHARACTER*1.
+ On entry, TRANSA specifies the form of op( A ) to be used in
+ the matrix multiplication as follows:
+
+ TRANSA = 'N' or 'n' op( A ) = A.
+
+ TRANSA = 'T' or 't' op( A ) = A'.
+
+ TRANSA = 'C' or 'c' op( A ) = conjg( A' ).
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit triangular
+ as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ M - INTEGER.
+ On entry, M specifies the number of rows of B. M must be at
+ least zero.
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the number of columns of B. N must be
+ at least zero.
+ Unchanged on exit.
+
+ ALPHA - COMPLEX*16 .
+ On entry, ALPHA specifies the scalar alpha. When alpha is
+ zero then A is not referenced and B need not be set before
+ entry.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m
+ when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
+ Before entry with UPLO = 'U' or 'u', the leading k by k
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading k by k
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. When SIDE = 'L' or 'l' then
+ LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+ then LDA must be at least max( 1, n ).
+ Unchanged on exit.
+
+ B - COMPLEX*16 array of DIMENSION ( LDB, n ).
+ Before entry, the leading m by n part of the array B must
+ contain the right-hand side matrix B, and on exit is
+ overwritten by the solution matrix X.
+
+ LDB - INTEGER.
+ On entry, LDB specifies the first dimension of B as declared
+ in the calling (sub) program. LDB must be at least
+ max( 1, m ).
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 3 Blas routine.
+
+ -- Written on 8-February-1989.
+ Jack Dongarra, Argonne National Laboratory.
+ Iain Duff, AERE Harwell.
+ Jeremy Du Croz, Numerical Algorithms Group Ltd.
+ Sven Hammarling, Numerical Algorithms Group Ltd.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ lside = lsame_(side, "L");
+ if (lside) {
+ nrowa = *m;
+ } else {
+ nrowa = *n;
+ }
+ noconj = lsame_(transa, "T");
+ nounit = lsame_(diag, "N");
+ upper = lsame_(uplo, "U");
+
+ info = 0;
+ if (! lside && ! lsame_(side, "R")) {
+ info = 1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ info = 2;
+ } else if (! lsame_(transa, "N") && ! lsame_(transa,
+ "T") && ! lsame_(transa, "C")) {
+ info = 3;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 4;
+ } else if (*m < 0) {
+ info = 5;
+ } else if (*n < 0) {
+ info = 6;
+ } else if (*lda < max(1,nrowa)) {
+ info = 9;
+ } else if (*ldb < max(1,*m)) {
+ info = 11;
+ }
+ if (info != 0) {
+ xerbla_("ZTRSM ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* And when alpha.eq.zero. */
+
+ if (alpha->r == 0. && alpha->i == 0.) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ b[i__3].r = 0., b[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ return 0;
+ }
+
+/* Start the operations. */
+
+ if (lside) {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*inv( A )*B. */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (alpha->r != 1. || alpha->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ z__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, z__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L30: */
+ }
+ }
+ for (k = *m; k >= 1; --k) {
+ i__2 = k + j * b_dim1;
+ if (b[i__2].r != 0. || b[i__2].i != 0.) {
+ if (nounit) {
+ i__2 = k + j * b_dim1;
+ z_div(&z__1, &b[k + j * b_dim1], &a[k + k *
+ a_dim1]);
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+ }
+ i__2 = k - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = k + j * b_dim1;
+ i__6 = i__ + k * a_dim1;
+ z__2.r = b[i__5].r * a[i__6].r - b[i__5].i *
+ a[i__6].i, z__2.i = b[i__5].r * a[
+ i__6].i + b[i__5].i * a[i__6].r;
+ z__1.r = b[i__4].r - z__2.r, z__1.i = b[i__4]
+ .i - z__2.i;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L40: */
+ }
+ }
+/* L50: */
+ }
+/* L60: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (alpha->r != 1. || alpha->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ z__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, z__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L70: */
+ }
+ }
+ i__2 = *m;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + j * b_dim1;
+ if (b[i__3].r != 0. || b[i__3].i != 0.) {
+ if (nounit) {
+ i__3 = k + j * b_dim1;
+ z_div(&z__1, &b[k + j * b_dim1], &a[k + k *
+ a_dim1]);
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+ }
+ i__3 = *m;
+ for (i__ = k + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = k + j * b_dim1;
+ i__7 = i__ + k * a_dim1;
+ z__2.r = b[i__6].r * a[i__7].r - b[i__6].i *
+ a[i__7].i, z__2.i = b[i__6].r * a[
+ i__7].i + b[i__6].i * a[i__7].r;
+ z__1.r = b[i__5].r - z__2.r, z__1.i = b[i__5]
+ .i - z__2.i;
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L80: */
+ }
+ }
+/* L90: */
+ }
+/* L100: */
+ }
+ }
+ } else {
+
+/*
+ Form B := alpha*inv( A' )*B
+ or B := alpha*inv( conjg( A' ) )*B.
+*/
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ z__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3].i,
+ z__1.i = alpha->r * b[i__3].i + alpha->i * b[
+ i__3].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ if (noconj) {
+ i__3 = i__ - 1;
+ for (k = 1; k <= i__3; ++k) {
+ i__4 = k + i__ * a_dim1;
+ i__5 = k + j * b_dim1;
+ z__2.r = a[i__4].r * b[i__5].r - a[i__4].i *
+ b[i__5].i, z__2.i = a[i__4].r * b[
+ i__5].i + a[i__4].i * b[i__5].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L110: */
+ }
+ if (nounit) {
+ z_div(&z__1, &temp, &a[i__ + i__ * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ } else {
+ i__3 = i__ - 1;
+ for (k = 1; k <= i__3; ++k) {
+ d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+ i__4 = k + j * b_dim1;
+ z__2.r = z__3.r * b[i__4].r - z__3.i * b[i__4]
+ .i, z__2.i = z__3.r * b[i__4].i +
+ z__3.i * b[i__4].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L120: */
+ }
+ if (nounit) {
+ d_cnjg(&z__2, &a[i__ + i__ * a_dim1]);
+ z_div(&z__1, &temp, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ i__3 = i__ + j * b_dim1;
+ b[i__3].r = temp.r, b[i__3].i = temp.i;
+/* L130: */
+ }
+/* L140: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ i__2 = i__ + j * b_dim1;
+ z__1.r = alpha->r * b[i__2].r - alpha->i * b[i__2].i,
+ z__1.i = alpha->r * b[i__2].i + alpha->i * b[
+ i__2].r;
+ temp.r = z__1.r, temp.i = z__1.i;
+ if (noconj) {
+ i__2 = *m;
+ for (k = i__ + 1; k <= i__2; ++k) {
+ i__3 = k + i__ * a_dim1;
+ i__4 = k + j * b_dim1;
+ z__2.r = a[i__3].r * b[i__4].r - a[i__3].i *
+ b[i__4].i, z__2.i = a[i__3].r * b[
+ i__4].i + a[i__3].i * b[i__4].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L150: */
+ }
+ if (nounit) {
+ z_div(&z__1, &temp, &a[i__ + i__ * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ } else {
+ i__2 = *m;
+ for (k = i__ + 1; k <= i__2; ++k) {
+ d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+ i__3 = k + j * b_dim1;
+ z__2.r = z__3.r * b[i__3].r - z__3.i * b[i__3]
+ .i, z__2.i = z__3.r * b[i__3].i +
+ z__3.i * b[i__3].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L160: */
+ }
+ if (nounit) {
+ d_cnjg(&z__2, &a[i__ + i__ * a_dim1]);
+ z_div(&z__1, &temp, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ i__2 = i__ + j * b_dim1;
+ b[i__2].r = temp.r, b[i__2].i = temp.i;
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+ }
+ } else {
+ if (lsame_(transa, "N")) {
+
+/* Form B := alpha*B*inv( A ). */
+
+ if (upper) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (alpha->r != 1. || alpha->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ z__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, z__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L190: */
+ }
+ }
+ i__2 = j - 1;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + j * a_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0.) {
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = k + j * a_dim1;
+ i__7 = i__ + k * b_dim1;
+ z__2.r = a[i__6].r * b[i__7].r - a[i__6].i *
+ b[i__7].i, z__2.i = a[i__6].r * b[
+ i__7].i + a[i__6].i * b[i__7].r;
+ z__1.r = b[i__5].r - z__2.r, z__1.i = b[i__5]
+ .i - z__2.i;
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L200: */
+ }
+ }
+/* L210: */
+ }
+ if (nounit) {
+ z_div(&z__1, &c_b1078, &a[j + j * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ z__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ z__1.i = temp.r * b[i__4].i + temp.i * b[
+ i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L220: */
+ }
+ }
+/* L230: */
+ }
+ } else {
+ for (j = *n; j >= 1; --j) {
+ if (alpha->r != 1. || alpha->i != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * b_dim1;
+ i__3 = i__ + j * b_dim1;
+ z__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3]
+ .i, z__1.i = alpha->r * b[i__3].i +
+ alpha->i * b[i__3].r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L240: */
+ }
+ }
+ i__1 = *n;
+ for (k = j + 1; k <= i__1; ++k) {
+ i__2 = k + j * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = k + j * a_dim1;
+ i__6 = i__ + k * b_dim1;
+ z__2.r = a[i__5].r * b[i__6].r - a[i__5].i *
+ b[i__6].i, z__2.i = a[i__5].r * b[
+ i__6].i + a[i__5].i * b[i__6].r;
+ z__1.r = b[i__4].r - z__2.r, z__1.i = b[i__4]
+ .i - z__2.i;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L250: */
+ }
+ }
+/* L260: */
+ }
+ if (nounit) {
+ z_div(&z__1, &c_b1078, &a[j + j * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * b_dim1;
+ i__3 = i__ + j * b_dim1;
+ z__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ z__1.i = temp.r * b[i__3].i + temp.i * b[
+ i__3].r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L270: */
+ }
+ }
+/* L280: */
+ }
+ }
+ } else {
+
+/*
+ Form B := alpha*B*inv( A' )
+ or B := alpha*B*inv( conjg( A' ) ).
+*/
+
+ if (upper) {
+ for (k = *n; k >= 1; --k) {
+ if (nounit) {
+ if (noconj) {
+ z_div(&z__1, &c_b1078, &a[k + k * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ } else {
+ d_cnjg(&z__2, &a[k + k * a_dim1]);
+ z_div(&z__1, &c_b1078, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + k * b_dim1;
+ i__3 = i__ + k * b_dim1;
+ z__1.r = temp.r * b[i__3].r - temp.i * b[i__3].i,
+ z__1.i = temp.r * b[i__3].i + temp.i * b[
+ i__3].r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L290: */
+ }
+ }
+ i__1 = k - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+ if (noconj) {
+ i__2 = j + k * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ } else {
+ d_cnjg(&z__1, &a[j + k * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + k * b_dim1;
+ z__2.r = temp.r * b[i__5].r - temp.i * b[i__5]
+ .i, z__2.i = temp.r * b[i__5].i +
+ temp.i * b[i__5].r;
+ z__1.r = b[i__4].r - z__2.r, z__1.i = b[i__4]
+ .i - z__2.i;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L300: */
+ }
+ }
+/* L310: */
+ }
+ if (alpha->r != 1. || alpha->i != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + k * b_dim1;
+ i__3 = i__ + k * b_dim1;
+ z__1.r = alpha->r * b[i__3].r - alpha->i * b[i__3]
+ .i, z__1.i = alpha->r * b[i__3].i +
+ alpha->i * b[i__3].r;
+ b[i__2].r = z__1.r, b[i__2].i = z__1.i;
+/* L320: */
+ }
+ }
+/* L330: */
+ }
+ } else {
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+ if (nounit) {
+ if (noconj) {
+ z_div(&z__1, &c_b1078, &a[k + k * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ } else {
+ d_cnjg(&z__2, &a[k + k * a_dim1]);
+ z_div(&z__1, &c_b1078, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + k * b_dim1;
+ i__4 = i__ + k * b_dim1;
+ z__1.r = temp.r * b[i__4].r - temp.i * b[i__4].i,
+ z__1.i = temp.r * b[i__4].i + temp.i * b[
+ i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L340: */
+ }
+ }
+ i__2 = *n;
+ for (j = k + 1; j <= i__2; ++j) {
+ i__3 = j + k * a_dim1;
+ if (a[i__3].r != 0. || a[i__3].i != 0.) {
+ if (noconj) {
+ i__3 = j + k * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ } else {
+ d_cnjg(&z__1, &a[j + k * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ i__3 = *m;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * b_dim1;
+ i__5 = i__ + j * b_dim1;
+ i__6 = i__ + k * b_dim1;
+ z__2.r = temp.r * b[i__6].r - temp.i * b[i__6]
+ .i, z__2.i = temp.r * b[i__6].i +
+ temp.i * b[i__6].r;
+ z__1.r = b[i__5].r - z__2.r, z__1.i = b[i__5]
+ .i - z__2.i;
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L350: */
+ }
+ }
+/* L360: */
+ }
+ if (alpha->r != 1. || alpha->i != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + k * b_dim1;
+ i__4 = i__ + k * b_dim1;
+ z__1.r = alpha->r * b[i__4].r - alpha->i * b[i__4]
+ .i, z__1.i = alpha->r * b[i__4].i +
+ alpha->i * b[i__4].r;
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L370: */
+ }
+ }
+/* L380: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZTRSM . */
+
+} /* ztrsm_ */
+
+/* Subroutine */ int ztrsv_(char *uplo, char *trans, char *diag, integer *n,
+ doublecomplex *a, integer *lda, doublecomplex *x, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, ix, jx, kx, info;
+ static doublecomplex temp;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconj, nounit;
+
+
+/*
+ Purpose
+ =======
+
+ ZTRSV solves one of the systems of equations
+
+ A*x = b, or A'*x = b, or conjg( A' )*x = b,
+
+ where b and x are n element vectors and A is an n by n unit, or
+ non-unit, upper or lower triangular matrix.
+
+ No test for singularity or near-singularity is included in this
+ routine. Such tests must be performed before calling this routine.
+
+ Arguments
+ ==========
+
+ UPLO - CHARACTER*1.
+ On entry, UPLO specifies whether the matrix is an upper or
+ lower triangular matrix as follows:
+
+ UPLO = 'U' or 'u' A is an upper triangular matrix.
+
+ UPLO = 'L' or 'l' A is a lower triangular matrix.
+
+ Unchanged on exit.
+
+ TRANS - CHARACTER*1.
+ On entry, TRANS specifies the equations to be solved as
+ follows:
+
+ TRANS = 'N' or 'n' A*x = b.
+
+ TRANS = 'T' or 't' A'*x = b.
+
+ TRANS = 'C' or 'c' conjg( A' )*x = b.
+
+ Unchanged on exit.
+
+ DIAG - CHARACTER*1.
+ On entry, DIAG specifies whether or not A is unit
+ triangular as follows:
+
+ DIAG = 'U' or 'u' A is assumed to be unit triangular.
+
+ DIAG = 'N' or 'n' A is not assumed to be unit
+ triangular.
+
+ Unchanged on exit.
+
+ N - INTEGER.
+ On entry, N specifies the order of the matrix A.
+ N must be at least zero.
+ Unchanged on exit.
+
+ A - COMPLEX*16 array of DIMENSION ( LDA, n ).
+ Before entry with UPLO = 'U' or 'u', the leading n by n
+ upper triangular part of the array A must contain the upper
+ triangular matrix and the strictly lower triangular part of
+ A is not referenced.
+ Before entry with UPLO = 'L' or 'l', the leading n by n
+ lower triangular part of the array A must contain the lower
+ triangular matrix and the strictly upper triangular part of
+ A is not referenced.
+ Note that when DIAG = 'U' or 'u', the diagonal elements of
+ A are not referenced either, but are assumed to be unity.
+ Unchanged on exit.
+
+ LDA - INTEGER.
+ On entry, LDA specifies the first dimension of A as declared
+ in the calling (sub) program. LDA must be at least
+ max( 1, n ).
+ Unchanged on exit.
+
+ X - COMPLEX*16 array of dimension at least
+ ( 1 + ( n - 1 )*abs( INCX ) ).
+ Before entry, the incremented array X must contain the n
+ element right-hand side vector b. On exit, X is overwritten
+ with the solution vector x.
+
+ INCX - INTEGER.
+ On entry, INCX specifies the increment for the elements of
+ X. INCX must not be zero.
+ Unchanged on exit.
+
+ Further Details
+ ===============
+
+ Level 2 Blas routine.
+
+ -- Written on 22-October-1986.
+ Jack Dongarra, Argonne National Lab.
+ Jeremy Du Croz, Nag Central Office.
+ Sven Hammarling, Nag Central Office.
+ Richard Hanson, Sandia National Labs.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+
+ /* Function Body */
+ info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ info = 1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T") && ! lsame_(trans, "C")) {
+ info = 2;
+ } else if (! lsame_(diag, "U") && ! lsame_(diag,
+ "N")) {
+ info = 3;
+ } else if (*n < 0) {
+ info = 4;
+ } else if (*lda < max(1,*n)) {
+ info = 6;
+ } else if (*incx == 0) {
+ info = 8;
+ }
+ if (info != 0) {
+ xerbla_("ZTRSV ", &info);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ noconj = lsame_(trans, "T");
+ nounit = lsame_(diag, "N");
+
+/*
+ Set up the start point in X if the increment is not unity. This
+ will be ( N - 1 )*INCX too small for descending loops.
+*/
+
+ if (*incx <= 0) {
+ kx = 1 - (*n - 1) * *incx;
+ } else if (*incx != 1) {
+ kx = 1;
+ }
+
+/*
+ Start the operations. In this version the elements of A are
+ accessed sequentially with one pass through A.
+*/
+
+ if (lsame_(trans, "N")) {
+
+/* Form x := inv( A )*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ if (x[i__1].r != 0. || x[i__1].i != 0.) {
+ if (nounit) {
+ i__1 = j;
+ z_div(&z__1, &x[j], &a[j + j * a_dim1]);
+ x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+ }
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ i__1 = i__;
+ i__2 = i__;
+ i__3 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
+ z__2.i = temp.r * a[i__3].i + temp.i * a[
+ i__3].r;
+ z__1.r = x[i__2].r - z__2.r, z__1.i = x[i__2].i -
+ z__2.i;
+ x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else {
+ jx = kx + (*n - 1) * *incx;
+ for (j = *n; j >= 1; --j) {
+ i__1 = jx;
+ if (x[i__1].r != 0. || x[i__1].i != 0.) {
+ if (nounit) {
+ i__1 = jx;
+ z_div(&z__1, &x[jx], &a[j + j * a_dim1]);
+ x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+ }
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ ix = jx;
+ for (i__ = j - 1; i__ >= 1; --i__) {
+ ix -= *incx;
+ i__1 = ix;
+ i__2 = ix;
+ i__3 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
+ z__2.i = temp.r * a[i__3].i + temp.i * a[
+ i__3].r;
+ z__1.r = x[i__2].r - z__2.r, z__1.i = x[i__2].i -
+ z__2.i;
+ x[i__1].r = z__1.r, x[i__1].i = z__1.i;
+/* L30: */
+ }
+ }
+ jx -= *incx;
+/* L40: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ if (x[i__2].r != 0. || x[i__2].i != 0.) {
+ if (nounit) {
+ i__2 = j;
+ z_div(&z__1, &x[j], &a[j + j * a_dim1]);
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+ }
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__;
+ i__4 = i__;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ z__1.r = x[i__4].r - z__2.r, z__1.i = x[i__4].i -
+ z__2.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = jx;
+ if (x[i__2].r != 0. || x[i__2].i != 0.) {
+ if (nounit) {
+ i__2 = jx;
+ z_div(&z__1, &x[jx], &a[j + j * a_dim1]);
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+ }
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ ix = jx;
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ ix += *incx;
+ i__3 = ix;
+ i__4 = ix;
+ i__5 = i__ + j * a_dim1;
+ z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
+ z__2.i = temp.r * a[i__5].i + temp.i * a[
+ i__5].r;
+ z__1.r = x[i__4].r - z__2.r, z__1.i = x[i__4].i -
+ z__2.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+/* L70: */
+ }
+ }
+ jx += *incx;
+/* L80: */
+ }
+ }
+ }
+ } else {
+
+/* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. */
+
+ if (lsame_(uplo, "U")) {
+ if (*incx == 1) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ if (noconj) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__;
+ z__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, z__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L90: */
+ }
+ if (nounit) {
+ z_div(&z__1, &temp, &a[j + j * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ } else {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = i__;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i,
+ z__2.i = z__3.r * x[i__3].i + z__3.i * x[
+ i__3].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L100: */
+ }
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z_div(&z__1, &temp, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ i__2 = j;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+/* L110: */
+ }
+ } else {
+ jx = kx;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ ix = kx;
+ i__2 = jx;
+ temp.r = x[i__2].r, temp.i = x[i__2].i;
+ if (noconj) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = ix;
+ z__2.r = a[i__3].r * x[i__4].r - a[i__3].i * x[
+ i__4].i, z__2.i = a[i__3].r * x[i__4].i +
+ a[i__3].i * x[i__4].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix += *incx;
+/* L120: */
+ }
+ if (nounit) {
+ z_div(&z__1, &temp, &a[j + j * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ } else {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__3 = ix;
+ z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i,
+ z__2.i = z__3.r * x[i__3].i + z__3.i * x[
+ i__3].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix += *incx;
+/* L130: */
+ }
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z_div(&z__1, &temp, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ i__2 = jx;
+ x[i__2].r = temp.r, x[i__2].i = temp.i;
+ jx += *incx;
+/* L140: */
+ }
+ }
+ } else {
+ if (*incx == 1) {
+ for (j = *n; j >= 1; --j) {
+ i__1 = j;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ if (noconj) {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__;
+ z__2.r = a[i__2].r * x[i__3].r - a[i__2].i * x[
+ i__3].i, z__2.i = a[i__2].r * x[i__3].i +
+ a[i__2].i * x[i__3].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L150: */
+ }
+ if (nounit) {
+ z_div(&z__1, &temp, &a[j + j * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ } else {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__2 = i__;
+ z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i,
+ z__2.i = z__3.r * x[i__2].i + z__3.i * x[
+ i__2].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+/* L160: */
+ }
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z_div(&z__1, &temp, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ i__1 = j;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+/* L170: */
+ }
+ } else {
+ kx += (*n - 1) * *incx;
+ jx = kx;
+ for (j = *n; j >= 1; --j) {
+ ix = kx;
+ i__1 = jx;
+ temp.r = x[i__1].r, temp.i = x[i__1].i;
+ if (noconj) {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = ix;
+ z__2.r = a[i__2].r * x[i__3].r - a[i__2].i * x[
+ i__3].i, z__2.i = a[i__2].r * x[i__3].i +
+ a[i__2].i * x[i__3].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix -= *incx;
+/* L180: */
+ }
+ if (nounit) {
+ z_div(&z__1, &temp, &a[j + j * a_dim1]);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ } else {
+ i__1 = j + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ d_cnjg(&z__3, &a[i__ + j * a_dim1]);
+ i__2 = ix;
+ z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i,
+ z__2.i = z__3.r * x[i__2].i + z__3.i * x[
+ i__2].r;
+ z__1.r = temp.r - z__2.r, z__1.i = temp.i -
+ z__2.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ ix -= *incx;
+/* L190: */
+ }
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z_div(&z__1, &temp, &z__2);
+ temp.r = z__1.r, temp.i = z__1.i;
+ }
+ }
+ i__1 = jx;
+ x[i__1].r = temp.r, x[i__1].i = temp.i;
+ jx -= *incx;
+/* L200: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZTRSV . */
+
+} /* ztrsv_ */
+
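The triangular BLAS kernels above (ztrmv_, ztrsm_, ztrsv_) follow the usual f2c calling convention: every argument is passed by pointer, option flags are one-character strings inspected by lsame_, and matrices are stored column-major with a leading dimension. The sketch below is not part of the vendored file; it shows one hypothetical way to drive ztrmv_ and ztrsv_ from C, and the wrapper function and 2x2 data are illustrative assumptions only.

    /* Minimal sketch, assuming the f2c.h shipped alongside this file. */
    #include "f2c.h"

    extern int ztrmv_(char *uplo, char *trans, char *diag, integer *n,
                      doublecomplex *a, integer *lda, doublecomplex *x,
                      integer *incx);
    extern int ztrsv_(char *uplo, char *trans, char *diag, integer *n,
                      doublecomplex *a, integer *lda, doublecomplex *x,
                      integer *incx);

    int triangular_roundtrip(void)            /* hypothetical helper */
    {
        /* Column-major 2x2 upper triangular A = [ 1  2+i ; 0  3 ]. */
        doublecomplex a[4] = { {1., 0.}, {0., 0.}, {2., 1.}, {3., 0.} };
        doublecomplex x[2] = { {1., 0.}, {1., 0.} };
        integer n = 2, lda = 2, incx = 1;

        /* x := A*x  (upper triangular, no transpose, non-unit diagonal). */
        ztrmv_("U", "N", "N", &n, a, &lda, x, &incx);

        /* Solve A*x = b in place, recovering the original vector. */
        ztrsv_("U", "N", "N", &n, a, &lda, x, &incx);

        return 0;
    }

Because this translation omits the trailing Fortran string-length arguments, single-character C literals such as "U" suffice; lsame_ only examines the first character of each flag.
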
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.c
new file mode 100644
index 00000000000..f52e1e1572c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.c
@@ -0,0 +1,29861 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static complex c_b56 = {0.f,0.f};
+static complex c_b57 = {1.f,0.f};
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+static integer c__0 = 0;
+static integer c__65 = 65;
+static integer c__9 = 9;
+static integer c__6 = 6;
+static real c_b328 = 0.f;
+static real c_b1034 = 1.f;
+static integer c__12 = 12;
+static integer c__49 = 49;
+static real c_b1276 = -1.f;
+static integer c__13 = 13;
+static integer c__15 = 15;
+static integer c__14 = 14;
+static integer c__16 = 16;
+static logical c_false = FALSE_;
+static logical c_true = TRUE_;
+static real c_b2435 = .5f;
+
+/* Subroutine */ int cgebak_(char *job, char *side, integer *n, integer *ilo,
+ integer *ihi, real *scale, integer *m, complex *v, integer *ldv,
+ integer *info)
+{
+ /* System generated locals */
+ integer v_dim1, v_offset, i__1;
+
+ /* Local variables */
+ static integer i__, k;
+ static real s;
+ static integer ii;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int cswap_(integer *, complex *, integer *,
+ complex *, integer *);
+ static logical leftv;
+ extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
+ *), xerbla_(char *, integer *);
+ static logical rightv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGEBAK forms the right or left eigenvectors of a complex general
+ matrix by backward transformation on the computed eigenvectors of the
+ balanced matrix output by CGEBAL.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the type of backward transformation required:
+ = 'N', do nothing, return immediately;
+ = 'P', do backward transformation for permutation only;
+ = 'S', do backward transformation for scaling only;
+ = 'B', do backward transformations for both permutation and
+ scaling.
+ JOB must be the same as the argument JOB supplied to CGEBAL.
+
+ SIDE (input) CHARACTER*1
+ = 'R': V contains right eigenvectors;
+ = 'L': V contains left eigenvectors.
+
+ N (input) INTEGER
+ The number of rows of the matrix V. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ The integers ILO and IHI determined by CGEBAL.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ SCALE (input) REAL array, dimension (N)
+ Details of the permutation and scaling factors, as returned
+ by CGEBAL.
+
+ M (input) INTEGER
+ The number of columns of the matrix V. M >= 0.
+
+ V (input/output) COMPLEX array, dimension (LDV,M)
+ On entry, the matrix of right or left eigenvectors to be
+ transformed, as returned by CHSEIN or CTREVC.
+ On exit, V is overwritten by the transformed eigenvectors.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V. LDV >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Decode and Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --scale;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+
+ /* Function Body */
+ rightv = lsame_(side, "R");
+ leftv = lsame_(side, "L");
+
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (! rightv && ! leftv) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -7;
+ } else if (*ldv < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEBAK", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*m == 0) {
+ return 0;
+ }
+ if (lsame_(job, "N")) {
+ return 0;
+ }
+
+ if (*ilo == *ihi) {
+ goto L30;
+ }
+
+/* Backward balance */
+
+ if (lsame_(job, "S") || lsame_(job, "B")) {
+
+ if (rightv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = scale[i__];
+ csscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L10: */
+ }
+ }
+
+ if (leftv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = 1.f / scale[i__];
+ csscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L20: */
+ }
+ }
+
+ }
+
+/*
+ Backward permutation
+
+ For I = ILO-1 step -1 until 1,
+ IHI+1 step 1 until N do --
+*/
+
+L30:
+ if (lsame_(job, "P") || lsame_(job, "B")) {
+ if (rightv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L40;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = scale[i__];
+ if (k == i__) {
+ goto L40;
+ }
+ cswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L40:
+ ;
+ }
+ }
+
+ if (leftv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L50;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = scale[i__];
+ if (k == i__) {
+ goto L50;
+ }
+ cswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L50:
+ ;
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CGEBAK */
+
+} /* cgebak_ */
+
+/* Subroutine */ int cgebal_(char *job, integer *n, complex *a, integer *lda,
+ integer *ilo, integer *ihi, real *scale, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real c__, f, g;
+ static integer i__, j, k, l, m;
+ static real r__, s, ca, ra;
+ static integer ica, ira, iexc;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int cswap_(integer *, complex *, integer *,
+ complex *, integer *);
+ static real sfmin1, sfmin2, sfmax1, sfmax2;
+ extern integer icamax_(integer *, complex *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
+ *), xerbla_(char *, integer *);
+ extern logical sisnan_(real *);
+ static logical noconv;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ CGEBAL balances a general complex matrix A. This involves, first,
+ permuting A by a similarity transformation to isolate eigenvalues
+ in the first 1 to ILO-1 and last IHI+1 to N elements on the
+ diagonal; and second, applying a diagonal similarity transformation
+ to rows and columns ILO to IHI to make the rows and columns as
+ close in norm as possible. Both steps are optional.
+
+ Balancing may reduce the 1-norm of the matrix, and improve the
+ accuracy of the computed eigenvalues and/or eigenvectors.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the operations to be performed on A:
+ = 'N': none: simply set ILO = 1, IHI = N, SCALE(I) = 1.0
+ for i = 1,...,N;
+ = 'P': permute only;
+ = 'S': scale only;
+ = 'B': both permute and scale.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the input matrix A.
+ On exit, A is overwritten by the balanced matrix.
+ If JOB = 'N', A is not referenced.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ ILO (output) INTEGER
+ IHI (output) INTEGER
+ ILO and IHI are set to integers such that on exit
+ A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N.
+ If JOB = 'N' or 'S', ILO = 1 and IHI = N.
+
+ SCALE (output) REAL array, dimension (N)
+ Details of the permutations and scaling factors applied to
+ A. If P(j) is the index of the row and column interchanged
+ with row and column j and D(j) is the scaling factor
+ applied to row and column j, then
+ SCALE(j) = P(j) for j = 1,...,ILO-1
+ = D(j) for j = ILO,...,IHI
+ = P(j) for j = IHI+1,...,N.
+ The order in which the interchanges are made is N to IHI+1,
+ then 1 to ILO-1.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The permutations consist of row and column interchanges which put
+ the matrix in the form
+
+ ( T1 X Y )
+ P A P = ( 0 B Z )
+ ( 0 0 T2 )
+
+ where T1 and T2 are upper triangular matrices whose eigenvalues lie
+ along the diagonal. The column indices ILO and IHI mark the starting
+ and ending columns of the submatrix B. Balancing consists of applying
+ a diagonal similarity transformation inv(D) * B * D to make the
+ 1-norms of each row of B and its corresponding column nearly equal.
+ The output matrix is
+
+ ( T1 X*D Y )
+ ( 0 inv(D)*B*D inv(D)*Z ).
+ ( 0 0 T2 )
+
+ Information about the permutations P and the diagonal matrix D is
+ returned in the vector SCALE.
+
+ This subroutine is based on the EISPACK routine CBAL.
+
+ Modified by Tzu-Yi Chen, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --scale;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEBAL", &i__1);
+ return 0;
+ }
+
+ k = 1;
+ l = *n;
+
+ if (*n == 0) {
+ goto L210;
+ }
+
+ if (lsame_(job, "N")) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scale[i__] = 1.f;
+/* L10: */
+ }
+ goto L210;
+ }
+
+ if (lsame_(job, "S")) {
+ goto L120;
+ }
+
+/* Permutation to isolate eigenvalues if possible */
+
+ goto L50;
+
+/* Row and column exchange. */
+
+L20:
+ scale[m] = (real) j;
+ if (j == m) {
+ goto L30;
+ }
+
+ cswap_(&l, &a[j * a_dim1 + 1], &c__1, &a[m * a_dim1 + 1], &c__1);
+ i__1 = *n - k + 1;
+ cswap_(&i__1, &a[j + k * a_dim1], lda, &a[m + k * a_dim1], lda);
+
+L30:
+ switch (iexc) {
+ case 1: goto L40;
+ case 2: goto L80;
+ }
+
+/* Search for rows isolating an eigenvalue and push them down. */
+
+L40:
+ if (l == 1) {
+ goto L210;
+ }
+ --l;
+
+L50:
+ for (j = l; j >= 1; --j) {
+
+ i__1 = l;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ == j) {
+ goto L60;
+ }
+ i__2 = j + i__ * a_dim1;
+ if (a[i__2].r != 0.f || r_imag(&a[j + i__ * a_dim1]) != 0.f) {
+ goto L70;
+ }
+L60:
+ ;
+ }
+
+ m = l;
+ iexc = 1;
+ goto L20;
+L70:
+ ;
+ }
+
+ goto L90;
+
+/* Search for columns isolating an eigenvalue and push them left. */
+
+L80:
+ ++k;
+
+L90:
+ i__1 = l;
+ for (j = k; j <= i__1; ++j) {
+
+ i__2 = l;
+ for (i__ = k; i__ <= i__2; ++i__) {
+ if (i__ == j) {
+ goto L100;
+ }
+ i__3 = i__ + j * a_dim1;
+ if (a[i__3].r != 0.f || r_imag(&a[i__ + j * a_dim1]) != 0.f) {
+ goto L110;
+ }
+L100:
+ ;
+ }
+
+ m = k;
+ iexc = 2;
+ goto L20;
+L110:
+ ;
+ }
+
+L120:
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ scale[i__] = 1.f;
+/* L130: */
+ }
+
+ if (lsame_(job, "P")) {
+ goto L210;
+ }
+
+/*
+ Balance the submatrix in rows K to L.
+
+ Iterative loop for norm reduction
+*/
+
+ sfmin1 = slamch_("S") / slamch_("P");
+ sfmax1 = 1.f / sfmin1;
+ sfmin2 = sfmin1 * 2.f;
+ sfmax2 = 1.f / sfmin2;
+L140:
+ noconv = FALSE_;
+
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ c__ = 0.f;
+ r__ = 0.f;
+
+ i__2 = l;
+ for (j = k; j <= i__2; ++j) {
+ if (j == i__) {
+ goto L150;
+ }
+ i__3 = j + i__ * a_dim1;
+ c__ += (r__1 = a[i__3].r, dabs(r__1)) + (r__2 = r_imag(&a[j + i__
+ * a_dim1]), dabs(r__2));
+ i__3 = i__ + j * a_dim1;
+ r__ += (r__1 = a[i__3].r, dabs(r__1)) + (r__2 = r_imag(&a[i__ + j
+ * a_dim1]), dabs(r__2));
+L150:
+ ;
+ }
+ ica = icamax_(&l, &a[i__ * a_dim1 + 1], &c__1);
+ ca = c_abs(&a[ica + i__ * a_dim1]);
+ i__2 = *n - k + 1;
+ ira = icamax_(&i__2, &a[i__ + k * a_dim1], lda);
+ ra = c_abs(&a[i__ + (ira + k - 1) * a_dim1]);
+
+/* Guard against zero C or R due to underflow. */
+
+ if (c__ == 0.f || r__ == 0.f) {
+ goto L200;
+ }
+ g = r__ / 2.f;
+ f = 1.f;
+ s = c__ + r__;
+L160:
+/* Computing MAX */
+ r__1 = max(f,c__);
+/* Computing MIN */
+ r__2 = min(r__,g);
+ if (c__ >= g || dmax(r__1,ca) >= sfmax2 || dmin(r__2,ra) <= sfmin2) {
+ goto L170;
+ }
+ r__1 = c__ + f + ca + r__ + g + ra;
+ if (sisnan_(&r__1)) {
+
+/* Exit if NaN to avoid infinite loop */
+
+ *info = -3;
+ i__2 = -(*info);
+ xerbla_("CGEBAL", &i__2);
+ return 0;
+ }
+ f *= 2.f;
+ c__ *= 2.f;
+ ca *= 2.f;
+ r__ /= 2.f;
+ g /= 2.f;
+ ra /= 2.f;
+ goto L160;
+
+L170:
+ g = c__ / 2.f;
+L180:
+/* Computing MIN */
+ r__1 = min(f,c__), r__1 = min(r__1,g);
+ if (g < r__ || dmax(r__,ra) >= sfmax2 || dmin(r__1,ca) <= sfmin2) {
+ goto L190;
+ }
+ f /= 2.f;
+ c__ /= 2.f;
+ g /= 2.f;
+ ca /= 2.f;
+ r__ *= 2.f;
+ ra *= 2.f;
+ goto L180;
+
+/* Now balance. */
+
+L190:
+ if (c__ + r__ >= s * .95f) {
+ goto L200;
+ }
+ if (f < 1.f && scale[i__] < 1.f) {
+ if (f * scale[i__] <= sfmin1) {
+ goto L200;
+ }
+ }
+ if (f > 1.f && scale[i__] > 1.f) {
+ if (scale[i__] >= sfmax1 / f) {
+ goto L200;
+ }
+ }
+ g = 1.f / f;
+ scale[i__] *= f;
+ noconv = TRUE_;
+
+ i__2 = *n - k + 1;
+ csscal_(&i__2, &g, &a[i__ + k * a_dim1], lda);
+ csscal_(&l, &f, &a[i__ * a_dim1 + 1], &c__1);
+
+L200:
+ ;
+ }
+
+ if (noconv) {
+ goto L140;
+ }
+
+L210:
+ *ilo = k;
+ *ihi = l;
+
+ return 0;
+
+/* End of CGEBAL */
+
+} /* cgebal_ */
+
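As the documentation above describes, CGEBAL and CGEBAK are meant to be used as a pair: balance A, compute eigenvectors of the balanced matrix, then back-transform them. The fragment below is a hypothetical illustration of that pairing under the same f2c conventions, not part of this file; the function name and the omitted eigenvector step are assumptions.

    /* Minimal sketch, assuming f2c.h; 'complex' and 'real' are the f2c types. */
    #include <stdlib.h>
    #include "f2c.h"

    extern int cgebal_(char *job, integer *n, complex *a, integer *lda,
                       integer *ilo, integer *ihi, real *scale, integer *info);
    extern int cgebak_(char *job, char *side, integer *n, integer *ilo,
                       integer *ihi, real *scale, integer *m, complex *v,
                       integer *ldv, integer *info);

    /* Hypothetical driver: balance A, then map right eigenvectors in V
       (computed elsewhere on the balanced matrix) back to those of A. */
    int balance_roundtrip(integer n, complex *a, integer lda,
                          complex *v, integer ldv, integer m)
    {
        integer ilo, ihi, info;
        real *scale = (real *) malloc((size_t) n * sizeof(real));
        if (scale == NULL) {
            return -1;
        }

        cgebal_("B", &n, a, &lda, &ilo, &ihi, scale, &info);  /* permute + scale */
        /* ... eigenvector computation on the balanced A goes here ... */
        cgebak_("B", "R", &n, &ilo, &ihi, scale, &m, v, &ldv, &info);

        free(scale);
        return (int) info;
    }

Note that the JOB flag passed to cgebak_ must match the one used with cgebal_, and that the SCALE array produced by the balancing step encodes both the permutation and the diagonal scaling to be undone.
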
+/* Subroutine */ int cgebd2_(integer *m, integer *n, complex *a, integer *lda,
+ real *d__, real *e, complex *tauq, complex *taup, complex *work,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__;
+ static complex alpha;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *),
+ clarfg_(integer *, complex *, complex *, integer *, complex *),
+ clacgv_(integer *, complex *, integer *), xerbla_(char *, integer
+ *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGEBD2 reduces a complex general m by n matrix A to upper or lower
+ real bidiagonal form B by a unitary transformation: Q' * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the unitary matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the unitary matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) REAL array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+    TAUQ    (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix P. See Further Details.
+
+ WORK (workspace) COMPLEX array, dimension (max(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in
+ A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in
+ A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, v and u are complex vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("CGEBD2", &i__1);
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1, &
+ tauq[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Apply H(i)' to A(i:m,i+1:n) from the left */
+
+ if (i__ < *n) {
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ r_cnjg(&q__1, &tauq[i__]);
+ clarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &
+ q__1, &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ }
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = d__[i__3], a[i__2].i = 0.f;
+
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector G(i) to annihilate
+ A(i,i+2:n)
+*/
+
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Apply G(i) to A(i+1:m,i+1:n) from the right */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ clarf_("Right", &i__2, &i__3, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &taup[i__], &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &work[1]);
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ i__3 = i__;
+ a[i__2].r = e[i__3], a[i__2].i = 0.f;
+ } else {
+ i__2 = i__;
+ taup[i__2].r = 0.f, taup[i__2].i = 0.f;
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector G(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ clarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Apply G(i) to A(i+1:m,i:n) from the right */
+
+ if (i__ < *m) {
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ clarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &
+ taup[i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = d__[i__3], a[i__2].i = 0.f;
+
+ if (i__ < *m) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:m,i)
+*/
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1,
+ &tauq[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Apply H(i)' to A(i+1:m,i+1:n) from the left */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ r_cnjg(&q__1, &tauq[i__]);
+ clarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &
+ c__1, &q__1, &a[i__ + 1 + (i__ + 1) * a_dim1], lda, &
+ work[1]);
+ i__2 = i__ + 1 + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = e[i__3], a[i__2].i = 0.f;
+ } else {
+ i__2 = i__;
+ tauq[i__2].r = 0.f, tauq[i__2].i = 0.f;
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of CGEBD2 */
+
+} /* cgebd2_ */
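+
+/*
+   A small illustrative sketch (not part of the LAPACK sources above) of the
+   elementary reflector form H = I - tau * v * v' described in the Further
+   Details of CGEBD2, using C99 float complex rather than the f2c complex
+   struct.  It applies H to a vector x of length n, i.e.
+   x := x - tau * v * (v' * x); the name is illustrative only.
+*/
+#include <complex.h>
+
+static void apply_reflector_sketch(int n, float complex tau,
+                                   const float complex *v, float complex *x)
+{
+    float complex dot = 0.0f;            /* dot = v' * x (v conjugated) */
+    for (int i = 0; i < n; ++i)
+        dot += conjf(v[i]) * x[i];
+    for (int i = 0; i < n; ++i)          /* x := x - tau * v * dot */
+        x[i] -= tau * v[i] * dot;
+}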
+
+/* Subroutine */ int cgebrd_(integer *m, integer *n, complex *a, integer *lda,
+ real *d__, real *e, complex *tauq, complex *taup, complex *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, nb, nx;
+ static real ws;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *);
+ static integer nbmin, iinfo, minmn;
+ extern /* Subroutine */ int cgebd2_(integer *, integer *, complex *,
+ integer *, real *, real *, complex *, complex *, complex *,
+ integer *), clabrd_(integer *, integer *, integer *, complex *,
+ integer *, real *, real *, complex *, complex *, complex *,
+ integer *, complex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwrkx, ldwrky, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGEBRD reduces a general complex M-by-N matrix A to upper or lower
+ bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the M-by-N general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the unitary matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the unitary matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) REAL array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+    TAUQ    (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix P. See Further Details.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,M,N).
+ For optimum performance LWORK >= (M+N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in
+ A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in
+ A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MAX */
+ i__1 = 1, i__2 = ilaenv_(&c__1, "CGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = max(i__1,i__2);
+ lwkopt = (*m + *n) * nb;
+ r__1 = (real) lwkopt;
+ work[1].r = r__1, work[1].i = 0.f;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = max(1,*m);
+ if (*lwork < max(i__1,*n) && ! lquery) {
+ *info = -10;
+ }
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("CGEBRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ minmn = min(*m,*n);
+ if (minmn == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ ws = (real) max(*m,*n);
+ ldwrkx = *m;
+ ldwrky = *n;
+
+ if (nb > 1 && nb < minmn) {
+
+/*
+ Set the crossover point NX.
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "CGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+
+/* Determine when to switch from blocked to unblocked code. */
+
+ if (nx < minmn) {
+ ws = (real) ((*m + *n) * nb);
+ if ((real) (*lwork) < ws) {
+
+/*
+ Not enough work space for the optimal NB, consider using
+ a smaller block size.
+*/
+
+ nbmin = ilaenv_(&c__2, "CGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ if (*lwork >= (*m + *n) * nbmin) {
+ nb = *lwork / (*m + *n);
+ } else {
+ nb = 1;
+ nx = minmn;
+ }
+ }
+ }
+ } else {
+ nx = minmn;
+ }
+
+ i__1 = minmn - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+
+/*
+ Reduce rows and columns i:i+ib-1 to bidiagonal form and return
+ the matrices X and Y which are needed to update the unreduced
+ part of the matrix
+*/
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ + 1;
+ clabrd_(&i__3, &i__4, &nb, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[
+ i__], &tauq[i__], &taup[i__], &work[1], &ldwrkx, &work[ldwrkx
+ * nb + 1], &ldwrky);
+
+/*
+ Update the trailing submatrix A(i+ib:m,i+ib:n), using
+ an update of the form A := A - V*Y' - X*U'
+*/
+
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", &i__3, &i__4, &nb, &
+ q__1, &a[i__ + nb + i__ * a_dim1], lda, &work[ldwrkx * nb +
+ nb + 1], &ldwrky, &c_b57, &a[i__ + nb + (i__ + nb) * a_dim1],
+ lda);
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "No transpose", &i__3, &i__4, &nb, &q__1, &
+ work[nb + 1], &ldwrkx, &a[i__ + (i__ + nb) * a_dim1], lda, &
+ c_b57, &a[i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/* Copy diagonal and off-diagonal elements of B back into A */
+
+ if (*m >= *n) {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = d__[i__5], a[i__4].i = 0.f;
+ i__4 = j + (j + 1) * a_dim1;
+ i__5 = j;
+ a[i__4].r = e[i__5], a[i__4].i = 0.f;
+/* L10: */
+ }
+ } else {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = d__[i__5], a[i__4].i = 0.f;
+ i__4 = j + 1 + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = e[i__5], a[i__4].i = 0.f;
+/* L20: */
+ }
+ }
+/* L30: */
+ }
+
+/* Use unblocked code to reduce the remainder of the matrix */
+
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ cgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], &
+ tauq[i__], &taup[i__], &work[1], &iinfo);
+ work[1].r = ws, work[1].i = 0.f;
+ return 0;
+
+/* End of CGEBRD */
+
+} /* cgebrd_ */
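+
+/*
+   A hedged usage sketch (not part of the LAPACK sources above) of the
+   LWORK = -1 workspace-query convention documented for CGEBRD.  It assumes
+   the f2c typedefs (integer, real, complex) are in scope, that A is
+   column-major with leading dimension lda, and that d, e, tauq and taup have
+   the sizes documented above; the helper name is illustrative and error
+   handling is minimal.
+*/
+#include <stdlib.h>
+
+static int bidiagonalize_sketch(integer m, integer n, complex *a, integer lda,
+                                real *d, real *e, complex *tauq, complex *taup)
+{
+    integer info, lwork = -1;
+    complex query;
+
+    /* First call: workspace query only; the optimal size comes back in query.r */
+    cgebrd_(&m, &n, a, &lda, d, e, tauq, taup, &query, &lwork, &info);
+    if (info != 0) return (int) info;
+
+    lwork = (integer) query.r;
+    complex *work = (complex *) malloc((size_t) lwork * sizeof(complex));
+    if (work == NULL) return -1;
+
+    /* Second call: the actual reduction to bidiagonal form */
+    cgebrd_(&m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info);
+    free(work);
+    return (int) info;
+}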
+
+/* Subroutine */ int cgeev_(char *jobvl, char *jobvr, integer *n, complex *a,
+ integer *lda, complex *w, complex *vl, integer *ldvl, complex *vr,
+ integer *ldvr, complex *work, integer *lwork, real *rwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3;
+ real r__1, r__2;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, k, ihi;
+ static real scl;
+ static integer ilo;
+ static real dum[1], eps;
+ static complex tmp;
+ static integer ibal;
+ static char side[1];
+ static real anrm;
+ static integer ierr, itau, iwrk, nout;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern doublereal scnrm2_(integer *, complex *, integer *);
+ extern /* Subroutine */ int cgebak_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, complex *, integer *, integer *), cgebal_(char *, integer *, complex *, integer *,
+ integer *, integer *, real *, integer *), slabad_(real *,
+ real *);
+ static logical scalea;
+ extern doublereal clange_(char *, integer *, integer *, complex *,
+ integer *, real *);
+ static real cscale;
+ extern /* Subroutine */ int cgehrd_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, integer *),
+ clascl_(char *, integer *, integer *, real *, real *, integer *,
+ integer *, complex *, integer *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
+ *), clacpy_(char *, integer *, integer *, complex *, integer *,
+ complex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical select[1];
+ static real bignum;
+ extern integer isamax_(integer *, real *, integer *);
+ extern /* Subroutine */ int chseqr_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *), ctrevc_(char *,
+ char *, logical *, integer *, complex *, integer *, complex *,
+ integer *, complex *, integer *, integer *, integer *, complex *,
+ real *, integer *), cunghr_(integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ integer *);
+ static integer minwrk, maxwrk;
+ static logical wantvl;
+ static real smlnum;
+ static integer hswork, irwork;
+ static logical lquery, wantvr;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGEEV computes for an N-by-N complex nonsymmetric matrix A, the
+ eigenvalues and, optionally, the left and/or right eigenvectors.
+
+ The right eigenvector v(j) of A satisfies
+ A * v(j) = lambda(j) * v(j)
+ where lambda(j) is its eigenvalue.
+ The left eigenvector u(j) of A satisfies
+ u(j)**H * A = lambda(j) * u(j)**H
+ where u(j)**H denotes the conjugate transpose of u(j).
+
+ The computed eigenvectors are normalized to have Euclidean norm
+ equal to 1 and largest component real.
+
+ Arguments
+ =========
+
+ JOBVL (input) CHARACTER*1
+ = 'N': left eigenvectors of A are not computed;
+          = 'V': left eigenvectors of A are computed.
+
+ JOBVR (input) CHARACTER*1
+ = 'N': right eigenvectors of A are not computed;
+ = 'V': right eigenvectors of A are computed.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the N-by-N matrix A.
+ On exit, A has been overwritten.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ W (output) COMPLEX array, dimension (N)
+ W contains the computed eigenvalues.
+
+ VL (output) COMPLEX array, dimension (LDVL,N)
+ If JOBVL = 'V', the left eigenvectors u(j) are stored one
+ after another in the columns of VL, in the same order
+ as their eigenvalues.
+ If JOBVL = 'N', VL is not referenced.
+ u(j) = VL(:,j), the j-th column of VL.
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1; if
+ JOBVL = 'V', LDVL >= N.
+
+ VR (output) COMPLEX array, dimension (LDVR,N)
+ If JOBVR = 'V', the right eigenvectors v(j) are stored one
+ after another in the columns of VR, in the same order
+ as their eigenvalues.
+ If JOBVR = 'N', VR is not referenced.
+ v(j) = VR(:,j), the j-th column of VR.
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1; if
+ JOBVR = 'V', LDVR >= N.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,2*N).
+ For good performance, LWORK must generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ RWORK (workspace) REAL array, dimension (2*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = i, the QR algorithm failed to compute all the
+ eigenvalues, and no eigenvectors have been computed;
+                elements i+1:N of W contain eigenvalues which have
+ converged.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --w;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+ --rwork;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1;
+ wantvl = lsame_(jobvl, "V");
+ wantvr = lsame_(jobvr, "V");
+ if (! wantvl && ! lsame_(jobvl, "N")) {
+ *info = -1;
+ } else if (! wantvr && ! lsame_(jobvr, "N")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldvl < 1 || wantvl && *ldvl < *n) {
+ *info = -8;
+ } else if (*ldvr < 1 || wantvr && *ldvr < *n) {
+ *info = -10;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ CWorkspace refers to complex workspace, and RWorkspace to real
+ workspace. NB refers to the optimal block size for the
+ immediately following subroutine, as returned by ILAENV.
+ HSWORK refers to the workspace preferred by CHSEQR, as
+ calculated below. HSWORK is computed assuming ILO=1 and IHI=N,
+ the worst case.)
+*/
+
+ if (*info == 0) {
+ if (*n == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ } else {
+ maxwrk = *n + *n * ilaenv_(&c__1, "CGEHRD", " ", n, &c__1, n, &
+ c__0, (ftnlen)6, (ftnlen)1);
+ minwrk = *n << 1;
+ if (wantvl) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + (*n - 1) * ilaenv_(&c__1, "CUNGHR",
+ " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ chseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &w[1], &vl[
+ vl_offset], ldvl, &work[1], &c_n1, info);
+ } else if (wantvr) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + (*n - 1) * ilaenv_(&c__1, "CUNGHR",
+ " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ chseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[1], &c_n1, info);
+ } else {
+ chseqr_("E", "N", n, &c__1, n, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[1], &c_n1, info);
+ }
+ hswork = work[1].r;
+/* Computing MAX */
+ i__1 = max(maxwrk,hswork);
+ maxwrk = max(i__1,minwrk);
+ }
+ work[1].r = (real) maxwrk, work[1].i = 0.f;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEEV ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = slamch_("P");
+ smlnum = slamch_("S");
+ bignum = 1.f / smlnum;
+ slabad_(&smlnum, &bignum);
+ smlnum = sqrt(smlnum) / eps;
+ bignum = 1.f / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = clange_("M", n, n, &a[a_offset], lda, dum);
+ scalea = FALSE_;
+ if (anrm > 0.f && anrm < smlnum) {
+ scalea = TRUE_;
+ cscale = smlnum;
+ } else if (anrm > bignum) {
+ scalea = TRUE_;
+ cscale = bignum;
+ }
+ if (scalea) {
+ clascl_("G", &c__0, &c__0, &anrm, &cscale, n, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+/*
+ Balance the matrix
+ (CWorkspace: none)
+ (RWorkspace: need N)
+*/
+
+ ibal = 1;
+ cgebal_("B", n, &a[a_offset], lda, &ilo, &ihi, &rwork[ibal], &ierr);
+
+/*
+ Reduce to upper Hessenberg form
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: none)
+*/
+
+ itau = 1;
+ iwrk = itau + *n;
+ i__1 = *lwork - iwrk + 1;
+ cgehrd_(n, &ilo, &ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1,
+ &ierr);
+
+ if (wantvl) {
+
+/*
+ Want left eigenvectors
+ Copy Householder vectors to VL
+*/
+
+ *(unsigned char *)side = 'L';
+ clacpy_("L", n, n, &a[a_offset], lda, &vl[vl_offset], ldvl)
+ ;
+
+/*
+ Generate unitary matrix in VL
+ (CWorkspace: need 2*N-1, prefer N+(N-1)*NB)
+ (RWorkspace: none)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ cunghr_(n, &ilo, &ihi, &vl[vl_offset], ldvl, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VL
+ (CWorkspace: need 1, prefer HSWORK (see comments) )
+ (RWorkspace: none)
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ chseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &w[1], &vl[
+ vl_offset], ldvl, &work[iwrk], &i__1, info);
+
+ if (wantvr) {
+
+/*
+ Want left and right eigenvectors
+ Copy Schur vectors to VR
+*/
+
+ *(unsigned char *)side = 'B';
+ clacpy_("F", n, n, &vl[vl_offset], ldvl, &vr[vr_offset], ldvr);
+ }
+
+ } else if (wantvr) {
+
+/*
+ Want right eigenvectors
+ Copy Householder vectors to VR
+*/
+
+ *(unsigned char *)side = 'R';
+ clacpy_("L", n, n, &a[a_offset], lda, &vr[vr_offset], ldvr)
+ ;
+
+/*
+ Generate unitary matrix in VR
+ (CWorkspace: need 2*N-1, prefer N+(N-1)*NB)
+ (RWorkspace: none)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ cunghr_(n, &ilo, &ihi, &vr[vr_offset], ldvr, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VR
+ (CWorkspace: need 1, prefer HSWORK (see comments) )
+ (RWorkspace: none)
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ chseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[iwrk], &i__1, info);
+
+ } else {
+
+/*
+ Compute eigenvalues only
+ (CWorkspace: need 1, prefer HSWORK (see comments) )
+ (RWorkspace: none)
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ chseqr_("E", "N", n, &ilo, &ihi, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[iwrk], &i__1, info);
+ }
+
+/* If INFO > 0 from CHSEQR, then quit */
+
+ if (*info > 0) {
+ goto L50;
+ }
+
+ if (wantvl || wantvr) {
+
+/*
+ Compute left and/or right eigenvectors
+ (CWorkspace: need 2*N)
+ (RWorkspace: need 2*N)
+*/
+
+ irwork = ibal + *n;
+ ctrevc_(side, "B", select, n, &a[a_offset], lda, &vl[vl_offset], ldvl,
+ &vr[vr_offset], ldvr, n, &nout, &work[iwrk], &rwork[irwork],
+ &ierr);
+ }
+
+ if (wantvl) {
+
+/*
+ Undo balancing of left eigenvectors
+ (CWorkspace: none)
+ (RWorkspace: need N)
+*/
+
+ cgebak_("B", "L", n, &ilo, &ihi, &rwork[ibal], n, &vl[vl_offset],
+ ldvl, &ierr);
+
+/* Normalize left eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scl = 1.f / scnrm2_(n, &vl[i__ * vl_dim1 + 1], &c__1);
+ csscal_(n, &scl, &vl[i__ * vl_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + i__ * vl_dim1;
+/* Computing 2nd power */
+ r__1 = vl[i__3].r;
+/* Computing 2nd power */
+ r__2 = r_imag(&vl[k + i__ * vl_dim1]);
+ rwork[irwork + k - 1] = r__1 * r__1 + r__2 * r__2;
+/* L10: */
+ }
+ k = isamax_(n, &rwork[irwork], &c__1);
+ r_cnjg(&q__2, &vl[k + i__ * vl_dim1]);
+ r__1 = sqrt(rwork[irwork + k - 1]);
+ q__1.r = q__2.r / r__1, q__1.i = q__2.i / r__1;
+ tmp.r = q__1.r, tmp.i = q__1.i;
+ cscal_(n, &tmp, &vl[i__ * vl_dim1 + 1], &c__1);
+ i__2 = k + i__ * vl_dim1;
+ i__3 = k + i__ * vl_dim1;
+ r__1 = vl[i__3].r;
+ q__1.r = r__1, q__1.i = 0.f;
+ vl[i__2].r = q__1.r, vl[i__2].i = q__1.i;
+/* L20: */
+ }
+ }
+
+ if (wantvr) {
+
+/*
+ Undo balancing of right eigenvectors
+ (CWorkspace: none)
+ (RWorkspace: need N)
+*/
+
+ cgebak_("B", "R", n, &ilo, &ihi, &rwork[ibal], n, &vr[vr_offset],
+ ldvr, &ierr);
+
+/* Normalize right eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scl = 1.f / scnrm2_(n, &vr[i__ * vr_dim1 + 1], &c__1);
+ csscal_(n, &scl, &vr[i__ * vr_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + i__ * vr_dim1;
+/* Computing 2nd power */
+ r__1 = vr[i__3].r;
+/* Computing 2nd power */
+ r__2 = r_imag(&vr[k + i__ * vr_dim1]);
+ rwork[irwork + k - 1] = r__1 * r__1 + r__2 * r__2;
+/* L30: */
+ }
+ k = isamax_(n, &rwork[irwork], &c__1);
+ r_cnjg(&q__2, &vr[k + i__ * vr_dim1]);
+ r__1 = sqrt(rwork[irwork + k - 1]);
+ q__1.r = q__2.r / r__1, q__1.i = q__2.i / r__1;
+ tmp.r = q__1.r, tmp.i = q__1.i;
+ cscal_(n, &tmp, &vr[i__ * vr_dim1 + 1], &c__1);
+ i__2 = k + i__ * vr_dim1;
+ i__3 = k + i__ * vr_dim1;
+ r__1 = vr[i__3].r;
+ q__1.r = r__1, q__1.i = 0.f;
+ vr[i__2].r = q__1.r, vr[i__2].i = q__1.i;
+/* L40: */
+ }
+ }
+
+/* Undo scaling if necessary */
+
+L50:
+ if (scalea) {
+ i__1 = *n - *info;
+/* Computing MAX */
+ i__3 = *n - *info;
+ i__2 = max(i__3,1);
+ clascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &w[*info + 1]
+ , &i__2, &ierr);
+ if (*info > 0) {
+ i__1 = ilo - 1;
+ clascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &w[1], n,
+ &ierr);
+ }
+ }
+
+ work[1].r = (real) maxwrk, work[1].i = 0.f;
+ return 0;
+
+/* End of CGEEV */
+
+} /* cgeev_ */
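+
+/*
+   A hedged sketch (not part of the LAPACK sources above) of how the CGEEV
+   driver is typically called for eigenvalues only (JOBVL = JOBVR = 'N'),
+   including the LWORK = -1 workspace query described in its argument
+   comments.  The f2c typedefs are assumed to be in scope, A is n-by-n
+   column-major with leading dimension lda, w has length n, and the helper
+   name is illustrative.
+*/
+#include <stdlib.h>
+
+static int eigenvalues_sketch(integer n, complex *a, integer lda, complex *w)
+{
+    integer info, lwork = -1, ldv = 1;
+    complex query, vdummy;               /* VL/VR are not referenced for 'N' */
+
+    if (n <= 0) return 0;
+    real *rwork = (real *) malloc((size_t) (2 * n) * sizeof(real));
+    if (rwork == NULL) return -1;
+
+    /* Workspace query: the optimal LWORK is returned in query.r */
+    cgeev_("N", "N", &n, a, &lda, w, &vdummy, &ldv, &vdummy, &ldv,
+           &query, &lwork, rwork, &info);
+    lwork = (integer) query.r;
+    complex *work = (complex *) malloc((size_t) lwork * sizeof(complex));
+    if (work == NULL) { free(rwork); return -1; }
+
+    /* Actual computation: the eigenvalues land in w[0..n-1] */
+    cgeev_("N", "N", &n, a, &lda, w, &vdummy, &ldv, &vdummy, &ldv,
+           work, &lwork, rwork, &info);
+
+    free(work);
+    free(rwork);
+    return (int) info;
+}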
+
+/* Subroutine */ int cgehd2_(integer *n, integer *ilo, integer *ihi, complex *
+ a, integer *lda, complex *tau, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__;
+ static complex alpha;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *),
+ clarfg_(integer *, complex *, complex *, integer *, complex *),
+ xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGEHD2 reduces a complex general matrix A to upper Hessenberg form H
+ by a unitary similarity transformation: Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to CGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= max(1,N).
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the n by n general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the unitary matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) COMPLEX array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) COMPLEX array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEHD2", &i__1);
+ return 0;
+ }
+
+ i__1 = *ihi - 1;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+
+/* Compute elementary reflector H(i) to annihilate A(i+2:ihi,i) */
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *ihi - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*n) + i__ * a_dim1], &c__1, &tau[
+ i__]);
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Apply H(i) to A(1:ihi,i+1:ihi) from the right */
+
+ i__2 = *ihi - i__;
+ clarf_("Right", ihi, &i__2, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &a[(i__ + 1) * a_dim1 + 1], lda, &work[1]);
+
+/* Apply H(i)' to A(i+1:ihi,i+1:n) from the left */
+
+ i__2 = *ihi - i__;
+ i__3 = *n - i__;
+ r_cnjg(&q__1, &tau[i__]);
+ clarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &c__1, &q__1,
+ &a[i__ + 1 + (i__ + 1) * a_dim1], lda, &work[1]);
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+/* L10: */
+ }
+
+ return 0;
+
+/* End of CGEHD2 */
+
+} /* cgehd2_ */
+
+/* Subroutine */ int cgehrd_(integer *n, integer *ilo, integer *ihi, complex *
+ a, integer *lda, complex *tau, complex *work, integer *lwork, integer
+ *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j;
+ static complex t[4160] /* was [65][64] */;
+ static integer ib;
+ static complex ei;
+ static integer nb, nh, nx, iws;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *), caxpy_(integer *,
+ complex *, complex *, integer *, complex *, integer *), cgehd2_(
+ integer *, integer *, integer *, complex *, integer *, complex *,
+ complex *, integer *), clahr2_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, complex *,
+ integer *), clarfb_(char *, char *, char *, char *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ complex *, integer *, complex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+ CGEHRD reduces a complex general matrix A to upper Hessenberg form H by
+    a unitary similarity transformation:  Q' * A * Q = H.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to CGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the N-by-N general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the unitary matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) COMPLEX array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
+ zero.
+
+ WORK (workspace/output) COMPLEX array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This file is a slight modification of LAPACK-3.0's DGEHRD
+ subroutine incorporating improvements proposed by Quintana-Orti and
+ Van de Geijn (2006). (See DLAHR2.)
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MIN */
+ i__1 = 64, i__2 = ilaenv_(&c__1, "CGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ lwkopt = *n * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEHRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
+
+ i__1 = *ilo - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+/* L10: */
+ }
+ i__1 = *n - 1;
+ for (i__ = max(1,*ihi); i__ <= i__1; ++i__) {
+ i__2 = i__;
+ tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+/* L20: */
+ }
+
+/* Quick return if possible */
+
+ nh = *ihi - *ilo + 1;
+ if (nh <= 1) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+/*
+ Determine the block size
+
+ Computing MIN
+*/
+ i__1 = 64, i__2 = ilaenv_(&c__1, "CGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ nbmin = 2;
+ iws = 1;
+ if (nb > 1 && nb < nh) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code)
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "CGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < nh) {
+
+/* Determine if workspace is large enough for blocked code */
+
+ iws = *n * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code
+
+ Computing MAX
+*/
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CGEHRD", " ", n, ilo, ihi, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ if (*lwork >= *n * nbmin) {
+ nb = *lwork / *n;
+ } else {
+ nb = 1;
+ }
+ }
+ }
+ }
+ ldwork = *n;
+
+ if (nb < nbmin || nb >= nh) {
+
+/* Use unblocked code below */
+
+ i__ = *ilo;
+
+ } else {
+
+/* Use blocked code */
+
+ i__1 = *ihi - 1 - nx;
+ i__2 = nb;
+ for (i__ = *ilo; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *ihi - i__;
+ ib = min(i__3,i__4);
+
+/*
+ Reduce columns i:i+ib-1 to Hessenberg form, returning the
+ matrices V and T of the block reflector H = I - V*T*V'
+ which performs the reduction, and also the matrix Y = A*V*T
+*/
+
+ clahr2_(ihi, &i__, &ib, &a[i__ * a_dim1 + 1], lda, &tau[i__], t, &
+ c__65, &work[1], &ldwork);
+
+/*
+ Apply the block reflector H to A(1:ihi,i+ib:ihi) from the
+ right, computing A := A - Y * V'. V(i+ib,ib-1) must be set
+ to 1
+*/
+
+ i__3 = i__ + ib + (i__ + ib - 1) * a_dim1;
+ ei.r = a[i__3].r, ei.i = a[i__3].i;
+ i__3 = i__ + ib + (i__ + ib - 1) * a_dim1;
+ a[i__3].r = 1.f, a[i__3].i = 0.f;
+ i__3 = *ihi - i__ - ib + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", ihi, &i__3, &ib, &
+ q__1, &work[1], &ldwork, &a[i__ + ib + i__ * a_dim1], lda,
+ &c_b57, &a[(i__ + ib) * a_dim1 + 1], lda);
+ i__3 = i__ + ib + (i__ + ib - 1) * a_dim1;
+ a[i__3].r = ei.r, a[i__3].i = ei.i;
+
+/*
+ Apply the block reflector H to A(1:i,i+1:i+ib-1) from the
+ right
+*/
+
+ i__3 = ib - 1;
+ ctrmm_("Right", "Lower", "Conjugate transpose", "Unit", &i__, &
+ i__3, &c_b57, &a[i__ + 1 + i__ * a_dim1], lda, &work[1], &
+ ldwork);
+ i__3 = ib - 2;
+ for (j = 0; j <= i__3; ++j) {
+ q__1.r = -1.f, q__1.i = -0.f;
+ caxpy_(&i__, &q__1, &work[ldwork * j + 1], &c__1, &a[(i__ + j
+ + 1) * a_dim1 + 1], &c__1);
+/* L30: */
+ }
+
+/*
+ Apply the block reflector H to A(i+1:ihi,i+ib:n) from the
+ left
+*/
+
+ i__3 = *ihi - i__;
+ i__4 = *n - i__ - ib + 1;
+ clarfb_("Left", "Conjugate transpose", "Forward", "Columnwise", &
+ i__3, &i__4, &ib, &a[i__ + 1 + i__ * a_dim1], lda, t, &
+ c__65, &a[i__ + 1 + (i__ + ib) * a_dim1], lda, &work[1], &
+ ldwork);
+/* L40: */
+ }
+ }
+
+/* Use unblocked code to reduce the rest of the matrix */
+
+ cgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo);
+ work[1].r = (real) iws, work[1].i = 0.f;
+
+ return 0;
+
+/* End of CGEHRD */
+
+} /* cgehrd_ */
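+
+/*
+   A hedged sketch (not part of the LAPACK sources above) of the CGEBAL ->
+   CGEHRD chaining described in the ILO/IHI argument comments: balance A
+   first, then reduce the active submatrix A(ILO:IHI,ILO:IHI) to upper
+   Hessenberg form.  The f2c typedefs are assumed to be in scope, tau has
+   length n-1, scale has length n, and the workspace is simply sized 64*n
+   here instead of being queried; the helper name is illustrative.
+*/
+#include <stdlib.h>
+
+static int hessenberg_sketch(integer n, complex *a, integer lda,
+                             complex *tau, real *scale)
+{
+    integer ilo, ihi, info, lwork = (n > 1 ? n : 1) * 64;
+    complex *work = (complex *) malloc((size_t) lwork * sizeof(complex));
+    if (work == NULL) return -1;
+
+    /* Balance: permute and scale A, returning the active range [ilo, ihi] */
+    cgebal_("B", &n, a, &lda, &ilo, &ihi, scale, &info);
+    if (info == 0) {
+        /* Reduce rows/columns ilo..ihi of the balanced A to Hessenberg form */
+        cgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+    }
+    free(work);
+    return (int) info;
+}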
+
+/* Subroutine */ int cgelq2_(integer *m, integer *n, complex *a, integer *lda,
+ complex *tau, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k;
+ static complex alpha;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *),
+ clarfg_(integer *, complex *, complex *, integer *, complex *),
+ clacgv_(integer *, complex *, integer *), xerbla_(char *, integer
+ *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ CGELQ2 computes an LQ factorization of a complex m by n matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m by min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) COMPLEX array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)', where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
+ A(i,i+1:n), and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGELQ2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ clarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &tau[i__]
+ );
+ if (i__ < *m) {
+
+/* Apply H(i) to A(i+1:m,i:n) from the right */
+
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ clarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[
+ i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+/* L10: */
+ }
+ return 0;
+
+/* End of CGELQ2 */
+
+} /* cgelq2_ */
+
+/* Subroutine */ int cgelqf_(integer *m, integer *n, complex *a, integer *lda,
+ complex *tau, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int cgelq2_(integer *, integer *, complex *,
+ integer *, complex *, complex *, integer *), clarfb_(char *, char
+ *, char *, char *, integer *, integer *, integer *, complex *,
+ integer *, complex *, integer *, complex *, integer *, complex *,
+ integer *), clarft_(char *, char *
+ , integer *, integer *, complex *, integer *, complex *, complex *
+ , integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGELQF computes an LQ factorization of a complex M-by-N matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m-by-min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)', where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
+ A(i,i+1:n), and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "CGELQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *m * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGELQF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "CGELQF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CGELQF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the LQ factorization of the current block
+ A(i:i+ib-1,i:n)
+*/
+
+ i__3 = *n - i__ + 1;
+ cgelq2_(&ib, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *n - i__ + 1;
+ clarft_("Forward", "Rowwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i+ib:m,i:n) from the right */
+
+ i__3 = *m - i__ - ib + 1;
+ i__4 = *n - i__ + 1;
+ clarfb_("Right", "No transpose", "Forward", "Rowwise", &i__3,
+ &i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib +
+ 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ cgelq2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1].r = (real) iws, work[1].i = 0.f;
+ return 0;
+
+/* End of CGELQF */
+
+} /* cgelqf_ */
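+
+/*
+   A hedged sketch (not part of the LAPACK sources above): after cgelqf_ the
+   factor L sits in the lower trapezoid of A, as documented for CGELQF.  This
+   helper copies the m-by-min(m,n) factor L out into a separate column-major
+   array, zeroing the strictly upper triangle.  The f2c typedefs are assumed
+   to be in scope and the helper name is illustrative.
+*/
+static void extract_l_sketch(integer m, integer n, const complex *a,
+                             integer lda, complex *l /* m x min(m,n) */)
+{
+    integer k = m < n ? m : n;
+    for (integer j = 0; j < k; ++j) {
+        for (integer i = 0; i < m; ++i) {
+            if (i >= j) {                        /* on or below the diagonal */
+                l[i + j * m] = a[i + j * lda];
+            } else {
+                l[i + j * m].r = 0.f;
+                l[i + j * m].i = 0.f;
+            }
+        }
+    }
+}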
+
+/* Subroutine */ int cgelsd_(integer *m, integer *n, integer *nrhs, complex *
+ a, integer *lda, complex *b, integer *ldb, real *s, real *rcond,
+ integer *rank, complex *work, integer *lwork, real *rwork, integer *
+ iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer ie, il, mm;
+ static real eps, anrm, bnrm;
+ static integer itau, nlvl, iascl, ibscl;
+ static real sfmin;
+ static integer minmn, maxmn, itaup, itauq, mnthr, nwork;
+ extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *,
+ integer *, real *, real *, complex *, complex *, complex *,
+ integer *, integer *), slabad_(real *, real *);
+ extern doublereal clange_(char *, integer *, integer *, complex *,
+ integer *, real *);
+ extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *,
+ integer *, complex *, complex *, integer *, integer *), clalsd_(
+ char *, integer *, integer *, integer *, real *, real *, complex *
+ , integer *, real *, integer *, complex *, real *, integer *,
+ integer *), clascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *,
+ complex *, complex *, integer *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex
+ *, integer *, complex *, integer *), claset_(char *,
+ integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static real bignum;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), cunmbr_(char *, char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *), slaset_(
+ char *, integer *, integer *, real *, real *, real *, integer *), cunmlq_(char *, char *, integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, complex *,
+ integer *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *);
+ static integer liwork, minwrk, maxwrk;
+ static real smlnum;
+ static integer lrwork;
+ static logical lquery;
+ static integer nrwork, smlsiz;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+    CGELSD computes the minimum-norm solution to a complex linear least
+ squares problem:
+ minimize 2-norm(| b - A*x |)
+ using the singular value decomposition (SVD) of A. A is an M-by-N
+ matrix which may be rank-deficient.
+
+ Several right hand side vectors b and solution vectors x can be
+ handled in a single call; they are stored as the columns of the
+ M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+ matrix X.
+
+ The problem is solved in three steps:
+ (1) Reduce the coefficient matrix A to bidiagonal form with
+        Householder transformations, reducing the original problem
+ into a "bidiagonal least squares problem" (BLS)
+ (2) Solve the BLS using a divide and conquer approach.
+    (3) Apply back all the Householder transformations to solve
+ the original least squares problem.
+
+ The effective rank of A is determined by treating as zero those
+ singular values which are less than RCOND times the largest singular
+ value.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrices B and X. NRHS >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, A has been destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (input/output) COMPLEX array, dimension (LDB,NRHS)
+ On entry, the M-by-NRHS right hand side matrix B.
+ On exit, B is overwritten by the N-by-NRHS solution matrix X.
+ If m >= n and RANK = n, the residual sum-of-squares for
+ the solution in the i-th column is given by the sum of
+ squares of the modulus of elements n+1:m in that column.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M,N).
+
+ S (output) REAL array, dimension (min(M,N))
+ The singular values of A in decreasing order.
+ The condition number of A in the 2-norm = S(1)/S(min(m,n)).
+
+ RCOND (input) REAL
+ RCOND is used to determine the effective rank of A.
+ Singular values S(i) <= RCOND*S(1) are treated as zero.
+ If RCOND < 0, machine precision is used instead.
+
+ RANK (output) INTEGER
+ The effective rank of A, i.e., the number of singular values
+ which are greater than RCOND*S(1).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK must be at least 1.
+ The exact minimum amount of workspace needed depends on M,
+ N and NRHS. As long as LWORK is at least
+ 2 * N + N * NRHS
+ if M is greater than or equal to N or
+ 2 * M + M * NRHS
+ if M is less than N, the code will execute correctly.
+ For good performance, LWORK should generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the array WORK and the
+ minimum sizes of the arrays RWORK and IWORK, and returns
+ these values as the first entries of the WORK, RWORK and
+ IWORK arrays, and no error message related to LWORK is issued
+ by XERBLA.
+
+ RWORK (workspace) REAL array, dimension (MAX(1,LRWORK))
+ LRWORK >=
+ 10*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS +
+ MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS )
+ if M is greater than or equal to N or
+ 10*M + 2*M*SMLSIZ + 8*M*NLVL + 3*SMLSIZ*NRHS +
+ MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS )
+ if M is less than N, the code will execute correctly.
+ SMLSIZ is returned by ILAENV and is equal to the maximum
+ size of the subproblems at the bottom of the computation
+ tree (usually about 25), and
+ NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+ On exit, if INFO = 0, RWORK(1) returns the minimum LRWORK.
+
+ IWORK (workspace) INTEGER array, dimension (MAX(1,LIWORK))
+ LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN),
+ where MINMN = MIN( M,N ).
+ On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: the algorithm for computing the SVD failed to converge;
+ if INFO = i, i off-diagonal elements of an intermediate
+ bidiagonal form did not converge to zero.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input arguments.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --s;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ maxmn = max(*m,*n);
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldb < max(1,maxmn)) {
+ *info = -7;
+ }
+
+/*
+ Compute workspace.
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ liwork = 1;
+ lrwork = 1;
+ if (minmn > 0) {
+ smlsiz = ilaenv_(&c__9, "CGELSD", " ", &c__0, &c__0, &c__0, &c__0,
+ (ftnlen)6, (ftnlen)1);
+ mnthr = ilaenv_(&c__6, "CGELSD", " ", m, n, nrhs, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = (integer) (log((real) minmn / (real) (smlsiz + 1)) / log(
+ 2.f)) + 1;
+ nlvl = max(i__1,0);
+ liwork = minmn * 3 * nlvl + minmn * 11;
+ mm = *m;
+ if (*m >= *n && *m >= mnthr) {
+
+/*
+ Path 1a - overdetermined, with many more rows than
+ columns.
+*/
+
+ mm = *n;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * ilaenv_(&c__1, "CGEQRF", " ", m, n,
+ &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *nrhs * ilaenv_(&c__1, "CUNMQR", "LC",
+ m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+ }
+ if (*m >= *n) {
+
+/*
+ Path 1 - overdetermined or exactly determined.
+
+ Computing MAX
+ Computing 2nd power
+*/
+ i__3 = smlsiz + 1;
+ i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1);
+ lrwork = *n * 10 + (*n << 1) * smlsiz + (*n << 3) * nlvl +
+ smlsiz * 3 * *nrhs + max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (mm + *n) * ilaenv_(&c__1,
+ "CGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *nrhs * ilaenv_(&c__1,
+ "CUNMBR", "QLC", &mm, nrhs, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+ "CUNMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * *nrhs;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = (*n << 1) + mm, i__2 = (*n << 1) + *n * *nrhs;
+ minwrk = max(i__1,i__2);
+ }
+ if (*n > *m) {
+/*
+ Computing MAX
+ Computing 2nd power
+*/
+ i__3 = smlsiz + 1;
+ i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1);
+ lrwork = *m * 10 + (*m << 1) * smlsiz + (*m << 3) * nlvl +
+ smlsiz * 3 * *nrhs + max(i__1,i__2);
+ if (*n >= mnthr) {
+
+/*
+ Path 2a - underdetermined, with many more columns
+ than rows.
+*/
+
+ maxwrk = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) *
+ ilaenv_(&c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs *
+ ilaenv_(&c__1, "CUNMBR", "QLC", m, nrhs, m, &c_n1,
+ (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) *
+ ilaenv_(&c__1, "CUNMLQ", "LC", n, nrhs, m, &c_n1,
+ (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+ if (*nrhs > 1) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+ } else {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 1);
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+/*
+ XXX: Ensure the Path 2a case below is triggered. The workspace
+ calculation should use queries for all routines eventually.
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4),
+ i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4)
+ ;
+ maxwrk = max(i__1,i__2);
+ } else {
+
+/* Path 2 - underdetermined. */
+
+ maxwrk = (*m << 1) + (*n + *m) * ilaenv_(&c__1, "CGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *nrhs * ilaenv_(&c__1,
+ "CUNMBR", "QLC", m, nrhs, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = (*m << 1) + *n, i__2 = (*m << 1) + *m * *nrhs;
+ minwrk = max(i__1,i__2);
+ }
+ }
+ minwrk = min(minwrk,maxwrk);
+ work[1].r = (real) maxwrk, work[1].i = 0.f;
+ iwork[1] = liwork;
+ rwork[1] = (real) lrwork;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGELSD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ *rank = 0;
+ return 0;
+ }
+
+/* Get machine parameters. */
+
+ eps = slamch_("P");
+ sfmin = slamch_("S");
+ smlnum = sfmin / eps;
+ bignum = 1.f / smlnum;
+ slabad_(&smlnum, &bignum);
+
+/* Scale A if max entry outside range [SMLNUM,BIGNUM]. */
+
+ anrm = clange_("M", m, n, &a[a_offset], lda, &rwork[1]);
+ iascl = 0;
+ if (anrm > 0.f && anrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM */
+
+ clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 1;
+ } else if (anrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 2;
+ } else if (anrm == 0.f) {
+
+/* Matrix all zero. Return zero solution. */
+
+ i__1 = max(*m,*n);
+ claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+ slaset_("F", &minmn, &c__1, &c_b328, &c_b328, &s[1], &c__1)
+ ;
+ *rank = 0;
+ goto L10;
+ }
+
+/* Scale B if max entry outside range [SMLNUM,BIGNUM]. */
+
+ bnrm = clange_("M", m, nrhs, &b[b_offset], ldb, &rwork[1]);
+ ibscl = 0;
+ if (bnrm > 0.f && bnrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM. */
+
+ clascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 1;
+ } else if (bnrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ clascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 2;
+ }
+
+/* If M < N make sure B(M+1:N,:) = 0 */
+
+ if (*m < *n) {
+ i__1 = *n - *m;
+ claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1], ldb);
+ }
+
+/* Overdetermined case. */
+
+ if (*m >= *n) {
+
+/* Path 1 - overdetermined or exactly determined. */
+
+ mm = *m;
+ if (*m >= mnthr) {
+
+/* Path 1a - overdetermined, with many more rows than columns */
+
+ mm = *n;
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R.
+ (RWorkspace: need N)
+ (CWorkspace: need N, prefer N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+
+/*
+ Multiply B by transpose(Q).
+ (RWorkspace: need N)
+ (CWorkspace: need NRHS, prefer NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunmqr_("L", "C", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below R. */
+
+ if (*n > 1) {
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ claset_("L", &i__1, &i__2, &c_b56, &c_b56, &a[a_dim1 + 2],
+ lda);
+ }
+ }
+
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+ ie = 1;
+ nrwork = ie + *n;
+
+/*
+ Bidiagonalize R in A.
+ (RWorkspace: need N)
+ (CWorkspace: need 2*N+MM, prefer 2*N+(MM+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(&mm, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of R.
+ (CWorkspace: need 2*N+NRHS, prefer 2*N+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "C", &mm, nrhs, n, &a[a_offset], lda, &work[itauq],
+ &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ clalsd_("U", &smlsiz, n, nrhs, &s[1], &rwork[ie], &b[b_offset], ldb,
+ rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of R. */
+
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], &
+ b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max(
+ i__1,*nrhs), i__2 = *n - *m * 3;
+ if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,i__2)) {
+
+/*
+ Path 2a - underdetermined, with many more columns than rows
+ and sufficient workspace for an efficient algorithm.
+*/
+
+ ldwork = *m;
+/*
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+ max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda +
+ *m + *m * *nrhs;
+ if (*lwork >= max(i__1,i__2)) {
+ ldwork = *lda;
+ }
+ itau = 1;
+ nwork = *m + 1;
+
+/*
+ Compute A=L*Q.
+ (CWorkspace: need 2*M, prefer M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+ il = nwork;
+
+/* Copy L to WORK(IL), zeroing out above its diagonal. */
+
+ clacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ claset_("U", &i__1, &i__2, &c_b56, &c_b56, &work[il + ldwork], &
+ ldwork);
+ itauq = il + ldwork * *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+ ie = 1;
+ nrwork = ie + *m;
+
+/*
+ Bidiagonalize L in WORK(IL).
+ (RWorkspace: need M)
+ (CWorkspace: need M*M+4*M, prefer M*M+4*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(m, m, &work[il], &ldwork, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of L.
+ (CWorkspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "C", m, nrhs, m, &work[il], &ldwork, &work[
+ itauq], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ clalsd_("U", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1],
+ info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of L. */
+
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[
+ itaup], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below first M rows of B. */
+
+ i__1 = *n - *m;
+ claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1],
+ ldb);
+ nwork = itau + *m;
+
+/*
+ Multiply transpose(Q) by B.
+ (CWorkspace: need NRHS, prefer NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunmlq_("L", "C", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else {
+
+/* Path 2 - remaining underdetermined cases. */
+
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+ ie = 1;
+ nrwork = ie + *m;
+
+/*
+ Bidiagonalize A.
+ (RWorkspace: need M)
+ (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors.
+ (CWorkspace: need 2*M+NRHS, prefer 2*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "C", m, nrhs, n, &a[a_offset], lda, &work[itauq]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ clalsd_("L", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1],
+ info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of A. */
+
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ }
+ }
+
+/* Undo scaling. */
+
+ if (iascl == 1) {
+ clascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb,
+ info);
+ slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ } else if (iascl == 2) {
+ clascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb,
+ info);
+ slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ }
+ if (ibscl == 1) {
+ clascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ } else if (ibscl == 2) {
+ clascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ }
+
+L10:
+ work[1].r = (real) maxwrk, work[1].i = 0.f;
+ iwork[1] = liwork;
+ rwork[1] = (real) lrwork;
+ return 0;
+
+/* End of CGELSD */
+
+} /* cgelsd_ */
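+
+/*
+    Illustrative sketch, not part of the f2c-generated LAPACK sources: one
+    way a caller might issue the LWORK = -1 workspace query described in
+    the cgelsd_ header comment above.  The f2c typedefs (integer, real,
+    complex) from the f2c.h header included at the top of this translation
+    unit are assumed; the helper name is hypothetical.
+*/
+
+static int example_cgelsd_workspace_query(void)
+{
+    integer m = 3, n = 2, nrhs = 1, lda = 3, ldb = 3;
+    integer rank, info, iwkopt;
+    integer lwork = -1;      /* -1 requests the workspace query only */
+    real rcond = -1.f;       /* negative RCOND selects machine precision */
+    real rwkopt;
+    complex wkopt;
+
+    /* A, B and S are not referenced during the query, so length-1
+       placeholders suffice here. */
+    complex a_dummy[1], b_dummy[1];
+    real s_dummy[1];
+
+    cgelsd_(&m, &n, &nrhs, a_dummy, &lda, b_dummy, &ldb, s_dummy, &rcond,
+            &rank, &wkopt, &lwork, &rwkopt, &iwkopt, &info);
+
+    /* wkopt.r, rwkopt and iwkopt now hold the WORK, RWORK and IWORK sizes
+       to allocate before repeating the call with the actual data. */
+    return (int) info;
+}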
+
+/* Subroutine */ int cgeqr2_(integer *m, integer *n, complex *a, integer *lda,
+ complex *tau, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, k;
+ static complex alpha;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *),
+ clarfg_(integer *, complex *, complex *, integer *, complex *),
+ xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ CGEQR2 computes a QR factorization of a complex m by n matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(m,n) by n upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) COMPLEX array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEQR2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ clarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
+ , &c__1, &tau[i__]);
+ if (i__ < *n) {
+
+/* Apply H(i)' to A(i:m,i+1:n) from the left */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ r_cnjg(&q__1, &tau[i__]);
+ clarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &q__1,
+ &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of CGEQR2 */
+
+} /* cgeqr2_ */
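+
+/*
+    Illustrative sketch, not part of the f2c-generated LAPACK sources: a
+    minimal call to the unblocked routine above.  On return, R sits in the
+    upper triangle of a, while the entries below the diagonal together with
+    tau encode the elementary reflectors H(i) = I - tau*v*v' described in
+    the Further Details comment.  The f2c typedefs (integer, real, complex)
+    from f2c.h are assumed; the helper name is illustrative only.
+*/
+
+static int example_cgeqr2_small(void)
+{
+    complex a[6];            /* 3-by-2 matrix, column-major */
+    complex tau[2];          /* min(M,N) reflector scalars */
+    complex work[2];         /* WORK needs N entries */
+    integer m = 3, n = 2, lda = 3, info;
+    int j;
+
+    for (j = 0; j < 6; ++j) {
+        a[j].r = (real) (j + 1);
+        a[j].i = 0.f;
+    }
+    cgeqr2_(&m, &n, a, &lda, tau, work, &info);
+    return (int) info;       /* 0 on success */
+}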
+
+/* Subroutine */ int cgeqrf_(integer *m, integer *n, complex *a, integer *lda,
+ complex *tau, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *,
+ integer *, complex *, complex *, integer *), clarfb_(char *, char
+ *, char *, char *, integer *, integer *, integer *, complex *,
+ integer *, complex *, integer *, complex *, integer *, complex *,
+ integer *), clarft_(char *, char *
+ , integer *, integer *, complex *, integer *, complex *, complex *
+ , integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGEQRF computes a QR factorization of a complex M-by-N matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of min(m,n) elementary reflectors (see Further
+ Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "CGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *n * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGEQRF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQRF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQRF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the QR factorization of the current block
+ A(i:m,i:i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ cgeqr2_(&i__3, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ clarft_("Forward", "Columnwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i:m,i+ib:n) from the left */
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ - ib + 1;
+ clarfb_("Left", "Conjugate transpose", "Forward", "Columnwise"
+ , &i__3, &i__4, &ib, &a[i__ + i__ * a_dim1], lda, &
+ work[1], &ldwork, &a[i__ + (i__ + ib) * a_dim1], lda,
+ &work[ib + 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ cgeqr2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1].r = (real) iws, work[1].i = 0.f;
+ return 0;
+
+/* End of CGEQRF */
+
+} /* cgeqrf_ */
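+
+/*
+    Illustrative sketch, not part of the f2c-generated LAPACK sources: the
+    LWORK = -1 query-then-call pattern documented in the cgeqrf_ header
+    comment above.  The f2c typedefs (integer, real, complex) from f2c.h
+    are assumed; the helper name and the fixed workspace cap are
+    illustrative choices only.
+*/
+
+static int example_cgeqrf_query_then_factor(void)
+{
+    complex a[12], tau[3], work[256], wkopt;
+    integer m = 4, n = 3, lda = 4, lwork, info;
+    int j;
+
+    for (j = 0; j < 12; ++j) {
+        a[j].r = (real) (j + 1);
+        a[j].i = (real) (j % 3);
+    }
+
+    lwork = -1;                          /* workspace query only */
+    cgeqrf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);
+
+    lwork = (integer) wkopt.r;           /* optimal size, typically N*NB */
+    if (lwork > 256) {
+        lwork = 256;                     /* stay within the fixed buffer;
+                                            cgeqrf_ falls back to a smaller
+                                            block size if needed */
+    }
+    cgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+    return (int) info;                   /* 0 on success */
+}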
+
+/* Subroutine */ int cgesdd_(char *jobz, integer *m, integer *n, complex *a,
+ integer *lda, real *s, complex *u, integer *ldu, complex *vt, integer
+ *ldvt, complex *work, integer *lwork, real *rwork, integer *iwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ie, il, ir, iu, blk;
+ static real dum[1], eps;
+ static integer iru, ivt, iscl;
+ static real anrm;
+ static integer idum[1], ierr, itau, irvt;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ static integer chunk, minmn, wrkbl, itaup, itauq;
+ static logical wntqa;
+ static integer nwork;
+ extern /* Subroutine */ int clacp2_(char *, integer *, integer *, real *,
+ integer *, complex *, integer *);
+ static logical wntqn, wntqo, wntqs;
+ static integer mnthr1, mnthr2;
+ extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *,
+ integer *, real *, real *, complex *, complex *, complex *,
+ integer *, integer *);
+ extern doublereal clange_(char *, integer *, integer *, complex *,
+ integer *, real *);
+ extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *,
+ integer *, complex *, complex *, integer *, integer *), clacrm_(
+ integer *, integer *, complex *, integer *, real *, integer *,
+ complex *, integer *, real *), clarcm_(integer *, integer *, real
+ *, integer *, complex *, integer *, complex *, integer *, real *),
+ clascl_(char *, integer *, integer *, real *, real *, integer *,
+ integer *, complex *, integer *, integer *), sbdsdc_(char
+ *, char *, integer *, real *, real *, real *, integer *, real *,
+ integer *, real *, integer *, real *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer
+ *, complex *, complex *, integer *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex
+ *, integer *, complex *, integer *), claset_(char *,
+ integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cungbr_(char *, integer *, integer *, integer
+ *, complex *, integer *, complex *, complex *, integer *, integer
+ *);
+ static real bignum;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), cunmbr_(char *, char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *), cunglq_(
+ integer *, integer *, integer *, complex *, integer *, complex *,
+ complex *, integer *, integer *);
+ static integer ldwrkl;
+ extern /* Subroutine */ int cungqr_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, integer *);
+ static integer ldwrkr, minwrk, ldwrku, maxwrk, ldwkvt;
+ static real smlnum;
+ static logical wntqas;
+ static integer nrwork;
+
+
+/*
+ -- LAPACK driver routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+ 8-15-00: Improve consistency of WS calculations (eca)
+
+
+ Purpose
+ =======
+
+ CGESDD computes the singular value decomposition (SVD) of a complex
+ M-by-N matrix A, optionally computing the left and/or right singular
+    vectors, by using a divide-and-conquer method. The SVD is written
+
+ A = U * SIGMA * conjugate-transpose(V)
+
+ where SIGMA is an M-by-N matrix which is zero except for its
+ min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+ V is an N-by-N unitary matrix. The diagonal elements of SIGMA
+ are the singular values of A; they are real and non-negative, and
+ are returned in descending order. The first min(m,n) columns of
+ U and V are the left and right singular vectors of A.
+
+ Note that the routine returns VT = V**H, not V.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ Specifies options for computing all or part of the matrix U:
+ = 'A': all M columns of U and all N rows of V**H are
+ returned in the arrays U and VT;
+ = 'S': the first min(M,N) columns of U and the first
+ min(M,N) rows of V**H are returned in the arrays U
+ and VT;
+ = 'O': If M >= N, the first N columns of U are overwritten
+ in the array A and all rows of V**H are returned in
+ the array VT;
+ otherwise, all columns of U are returned in the
+ array U and the first M rows of V**H are overwritten
+ in the array A;
+ = 'N': no columns of U or rows of V**H are computed.
+
+ M (input) INTEGER
+ The number of rows of the input matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the input matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit,
+ if JOBZ = 'O', A is overwritten with the first N columns
+ of U (the left singular vectors, stored
+ columnwise) if M >= N;
+ A is overwritten with the first M rows
+ of V**H (the right singular vectors, stored
+ rowwise) otherwise.
+ if JOBZ .ne. 'O', the contents of A are destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ S (output) REAL array, dimension (min(M,N))
+ The singular values of A, sorted so that S(i) >= S(i+1).
+
+ U (output) COMPLEX array, dimension (LDU,UCOL)
+ UCOL = M if JOBZ = 'A' or JOBZ = 'O' and M < N;
+ UCOL = min(M,N) if JOBZ = 'S'.
+ If JOBZ = 'A' or JOBZ = 'O' and M < N, U contains the M-by-M
+ unitary matrix U;
+ if JOBZ = 'S', U contains the first min(M,N) columns of U
+ (the left singular vectors, stored columnwise);
+ if JOBZ = 'O' and M >= N, or JOBZ = 'N', U is not referenced.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= 1; if
+ JOBZ = 'S' or 'A' or JOBZ = 'O' and M < N, LDU >= M.
+
+ VT (output) COMPLEX array, dimension (LDVT,N)
+ If JOBZ = 'A' or JOBZ = 'O' and M >= N, VT contains the
+ N-by-N unitary matrix V**H;
+ if JOBZ = 'S', VT contains the first min(M,N) rows of
+ V**H (the right singular vectors, stored rowwise);
+ if JOBZ = 'O' and M < N, or JOBZ = 'N', VT is not referenced.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= 1; if
+ JOBZ = 'A' or JOBZ = 'O' and M >= N, LDVT >= N;
+ if JOBZ = 'S', LDVT >= min(M,N).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ if JOBZ = 'N', LWORK >= 2*min(M,N)+max(M,N).
+ if JOBZ = 'O',
+ LWORK >= 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+ if JOBZ = 'S' or 'A',
+ LWORK >= min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+ For good performance, LWORK should generally be larger.
+
+ If LWORK = -1, a workspace query is assumed. The optimal
+ size for the WORK array is calculated and stored in WORK(1),
+ and no other work except argument checking is performed.
+
+ RWORK (workspace) REAL array, dimension (MAX(1,LRWORK))
+ If JOBZ = 'N', LRWORK >= 5*min(M,N).
+ Otherwise,
+ LRWORK >= min(M,N)*max(5*min(M,N)+7,2*max(M,N)+2*min(M,N)+1)
+
+ IWORK (workspace) INTEGER array, dimension (8*min(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The updating process of SBDSDC did not converge.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
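+
+/*
+    Illustrative sketch only, not part of the f2c-generated sources: the
+    LWORK = -1 query documented above is typically issued before the real
+    call, e.g. for singular values only (JOBZ = 'N'):
+
+        integer lwork = -1, info;
+        complex wkopt;
+        cgesdd_("N", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+                &wkopt, &lwork, rwork, iwork, &info);
+        lwork = (integer) wkopt.r;    followed by allocating WORK(lwork)
+                                      and repeating the call
+
+    The names m, n, a, lda, s, u, ldu, vt, ldvt, rwork and iwork above stand
+    for caller-supplied data, not identifiers defined in this routine.
+*/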
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --s;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ mnthr1 = (integer) (minmn * 17.f / 9.f);
+ mnthr2 = (integer) (minmn * 5.f / 3.f);
+ wntqa = lsame_(jobz, "A");
+ wntqs = lsame_(jobz, "S");
+ wntqas = wntqa || wntqs;
+ wntqo = lsame_(jobz, "O");
+ wntqn = lsame_(jobz, "N");
+ minwrk = 1;
+ maxwrk = 1;
+
+ if (! (wntqa || wntqs || wntqo || wntqn)) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldu < 1 || wntqas && *ldu < *m || wntqo && *m < *n && *ldu < *
+ m) {
+ *info = -8;
+ } else if (*ldvt < 1 || wntqa && *ldvt < *n || wntqs && *ldvt < minmn ||
+ wntqo && *m >= *n && *ldvt < *n) {
+ *info = -10;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ CWorkspace refers to complex workspace, and RWorkspace to
+ real workspace. NB refers to the optimal block size for the
+ immediately following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0 && *m > 0 && *n > 0) {
+ if (*m >= *n) {
+
+/*
+ There is no complex work space needed for bidiagonal SVD
+ The real work space needed for bidiagonal SVD is BDSPAC
+ for computing singular values and singular vectors; BDSPAN
+ for computing singular values only.
+ BDSPAC = 5*N*N + 7*N
+ BDSPAN = MAX(7*N+4, 3*N+2+SMLSIZ*(SMLSIZ+8))
+*/
+
+ if (*m >= mnthr1) {
+ if (wntqn) {
+
+/* Path 1 (M much larger than N, JOBZ='N') */
+
+ maxwrk = *n + *n * ilaenv_(&c__1, "CGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3;
+ } else if (wntqo) {
+
+/* Path 2 (M much larger than N, JOBZ='O') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "CGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "CUNGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "QLN", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *n + *n * *n + wrkbl;
+ minwrk = (*n << 1) * *n + *n * 3;
+ } else if (wntqs) {
+
+/* Path 3 (M much larger than N, JOBZ='S') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "CGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "CUNGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "QLN", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *n * *n + wrkbl;
+ minwrk = *n * *n + *n * 3;
+ } else if (wntqa) {
+
+/* Path 4 (M much larger than N, JOBZ='A') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "CGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *m * ilaenv_(&c__1, "CUNGQR",
+ " ", m, m, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "QLN", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *n * *n + wrkbl;
+ minwrk = *n * *n + (*n << 1) + *m;
+ }
+ } else if (*m >= mnthr2) {
+
+/* Path 5 (M much larger than N, but not as much as MNTHR1) */
+
+ maxwrk = (*n << 1) + (*m + *n) * ilaenv_(&c__1, "CGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*n << 1) + *m;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "P", n, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "Q", m, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *n * *n;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "P", n, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "Q", m, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "P", n, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ }
+ } else {
+
+/* Path 6 (M at least N, but not much larger) */
+
+ maxwrk = (*n << 1) + (*m + *n) * ilaenv_(&c__1, "CGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*n << 1) + *m;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "QLN", m, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *n * *n;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNMBR", "QLN", m, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ }
+ }
+ } else {
+
+/*
+ There is no complex work space needed for bidiagonal SVD
+ The real work space needed for bidiagonal SVD is BDSPAC
+ for computing singular values and singular vectors; BDSPAN
+ for computing singular values only.
+ BDSPAC = 5*M*M + 7*M
+ BDSPAN = MAX(7*M+4, 3*M+2+SMLSIZ*(SMLSIZ+8))
+*/
+
+ if (*n >= mnthr1) {
+ if (wntqn) {
+
+/* Path 1t (N much larger than M, JOBZ='N') */
+
+ maxwrk = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3;
+ } else if (wntqo) {
+
+/* Path 2t (N much larger than M, JOBZ='O') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "CUNGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "PRC", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "QLN", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *n + *m * *m + wrkbl;
+ minwrk = (*m << 1) * *m + *m * 3;
+ } else if (wntqs) {
+
+/* Path 3t (N much larger than M, JOBZ='S') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "CUNGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "PRC", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "QLN", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *m + wrkbl;
+ minwrk = *m * *m + *m * 3;
+ } else if (wntqa) {
+
+/* Path 4t (N much larger than M, JOBZ='A') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *n * ilaenv_(&c__1, "CUNGLQ",
+ " ", n, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "PRC", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "QLN", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *m + wrkbl;
+ minwrk = *m * *m + (*m << 1) + *n;
+ }
+ } else if (*n >= mnthr2) {
+
+/* Path 5t (N much larger than M, but not as much as MNTHR1) */
+
+ maxwrk = (*m << 1) + (*m + *n) * ilaenv_(&c__1, "CGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*m << 1) + *n;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "P", m, n, m, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *m * *m;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "P", m, n, m, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "P", n, n, m, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ }
+ } else {
+
+/* Path 6t (N greater than M, but not much larger) */
+
+ maxwrk = (*m << 1) + (*m + *n) * ilaenv_(&c__1, "CGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*m << 1) + *n;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "PRC", m, n, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNMBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *m * *m;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "PRC", m, n, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *n * ilaenv_(&c__1,
+ "CUNGBR", "PRC", n, n, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "CUNGBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ }
+ }
+ }
+ maxwrk = max(maxwrk,minwrk);
+ }
+ if (*info == 0) {
+ work[1].r = (real) maxwrk, work[1].i = 0.f;
+ if (*lwork < minwrk && *lwork != -1) {
+ *info = -13;
+ }
+ }
+
+/* Quick returns */
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGESDD", &i__1);
+ return 0;
+ }
+ if (*lwork == -1) {
+ return 0;
+ }
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = slamch_("P");
+ smlnum = sqrt(slamch_("S")) / eps;
+ bignum = 1.f / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = clange_("M", m, n, &a[a_offset], lda, dum);
+ iscl = 0;
+ if (anrm > 0.f && anrm < smlnum) {
+ iscl = 1;
+ clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, &
+ ierr);
+ } else if (anrm > bignum) {
+ iscl = 1;
+ clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+ if (*m >= *n) {
+
+/*
+ A has at least as many rows as columns. If A has sufficiently
+ more rows than columns, first reduce using the QR
+ decomposition (if sufficient workspace available)
+*/
+
+ if (*m >= mnthr1) {
+
+ if (wntqn) {
+
+/*
+ Path 1 (M much larger than N, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: need 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Zero out below R */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ claset_("L", &i__1, &i__2, &c_b56, &c_b56, &a[a_dim1 + 2],
+ lda);
+ ie = 1;
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (CWorkspace: need 3*N, prefer 2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+ nrwork = ie + *n;
+
+/*
+ Perform bidiagonal SVD, compute singular values only
+ (CWorkspace: 0)
+ (RWorkspace: need BDSPAN)
+*/
+
+ sbdsdc_("U", "N", n, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2 (M much larger than N, JOBZ='O')
+ N left singular vectors to be overwritten on A and
+ N right singular vectors to be computed in VT
+*/
+
+ iu = 1;
+
+/* WORK(IU) is N by N */
+
+ ldwrku = *n;
+ ir = iu + ldwrku * *n;
+ if (*lwork >= *m * *n + *n * *n + *n * 3) {
+
+/* WORK(IR) is M by N */
+
+ ldwrkr = *m;
+ } else {
+ ldwrkr = (*lwork - *n * *n - *n * 3) / *n;
+ }
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (CWorkspace: need N*N+2*N, prefer M*N+N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy R to WORK( IR ), zeroing out below it */
+
+ clacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ claset_("L", &i__1, &i__2, &c_b56, &c_b56, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cungqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in WORK(IR)
+ (CWorkspace: need N*N+3*N, prefer M*N+2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of R in WORK(IRU) and computing right singular vectors
+ of R in WORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *n;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix WORK(IU)
+ Overwrite WORK(IU) by the left singular vectors of R
+ (CWorkspace: need 2*N*N+3*N, prefer M*N+N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[iru], n, &work[iu], &ldwrku);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by the right singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer M*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IU), storing result in WORK(IR) and copying to A
+ (CWorkspace: need 2*N*N, prefer N*N+M*N)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *m;
+ i__2 = ldwrkr;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrkr);
+ cgemm_("N", "N", &chunk, n, n, &c_b57, &a[i__ + a_dim1],
+ lda, &work[iu], &ldwrku, &c_b56, &work[ir], &
+ ldwrkr);
+ clacpy_("F", &chunk, n, &work[ir], &ldwrkr, &a[i__ +
+ a_dim1], lda);
+/* L10: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3 (M much larger than N, JOBZ='S')
+ N left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ ir = 1;
+
+/* WORK(IR) is N by N */
+
+ ldwrkr = *n;
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy R to WORK(IR), zeroing out below it */
+
+ clacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ claset_("L", &i__2, &i__1, &c_b56, &c_b56, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cungqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in WORK(IR)
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *n;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[iru], n, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IR), storing result in U
+ (CWorkspace: need N*N)
+ (RWorkspace: 0)
+*/
+
+ clacpy_("F", n, n, &u[u_offset], ldu, &work[ir], &ldwrkr);
+ cgemm_("N", "N", m, n, n, &c_b57, &a[a_offset], lda, &work[ir]
+ , &ldwrkr, &c_b56, &u[u_offset], ldu);
+
+ } else if (wntqa) {
+
+/*
+ Path 4 (M much larger than N, JOBZ='A')
+ M left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ iu = 1;
+
+/* WORK(IU) is N by N */
+
+ ldwrku = *n;
+ itau = iu + ldwrku * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R, copying result to U
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+ clacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Generate Q in U
+ (CWorkspace: need N+M, prefer N+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cungqr_(m, m, n, &u[u_offset], ldu, &work[itau], &work[nwork],
+ &i__2, &ierr);
+
+/* Produce R in A, zeroing out below it */
+
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ claset_("L", &i__2, &i__1, &c_b56, &c_b56, &a[a_dim1 + 2],
+ lda);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (CWorkspace: need 3*N, prefer 2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgebrd_(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+ iru = ie + *n;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix WORK(IU)
+ Overwrite WORK(IU) by left singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[iru], n, &work[iu], &ldwrku);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", n, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of R
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in U by left singular vectors of R in
+ WORK(IU), storing result in A
+ (CWorkspace: need N*N)
+ (RWorkspace: 0)
+*/
+
+ cgemm_("N", "N", m, n, n, &c_b57, &u[u_offset], ldu, &work[iu]
+ , &ldwrku, &c_b56, &a[a_offset], lda);
+
+/* Copy left singular vectors of A from A to U */
+
+ clacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+ }
+
+ } else if (*m >= mnthr2) {
+
+/*
+ MNTHR2 <= M < MNTHR1
+
+ Path 5 (M much larger than N, but not as much as MNTHR1)
+ Reduce to bidiagonal form without QR decomposition, use
+ CUNGBR and matrix multiplication to compute singular vectors
+*/
+
+ ie = 1;
+ nrwork = ie + *n;
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*N+M, prefer 2*N+(M+N)*NB)
+ (RWorkspace: need N)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ sbdsdc_("U", "N", n, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ iu = nwork;
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cungbr_("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Generate Q in A
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cungbr_("Q", m, n, n, &a[a_offset], lda, &work[itauq], &work[
+ nwork], &i__2, &ierr);
+
+ if (*lwork >= *m * *n + *n * 3) {
+
+/* WORK( IU ) is M by N */
+
+ ldwrku = *m;
+ } else {
+
+/* WORK(IU) is LDWRKU by N */
+
+ ldwrku = (*lwork - *n * 3) / *n;
+ }
+ nwork = iu + ldwrku * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in WORK(IU), copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ clarcm_(n, n, &rwork[irvt], n, &vt[vt_offset], ldvt, &work[iu]
+ , &ldwrku, &rwork[nrwork]);
+ clacpy_("F", n, n, &work[iu], &ldwrku, &vt[vt_offset], ldvt);
+
+/*
+ Multiply Q in A by real matrix RWORK(IRU), storing the
+ result in WORK(IU), copying to A
+ (CWorkspace: need N*N, prefer M*N)
+ (Rworkspace: need 3*N*N, prefer N*N+2*M*N)
+*/
+
+ nrwork = irvt;
+ i__2 = *m;
+ i__1 = ldwrku;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrku);
+ clacrm_(&chunk, n, &a[i__ + a_dim1], lda, &rwork[iru], n,
+ &work[iu], &ldwrku, &rwork[nrwork]);
+ clacpy_("F", &chunk, n, &work[iu], &ldwrku, &a[i__ +
+ a_dim1], lda);
+/* L20: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cungbr_("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__1, &ierr);
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cungbr_("Q", m, n, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ clarcm_(n, n, &rwork[irvt], n, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ clacpy_("F", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: need 0)
+ (Rworkspace: need N*N+2*M*N)
+*/
+
+ nrwork = irvt;
+ clacrm_(m, n, &u[u_offset], ldu, &rwork[iru], n, &a[a_offset],
+ lda, &rwork[nrwork]);
+ clacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ } else {
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cungbr_("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__1, &ierr);
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ clarcm_(n, n, &rwork[irvt], n, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ clacpy_("F", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ nrwork = irvt;
+ clacrm_(m, n, &u[u_offset], ldu, &rwork[iru], n, &a[a_offset],
+ lda, &rwork[nrwork]);
+ clacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ }
+
+ } else {
+
+/*
+ M .LT. MNTHR2
+
+ Path 6 (M at least N, but not much larger)
+ Reduce to bidiagonal form without QR decomposition
+ Use CUNMBR to compute singular vectors
+*/
+
+ ie = 1;
+ nrwork = ie + *n;
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*N+M, prefer 2*N+(M+N)*NB)
+ (RWorkspace: need N)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, &ierr);
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ sbdsdc_("U", "N", n, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ iu = nwork;
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ if (*lwork >= *m * *n + *n * 3) {
+
+/* WORK( IU ) is M by N */
+
+ ldwrku = *m;
+ } else {
+
+/* WORK( IU ) is LDWRKU by N */
+
+ ldwrku = (*lwork - *n * 3) / *n;
+ }
+ nwork = iu + ldwrku * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: need 0)
+*/
+
+ clacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+ if (*lwork >= *m * *n + *n * 3) {
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix WORK(IU)
+ Overwrite WORK(IU) by left singular vectors of A, copying
+ to A
+ (Cworkspace: need M*N+2*N, prefer M*N+N+N*NB)
+ (Rworkspace: need 0)
+*/
+
+ claset_("F", m, n, &c_b56, &c_b56, &work[iu], &ldwrku);
+ clacp2_("F", n, n, &rwork[iru], n, &work[iu], &ldwrku);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__1, &
+ ierr);
+ clacpy_("F", m, n, &work[iu], &ldwrku, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate Q in A
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: need 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cungbr_("Q", m, n, n, &a[a_offset], lda, &work[itauq], &
+ work[nwork], &i__1, &ierr);
+
+/*
+ Multiply Q in A by real matrix RWORK(IRU), storing the
+ result in WORK(IU), copying to A
+ (CWorkspace: need N*N, prefer M*N)
+ (Rworkspace: need 3*N*N, prefer N*N+2*M*N)
+*/
+
+ nrwork = irvt;
+ i__1 = *m;
+ i__2 = ldwrku;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrku);
+ clacrm_(&chunk, n, &a[i__ + a_dim1], lda, &rwork[iru],
+ n, &work[iu], &ldwrku, &rwork[nrwork]);
+ clacpy_("F", &chunk, n, &work[iu], &ldwrku, &a[i__ +
+ a_dim1], lda);
+/* L30: */
+ }
+ }
+
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ claset_("F", m, n, &c_b56, &c_b56, &u[u_offset], ldu);
+ clacp2_("F", n, n, &rwork[iru], n, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+ } else {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ sbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/* Set the right corner of U to identity matrix */
+
+ claset_("F", m, m, &c_b56, &c_b56, &u[u_offset], ldu);
+ if (*m > *n) {
+ i__2 = *m - *n;
+ i__1 = *m - *n;
+ claset_("F", &i__2, &i__1, &c_b56, &c_b57, &u[*n + 1 + (*
+ n + 1) * u_dim1], ldu);
+ }
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 2*N+M, prefer 2*N+M*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[iru], n, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+ }
+
+ }
+
+ } else {
+
+/*
+ A has more columns than rows. If A has sufficiently more
+ columns than rows, first reduce using the LQ decomposition (if
+ sufficient workspace available)
+*/
+
+ if (*n >= mnthr1) {
+
+ if (wntqn) {
+
+/*
+ Path 1t (N much larger than M, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Zero out above L */
+
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ claset_("U", &i__2, &i__1, &c_b56, &c_b56, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = 1;
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (CWorkspace: need 3*M, prefer 2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgebrd_(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+ nrwork = ie + *m;
+
+/*
+ Perform bidiagonal SVD, compute singular values only
+ (CWorkspace: 0)
+                (RWorkspace: need BDSPAC)
+*/
+
+ sbdsdc_("U", "N", m, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2t (N much larger than M, JOBZ='O')
+ M right singular vectors to be overwritten on A and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+ ldwkvt = *m;
+
+/* WORK(IVT) is M by M */
+
+ il = ivt + ldwkvt * *m;
+ if (*lwork >= *m * *n + *m * *m + *m * 3) {
+
+/* WORK(IL) is M by N */
+
+ ldwrkl = *m;
+ chunk = *n;
+ } else {
+
+/* WORK(IL) is M by CHUNK */
+
+ ldwrkl = *m;
+ chunk = (*lwork - *m * *m - *m * 3) / *m;
+ }
+ itau = il + ldwrkl * chunk;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ clacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ claset_("U", &i__2, &i__1, &c_b56, &c_b56, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cunglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL)
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgebrd_(m, m, &work[il], &ldwrkl, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *m;
+ irvt = iru + *m * *m;
+ nrwork = irvt + *m * *m;
+ sbdsdc_("U", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+                Copy real matrix RWORK(IRU) to complex matrix U
+                Overwrite U by the left singular vectors of L
+ (CWorkspace: need N*N+3*N, prefer M*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix WORK(IVT)
+ Overwrite WORK(IVT) by the right singular vectors of L
+ (CWorkspace: need N*N+3*N, prefer M*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", m, m, &rwork[irvt], m, &work[ivt], &ldwkvt);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IL) by Q
+ in A, storing result in WORK(IL) and copying to A
+                (CWorkspace: need 2*M*M, prefer M*M+M*N)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *n;
+ i__1 = chunk;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ cgemm_("N", "N", m, &blk, m, &c_b57, &work[ivt], m, &a[
+ i__ * a_dim1 + 1], lda, &c_b56, &work[il], &
+ ldwrkl);
+ clacpy_("F", m, &blk, &work[il], &ldwrkl, &a[i__ * a_dim1
+ + 1], lda);
+/* L40: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3t (N much larger than M, JOBZ='S')
+ M right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ il = 1;
+
+/* WORK(IL) is M by M */
+
+ ldwrkl = *m;
+ itau = il + ldwrkl * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ clacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ claset_("U", &i__1, &i__2, &c_b56, &c_b56, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL)
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(m, m, &work[il], &ldwrkl, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *m;
+ irvt = iru + *m * *m;
+ nrwork = irvt + *m * *m;
+ sbdsdc_("U", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of L
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+                Overwrite VT by right singular vectors of L
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", m, m, &rwork[irvt], m, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Copy VT to WORK(IL), multiply right singular vectors of L
+ in WORK(IL) by Q in A, storing result in VT
+ (CWorkspace: need M*M)
+ (RWorkspace: 0)
+*/
+
+ clacpy_("F", m, m, &vt[vt_offset], ldvt, &work[il], &ldwrkl);
+ cgemm_("N", "N", m, n, m, &c_b57, &work[il], &ldwrkl, &a[
+ a_offset], lda, &c_b56, &vt[vt_offset], ldvt);
+
+ } else if (wntqa) {
+
+/*
+                Path 4t (N much larger than M, JOBZ='A')
+ N right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+
+/* WORK(IVT) is M by M */
+
+ ldwkvt = *m;
+ itau = ivt + ldwkvt * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q, copying result to VT
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+ clacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Generate Q in VT
+ (CWorkspace: need M+N, prefer M+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cunglq_(n, n, m, &vt[vt_offset], ldvt, &work[itau], &work[
+ nwork], &i__1, &ierr);
+
+/* Produce L in A, zeroing out above it */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ claset_("U", &i__1, &i__2, &c_b56, &c_b56, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *m;
+ irvt = iru + *m * *m;
+ nrwork = irvt + *m * *m;
+ sbdsdc_("U", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of L
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, m, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix WORK(IVT)
+ Overwrite WORK(IVT) by right singular vectors of L
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ clacp2_("F", m, m, &rwork[irvt], m, &work[ivt], &ldwkvt);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", m, m, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IVT) by
+ Q in VT, storing result in A
+ (CWorkspace: need M*M)
+ (RWorkspace: 0)
+*/
+
+ cgemm_("N", "N", m, n, m, &c_b57, &work[ivt], &ldwkvt, &vt[
+ vt_offset], ldvt, &c_b56, &a[a_offset], lda);
+
+/* Copy right singular vectors of A from A to VT */
+
+ clacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+ }
+
+ } else if (*n >= mnthr2) {
+
+/*
+ MNTHR2 <= N < MNTHR1
+
+ Path 5t (N much larger than M, but not as much as MNTHR1)
+ Reduce to bidiagonal form without QR decomposition, use
+ CUNGBR and matrix multiplication to compute singular vectors
+*/
+
+
+ ie = 1;
+ nrwork = ie + *m;
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+ (RWorkspace: M)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, &ierr);
+
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ sbdsdc_("L", "N", m, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ ivt = nwork;
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__1, &ierr);
+
+/*
+ Generate P**H in A
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ cungbr_("P", m, n, m, &a[a_offset], lda, &work[itaup], &work[
+ nwork], &i__1, &ierr);
+
+ ldwkvt = *m;
+ if (*lwork >= *m * *n + *m * 3) {
+
+/* WORK( IVT ) is M by N */
+
+ nwork = ivt + ldwkvt * *n;
+ chunk = *n;
+ } else {
+
+/* WORK( IVT ) is M by CHUNK */
+
+ chunk = (*lwork - *m * 3) / *m;
+ nwork = ivt + ldwkvt * chunk;
+ }
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ sbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+                Multiply Q in U by real matrix RWORK(IRU), storing
+                the result in WORK(IVT), copying to U
+ (Cworkspace: need 0)
+ (Rworkspace: need 2*M*M)
+*/
+
+ clacrm_(m, m, &u[u_offset], ldu, &rwork[iru], m, &work[ivt], &
+ ldwkvt, &rwork[nrwork]);
+ clacpy_("F", m, m, &work[ivt], &ldwkvt, &u[u_offset], ldu);
+
+/*
+ Multiply RWORK(IRVT) by P**H in A, storing the
+ result in WORK(IVT), copying to A
+ (CWorkspace: need M*M, prefer M*N)
+ (Rworkspace: need 2*M*M, prefer 2*M*N)
+*/
+
+ nrwork = iru;
+ i__1 = *n;
+ i__2 = chunk;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ clarcm_(m, &blk, &rwork[irvt], m, &a[i__ * a_dim1 + 1],
+ lda, &work[ivt], &ldwkvt, &rwork[nrwork]);
+ clacpy_("F", m, &blk, &work[ivt], &ldwkvt, &a[i__ *
+ a_dim1 + 1], lda);
+/* L50: */
+ }
+ } else if (wntqs) {
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__2, &ierr);
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cungbr_("P", m, n, m, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ sbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: need 0)
+ (Rworkspace: need 3*M*M)
+*/
+
+ clacrm_(m, m, &u[u_offset], ldu, &rwork[iru], m, &a[a_offset],
+ lda, &rwork[nrwork]);
+ clacpy_("F", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need M*M+2*M*N)
+*/
+
+ nrwork = iru;
+ clarcm_(m, n, &rwork[irvt], m, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ clacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ } else {
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__2, &ierr);
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ clacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ cungbr_("P", n, n, m, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ sbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: need 0)
+ (Rworkspace: need 3*M*M)
+*/
+
+ clacrm_(m, m, &u[u_offset], ldu, &rwork[iru], m, &a[a_offset],
+ lda, &rwork[nrwork]);
+ clacpy_("F", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need M*M+2*M*N)
+*/
+
+ clarcm_(m, n, &rwork[irvt], m, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ clacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ }
+
+ } else {
+
+/*
+ N .LT. MNTHR2
+
+ Path 6t (N greater than M, but not much larger)
+ Reduce to bidiagonal form without LQ decomposition
+ Use CUNMBR to compute singular vectors
+*/
+
+ ie = 1;
+ nrwork = ie + *m;
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+ (RWorkspace: M)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ sbdsdc_("L", "N", m, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ ldwkvt = *m;
+ ivt = nwork;
+ if (*lwork >= *m * *n + *m * 3) {
+
+/* WORK( IVT ) is M by N */
+
+ claset_("F", m, n, &c_b56, &c_b56, &work[ivt], &ldwkvt);
+ nwork = ivt + ldwkvt * *n;
+ } else {
+
+/* WORK( IVT ) is M by CHUNK */
+
+ chunk = (*lwork - *m * 3) / *m;
+ nwork = ivt + ldwkvt * chunk;
+ }
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ sbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: need 0)
+*/
+
+ clacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+ if (*lwork >= *m * *n + *m * 3) {
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix WORK(IVT)
+ Overwrite WORK(IVT) by right singular vectors of A,
+ copying to A
+ (Cworkspace: need M*N+2*M, prefer M*N+M+M*NB)
+ (Rworkspace: need 0)
+*/
+
+ clacp2_("F", m, m, &rwork[irvt], m, &work[ivt], &ldwkvt);
+ i__2 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", m, n, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2,
+ &ierr);
+ clacpy_("F", m, n, &work[ivt], &ldwkvt, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate P**H in A
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: need 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ cungbr_("P", m, n, m, &a[a_offset], lda, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Multiply Q in A by real matrix RWORK(IRU), storing the
+ result in WORK(IU), copying to A
+ (CWorkspace: need M*M, prefer M*N)
+ (Rworkspace: need 3*M*M, prefer M*M+2*M*N)
+*/
+
+ nrwork = iru;
+ i__2 = *n;
+ i__1 = chunk;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ clarcm_(m, &blk, &rwork[irvt], m, &a[i__ * a_dim1 + 1]
+ , lda, &work[ivt], &ldwkvt, &rwork[nrwork]);
+ clacpy_("F", m, &blk, &work[ivt], &ldwkvt, &a[i__ *
+ a_dim1 + 1], lda);
+/* L60: */
+ }
+ }
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ sbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: M*M)
+*/
+
+ clacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: M*M)
+*/
+
+ claset_("F", m, n, &c_b56, &c_b56, &vt[vt_offset], ldvt);
+ clacp2_("F", m, m, &rwork[irvt], m, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", m, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ } else {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+
+ sbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: M*M)
+*/
+
+ clacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/* Set all of VT to identity matrix */
+
+ claset_("F", n, n, &c_b56, &c_b57, &vt[vt_offset], ldvt);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 2*M+N, prefer 2*M+N*NB)
+ (RWorkspace: M*M)
+*/
+
+ clacp2_("F", m, m, &rwork[irvt], m, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ cunmbr_("P", "R", "C", n, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ }
+
+ }
+
+ }
+
+/* Undo scaling if necessary */
+
+ if (iscl == 1) {
+ if (anrm > bignum) {
+ slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ if (*info != 0 && anrm > bignum) {
+ i__1 = minmn - 1;
+ slascl_("G", &c__0, &c__0, &bignum, &anrm, &i__1, &c__1, &rwork[
+ ie], &minmn, &ierr);
+ }
+ if (anrm < smlnum) {
+ slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ if (*info != 0 && anrm < smlnum) {
+ i__1 = minmn - 1;
+ slascl_("G", &c__0, &c__0, &smlnum, &anrm, &i__1, &c__1, &rwork[
+ ie], &minmn, &ierr);
+ }
+ }
+
+/* Return optimal workspace in WORK(1) */
+
+ work[1].r = (real) maxwrk, work[1].i = 0.f;
+
+ return 0;
+
+/* End of CGESDD */
+
+} /* cgesdd_ */
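+
+/*
+    Illustrative usage sketch, assuming the cgesdd_ prototype and the f2c
+    typedefs (integer, real, complex) declared elsewhere in this file:
+    because CGESDD returns the optimal workspace size in WORK(1), callers
+    typically issue a workspace query with LWORK = -1 before allocating
+    WORK.  The leading dimensions passed in must already satisfy the
+    documented constraints for the chosen JOBZ.
+*/
+#if 0
+static integer cgesdd_query_lwork(char *jobz, integer m, integer n,
+                                  integer lda, integer ldu, integer ldvt)
+{
+    /* One-element dummies: an LWORK = -1 query only computes workspace
+       sizes and returns without referencing the matrix arguments. */
+    complex a_dum, u_dum, vt_dum, work_query;
+    real s_dum, rwork_dum;
+    integer iwork_dum, lwork = -1, info = 0;
+
+    cgesdd_(jobz, &m, &n, &a_dum, &lda, &s_dum, &u_dum, &ldu,
+            &vt_dum, &ldvt, &work_query, &lwork, &rwork_dum,
+            &iwork_dum, &info);
+
+    /* On success, WORK(1).r holds the optimal LWORK (see the code above). */
+    return info == 0 ? (integer) work_query.r : (integer) -1;
+}
+#endif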
+
+/* Subroutine */ int cgesv_(integer *n, integer *nrhs, complex *a, integer *
+ lda, integer *ipiv, complex *b, integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern /* Subroutine */ int cgetrf_(integer *, integer *, complex *,
+ integer *, integer *, integer *), xerbla_(char *, integer *), cgetrs_(char *, integer *, integer *, complex *, integer
+ *, integer *, complex *, integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGESV computes the solution to a complex system of linear equations
+ A * X = B,
+ where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+
+ The LU decomposition with partial pivoting and row interchanges is
+ used to factor A as
+ A = P * L * U,
+ where P is a permutation matrix, L is unit lower triangular, and U is
+ upper triangular. The factored form of A is then used to solve the
+ system of equations A * X = B.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of linear equations, i.e., the order of the
+ matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the N-by-N coefficient matrix A.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (output) INTEGER array, dimension (N)
+ The pivot indices that define the permutation matrix P;
+ row i of the matrix was interchanged with row IPIV(i).
+
+ B (input/output) COMPLEX array, dimension (LDB,NRHS)
+          On entry, the N-by-NRHS right hand side matrix B.
+ On exit, if INFO = 0, the N-by-NRHS solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, so the solution could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*nrhs < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGESV ", &i__1);
+ return 0;
+ }
+
+/* Compute the LU factorization of A. */
+
+ cgetrf_(n, n, &a[a_offset], lda, &ipiv[1], info);
+ if (*info == 0) {
+
+/* Solve the system A*X = B, overwriting B with X. */
+
+ cgetrs_("No transpose", n, nrhs, &a[a_offset], lda, &ipiv[1], &b[
+ b_offset], ldb, info);
+ }
+ return 0;
+
+/* End of CGESV */
+
+} /* cgesv_ */
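+
+/*
+    Illustrative usage sketch, assuming the f2c typedefs (integer, complex)
+    used by this file and column-major storage as documented above: a direct
+    call to cgesv_ for a small 2-by-2 system with one right hand side.
+*/
+#if 0
+static int cgesv_example(void)
+{
+    /* A stored column by column: first column {4, 1+i}, second {1-i, 3}. */
+    complex a[4] = { {4.f, 0.f}, {1.f, 1.f}, {1.f, -1.f}, {3.f, 0.f} };
+    complex b[2] = { {1.f, 0.f}, {2.f, 0.f} };  /* overwritten by the solution X */
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info = 0;
+    integer ipiv[2];
+
+    cgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
+
+    /* info == 0: b holds X;  info > 0: U(info,info) was exactly zero. */
+    return (int) info;
+}
+#endif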
+
+/* Subroutine */ int cgetf2_(integer *m, integer *n, complex *a, integer *lda,
+ integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, jp;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *), cgeru_(integer *, integer *, complex *, complex *,
+ integer *, complex *, integer *, complex *, integer *);
+ static real sfmin;
+ extern /* Subroutine */ int cswap_(integer *, complex *, integer *,
+ complex *, integer *);
+ extern integer icamax_(integer *, complex *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGETF2 computes an LU factorization of a general m-by-n matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the m by n matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, U(k,k) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGETF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Compute machine safe minimum */
+
+ sfmin = slamch_("S");
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+
+/* Find pivot and test for singularity. */
+
+ i__2 = *m - j + 1;
+ jp = j - 1 + icamax_(&i__2, &a[j + j * a_dim1], &c__1);
+ ipiv[j] = jp;
+ i__2 = jp + j * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+
+/* Apply the interchange to columns 1:N. */
+
+ if (jp != j) {
+ cswap_(n, &a[j + a_dim1], lda, &a[jp + a_dim1], lda);
+ }
+
+/* Compute elements J+1:M of J-th column. */
+
+ if (j < *m) {
+ if (c_abs(&a[j + j * a_dim1]) >= sfmin) {
+ i__2 = *m - j;
+ c_div(&q__1, &c_b57, &a[j + j * a_dim1]);
+ cscal_(&i__2, &q__1, &a[j + 1 + j * a_dim1], &c__1);
+ } else {
+ i__2 = *m - j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ + j * a_dim1;
+ c_div(&q__1, &a[j + i__ + j * a_dim1], &a[j + j *
+ a_dim1]);
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L20: */
+ }
+ }
+ }
+
+ } else if (*info == 0) {
+
+ *info = j;
+ }
+
+ if (j < min(*m,*n)) {
+
+/* Update trailing submatrix. */
+
+ i__2 = *m - j;
+ i__3 = *n - j;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgeru_(&i__2, &i__3, &q__1, &a[j + 1 + j * a_dim1], &c__1, &a[j +
+ (j + 1) * a_dim1], lda, &a[j + 1 + (j + 1) * a_dim1], lda)
+ ;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of CGETF2 */
+
+} /* cgetf2_ */
+
+/* Subroutine */ int cgetrf_(integer *m, integer *n, complex *a, integer *lda,
+ integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, jb, nb;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *);
+ static integer iinfo;
+ extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *), cgetf2_(integer *,
+ integer *, complex *, integer *, integer *, integer *), xerbla_(
+ char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int claswp_(integer *, complex *, integer *,
+ integer *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGETRF computes an LU factorization of a general M-by-N matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the M-by-N matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGETRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "CGETRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ if (nb <= 1 || nb >= min(*m,*n)) {
+
+/* Use unblocked code. */
+
+ cgetf2_(m, n, &a[a_offset], lda, &ipiv[1], info);
+ } else {
+
+/* Use blocked code. */
+
+ i__1 = min(*m,*n);
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+/* Computing MIN */
+ i__3 = min(*m,*n) - j + 1;
+ jb = min(i__3,nb);
+
+/*
+ Factor diagonal and subdiagonal blocks and test for exact
+ singularity.
+*/
+
+ i__3 = *m - j + 1;
+ cgetf2_(&i__3, &jb, &a[j + j * a_dim1], lda, &ipiv[j], &iinfo);
+
+/* Adjust INFO and the pivot indices. */
+
+ if (*info == 0 && iinfo > 0) {
+ *info = iinfo + j - 1;
+ }
+/* Computing MIN */
+ i__4 = *m, i__5 = j + jb - 1;
+ i__3 = min(i__4,i__5);
+ for (i__ = j; i__ <= i__3; ++i__) {
+ ipiv[i__] = j - 1 + ipiv[i__];
+/* L10: */
+ }
+
+/* Apply interchanges to columns 1:J-1. */
+
+ i__3 = j - 1;
+ i__4 = j + jb - 1;
+ claswp_(&i__3, &a[a_offset], lda, &j, &i__4, &ipiv[1], &c__1);
+
+ if (j + jb <= *n) {
+
+/* Apply interchanges to columns J+JB:N. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j + jb - 1;
+ claswp_(&i__3, &a[(j + jb) * a_dim1 + 1], lda, &j, &i__4, &
+ ipiv[1], &c__1);
+
+/* Compute block row of U. */
+
+ i__3 = *n - j - jb + 1;
+ ctrsm_("Left", "Lower", "No transpose", "Unit", &jb, &i__3, &
+ c_b57, &a[j + j * a_dim1], lda, &a[j + (j + jb) *
+ a_dim1], lda);
+ if (j + jb <= *m) {
+
+/* Update trailing submatrix. */
+
+ i__3 = *m - j - jb + 1;
+ i__4 = *n - j - jb + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "No transpose", &i__3, &i__4, &jb,
+ &q__1, &a[j + jb + j * a_dim1], lda, &a[j + (j +
+ jb) * a_dim1], lda, &c_b57, &a[j + jb + (j + jb) *
+ a_dim1], lda);
+ }
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of CGETRF */
+
+} /* cgetrf_ */
+
+/* Subroutine */ int cgetrs_(char *trans, integer *n, integer *nrhs, complex *
+ a, integer *lda, integer *ipiv, complex *b, integer *ldb, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *), xerbla_(char *,
+ integer *), claswp_(integer *, complex *, integer *,
+ integer *, integer *, integer *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CGETRS solves a system of linear equations
+ A * X = B, A**T * X = B, or A**H * X = B
+ with a general N-by-N matrix A using the LU factorization computed
+ by CGETRF.
+
+ Arguments
+ =========
+
+ TRANS (input) CHARACTER*1
+ Specifies the form of the system of equations:
+ = 'N': A * X = B (No transpose)
+ = 'T': A**T * X = B (Transpose)
+ = 'C': A**H * X = B (Conjugate transpose)
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The factors L and U from the factorization A = P*L*U
+ as computed by CGETRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (input) INTEGER array, dimension (N)
+ The pivot indices from CGETRF; for 1<=i<=N, row i of the
+ matrix was interchanged with row IPIV(i).
+
+ B (input/output) COMPLEX array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ notran = lsame_(trans, "N");
+ if (! notran && ! lsame_(trans, "T") && ! lsame_(
+ trans, "C")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CGETRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (notran) {
+
+/*
+ Solve A * X = B.
+
+ Apply row interchanges to the right hand sides.
+*/
+
+ claswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c__1);
+
+/* Solve L*X = B, overwriting B with X. */
+
+ ctrsm_("Left", "Lower", "No transpose", "Unit", n, nrhs, &c_b57, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ ctrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b57, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A**T * X = B or A**H * X = B.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ ctrsm_("Left", "Upper", trans, "Non-unit", n, nrhs, &c_b57, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ ctrsm_("Left", "Lower", trans, "Unit", n, nrhs, &c_b57, &a[a_offset],
+ lda, &b[b_offset], ldb);
+
+/* Apply row interchanges to the solution vectors. */
+
+ claswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1);
+ }
+
+ return 0;
+
+/* End of CGETRS */
+
+} /* cgetrs_ */
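+
+/*
+    Illustrative usage sketch, assuming the f2c typedefs used by this file:
+    CGETRF and CGETRS split factorization and solution, so a single
+    factorization can be reused for several right hand sides or for a
+    conjugate-transpose solve, as shown below with a caller-supplied pivot
+    array of length n.
+*/
+#if 0
+static int cgetrf_cgetrs_example(complex *a, integer n, integer lda,
+                                 integer *ipiv, /* length n */
+                                 complex *b1, complex *b2, integer ldb)
+{
+    integer info = 0, nrhs = 1;
+
+    /* Factor A = P*L*U once; A is overwritten by the factors. */
+    cgetrf_(&n, &n, a, &lda, ipiv, &info);
+    if (info != 0) {
+        return (int) info;  /* info > 0: U(info,info) is exactly zero */
+    }
+
+    /* Reuse the factorization: solve A*X = b1, then A**H * X = b2. */
+    cgetrs_("No transpose", &n, &nrhs, a, &lda, ipiv, b1, &ldb, &info);
+    if (info == 0) {
+        cgetrs_("Conjugate transpose", &n, &nrhs, a, &lda, ipiv, b2, &ldb, &info);
+    }
+    return (int) info;
+}
+#endif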
+
+/* Subroutine */ int cheevd_(char *jobz, char *uplo, integer *n, complex *a,
+ integer *lda, real *w, complex *work, integer *lwork, real *rwork,
+ integer *lrwork, integer *iwork, integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static real eps;
+ static integer inde;
+ static real anrm;
+ static integer imax;
+ static real rmin, rmax;
+ static integer lopt;
+ static real sigma;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static integer lwmin, liopt;
+ static logical lower;
+ static integer llrwk, lropt;
+ static logical wantz;
+ static integer indwk2, llwrk2;
+ extern doublereal clanhe_(char *, char *, integer *, complex *, integer *,
+ real *);
+ static integer iscale;
+ extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, complex *, integer *, integer *), cstedc_(char *, integer *, real *, real *, complex *,
+ integer *, complex *, integer *, real *, integer *, integer *,
+ integer *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int chetrd_(char *, integer *, complex *, integer
+ *, real *, real *, complex *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer
+ *, complex *, integer *);
+ static real safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real bignum;
+ static integer indtau, indrwk, indwrk, liwmin;
+ extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *);
+ static integer lrwmin;
+ extern /* Subroutine */ int cunmtr_(char *, char *, char *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *);
+ static integer llwork;
+ static real smlnum;
+ static logical lquery;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CHEEVD computes all eigenvalues and, optionally, eigenvectors of a
+ complex Hermitian matrix A. If eigenvectors are desired, it uses a
+ divide and conquer algorithm.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only;
+ = 'V': Compute eigenvalues and eigenvectors.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA, N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of A contains the
+ upper triangular part of the matrix A. If UPLO = 'L',
+ the leading N-by-N lower triangular part of A contains
+ the lower triangular part of the matrix A.
+ On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+ orthonormal eigenvectors of the matrix A.
+ If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+ or the upper triangle (if UPLO='U') of A, including the
+ diagonal, is destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ W (output) REAL array, dimension (N)
+ If INFO = 0, the eigenvalues in ascending order.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK.
+ If N <= 1, LWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LWORK must be at least N + 1.
+ If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal sizes of the WORK, RWORK and
+ IWORK arrays, returns these values as the first entries of
+ the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ RWORK (workspace/output) REAL array,
+ dimension (LRWORK)
+ On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+
+ LRWORK (input) INTEGER
+ The dimension of the array RWORK.
+ If N <= 1, LRWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+ If JOBZ = 'V' and N > 1, LRWORK must be at least
+ 1 + 5*N + 2*N**2.
+
+ If LRWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If N <= 1, LIWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+ If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+ to converge; i off-diagonal elements of an intermediate
+ tridiagonal form did not converge to zero;
+ if INFO = i and JOBZ = 'V', then the algorithm failed
+ to compute an eigenvalue while working on the submatrix
+ lying in rows and columns INFO/(N+1) through
+ mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ Modified description of INFO. Sven, 16 Feb 05.
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --w;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ wantz = lsame_(jobz, "V");
+ lower = lsame_(uplo, "L");
+ lquery = *lwork == -1 || *lrwork == -1 || *liwork == -1;
+
+ *info = 0;
+ if (! (wantz || lsame_(jobz, "N"))) {
+ *info = -1;
+ } else if (! (lower || lsame_(uplo, "U"))) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+
+ if (*info == 0) {
+ if (*n <= 1) {
+ lwmin = 1;
+ lrwmin = 1;
+ liwmin = 1;
+ lopt = lwmin;
+ lropt = lrwmin;
+ liopt = liwmin;
+ } else {
+ if (wantz) {
+ lwmin = (*n << 1) + *n * *n;
+/* Computing 2nd power */
+ i__1 = *n;
+ lrwmin = *n * 5 + 1 + (i__1 * i__1 << 1);
+ liwmin = *n * 5 + 3;
+ } else {
+ lwmin = *n + 1;
+ lrwmin = *n;
+ liwmin = 1;
+ }
+/* Computing MAX */
+ i__1 = lwmin, i__2 = *n + ilaenv_(&c__1, "CHETRD", uplo, n, &c_n1,
+ &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ lopt = max(i__1,i__2);
+ lropt = lrwmin;
+ liopt = liwmin;
+ }
+ work[1].r = (real) lopt, work[1].i = 0.f;
+ rwork[1] = (real) lropt;
+ iwork[1] = liopt;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*lrwork < lrwmin && ! lquery) {
+ *info = -10;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CHEEVD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ i__1 = a_dim1 + 1;
+ w[1] = a[i__1].r;
+ if (wantz) {
+ i__1 = a_dim1 + 1;
+ a[i__1].r = 1.f, a[i__1].i = 0.f;
+ }
+ return 0;
+ }
+
+/* Get machine constants. */
+
+ safmin = slamch_("Safe minimum");
+ eps = slamch_("Precision");
+ smlnum = safmin / eps;
+ bignum = 1.f / smlnum;
+ rmin = sqrt(smlnum);
+ rmax = sqrt(bignum);
+
+/* Scale matrix to allowable range, if necessary. */
+
+ anrm = clanhe_("M", uplo, n, &a[a_offset], lda, &rwork[1]);
+ iscale = 0;
+ if (anrm > 0.f && anrm < rmin) {
+ iscale = 1;
+ sigma = rmin / anrm;
+ } else if (anrm > rmax) {
+ iscale = 1;
+ sigma = rmax / anrm;
+ }
+ if (iscale == 1) {
+ clascl_(uplo, &c__0, &c__0, &c_b1034, &sigma, n, n, &a[a_offset], lda,
+ info);
+ }
+
+/* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */
+
+ inde = 1;
+ indtau = 1;
+ indwrk = indtau + *n;
+ indrwk = inde + *n;
+ indwk2 = indwrk + *n * *n;
+ llwork = *lwork - indwrk + 1;
+ llwrk2 = *lwork - indwk2 + 1;
+ llrwk = *lrwork - indrwk + 1;
+ chetrd_(uplo, n, &a[a_offset], lda, &w[1], &rwork[inde], &work[indtau], &
+ work[indwrk], &llwork, &iinfo);
+
+/*
+ For eigenvalues only, call SSTERF. For eigenvectors, first call
+ CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+    tridiagonal matrix, then call CUNMTR to multiply it by the
+ Householder transformations represented as Householder vectors in
+ A.
+*/
+
+ if (! wantz) {
+ ssterf_(n, &w[1], &rwork[inde], info);
+ } else {
+ cstedc_("I", n, &w[1], &rwork[inde], &work[indwrk], n, &work[indwk2],
+ &llwrk2, &rwork[indrwk], &llrwk, &iwork[1], liwork, info);
+ cunmtr_("L", uplo, "N", n, n, &a[a_offset], lda, &work[indtau], &work[
+ indwrk], n, &work[indwk2], &llwrk2, &iinfo);
+ clacpy_("A", n, n, &work[indwrk], n, &a[a_offset], lda);
+ }
+
+/* If matrix was scaled, then rescale eigenvalues appropriately. */
+
+ if (iscale == 1) {
+ if (*info == 0) {
+ imax = *n;
+ } else {
+ imax = *info - 1;
+ }
+ r__1 = 1.f / sigma;
+ sscal_(&imax, &r__1, &w[1], &c__1);
+ }
+
+ work[1].r = (real) lopt, work[1].i = 0.f;
+ rwork[1] = (real) lropt;
+ iwork[1] = liopt;
+
+ return 0;
+
+/* End of CHEEVD */
+
+} /* cheevd_ */
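+
+/*
+   A minimal usage sketch of the LWORK = LRWORK = LIWORK = -1 workspace
+   query described in the CHEEVD documentation above: one call to obtain
+   the optimal sizes, then the actual call with workspace of those sizes.
+   The sketch is not part of the LAPACK translation and is guarded out of
+   the build; it assumes the f2c types from f2c.h and <stdlib.h> for
+   malloc/free.
+*/
+#if 0
+#include <stdlib.h>
+
+static int example_cheevd_usage(integer n, complex *a, integer lda, real *w)
+{
+    complex wkopt;
+    real rwkopt;
+    integer iwkopt;
+    complex *work;
+    real *rwork;
+    integer *iwork;
+    integer lwork = -1, lrwork = -1, liwork = -1, info = 0;
+
+    /* 1. Workspace query: only the optimal sizes are returned, in
+          WORK(1), RWORK(1) and IWORK(1). */
+    cheevd_("V", "L", &n, a, &lda, w, &wkopt, &lwork, &rwkopt, &lrwork,
+            &iwkopt, &liwork, &info);
+    if (info != 0) {
+        return (int) info;
+    }
+    lwork = (integer) wkopt.r;
+    lrwork = (integer) rwkopt;
+    liwork = iwkopt;
+    work = (complex *) malloc(sizeof(complex) * lwork);
+    rwork = (real *) malloc(sizeof(real) * lrwork);
+    iwork = (integer *) malloc(sizeof(integer) * liwork);
+    if (work == NULL || rwork == NULL || iwork == NULL) {
+        free(work); free(rwork); free(iwork);
+        return -1;
+    }
+
+    /* 2. Actual call: eigenvalues in ascending order in W, eigenvectors
+          overwriting A (JOBZ = 'V'). */
+    cheevd_("V", "L", &n, a, &lda, w, work, &lwork, rwork, &lrwork,
+            iwork, &liwork, &info);
+
+    free(work); free(rwork); free(iwork);
+    return (int) info;
+}
+#endif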
+
+/* Subroutine */ int chetd2_(char *uplo, integer *n, complex *a, integer *lda,
+ real *d__, real *e, complex *tau, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ real r__1;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__;
+ static complex taui;
+ extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex *
+ , integer *, complex *, integer *, complex *, integer *);
+ static complex alpha;
+ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer
+ *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex *
+ , integer *, complex *, integer *, complex *, complex *, integer *
+ ), caxpy_(integer *, complex *, complex *, integer *,
+ complex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int clarfg_(integer *, complex *, complex *,
+ integer *, complex *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CHETD2 reduces a complex Hermitian matrix A to real symmetric
+ tridiagonal form T by a unitary similarity transformation:
+ Q' * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ Hermitian matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the unitary
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the unitary matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) REAL array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) COMPLEX array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CHETD2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Reduce the upper triangle of A */
+
+ i__1 = *n + *n * a_dim1;
+ i__2 = *n + *n * a_dim1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ for (i__ = *n - 1; i__ >= 1; --i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(1:i-1,i+1)
+*/
+
+ i__1 = i__ + (i__ + 1) * a_dim1;
+ alpha.r = a[i__1].r, alpha.i = a[i__1].i;
+ clarfg_(&i__, &alpha, &a[(i__ + 1) * a_dim1 + 1], &c__1, &taui);
+ i__1 = i__;
+ e[i__1] = alpha.r;
+
+ if (taui.r != 0.f || taui.i != 0.f) {
+
+/* Apply H(i) from both sides to A(1:i,1:i) */
+
+ i__1 = i__ + (i__ + 1) * a_dim1;
+ a[i__1].r = 1.f, a[i__1].i = 0.f;
+
+/* Compute x := tau * A * v storing x in TAU(1:i) */
+
+ chemv_(uplo, &i__, &taui, &a[a_offset], lda, &a[(i__ + 1) *
+ a_dim1 + 1], &c__1, &c_b56, &tau[1], &c__1)
+ ;
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ q__3.r = -.5f, q__3.i = -0.f;
+ q__2.r = q__3.r * taui.r - q__3.i * taui.i, q__2.i = q__3.r *
+ taui.i + q__3.i * taui.r;
+ cdotc_(&q__4, &i__, &tau[1], &c__1, &a[(i__ + 1) * a_dim1 + 1]
+ , &c__1);
+ q__1.r = q__2.r * q__4.r - q__2.i * q__4.i, q__1.i = q__2.r *
+ q__4.i + q__2.i * q__4.r;
+ alpha.r = q__1.r, alpha.i = q__1.i;
+ caxpy_(&i__, &alpha, &a[(i__ + 1) * a_dim1 + 1], &c__1, &tau[
+ 1], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ q__1.r = -1.f, q__1.i = -0.f;
+ cher2_(uplo, &i__, &q__1, &a[(i__ + 1) * a_dim1 + 1], &c__1, &
+ tau[1], &c__1, &a[a_offset], lda);
+
+ } else {
+ i__1 = i__ + i__ * a_dim1;
+ i__2 = i__ + i__ * a_dim1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ }
+ i__1 = i__ + (i__ + 1) * a_dim1;
+ i__2 = i__;
+ a[i__1].r = e[i__2], a[i__1].i = 0.f;
+ i__1 = i__ + 1;
+ i__2 = i__ + 1 + (i__ + 1) * a_dim1;
+ d__[i__1] = a[i__2].r;
+ i__1 = i__;
+ tau[i__1].r = taui.r, tau[i__1].i = taui.i;
+/* L10: */
+ }
+ i__1 = a_dim1 + 1;
+ d__[1] = a[i__1].r;
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__1 = a_dim1 + 1;
+ i__2 = a_dim1 + 1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(i+2:n,i)
+*/
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*n) + i__ * a_dim1], &c__1, &
+ taui);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+
+ if (taui.r != 0.f || taui.i != 0.f) {
+
+/* Apply H(i) from both sides to A(i+1:n,i+1:n) */
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute x := tau * A * v storing x in TAU(i:n-1) */
+
+ i__2 = *n - i__;
+ chemv_(uplo, &i__2, &taui, &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b56, &tau[
+ i__], &c__1);
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ q__3.r = -.5f, q__3.i = -0.f;
+ q__2.r = q__3.r * taui.r - q__3.i * taui.i, q__2.i = q__3.r *
+ taui.i + q__3.i * taui.r;
+ i__2 = *n - i__;
+ cdotc_(&q__4, &i__2, &tau[i__], &c__1, &a[i__ + 1 + i__ *
+ a_dim1], &c__1);
+ q__1.r = q__2.r * q__4.r - q__2.i * q__4.i, q__1.i = q__2.r *
+ q__4.i + q__2.i * q__4.r;
+ alpha.r = q__1.r, alpha.i = q__1.i;
+ i__2 = *n - i__;
+ caxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ i__2 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cher2_(uplo, &i__2, &q__1, &a[i__ + 1 + i__ * a_dim1], &c__1,
+ &tau[i__], &c__1, &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda);
+
+ } else {
+ i__2 = i__ + 1 + (i__ + 1) * a_dim1;
+ i__3 = i__ + 1 + (i__ + 1) * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ }
+ i__2 = i__ + 1 + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = e[i__3], a[i__2].i = 0.f;
+ i__2 = i__;
+ i__3 = i__ + i__ * a_dim1;
+ d__[i__2] = a[i__3].r;
+ i__2 = i__;
+ tau[i__2].r = taui.r, tau[i__2].i = taui.i;
+/* L20: */
+ }
+ i__1 = *n;
+ i__2 = *n + *n * a_dim1;
+ d__[i__1] = a[i__2].r;
+ }
+
+ return 0;
+
+/* End of CHETD2 */
+
+} /* chetd2_ */
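+
+/*
+   A minimal usage sketch of the storage scheme described above (D, E and
+   the reflector vectors/TAU overwriting A), shown on a small matrix.  The
+   sketch is not part of the LAPACK translation and is guarded out of the
+   build; it assumes the f2c types from f2c.h.
+*/
+#if 0
+static int example_chetd2_usage(void)
+{
+    /* 3-by-3 Hermitian matrix, column-major; only the lower triangle is
+       referenced for UPLO = 'L'. */
+    static complex a[9] = {
+        {4.f, 0.f}, {1.f, -1.f}, {0.f, 2.f},
+        {0.f, 0.f}, {3.f,  0.f}, {1.f, 0.f},
+        {0.f, 0.f}, {0.f,  0.f}, {2.f, 0.f}
+    };
+    real d[3], e[2];
+    complex tau[2];
+    integer n = 3, lda = 3, info = 0;
+
+    chetd2_("L", &n, a, &lda, d, e, tau, &info);
+    /* On success D holds the diagonal of T, E its subdiagonal, and the
+       reflector vectors v overwrite A below the first subdiagonal, with
+       the scalars in TAU, as illustrated in the comments above. */
+    return (int) info;
+}
+#endif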
+
+/* Subroutine */ int chetrd_(char *uplo, integer *n, complex *a, integer *lda,
+ real *d__, real *e, complex *tau, complex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, nb, kk, nx, iws;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ static logical upper;
+ extern /* Subroutine */ int chetd2_(char *, integer *, complex *, integer
+ *, real *, real *, complex *, integer *), cher2k_(char *,
+ char *, integer *, integer *, complex *, complex *, integer *,
+ complex *, integer *, real *, complex *, integer *), clatrd_(char *, integer *, integer *, complex *, integer
+ *, real *, complex *, complex *, integer *), xerbla_(char
+ *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CHETRD reduces a complex Hermitian matrix A to real symmetric
+ tridiagonal form T by a unitary similarity transformation:
+ Q**H * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the unitary
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the unitary matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) REAL array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) COMPLEX array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*lwork < 1 && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+
+/* Determine the block size. */
+
+ nb = ilaenv_(&c__1, "CHETRD", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6,
+ (ftnlen)1);
+ lwkopt = *n * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CHETRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nx = *n;
+ iws = 1;
+ if (nb > 1 && nb < *n) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code).
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "CHETRD", uplo, n, &c_n1, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *n) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code by setting NX = N.
+
+ Computing MAX
+*/
+ i__1 = *lwork / ldwork;
+ nb = max(i__1,1);
+ nbmin = ilaenv_(&c__2, "CHETRD", uplo, n, &c_n1, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ if (nb < nbmin) {
+ nx = *n;
+ }
+ }
+ } else {
+ nx = *n;
+ }
+ } else {
+ nb = 1;
+ }
+
+ if (upper) {
+
+/*
+ Reduce the upper triangle of A.
+ Columns 1:kk are handled by the unblocked method.
+*/
+
+ kk = *n - (*n - nx + nb - 1) / nb * nb;
+ i__1 = kk + 1;
+ i__2 = -nb;
+ for (i__ = *n - nb + 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = i__ + nb - 1;
+ clatrd_(uplo, &i__3, &nb, &a[a_offset], lda, &e[1], &tau[1], &
+ work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(1:i-1,1:i-1), using an
+ update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cher2k_(uplo, "No transpose", &i__3, &nb, &q__1, &a[i__ * a_dim1
+ + 1], lda, &work[1], &ldwork, &c_b1034, &a[a_offset], lda);
+
+/*
+ Copy superdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j - 1 + j * a_dim1;
+ i__5 = j - 1;
+ a[i__4].r = e[i__5], a[i__4].i = 0.f;
+ i__4 = j;
+ i__5 = j + j * a_dim1;
+ d__[i__4] = a[i__5].r;
+/* L10: */
+ }
+/* L20: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ chetd2_(uplo, &kk, &a[a_offset], lda, &d__[1], &e[1], &tau[1], &iinfo);
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__2 = *n - nx;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = *n - i__ + 1;
+ clatrd_(uplo, &i__3, &nb, &a[i__ + i__ * a_dim1], lda, &e[i__], &
+ tau[i__], &work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(i+nb:n,i+nb:n), using
+ an update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = *n - i__ - nb + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cher2k_(uplo, "No transpose", &i__3, &nb, &q__1, &a[i__ + nb +
+ i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b1034, &a[
+ i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/*
+ Copy subdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j + 1 + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = e[i__5], a[i__4].i = 0.f;
+ i__4 = j;
+ i__5 = j + j * a_dim1;
+ d__[i__4] = a[i__5].r;
+/* L30: */
+ }
+/* L40: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ i__1 = *n - i__ + 1;
+ chetd2_(uplo, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__],
+ &tau[i__], &iinfo);
+ }
+
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CHETRD */
+
+} /* chetrd_ */
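+
+/*
+   A minimal usage sketch of the LWORK >= N*NB recommendation above: NB is
+   obtained through the same ILAENV query the routine itself makes, the
+   workspace is sized accordingly, and CHETRD is called.  The sketch is
+   not part of the LAPACK translation and is guarded out of the build; it
+   assumes the f2c types from f2c.h and <stdlib.h> for malloc/free.
+*/
+#if 0
+#include <stdlib.h>
+
+static int example_chetrd_usage(integer n, complex *a, integer lda,
+        real *d, real *e, complex *tau)
+{
+    integer ione = 1, mone = -1, info = 0;
+    integer nb, lwork;
+    complex *work;
+
+    /* Blocksize recommended for CHETRD (ISPEC = 1). */
+    nb = ilaenv_(&ione, "CHETRD", "U", &n, &mone, &mone, &mone,
+            (ftnlen)6, (ftnlen)1);
+    lwork = n * nb;
+    if (lwork < 1) {
+        lwork = 1;
+    }
+    work = (complex *) malloc(sizeof(complex) * lwork);
+    if (work == NULL) {
+        return -1;
+    }
+    chetrd_("U", &n, a, &lda, d, e, tau, work, &lwork, &info);
+    free(work);
+    return (int) info;
+}
+#endif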
+
+/* Subroutine */ int chseqr_(char *job, char *compz, integer *n, integer *ilo,
+ integer *ihi, complex *h__, integer *ldh, complex *w, complex *z__,
+ integer *ldz, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3[2];
+ real r__1, r__2, r__3;
+ complex q__1;
+ char ch__1[2];
+
+ /* Local variables */
+ static complex hl[2401] /* was [49][49] */;
+ static integer kbot, nmin;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *);
+ static logical initz;
+ static complex workl[49];
+ static logical wantt, wantz;
+ extern /* Subroutine */ int claqr0_(logical *, logical *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, complex *, integer *, complex *, integer *, integer *),
+ clahqr_(logical *, logical *, integer *, integer *, integer *,
+ complex *, integer *, complex *, integer *, integer *, complex *,
+ integer *, integer *), clacpy_(char *, integer *, integer *,
+ complex *, integer *, complex *, integer *), claset_(char
+ *, integer *, integer *, complex *, complex *, complex *, integer
+ *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical lquery;
+
+
+/*
+ -- LAPACK computational routine (version 3.2.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ June 2010
+
+ Purpose
+ =======
+
+ CHSEQR computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**H, where T is an upper triangular matrix (the
+ Schur form), and Z is the unitary matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input unitary
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the unitary matrix Q: A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ = 'E': compute eigenvalues only;
+ = 'S': compute eigenvalues and the Schur form T.
+
+ COMPZ (input) CHARACTER*1
+ = 'N': no Schur vectors are computed;
+ = 'I': Z is initialized to the unit matrix and the matrix Z
+ of Schur vectors of H is returned;
+            = 'V':  Z must contain a unitary matrix Q on entry, and
+ the product Q*Z is returned.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to CGEBAL, and then passed to CGEHRD
+ when the matrix output by CGEBAL is reduced to Hessenberg
+ form. Otherwise ILO and IHI should be set to 1 and N
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) COMPLEX array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and JOB = 'S', H contains the upper
+ triangular matrix T from the Schur decomposition (the
+ Schur form). If INFO = 0 and JOB = 'E', the contents of
+ H are unspecified on exit. (The output value of H when
+ INFO.GT.0 is given under the description of INFO below.)
+
+ Unlike earlier versions of CHSEQR, this subroutine may
+            explicitly set H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+ or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ W (output) COMPLEX array, dimension (N)
+ The computed eigenvalues. If JOB = 'S', the eigenvalues are
+ stored in the same order as on the diagonal of the Schur
+ form returned in H, with W(i) = H(i,i).
+
+ Z (input/output) COMPLEX array, dimension (LDZ,N)
+ If COMPZ = 'N', Z is not referenced.
+ If COMPZ = 'I', on entry Z need not be set and on exit,
+ if INFO = 0, Z contains the unitary matrix Z of the Schur
+ vectors of H. If COMPZ = 'V', on entry Z must contain an
+ N-by-N matrix Q, which is assumed to be equal to the unit
+ matrix except for the submatrix Z(ILO:IHI,ILO:IHI). On exit,
+ if INFO = 0, Z contains Q*Z.
+ Normally Q is the unitary matrix generated by CUNGHR
+ after the call to CGEHRD which formed the Hessenberg matrix
+ H. (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if COMPZ = 'I' or
+            COMPZ = 'V', then LDZ.GE.MAX(1,N).  Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) COMPLEX array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient and delivers very good and sometimes
+ optimal performance. However, LWORK as large as 11*N
+ may be required for optimal performance. A workspace
+ query is recommended to determine the optimal workspace
+ size.
+
+ If LWORK = -1, then CHSEQR does a workspace query.
+ In this case, CHSEQR checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+            issued by XERBLA.  Neither H nor Z is accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .LT. 0: if INFO = -i, the i-th argument had an illegal
+ value
+ .GT. 0: if INFO = i, CHSEQR failed to compute all of
+                the eigenvalues.  Elements 1:ilo-1 and i+1:n of W
+                contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and JOB = 'E', then on exit, the
+ remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and JOB = 'S', then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is a unitary matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and COMPZ = 'V', then on exit
+
+ (final value of Z) = (initial value of Z)*U
+
+ where U is the unitary matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'I', then on exit
+ (final value of Z) = U
+ where U is the unitary matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'N', then Z is not
+ accessed.
+
+ ================================================================
+ Default values supplied by
+ ILAENV(ISPEC,'CHSEQR',JOB(:1)//COMPZ(:1),N,ILO,IHI,LWORK).
+ It is suggested that these defaults be adjusted in order
+ to attain best performance in each particular
+ computational environment.
+
+ ISPEC=12: The CLAHQR vs CLAQR0 crossover point.
+ Default: 75. (Must be at least 11.)
+
+ ISPEC=13: Recommended deflation window size.
+ This depends on ILO, IHI and NS. NS is the
+ number of simultaneous shifts returned
+ by ILAENV(ISPEC=15). (See ISPEC=15 below.)
+ The default for (IHI-ILO+1).LE.500 is NS.
+ The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+
+ ISPEC=14: Nibble crossover point. (See IPARMQ for
+ details.) Default: 14% of deflation window
+ size.
+
+ ISPEC=15: Number of simultaneous shifts in a multishift
+ QR iteration.
+
+ If IHI-ILO+1 is ...
+
+ greater than ...but less ... the
+ or equal to ... than default is
+
+ 1 30 NS = 2(+)
+ 30 60 NS = 4(+)
+ 60 150 NS = 10(+)
+ 150 590 NS = **
+ 590 3000 NS = 64
+ 3000 6000 NS = 128
+ 6000 infinity NS = 256
+
+ (+) By default some or all matrices of this order
+ are passed to the implicit double shift routine
+ CLAHQR and this parameter is ignored. See
+ ISPEC=12 above and comments in IPARMQ for
+ details.
+
+ (**) The asterisks (**) indicate an ad-hoc
+ function of N increasing from 10 to 64.
+
+ ISPEC=16: Select structured matrix multiply.
+ If the number of simultaneous shifts (specified
+ by ISPEC=15) is less than 14, then the default
+ for ISPEC=16 is 0. Otherwise the default for
+ ISPEC=16 is 2.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . CLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== NL allocates some local workspace to help small matrices
+ . through a rare CLAHQR failure. NL .GT. NTINY = 11 is
+ . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+ . mended. (The default value of NMIN is 75.) Using NL = 49
+ . allows up to six simultaneous shifts and a 16-by-16
+ . deflation window. ====
+
+ ==== Decode and check the input parameters. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ wantt = lsame_(job, "S");
+ initz = lsame_(compz, "I");
+ wantz = initz || lsame_(compz, "V");
+ r__1 = (real) max(1,*n);
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ lquery = *lwork == -1;
+
+ *info = 0;
+ if (! lsame_(job, "E") && ! wantt) {
+ *info = -1;
+ } else if (! lsame_(compz, "N") && ! wantz) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*ldh < max(1,*n)) {
+ *info = -7;
+ } else if (*ldz < 1 || wantz && *ldz < max(1,*n)) {
+ *info = -10;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info != 0) {
+
+/* ==== Quick return in case of invalid argument. ==== */
+
+ i__1 = -(*info);
+ xerbla_("CHSEQR", &i__1);
+ return 0;
+
+ } else if (*n == 0) {
+
+/* ==== Quick return in case N = 0; nothing to do. ==== */
+
+ return 0;
+
+ } else if (lquery) {
+
+/* ==== Quick return in case of a workspace query ==== */
+
+ claqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1], ilo,
+ ihi, &z__[z_offset], ldz, &work[1], lwork, info);
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+ Computing MAX
+*/
+ r__2 = work[1].r, r__3 = (real) max(1,*n);
+ r__1 = dmax(r__2,r__3);
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ return 0;
+
+ } else {
+
+/* ==== copy eigenvalues isolated by CGEBAL ==== */
+
+ if (*ilo > 1) {
+ i__1 = *ilo - 1;
+ i__2 = *ldh + 1;
+ ccopy_(&i__1, &h__[h_offset], &i__2, &w[1], &c__1);
+ }
+ if (*ihi < *n) {
+ i__1 = *n - *ihi;
+ i__2 = *ldh + 1;
+ ccopy_(&i__1, &h__[*ihi + 1 + (*ihi + 1) * h_dim1], &i__2, &w[*
+ ihi + 1], &c__1);
+ }
+
+/* ==== Initialize Z, if requested ==== */
+
+ if (initz) {
+ claset_("A", n, n, &c_b56, &c_b57, &z__[z_offset], ldz)
+ ;
+ }
+
+/* ==== Quick return if possible ==== */
+
+ if (*ilo == *ihi) {
+ i__1 = *ilo;
+ i__2 = *ilo + *ilo * h_dim1;
+ w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i;
+ return 0;
+ }
+
+/*
+ ==== CLAHQR/CLAQR0 crossover point ====
+
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = job;
+ i__3[1] = 1, a__1[1] = compz;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ nmin = ilaenv_(&c__12, "CHSEQR", ch__1, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== CLAQR0 for big matrices; CLAHQR for small ones ==== */
+
+ if (*n > nmin) {
+ claqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ ilo, ihi, &z__[z_offset], ldz, &work[1], lwork, info);
+ } else {
+
+/* ==== Small matrix ==== */
+
+ clahqr_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ ilo, ihi, &z__[z_offset], ldz, info);
+
+ if (*info > 0) {
+
+/*
+ ==== A rare CLAHQR failure! CLAQR0 sometimes succeeds
+ . when CLAHQR fails. ====
+*/
+
+ kbot = *info;
+
+ if (*n >= 49) {
+
+/*
+ ==== Larger matrices have enough subdiagonal scratch
+ . space to call CLAQR0 directly. ====
+*/
+
+ claqr0_(&wantt, &wantz, n, ilo, &kbot, &h__[h_offset],
+ ldh, &w[1], ilo, ihi, &z__[z_offset], ldz, &work[
+ 1], lwork, info);
+
+ } else {
+
+/*
+ ==== Tiny matrices don't have enough subdiagonal
+ . scratch space to benefit from CLAQR0. Hence,
+ . tiny matrices must be copied into a larger
+ . array before calling CLAQR0. ====
+*/
+
+ clacpy_("A", n, n, &h__[h_offset], ldh, hl, &c__49);
+ i__1 = *n + 1 + *n * 49 - 50;
+ hl[i__1].r = 0.f, hl[i__1].i = 0.f;
+ i__1 = 49 - *n;
+ claset_("A", &c__49, &i__1, &c_b56, &c_b56, &hl[(*n + 1) *
+ 49 - 49], &c__49);
+ claqr0_(&wantt, &wantz, &c__49, ilo, &kbot, hl, &c__49, &
+ w[1], ilo, ihi, &z__[z_offset], ldz, workl, &
+ c__49, info);
+ if (wantt || *info != 0) {
+ clacpy_("A", n, n, hl, &c__49, &h__[h_offset], ldh);
+ }
+ }
+ }
+ }
+
+/* ==== Clear out the trash, if necessary. ==== */
+
+ if ((wantt || *info != 0) && *n > 2) {
+ i__1 = *n - 2;
+ i__2 = *n - 2;
+ claset_("L", &i__1, &i__2, &c_b56, &c_b56, &h__[h_dim1 + 3], ldh);
+ }
+
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+
+ Computing MAX
+*/
+ r__2 = (real) max(1,*n), r__3 = work[1].r;
+ r__1 = dmax(r__2,r__3);
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ }
+
+/* ==== End of CHSEQR ==== */
+
+ return 0;
+} /* chseqr_ */
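+
+/*
+   A minimal usage sketch of an eigenvalues-only call (JOB = 'E',
+   COMPZ = 'N') on a small upper Hessenberg matrix, with LWORK sized at
+   the 11*N upper estimate mentioned above.  The sketch is not part of the
+   LAPACK translation and is guarded out of the build; it assumes the f2c
+   types from f2c.h.
+*/
+#if 0
+static int example_chseqr_usage(void)
+{
+    /* 3-by-3 upper Hessenberg matrix, column-major (H(3,1) = 0). */
+    static complex h[9] = {
+        {1.f, 0.f}, {2.f, 0.f}, {0.f, 0.f},
+        {3.f, 0.f}, {4.f, 0.f}, {5.f, 0.f},
+        {6.f, 0.f}, {7.f, 0.f}, {8.f, 0.f}
+    };
+    complex w[3], work[33], zdum;
+    integer n = 3, ilo = 1, ihi = 3, ldh = 3, ldz = 1, lwork = 33,
+            info = 0;
+
+    /* COMPZ = 'N': Z is not referenced, so a dummy with LDZ = 1 suffices. */
+    chseqr_("E", "N", &n, &ilo, &ihi, h, &ldh, w, &zdum, &ldz,
+            work, &lwork, &info);
+    /* On success w[0..2] hold the eigenvalues of H. */
+    return (int) info;
+}
+#endif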
+
+/* Subroutine */ int clabrd_(integer *m, integer *n, integer *nb, complex *a,
+ integer *lda, real *d__, real *e, complex *tauq, complex *taup,
+ complex *x, integer *ldx, complex *y, integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__;
+ static complex alpha;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *), cgemv_(char *, integer *, integer *, complex *,
+ complex *, integer *, complex *, integer *, complex *, complex *,
+ integer *), clarfg_(integer *, complex *, complex *,
+ integer *, complex *), clacgv_(integer *, complex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLABRD reduces the first NB rows and columns of a complex general
+ m by n matrix A to upper or lower real bidiagonal form by a unitary
+ transformation Q' * A * P, and returns the matrices X and Y which
+ are needed to apply the transformation to the unreduced part of A.
+
+ If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+ bidiagonal form.
+
+ This is an auxiliary routine called by CGEBRD
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A.
+
+ N (input) INTEGER
+ The number of columns in the matrix A.
+
+ NB (input) INTEGER
+ The number of leading rows and columns of A to be reduced.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit, the first NB rows and columns of the matrix are
+ overwritten; the rest of the array is unchanged.
+ If m >= n, elements on and below the diagonal in the first NB
+ columns, with the array TAUQ, represent the unitary
+ matrix Q as a product of elementary reflectors; and
+ elements above the diagonal in the first NB rows, with the
+ array TAUP, represent the unitary matrix P as a product
+ of elementary reflectors.
+ If m < n, elements below the diagonal in the first NB
+ columns, with the array TAUQ, represent the unitary
+ matrix Q as a product of elementary reflectors, and
+ elements on and above the diagonal in the first NB rows,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) REAL array, dimension (NB)
+ The diagonal elements of the first NB rows and columns of
+ the reduced matrix. D(i) = A(i,i).
+
+ E (output) REAL array, dimension (NB)
+ The off-diagonal elements of the first NB rows and columns of
+ the reduced matrix.
+
+ TAUQ (output) COMPLEX array dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix P. See Further Details.
+
+ X (output) COMPLEX array, dimension (LDX,NB)
+ The m-by-nb matrix X required to update the unreduced part
+ of A.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. LDX >= max(1,M).
+
+ Y (output) COMPLEX array, dimension (LDY,NB)
+ The n-by-nb matrix Y required to update the unreduced part
+ of A.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= max(1,N).
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors.
+
+ If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
+ A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The elements of the vectors v and u together form the m-by-nb matrix
+ V and the nb-by-n matrix U' which are needed, with X and Y, to apply
+ the transformation to the unreduced part of the matrix, using a block
+ update of the form: A := A - V*Y' - X*U'.
+
+ The contents of A on exit are illustrated by the following examples
+ with nb = 2:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 )
+ ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 )
+ ( v1 v2 a a a ) ( v1 1 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix which is unchanged,
+ vi denotes an element of the vector defining H(i), and ui an element
+ of the vector defining G(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:m,i) */
+
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[i__ + a_dim1], lda,
+ &y[i__ + y_dim1], ldy, &c_b57, &a[i__ + i__ * a_dim1], &
+ c__1);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &x[i__ + x_dim1], ldx,
+ &a[i__ * a_dim1 + 1], &c__1, &c_b57, &a[i__ + i__ *
+ a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+1:m,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1, &
+ tauq[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ if (i__ < *n) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ + (
+ i__ + 1) * a_dim1], lda, &a[i__ + i__ * a_dim1], &
+ c__1, &c_b56, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ a_dim1], lda, &a[i__ + i__ * a_dim1], &c__1, &c_b56, &
+ y[i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b57, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &x[i__ +
+ x_dim1], ldx, &a[i__ + i__ * a_dim1], &c__1, &c_b56, &
+ y[i__ * y_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &q__1, &a[(i__ +
+ 1) * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &
+ c_b57, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ cscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+
+/* Update A(i,i+1:n) */
+
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ clacgv_(&i__, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__, &q__1, &y[i__ + 1 +
+ y_dim1], ldy, &a[i__ + a_dim1], lda, &c_b57, &a[i__ +
+ (i__ + 1) * a_dim1], lda);
+ clacgv_(&i__, &a[i__ + a_dim1], lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &x[i__ + x_dim1], ldx);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &q__1, &a[(i__ +
+ 1) * a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, &c_b57,
+ &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &x[i__ + x_dim1], ldx);
+
+/* Generate reflection P(i) to annihilate A(i,i+2:n) */
+
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ cgemv_("No transpose", &i__2, &i__3, &c_b57, &a[i__ + 1 + (
+ i__ + 1) * a_dim1], lda, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &c_b56, &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__;
+ cgemv_("Conjugate transpose", &i__2, &i__, &c_b57, &y[i__ + 1
+ + y_dim1], ldy, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b56, &x[i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__, &q__1, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ cgemv_("No transpose", &i__2, &i__3, &c_b57, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b56, &x[i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ cscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i,i:n) */
+
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &y[i__ + y_dim1], ldy,
+ &a[i__ + a_dim1], lda, &c_b57, &a[i__ + i__ * a_dim1],
+ lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &x[i__ + x_dim1], ldx);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &q__1, &a[i__ *
+ a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, &c_b57, &a[i__ +
+ i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &x[i__ + x_dim1], ldx);
+
+/* Generate reflection P(i) to annihilate A(i,i+1:n) */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ clarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ if (i__ < *m) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ cgemv_("No transpose", &i__2, &i__3, &c_b57, &a[i__ + 1 + i__
+ * a_dim1], lda, &a[i__ + i__ * a_dim1], lda, &c_b56, &
+ x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &y[i__ +
+ y_dim1], ldy, &a[i__ + i__ * a_dim1], lda, &c_b56, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ cgemv_("No transpose", &i__2, &i__3, &c_b57, &a[i__ * a_dim1
+ + 1], lda, &a[i__ + i__ * a_dim1], lda, &c_b56, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ cscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+
+/* Update A(i+1:m,i) */
+
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[i__ + 1 +
+ a_dim1], lda, &y[i__ + y_dim1], ldy, &c_b57, &a[i__ +
+ 1 + i__ * a_dim1], &c__1);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__, &q__1, &x[i__ + 1 +
+ x_dim1], ldx, &a[i__ * a_dim1 + 1], &c__1, &c_b57, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+2:m,i) */
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1,
+ &tauq[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + (i__ + 1) * a_dim1], lda, &a[i__ + 1 + i__ *
+ a_dim1], &c__1, &c_b56, &y[i__ + 1 + i__ * y_dim1], &
+ c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &y[i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b57, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__;
+ cgemv_("Conjugate transpose", &i__2, &i__, &c_b57, &x[i__ + 1
+ + x_dim1], ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &y[i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("Conjugate transpose", &i__, &i__2, &q__1, &a[(i__ + 1)
+ * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &
+ c_b57, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ cscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+ } else {
+ i__2 = *n - i__ + 1;
+ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of CLABRD */
+
+} /* clabrd_ */
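+
+/*
+   A minimal usage sketch of a single panel reduction: CLABRD reduces the
+   leading NB rows/columns, and the X and Y matrices it returns are what
+   the caller (CGEBRD) uses for the block update A := A - V*Y' - X*U' on
+   the trailing submatrix before reducing the next panel.  The sketch is
+   not part of the LAPACK translation and is guarded out of the build; it
+   assumes the f2c types from f2c.h.
+*/
+#if 0
+static int example_clabrd_usage(void)
+{
+    /* 4-by-3 matrix (m >= n), column-major; reduce the leading nb = 2
+       rows and columns toward upper bidiagonal form. */
+    static complex a[12] = {
+        {1.f, 0.f}, {2.f, 1.f}, {0.f, 1.f}, {1.f, 1.f},
+        {3.f, 0.f}, {1.f, 0.f}, {2.f, 2.f}, {0.f, 1.f},
+        {5.f, 1.f}, {4.f, 0.f}, {1.f, 1.f}, {2.f, 0.f}
+    };
+    real d[2], e[2];
+    complex tauq[2], taup[2], x[4 * 2], y[3 * 2];
+    integer m = 4, n = 3, nb = 2, lda = 4, ldx = 4, ldy = 3;
+
+    clabrd_(&m, &n, &nb, a, &lda, d, e, tauq, taup, x, &ldx, y, &ldy);
+    /* D and E now hold the first nb diagonal/off-diagonal elements, the
+       reflectors are stored in A as described above, and a caller would
+       next apply A(nb+1:m,nb+1:n) := A(nb+1:m,nb+1:n) - V*Y' - X*U'
+       (two CGEMM calls in CGEBRD) before reducing the remaining part. */
+    return 0;
+}
+#endif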
+
+/* Subroutine */ int clacgv_(integer *n, complex *x, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, ioff;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLACGV conjugates a complex vector of length N.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The length of the vector X. N >= 0.
+
+ X (input/output) COMPLEX array, dimension
+ (1+(N-1)*abs(INCX))
+ On entry, the vector of length N to be conjugated.
+ On exit, X is overwritten with conjg(X).
+
+ INCX (input) INTEGER
+ The spacing between successive elements of X.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*incx == 1) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ r_cnjg(&q__1, &x[i__]);
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+/* L10: */
+ }
+ } else {
+ ioff = 1;
+ if (*incx < 0) {
+ ioff = 1 - (*n - 1) * *incx;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ioff;
+ r_cnjg(&q__1, &x[ioff]);
+ x[i__2].r = q__1.r, x[i__2].i = q__1.i;
+ ioff += *incx;
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of CLACGV */
+
+} /* clacgv_ */
+
+/* Subroutine */ int clacp2_(char *uplo, integer *m, integer *n, real *a,
+ integer *lda, complex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLACP2 copies all or part of a real two-dimensional matrix A to a
+ complex matrix B.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be copied to B.
+ = 'U': Upper triangular part
+ = 'L': Lower triangular part
+ Otherwise: All of the matrix A
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input) REAL array, dimension (LDA,N)
+ The m by n matrix A. If UPLO = 'U', only the upper trapezium
+ is accessed; if UPLO = 'L', only the lower trapezium is
+ accessed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (output) COMPLEX array, dimension (LDB,N)
+ On exit, B = A in the locations specified by UPLO.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4], b[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4], b[i__3].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4], b[i__3].i = 0.f;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ return 0;
+
+/* End of CLACP2 */
+
+} /* clacp2_ */
+
+/* Subroutine */ int clacpy_(char *uplo, integer *m, integer *n, complex *a,
+ integer *lda, complex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLACPY copies all or part of a two-dimensional matrix A to another
+ matrix B.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be copied to B.
+ = 'U': Upper triangular part
+ = 'L': Lower triangular part
+ Otherwise: All of the matrix A
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The m by n matrix A. If UPLO = 'U', only the upper trapezium
+ is accessed; if UPLO = 'L', only the lower trapezium is
+ accessed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (output) COMPLEX array, dimension (LDB,N)
+ On exit, B = A in the locations specified by UPLO.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4].r, b[i__3].i = a[i__4].i;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4].r, b[i__3].i = a[i__4].i;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4].r, b[i__3].i = a[i__4].i;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ return 0;
+
+/* End of CLACPY */
+
+} /* clacpy_ */
+
+/* Subroutine */ int clacrm_(integer *m, integer *n, complex *a, integer *lda,
+ real *b, integer *ldb, complex *c__, integer *ldc, real *rwork)
+{
+ /* System generated locals */
+ integer b_dim1, b_offset, a_dim1, a_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5;
+ real r__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLACRM performs a very simple matrix-matrix multiplication:
+ C := A * B,
+ where A is M by N and complex; B is N by N and real;
+ C is M by N and complex.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A and of the matrix C.
+ M >= 0.
+
+ N (input) INTEGER
+ The number of columns and rows of the matrix B and
+ the number of columns of the matrix C.
+ N >= 0.
+
+ A (input) COMPLEX array, dimension (LDA, N)
+ A contains the M by N matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >=max(1,M).
+
+ B (input) REAL array, dimension (LDB, N)
+ B contains the N by N matrix B.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >=max(1,N).
+
+    C      (output) COMPLEX array, dimension (LDC, N)
+           On exit, C contains the M by N matrix C.
+
+ LDC (input) INTEGER
+           The leading dimension of the array C. LDC >= max(1,M).
+
+ RWORK (workspace) REAL array, dimension (2*M*N)
+
+ =====================================================================
+
+
+ Quick return if possible.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --rwork;
+
+ /* Function Body */
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ rwork[(j - 1) * *m + i__] = a[i__3].r;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ l = *m * *n + 1;
+ sgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = l + (j - 1) * *m + i__ - 1;
+ c__[i__3].r = rwork[i__4], c__[i__3].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ rwork[(j - 1) * *m + i__] = r_imag(&a[i__ + j * a_dim1]);
+/* L50: */
+ }
+/* L60: */
+ }
+ sgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ r__1 = c__[i__4].r;
+ i__5 = l + (j - 1) * *m + i__ - 1;
+ q__1.r = r__1, q__1.i = rwork[i__5];
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+
+ return 0;
+
+/* End of CLACRM */
+
+} /* clacrm_ */
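+
+/*
+   Illustrative sketch (not part of the LAPACK sources): because B is
+   real, CLACRM only needs two real matrix products, Re(C) = Re(A) x B
+   and Im(C) = Im(A) x B, which is what the two SGEMM calls above compute
+   through RWORK.  A naive reference version of the same split, as a
+   hypothetical 0-based C99 helper, could look like:
+
+       #include <complex.h>
+
+       static void clacrm_ref(int m, int n, const float complex *a, int lda,
+                              const float *b, int ldb,
+                              float complex *c, int ldc)
+       {
+           for (int j = 0; j < n; ++j)
+               for (int i = 0; i < m; ++i) {
+                   float re = 0.f, im = 0.f;
+                   for (int l = 0; l < n; ++l) {   // B is n by n
+                       re += crealf(a[i + l * lda]) * b[l + j * ldb];
+                       im += cimagf(a[i + l * lda]) * b[l + j * ldb];
+                   }
+                   c[i + j * ldc] = re + im * I;   // C is overwritten, not read
+               }
+       }
+*/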
+
+/* Complex */ VOID cladiv_(complex * ret_val, complex *x, complex *y)
+{
+ /* System generated locals */
+ real r__1, r__2, r__3, r__4;
+ complex q__1;
+
+ /* Local variables */
+ static real zi, zr;
+ extern /* Subroutine */ int sladiv_(real *, real *, real *, real *, real *
+ , real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+    CLADIV := X / Y, where X and Y are complex.  The computation of X / Y
+    will not overflow on an intermediate step unless the result
+    overflows.
+
+ Arguments
+ =========
+
+ X (input) COMPLEX
+ Y (input) COMPLEX
+ The complex scalars X and Y.
+
+ =====================================================================
+*/
+
+
+ r__1 = x->r;
+ r__2 = r_imag(x);
+ r__3 = y->r;
+ r__4 = r_imag(y);
+ sladiv_(&r__1, &r__2, &r__3, &r__4, &zr, &zi);
+ q__1.r = zr, q__1.i = zi;
+ ret_val->r = q__1.r, ret_val->i = q__1.i;
+
+ return ;
+
+/* End of CLADIV */
+
+} /* cladiv_ */
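+
+/*
+   Illustrative sketch (not part of the LAPACK sources): SLADIV, which is
+   defined elsewhere, does the scaled division.  In this generation of
+   LAPACK it is essentially Smith's (1962) method: divide through by the
+   larger of |c| and |d| so the intermediate products cannot overflow
+   unless the quotient itself does.  A minimal sketch of that idea for
+   (a + b*i) / (c + d*i), as a hypothetical C99 helper:
+
+       #include <math.h>
+
+       static void complex_div_smith(float a, float b, float c, float d,
+                                     float *re, float *im)
+       {
+           if (fabsf(d) <= fabsf(c)) {
+               float r = d / c;                 // |r| <= 1
+               float t = 1.f / (c + d * r);     // scaled 1/(c^2 + d^2)
+               *re = (a + b * r) * t;
+               *im = (b - a * r) * t;
+           } else {
+               float r = c / d;
+               float t = 1.f / (c * r + d);
+               *re = (a * r + b) * t;
+               *im = (b * r - a) * t;
+           }
+       }
+*/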
+
+/* Subroutine */ int claed0_(integer *qsiz, integer *n, real *d__, real *e,
+ complex *q, integer *ldq, complex *qstore, integer *ldqs, real *rwork,
+ integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, qstore_dim1, qstore_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, k, ll, iq, lgn, msd2, smm1, spm1, spm2;
+ static real temp;
+ static integer curr, iperm;
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *);
+ static integer indxq, iwrem;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ static integer iqptr;
+ extern /* Subroutine */ int claed7_(integer *, integer *, integer *,
+ integer *, integer *, integer *, real *, complex *, integer *,
+ real *, integer *, real *, integer *, integer *, integer *,
+ integer *, integer *, real *, complex *, real *, integer *,
+ integer *);
+ static integer tlvls;
+ extern /* Subroutine */ int clacrm_(integer *, integer *, complex *,
+ integer *, real *, integer *, complex *, integer *, real *);
+ static integer igivcl;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer igivnm, submat, curprb, subpbs, igivpt, curlvl, matsiz,
+ iprmpt, smlsiz;
+ extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *,
+ real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ Using the divide and conquer method, CLAED0 computes all eigenvalues
+ of a symmetric tridiagonal matrix which is one diagonal block of
+ those from reducing a dense or band Hermitian matrix and
+ corresponding eigenvectors of the dense or band matrix.
+
+ Arguments
+ =========
+
+ QSIZ (input) INTEGER
+ The dimension of the unitary matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, the eigenvalues in ascending order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the off-diagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Q (input/output) COMPLEX array, dimension (LDQ,N)
+           On entry, Q must contain a QSIZ x N matrix whose columns
+           are unitarily orthonormal.  It is a part of the unitary matrix
+ that reduces the full dense Hermitian matrix to a
+ (reducible) symmetric tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ IWORK (workspace) INTEGER array,
+ the dimension of IWORK must be at least
+ 6 + 6*N + 5*N*lg N
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+
+ RWORK (workspace) REAL array,
+ dimension (1 + 3*N + 2*N*lg N + 3*N**2)
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+
+ QSTORE (workspace) COMPLEX array, dimension (LDQS, N)
+ Used to store parts of
+ the eigenvector matrix when the updating matrix multiplies
+ take place.
+
+ LDQS (input) INTEGER
+ The leading dimension of the array QSTORE.
+ LDQS >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ =====================================================================
+
+ Warning: N could be as big as QSIZ!
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ qstore_dim1 = *ldqs;
+ qstore_offset = 1 + qstore_dim1;
+ qstore -= qstore_offset;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+/*
+ IF( ICOMPQ .LT. 0 .OR. ICOMPQ .GT. 2 ) THEN
+ INFO = -1
+ ELSE IF( ( ICOMPQ .EQ. 1 ) .AND. ( QSIZ .LT. MAX( 0, N ) ) )
+ $ THEN
+*/
+ if (*qsiz < max(0,*n)) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldq < max(1,*n)) {
+ *info = -6;
+ } else if (*ldqs < max(1,*n)) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLAED0", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ smlsiz = ilaenv_(&c__9, "CLAED0", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+
+/*
+ Determine the size and placement of the submatrices, and save in
+ the leading elements of IWORK.
+*/
+
+ iwork[1] = *n;
+ subpbs = 1;
+ tlvls = 0;
+L10:
+ if (iwork[subpbs] > smlsiz) {
+ for (j = subpbs; j >= 1; --j) {
+ iwork[j * 2] = (iwork[j] + 1) / 2;
+ iwork[(j << 1) - 1] = iwork[j] / 2;
+/* L20: */
+ }
+ ++tlvls;
+ subpbs <<= 1;
+ goto L10;
+ }
+ i__1 = subpbs;
+ for (j = 2; j <= i__1; ++j) {
+ iwork[j] += iwork[j - 1];
+/* L30: */
+ }
+
+/*
+ Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1
+ using rank-1 modifications (cuts).
+*/
+
+ spm1 = subpbs - 1;
+ i__1 = spm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ submat = iwork[i__] + 1;
+ smm1 = submat - 1;
+ d__[smm1] -= (r__1 = e[smm1], dabs(r__1));
+ d__[submat] -= (r__1 = e[smm1], dabs(r__1));
+/* L40: */
+ }
+
+ indxq = (*n << 2) + 3;
+
+/*
+ Set up workspaces for eigenvalues only/accumulate new vectors
+ routine
+*/
+
+ temp = log((real) (*n)) / log(2.f);
+ lgn = (integer) temp;
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ iprmpt = indxq + *n + 1;
+ iperm = iprmpt + *n * lgn;
+ iqptr = iperm + *n * lgn;
+ igivpt = iqptr + *n + 2;
+ igivcl = igivpt + *n * lgn;
+
+ igivnm = 1;
+ iq = igivnm + (*n << 1) * lgn;
+/* Computing 2nd power */
+ i__1 = *n;
+ iwrem = iq + i__1 * i__1 + 1;
+/* Initialize pointers */
+ i__1 = subpbs;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ iwork[iprmpt + i__] = 1;
+ iwork[igivpt + i__] = 1;
+/* L50: */
+ }
+ iwork[iqptr] = 1;
+
+/*
+ Solve each submatrix eigenproblem at the bottom of the divide and
+ conquer tree.
+*/
+
+ curr = 0;
+ i__1 = spm1;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[1];
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 1] - iwork[i__];
+ }
+ ll = iq - 1 + iwork[iqptr + curr];
+ ssteqr_("I", &matsiz, &d__[submat], &e[submat], &rwork[ll], &matsiz, &
+ rwork[1], info);
+ clacrm_(qsiz, &matsiz, &q[submat * q_dim1 + 1], ldq, &rwork[ll], &
+ matsiz, &qstore[submat * qstore_dim1 + 1], ldqs, &rwork[iwrem]
+ );
+/* Computing 2nd power */
+ i__2 = matsiz;
+ iwork[iqptr + curr + 1] = iwork[iqptr + curr] + i__2 * i__2;
+ ++curr;
+ if (*info > 0) {
+ *info = submat * (*n + 1) + submat + matsiz - 1;
+ return 0;
+ }
+ k = 1;
+ i__2 = iwork[i__ + 1];
+ for (j = submat; j <= i__2; ++j) {
+ iwork[indxq + j] = k;
+ ++k;
+/* L60: */
+ }
+/* L70: */
+ }
+
+/*
+ Successively merge eigensystems of adjacent submatrices
+ into eigensystem for the corresponding larger matrix.
+
+ while ( SUBPBS > 1 )
+*/
+
+ curlvl = 1;
+L80:
+ if (subpbs > 1) {
+ spm2 = subpbs - 2;
+ i__1 = spm2;
+ for (i__ = 0; i__ <= i__1; i__ += 2) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[2];
+ msd2 = iwork[1];
+ curprb = 0;
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 2] - iwork[i__];
+ msd2 = matsiz / 2;
+ ++curprb;
+ }
+
+/*
+ Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2)
+ into an eigensystem of size MATSIZ. CLAED7 handles the case
+ when the eigenvectors of a full or band Hermitian matrix (which
+ was reduced to tridiagonal form) are desired.
+
+ I am free to use Q as a valuable working space until Loop 150.
+*/
+
+ claed7_(&matsiz, &msd2, qsiz, &tlvls, &curlvl, &curprb, &d__[
+ submat], &qstore[submat * qstore_dim1 + 1], ldqs, &e[
+ submat + msd2 - 1], &iwork[indxq + submat], &rwork[iq], &
+ iwork[iqptr], &iwork[iprmpt], &iwork[iperm], &iwork[
+ igivpt], &iwork[igivcl], &rwork[igivnm], &q[submat *
+ q_dim1 + 1], &rwork[iwrem], &iwork[subpbs + 1], info);
+ if (*info > 0) {
+ *info = submat * (*n + 1) + submat + matsiz - 1;
+ return 0;
+ }
+ iwork[i__ / 2 + 1] = iwork[i__ + 2];
+/* L90: */
+ }
+ subpbs /= 2;
+ ++curlvl;
+ goto L80;
+ }
+
+/*
+ end while
+
+ Re-merge the eigenvalues/vectors which were deflated at the final
+ merge step.
+*/
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ rwork[i__] = d__[j];
+ ccopy_(qsiz, &qstore[j * qstore_dim1 + 1], &c__1, &q[i__ * q_dim1 + 1]
+ , &c__1);
+/* L100: */
+ }
+ scopy_(n, &rwork[1], &c__1, &d__[1], &c__1);
+
+ return 0;
+
+/* End of CLAED0 */
+
+} /* claed0_ */
+
+/* Subroutine */ int claed7_(integer *n, integer *cutpnt, integer *qsiz,
+ integer *tlvls, integer *curlvl, integer *curpbm, real *d__, complex *
+ q, integer *ldq, real *rho, integer *indxq, real *qstore, integer *
+ qptr, integer *prmptr, integer *perm, integer *givptr, integer *
+ givcol, real *givnum, complex *work, real *rwork, integer *iwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, k, n1, n2, iq, iw, iz, ptr, indx, curr, indxc, indxp;
+ extern /* Subroutine */ int claed8_(integer *, integer *, integer *,
+ complex *, integer *, real *, real *, integer *, real *, real *,
+ complex *, integer *, real *, integer *, integer *, integer *,
+ integer *, integer *, integer *, real *, integer *), slaed9_(
+ integer *, integer *, integer *, integer *, real *, real *,
+ integer *, real *, real *, real *, real *, integer *, integer *),
+ slaeda_(integer *, integer *, integer *, integer *, integer *,
+ integer *, integer *, integer *, real *, real *, integer *, real *
+ , real *, integer *);
+ static integer idlmda;
+ extern /* Subroutine */ int clacrm_(integer *, integer *, complex *,
+ integer *, real *, integer *, complex *, integer *, real *),
+ xerbla_(char *, integer *), slamrg_(integer *, integer *,
+ real *, integer *, integer *, integer *);
+ static integer coltyp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLAED7 computes the updated eigensystem of a diagonal
+ matrix after modification by a rank-one symmetric matrix. This
+ routine is used only for the eigenproblem which requires all
+ eigenvalues and optionally eigenvectors of a dense or banded
+ Hermitian matrix that has been reduced to tridiagonal form.
+
+ T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out)
+
+    where Z = Q'u, u is a vector of length N with ones in the CUTPNT-th
+    and (CUTPNT + 1)-th elements and zeros elsewhere.
+
+ The eigenvectors of the original matrix are stored in Q, and the
+ eigenvalues are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple eigenvalues or if there is a zero in
+    the Z vector.  For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine SLAED2.
+
+ The second stage consists of calculating the updated
+ eigenvalues. This is done by finding the roots of the secular
+ equation via the routine SLAED4 (as called by SLAED3).
+ This routine also calculates the eigenvectors of the current
+ problem.
+
+ The final stage consists of computing the updated eigenvectors
+ directly using the updated eigenvalues. The eigenvectors for
+ the current problem are multiplied with the eigenvectors from
+ the overall problem.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ CUTPNT (input) INTEGER
+ Contains the location of the last eigenvalue in the leading
+ sub-matrix. min(1,N) <= CUTPNT <= N.
+
+ QSIZ (input) INTEGER
+ The dimension of the unitary matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N.
+
+ TLVLS (input) INTEGER
+ The total number of merging levels in the overall divide and
+ conquer tree.
+
+ CURLVL (input) INTEGER
+ The current level in the overall merge routine,
+ 0 <= curlvl <= tlvls.
+
+ CURPBM (input) INTEGER
+ The current problem in the current level in the overall
+ merge routine (counting from upper left to lower right).
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the eigenvalues of the rank-1-perturbed matrix.
+ On exit, the eigenvalues of the repaired matrix.
+
+ Q (input/output) COMPLEX array, dimension (LDQ,N)
+ On entry, the eigenvectors of the rank-1-perturbed matrix.
+ On exit, the eigenvectors of the repaired tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ RHO (input) REAL
+ Contains the subdiagonal element used to create the rank-1
+ modification.
+
+ INDXQ (output) INTEGER array, dimension (N)
+ This contains the permutation which will reintegrate the
+ subproblem just solved back into sorted order,
+           i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ RWORK (workspace) REAL array,
+ dimension (3*N+2*QSIZ*N)
+
+ WORK (workspace) COMPLEX array, dimension (QSIZ*N)
+
+ QSTORE (input/output) REAL array, dimension (N**2+1)
+ Stores eigenvectors of submatrices encountered during
+ divide and conquer, packed together. QPTR points to
+ beginning of the submatrices.
+
+ QPTR (input/output) INTEGER array, dimension (N+2)
+ List of indices pointing to beginning of submatrices stored
+ in QSTORE. The submatrices are numbered starting at the
+ bottom left of the divide and conquer tree, from left to
+ right and bottom to top.
+
+ PRMPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in PERM a
+ level's permutation is stored. PRMPTR(i+1) - PRMPTR(i)
+ indicates the size of the permutation and also the size of
+ the full, non-deflated problem.
+
+ PERM (input) INTEGER array, dimension (N lg N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in GIVCOL a
+ level's Givens rotations are stored. GIVPTR(i+1) - GIVPTR(i)
+ indicates the number of Givens rotations.
+
+ GIVCOL (input) INTEGER array, dimension (2, N lg N)
+           Each pair of numbers indicates a pair of columns to take part
+ in a Givens rotation.
+
+ GIVNUM (input) REAL array, dimension (2, N lg N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --qstore;
+ --qptr;
+ --prmptr;
+ --perm;
+ --givptr;
+ givcol -= 3;
+ givnum -= 3;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+/*
+ IF( ICOMPQ.LT.0 .OR. ICOMPQ.GT.1 ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 ) THEN
+*/
+ if (*n < 0) {
+ *info = -1;
+ } else if (min(1,*n) > *cutpnt || *n < *cutpnt) {
+ *info = -2;
+ } else if (*qsiz < *n) {
+ *info = -3;
+ } else if (*ldq < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLAED7", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in SLAED2 and SLAED3.
+*/
+
+ iz = 1;
+ idlmda = iz + *n;
+ iw = idlmda + *n;
+ iq = iw + *n;
+
+ indx = 1;
+ indxc = indx + *n;
+ coltyp = indxc + *n;
+ indxp = coltyp + *n;
+
+/*
+ Form the z-vector which consists of the last row of Q_1 and the
+ first row of Q_2.
+*/
+
+ ptr = pow_ii(&c__2, tlvls) + 1;
+ i__1 = *curlvl - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *tlvls - i__;
+ ptr += pow_ii(&c__2, &i__2);
+/* L10: */
+ }
+ curr = ptr + *curpbm;
+ slaeda_(n, tlvls, curlvl, curpbm, &prmptr[1], &perm[1], &givptr[1], &
+ givcol[3], &givnum[3], &qstore[1], &qptr[1], &rwork[iz], &rwork[
+ iz + *n], info);
+
+/*
+ When solving the final problem, we no longer need the stored data,
+ so we will overwrite the data from this level onto the previously
+ used storage space.
+*/
+
+ if (*curlvl == *tlvls) {
+ qptr[curr] = 1;
+ prmptr[curr] = 1;
+ givptr[curr] = 1;
+ }
+
+/* Sort and Deflate eigenvalues. */
+
+ claed8_(&k, n, qsiz, &q[q_offset], ldq, &d__[1], rho, cutpnt, &rwork[iz],
+ &rwork[idlmda], &work[1], qsiz, &rwork[iw], &iwork[indxp], &iwork[
+ indx], &indxq[1], &perm[prmptr[curr]], &givptr[curr + 1], &givcol[
+ (givptr[curr] << 1) + 1], &givnum[(givptr[curr] << 1) + 1], info);
+ prmptr[curr + 1] = prmptr[curr] + *n;
+ givptr[curr + 1] += givptr[curr];
+
+/* Solve Secular Equation. */
+
+ if (k != 0) {
+ slaed9_(&k, &c__1, &k, n, &d__[1], &rwork[iq], &k, rho, &rwork[idlmda]
+ , &rwork[iw], &qstore[qptr[curr]], &k, info);
+ clacrm_(qsiz, &k, &work[1], qsiz, &qstore[qptr[curr]], &k, &q[
+ q_offset], ldq, &rwork[iq]);
+/* Computing 2nd power */
+ i__1 = k;
+ qptr[curr + 1] = qptr[curr] + i__1 * i__1;
+ if (*info != 0) {
+ return 0;
+ }
+
+/* Prepare the INDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = *n - k;
+ slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &indxq[1]);
+ } else {
+ qptr[curr + 1] = qptr[curr];
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indxq[i__] = i__;
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of CLAED7 */
+
+} /* claed7_ */
+
+/* Subroutine */ int claed8_(integer *k, integer *n, integer *qsiz, complex *
+ q, integer *ldq, real *d__, real *rho, integer *cutpnt, real *z__,
+ real *dlamda, complex *q2, integer *ldq2, real *w, integer *indxp,
+ integer *indx, integer *indxq, integer *perm, integer *givptr,
+ integer *givcol, real *givnum, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, q2_dim1, q2_offset, i__1;
+ real r__1;
+
+ /* Local variables */
+ static real c__;
+ static integer i__, j;
+ static real s, t;
+ static integer k2, n1, n2, jp, n1p1;
+ static real eps, tau, tol;
+ static integer jlam, imax, jmax;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ ccopy_(integer *, complex *, integer *, complex *, integer *),
+ csrot_(integer *, complex *, integer *, complex *, integer *,
+ real *, real *), scopy_(integer *, real *, integer *, real *,
+ integer *);
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex
+ *, integer *, complex *, integer *), xerbla_(char *,
+ integer *);
+ extern integer isamax_(integer *, real *, integer *);
+ extern /* Subroutine */ int slamrg_(integer *, integer *, real *, integer
+ *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ CLAED8 merges the two sets of eigenvalues together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ eigenvalues are close together or if there is a tiny element in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ Arguments
+ =========
+
+ K (output) INTEGER
+ Contains the number of non-deflated eigenvalues.
+ This is the order of the related secular equation.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ QSIZ (input) INTEGER
+ The dimension of the unitary matrix used to reduce
+ the dense or band matrix to tridiagonal form.
+ QSIZ >= N if ICOMPQ = 1.
+
+ Q (input/output) COMPLEX array, dimension (LDQ,N)
+ On entry, Q contains the eigenvectors of the partially solved
+ system which has been previously updated in matrix
+ multiplies with other partially solved eigensystems.
+ On exit, Q contains the trailing (N-K) updated eigenvectors
+ (those which were deflated) in its last N-K columns.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max( 1, N ).
+
+ D (input/output) REAL array, dimension (N)
+ On entry, D contains the eigenvalues of the two submatrices to
+ be combined. On exit, D contains the trailing (N-K) updated
+ eigenvalues (those which were deflated) sorted into increasing
+ order.
+
+ RHO (input/output) REAL
+ Contains the off diagonal element associated with the rank-1
+ cut which originally split the two submatrices which are now
+ being recombined. RHO is modified during the computation to
+ the value required by SLAED3.
+
+ CUTPNT (input) INTEGER
+ Contains the location of the last eigenvalue in the leading
+ sub-matrix. MIN(1,N) <= CUTPNT <= N.
+
+ Z (input) REAL array, dimension (N)
+ On input this vector contains the updating vector (the last
+ row of the first sub-eigenvector matrix and the first row of
+ the second sub-eigenvector matrix). The contents of Z are
+ destroyed during the updating process.
+
+ DLAMDA (output) REAL array, dimension (N)
+ Contains a copy of the first K eigenvalues which will be used
+ by SLAED3 to form the secular equation.
+
+ Q2 (output) COMPLEX array, dimension (LDQ2,N)
+ If ICOMPQ = 0, Q2 is not referenced. Otherwise,
+ Contains a copy of the first K eigenvectors which will be used
+ by SLAED7 in a matrix multiply (SGEMM) to update the new
+ eigenvectors.
+
+ LDQ2 (input) INTEGER
+ The leading dimension of the array Q2. LDQ2 >= max( 1, N ).
+
+ W (output) REAL array, dimension (N)
+ This will hold the first k values of the final
+ deflation-altered z-vector and will be passed to SLAED3.
+
+ INDXP (workspace) INTEGER array, dimension (N)
+ This will contain the permutation used to place deflated
+ values of D at the end of the array. On output INDXP(1:K)
+ points to the nondeflated D-values and INDXP(K+1:N)
+ points to the deflated eigenvalues.
+
+ INDX (workspace) INTEGER array, dimension (N)
+ This will contain the permutation used to sort the contents of
+ D into ascending order.
+
+ INDXQ (input) INTEGER array, dimension (N)
+ This contains the permutation which separately sorts the two
+ sub-problems in D into ascending order. Note that elements in
+ the second half of this permutation must first have CUTPNT
+ added to their values in order to be accurate.
+
+ PERM (output) INTEGER array, dimension (N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (output) INTEGER
+ Contains the number of Givens rotations which took place in
+ this subproblem.
+
+ GIVCOL (output) INTEGER array, dimension (2, N)
+           Each pair of numbers indicates a pair of columns to take part
+ in a Givens rotation.
+
+ GIVNUM (output) REAL array, dimension (2, N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --d__;
+ --z__;
+ --dlamda;
+ q2_dim1 = *ldq2;
+ q2_offset = 1 + q2_dim1;
+ q2 -= q2_offset;
+ --w;
+ --indxp;
+ --indx;
+ --indxq;
+ --perm;
+ givcol -= 3;
+ givnum -= 3;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -2;
+ } else if (*qsiz < *n) {
+ *info = -3;
+ } else if (*ldq < max(1,*n)) {
+ *info = -5;
+ } else if (*cutpnt < min(1,*n) || *cutpnt > *n) {
+ *info = -8;
+ } else if (*ldq2 < max(1,*n)) {
+ *info = -12;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLAED8", &i__1);
+ return 0;
+ }
+
+/*
+   Need to initialize GIVPTR to 0 here in case of a quick exit
+   to prevent unspecified code behavior (usually a segfault)
+   when the IWORK array on entry to *stedc is not zeroed
+   (or at least those IWORK entries which are used in *laed7 for GIVPTR).
+*/
+
+ *givptr = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ n1 = *cutpnt;
+ n2 = *n - n1;
+ n1p1 = n1 + 1;
+
+ if (*rho < 0.f) {
+ sscal_(&n2, &c_b1276, &z__[n1p1], &c__1);
+ }
+
+/* Normalize z so that norm(z) = 1 */
+
+ t = 1.f / sqrt(2.f);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ indx[j] = j;
+/* L10: */
+ }
+ sscal_(n, &t, &z__[1], &c__1);
+ *rho = (r__1 = *rho * 2.f, dabs(r__1));
+
+/* Sort the eigenvalues into increasing order */
+
+ i__1 = *n;
+ for (i__ = *cutpnt + 1; i__ <= i__1; ++i__) {
+ indxq[i__] += *cutpnt;
+/* L20: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = d__[indxq[i__]];
+ w[i__] = z__[indxq[i__]];
+/* L30: */
+ }
+ i__ = 1;
+ j = *cutpnt + 1;
+ slamrg_(&n1, &n2, &dlamda[1], &c__1, &c__1, &indx[1]);
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = dlamda[indx[i__]];
+ z__[i__] = w[indx[i__]];
+/* L40: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ imax = isamax_(n, &z__[1], &c__1);
+ jmax = isamax_(n, &d__[1], &c__1);
+ eps = slamch_("Epsilon");
+ tol = eps * 8.f * (r__1 = d__[jmax], dabs(r__1));
+
+/*
+ If the rank-1 modifier is small enough, no more needs to be done
+ -- except to reorganize Q so that its columns correspond with the
+ elements in D.
+*/
+
+ if (*rho * (r__1 = z__[imax], dabs(r__1)) <= tol) {
+ *k = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ perm[j] = indxq[indx[j]];
+ ccopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1 + 1]
+ , &c__1);
+/* L50: */
+ }
+ clacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq);
+ return 0;
+ }
+
+/*
+ If there are multiple eigenvalues then the problem deflates. Here
+ the number of equal eigenvalues are found. As each equal
+ eigenvalue is found, an elementary reflector is computed to rotate
+ the corresponding eigensubspace so that the corresponding
+ components of Z are zero in this new basis.
+*/
+
+ *k = 0;
+ k2 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*rho * (r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ if (j == *n) {
+ goto L100;
+ }
+ } else {
+ jlam = j;
+ goto L70;
+ }
+/* L60: */
+ }
+L70:
+ ++j;
+ if (j > *n) {
+ goto L90;
+ }
+ if (*rho * (r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ } else {
+
+/* Check if eigenvalues are close enough to allow deflation. */
+
+ s = z__[jlam];
+ c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = slapy2_(&c__, &s);
+ t = d__[j] - d__[jlam];
+ c__ /= tau;
+ s = -s / tau;
+ if ((r__1 = t * c__ * s, dabs(r__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ z__[j] = tau;
+ z__[jlam] = 0.f;
+
+/* Record the appropriate Givens rotation */
+
+ ++(*givptr);
+ givcol[(*givptr << 1) + 1] = indxq[indx[jlam]];
+ givcol[(*givptr << 1) + 2] = indxq[indx[j]];
+ givnum[(*givptr << 1) + 1] = c__;
+ givnum[(*givptr << 1) + 2] = s;
+ csrot_(qsiz, &q[indxq[indx[jlam]] * q_dim1 + 1], &c__1, &q[indxq[
+ indx[j]] * q_dim1 + 1], &c__1, &c__, &s);
+ t = d__[jlam] * c__ * c__ + d__[j] * s * s;
+ d__[j] = d__[jlam] * s * s + d__[j] * c__ * c__;
+ d__[jlam] = t;
+ --k2;
+ i__ = 1;
+L80:
+ if (k2 + i__ <= *n) {
+ if (d__[jlam] < d__[indxp[k2 + i__]]) {
+ indxp[k2 + i__ - 1] = indxp[k2 + i__];
+ indxp[k2 + i__] = jlam;
+ ++i__;
+ goto L80;
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ jlam = j;
+ } else {
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+ jlam = j;
+ }
+ }
+ goto L70;
+L90:
+
+/* Record the last eigenvalue. */
+
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+
+L100:
+
+/*
+ Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+ and Q2 respectively. The eigenvalues/vectors which were not
+ deflated go into the first K slots of DLAMDA and Q2 respectively,
+ while those which were deflated go into the last N - K slots.
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ jp = indxp[j];
+ dlamda[j] = d__[jp];
+ perm[j] = indxq[indx[jp]];
+ ccopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1 + 1], &
+ c__1);
+/* L110: */
+ }
+
+/*
+ The deflated eigenvalues and their corresponding vectors go back
+ into the last N - K slots of D and Q respectively.
+*/
+
+ if (*k < *n) {
+ i__1 = *n - *k;
+ scopy_(&i__1, &dlamda[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ i__1 = *n - *k;
+ clacpy_("A", qsiz, &i__1, &q2[(*k + 1) * q2_dim1 + 1], ldq2, &q[(*k +
+ 1) * q_dim1 + 1], ldq);
+ }
+
+ return 0;
+
+/* End of CLAED8 */
+
+} /* claed8_ */
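+
+/*
+   Illustrative sketch (not part of the LAPACK sources): the pair
+   deflation above rotates two columns so that one z component becomes
+   zero, and accepts the deflation when the rotated off-diagonal
+   |t*c*s| (with t = d2 - d1) falls below the tolerance.  Just the 2x2
+   bookkeeping, as a hypothetical C99 helper that returns 1 when the
+   pair deflates (the caller would still have to rotate the matching
+   eigenvector columns, as CSROT does above):
+
+       #include <math.h>
+
+       static int deflate_pair(float *d1, float *d2,
+                               float *z1, float *z2, float tol)
+       {
+           float s = *z1, c = *z2;
+           float tau = hypotf(c, s);         // sqrt(z1^2 + z2^2) safely
+           float t = *d2 - *d1;
+           c /= tau;
+           s = -s / tau;
+           if (fabsf(t * c * s) > tol)
+               return 0;                     // pair is not close enough
+           float dd1 = *d1, dd2 = *d2;
+           *z2 = tau;                        // rotated z is (0, tau)
+           *z1 = 0.f;
+           *d1 = dd1 * c * c + dd2 * s * s;  // rotated diagonal entries
+           *d2 = dd1 * s * s + dd2 * c * c;
+           return 1;
+       }
+*/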
+
+/* Subroutine */ int clahqr_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w,
+ integer *iloz, integer *ihiz, complex *z__, integer *ldz, integer *
+ info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+ complex q__1, q__2, q__3, q__4, q__5, q__6, q__7;
+
+ /* Local variables */
+ static integer i__, j, k, l, m;
+ static real s;
+ static complex t, u, v[2], x, y;
+ static integer i1, i2;
+ static complex t1;
+ static real t2;
+ static complex v2;
+ static real aa, ab, ba, bb, h10;
+ static complex h11;
+ static real h21;
+ static complex h22, sc;
+ static integer nh, nz;
+ static real sx;
+ static integer jhi;
+ static complex h11s;
+ static integer jlo, its;
+ static real ulp;
+ static complex sum;
+ static real tst;
+ static complex temp;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *), ccopy_(integer *, complex *, integer *, complex *,
+ integer *);
+ static real rtemp;
+ extern /* Subroutine */ int slabad_(real *, real *), clarfg_(integer *,
+ complex *, complex *, integer *, complex *);
+ extern /* Complex */ VOID cladiv_(complex *, complex *, complex *);
+ extern doublereal slamch_(char *);
+ static real safmin, safmax, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLAHQR is an auxiliary routine called by CHSEQR to update the
+ eigenvalues and Schur decomposition already computed by CHSEQR, by
+ dealing with the Hessenberg submatrix in rows and columns ILO to
+ IHI.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows and
+ columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless ILO = 1).
+ CLAHQR works primarily with the Hessenberg submatrix in rows
+ and columns ILO to IHI, but applies transformations to all of
+ H if WANTT is .TRUE..
+ 1 <= ILO <= max(1,IHI); IHI <= N.
+
+ H (input/output) COMPLEX array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO is zero and if WANTT is .TRUE., then H
+ is upper triangular in rows and columns ILO:IHI. If INFO
+ is zero and if WANTT is .FALSE., then the contents of H
+ are unspecified on exit. The output state of H in case
+           INFO is positive is given below under the description of INFO.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH >= max(1,N).
+
+ W (output) COMPLEX array, dimension (N)
+ The computed eigenvalues ILO to IHI are stored in the
+ corresponding elements of W. If WANTT is .TRUE., the
+ eigenvalues are stored in the same order as on the diagonal
+ of the Schur form returned in H, with W(i) = H(i,i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
+
+ Z (input/output) COMPLEX array, dimension (LDZ,N)
+ If WANTZ is .TRUE., on entry Z must contain the current
+ matrix Z of transformations accumulated by CHSEQR, and on
+ exit Z has been updated; transformations are applied only to
+ the submatrix Z(ILOZ:IHIZ,ILO:IHI).
+ If WANTZ is .FALSE., Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, CLAHQR failed to compute all the
+ eigenvalues ILO to IHI in a total of 30 iterations
+ per eigenvalue; elements i+1:ihi of W contain
+ those eigenvalues which have been successfully
+ computed.
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the
+ eigenvalues of the upper Hessenberg matrix
+                rows and columns ILO through INFO of the final,
+ output value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+ (*) (initial value of H)*U = U*(final value of H)
+                where U is an orthogonal matrix.  The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+ (final value of Z) = (initial value of Z)*U
+ where U is the orthogonal matrix in (*)
+ (regardless of the value of WANTT.)
+
+ Further Details
+ ===============
+
+ 02-96 Based on modifications by
+ David Day, Sandia National Laboratory, USA
+
+ 12-04 Further modifications by
+ Ralph Byers, University of Kansas, USA
+ This is a modified version of CLAHQR from LAPACK version 3.0.
+ It is (1) more robust against overflow and underflow and
+ (2) adopts the more conservative Ahues & Tisseur stopping
+ criterion (LAWN 122, 1997).
+
+ =========================================================
+*/
+
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*ilo == *ihi) {
+ i__1 = *ilo;
+ i__2 = *ilo + *ilo * h_dim1;
+ w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i;
+ return 0;
+ }
+
+/* ==== clear out the trash ==== */
+ i__1 = *ihi - 3;
+ for (j = *ilo; j <= i__1; ++j) {
+ i__2 = j + 2 + j * h_dim1;
+ h__[i__2].r = 0.f, h__[i__2].i = 0.f;
+ i__2 = j + 3 + j * h_dim1;
+ h__[i__2].r = 0.f, h__[i__2].i = 0.f;
+/* L10: */
+ }
+ if (*ilo <= *ihi - 2) {
+ i__1 = *ihi + (*ihi - 2) * h_dim1;
+ h__[i__1].r = 0.f, h__[i__1].i = 0.f;
+ }
+/* ==== ensure that subdiagonal entries are real ==== */
+ if (*wantt) {
+ jlo = 1;
+ jhi = *n;
+ } else {
+ jlo = *ilo;
+ jhi = *ihi;
+ }
+ i__1 = *ihi;
+ for (i__ = *ilo + 1; i__ <= i__1; ++i__) {
+ if (r_imag(&h__[i__ + (i__ - 1) * h_dim1]) != 0.f) {
+/*
+ ==== The following redundant normalization
+ . avoids problems with both gradual and
+ . sudden underflow in ABS(H(I,I-1)) ====
+*/
+ i__2 = i__ + (i__ - 1) * h_dim1;
+ i__3 = i__ + (i__ - 1) * h_dim1;
+ r__3 = (r__1 = h__[i__3].r, dabs(r__1)) + (r__2 = r_imag(&h__[i__
+ + (i__ - 1) * h_dim1]), dabs(r__2));
+ q__1.r = h__[i__2].r / r__3, q__1.i = h__[i__2].i / r__3;
+ sc.r = q__1.r, sc.i = q__1.i;
+ r_cnjg(&q__2, &sc);
+ r__1 = c_abs(&sc);
+ q__1.r = q__2.r / r__1, q__1.i = q__2.i / r__1;
+ sc.r = q__1.r, sc.i = q__1.i;
+ i__2 = i__ + (i__ - 1) * h_dim1;
+ r__1 = c_abs(&h__[i__ + (i__ - 1) * h_dim1]);
+ h__[i__2].r = r__1, h__[i__2].i = 0.f;
+ i__2 = jhi - i__ + 1;
+ cscal_(&i__2, &sc, &h__[i__ + i__ * h_dim1], ldh);
+/* Computing MIN */
+ i__3 = jhi, i__4 = i__ + 1;
+ i__2 = min(i__3,i__4) - jlo + 1;
+ r_cnjg(&q__1, &sc);
+ cscal_(&i__2, &q__1, &h__[jlo + i__ * h_dim1], &c__1);
+ if (*wantz) {
+ i__2 = *ihiz - *iloz + 1;
+ r_cnjg(&q__1, &sc);
+ cscal_(&i__2, &q__1, &z__[*iloz + i__ * z_dim1], &c__1);
+ }
+ }
+/* L20: */
+ }
+
+ nh = *ihi - *ilo + 1;
+ nz = *ihiz - *iloz + 1;
+
+/* Set machine-dependent constants for the stopping criterion. */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) nh / ulp);
+
+/*
+ I1 and I2 are the indices of the first row and last column of H
+ to which transformations must be applied. If eigenvalues only are
+ being computed, I1 and I2 are set inside the main loop.
+*/
+
+ if (*wantt) {
+ i1 = 1;
+ i2 = *n;
+ }
+
+/*
+ The main loop begins here. I is the loop index and decreases from
+ IHI to ILO in steps of 1. Each iteration of the loop works
+ with the active submatrix in rows and columns L to I.
+ Eigenvalues I+1 to IHI have already converged. Either L = ILO, or
+ H(L,L-1) is negligible so that the matrix splits.
+*/
+
+ i__ = *ihi;
+L30:
+ if (i__ < *ilo) {
+ goto L150;
+ }
+
+/*
+ Perform QR iterations on rows and columns ILO to I until a
+ submatrix of order 1 splits off at the bottom because a
+ subdiagonal element has become negligible.
+*/
+
+ l = *ilo;
+ for (its = 0; its <= 30; ++its) {
+
+/* Look for a single small subdiagonal element. */
+
+ i__1 = l + 1;
+ for (k = i__; k >= i__1; --k) {
+ i__2 = k + (k - 1) * h_dim1;
+ if ((r__1 = h__[i__2].r, dabs(r__1)) + (r__2 = r_imag(&h__[k + (k
+ - 1) * h_dim1]), dabs(r__2)) <= smlnum) {
+ goto L50;
+ }
+ i__2 = k - 1 + (k - 1) * h_dim1;
+ i__3 = k + k * h_dim1;
+ tst = (r__1 = h__[i__2].r, dabs(r__1)) + (r__2 = r_imag(&h__[k -
+ 1 + (k - 1) * h_dim1]), dabs(r__2)) + ((r__3 = h__[i__3]
+ .r, dabs(r__3)) + (r__4 = r_imag(&h__[k + k * h_dim1]),
+ dabs(r__4)));
+ if (tst == 0.f) {
+ if (k - 2 >= *ilo) {
+ i__2 = k - 1 + (k - 2) * h_dim1;
+ tst += (r__1 = h__[i__2].r, dabs(r__1));
+ }
+ if (k + 1 <= *ihi) {
+ i__2 = k + 1 + k * h_dim1;
+ tst += (r__1 = h__[i__2].r, dabs(r__1));
+ }
+ }
+/*
+ ==== The following is a conservative small subdiagonal
+ . deflation criterion due to Ahues & Tisseur (LAWN 122,
+ . 1997). It has better mathematical foundation and
+ . improves accuracy in some examples. ====
+*/
+ i__2 = k + (k - 1) * h_dim1;
+ if ((r__1 = h__[i__2].r, dabs(r__1)) <= ulp * tst) {
+/* Computing MAX */
+ i__2 = k + (k - 1) * h_dim1;
+ i__3 = k - 1 + k * h_dim1;
+ r__5 = (r__1 = h__[i__2].r, dabs(r__1)) + (r__2 = r_imag(&h__[
+ k + (k - 1) * h_dim1]), dabs(r__2)), r__6 = (r__3 =
+ h__[i__3].r, dabs(r__3)) + (r__4 = r_imag(&h__[k - 1
+ + k * h_dim1]), dabs(r__4));
+ ab = dmax(r__5,r__6);
+/* Computing MIN */
+ i__2 = k + (k - 1) * h_dim1;
+ i__3 = k - 1 + k * h_dim1;
+ r__5 = (r__1 = h__[i__2].r, dabs(r__1)) + (r__2 = r_imag(&h__[
+ k + (k - 1) * h_dim1]), dabs(r__2)), r__6 = (r__3 =
+ h__[i__3].r, dabs(r__3)) + (r__4 = r_imag(&h__[k - 1
+ + k * h_dim1]), dabs(r__4));
+ ba = dmin(r__5,r__6);
+ i__2 = k - 1 + (k - 1) * h_dim1;
+ i__3 = k + k * h_dim1;
+ q__2.r = h__[i__2].r - h__[i__3].r, q__2.i = h__[i__2].i -
+ h__[i__3].i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+/* Computing MAX */
+ i__4 = k + k * h_dim1;
+ r__5 = (r__1 = h__[i__4].r, dabs(r__1)) + (r__2 = r_imag(&h__[
+ k + k * h_dim1]), dabs(r__2)), r__6 = (r__3 = q__1.r,
+ dabs(r__3)) + (r__4 = r_imag(&q__1), dabs(r__4));
+ aa = dmax(r__5,r__6);
+ i__2 = k - 1 + (k - 1) * h_dim1;
+ i__3 = k + k * h_dim1;
+ q__2.r = h__[i__2].r - h__[i__3].r, q__2.i = h__[i__2].i -
+ h__[i__3].i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+/* Computing MIN */
+ i__4 = k + k * h_dim1;
+ r__5 = (r__1 = h__[i__4].r, dabs(r__1)) + (r__2 = r_imag(&h__[
+ k + k * h_dim1]), dabs(r__2)), r__6 = (r__3 = q__1.r,
+ dabs(r__3)) + (r__4 = r_imag(&q__1), dabs(r__4));
+ bb = dmin(r__5,r__6);
+ s = aa + ab;
+/* Computing MAX */
+ r__1 = smlnum, r__2 = ulp * (bb * (aa / s));
+ if (ba * (ab / s) <= dmax(r__1,r__2)) {
+ goto L50;
+ }
+ }
+/* L40: */
+ }
+L50:
+ l = k;
+ if (l > *ilo) {
+
+/* H(L,L-1) is negligible */
+
+ i__1 = l + (l - 1) * h_dim1;
+ h__[i__1].r = 0.f, h__[i__1].i = 0.f;
+ }
+
+/* Exit from loop if a submatrix of order 1 has split off. */
+
+ if (l >= i__) {
+ goto L140;
+ }
+
+/*
+ Now the active submatrix is in rows and columns L to I. If
+ eigenvalues only are being computed, only the active submatrix
+ need be transformed.
+*/
+
+ if (! (*wantt)) {
+ i1 = l;
+ i2 = i__;
+ }
+
+ if (its == 10) {
+
+/* Exceptional shift. */
+
+ i__1 = l + 1 + l * h_dim1;
+ s = (r__1 = h__[i__1].r, dabs(r__1)) * .75f;
+ i__1 = l + l * h_dim1;
+ q__1.r = s + h__[i__1].r, q__1.i = h__[i__1].i;
+ t.r = q__1.r, t.i = q__1.i;
+ } else if (its == 20) {
+
+/* Exceptional shift. */
+
+ i__1 = i__ + (i__ - 1) * h_dim1;
+ s = (r__1 = h__[i__1].r, dabs(r__1)) * .75f;
+ i__1 = i__ + i__ * h_dim1;
+ q__1.r = s + h__[i__1].r, q__1.i = h__[i__1].i;
+ t.r = q__1.r, t.i = q__1.i;
+ } else {
+
+/* Wilkinson's shift. */
+
+ i__1 = i__ + i__ * h_dim1;
+ t.r = h__[i__1].r, t.i = h__[i__1].i;
+ c_sqrt(&q__2, &h__[i__ - 1 + i__ * h_dim1]);
+ c_sqrt(&q__3, &h__[i__ + (i__ - 1) * h_dim1]);
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = q__2.r *
+ q__3.i + q__2.i * q__3.r;
+ u.r = q__1.r, u.i = q__1.i;
+ s = (r__1 = u.r, dabs(r__1)) + (r__2 = r_imag(&u), dabs(r__2));
+ if (s != 0.f) {
+ i__1 = i__ - 1 + (i__ - 1) * h_dim1;
+ q__2.r = h__[i__1].r - t.r, q__2.i = h__[i__1].i - t.i;
+ q__1.r = q__2.r * .5f, q__1.i = q__2.i * .5f;
+ x.r = q__1.r, x.i = q__1.i;
+ sx = (r__1 = x.r, dabs(r__1)) + (r__2 = r_imag(&x), dabs(r__2)
+ );
+/* Computing MAX */
+ r__3 = s, r__4 = (r__1 = x.r, dabs(r__1)) + (r__2 = r_imag(&x)
+ , dabs(r__2));
+ s = dmax(r__3,r__4);
+ q__5.r = x.r / s, q__5.i = x.i / s;
+ pow_ci(&q__4, &q__5, &c__2);
+ q__7.r = u.r / s, q__7.i = u.i / s;
+ pow_ci(&q__6, &q__7, &c__2);
+ q__3.r = q__4.r + q__6.r, q__3.i = q__4.i + q__6.i;
+ c_sqrt(&q__2, &q__3);
+ q__1.r = s * q__2.r, q__1.i = s * q__2.i;
+ y.r = q__1.r, y.i = q__1.i;
+ if (sx > 0.f) {
+ q__1.r = x.r / sx, q__1.i = x.i / sx;
+ q__2.r = x.r / sx, q__2.i = x.i / sx;
+ if (q__1.r * y.r + r_imag(&q__2) * r_imag(&y) < 0.f) {
+ q__3.r = -y.r, q__3.i = -y.i;
+ y.r = q__3.r, y.i = q__3.i;
+ }
+ }
+ q__4.r = x.r + y.r, q__4.i = x.i + y.i;
+ cladiv_(&q__3, &u, &q__4);
+ q__2.r = u.r * q__3.r - u.i * q__3.i, q__2.i = u.r * q__3.i +
+ u.i * q__3.r;
+ q__1.r = t.r - q__2.r, q__1.i = t.i - q__2.i;
+ t.r = q__1.r, t.i = q__1.i;
+ }
+ }
+
+/* Look for two consecutive small subdiagonal elements. */
+
+ i__1 = l + 1;
+ for (m = i__ - 1; m >= i__1; --m) {
+
+/*
+ Determine the effect of starting the single-shift QR
+ iteration at row M, and see if this would make H(M,M-1)
+ negligible.
+*/
+
+ i__2 = m + m * h_dim1;
+ h11.r = h__[i__2].r, h11.i = h__[i__2].i;
+ i__2 = m + 1 + (m + 1) * h_dim1;
+ h22.r = h__[i__2].r, h22.i = h__[i__2].i;
+ q__1.r = h11.r - t.r, q__1.i = h11.i - t.i;
+ h11s.r = q__1.r, h11s.i = q__1.i;
+ i__2 = m + 1 + m * h_dim1;
+ h21 = h__[i__2].r;
+ s = (r__1 = h11s.r, dabs(r__1)) + (r__2 = r_imag(&h11s), dabs(
+ r__2)) + dabs(h21);
+ q__1.r = h11s.r / s, q__1.i = h11s.i / s;
+ h11s.r = q__1.r, h11s.i = q__1.i;
+ h21 /= s;
+ v[0].r = h11s.r, v[0].i = h11s.i;
+ v[1].r = h21, v[1].i = 0.f;
+ i__2 = m + (m - 1) * h_dim1;
+ h10 = h__[i__2].r;
+ if (dabs(h10) * dabs(h21) <= ulp * (((r__1 = h11s.r, dabs(r__1))
+ + (r__2 = r_imag(&h11s), dabs(r__2))) * ((r__3 = h11.r,
+ dabs(r__3)) + (r__4 = r_imag(&h11), dabs(r__4)) + ((r__5 =
+ h22.r, dabs(r__5)) + (r__6 = r_imag(&h22), dabs(r__6)))))
+ ) {
+ goto L70;
+ }
+/* L60: */
+ }
+ i__1 = l + l * h_dim1;
+ h11.r = h__[i__1].r, h11.i = h__[i__1].i;
+ i__1 = l + 1 + (l + 1) * h_dim1;
+ h22.r = h__[i__1].r, h22.i = h__[i__1].i;
+ q__1.r = h11.r - t.r, q__1.i = h11.i - t.i;
+ h11s.r = q__1.r, h11s.i = q__1.i;
+ i__1 = l + 1 + l * h_dim1;
+ h21 = h__[i__1].r;
+ s = (r__1 = h11s.r, dabs(r__1)) + (r__2 = r_imag(&h11s), dabs(r__2))
+ + dabs(h21);
+ q__1.r = h11s.r / s, q__1.i = h11s.i / s;
+ h11s.r = q__1.r, h11s.i = q__1.i;
+ h21 /= s;
+ v[0].r = h11s.r, v[0].i = h11s.i;
+ v[1].r = h21, v[1].i = 0.f;
+L70:
+
+/* Single-shift QR step */
+
+ i__1 = i__ - 1;
+ for (k = m; k <= i__1; ++k) {
+
+/*
+ The first iteration of this loop determines a reflection G
+ from the vector V and applies it from left and right to H,
+ thus creating a nonzero bulge below the subdiagonal.
+
+ Each subsequent iteration determines a reflection G to
+ restore the Hessenberg form in the (K-1)th column, and thus
+ chases the bulge one step toward the bottom of the active
+ submatrix.
+
+ V(2) is always real before the call to CLARFG, and hence
+ after the call T2 ( = T1*V(2) ) is also real.
+*/
+
+ if (k > m) {
+ ccopy_(&c__2, &h__[k + (k - 1) * h_dim1], &c__1, v, &c__1);
+ }
+ clarfg_(&c__2, v, &v[1], &c__1, &t1);
+ if (k > m) {
+ i__2 = k + (k - 1) * h_dim1;
+ h__[i__2].r = v[0].r, h__[i__2].i = v[0].i;
+ i__2 = k + 1 + (k - 1) * h_dim1;
+ h__[i__2].r = 0.f, h__[i__2].i = 0.f;
+ }
+ v2.r = v[1].r, v2.i = v[1].i;
+ q__1.r = t1.r * v2.r - t1.i * v2.i, q__1.i = t1.r * v2.i + t1.i *
+ v2.r;
+ t2 = q__1.r;
+
+/*
+ Apply G from the left to transform the rows of the matrix
+ in columns K to I2.
+*/
+
+ i__2 = i2;
+ for (j = k; j <= i__2; ++j) {
+ r_cnjg(&q__3, &t1);
+ i__3 = k + j * h_dim1;
+ q__2.r = q__3.r * h__[i__3].r - q__3.i * h__[i__3].i, q__2.i =
+ q__3.r * h__[i__3].i + q__3.i * h__[i__3].r;
+ i__4 = k + 1 + j * h_dim1;
+ q__4.r = t2 * h__[i__4].r, q__4.i = t2 * h__[i__4].i;
+ q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
+ sum.r = q__1.r, sum.i = q__1.i;
+ i__3 = k + j * h_dim1;
+ i__4 = k + j * h_dim1;
+ q__1.r = h__[i__4].r - sum.r, q__1.i = h__[i__4].i - sum.i;
+ h__[i__3].r = q__1.r, h__[i__3].i = q__1.i;
+ i__3 = k + 1 + j * h_dim1;
+ i__4 = k + 1 + j * h_dim1;
+ q__2.r = sum.r * v2.r - sum.i * v2.i, q__2.i = sum.r * v2.i +
+ sum.i * v2.r;
+ q__1.r = h__[i__4].r - q__2.r, q__1.i = h__[i__4].i - q__2.i;
+ h__[i__3].r = q__1.r, h__[i__3].i = q__1.i;
+/* L80: */
+ }
+
+/*
+ Apply G from the right to transform the columns of the
+ matrix in rows I1 to min(K+2,I).
+
+ Computing MIN
+*/
+ i__3 = k + 2;
+ i__2 = min(i__3,i__);
+ for (j = i1; j <= i__2; ++j) {
+ i__3 = j + k * h_dim1;
+ q__2.r = t1.r * h__[i__3].r - t1.i * h__[i__3].i, q__2.i =
+ t1.r * h__[i__3].i + t1.i * h__[i__3].r;
+ i__4 = j + (k + 1) * h_dim1;
+ q__3.r = t2 * h__[i__4].r, q__3.i = t2 * h__[i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ sum.r = q__1.r, sum.i = q__1.i;
+ i__3 = j + k * h_dim1;
+ i__4 = j + k * h_dim1;
+ q__1.r = h__[i__4].r - sum.r, q__1.i = h__[i__4].i - sum.i;
+ h__[i__3].r = q__1.r, h__[i__3].i = q__1.i;
+ i__3 = j + (k + 1) * h_dim1;
+ i__4 = j + (k + 1) * h_dim1;
+ r_cnjg(&q__3, &v2);
+ q__2.r = sum.r * q__3.r - sum.i * q__3.i, q__2.i = sum.r *
+ q__3.i + sum.i * q__3.r;
+ q__1.r = h__[i__4].r - q__2.r, q__1.i = h__[i__4].i - q__2.i;
+ h__[i__3].r = q__1.r, h__[i__3].i = q__1.i;
+/* L90: */
+ }
+
+ if (*wantz) {
+
+/* Accumulate transformations in the matrix Z */
+
+ i__2 = *ihiz;
+ for (j = *iloz; j <= i__2; ++j) {
+ i__3 = j + k * z_dim1;
+ q__2.r = t1.r * z__[i__3].r - t1.i * z__[i__3].i, q__2.i =
+ t1.r * z__[i__3].i + t1.i * z__[i__3].r;
+ i__4 = j + (k + 1) * z_dim1;
+ q__3.r = t2 * z__[i__4].r, q__3.i = t2 * z__[i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ sum.r = q__1.r, sum.i = q__1.i;
+ i__3 = j + k * z_dim1;
+ i__4 = j + k * z_dim1;
+ q__1.r = z__[i__4].r - sum.r, q__1.i = z__[i__4].i -
+ sum.i;
+ z__[i__3].r = q__1.r, z__[i__3].i = q__1.i;
+ i__3 = j + (k + 1) * z_dim1;
+ i__4 = j + (k + 1) * z_dim1;
+ r_cnjg(&q__3, &v2);
+ q__2.r = sum.r * q__3.r - sum.i * q__3.i, q__2.i = sum.r *
+ q__3.i + sum.i * q__3.r;
+ q__1.r = z__[i__4].r - q__2.r, q__1.i = z__[i__4].i -
+ q__2.i;
+ z__[i__3].r = q__1.r, z__[i__3].i = q__1.i;
+/* L100: */
+ }
+ }
+
+ if (k == m && m > l) {
+
+/*
+ If the QR step was started at row M > L because two
+ consecutive small subdiagonals were found, then extra
+ scaling must be performed to ensure that H(M,M-1) remains
+ real.
+*/
+
+ q__1.r = 1.f - t1.r, q__1.i = 0.f - t1.i;
+ temp.r = q__1.r, temp.i = q__1.i;
+ r__1 = c_abs(&temp);
+ q__1.r = temp.r / r__1, q__1.i = temp.i / r__1;
+ temp.r = q__1.r, temp.i = q__1.i;
+ i__2 = m + 1 + m * h_dim1;
+ i__3 = m + 1 + m * h_dim1;
+ r_cnjg(&q__2, &temp);
+ q__1.r = h__[i__3].r * q__2.r - h__[i__3].i * q__2.i, q__1.i =
+ h__[i__3].r * q__2.i + h__[i__3].i * q__2.r;
+ h__[i__2].r = q__1.r, h__[i__2].i = q__1.i;
+ if (m + 2 <= i__) {
+ i__2 = m + 2 + (m + 1) * h_dim1;
+ i__3 = m + 2 + (m + 1) * h_dim1;
+ q__1.r = h__[i__3].r * temp.r - h__[i__3].i * temp.i,
+ q__1.i = h__[i__3].r * temp.i + h__[i__3].i *
+ temp.r;
+ h__[i__2].r = q__1.r, h__[i__2].i = q__1.i;
+ }
+ i__2 = i__;
+ for (j = m; j <= i__2; ++j) {
+ if (j != m + 1) {
+ if (i2 > j) {
+ i__3 = i2 - j;
+ cscal_(&i__3, &temp, &h__[j + (j + 1) * h_dim1],
+ ldh);
+ }
+ i__3 = j - i1;
+ r_cnjg(&q__1, &temp);
+ cscal_(&i__3, &q__1, &h__[i1 + j * h_dim1], &c__1);
+ if (*wantz) {
+ r_cnjg(&q__1, &temp);
+ cscal_(&nz, &q__1, &z__[*iloz + j * z_dim1], &
+ c__1);
+ }
+ }
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+
+/* Ensure that H(I,I-1) is real. */
+
+ i__1 = i__ + (i__ - 1) * h_dim1;
+ temp.r = h__[i__1].r, temp.i = h__[i__1].i;
+ if (r_imag(&temp) != 0.f) {
+ rtemp = c_abs(&temp);
+ i__1 = i__ + (i__ - 1) * h_dim1;
+ h__[i__1].r = rtemp, h__[i__1].i = 0.f;
+ q__1.r = temp.r / rtemp, q__1.i = temp.i / rtemp;
+ temp.r = q__1.r, temp.i = q__1.i;
+ if (i2 > i__) {
+ i__1 = i2 - i__;
+ r_cnjg(&q__1, &temp);
+ cscal_(&i__1, &q__1, &h__[i__ + (i__ + 1) * h_dim1], ldh);
+ }
+ i__1 = i__ - i1;
+ cscal_(&i__1, &temp, &h__[i1 + i__ * h_dim1], &c__1);
+ if (*wantz) {
+ cscal_(&nz, &temp, &z__[*iloz + i__ * z_dim1], &c__1);
+ }
+ }
+
+/* L130: */
+ }
+
+/* Failure to converge in remaining number of iterations */
+
+ *info = i__;
+ return 0;
+
+L140:
+
+/* H(I,I-1) is negligible: one eigenvalue has converged. */
+
+ i__1 = i__;
+ i__2 = i__ + i__ * h_dim1;
+ w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i;
+
+/* return to start of the main loop with new value of I. */
+
+ i__ = l - 1;
+ goto L30;
+
+L150:
+ return 0;
+
+/* End of CLAHQR */
+
+} /* clahqr_ */
+
+/* Subroutine */ int clahr2_(integer *n, integer *k, integer *nb, complex *a,
+ integer *lda, complex *tau, complex *t, integer *ldt, complex *y,
+ integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__;
+ static complex ei;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *), cgemm_(char *, char *, integer *, integer *, integer *
+ , complex *, complex *, integer *, complex *, integer *, complex *
+ , complex *, integer *), cgemv_(char *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *), ccopy_(integer *,
+ complex *, integer *, complex *, integer *), ctrmm_(char *, char *
+ , char *, char *, integer *, integer *, complex *, complex *,
+ integer *, complex *, integer *),
+ caxpy_(integer *, complex *, complex *, integer *, complex *,
+ integer *), ctrmv_(char *, char *, char *, integer *, complex *,
+ integer *, complex *, integer *), clarfg_(
+ integer *, complex *, complex *, integer *, complex *), clacgv_(
+ integer *, complex *, integer *), clacpy_(char *, integer *,
+ integer *, complex *, integer *, complex *, integer *);
+
+
+/*
+    -- LAPACK auxiliary routine (version 3.2.1) --
+    -- LAPACK is a software package provided by Univ. of Tennessee, --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+    -- April 2009 --
+
+
+ Purpose
+ =======
+
+    CLAHR2 reduces the first NB columns of a complex general n-by-(n-k+1)
+    matrix A so that elements below the k-th subdiagonal are zero. The
+    reduction is performed by a unitary similarity transformation
+ Q' * A * Q. The routine returns the matrices V and T which determine
+ Q as a block reflector I - V*T*V', and also the matrix Y = A * V * T.
+
+ This is an auxiliary routine called by CGEHRD.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ K (input) INTEGER
+ The offset for the reduction. Elements below the k-th
+ subdiagonal in the first NB columns are reduced to zero.
+ K < N.
+
+ NB (input) INTEGER
+ The number of columns to be reduced.
+
+ A (input/output) COMPLEX array, dimension (LDA,N-K+1)
+ On entry, the n-by-(n-k+1) general matrix A.
+ On exit, the elements on and above the k-th subdiagonal in
+ the first NB columns are overwritten with the corresponding
+ elements of the reduced matrix; the elements below the k-th
+ subdiagonal, with the array TAU, represent the matrix Q as a
+ product of elementary reflectors. The other columns of A are
+ unchanged. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) COMPLEX array, dimension (NB)
+ The scalar factors of the elementary reflectors. See Further
+ Details.
+
+ T (output) COMPLEX array, dimension (LDT,NB)
+ The upper triangular matrix T.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= NB.
+
+ Y (output) COMPLEX array, dimension (LDY,NB)
+ The n-by-nb matrix Y.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of nb elementary reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
+ A(i+k+1:n,i), and tau in TAU(i).
+
+ The elements of the vectors v together form the (n-k+1)-by-nb matrix
+ V which is needed, with T and Y, to apply the transformation to the
+ unreduced part of the matrix, using an update of the form:
+ A := (I - V*T*V') * (A - Y*V').
+
+ The contents of A on exit are illustrated by the following example
+ with n = 7, k = 3 and nb = 2:
+
+ ( a a a a a )
+ ( a a a a a )
+ ( a a a a a )
+ ( h h a a a )
+ ( v1 h a a a )
+ ( v1 v2 a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This subroutine is a slight modification of LAPACK-3.0's DLAHRD
+    incorporating improvements proposed by Quintana-Orti and van de
+    Geijn. Note that the entries of A(1:K,2:NB) differ from those
+ returned by the original LAPACK-3.0's DLAHRD routine. (This
+ subroutine is not backward compatible with LAPACK-3.0's DLAHRD.)
+
+ References
+ ==========
+
+ Gregorio Quintana-Orti and Robert van de Geijn, "Improving the
+ performance of reduction to Hessenberg form," ACM Transactions on
+ Mathematical Software, 32(2):180-194, June 2006.
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
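+
+/*
+    A minimal usage sketch (the caller-side buffer names are illustrative
+    only): within a blocked Hessenberg reduction such as CGEHRD, one panel
+    step reduces columns 1..NB at offset K via
+
+        clahr2_(&n, &k, &nb, a, &lda, tau, t, &ldt, y, &ldy);
+
+    after which Q = I - V*T*V' is available through A (the V part), TAU and
+    T, and the trailing submatrix can be updated as
+
+        A := (I - V*T*V') * (A - Y*V')
+
+    using the Y returned here.
+*/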
+
+ /* Parameter adjustments */
+ --tau;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*n <= 1) {
+ return 0;
+ }
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ > 1) {
+
+/*
+ Update A(K+1:N,I)
+
+ Update I-th column of A - Y * V'
+*/
+
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[*k + i__ - 1 + a_dim1], lda);
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("NO TRANSPOSE", &i__2, &i__3, &q__1, &y[*k + 1 + y_dim1],
+ ldy, &a[*k + i__ - 1 + a_dim1], lda, &c_b57, &a[*k + 1 +
+ i__ * a_dim1], &c__1);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[*k + i__ - 1 + a_dim1], lda);
+
+/*
+ Apply I - V * T' * V' to this column (call it b) from the
+ left, using the last column of T as workspace
+
+ Let V = ( V1 ) and b = ( b1 ) (first I-1 rows)
+ ( V2 ) ( b2 )
+
+ where V1 is unit lower triangular
+
+ w := V1' * b1
+*/
+
+ i__2 = i__ - 1;
+ ccopy_(&i__2, &a[*k + 1 + i__ * a_dim1], &c__1, &t[*nb * t_dim1 +
+ 1], &c__1);
+ i__2 = i__ - 1;
+ ctrmv_("Lower", "Conjugate transpose", "UNIT", &i__2, &a[*k + 1 +
+ a_dim1], lda, &t[*nb * t_dim1 + 1], &c__1);
+
+/* w := w + V2'*b2 */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[*k + i__ +
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b57,
+ &t[*nb * t_dim1 + 1], &c__1);
+
+/* w := T'*w */
+
+ i__2 = i__ - 1;
+ ctrmv_("Upper", "Conjugate transpose", "NON-UNIT", &i__2, &t[
+ t_offset], ldt, &t[*nb * t_dim1 + 1], &c__1);
+
+/* b2 := b2 - V2*w */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("NO TRANSPOSE", &i__2, &i__3, &q__1, &a[*k + i__ + a_dim1],
+ lda, &t[*nb * t_dim1 + 1], &c__1, &c_b57, &a[*k + i__ +
+ i__ * a_dim1], &c__1);
+
+/* b1 := b1 - V1*w */
+
+ i__2 = i__ - 1;
+ ctrmv_("Lower", "NO TRANSPOSE", "UNIT", &i__2, &a[*k + 1 + a_dim1]
+ , lda, &t[*nb * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ caxpy_(&i__2, &q__1, &t[*nb * t_dim1 + 1], &c__1, &a[*k + 1 + i__
+ * a_dim1], &c__1);
+
+ i__2 = *k + i__ - 1 + (i__ - 1) * a_dim1;
+ a[i__2].r = ei.r, a[i__2].i = ei.i;
+ }
+
+/*
+ Generate the elementary reflector H(I) to annihilate
+ A(K+I+1:N,I)
+*/
+
+ i__2 = *n - *k - i__ + 1;
+/* Computing MIN */
+ i__3 = *k + i__ + 1;
+ clarfg_(&i__2, &a[*k + i__ + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &tau[i__]);
+ i__2 = *k + i__ + i__ * a_dim1;
+ ei.r = a[i__2].r, ei.i = a[i__2].i;
+ i__2 = *k + i__ + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute Y(K+1:N,I) */
+
+ i__2 = *n - *k;
+ i__3 = *n - *k - i__ + 1;
+ cgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b57, &a[*k + 1 + (i__ + 1) *
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b56, &y[*
+ k + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[*k + i__ +
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b56, &t[
+ i__ * t_dim1 + 1], &c__1);
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("NO TRANSPOSE", &i__2, &i__3, &q__1, &y[*k + 1 + y_dim1], ldy,
+ &t[i__ * t_dim1 + 1], &c__1, &c_b57, &y[*k + 1 + i__ * y_dim1]
+ , &c__1);
+ i__2 = *n - *k;
+ cscal_(&i__2, &tau[i__], &y[*k + 1 + i__ * y_dim1], &c__1);
+
+/* Compute T(1:I,I) */
+
+ i__2 = i__ - 1;
+ i__3 = i__;
+ q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i;
+ cscal_(&i__2, &q__1, &t[i__ * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ ctrmv_("Upper", "No Transpose", "NON-UNIT", &i__2, &t[t_offset], ldt,
+ &t[i__ * t_dim1 + 1], &c__1)
+ ;
+ i__2 = i__ + i__ * t_dim1;
+ i__3 = i__;
+ t[i__2].r = tau[i__3].r, t[i__2].i = tau[i__3].i;
+
+/* L10: */
+ }
+ i__1 = *k + *nb + *nb * a_dim1;
+ a[i__1].r = ei.r, a[i__1].i = ei.i;
+
+/* Compute Y(1:K,1:NB) */
+
+ clacpy_("ALL", k, nb, &a[(a_dim1 << 1) + 1], lda, &y[y_offset], ldy);
+ ctrmm_("RIGHT", "Lower", "NO TRANSPOSE", "UNIT", k, nb, &c_b57, &a[*k + 1
+ + a_dim1], lda, &y[y_offset], ldy);
+ if (*n > *k + *nb) {
+ i__1 = *n - *k - *nb;
+ cgemm_("NO TRANSPOSE", "NO TRANSPOSE", k, nb, &i__1, &c_b57, &a[(*nb
+ + 2) * a_dim1 + 1], lda, &a[*k + 1 + *nb + a_dim1], lda, &
+ c_b57, &y[y_offset], ldy);
+ }
+ ctrmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b57, &t[
+ t_offset], ldt, &y[y_offset], ldy);
+
+ return 0;
+
+/* End of CLAHR2 */
+
+} /* clahr2_ */
+
+/* Subroutine */ int clals0_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, integer *nrhs, complex *b, integer *ldb, complex *bx,
+ integer *ldbx, integer *perm, integer *givptr, integer *givcol,
+ integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real *
+ difl, real *difr, real *z__, integer *k, real *c__, real *s, real *
+ rwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, difr_dim1, difr_offset, givnum_dim1,
+ givnum_offset, poles_dim1, poles_offset, b_dim1, b_offset,
+ bx_dim1, bx_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, m, n;
+ static real dj;
+ static integer nlp1, jcol;
+ static real temp;
+ static integer jrow;
+ extern doublereal snrm2_(integer *, real *, integer *);
+ static real diflj, difrj, dsigj;
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *), sgemv_(char *, integer *, integer *, real *
+ , real *, integer *, real *, integer *, real *, real *, integer *), csrot_(integer *, complex *, integer *, complex *,
+ integer *, real *, real *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, complex *, integer *, integer *), csscal_(integer *, real *, complex *, integer *),
+ clacpy_(char *, integer *, integer *, complex *, integer *,
+ complex *, integer *), xerbla_(char *, integer *);
+ static real dsigjp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLALS0 applies back the multiplying factors of either the left or the
+ right singular vector matrix of a diagonal matrix appended by a row
+ to the right hand side matrix B in solving the least squares problem
+ using the divide-and-conquer SVD approach.
+
+ For the left singular vector matrix, three types of orthogonal
+ matrices are involved:
+
+ (1L) Givens rotations: the number of such rotations is GIVPTR; the
+ pairs of columns/rows they were applied to are stored in GIVCOL;
+ and the C- and S-values of these rotations are stored in GIVNUM.
+
+ (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+ row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+ J-th row.
+
+ (3L) The left singular vector matrix of the remaining matrix.
+
+ For the right singular vector matrix, four types of orthogonal
+ matrices are involved:
+
+ (1R) The right singular vector matrix of the remaining matrix.
+
+ (2R) If SQRE = 1, one extra Givens rotation to generate the right
+ null space.
+
+ (3R) The inverse transformation of (2L).
+
+ (4R) The inverse transformation of (1L).
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form:
+ = 0: Left singular vector matrix.
+ = 1: Right singular vector matrix.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) COMPLEX array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M. On output, B contains
+ the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B. LDB must be at least
+ max(1,MAX( M, N ) ).
+
+ BX (workspace) COMPLEX array, dimension ( LDBX, NRHS )
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ PERM (input) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) applied
+ to the two blocks.
+
+ GIVPTR (input) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 )
+ Each pair of numbers indicates a pair of rows/columns
+ involved in a Givens rotation.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (input) REAL array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value used in the
+ corresponding Givens rotation.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of arrays DIFR, POLES and
+ GIVNUM, must be at least K.
+
+ POLES (input) REAL array, dimension ( LDGNUM, 2 )
+ On entry, POLES(1:K, 1) contains the new singular
+ values obtained from solving the secular equation, and
+ POLES(1:K, 2) is an array containing the poles in the secular
+ equation.
+
+ DIFL (input) REAL array, dimension ( K ).
+ On entry, DIFL(I) is the distance between I-th updated
+ (undeflated) singular value and the I-th (undeflated) old
+ singular value.
+
+ DIFR (input) REAL array, dimension ( LDGNUM, 2 ).
+ On entry, DIFR(I, 1) contains the distances between I-th
+ updated (undeflated) singular value and the I+1-th
+ (undeflated) old singular value. And DIFR(I, 2) is the
+ normalizing factor for the I-th right singular vector.
+
+ Z (input) REAL array, dimension ( K )
+ Contain the components of the deflation-adjusted updating row
+ vector.
+
+ K (input) INTEGER
+ Contains the dimension of the non-deflated matrix,
+ This is the order of the related secular equation. 1 <= K <=N.
+
+ C (input) REAL
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (input) REAL
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ RWORK (workspace) REAL array, dimension
+ ( K*(1+NRHS) + 2*NRHS )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
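+
+/*
+    For orientation, the ICOMPQ = 0 branch below carries out, in order
+    (indices as in the description above; sketch only):
+
+        (1L)  for I = 1..GIVPTR: rotate rows GIVCOL(I,2) and GIVCOL(I,1)
+              of B with C = GIVNUM(I,2), S = GIVNUM(I,1)    [CSROT]
+        (2L)  BX(1,:) = B(NL+1,:);  BX(J,:) = B(PERM(J),:), J = 2..N
+        (3L)  B(1:K,:) = (left singular vectors)^{-1} * BX(1:K,:), built
+              row by row through the two-step real SGEMVs
+
+    The deflated rows BX(K+1:N,:) are then copied back to B unchanged.
+*/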
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ difr_dim1 = *ldgnum;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ poles_dim1 = *ldgnum;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ --difl;
+ --z__;
+ --rwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ }
+
+ n = *nl + *nr + 1;
+
+ if (*nrhs < 1) {
+ *info = -5;
+ } else if (*ldb < n) {
+ *info = -7;
+ } else if (*ldbx < n) {
+ *info = -9;
+ } else if (*givptr < 0) {
+ *info = -11;
+ } else if (*ldgcol < n) {
+ *info = -13;
+ } else if (*ldgnum < n) {
+ *info = -15;
+ } else if (*k < 1) {
+ *info = -20;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLALS0", &i__1);
+ return 0;
+ }
+
+ m = n + *sqre;
+ nlp1 = *nl + 1;
+
+ if (*icompq == 0) {
+
+/*
+ Apply back orthogonal transformations from the left.
+
+ Step (1L): apply back the Givens rotations performed.
+*/
+
+ i__1 = *givptr;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ csrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]);
+/* L10: */
+ }
+
+/* Step (2L): permute rows of B. */
+
+ ccopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx);
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ ccopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1],
+ ldbx);
+/* L20: */
+ }
+
+/*
+ Step (3L): apply the inverse of the left singular vector
+ matrix to BX.
+*/
+
+ if (*k == 1) {
+ ccopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb);
+ if (z__[1] < 0.f) {
+ csscal_(nrhs, &c_b1276, &b[b_offset], ldb);
+ }
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ diflj = difl[j];
+ dj = poles[j + poles_dim1];
+ dsigj = -poles[j + (poles_dim1 << 1)];
+ if (j < *k) {
+ difrj = -difr[j + difr_dim1];
+ dsigjp = -poles[j + 1 + (poles_dim1 << 1)];
+ }
+ if (z__[j] == 0.f || poles[j + (poles_dim1 << 1)] == 0.f) {
+ rwork[j] = 0.f;
+ } else {
+ rwork[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj
+ / (poles[j + (poles_dim1 << 1)] + dj);
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+ 0.f) {
+ rwork[i__] = 0.f;
+ } else {
+ rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigj) - diflj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L30: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+ 0.f) {
+ rwork[i__] = 0.f;
+ } else {
+ rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigjp) + difrj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L40: */
+ }
+ rwork[1] = -1.f;
+ temp = snrm2_(k, &rwork[1], &c__1);
+
+/*
+ Since B and BX are complex, the following call to SGEMV
+ is performed in two steps (real and imaginary parts).
+
+ CALL SGEMV( 'T', K, NRHS, ONE, BX, LDBX, WORK, 1, ZERO,
+ $ B( J, 1 ), LDB )
+*/
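+
+/*
+    Equivalently: with the K-by-NRHS block of BX split as BX = BR + i*BI
+    (BR, BI real) and the real weights in RWORK(1:K), the two SGEMVs below
+    compute
+
+        RWORK(K+1 : K+NRHS)        = BR' * RWORK(1:K)    (real part)
+        RWORK(K+NRHS+1 : K+2*NRHS) = BI' * RWORK(1:K)    (imaginary part)
+
+    and the pair is then recombined into B(J, 1:NRHS).
+*/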
+
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ i__4 = jrow + jcol * bx_dim1;
+ rwork[i__] = bx[i__4].r;
+/* L50: */
+ }
+/* L60: */
+ }
+ sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1);
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ rwork[i__] = r_imag(&bx[jrow + jcol * bx_dim1]);
+/* L70: */
+ }
+/* L80: */
+ }
+ sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs],
+ &c__1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = j + jcol * b_dim1;
+ i__4 = jcol + *k;
+ i__5 = jcol + *k + *nrhs;
+ q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L90: */
+ }
+ clascl_("G", &c__0, &c__0, &temp, &c_b1034, &c__1, nrhs, &b[j
+ + b_dim1], ldb, info);
+/* L100: */
+ }
+ }
+
+/* Move the deflated rows of BX to B also. */
+
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ clacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1
+ + b_dim1], ldb);
+ }
+ } else {
+
+/*
+ Apply back the right orthogonal transformations.
+
+ Step (1R): apply back the new right singular vector matrix
+ to B.
+*/
+
+ if (*k == 1) {
+ ccopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx);
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dsigj = poles[j + (poles_dim1 << 1)];
+ if (z__[j] == 0.f) {
+ rwork[j] = 0.f;
+ } else {
+ rwork[j] = -z__[j] / difl[j] / (dsigj + poles[j +
+ poles_dim1]) / difr[j + (difr_dim1 << 1)];
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.f) {
+ rwork[i__] = 0.f;
+ } else {
+ r__1 = -poles[i__ + 1 + (poles_dim1 << 1)];
+ rwork[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difr[
+ i__ + difr_dim1]) / (dsigj + poles[i__ +
+ poles_dim1]) / difr[i__ + (difr_dim1 << 1)];
+ }
+/* L110: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.f) {
+ rwork[i__] = 0.f;
+ } else {
+ r__1 = -poles[i__ + (poles_dim1 << 1)];
+ rwork[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difl[
+ i__]) / (dsigj + poles[i__ + poles_dim1]) /
+ difr[i__ + (difr_dim1 << 1)];
+ }
+/* L120: */
+ }
+
+/*
+ Since B and BX are complex, the following call to SGEMV
+ is performed in two steps (real and imaginary parts).
+
+ CALL SGEMV( 'T', K, NRHS, ONE, B, LDB, WORK, 1, ZERO,
+ $ BX( J, 1 ), LDBX )
+*/
+
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[i__] = b[i__4].r;
+/* L130: */
+ }
+/* L140: */
+ }
+ sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1);
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ rwork[i__] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L150: */
+ }
+/* L160: */
+ }
+ sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs],
+ &c__1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = j + jcol * bx_dim1;
+ i__4 = jcol + *k;
+ i__5 = jcol + *k + *nrhs;
+ q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+ bx[i__3].r = q__1.r, bx[i__3].i = q__1.i;
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+
+/*
+ Step (2R): if SQRE = 1, apply back the rotation that is
+ related to the right null space of the subproblem.
+*/
+
+ if (*sqre == 1) {
+ ccopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx);
+ csrot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__,
+ s);
+ }
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ clacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 +
+ bx_dim1], ldbx);
+ }
+
+/* Step (3R): permute rows of B. */
+
+ ccopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb);
+ if (*sqre == 1) {
+ ccopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb);
+ }
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ ccopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1],
+ ldb);
+/* L190: */
+ }
+
+/* Step (4R): apply back the Givens rotations performed. */
+
+ for (i__ = *givptr; i__ >= 1; --i__) {
+ r__1 = -givnum[i__ + givnum_dim1];
+ csrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &r__1);
+/* L200: */
+ }
+ }
+
+ return 0;
+
+/* End of CLALS0 */
+
+} /* clals0_ */
+
+/* Subroutine */ int clalsa_(integer *icompq, integer *smlsiz, integer *n,
+ integer *nrhs, complex *b, integer *ldb, complex *bx, integer *ldbx,
+ real *u, integer *ldu, real *vt, integer *k, real *difl, real *difr,
+ real *z__, real *poles, integer *givptr, integer *givcol, integer *
+ ldgcol, integer *perm, real *givnum, real *c__, real *s, real *rwork,
+ integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, difl_dim1,
+ difl_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset,
+ poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset,
+ z_dim1, z_offset, b_dim1, b_offset, bx_dim1, bx_offset, i__1,
+ i__2, i__3, i__4, i__5, i__6;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl,
+ ndb1, nlp1, lvl2, nrp1, jcol, nlvl, sqre, jrow, jimag, jreal,
+ inode, ndiml;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer ndimr;
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *), clals0_(integer *, integer *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, integer *, integer *, integer *, real *, integer *,
+ real *, real *, real *, real *, integer *, real *, real *, real *,
+ integer *), xerbla_(char *, integer *), slasdt_(integer *
+ , integer *, integer *, integer *, integer *, integer *, integer *
+ );
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+    CLALSA is an intermediate step in solving the least squares problem
+    by computing the SVD of the coefficient matrix in compact form (the
+    singular vectors are computed as products of simple orthogonal
+    matrices).
+
+ If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector
+ matrix of an upper bidiagonal matrix to the right hand side; and if
+ ICOMPQ = 1, CLALSA applies the right singular vector matrix to the
+ right hand side. The singular vector matrices were generated in
+ compact form by CLALSA.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether the left or the right singular vector
+ matrix is involved.
+ = 0: Left singular vector matrix
+ = 1: Right singular vector matrix
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The row and column dimensions of the upper bidiagonal matrix.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) COMPLEX array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M.
+ On output, B contains the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,MAX( M, N ) ).
+
+ BX (output) COMPLEX array, dimension ( LDBX, NRHS )
+ On exit, the result of applying the left or right singular
+ vector matrix to B.
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ U (input) REAL array, dimension ( LDU, SMLSIZ ).
+ On entry, U contains the left singular vector matrices of all
+ subproblems at the bottom level.
+
+    LDU    (input) INTEGER, LDU >= N.
+ The leading dimension of arrays U, VT, DIFL, DIFR,
+ POLES, GIVNUM, and Z.
+
+ VT (input) REAL array, dimension ( LDU, SMLSIZ+1 ).
+ On entry, VT' contains the right singular vector matrices of
+ all subproblems at the bottom level.
+
+ K (input) INTEGER array, dimension ( N ).
+
+ DIFL (input) REAL array, dimension ( LDU, NLVL ).
+ where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1.
+
+ DIFR (input) REAL array, dimension ( LDU, 2 * NLVL ).
+ On entry, DIFL(*, I) and DIFR(*, 2 * I -1) record
+ distances between singular values on the I-th level and
+ singular values on the (I -1)-th level, and DIFR(*, 2 * I)
+ record the normalizing factors of the right singular vectors
+ matrices of subproblems on I-th level.
+
+ Z (input) REAL array, dimension ( LDU, NLVL ).
+ On entry, Z(1, I) contains the components of the deflation-
+ adjusted updating row vector for subproblems on the I-th
+ level.
+
+ POLES (input) REAL array, dimension ( LDU, 2 * NLVL ).
+ On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old
+ singular values involved in the secular equations on the I-th
+ level.
+
+ GIVPTR (input) INTEGER array, dimension ( N ).
+ On entry, GIVPTR( I ) records the number of Givens
+ rotations performed on the I-th problem on the computation
+ tree.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ).
+ On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the
+ locations of Givens rotations performed on the I-th level on
+ the computation tree.
+
+    LDGCOL (input) INTEGER, LDGCOL >= N.
+ The leading dimension of arrays GIVCOL and PERM.
+
+ PERM (input) INTEGER array, dimension ( LDGCOL, NLVL ).
+ On entry, PERM(*, I) records permutations done on the I-th
+ level of the computation tree.
+
+ GIVNUM (input) REAL array, dimension ( LDU, 2 * NLVL ).
+ On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S-
+ values of Givens rotations performed on the I-th level on the
+ computation tree.
+
+ C (input) REAL array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ C( I ) contains the C-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ S (input) REAL array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ S( I ) contains the S-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ RWORK (workspace) REAL array, dimension at least
+           MAX( (SMLSIZ+1)*NRHS*3, N*(1+NRHS) + 2*NRHS ).
+
+ IWORK (workspace) INTEGER array.
+ The dimension must be at least 3 * N
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
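+
+/*
+    A small worked example of the tree dimensions above (numbers are
+    illustrative only): with N = 1000 and SMLSIZ = 25,
+
+        NLVL = INT( log_2( N/(SMLSIZ+1) ) ) + 1
+             = INT( log_2( 1000/26 ) ) + 1 = 5 + 1 = 6
+
+    so DIFL uses LDU x 6 entries while DIFR, POLES and GIVNUM each use
+    LDU x 12, matching the dimensions documented for those arguments.
+*/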
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ givnum_dim1 = *ldu;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ poles_dim1 = *ldu;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ z_dim1 = *ldu;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ difr_dim1 = *ldu;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ difl_dim1 = *ldu;
+ difl_offset = 1 + difl_dim1;
+ difl -= difl_offset;
+ vt_dim1 = *ldu;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ --k;
+ --givptr;
+ perm_dim1 = *ldgcol;
+ perm_offset = 1 + perm_dim1;
+ perm -= perm_offset;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ --c__;
+ --s;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*smlsiz < 3) {
+ *info = -2;
+ } else if (*n < *smlsiz) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < *n) {
+ *info = -6;
+ } else if (*ldbx < *n) {
+ *info = -8;
+ } else if (*ldu < *n) {
+ *info = -10;
+ } else if (*ldgcol < *n) {
+ *info = -19;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLALSA", &i__1);
+ return 0;
+ }
+
+/* Book-keeping and setting up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+
+ slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ The following code applies back the left singular vector factors.
+ For applying back the right singular vector factors, go to 170.
+*/
+
+ if (*icompq == 1) {
+ goto L170;
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by SLASDQ. The corresponding left and right singular vector
+ matrices are in explicit form. First apply back the left
+ singular vector matrices.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+
+/*
+ Since B and BX are complex, the following call to SGEMM
+ is performed in two steps (real and imaginary parts).
+
+ CALL SGEMM( 'T', 'N', NL, NRHS, NL, ONE, U( NLF, 1 ), LDU,
+ $ B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX )
+*/
+
+ j = nl * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nl - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L10: */
+ }
+/* L20: */
+ }
+ sgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, &
+ rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[1], &nl);
+ j = nl * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nl - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L30: */
+ }
+/* L40: */
+ }
+ sgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, &
+ rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[nl * *nrhs
+ + 1], &nl);
+ jreal = 0;
+ jimag = nl * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nl - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+ bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L50: */
+ }
+/* L60: */
+ }
+
+/*
+ Since B and BX are complex, the following call to SGEMM
+ is performed in two steps (real and imaginary parts).
+
+ CALL SGEMM( 'T', 'N', NR, NRHS, NR, ONE, U( NRF, 1 ), LDU,
+ $ B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX )
+*/
+
+ j = nr * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nr - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L70: */
+ }
+/* L80: */
+ }
+ sgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, &
+ rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[1], &nr);
+ j = nr * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nr - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L90: */
+ }
+/* L100: */
+ }
+ sgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, &
+ rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[nr * *nrhs
+ + 1], &nr);
+ jreal = 0;
+ jimag = nr * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nr - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+ bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L110: */
+ }
+/* L120: */
+ }
+
+/* L130: */
+ }
+
+/*
+ Next copy the rows of B that correspond to unchanged rows
+ in the bidiagonal matrix to BX.
+*/
+
+ i__1 = nd;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ic = iwork[inode + i__ - 1];
+ ccopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx);
+/* L140: */
+ }
+
+/*
+ Finally go through the left singular vector matrices of all
+ the other subproblems bottom-up on the tree.
+*/
+
+ j = pow_ii(&c__2, &nlvl);
+ sqre = 0;
+
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ find the first node LF and last node LL on
+ the current level LVL
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ --j;
+ clals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, &
+ b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &rwork[1], info);
+/* L150: */
+ }
+/* L160: */
+ }
+ goto L330;
+
+/* ICOMPQ = 1: applying back the right singular vector factors. */
+
+L170:
+
+/*
+ First now go through the right singular vector matrices of all
+ the tree nodes top-down.
+*/
+
+ j = 0;
+ i__1 = nlvl;
+ for (lvl = 1; lvl <= i__1; ++lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ Find the first node LF and last node LL on
+ the current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__2 = lvl - 1;
+ lf = pow_ii(&c__2, &i__2);
+ ll = (lf << 1) - 1;
+ }
+ i__2 = lf;
+ for (i__ = ll; i__ >= i__2; --i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ if (i__ == ll) {
+ sqre = 0;
+ } else {
+ sqre = 1;
+ }
+ ++j;
+ clals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[
+ nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &rwork[1], info);
+/* L180: */
+ }
+/* L190: */
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by SLASDQ. The corresponding right singular vector
+ matrices are in explicit form. Apply them back.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlp1 = nl + 1;
+ if (i__ == nd) {
+ nrp1 = nr;
+ } else {
+ nrp1 = nr + 1;
+ }
+ nlf = ic - nl;
+ nrf = ic + 1;
+
+/*
+ Since B and BX are complex, the following call to SGEMM is
+ performed in two steps (real and imaginary parts).
+
+ CALL SGEMM( 'T', 'N', NLP1, NRHS, NLP1, ONE, VT( NLF, 1 ), LDU,
+ $ B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX )
+*/
+
+ j = nlp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nlp1 - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L200: */
+ }
+/* L210: */
+ }
+ sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1],
+ ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[
+ 1], &nlp1);
+ j = nlp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nlp1 - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L220: */
+ }
+/* L230: */
+ }
+ sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1],
+ ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[
+ nlp1 * *nrhs + 1], &nlp1);
+ jreal = 0;
+ jimag = nlp1 * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nlp1 - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+ bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L240: */
+ }
+/* L250: */
+ }
+
+/*
+ Since B and BX are complex, the following call to SGEMM is
+ performed in two steps (real and imaginary parts).
+
+ CALL SGEMM( 'T', 'N', NRP1, NRHS, NRP1, ONE, VT( NRF, 1 ), LDU,
+ $ B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX )
+*/
+
+ j = nrp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nrp1 - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L260: */
+ }
+/* L270: */
+ }
+ sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1],
+ ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[
+ 1], &nrp1);
+ j = nrp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nrp1 - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L280: */
+ }
+/* L290: */
+ }
+ sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1],
+ ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[
+ nrp1 * *nrhs + 1], &nrp1);
+ jreal = 0;
+ jimag = nrp1 * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nrp1 - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+ bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L300: */
+ }
+/* L310: */
+ }
+
+/* L320: */
+ }
+
+L330:
+
+ return 0;
+
+/* End of CLALSA */
+
+} /* clalsa_ */
+
+/* Subroutine */ int clalsd_(char *uplo, integer *smlsiz, integer *n, integer
+ *nrhs, real *d__, real *e, complex *b, integer *ldb, real *rcond,
+ integer *rank, complex *work, real *rwork, integer *iwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+ real r__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer c__, i__, j, k;
+ static real r__;
+ static integer s, u, z__;
+ static real cs;
+ static integer bx;
+ static real sn;
+ static integer st, vt, nm1, st1;
+ static real eps;
+ static integer iwk;
+ static real tol;
+ static integer difl, difr;
+ static real rcnd;
+ static integer jcol, irwb, perm, nsub, nlvl, sqre, bxst, jrow, irwu,
+ jimag, jreal;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer irwib;
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *);
+ static integer poles, sizei, irwrb, nsize;
+ extern /* Subroutine */ int csrot_(integer *, complex *, integer *,
+ complex *, integer *, real *, real *);
+ static integer irwvt, icmpq1, icmpq2;
+ extern /* Subroutine */ int clalsa_(integer *, integer *, integer *,
+ integer *, complex *, integer *, complex *, integer *, real *,
+ integer *, real *, integer *, real *, real *, real *, real *,
+ integer *, integer *, integer *, integer *, real *, real *, real *
+ , real *, integer *, integer *), clascl_(char *, integer *,
+ integer *, real *, real *, integer *, integer *, complex *,
+ integer *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int slasda_(integer *, integer *, integer *,
+ integer *, real *, real *, real *, integer *, real *, integer *,
+ real *, real *, real *, real *, integer *, integer *, integer *,
+ integer *, real *, real *, real *, real *, integer *, integer *),
+ clacpy_(char *, integer *, integer *, complex *, integer *,
+ complex *, integer *), claset_(char *, integer *, integer
+ *, complex *, complex *, complex *, integer *), xerbla_(
+ char *, integer *), slascl_(char *, integer *, integer *,
+ real *, real *, integer *, integer *, real *, integer *, integer *
+ );
+ extern integer isamax_(integer *, real *, integer *);
+ static integer givcol;
+ extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, real *, real *, real *, integer *, real *
+ , integer *, real *, integer *, real *, integer *),
+ slaset_(char *, integer *, integer *, real *, real *, real *,
+ integer *), slartg_(real *, real *, real *, real *, real *
+ );
+ static real orgnrm;
+ static integer givnum;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+ static integer givptr, nrwork, irwwrk, smlszp;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ CLALSD uses the singular value decomposition of A to solve the least
+ squares problem of finding X to minimize the Euclidean norm of each
+ column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+ are N-by-NRHS. The solution X overwrites B.
+
+ The singular values of A smaller than RCOND times the largest
+ singular value are treated as zero in solving the least squares
+ problem; in this case a minimum norm solution is returned.
+ The actual singular values are returned in D in ascending order.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': D and E define an upper bidiagonal matrix.
+ = 'L': D and E define a lower bidiagonal matrix.
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The dimension of the bidiagonal matrix. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of columns of B. NRHS must be at least 1.
+
+ D (input/output) REAL array, dimension (N)
+ On entry D contains the main diagonal of the bidiagonal
+ matrix. On exit, if INFO = 0, D contains its singular values.
+
+ E (input/output) REAL array, dimension (N-1)
+ Contains the super-diagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ B (input/output) COMPLEX array, dimension (LDB,NRHS)
+ On input, B contains the right hand sides of the least
+ squares problem. On output, B contains the solution X.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,N).
+
+ RCOND (input) REAL
+ The singular values of A less than or equal to RCOND times
+ the largest singular value are treated as zero in solving
+ the least squares problem. If RCOND is negative,
+ machine precision is used instead.
+ For example, if diag(S)*X=B were the least squares problem,
+ where diag(S) is a diagonal matrix of singular values, the
+ solution would be X(i) = B(i) / S(i) if S(i) is greater than
+ RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to
+ RCOND*max(S).
+
+ RANK (output) INTEGER
+ The number of singular values of A greater than RCOND times
+ the largest singular value.
+
+ WORK (workspace) COMPLEX array, dimension (N * NRHS).
+
+ RWORK (workspace) REAL array, dimension at least
+ (9*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS +
+ MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS ),
+ where
+ NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+
+ IWORK (workspace) INTEGER array, dimension (3*N*NLVL + 11*N).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute a singular value while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through MOD(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
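+
+/*
+    A minimal call sketch (the surrounding driver and the workspace sizing
+    are assumptions for illustration; a driver such as CGELSD sets them up
+    exactly):
+
+        smlsiz = 25;
+        rcond  = -1.f;    a negative RCOND selects machine precision
+        clalsd_("U", &smlsiz, &n, &nrhs, d, e, b, &ldb, &rcond, &rank,
+                work, rwork, iwork, &info);
+
+    On successful exit D holds the singular values in ascending order, B is
+    overwritten with the solution X, and RANK counts the singular values
+    larger than rcond times the largest one.
+*/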
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < 1 || *ldb < *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLALSD", &i__1);
+ return 0;
+ }
+
+ eps = slamch_("Epsilon");
+
+/* Set up the tolerance. */
+
+ if (*rcond <= 0.f || *rcond >= 1.f) {
+ rcnd = eps;
+ } else {
+ rcnd = *rcond;
+ }
+
+ *rank = 0;
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+ if (d__[1] == 0.f) {
+ claset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+ } else {
+ *rank = 1;
+ clascl_("G", &c__0, &c__0, &d__[1], &c_b1034, &c__1, nrhs, &b[
+ b_offset], ldb, info);
+ d__[1] = dabs(d__[1]);
+ }
+ return 0;
+ }
+
+/* Rotate the matrix if it is lower bidiagonal. */
+
+ if (*(unsigned char *)uplo == 'L') {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (*nrhs == 1) {
+ csrot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], &
+ c__1, &cs, &sn);
+ } else {
+ rwork[(i__ << 1) - 1] = cs;
+ rwork[i__ * 2] = sn;
+ }
+/* L10: */
+ }
+ if (*nrhs > 1) {
+ i__1 = *nrhs;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *n - 1;
+ for (j = 1; j <= i__2; ++j) {
+ cs = rwork[(j << 1) - 1];
+ sn = rwork[j * 2];
+ csrot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__
+ * b_dim1], &c__1, &cs, &sn);
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ }
+
+/* Scale. */
+
+ nm1 = *n - 1;
+ orgnrm = slanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.f) {
+ claset_("A", n, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+ return 0;
+ }
+
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, &c__1, &d__[1], n, info);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &nm1, &c__1, &e[1], &nm1,
+ info);
+
+/*
+ If N is smaller than the minimum divide size SMLSIZ, then solve
+ the problem with another solver.
+*/
+
+ if (*n <= *smlsiz) {
+ irwu = 1;
+ irwvt = irwu + *n * *n;
+ irwwrk = irwvt + *n * *n;
+ irwrb = irwwrk;
+ irwib = irwrb + *n * *nrhs;
+ irwb = irwib + *n * *nrhs;
+ slaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwu], n);
+ slaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwvt], n);
+ slasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &rwork[irwvt], n,
+ &rwork[irwu], n, &rwork[irwwrk], &c__1, &rwork[irwwrk], info);
+ if (*info != 0) {
+ return 0;
+ }
+
+/*
+ In the real version, B is passed to SLASDQ and multiplied
+ internally by Q'. Here B is complex and that product is
+ computed below in two steps (real and imaginary parts).
+*/
+
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ i__3 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__3].r;
+/* L40: */
+ }
+/* L50: */
+ }
+ sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwrb], n);
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L60: */
+ }
+/* L70: */
+ }
+ sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwib], n);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__3 = jrow + jcol * b_dim1;
+ i__4 = jreal;
+ i__5 = jimag;
+ q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (d__[i__] <= tol) {
+ claset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[i__ + b_dim1],
+ ldb);
+ } else {
+ clascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, &
+ b[i__ + b_dim1], ldb, info);
+ ++(*rank);
+ }
+/* L100: */
+ }
+
+/*
+ Since B is complex, the following call to SGEMM is performed
+ in two steps (real and imaginary parts). That is for V * B
+ (in the real version of the code V' is stored in WORK).
+
+ CALL SGEMM( 'T', 'N', N, NRHS, N, ONE, WORK, N, B, LDB, ZERO,
+ $ WORK( NWORK ), N )
+*/
+
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ i__3 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__3].r;
+/* L110: */
+ }
+/* L120: */
+ }
+ sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwrb], n);
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L130: */
+ }
+/* L140: */
+ }
+ sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwib], n);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__3 = jrow + jcol * b_dim1;
+ i__4 = jreal;
+ i__5 = jimag;
+ q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+ b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L150: */
+ }
+/* L160: */
+ }
+
+/* Unscale. */
+
+ slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n,
+ info);
+ slasrt_("D", n, &d__[1], info);
+ clascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset],
+ ldb, info);
+
+ return 0;
+ }
+
+/* Book-keeping and setting up some constants. */
+
+ nlvl = (integer) (log((real) (*n) / (real) (*smlsiz + 1)) / log(2.f)) + 1;
+
+ smlszp = *smlsiz + 1;
+
+ u = 1;
+ vt = *smlsiz * *n + 1;
+ difl = vt + smlszp * *n;
+ difr = difl + nlvl * *n;
+ z__ = difr + (nlvl * *n << 1);
+ c__ = z__ + nlvl * *n;
+ s = c__ + *n;
+ poles = s + *n;
+ givnum = poles + (nlvl << 1) * *n;
+ nrwork = givnum + (nlvl << 1) * *n;
+ bx = 1;
+
+ irwrb = nrwork;
+ irwib = irwrb + *smlsiz * *nrhs;
+ irwb = irwib + *smlsiz * *nrhs;
+
+ sizei = *n + 1;
+ k = sizei + *n;
+ givptr = k + *n;
+ perm = givptr + *n;
+ givcol = perm + nlvl * *n;
+ iwk = givcol + (nlvl * *n << 1);
+
+ st = 1;
+ sqre = 0;
+ icmpq1 = 1;
+ icmpq2 = 0;
+ nsub = 0;
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = d__[i__], dabs(r__1)) < eps) {
+ d__[i__] = r_sign(&eps, &d__[i__]);
+ }
+/* L170: */
+ }
+
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) {
+ ++nsub;
+ iwork[nsub] = st;
+
+/*
+ Subproblem found. First determine its size and then
+ apply divide and conquer on it.
+*/
+
+ if (i__ < nm1) {
+
+/* A subproblem with E(I) small for I < NM1. */
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else if ((r__1 = e[i__], dabs(r__1)) >= eps) {
+
+/* A subproblem with E(NM1) not too small but I = NM1. */
+
+ nsize = *n - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else {
+
+/*
+ A subproblem with E(NM1) small. This implies an
+ 1-by-1 subproblem at D(N), which is not solved
+ explicitly.
+*/
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ ++nsub;
+ iwork[nsub] = *n;
+ iwork[sizei + nsub - 1] = 1;
+ ccopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n);
+ }
+ st1 = st - 1;
+ if (nsize == 1) {
+
+/*
+ This is a 1-by-1 subproblem and is not solved
+ explicitly.
+*/
+
+ ccopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n);
+ } else if (nsize <= *smlsiz) {
+
+/* This is a small subproblem and is solved by SLASDQ. */
+
+ slaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[vt +
+ st1], n);
+ slaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[u +
+ st1], n);
+ slasdq_("U", &c__0, &nsize, &nsize, &nsize, &c__0, &d__[st], &
+ e[st], &rwork[vt + st1], n, &rwork[u + st1], n, &
+ rwork[nrwork], &c__1, &rwork[nrwork], info)
+ ;
+ if (*info != 0) {
+ return 0;
+ }
+
+/*
+ In the real version, B is passed to SLASDQ and multiplied
+ internally by Q'. Here B is complex and that product is
+ computed below in two steps (real and imaginary parts).
+*/
+
+ j = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L180: */
+ }
+/* L190: */
+ }
+ sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u +
+ st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb],
+ &nsize);
+ j = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L200: */
+ }
+/* L210: */
+ }
+ sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u +
+ st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib],
+ &nsize);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * b_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L220: */
+ }
+/* L230: */
+ }
+
+ clacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx +
+ st1], n);
+ } else {
+
+/* A large problem. Solve it using divide and conquer. */
+
+ slasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], &
+ rwork[u + st1], n, &rwork[vt + st1], &iwork[k + st1],
+ &rwork[difl + st1], &rwork[difr + st1], &rwork[z__ +
+ st1], &rwork[poles + st1], &iwork[givptr + st1], &
+ iwork[givcol + st1], n, &iwork[perm + st1], &rwork[
+ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &
+ rwork[nrwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ bxst = bx + st1;
+ clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, &
+ work[bxst], n, &rwork[u + st1], n, &rwork[vt + st1], &
+ iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1]
+ , &rwork[z__ + st1], &rwork[poles + st1], &iwork[
+ givptr + st1], &iwork[givcol + st1], n, &iwork[perm +
+ st1], &rwork[givnum + st1], &rwork[c__ + st1], &rwork[
+ s + st1], &rwork[nrwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+ st = i__ + 1;
+ }
+/* L240: */
+ }
+
+/* Apply the singular values and treat the tiny ones as zero. */
+
+ tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Some of the elements in D can be negative because 1-by-1
+ subproblems were not solved explicitly.
+*/
+
+ if ((r__1 = d__[i__], dabs(r__1)) <= tol) {
+ claset_("A", &c__1, nrhs, &c_b56, &c_b56, &work[bx + i__ - 1], n);
+ } else {
+ ++(*rank);
+ clascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, &
+ work[bx + i__ - 1], n, info);
+ }
+ d__[i__] = (r__1 = d__[i__], dabs(r__1));
+/* L250: */
+ }
+
+/* Now apply back the right singular vectors. */
+
+ icmpq2 = 1;
+ i__1 = nsub;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ st = iwork[i__];
+ st1 = st - 1;
+ nsize = iwork[sizei + i__ - 1];
+ bxst = bx + st1;
+ if (nsize == 1) {
+ ccopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb);
+ } else if (nsize <= *smlsiz) {
+
+/*
+ Since B and BX are complex, the following call to SGEMM
+ is performed in two steps (real and imaginary parts).
+
+ CALL SGEMM( 'T', 'N', NSIZE, NRHS, NSIZE, ONE,
+ $ RWORK( VT+ST1 ), N, RWORK( BXST ), N, ZERO,
+ $ B( ST, 1 ), LDB )
+*/
+
+ j = bxst - *n - 1;
+ jreal = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ j += *n;
+ i__3 = nsize;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++jreal;
+ i__4 = j + jrow;
+ rwork[jreal] = work[i__4].r;
+/* L260: */
+ }
+/* L270: */
+ }
+ sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1],
+ n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb], &nsize);
+ j = bxst - *n - 1;
+ jimag = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ j += *n;
+ i__3 = nsize;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++jimag;
+ rwork[jimag] = r_imag(&work[j + jrow]);
+/* L280: */
+ }
+/* L290: */
+ }
+ sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1],
+ n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib], &nsize);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * b_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+ b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L300: */
+ }
+/* L310: */
+ }
+ } else {
+ clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st +
+ b_dim1], ldb, &rwork[u + st1], n, &rwork[vt + st1], &
+ iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1], &
+ rwork[z__ + st1], &rwork[poles + st1], &iwork[givptr +
+ st1], &iwork[givcol + st1], n, &iwork[perm + st1], &rwork[
+ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &rwork[
+ nrwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+/* L320: */
+ }
+
+/* Unscale and sort the singular values. */
+
+ slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n, info);
+ slasrt_("D", n, &d__[1], info);
+ clascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset], ldb,
+ info);
+
+ return 0;
+
+/* End of CLALSD */
+
+} /* clalsd_ */
+
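+/*
+   Illustrative sketch, not part of the LAPACK sources: the pattern used
+   repeatedly in CLALSD above, where a product of a real matrix A' (n-by-n)
+   with a complex right-hand-side matrix B (n-by-nrhs) is formed by two real
+   SGEMM calls, one acting on Re(B) and one on Im(B).  Naive loops stand in
+   for SGEMM here; the f2c typedefs (integer, real, complex) used throughout
+   this file are assumed, matrices are column-major, and the helper name is
+   hypothetical.
+*/
+static void real_transpose_times_complex(integer n, integer nrhs,
+        const real *a, const complex *b, complex *x)
+{
+    integer i, j, k;
+
+    for (j = 0; j < nrhs; ++j) {
+        for (i = 0; i < n; ++i) {
+            real re = 0.f, im = 0.f;
+            for (k = 0; k < n; ++k) {
+                /* A'(i,k) = A(k,i); the real and imaginary parts of B
+                   accumulate independently, which is what allows the two
+                   separate real GEMM calls in CLALSD. */
+                re += a[k + i * n] * b[k + j * n].r;
+                im += a[k + i * n] * b[k + j * n].i;
+            }
+            x[i + j * n].r = re;
+            x[i + j * n].i = im;
+        }
+    }
+}
+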
+doublereal clange_(char *norm, integer *m, integer *n, complex *a, integer *
+ lda, real *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real ret_val, r__1, r__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static real sum, scale;
+ extern logical lsame_(char *, char *);
+ static real value;
+ extern /* Subroutine */ int classq_(integer *, complex *, integer *, real
+ *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLANGE returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ complex matrix A.
+
+ Description
+ ===========
+
+ CLANGE returns the value
+
+ CLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in CLANGE as described
+ above.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0. When M = 0,
+ CLANGE is set to zero.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0. When N = 0,
+ CLANGE is set to zero.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(M,1).
+
+ WORK (workspace) REAL array, dimension (MAX(1,LWORK)),
+ where LWORK >= M when NORM = 'I'; otherwise, WORK is not
+ referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (min(*m,*n) == 0) {
+ value = 0.f;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = c_abs(&a[i__ + j * a_dim1]);
+ value = dmax(r__1,r__2);
+/* L10: */
+ }
+/* L20: */
+ }
+ } else if (lsame_(norm, "O") || *(unsigned char *)
+ norm == '1') {
+
+/* Find norm1(A). */
+
+ value = 0.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.f;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ sum += c_abs(&a[i__ + j * a_dim1]);
+/* L30: */
+ }
+ value = dmax(value,sum);
+/* L40: */
+ }
+ } else if (lsame_(norm, "I")) {
+
+/* Find normI(A). */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.f;
+/* L50: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[i__] += c_abs(&a[i__ + j * a_dim1]);
+/* L60: */
+ }
+/* L70: */
+ }
+ value = 0.f;
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = work[i__];
+ value = dmax(r__1,r__2);
+/* L80: */
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.f;
+ sum = 1.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ classq_(m, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L90: */
+ }
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of CLANGE */
+
+} /* clange_ */
+
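+/*
+   Illustrative sketch, not part of the LAPACK sources: a minimal call of the
+   CLANGE above for the one norm of a 2-by-3 complex matrix stored
+   column-major.  WORK is only referenced when NORM = 'I', so a null pointer
+   is passed here; the f2c typedefs and the example values are assumptions.
+*/
+static real example_clange_one_norm(void)
+{
+    /* A = [ 1     2i    3   ]
+           [ i     2    -4i  ]   stored column by column */
+    static complex a[6] = {
+        {1.f, 0.f}, {0.f, 1.f},    /* column 1 */
+        {0.f, 2.f}, {2.f, 0.f},    /* column 2 */
+        {3.f, 0.f}, {0.f, -4.f}    /* column 3 */
+    };
+    integer m = 2, n = 3, lda = 2;
+
+    /* One norm = largest column sum of absolute values; column 3 gives 7. */
+    return (real) clange_("O", &m, &n, a, &lda, (real *) 0);
+}
+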
+doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
+ lda, real *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real ret_val, r__1, r__2, r__3;
+
+ /* Local variables */
+ static integer i__, j;
+ static real sum, absa, scale;
+ extern logical lsame_(char *, char *);
+ static real value;
+ extern /* Subroutine */ int classq_(integer *, complex *, integer *, real
+ *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLANHE returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ complex hermitian matrix A.
+
+ Description
+ ===========
+
+ CLANHE returns the value
+
+ CLANHE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in CLANHE as described
+ above.
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ hermitian matrix A is to be referenced.
+ = 'U': Upper triangular part of A is referenced
+ = 'L': Lower triangular part of A is referenced
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0. When N = 0, CLANHE is
+ set to zero.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The hermitian matrix A. If UPLO = 'U', the leading n by n
+ upper triangular part of A contains the upper triangular part
+ of the matrix A, and the strictly lower triangular part of A
+ is not referenced. If UPLO = 'L', the leading n by n lower
+ triangular part of A contains the lower triangular part of
+ the matrix A, and the strictly upper triangular part of A is
+ not referenced. Note that the imaginary parts of the diagonal
+ elements need not be set and are assumed to be zero.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(N,1).
+
+ WORK (workspace) REAL array, dimension (MAX(1,LWORK)),
+ where LWORK >= N when NORM = 'I' or '1' or 'O'; otherwise,
+ WORK is not referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (*n == 0) {
+ value = 0.f;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.f;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = c_abs(&a[i__ + j * a_dim1]);
+ value = dmax(r__1,r__2);
+/* L10: */
+ }
+/* Computing MAX */
+ i__2 = j + j * a_dim1;
+ r__2 = value, r__3 = (r__1 = a[i__2].r, dabs(r__1));
+ value = dmax(r__2,r__3);
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = j + j * a_dim1;
+ r__2 = value, r__3 = (r__1 = a[i__2].r, dabs(r__1));
+ value = dmax(r__2,r__3);
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = c_abs(&a[i__ + j * a_dim1]);
+ value = dmax(r__1,r__2);
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(norm, "I") || lsame_(norm, "O") || *(unsigned char *)norm == '1') {
+
+/* Find normI(A) ( = norm1(A), since A is hermitian). */
+
+ value = 0.f;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.f;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ absa = c_abs(&a[i__ + j * a_dim1]);
+ sum += absa;
+ work[i__] += absa;
+/* L50: */
+ }
+ i__2 = j + j * a_dim1;
+ work[j] = sum + (r__1 = a[i__2].r, dabs(r__1));
+/* L60: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = work[i__];
+ value = dmax(r__1,r__2);
+/* L70: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.f;
+/* L80: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + j * a_dim1;
+ sum = work[j] + (r__1 = a[i__2].r, dabs(r__1));
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ absa = c_abs(&a[i__ + j * a_dim1]);
+ sum += absa;
+ work[i__] += absa;
+/* L90: */
+ }
+ value = dmax(value,sum);
+/* L100: */
+ }
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.f;
+ sum = 1.f;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ i__2 = j - 1;
+ classq_(&i__2, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L110: */
+ }
+ } else {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n - j;
+ classq_(&i__2, &a[j + 1 + j * a_dim1], &c__1, &scale, &sum);
+/* L120: */
+ }
+ }
+ sum *= 2;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ if (a[i__2].r != 0.f) {
+ i__2 = i__ + i__ * a_dim1;
+ absa = (r__1 = a[i__2].r, dabs(r__1));
+ if (scale < absa) {
+/* Computing 2nd power */
+ r__1 = scale / absa;
+ sum = sum * (r__1 * r__1) + 1.f;
+ scale = absa;
+ } else {
+/* Computing 2nd power */
+ r__1 = absa / scale;
+ sum += r__1 * r__1;
+ }
+ }
+/* L130: */
+ }
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of CLANHE */
+
+} /* clanhe_ */
+
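+/*
+   Illustrative sketch, not part of the LAPACK sources: CLANHE above reads
+   only the triangle selected by UPLO, and for a Hermitian matrix the one
+   norm and the infinity norm coincide, which is why NORM = 'I', 'O' and '1'
+   share a single branch.  The f2c typedefs and the example values are
+   assumptions; the strictly lower triangle below is deliberately filler.
+*/
+static real example_clanhe_inf_norm(void)
+{
+    /* Upper triangle of A = [ 2      1-i ]
+                             [ 1+i    3   ];  the (2,1) slot is never read. */
+    static complex a[4] = {
+        {2.f, 0.f}, {99.f, 99.f},   /* column 1: a11, unreferenced filler */
+        {1.f, -1.f}, {3.f, 0.f}     /* column 2: a12, a22 */
+    };
+    integer n = 2, lda = 2;
+    real work[2];   /* referenced because NORM = 'I' needs WORK(1:N) */
+
+    /* Row sums are 2 + sqrt(2) and 3 + sqrt(2); the result is 3 + sqrt(2). */
+    return (real) clanhe_("I", "U", &n, a, &lda, work);
+}
+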
+/* Subroutine */ int claqr0_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w,
+ integer *iloz, integer *ihiz, complex *z__, integer *ldz, complex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8;
+ complex q__1, q__2, q__3, q__4, q__5;
+
+ /* Local variables */
+ static integer i__, k;
+ static real s;
+ static complex aa, bb, cc, dd;
+ static integer ld, nh, it, ks, kt, ku, kv, ls, ns, nw;
+ static complex tr2, det;
+ static integer inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl, kbot,
+ nmin;
+ static complex swap;
+ static integer ktop;
+ static complex zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int claqr3_(logical *, logical *, integer *,
+ integer *, integer *, integer *, complex *, integer *, integer *,
+ integer *, complex *, integer *, integer *, integer *, complex *,
+ complex *, integer *, integer *, complex *, integer *, integer *,
+ complex *, integer *, complex *, integer *), claqr4_(logical *,
+ logical *, integer *, integer *, integer *, complex *, integer *,
+ complex *, integer *, integer *, complex *, integer *, complex *,
+ integer *, integer *), claqr5_(logical *, logical *, integer *,
+ integer *, integer *, integer *, integer *, complex *, complex *,
+ integer *, integer *, integer *, complex *, integer *, complex *,
+ integer *, complex *, integer *, integer *, complex *, integer *,
+ integer *, complex *, integer *);
+ static integer nibble;
+ extern /* Subroutine */ int clahqr_(logical *, logical *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, complex *, integer *, integer *), clacpy_(char *,
+ integer *, integer *, complex *, integer *, complex *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ static complex rtdisc;
+ static integer nwupbd;
+ static logical sorted;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLAQR0 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**H, where T is an upper triangular matrix (the
+ Schur form), and Z is the unitary matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input unitary
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the unitary matrix Q: A = Q*H*Q**H = (QZ)*T*(QZ)**H.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to CGEBAL, and then passed to CGEHRD when the
+ matrix output by CGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) COMPLEX array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H
+ contains the upper triangular matrix T from the Schur
+ decomposition (the Schur form). If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ W (output) COMPLEX array, dimension (N)
+ The computed eigenvalues of H(ILO:IHI,ILO:IHI) are stored
+ in W(ILO:IHI). If WANTT is .TRUE., then the eigenvalues are
+ stored in the same order as on the diagonal of the Schur
+ form returned in H, with W(i) = H(i,i).
+
+ Z (input/output) COMPLEX array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. If WANTZ is .TRUE.
+ then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) COMPLEX array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient, but LWORK typically as large as 6*N may
+ be required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then CLAQR0 does a workspace query.
+ In this case, CLAQR0 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, CLAQR0 failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of W
+ contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is a unitary matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+ (final value of Z(ILO:IHI,ILOZ:IHIZ)
+ = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
+
+ where U is the unitary matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . CLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constant WILK1 is used to form the exceptional
+ . shifts. ====
+*/
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use CLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ clahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "CLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+ . At this point N .GT. NTINY = 11, so there is at
+ . enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "CLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to CLAQR3 ====
+*/
+
+ i__1 = nwr + 1;
+ claqr3_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[h_offset],
+ ldh, n, &h__[h_offset], ldh, n, &h__[h_offset], ldh, &work[1],
+ &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(CLAQR5, CLAQR3) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1].r;
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ return 0;
+ }
+
+/* ==== CLAHQR/CLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "CLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "CLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+ ==== Accumulate reflections during the sweep? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "CLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L80;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ i__3 = k + (k - 1) * h_dim1;
+ if (h__[i__3].r == 0.f && h__[i__3].i == 0.f) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+ . size. At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ i__2 = kwtop + (kwtop - 1) * h_dim1;
+ i__3 = kwtop - 1 + (kwtop - 2) * h_dim1;
+ if ((r__1 = h__[i__2].r, dabs(r__1)) + (r__2 = r_imag(&
+ h__[kwtop + (kwtop - 1) * h_dim1]), dabs(r__2)) >
+ (r__3 = h__[i__3].r, dabs(r__3)) + (r__4 = r_imag(
+ &h__[kwtop - 1 + (kwtop - 2) * h_dim1]), dabs(
+ r__4))) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+ . - an at-least-NW-but-more-is-better (NHV-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ claqr3_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[kv
+ + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1], ldh, &nve, &
+ h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if CLAQR3
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . CLAQR3 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+ i__2 = ks + 1;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ i__3 = i__;
+ i__4 = i__ + i__ * h_dim1;
+ i__5 = i__ + (i__ - 1) * h_dim1;
+ r__3 = ((r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[i__ + (i__ - 1) * h_dim1]), dabs(
+ r__2))) * .75f;
+ q__1.r = h__[i__4].r + r__3, q__1.i = h__[i__4].i;
+ w[i__3].r = q__1.r, w[i__3].i = q__1.i;
+ i__3 = i__ - 1;
+ i__4 = i__;
+ w[i__3].r = w[i__4].r, w[i__3].i = w[i__4].i;
+/* L30: */
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use CLAQR4 or
+ . CLAHQR on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ clacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ if (ns > nmin) {
+ claqr4_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &w[ks], &c__1, &c__1,
+ zdum, &c__1, &work[1], lwork, &inf);
+ } else {
+ clahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &w[ks], &c__1, &c__1,
+ zdum, &c__1, &inf);
+ }
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. Scale to avoid
+ . overflows, underflows and subnormals.
+ . (The scale factor S can not be zero,
+ . because H(KBOT,KBOT-1) is nonzero.) ====
+*/
+
+ if (ks >= kbot) {
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ i__3 = kbot + (kbot - 1) * h_dim1;
+ i__4 = kbot - 1 + kbot * h_dim1;
+ i__5 = kbot + kbot * h_dim1;
+ s = (r__1 = h__[i__2].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[kbot - 1 + (kbot - 1) *
+ h_dim1]), dabs(r__2)) + ((r__3 = h__[i__3]
+ .r, dabs(r__3)) + (r__4 = r_imag(&h__[
+ kbot + (kbot - 1) * h_dim1]), dabs(r__4)))
+ + ((r__5 = h__[i__4].r, dabs(r__5)) + (
+ r__6 = r_imag(&h__[kbot - 1 + kbot *
+ h_dim1]), dabs(r__6))) + ((r__7 = h__[
+ i__5].r, dabs(r__7)) + (r__8 = r_imag(&
+ h__[kbot + kbot * h_dim1]), dabs(r__8)));
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ aa.r = q__1.r, aa.i = q__1.i;
+ i__2 = kbot + (kbot - 1) * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ cc.r = q__1.r, cc.i = q__1.i;
+ i__2 = kbot - 1 + kbot * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ bb.r = q__1.r, bb.i = q__1.i;
+ i__2 = kbot + kbot * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ dd.r = q__1.r, dd.i = q__1.i;
+ q__2.r = aa.r + dd.r, q__2.i = aa.i + dd.i;
+ q__1.r = q__2.r / 2.f, q__1.i = q__2.i / 2.f;
+ tr2.r = q__1.r, tr2.i = q__1.i;
+ q__3.r = aa.r - tr2.r, q__3.i = aa.i - tr2.i;
+ q__4.r = dd.r - tr2.r, q__4.i = dd.i - tr2.i;
+ q__2.r = q__3.r * q__4.r - q__3.i * q__4.i,
+ q__2.i = q__3.r * q__4.i + q__3.i *
+ q__4.r;
+ q__5.r = bb.r * cc.r - bb.i * cc.i, q__5.i = bb.r
+ * cc.i + bb.i * cc.r;
+ q__1.r = q__2.r - q__5.r, q__1.i = q__2.i -
+ q__5.i;
+ det.r = q__1.r, det.i = q__1.i;
+ q__2.r = -det.r, q__2.i = -det.i;
+ c_sqrt(&q__1, &q__2);
+ rtdisc.r = q__1.r, rtdisc.i = q__1.i;
+ i__2 = kbot - 1;
+ q__2.r = tr2.r + rtdisc.r, q__2.i = tr2.i +
+ rtdisc.i;
+ q__1.r = s * q__2.r, q__1.i = s * q__2.i;
+ w[i__2].r = q__1.r, w[i__2].i = q__1.i;
+ i__2 = kbot;
+ q__2.r = tr2.r - rtdisc.r, q__2.i = tr2.i -
+ rtdisc.i;
+ q__1.r = s * q__2.r, q__1.i = s * q__2.i;
+ w[i__2].r = q__1.r, w[i__2].i = q__1.i;
+
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/* ==== Sort the shifts (Helps a little) ==== */
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ i__5 = i__ + 1;
+ if ((r__1 = w[i__4].r, dabs(r__1)) + (r__2 =
+ r_imag(&w[i__]), dabs(r__2)) < (r__3 =
+ w[i__5].r, dabs(r__3)) + (r__4 =
+ r_imag(&w[i__ + 1]), dabs(r__4))) {
+ sorted = FALSE_;
+ i__4 = i__;
+ swap.r = w[i__4].r, swap.i = w[i__4].i;
+ i__4 = i__;
+ i__5 = i__ + 1;
+ w[i__4].r = w[i__5].r, w[i__4].i = w[i__5]
+ .i;
+ i__4 = i__ + 1;
+ w[i__4].r = swap.r, w[i__4].i = swap.i;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+ }
+
+/*
+ ==== If there are only two shifts, then use
+ . only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ i__2 = kbot;
+ i__3 = kbot + kbot * h_dim1;
+ q__2.r = w[i__2].r - h__[i__3].r, q__2.i = w[i__2].i -
+ h__[i__3].i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+ i__4 = kbot - 1;
+ i__5 = kbot + kbot * h_dim1;
+ q__4.r = w[i__4].r - h__[i__5].r, q__4.i = w[i__4].i -
+ h__[i__5].i;
+ q__3.r = q__4.r, q__3.i = q__4.i;
+ if ((r__1 = q__1.r, dabs(r__1)) + (r__2 = r_imag(&q__1),
+ dabs(r__2)) < (r__3 = q__3.r, dabs(r__3)) + (r__4
+ = r_imag(&q__3), dabs(r__4))) {
+ i__2 = kbot - 1;
+ i__3 = kbot;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ } else {
+ i__2 = kbot;
+ i__3 = kbot - 1;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ }
+ }
+
+/*
+ ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+ . (KDU-by-NHo) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+ . (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ claqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &w[ks], &
+ h__[h_offset], ldh, iloz, ihiz, &z__[z_offset], ldz, &
+ work[1], &c__3, &h__[ku + h_dim1], ldh, &nve, &h__[
+ kwv + h_dim1], ldh, &nho, &h__[ku + kwh * h_dim1],
+ ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L70:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L80:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+
+/* ==== End of CLAQR0 ==== */
+
+ return 0;
+} /* claqr0_ */
+
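+/*
+   Illustrative sketch, not part of the LAPACK sources: the LWORK = -1
+   workspace-query convention described in the CLAQR0 header above.  A first
+   call only deposits the optimal workspace size in WORK(1); the caller then
+   allocates that much and calls again.  The wrapper name, the use of
+   malloc/free and the trivial error handling are assumptions; Schur vectors
+   are not requested, so Z is never referenced.
+*/
+#include <stdlib.h>
+
+static int example_claqr0_two_call(integer n, complex *h, integer ldh,
+        complex *w)
+{
+    integer ilo = 1, ihi = n, ldz = 1, lwork, info = 0;
+    logical wantt = TRUE_, wantz = FALSE_;
+    complex zdum[1], query;
+    complex *work;
+
+    /* First call: LWORK = -1 asks CLAQR0 only for its optimal workspace. */
+    lwork = -1;
+    claqr0_(&wantt, &wantz, &n, &ilo, &ihi, h, &ldh, w, &ilo, &ihi,
+            zdum, &ldz, &query, &lwork, &info);
+
+    /* Second call: allocate the reported amount and do the real work. */
+    lwork = (integer) query.r;
+    work = (complex *) malloc((size_t) lwork * sizeof(complex));
+    if (work == 0) {
+        return -1;
+    }
+    claqr0_(&wantt, &wantz, &n, &ilo, &ihi, h, &ldh, w, &ilo, &ihi,
+            zdum, &ldz, work, &lwork, &info);
+    free(work);
+    return (int) info;
+}
+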
+/* Subroutine */ int claqr1_(integer *n, complex *h__, integer *ldh, complex *
+ s1, complex *s2, complex *v)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+ complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8;
+
+ /* Local variables */
+ static real s;
+ static complex h21s, h31s;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Given a 2-by-2 or 3-by-3 matrix H, CLAQR1 sets v to a
+ scalar multiple of the first column of the product
+
+ (*) K = (H - s1*I)*(H - s2*I)
+
+ scaling to avoid overflows and most underflows.
+
+ This is useful for starting double implicit shift bulges
+ in the QR algorithm.
+
+
+ N (input) integer
+ Order of the matrix H. N must be either 2 or 3.
+
+ H (input) COMPLEX array of dimension (LDH,N)
+ The 2-by-2 or 3-by-3 matrix H in (*).
+
+ LDH (input) integer
+ The leading dimension of H as declared in
+ the calling procedure. LDH.GE.N
+
+ S1 (input) COMPLEX
+ S2 S1 and S2 are the shifts defining K in (*) above.
+
+ V (output) COMPLEX array of dimension N
+ A scalar multiple of the first column of the
+ matrix K in (*).
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --v;
+
+ /* Function Body */
+ if (*n == 2) {
+ i__1 = h_dim1 + 1;
+ q__2.r = h__[i__1].r - s2->r, q__2.i = h__[i__1].i - s2->i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+ i__2 = h_dim1 + 2;
+ s = (r__1 = q__1.r, dabs(r__1)) + (r__2 = r_imag(&q__1), dabs(r__2))
+ + ((r__3 = h__[i__2].r, dabs(r__3)) + (r__4 = r_imag(&h__[
+ h_dim1 + 2]), dabs(r__4)));
+ if (s == 0.f) {
+ v[1].r = 0.f, v[1].i = 0.f;
+ v[2].r = 0.f, v[2].i = 0.f;
+ } else {
+ i__1 = h_dim1 + 2;
+ q__1.r = h__[i__1].r / s, q__1.i = h__[i__1].i / s;
+ h21s.r = q__1.r, h21s.i = q__1.i;
+ i__1 = (h_dim1 << 1) + 1;
+ q__2.r = h21s.r * h__[i__1].r - h21s.i * h__[i__1].i, q__2.i =
+ h21s.r * h__[i__1].i + h21s.i * h__[i__1].r;
+ i__2 = h_dim1 + 1;
+ q__4.r = h__[i__2].r - s1->r, q__4.i = h__[i__2].i - s1->i;
+ i__3 = h_dim1 + 1;
+ q__6.r = h__[i__3].r - s2->r, q__6.i = h__[i__3].i - s2->i;
+ q__5.r = q__6.r / s, q__5.i = q__6.i / s;
+ q__3.r = q__4.r * q__5.r - q__4.i * q__5.i, q__3.i = q__4.r *
+ q__5.i + q__4.i * q__5.r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ v[1].r = q__1.r, v[1].i = q__1.i;
+ i__1 = h_dim1 + 1;
+ i__2 = (h_dim1 << 1) + 2;
+ q__4.r = h__[i__1].r + h__[i__2].r, q__4.i = h__[i__1].i + h__[
+ i__2].i;
+ q__3.r = q__4.r - s1->r, q__3.i = q__4.i - s1->i;
+ q__2.r = q__3.r - s2->r, q__2.i = q__3.i - s2->i;
+ q__1.r = h21s.r * q__2.r - h21s.i * q__2.i, q__1.i = h21s.r *
+ q__2.i + h21s.i * q__2.r;
+ v[2].r = q__1.r, v[2].i = q__1.i;
+ }
+ } else {
+ i__1 = h_dim1 + 1;
+ q__2.r = h__[i__1].r - s2->r, q__2.i = h__[i__1].i - s2->i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+ i__2 = h_dim1 + 2;
+ i__3 = h_dim1 + 3;
+ s = (r__1 = q__1.r, dabs(r__1)) + (r__2 = r_imag(&q__1), dabs(r__2))
+ + ((r__3 = h__[i__2].r, dabs(r__3)) + (r__4 = r_imag(&h__[
+ h_dim1 + 2]), dabs(r__4))) + ((r__5 = h__[i__3].r, dabs(r__5))
+ + (r__6 = r_imag(&h__[h_dim1 + 3]), dabs(r__6)));
+ if (s == 0.f) {
+ v[1].r = 0.f, v[1].i = 0.f;
+ v[2].r = 0.f, v[2].i = 0.f;
+ v[3].r = 0.f, v[3].i = 0.f;
+ } else {
+ i__1 = h_dim1 + 2;
+ q__1.r = h__[i__1].r / s, q__1.i = h__[i__1].i / s;
+ h21s.r = q__1.r, h21s.i = q__1.i;
+ i__1 = h_dim1 + 3;
+ q__1.r = h__[i__1].r / s, q__1.i = h__[i__1].i / s;
+ h31s.r = q__1.r, h31s.i = q__1.i;
+ i__1 = h_dim1 + 1;
+ q__4.r = h__[i__1].r - s1->r, q__4.i = h__[i__1].i - s1->i;
+ i__2 = h_dim1 + 1;
+ q__6.r = h__[i__2].r - s2->r, q__6.i = h__[i__2].i - s2->i;
+ q__5.r = q__6.r / s, q__5.i = q__6.i / s;
+ q__3.r = q__4.r * q__5.r - q__4.i * q__5.i, q__3.i = q__4.r *
+ q__5.i + q__4.i * q__5.r;
+ i__3 = (h_dim1 << 1) + 1;
+ q__7.r = h__[i__3].r * h21s.r - h__[i__3].i * h21s.i, q__7.i =
+ h__[i__3].r * h21s.i + h__[i__3].i * h21s.r;
+ q__2.r = q__3.r + q__7.r, q__2.i = q__3.i + q__7.i;
+ i__4 = h_dim1 * 3 + 1;
+ q__8.r = h__[i__4].r * h31s.r - h__[i__4].i * h31s.i, q__8.i =
+ h__[i__4].r * h31s.i + h__[i__4].i * h31s.r;
+ q__1.r = q__2.r + q__8.r, q__1.i = q__2.i + q__8.i;
+ v[1].r = q__1.r, v[1].i = q__1.i;
+ i__1 = h_dim1 + 1;
+ i__2 = (h_dim1 << 1) + 2;
+ q__5.r = h__[i__1].r + h__[i__2].r, q__5.i = h__[i__1].i + h__[
+ i__2].i;
+ q__4.r = q__5.r - s1->r, q__4.i = q__5.i - s1->i;
+ q__3.r = q__4.r - s2->r, q__3.i = q__4.i - s2->i;
+ q__2.r = h21s.r * q__3.r - h21s.i * q__3.i, q__2.i = h21s.r *
+ q__3.i + h21s.i * q__3.r;
+ i__3 = h_dim1 * 3 + 2;
+ q__6.r = h__[i__3].r * h31s.r - h__[i__3].i * h31s.i, q__6.i =
+ h__[i__3].r * h31s.i + h__[i__3].i * h31s.r;
+ q__1.r = q__2.r + q__6.r, q__1.i = q__2.i + q__6.i;
+ v[2].r = q__1.r, v[2].i = q__1.i;
+ i__1 = h_dim1 + 1;
+ i__2 = h_dim1 * 3 + 3;
+ q__5.r = h__[i__1].r + h__[i__2].r, q__5.i = h__[i__1].i + h__[
+ i__2].i;
+ q__4.r = q__5.r - s1->r, q__4.i = q__5.i - s1->i;
+ q__3.r = q__4.r - s2->r, q__3.i = q__4.i - s2->i;
+ q__2.r = h31s.r * q__3.r - h31s.i * q__3.i, q__2.i = h31s.r *
+ q__3.i + h31s.i * q__3.r;
+ i__3 = (h_dim1 << 1) + 3;
+ q__6.r = h21s.r * h__[i__3].r - h21s.i * h__[i__3].i, q__6.i =
+ h21s.r * h__[i__3].i + h21s.i * h__[i__3].r;
+ q__1.r = q__2.r + q__6.r, q__1.i = q__2.i + q__6.i;
+ v[3].r = q__1.r, v[3].i = q__1.i;
+ }
+ }
+ return 0;
+} /* claqr1_ */
+
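+/*
+   Illustrative sketch, not part of the LAPACK sources: the unscaled quantity
+   CLAQR1 above produces for N = 2.  For H = [h11 h12; h21 h22] and shifts
+   s1, s2, the first column of K = (H - s1*I)*(H - s2*I) is
+
+       k1 = (h11 - s1)*(h11 - s2) + h12*h21
+       k2 = h21*(h11 + h22 - s1 - s2)
+
+   and CLAQR1 returns this column divided by a scale factor chosen to avoid
+   overflow.  Plain complex arithmetic on the f2c complex struct is used
+   below; the helper name and the column-major 2-by-2 layout of h are
+   assumptions.
+*/
+static void example_first_column_of_k(const complex h[4], complex s1,
+        complex s2, complex k[2])
+{
+    complex a, b, t;
+
+    /* a = h11 - s1, b = h11 - s2 */
+    a.r = h[0].r - s1.r;  a.i = h[0].i - s1.i;
+    b.r = h[0].r - s2.r;  b.i = h[0].i - s2.i;
+
+    /* k1 = a*b + h12*h21, with h12 = h[2] and h21 = h[1] (column-major) */
+    k[0].r = a.r * b.r - a.i * b.i + (h[2].r * h[1].r - h[2].i * h[1].i);
+    k[0].i = a.r * b.i + a.i * b.r + (h[2].r * h[1].i + h[2].i * h[1].r);
+
+    /* k2 = h21 * (h11 + h22 - s1 - s2) */
+    t.r = h[0].r + h[3].r - s1.r - s2.r;
+    t.i = h[0].i + h[3].i - s1.i - s2.i;
+    k[1].r = h[1].r * t.r - h[1].i * t.i;
+    k[1].i = h[1].r * t.i + h[1].i * t.r;
+}
+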
+/* Subroutine */ int claqr2_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, complex *h__, integer *ldh,
+ integer *iloz, integer *ihiz, complex *z__, integer *ldz, integer *
+ ns, integer *nd, complex *sh, complex *v, integer *ldv, integer *nh,
+ complex *t, integer *ldt, integer *nv, complex *wv, integer *ldwv,
+ complex *work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static complex s;
+ static integer jw;
+ static real foo;
+ static integer kln;
+ static complex tau;
+ static integer knt;
+ static real ulp;
+ static integer lwk1, lwk2;
+ static complex beta;
+ static integer kcol, info, ifst, ilst, ltop, krow;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *),
+ cgemm_(char *, char *, integer *, integer *, integer *, complex *,
+ complex *, integer *, complex *, integer *, complex *, complex *,
+ integer *), ccopy_(integer *, complex *, integer
+ *, complex *, integer *);
+ static integer infqr, kwtop;
+ extern /* Subroutine */ int slabad_(real *, real *), cgehrd_(integer *,
+ integer *, integer *, complex *, integer *, complex *, complex *,
+ integer *, integer *), clarfg_(integer *, complex *, complex *,
+ integer *, complex *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int clahqr_(logical *, logical *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, complex *, integer *, integer *), clacpy_(char *,
+ integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex
+ *, complex *, integer *);
+ static real safmin, safmax;
+ extern /* Subroutine */ int ctrexc_(char *, integer *, complex *, integer
+ *, complex *, integer *, integer *, integer *, integer *),
+ cunmhr_(char *, char *, integer *, integer *, integer *, integer
+ *, complex *, integer *, complex *, complex *, integer *, complex
+ *, integer *, integer *);
+ static real smlnum;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- April 2009 --
+
+
+ This subroutine is identical to CLAQR3 except that it avoids
+ recursion by calling CLAHQR instead of CLAQR4.
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+ H and performs a unitary similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+ a unitary similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the unitary matrix Z is updated
+ so that the unitary Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the unitary matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) COMPLEX array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by a unitary
+ similarity transformation, perturbed, and then returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) COMPLEX array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the unitary
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+ The number of unconverged (i.e., approximate) eigenvalues
+ returned in SH that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SH (output) COMPLEX array, dimension KBOT
+ On output, approximate eigenvalues that may
+ be used for shifts are stored in SH(KBOT-ND-NS+1)
+ through SH(KBOT-ND). Converged eigenvalues are
+ stored in SH(KBOT-ND+1) through SH(KBOT).
+
+ V (workspace) COMPLEX array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) COMPLEX array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) COMPLEX array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+ The leading dimension of WV just as declared in the
+ calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) COMPLEX array, dimension LWORK.
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; CLAQR2
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z are accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sh;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to CGEHRD ==== */
+
+ i__1 = jw - 1;
+ cgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1].r;
+
+/* ==== Workspace query call to CUNMHR ==== */
+
+ i__1 = jw - 1;
+ cunmhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1].r;
+
+/* ==== Optimal workspace ==== */
+
+ lwkopt = jw + max(lwk1,lwk2);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1].r = 1.f, work[1].i = 0.f;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s.r = 0.f, s.i = 0.f;
+ } else {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ s.r = h__[i__1].r, s.i = h__[i__1].i;
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ i__1 = kwtop;
+ i__2 = kwtop + kwtop * h_dim1;
+ sh[i__1].r = h__[i__2].r, sh[i__1].i = h__[i__2].i;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ i__1 = kwtop + kwtop * h_dim1;
+ r__5 = smlnum, r__6 = ulp * ((r__1 = h__[i__1].r, dabs(r__1)) + (r__2
+ = r_imag(&h__[kwtop + kwtop * h_dim1]), dabs(r__2)));
+ if ((r__3 = s.r, dabs(r__3)) + (r__4 = r_imag(&s), dabs(r__4)) <=
+ dmax(r__5,r__6)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ h__[i__1].r = 0.f, h__[i__1].i = 0.f;
+ }
+ }
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ clacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ ccopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ claset_("A", &jw, &jw, &c_b56, &c_b57, &v[v_offset], ldv);
+ clahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[kwtop],
+ &c__1, &jw, &v[v_offset], ldv, &infqr);
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+ i__1 = jw;
+ for (knt = infqr + 1; knt <= i__1; ++knt) {
+
+/* ==== Small spike tip deflation test ==== */
+
+ i__2 = *ns + *ns * t_dim1;
+ foo = (r__1 = t[i__2].r, dabs(r__1)) + (r__2 = r_imag(&t[*ns + *ns *
+ t_dim1]), dabs(r__2));
+ if (foo == 0.f) {
+ foo = (r__1 = s.r, dabs(r__1)) + (r__2 = r_imag(&s), dabs(r__2));
+ }
+ i__2 = *ns * v_dim1 + 1;
+/* Computing MAX */
+ r__5 = smlnum, r__6 = ulp * foo;
+ if (((r__1 = s.r, dabs(r__1)) + (r__2 = r_imag(&s), dabs(r__2))) * ((
+ r__3 = v[i__2].r, dabs(r__3)) + (r__4 = r_imag(&v[*ns *
+ v_dim1 + 1]), dabs(r__4))) <= dmax(r__5,r__6)) {
+
+/* ==== One more converged eigenvalue ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== One undeflatable eigenvalue. Move it up out of the
+ . way. (CTREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ ctrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst, &
+ ilst, &info);
+ ++ilst;
+ }
+/* L10: */
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s.r = 0.f, s.i = 0.f;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting the diagonal of T improves accuracy for
+ . graded matrices. ====
+*/
+
+ i__1 = *ns;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ ifst = i__;
+ i__2 = *ns;
+ for (j = i__ + 1; j <= i__2; ++j) {
+ i__3 = j + j * t_dim1;
+ i__4 = ifst + ifst * t_dim1;
+ if ((r__1 = t[i__3].r, dabs(r__1)) + (r__2 = r_imag(&t[j + j *
+ t_dim1]), dabs(r__2)) > (r__3 = t[i__4].r, dabs(r__3)
+ ) + (r__4 = r_imag(&t[ifst + ifst * t_dim1]), dabs(
+ r__4))) {
+ ifst = j;
+ }
+/* L20: */
+ }
+ ilst = i__;
+ if (ifst != ilst) {
+ ctrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &info);
+ }
+/* L30: */
+ }
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__1 = jw;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ i__2 = kwtop + i__ - 1;
+ i__3 = i__ + i__ * t_dim1;
+ sh[i__2].r = t[i__3].r, sh[i__2].i = t[i__3].i;
+/* L40: */
+ }
+
+
+ if (*ns < jw || s.r == 0.f && s.i == 0.f) {
+ if (*ns > 1 && (s.r != 0.f || s.i != 0.f)) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
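+/*
+   ==== (A Householder reflector, generated by CLARFG from the
+   .    conjugated first row of V, condenses the spike into its
+   .    first entry; it is applied to T from both sides and to V
+   .    from the right, and the CGEHRD call then restores T to
+   .    Hessenberg form.) ====
+*/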
+ ccopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ i__1 = *ns;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ r_cnjg(&q__1, &work[i__]);
+ work[i__2].r = q__1.r, work[i__2].i = q__1.i;
+/* L50: */
+ }
+ beta.r = work[1].r, beta.i = work[1].i;
+ clarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1].r = 1.f, work[1].i = 0.f;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ claset_("L", &i__1, &i__2, &c_b56, &c_b56, &t[t_dim1 + 3], ldt);
+
+ r_cnjg(&q__1, &tau);
+ clarf_("L", ns, &jw, &work[1], &c__1, &q__1, &t[t_offset], ldt, &
+ work[jw + 1]);
+ clarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ clarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ cgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ r_cnjg(&q__2, &v[v_dim1 + 1]);
+ q__1.r = s.r * q__2.r - s.i * q__2.i, q__1.i = s.r * q__2.i + s.i
+ * q__2.r;
+ h__[i__1].r = q__1.r, h__[i__1].i = q__1.i;
+ }
+ clacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ ccopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+   ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && (s.r != 0.f || s.i != 0.f)) {
+ i__1 = *lwork - jw;
+ cunmhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ cgemm_("N", "N", &kln, &jw, &jw, &c_b57, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b56, &wv[wv_offset],
+ ldwv);
+ clacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L60: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ cgemm_("C", "N", &jw, &kln, &jw, &c_b57, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b56, &t[t_offset],
+ ldt);
+ clacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L70: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ cgemm_("N", "N", &kln, &jw, &jw, &c_b57, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b56, &wv[
+ wv_offset], ldwv);
+ clacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L80: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+
+/* ==== End of CLAQR2 ==== */
+
+ return 0;
+} /* claqr2_ */
+
+/* Subroutine */ int claqr3_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, complex *h__, integer *ldh,
+ integer *iloz, integer *ihiz, complex *z__, integer *ldz, integer *
+ ns, integer *nd, complex *sh, complex *v, integer *ldv, integer *nh,
+ complex *t, integer *ldt, integer *nv, complex *wv, integer *ldwv,
+ complex *work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static complex s;
+ static integer jw;
+ static real foo;
+ static integer kln;
+ static complex tau;
+ static integer knt;
+ static real ulp;
+ static integer lwk1, lwk2, lwk3;
+ static complex beta;
+ static integer kcol, info, nmin, ifst, ilst, ltop, krow;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *),
+ cgemm_(char *, char *, integer *, integer *, integer *, complex *,
+ complex *, integer *, complex *, integer *, complex *, complex *,
+ integer *), ccopy_(integer *, complex *, integer
+ *, complex *, integer *);
+ static integer infqr, kwtop;
+ extern /* Subroutine */ int claqr4_(logical *, logical *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, complex *, integer *, complex *, integer *, integer *),
+ slabad_(real *, real *), cgehrd_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, integer *)
+ , clarfg_(integer *, complex *, complex *, integer *, complex *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int clahqr_(logical *, logical *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, complex *, integer *, integer *), clacpy_(char *,
+ integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex
+ *, complex *, integer *);
+ static real safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static real safmax;
+ extern /* Subroutine */ int ctrexc_(char *, integer *, complex *, integer
+ *, complex *, integer *, integer *, integer *, integer *),
+ cunmhr_(char *, char *, integer *, integer *, integer *, integer
+ *, complex *, integer *, complex *, complex *, integer *, complex
+ *, integer *, integer *);
+ static real smlnum;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- April 2009 --
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+   H and performs a unitary similarity transformation
+   designed to detect and deflate fully converged eigenvalues from
+   a trailing principal submatrix. On output H has been over-
+   written by a new Hessenberg matrix that is a perturbation of
+   a unitary similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+          If .TRUE., then the unitary matrix Z is updated
+          so that the unitary Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the unitary matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) COMPLEX array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by a unitary
+          similarity transformation, perturbed, and then returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) COMPLEX array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the unitary
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+          The number of unconverged (i.e., approximate) eigenvalues
+ returned in SR and SI that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SH (output) COMPLEX array, dimension KBOT
+ On output, approximate eigenvalues that may
+ be used for shifts are stored in SH(KBOT-ND-NS+1)
+          through SH(KBOT-ND). Converged eigenvalues are
+ stored in SH(KBOT-ND+1) through SH(KBOT).
+
+ V (workspace) COMPLEX array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) COMPLEX array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) COMPLEX array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+          The leading dimension of WV just as declared in the
+          calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) COMPLEX array, dimension LWORK.
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; CLAQR3
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z are accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sh;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
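+/*
+   ==== (Each routine below is called with LWORK = -1, so it only
+   .    reports its optimal workspace in WORK(1); the three
+   .    estimates are combined into LWKOPT.) ====
+*/
+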
+/* ==== Workspace query call to CGEHRD ==== */
+
+ i__1 = jw - 1;
+ cgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1].r;
+
+/* ==== Workspace query call to CUNMHR ==== */
+
+ i__1 = jw - 1;
+ cunmhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1].r;
+
+/* ==== Workspace query call to CLAQR4 ==== */
+
+ claqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[1],
+ &c__1, &jw, &v[v_offset], ldv, &work[1], &c_n1, &infqr);
+ lwk3 = (integer) work[1].r;
+
+/*
+ ==== Optimal workspace ====
+
+ Computing MAX
+*/
+ i__1 = jw + max(lwk1,lwk2);
+ lwkopt = max(i__1,lwk3);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1].r = 1.f, work[1].i = 0.f;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s.r = 0.f, s.i = 0.f;
+ } else {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ s.r = h__[i__1].r, s.i = h__[i__1].i;
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ i__1 = kwtop;
+ i__2 = kwtop + kwtop * h_dim1;
+ sh[i__1].r = h__[i__2].r, sh[i__1].i = h__[i__2].i;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ i__1 = kwtop + kwtop * h_dim1;
+ r__5 = smlnum, r__6 = ulp * ((r__1 = h__[i__1].r, dabs(r__1)) + (r__2
+ = r_imag(&h__[kwtop + kwtop * h_dim1]), dabs(r__2)));
+ if ((r__3 = s.r, dabs(r__3)) + (r__4 = r_imag(&s), dabs(r__4)) <=
+ dmax(r__5,r__6)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ h__[i__1].r = 0.f, h__[i__1].i = 0.f;
+ }
+ }
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ clacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ ccopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ claset_("A", &jw, &jw, &c_b56, &c_b57, &v[v_offset], ldv);
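+
+/*
+   ==== (ILAENV ISPEC=12 gives the crossover point NMIN: windows
+   .    larger than NMIN use the recursive multi-shift CLAQR4,
+   .    smaller ones use CLAHQR.) ====
+*/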
+ nmin = ilaenv_(&c__12, "CLAQR3", "SV", &jw, &c__1, &jw, lwork, (ftnlen)6,
+ (ftnlen)2);
+ if (jw > nmin) {
+ claqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[
+ kwtop], &c__1, &jw, &v[v_offset], ldv, &work[1], lwork, &
+ infqr);
+ } else {
+ clahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[
+ kwtop], &c__1, &jw, &v[v_offset], ldv, &infqr);
+ }
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+ i__1 = jw;
+ for (knt = infqr + 1; knt <= i__1; ++knt) {
+
+/* ==== Small spike tip deflation test ==== */
+
+ i__2 = *ns + *ns * t_dim1;
+ foo = (r__1 = t[i__2].r, dabs(r__1)) + (r__2 = r_imag(&t[*ns + *ns *
+ t_dim1]), dabs(r__2));
+ if (foo == 0.f) {
+ foo = (r__1 = s.r, dabs(r__1)) + (r__2 = r_imag(&s), dabs(r__2));
+ }
+ i__2 = *ns * v_dim1 + 1;
+/* Computing MAX */
+ r__5 = smlnum, r__6 = ulp * foo;
+ if (((r__1 = s.r, dabs(r__1)) + (r__2 = r_imag(&s), dabs(r__2))) * ((
+ r__3 = v[i__2].r, dabs(r__3)) + (r__4 = r_imag(&v[*ns *
+ v_dim1 + 1]), dabs(r__4))) <= dmax(r__5,r__6)) {
+
+/* ==== One more converged eigenvalue ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== One undeflatable eigenvalue. Move it up out of the
+ . way. (CTREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ ctrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst, &
+ ilst, &info);
+ ++ilst;
+ }
+/* L10: */
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s.r = 0.f, s.i = 0.f;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting the diagonal of T improves accuracy for
+ . graded matrices. ====
+*/
+
+ i__1 = *ns;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ ifst = i__;
+ i__2 = *ns;
+ for (j = i__ + 1; j <= i__2; ++j) {
+ i__3 = j + j * t_dim1;
+ i__4 = ifst + ifst * t_dim1;
+ if ((r__1 = t[i__3].r, dabs(r__1)) + (r__2 = r_imag(&t[j + j *
+ t_dim1]), dabs(r__2)) > (r__3 = t[i__4].r, dabs(r__3)
+ ) + (r__4 = r_imag(&t[ifst + ifst * t_dim1]), dabs(
+ r__4))) {
+ ifst = j;
+ }
+/* L20: */
+ }
+ ilst = i__;
+ if (ifst != ilst) {
+ ctrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &info);
+ }
+/* L30: */
+ }
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__1 = jw;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ i__2 = kwtop + i__ - 1;
+ i__3 = i__ + i__ * t_dim1;
+ sh[i__2].r = t[i__3].r, sh[i__2].i = t[i__3].i;
+/* L40: */
+ }
+
+
+ if (*ns < jw || s.r == 0.f && s.i == 0.f) {
+ if (*ns > 1 && (s.r != 0.f || s.i != 0.f)) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ ccopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ i__1 = *ns;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ r_cnjg(&q__1, &work[i__]);
+ work[i__2].r = q__1.r, work[i__2].i = q__1.i;
+/* L50: */
+ }
+ beta.r = work[1].r, beta.i = work[1].i;
+ clarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1].r = 1.f, work[1].i = 0.f;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ claset_("L", &i__1, &i__2, &c_b56, &c_b56, &t[t_dim1 + 3], ldt);
+
+ r_cnjg(&q__1, &tau);
+ clarf_("L", ns, &jw, &work[1], &c__1, &q__1, &t[t_offset], ldt, &
+ work[jw + 1]);
+ clarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ clarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ cgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
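+/*
+   ==== (The surviving subdiagonal entry H(KWTOP,KWTOP-1) becomes
+   .    S*CONJG(V(1,1)), the first entry of the reflected spike.) ====
+*/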
+ if (kwtop > 1) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ r_cnjg(&q__2, &v[v_dim1 + 1]);
+ q__1.r = s.r * q__2.r - s.i * q__2.i, q__1.i = s.r * q__2.i + s.i
+ * q__2.r;
+ h__[i__1].r = q__1.r, h__[i__1].i = q__1.i;
+ }
+ clacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ ccopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+   ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && (s.r != 0.f || s.i != 0.f)) {
+ i__1 = *lwork - jw;
+ cunmhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ cgemm_("N", "N", &kln, &jw, &jw, &c_b57, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b56, &wv[wv_offset],
+ ldwv);
+ clacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L60: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ cgemm_("C", "N", &jw, &kln, &jw, &c_b57, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b56, &t[t_offset],
+ ldt);
+ clacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L70: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ cgemm_("N", "N", &kln, &jw, &jw, &c_b57, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b56, &wv[
+ wv_offset], ldwv);
+ clacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L80: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+
+/* ==== End of CLAQR3 ==== */
+
+ return 0;
+} /* claqr3_ */
+
+/* Subroutine */ int claqr4_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w,
+ integer *iloz, integer *ihiz, complex *z__, integer *ldz, complex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8;
+ complex q__1, q__2, q__3, q__4, q__5;
+
+ /* Local variables */
+ static integer i__, k;
+ static real s;
+ static complex aa, bb, cc, dd;
+ static integer ld, nh, it, ks, kt, ku, kv, ls, ns, nw;
+ static complex tr2, det;
+ static integer inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl, kbot,
+ nmin;
+ static complex swap;
+ static integer ktop;
+ static complex zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int claqr2_(logical *, logical *, integer *,
+ integer *, integer *, integer *, complex *, integer *, integer *,
+ integer *, complex *, integer *, integer *, integer *, complex *,
+ complex *, integer *, integer *, complex *, integer *, integer *,
+ complex *, integer *, complex *, integer *), claqr5_(logical *,
+ logical *, integer *, integer *, integer *, integer *, integer *,
+ complex *, complex *, integer *, integer *, integer *, complex *,
+ integer *, complex *, integer *, complex *, integer *, integer *,
+ complex *, integer *, integer *, complex *, integer *);
+ static integer nibble;
+ extern /* Subroutine */ int clahqr_(logical *, logical *, integer *,
+ integer *, integer *, complex *, integer *, complex *, integer *,
+ integer *, complex *, integer *, integer *), clacpy_(char *,
+ integer *, integer *, complex *, integer *, complex *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ static complex rtdisc;
+ static integer nwupbd;
+ static logical sorted;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ This subroutine implements one level of recursion for CLAQR0.
+ It is a complete implementation of the small bulge multi-shift
+ QR algorithm. It may be called by CLAQR0 and, for large enough
+ deflation window size, it may be called by CLAQR3. This
+ subroutine is identical to CLAQR0 except that it calls CLAQR2
+ instead of CLAQR3.
+
+ Purpose
+ =======
+
+ CLAQR4 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**H, where T is an upper triangular matrix (the
+ Schur form), and Z is the unitary matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input unitary
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the unitary matrix Q: A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to CGEBAL, and then passed to CGEHRD when the
+ matrix output by CGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) COMPLEX array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H
+ contains the upper triangular matrix T from the Schur
+          decomposition (the Schur form). If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ W (output) COMPLEX array, dimension (N)
+ The computed eigenvalues of H(ILO:IHI,ILO:IHI) are stored
+ in W(ILO:IHI). If WANTT is .TRUE., then the eigenvalues are
+ stored in the same order as on the diagonal of the Schur
+ form returned in H, with W(i) = H(i,i).
+
+ Z (input/output) COMPLEX array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+          The leading dimension of the array Z. If WANTZ is .TRUE.
+          then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) COMPLEX array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient, but LWORK typically as large as 6*N may
+ be required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then CLAQR4 does a workspace query.
+ In this case, CLAQR4 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, CLAQR4 failed to compute all of
+                the eigenvalues. Elements 1:ilo-1 and i+1:n of W
+                contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+          If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is a unitary matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+                (final value of Z(ILO:IHI,ILOZ:IHIZ))
+                 = (initial value of Z(ILO:IHI,ILOZ:IHIZ))*U
+
+ where U is the unitary matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . CLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constant WILK1 is used to form the exceptional
+ . shifts. ====
+*/
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use CLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ clahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "CLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+   .    At this point N .GT. NTINY = 11, so there is
+ . enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "CLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to CLAQR2 ====
+*/
+
+ i__1 = nwr + 1;
+ claqr2_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[h_offset],
+ ldh, n, &h__[h_offset], ldh, n, &h__[h_offset], ldh, &work[1],
+ &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(CLAQR5, CLAQR2) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1].r;
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+ return 0;
+ }
+
+/* ==== CLAHQR/CLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "CLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "CLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+   ==== Accumulate reflections during the sweep? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "CLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the Largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
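+
+/* ==== (NSMAX is kept even because shifts are chased in 2-shift bulges.) ==== */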
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L80;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ i__3 = k + (k - 1) * h_dim1;
+ if (h__[i__3].r == 0.f && h__[i__3].i == 0.f) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+   .        size. At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ i__2 = kwtop + (kwtop - 1) * h_dim1;
+ i__3 = kwtop - 1 + (kwtop - 2) * h_dim1;
+ if ((r__1 = h__[i__2].r, dabs(r__1)) + (r__2 = r_imag(&
+ h__[kwtop + (kwtop - 1) * h_dim1]), dabs(r__2)) >
+ (r__3 = h__[i__3].r, dabs(r__3)) + (r__4 = r_imag(
+ &h__[kwtop - 1 + (kwtop - 2) * h_dim1]), dabs(
+ r__4))) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+   .      - an at-least-NW-but-more-is-better (NVE-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ claqr2_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[kv
+ + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1], ldh, &nve, &
+ h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
+
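+/*
+   ==== (Because && binds more tightly than ||, the sweep below runs
+   .    when LD = 0, or when the new deflations are no more than
+   .    NIBBLE percent of the window and the active block is still
+   .    larger than MIN(NMIN,NWMAX).) ====
+*/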
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if CLAQR2
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . CLAQR2 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
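+/*
+   ==== (Each exceptional shift below is H(I,I) plus 0.75 times
+   .    CABS1 of the subdiagonal entry H(I,I-1); shifts are taken
+   .    in pairs, so consecutive entries of W receive the same
+   .    value.) ====
+*/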
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+ i__2 = ks + 1;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ i__3 = i__;
+ i__4 = i__ + i__ * h_dim1;
+ i__5 = i__ + (i__ - 1) * h_dim1;
+ r__3 = ((r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[i__ + (i__ - 1) * h_dim1]), dabs(
+ r__2))) * .75f;
+ q__1.r = h__[i__4].r + r__3, q__1.i = h__[i__4].i;
+ w[i__3].r = q__1.r, w[i__3].i = q__1.i;
+ i__3 = i__ - 1;
+ i__4 = i__;
+ w[i__3].r = w[i__4].r, w[i__3].i = w[i__4].i;
+/* L30: */
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use CLAHQR
+ . on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ clacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ clahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[kt
+ + h_dim1], ldh, &w[ks], &c__1, &c__1, zdum, &
+ c__1, &inf);
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. Scale to avoid
+ . overflows, underflows and subnormals.
+ . (The scale factor S can not be zero,
+ . because H(KBOT,KBOT-1) is nonzero.) ====
+*/
+
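+/*
+   ==== (For the scaled 2-by-2 block [AA BB; CC DD] the code below
+   .    uses the closed-form eigenvalues
+   .    TR2 +/- SQRT(TR2**2 - (AA*DD - BB*CC)), TR2 = (AA+DD)/2,
+   .    and multiplies the results back by the scale factor S.) ====
+*/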
+ if (ks >= kbot) {
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ i__3 = kbot + (kbot - 1) * h_dim1;
+ i__4 = kbot - 1 + kbot * h_dim1;
+ i__5 = kbot + kbot * h_dim1;
+ s = (r__1 = h__[i__2].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[kbot - 1 + (kbot - 1) *
+ h_dim1]), dabs(r__2)) + ((r__3 = h__[i__3]
+ .r, dabs(r__3)) + (r__4 = r_imag(&h__[
+ kbot + (kbot - 1) * h_dim1]), dabs(r__4)))
+ + ((r__5 = h__[i__4].r, dabs(r__5)) + (
+ r__6 = r_imag(&h__[kbot - 1 + kbot *
+ h_dim1]), dabs(r__6))) + ((r__7 = h__[
+ i__5].r, dabs(r__7)) + (r__8 = r_imag(&
+ h__[kbot + kbot * h_dim1]), dabs(r__8)));
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ aa.r = q__1.r, aa.i = q__1.i;
+ i__2 = kbot + (kbot - 1) * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ cc.r = q__1.r, cc.i = q__1.i;
+ i__2 = kbot - 1 + kbot * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ bb.r = q__1.r, bb.i = q__1.i;
+ i__2 = kbot + kbot * h_dim1;
+ q__1.r = h__[i__2].r / s, q__1.i = h__[i__2].i /
+ s;
+ dd.r = q__1.r, dd.i = q__1.i;
+ q__2.r = aa.r + dd.r, q__2.i = aa.i + dd.i;
+ q__1.r = q__2.r / 2.f, q__1.i = q__2.i / 2.f;
+ tr2.r = q__1.r, tr2.i = q__1.i;
+ q__3.r = aa.r - tr2.r, q__3.i = aa.i - tr2.i;
+ q__4.r = dd.r - tr2.r, q__4.i = dd.i - tr2.i;
+ q__2.r = q__3.r * q__4.r - q__3.i * q__4.i,
+ q__2.i = q__3.r * q__4.i + q__3.i *
+ q__4.r;
+ q__5.r = bb.r * cc.r - bb.i * cc.i, q__5.i = bb.r
+ * cc.i + bb.i * cc.r;
+ q__1.r = q__2.r - q__5.r, q__1.i = q__2.i -
+ q__5.i;
+ det.r = q__1.r, det.i = q__1.i;
+ q__2.r = -det.r, q__2.i = -det.i;
+ c_sqrt(&q__1, &q__2);
+ rtdisc.r = q__1.r, rtdisc.i = q__1.i;
+ i__2 = kbot - 1;
+ q__2.r = tr2.r + rtdisc.r, q__2.i = tr2.i +
+ rtdisc.i;
+ q__1.r = s * q__2.r, q__1.i = s * q__2.i;
+ w[i__2].r = q__1.r, w[i__2].i = q__1.i;
+ i__2 = kbot;
+ q__2.r = tr2.r - rtdisc.r, q__2.i = tr2.i -
+ rtdisc.i;
+ q__1.r = s * q__2.r, q__1.i = s * q__2.i;
+ w[i__2].r = q__1.r, w[i__2].i = q__1.i;
+
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/* ==== Sort the shifts (Helps a little) ==== */
+
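+/*
+   ==== (A bubble sort by decreasing magnitude, so the NS smallest
+   .    shifts end up in W(KBOT-NS+1:KBOT), the range used for the
+   .    sweep below.) ====
+*/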
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ i__5 = i__ + 1;
+ if ((r__1 = w[i__4].r, dabs(r__1)) + (r__2 =
+ r_imag(&w[i__]), dabs(r__2)) < (r__3 =
+ w[i__5].r, dabs(r__3)) + (r__4 =
+ r_imag(&w[i__ + 1]), dabs(r__4))) {
+ sorted = FALSE_;
+ i__4 = i__;
+ swap.r = w[i__4].r, swap.i = w[i__4].i;
+ i__4 = i__;
+ i__5 = i__ + 1;
+ w[i__4].r = w[i__5].r, w[i__4].i = w[i__5]
+ .i;
+ i__4 = i__ + 1;
+ w[i__4].r = swap.r, w[i__4].i = swap.i;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+ }
+
+/*
+ ==== If there are only two shifts, then use
+ . only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ i__2 = kbot;
+ i__3 = kbot + kbot * h_dim1;
+ q__2.r = w[i__2].r - h__[i__3].r, q__2.i = w[i__2].i -
+ h__[i__3].i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+ i__4 = kbot - 1;
+ i__5 = kbot + kbot * h_dim1;
+ q__4.r = w[i__4].r - h__[i__5].r, q__4.i = w[i__4].i -
+ h__[i__5].i;
+ q__3.r = q__4.r, q__3.i = q__4.i;
+ if ((r__1 = q__1.r, dabs(r__1)) + (r__2 = r_imag(&q__1),
+ dabs(r__2)) < (r__3 = q__3.r, dabs(r__3)) + (r__4
+ = r_imag(&q__3), dabs(r__4))) {
+ i__2 = kbot - 1;
+ i__3 = kbot;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ } else {
+ i__2 = kbot;
+ i__3 = kbot - 1;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ }
+ }
+
+/*
+   ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+   .        (KDU-by-NHO) horizontal work array WH along
+   .        the bottom edge,
+   .      - and an at-least-KDU-but-more-is-better-by-KDU
+   .        (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ claqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &w[ks], &
+ h__[h_offset], ldh, iloz, ihiz, &z__[z_offset], ldz, &
+ work[1], &c__3, &h__[ku + h_dim1], ldh, &nve, &h__[
+ kwv + h_dim1], ldh, &nho, &h__[ku + kwh * h_dim1],
+ ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L70:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L80:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ r__1 = (real) lwkopt;
+ q__1.r = r__1, q__1.i = 0.f;
+ work[1].r = q__1.r, work[1].i = q__1.i;
+
+/* ==== End of CLAQR4 ==== */
+
+ return 0;
+} /* claqr4_ */
+
+/* Subroutine */ int claqr5_(logical *wantt, logical *wantz, integer *kacc22,
+ integer *n, integer *ktop, integer *kbot, integer *nshfts, complex *s,
+ complex *h__, integer *ldh, integer *iloz, integer *ihiz, complex *
+ z__, integer *ldz, complex *v, integer *ldv, complex *u, integer *ldu,
+ integer *nv, complex *wv, integer *ldwv, integer *nh, complex *wh,
+ integer *ldwh)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, u_dim1, u_offset, v_dim1, v_offset, wh_dim1,
+ wh_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3,
+ i__4, i__5, i__6, i__7, i__8, i__9, i__10, i__11;
+ real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10;
+ complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8;
+
+ /* Local variables */
+ static integer j, k, m, i2, j2, i4, j4, k1;
+ static real h11, h12, h21, h22;
+ static integer m22, ns, nu;
+ static complex vt[3];
+ static real scl;
+ static integer kdu, kms;
+ static real ulp;
+ static integer knz, kzs;
+ static real tst1, tst2;
+ static complex beta;
+ static logical blk22, bmp22;
+ static integer mend, jcol, jlen, jbot, mbot, jtop, jrow, mtop;
+ static complex alpha;
+ static logical accum;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *);
+ static integer ndcol, incol, krcol, nbmps;
+ extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *), claqr1_(integer *,
+ complex *, integer *, complex *, complex *, complex *), slabad_(
+ real *, real *), clarfg_(integer *, complex *, complex *, integer
+ *, complex *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex
+ *, integer *, complex *, integer *), claset_(char *,
+ integer *, integer *, complex *, complex *, complex *, integer *);
+ static real safmin, safmax;
+ static complex refsum;
+ static integer mstart;
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ This auxiliary subroutine called by CLAQR0 performs a
+ single small-bulge multi-shift QR sweep.
+
+ WANTT (input) logical scalar
+ WANTT = .true. if the triangular Schur factor
+ is being computed. WANTT is set to .false. otherwise.
+
+ WANTZ (input) logical scalar
+ WANTZ = .true. if the unitary Schur factor is being
+ computed. WANTZ is set to .false. otherwise.
+
+ KACC22 (input) integer with value 0, 1, or 2.
+ Specifies the computation mode of far-from-diagonal
+ orthogonal updates.
+ = 0: CLAQR5 does not accumulate reflections and does not
+ use matrix-matrix multiply to update far-from-diagonal
+ matrix entries.
+ = 1: CLAQR5 accumulates reflections and uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries.
+ = 2: CLAQR5 accumulates reflections, uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries,
+ and takes advantage of 2-by-2 block structure during
+ matrix multiplies.
+
+ N (input) integer scalar
+ N is the order of the Hessenberg matrix H upon which this
+ subroutine operates.
+
+ KTOP (input) integer scalar
+ KBOT (input) integer scalar
+ These are the first and last rows and columns of an
+ isolated diagonal block upon which the QR sweep is to be
+ applied. It is assumed without a check that
+ either KTOP = 1 or H(KTOP,KTOP-1) = 0
+ and
+ either KBOT = N or H(KBOT+1,KBOT) = 0.
+
+ NSHFTS (input) integer scalar
+ NSHFTS gives the number of simultaneous shifts. NSHFTS
+ must be positive and even.
+
+ S (input/output) COMPLEX array of size (NSHFTS)
+ S contains the shifts of origin that define the multi-
+ shift QR sweep. On output S may be reordered.
+
+ H (input/output) COMPLEX array of size (LDH,N)
+ On input H contains a Hessenberg matrix. On output a
+             multi-shift QR sweep with shifts S(J) is applied
+ to the isolated diagonal block in rows and columns KTOP
+ through KBOT.
+
+ LDH (input) integer scalar
+ LDH is the leading dimension of H just as declared in the
+ calling procedure. LDH.GE.MAX(1,N).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+
+ Z (input/output) COMPLEX array of size (LDZ,IHI)
+ If WANTZ = .TRUE., then the QR Sweep unitary
+ similarity transformation is accumulated into
+ Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ = .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer scalar
+             LDZ is the leading dimension of Z just as declared in
+ the calling procedure. LDZ.GE.N.
+
+ V (workspace) COMPLEX array of size (LDV,NSHFTS/2)
+
+ LDV (input) integer scalar
+ LDV is the leading dimension of V as declared in the
+ calling procedure. LDV.GE.3.
+
+ U (workspace) COMPLEX array of size
+ (LDU,3*NSHFTS-3)
+
+ LDU (input) integer scalar
+             LDU is the leading dimension of U just as declared
+             in the calling subroutine. LDU.GE.3*NSHFTS-3.
+
+ NH (input) integer scalar
+ NH is the number of columns in array WH available for
+ workspace. NH.GE.1.
+
+ WH (workspace) COMPLEX array of size (LDWH,NH)
+
+ LDWH (input) integer scalar
+ Leading dimension of WH just as declared in the
+ calling procedure. LDWH.GE.3*NSHFTS-3.
+
+ NV (input) integer scalar
+             NV is the number of rows in WV available for workspace.
+ NV.GE.1.
+
+ WV (workspace) COMPLEX array of size
+ (LDWV,3*NSHFTS-3)
+
+ LDWV (input) integer scalar
+             LDWV is the leading dimension of WV as declared
+             in the calling subroutine. LDWV.GE.NV.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ Reference:
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and
+ Level 3 Performance, SIAM Journal of Matrix Analysis,
+ volume 23, pages 929--947, 2002.
+
+ ================================================================
+
+
+ ==== If there are no shifts, then there is nothing to do. ====
+*/
+
+ /* Parameter adjustments */
+ --s;
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ wh_dim1 = *ldwh;
+ wh_offset = 1 + wh_dim1;
+ wh -= wh_offset;
+
+ /* Function Body */
+ if (*nshfts < 2) {
+ return 0;
+ }
+
+/*
+ ==== If the active block is empty or 1-by-1, then there
+ . is nothing to do. ====
+*/
+
+ if (*ktop >= *kbot) {
+ return 0;
+ }
+
+/*
+ ==== NSHFTS is supposed to be even, but if it is odd,
+ . then simply reduce it by one. ====
+*/
+
+ ns = *nshfts - *nshfts % 2;
+
+/* ==== Machine constants for deflation ==== */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) (*n) / ulp);
+
+/*
+ ==== Use accumulated reflections to update far-from-diagonal
+ . entries ? ====
+*/
+
+ accum = *kacc22 == 1 || *kacc22 == 2;
+
+/* ==== If so, exploit the 2-by-2 block structure? ==== */
+
+ blk22 = ns > 2 && *kacc22 == 2;
+
+/* ==== clear trash ==== */
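+/* ==== (zero the below-subdiagonal entry H(KTOP+2,KTOP) before
+   .     new bulges are introduced) ==== */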
+
+ if (*ktop + 2 <= *kbot) {
+ i__1 = *ktop + 2 + *ktop * h_dim1;
+ h__[i__1].r = 0.f, h__[i__1].i = 0.f;
+ }
+
+/* ==== NBMPS = number of 2-shift bulges in the chain ==== */
+
+ nbmps = ns / 2;
+
+/* ==== KDU = width of slab ==== */
+
+ kdu = nbmps * 6 - 3;
+
+/* ==== Create and chase chains of NBMPS bulges ==== */
+
+ i__1 = *kbot - 2;
+ i__2 = nbmps * 3 - 2;
+ for (incol = (1 - nbmps) * 3 + *ktop - 1; i__2 < 0 ? incol >= i__1 :
+ incol <= i__1; incol += i__2) {
+ ndcol = incol + kdu;
+ if (accum) {
+ claset_("ALL", &kdu, &kdu, &c_b56, &c_b57, &u[u_offset], ldu);
+ }
+
+/*
+ ==== Near-the-diagonal bulge chase. The following loop
+ . performs the near-the-diagonal part of a small bulge
+ . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal
+ . chunk extends from column INCOL to column NDCOL
+ . (including both column INCOL and column NDCOL). The
+ . following loop chases a 3*NBMPS column long chain of
+ . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL
+   .    may be less than KTOP and NDCOL may be greater than
+ . KBOT indicating phantom columns from which to chase
+ . bulges before they are actually introduced or to which
+ . to chase bulges beyond column KBOT.) ====
+
+ Computing MIN
+*/
+ i__4 = incol + nbmps * 3 - 3, i__5 = *kbot - 2;
+ i__3 = min(i__4,i__5);
+ for (krcol = incol; krcol <= i__3; ++krcol) {
+
+/*
+ ==== Bulges number MTOP to MBOT are active double implicit
+   .    shift bulges. There may or may not also be a small
+ . 2-by-2 bulge, if there is room. The inactive bulges
+ . (if any) must wait until the active bulges have moved
+ . down the diagonal to make room. The phantom matrix
+ . paradigm described above helps keep track. ====
+
+ Computing MAX
+*/
+ i__4 = 1, i__5 = (*ktop - 1 - krcol + 2) / 3 + 1;
+ mtop = max(i__4,i__5);
+/* Computing MIN */
+ i__4 = nbmps, i__5 = (*kbot - krcol) / 3;
+ mbot = min(i__4,i__5);
+ m22 = mbot + 1;
+ bmp22 = mbot < nbmps && krcol + (m22 - 1) * 3 == *kbot - 2;
+
+/*
+ ==== Generate reflections to chase the chain right
+ . one column. (The minimum value of K is KTOP-1.) ====
+*/
+
+ i__4 = mbot;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ if (k == *ktop - 1) {
+ claqr1_(&c__3, &h__[*ktop + *ktop * h_dim1], ldh, &s[(m <<
+ 1) - 1], &s[m * 2], &v[m * v_dim1 + 1]);
+ i__5 = m * v_dim1 + 1;
+ alpha.r = v[i__5].r, alpha.i = v[i__5].i;
+ clarfg_(&c__3, &alpha, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+ } else {
+ i__5 = k + 1 + k * h_dim1;
+ beta.r = h__[i__5].r, beta.i = h__[i__5].i;
+ i__5 = m * v_dim1 + 2;
+ i__6 = k + 2 + k * h_dim1;
+ v[i__5].r = h__[i__6].r, v[i__5].i = h__[i__6].i;
+ i__5 = m * v_dim1 + 3;
+ i__6 = k + 3 + k * h_dim1;
+ v[i__5].r = h__[i__6].r, v[i__5].i = h__[i__6].i;
+ clarfg_(&c__3, &beta, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+
+/*
+ ==== A Bulge may collapse because of vigilant
+ . deflation or destructive underflow. In the
+ . underflow case, try the two-small-subdiagonals
+ . trick to try to reinflate the bulge. ====
+*/
+
+ i__5 = k + 3 + k * h_dim1;
+ i__6 = k + 3 + (k + 1) * h_dim1;
+ i__7 = k + 3 + (k + 2) * h_dim1;
+ if (h__[i__5].r != 0.f || h__[i__5].i != 0.f || (h__[i__6]
+ .r != 0.f || h__[i__6].i != 0.f) || h__[i__7].r ==
+ 0.f && h__[i__7].i == 0.f) {
+
+/* ==== Typical case: not collapsed (yet). ==== */
+
+ i__5 = k + 1 + k * h_dim1;
+ h__[i__5].r = beta.r, h__[i__5].i = beta.i;
+ i__5 = k + 2 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ i__5 = k + 3 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ } else {
+
+/*
+ ==== Atypical case: collapsed. Attempt to
+ . reintroduce ignoring H(K+1,K) and H(K+2,K).
+ . If the fill resulting from the new
+ . reflector is too large, then abandon it.
+ . Otherwise, use the new one. ====
+*/
+
+ claqr1_(&c__3, &h__[k + 1 + (k + 1) * h_dim1], ldh, &
+ s[(m << 1) - 1], &s[m * 2], vt);
+ alpha.r = vt[0].r, alpha.i = vt[0].i;
+ clarfg_(&c__3, &alpha, &vt[1], &c__1, vt);
+ r_cnjg(&q__2, vt);
+ i__5 = k + 1 + k * h_dim1;
+ r_cnjg(&q__5, &vt[1]);
+ i__6 = k + 2 + k * h_dim1;
+ q__4.r = q__5.r * h__[i__6].r - q__5.i * h__[i__6].i,
+ q__4.i = q__5.r * h__[i__6].i + q__5.i * h__[
+ i__6].r;
+ q__3.r = h__[i__5].r + q__4.r, q__3.i = h__[i__5].i +
+ q__4.i;
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+ q__2.r * q__3.i + q__2.i * q__3.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+
+ i__5 = k + 2 + k * h_dim1;
+ q__3.r = refsum.r * vt[1].r - refsum.i * vt[1].i,
+ q__3.i = refsum.r * vt[1].i + refsum.i * vt[1]
+ .r;
+ q__2.r = h__[i__5].r - q__3.r, q__2.i = h__[i__5].i -
+ q__3.i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+ q__5.r = refsum.r * vt[2].r - refsum.i * vt[2].i,
+ q__5.i = refsum.r * vt[2].i + refsum.i * vt[2]
+ .r;
+ q__4.r = q__5.r, q__4.i = q__5.i;
+ i__6 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ i__8 = k + 2 + (k + 2) * h_dim1;
+ if ((r__1 = q__1.r, dabs(r__1)) + (r__2 = r_imag(&
+ q__1), dabs(r__2)) + ((r__3 = q__4.r, dabs(
+ r__3)) + (r__4 = r_imag(&q__4), dabs(r__4)))
+ > ulp * ((r__5 = h__[i__6].r, dabs(r__5)) + (
+ r__6 = r_imag(&h__[k + k * h_dim1]), dabs(
+ r__6)) + ((r__7 = h__[i__7].r, dabs(r__7)) + (
+ r__8 = r_imag(&h__[k + 1 + (k + 1) * h_dim1]),
+ dabs(r__8))) + ((r__9 = h__[i__8].r, dabs(
+ r__9)) + (r__10 = r_imag(&h__[k + 2 + (k + 2)
+ * h_dim1]), dabs(r__10))))) {
+
+/*
+ ==== Starting a new bulge here would
+ . create non-negligible fill. Use
+ . the old one with trepidation. ====
+*/
+
+ i__5 = k + 1 + k * h_dim1;
+ h__[i__5].r = beta.r, h__[i__5].i = beta.i;
+ i__5 = k + 2 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ i__5 = k + 3 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ } else {
+
+/*
+   ==== Starting a new bulge here would
+ . create only negligible fill.
+ . Replace the old reflector with
+ . the new one. ====
+*/
+
+ i__5 = k + 1 + k * h_dim1;
+ i__6 = k + 1 + k * h_dim1;
+ q__1.r = h__[i__6].r - refsum.r, q__1.i = h__[
+ i__6].i - refsum.i;
+ h__[i__5].r = q__1.r, h__[i__5].i = q__1.i;
+ i__5 = k + 2 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ i__5 = k + 3 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ i__5 = m * v_dim1 + 1;
+ v[i__5].r = vt[0].r, v[i__5].i = vt[0].i;
+ i__5 = m * v_dim1 + 2;
+ v[i__5].r = vt[1].r, v[i__5].i = vt[1].i;
+ i__5 = m * v_dim1 + 3;
+ v[i__5].r = vt[2].r, v[i__5].i = vt[2].i;
+ }
+ }
+ }
+/* L10: */
+ }
+
+/* ==== Generate a 2-by-2 reflection, if needed. ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ if (bmp22) {
+ if (k == *ktop - 1) {
+ claqr1_(&c__2, &h__[k + 1 + (k + 1) * h_dim1], ldh, &s[(
+ m22 << 1) - 1], &s[m22 * 2], &v[m22 * v_dim1 + 1])
+ ;
+ i__4 = m22 * v_dim1 + 1;
+ beta.r = v[i__4].r, beta.i = v[i__4].i;
+ clarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ } else {
+ i__4 = k + 1 + k * h_dim1;
+ beta.r = h__[i__4].r, beta.i = h__[i__4].i;
+ i__4 = m22 * v_dim1 + 2;
+ i__5 = k + 2 + k * h_dim1;
+ v[i__4].r = h__[i__5].r, v[i__4].i = h__[i__5].i;
+ clarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ i__4 = k + 1 + k * h_dim1;
+ h__[i__4].r = beta.r, h__[i__4].i = beta.i;
+ i__4 = k + 2 + k * h_dim1;
+ h__[i__4].r = 0.f, h__[i__4].i = 0.f;
+ }
+ }
+
+/* ==== Multiply H by reflections from the left ==== */
+
+ if (accum) {
+ jbot = min(ndcol,*kbot);
+ } else if (*wantt) {
+ jbot = *n;
+ } else {
+ jbot = *kbot;
+ }
+ i__4 = jbot;
+ for (j = max(*ktop,krcol); j <= i__4; ++j) {
+/* Computing MIN */
+ i__5 = mbot, i__6 = (j - krcol + 2) / 3;
+ mend = min(i__5,i__6);
+ i__5 = mend;
+ for (m = mtop; m <= i__5; ++m) {
+ k = krcol + (m - 1) * 3;
+ r_cnjg(&q__2, &v[m * v_dim1 + 1]);
+ i__6 = k + 1 + j * h_dim1;
+ r_cnjg(&q__6, &v[m * v_dim1 + 2]);
+ i__7 = k + 2 + j * h_dim1;
+ q__5.r = q__6.r * h__[i__7].r - q__6.i * h__[i__7].i,
+ q__5.i = q__6.r * h__[i__7].i + q__6.i * h__[i__7]
+ .r;
+ q__4.r = h__[i__6].r + q__5.r, q__4.i = h__[i__6].i +
+ q__5.i;
+ r_cnjg(&q__8, &v[m * v_dim1 + 3]);
+ i__8 = k + 3 + j * h_dim1;
+ q__7.r = q__8.r * h__[i__8].r - q__8.i * h__[i__8].i,
+ q__7.i = q__8.r * h__[i__8].i + q__8.i * h__[i__8]
+ .r;
+ q__3.r = q__4.r + q__7.r, q__3.i = q__4.i + q__7.i;
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+ q__2.r * q__3.i + q__2.i * q__3.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__6 = k + 1 + j * h_dim1;
+ i__7 = k + 1 + j * h_dim1;
+ q__1.r = h__[i__7].r - refsum.r, q__1.i = h__[i__7].i -
+ refsum.i;
+ h__[i__6].r = q__1.r, h__[i__6].i = q__1.i;
+ i__6 = k + 2 + j * h_dim1;
+ i__7 = k + 2 + j * h_dim1;
+ i__8 = m * v_dim1 + 2;
+ q__2.r = refsum.r * v[i__8].r - refsum.i * v[i__8].i,
+ q__2.i = refsum.r * v[i__8].i + refsum.i * v[i__8]
+ .r;
+ q__1.r = h__[i__7].r - q__2.r, q__1.i = h__[i__7].i -
+ q__2.i;
+ h__[i__6].r = q__1.r, h__[i__6].i = q__1.i;
+ i__6 = k + 3 + j * h_dim1;
+ i__7 = k + 3 + j * h_dim1;
+ i__8 = m * v_dim1 + 3;
+ q__2.r = refsum.r * v[i__8].r - refsum.i * v[i__8].i,
+ q__2.i = refsum.r * v[i__8].i + refsum.i * v[i__8]
+ .r;
+ q__1.r = h__[i__7].r - q__2.r, q__1.i = h__[i__7].i -
+ q__2.i;
+ h__[i__6].r = q__1.r, h__[i__6].i = q__1.i;
+/* L20: */
+ }
+/* L30: */
+ }
+ if (bmp22) {
+ k = krcol + (m22 - 1) * 3;
+/* Computing MAX */
+ i__4 = k + 1;
+ i__5 = jbot;
+ for (j = max(i__4,*ktop); j <= i__5; ++j) {
+ r_cnjg(&q__2, &v[m22 * v_dim1 + 1]);
+ i__4 = k + 1 + j * h_dim1;
+ r_cnjg(&q__5, &v[m22 * v_dim1 + 2]);
+ i__6 = k + 2 + j * h_dim1;
+ q__4.r = q__5.r * h__[i__6].r - q__5.i * h__[i__6].i,
+ q__4.i = q__5.r * h__[i__6].i + q__5.i * h__[i__6]
+ .r;
+ q__3.r = h__[i__4].r + q__4.r, q__3.i = h__[i__4].i +
+ q__4.i;
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+ q__2.r * q__3.i + q__2.i * q__3.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__4 = k + 1 + j * h_dim1;
+ i__6 = k + 1 + j * h_dim1;
+ q__1.r = h__[i__6].r - refsum.r, q__1.i = h__[i__6].i -
+ refsum.i;
+ h__[i__4].r = q__1.r, h__[i__4].i = q__1.i;
+ i__4 = k + 2 + j * h_dim1;
+ i__6 = k + 2 + j * h_dim1;
+ i__7 = m22 * v_dim1 + 2;
+ q__2.r = refsum.r * v[i__7].r - refsum.i * v[i__7].i,
+ q__2.i = refsum.r * v[i__7].i + refsum.i * v[i__7]
+ .r;
+ q__1.r = h__[i__6].r - q__2.r, q__1.i = h__[i__6].i -
+ q__2.i;
+ h__[i__4].r = q__1.r, h__[i__4].i = q__1.i;
+/* L40: */
+ }
+ }
+
+/*
+ ==== Multiply H by reflections from the right.
+ . Delay filling in the last row until the
+ . vigilant deflation check is complete. ====
+*/
+
+ if (accum) {
+ jtop = max(*ktop,incol);
+ } else if (*wantt) {
+ jtop = 1;
+ } else {
+ jtop = *ktop;
+ }
+ i__5 = mbot;
+ for (m = mtop; m <= i__5; ++m) {
+ i__4 = m * v_dim1 + 1;
+ if (v[i__4].r != 0.f || v[i__4].i != 0.f) {
+ k = krcol + (m - 1) * 3;
+/* Computing MIN */
+ i__6 = *kbot, i__7 = k + 3;
+ i__4 = min(i__6,i__7);
+ for (j = jtop; j <= i__4; ++j) {
+ i__6 = m * v_dim1 + 1;
+ i__7 = j + (k + 1) * h_dim1;
+ i__8 = m * v_dim1 + 2;
+ i__9 = j + (k + 2) * h_dim1;
+ q__4.r = v[i__8].r * h__[i__9].r - v[i__8].i * h__[
+ i__9].i, q__4.i = v[i__8].r * h__[i__9].i + v[
+ i__8].i * h__[i__9].r;
+ q__3.r = h__[i__7].r + q__4.r, q__3.i = h__[i__7].i +
+ q__4.i;
+ i__10 = m * v_dim1 + 3;
+ i__11 = j + (k + 3) * h_dim1;
+ q__5.r = v[i__10].r * h__[i__11].r - v[i__10].i * h__[
+ i__11].i, q__5.i = v[i__10].r * h__[i__11].i
+ + v[i__10].i * h__[i__11].r;
+ q__2.r = q__3.r + q__5.r, q__2.i = q__3.i + q__5.i;
+ q__1.r = v[i__6].r * q__2.r - v[i__6].i * q__2.i,
+ q__1.i = v[i__6].r * q__2.i + v[i__6].i *
+ q__2.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__6 = j + (k + 1) * h_dim1;
+ i__7 = j + (k + 1) * h_dim1;
+ q__1.r = h__[i__7].r - refsum.r, q__1.i = h__[i__7].i
+ - refsum.i;
+ h__[i__6].r = q__1.r, h__[i__6].i = q__1.i;
+ i__6 = j + (k + 2) * h_dim1;
+ i__7 = j + (k + 2) * h_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 2]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = h__[i__7].r - q__2.r, q__1.i = h__[i__7].i -
+ q__2.i;
+ h__[i__6].r = q__1.r, h__[i__6].i = q__1.i;
+ i__6 = j + (k + 3) * h_dim1;
+ i__7 = j + (k + 3) * h_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 3]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = h__[i__7].r - q__2.r, q__1.i = h__[i__7].i -
+ q__2.i;
+ h__[i__6].r = q__1.r, h__[i__6].i = q__1.i;
+/* L50: */
+ }
+
+ if (accum) {
+
+/*
+ ==== Accumulate U. (If necessary, update Z later
+                    .    with an efficient matrix-matrix
+ . multiply.) ====
+*/
+
+ kms = k - incol;
+/* Computing MAX */
+ i__4 = 1, i__6 = *ktop - incol;
+ i__7 = kdu;
+ for (j = max(i__4,i__6); j <= i__7; ++j) {
+ i__4 = m * v_dim1 + 1;
+ i__6 = j + (kms + 1) * u_dim1;
+ i__8 = m * v_dim1 + 2;
+ i__9 = j + (kms + 2) * u_dim1;
+ q__4.r = v[i__8].r * u[i__9].r - v[i__8].i * u[
+ i__9].i, q__4.i = v[i__8].r * u[i__9].i +
+ v[i__8].i * u[i__9].r;
+ q__3.r = u[i__6].r + q__4.r, q__3.i = u[i__6].i +
+ q__4.i;
+ i__10 = m * v_dim1 + 3;
+ i__11 = j + (kms + 3) * u_dim1;
+ q__5.r = v[i__10].r * u[i__11].r - v[i__10].i * u[
+ i__11].i, q__5.i = v[i__10].r * u[i__11]
+ .i + v[i__10].i * u[i__11].r;
+ q__2.r = q__3.r + q__5.r, q__2.i = q__3.i +
+ q__5.i;
+ q__1.r = v[i__4].r * q__2.r - v[i__4].i * q__2.i,
+ q__1.i = v[i__4].r * q__2.i + v[i__4].i *
+ q__2.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__4 = j + (kms + 1) * u_dim1;
+ i__6 = j + (kms + 1) * u_dim1;
+ q__1.r = u[i__6].r - refsum.r, q__1.i = u[i__6].i
+ - refsum.i;
+ u[i__4].r = q__1.r, u[i__4].i = q__1.i;
+ i__4 = j + (kms + 2) * u_dim1;
+ i__6 = j + (kms + 2) * u_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 2]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = u[i__6].r - q__2.r, q__1.i = u[i__6].i -
+ q__2.i;
+ u[i__4].r = q__1.r, u[i__4].i = q__1.i;
+ i__4 = j + (kms + 3) * u_dim1;
+ i__6 = j + (kms + 3) * u_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 3]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = u[i__6].r - q__2.r, q__1.i = u[i__6].i -
+ q__2.i;
+ u[i__4].r = q__1.r, u[i__4].i = q__1.i;
+/* L60: */
+ }
+ } else if (*wantz) {
+
+/*
+ ==== U is not accumulated, so update Z
+ . now by multiplying by reflections
+ . from the right. ====
+*/
+
+ i__7 = *ihiz;
+ for (j = *iloz; j <= i__7; ++j) {
+ i__4 = m * v_dim1 + 1;
+ i__6 = j + (k + 1) * z_dim1;
+ i__8 = m * v_dim1 + 2;
+ i__9 = j + (k + 2) * z_dim1;
+ q__4.r = v[i__8].r * z__[i__9].r - v[i__8].i *
+ z__[i__9].i, q__4.i = v[i__8].r * z__[
+ i__9].i + v[i__8].i * z__[i__9].r;
+ q__3.r = z__[i__6].r + q__4.r, q__3.i = z__[i__6]
+ .i + q__4.i;
+ i__10 = m * v_dim1 + 3;
+ i__11 = j + (k + 3) * z_dim1;
+ q__5.r = v[i__10].r * z__[i__11].r - v[i__10].i *
+ z__[i__11].i, q__5.i = v[i__10].r * z__[
+ i__11].i + v[i__10].i * z__[i__11].r;
+ q__2.r = q__3.r + q__5.r, q__2.i = q__3.i +
+ q__5.i;
+ q__1.r = v[i__4].r * q__2.r - v[i__4].i * q__2.i,
+ q__1.i = v[i__4].r * q__2.i + v[i__4].i *
+ q__2.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__4 = j + (k + 1) * z_dim1;
+ i__6 = j + (k + 1) * z_dim1;
+ q__1.r = z__[i__6].r - refsum.r, q__1.i = z__[
+ i__6].i - refsum.i;
+ z__[i__4].r = q__1.r, z__[i__4].i = q__1.i;
+ i__4 = j + (k + 2) * z_dim1;
+ i__6 = j + (k + 2) * z_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 2]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = z__[i__6].r - q__2.r, q__1.i = z__[i__6]
+ .i - q__2.i;
+ z__[i__4].r = q__1.r, z__[i__4].i = q__1.i;
+ i__4 = j + (k + 3) * z_dim1;
+ i__6 = j + (k + 3) * z_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 3]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = z__[i__6].r - q__2.r, q__1.i = z__[i__6]
+ .i - q__2.i;
+ z__[i__4].r = q__1.r, z__[i__4].i = q__1.i;
+/* L70: */
+ }
+ }
+ }
+/* L80: */
+ }
+
+/* ==== Special case: 2-by-2 reflection (if needed) ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ i__5 = m22 * v_dim1 + 1;
+ if (bmp22 && (v[i__5].r != 0.f || v[i__5].i != 0.f)) {
+/* Computing MIN */
+ i__7 = *kbot, i__4 = k + 3;
+ i__5 = min(i__7,i__4);
+ for (j = jtop; j <= i__5; ++j) {
+ i__7 = m22 * v_dim1 + 1;
+ i__4 = j + (k + 1) * h_dim1;
+ i__6 = m22 * v_dim1 + 2;
+ i__8 = j + (k + 2) * h_dim1;
+ q__3.r = v[i__6].r * h__[i__8].r - v[i__6].i * h__[i__8]
+ .i, q__3.i = v[i__6].r * h__[i__8].i + v[i__6].i *
+ h__[i__8].r;
+ q__2.r = h__[i__4].r + q__3.r, q__2.i = h__[i__4].i +
+ q__3.i;
+ q__1.r = v[i__7].r * q__2.r - v[i__7].i * q__2.i, q__1.i =
+ v[i__7].r * q__2.i + v[i__7].i * q__2.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__7 = j + (k + 1) * h_dim1;
+ i__4 = j + (k + 1) * h_dim1;
+ q__1.r = h__[i__4].r - refsum.r, q__1.i = h__[i__4].i -
+ refsum.i;
+ h__[i__7].r = q__1.r, h__[i__7].i = q__1.i;
+ i__7 = j + (k + 2) * h_dim1;
+ i__4 = j + (k + 2) * h_dim1;
+ r_cnjg(&q__3, &v[m22 * v_dim1 + 2]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i, q__2.i =
+ refsum.r * q__3.i + refsum.i * q__3.r;
+ q__1.r = h__[i__4].r - q__2.r, q__1.i = h__[i__4].i -
+ q__2.i;
+ h__[i__7].r = q__1.r, h__[i__7].i = q__1.i;
+/* L90: */
+ }
+
+ if (accum) {
+ kms = k - incol;
+/* Computing MAX */
+ i__5 = 1, i__7 = *ktop - incol;
+ i__4 = kdu;
+ for (j = max(i__5,i__7); j <= i__4; ++j) {
+ i__5 = m22 * v_dim1 + 1;
+ i__7 = j + (kms + 1) * u_dim1;
+ i__6 = m22 * v_dim1 + 2;
+ i__8 = j + (kms + 2) * u_dim1;
+ q__3.r = v[i__6].r * u[i__8].r - v[i__6].i * u[i__8]
+ .i, q__3.i = v[i__6].r * u[i__8].i + v[i__6]
+ .i * u[i__8].r;
+ q__2.r = u[i__7].r + q__3.r, q__2.i = u[i__7].i +
+ q__3.i;
+ q__1.r = v[i__5].r * q__2.r - v[i__5].i * q__2.i,
+ q__1.i = v[i__5].r * q__2.i + v[i__5].i *
+ q__2.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__5 = j + (kms + 1) * u_dim1;
+ i__7 = j + (kms + 1) * u_dim1;
+ q__1.r = u[i__7].r - refsum.r, q__1.i = u[i__7].i -
+ refsum.i;
+ u[i__5].r = q__1.r, u[i__5].i = q__1.i;
+ i__5 = j + (kms + 2) * u_dim1;
+ i__7 = j + (kms + 2) * u_dim1;
+ r_cnjg(&q__3, &v[m22 * v_dim1 + 2]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = u[i__7].r - q__2.r, q__1.i = u[i__7].i -
+ q__2.i;
+ u[i__5].r = q__1.r, u[i__5].i = q__1.i;
+/* L100: */
+ }
+ } else if (*wantz) {
+ i__4 = *ihiz;
+ for (j = *iloz; j <= i__4; ++j) {
+ i__5 = m22 * v_dim1 + 1;
+ i__7 = j + (k + 1) * z_dim1;
+ i__6 = m22 * v_dim1 + 2;
+ i__8 = j + (k + 2) * z_dim1;
+ q__3.r = v[i__6].r * z__[i__8].r - v[i__6].i * z__[
+ i__8].i, q__3.i = v[i__6].r * z__[i__8].i + v[
+ i__6].i * z__[i__8].r;
+ q__2.r = z__[i__7].r + q__3.r, q__2.i = z__[i__7].i +
+ q__3.i;
+ q__1.r = v[i__5].r * q__2.r - v[i__5].i * q__2.i,
+ q__1.i = v[i__5].r * q__2.i + v[i__5].i *
+ q__2.r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__5 = j + (k + 1) * z_dim1;
+ i__7 = j + (k + 1) * z_dim1;
+ q__1.r = z__[i__7].r - refsum.r, q__1.i = z__[i__7].i
+ - refsum.i;
+ z__[i__5].r = q__1.r, z__[i__5].i = q__1.i;
+ i__5 = j + (k + 2) * z_dim1;
+ i__7 = j + (k + 2) * z_dim1;
+ r_cnjg(&q__3, &v[m22 * v_dim1 + 2]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i,
+ q__2.i = refsum.r * q__3.i + refsum.i *
+ q__3.r;
+ q__1.r = z__[i__7].r - q__2.r, q__1.i = z__[i__7].i -
+ q__2.i;
+ z__[i__5].r = q__1.r, z__[i__5].i = q__1.i;
+/* L110: */
+ }
+ }
+ }
+
+/* ==== Vigilant deflation check ==== */
+
+ mstart = mtop;
+ if (krcol + (mstart - 1) * 3 < *ktop) {
+ ++mstart;
+ }
+ mend = mbot;
+ if (bmp22) {
+ ++mend;
+ }
+ if (krcol == *kbot - 2) {
+ ++mend;
+ }
+ i__4 = mend;
+ for (m = mstart; m <= i__4; ++m) {
+/* Computing MIN */
+ i__5 = *kbot - 1, i__7 = krcol + (m - 1) * 3;
+ k = min(i__5,i__7);
+
+/*
+ ==== The following convergence test requires that
+                 .    the traditional small-compared-to-nearby-diagonals
+ . criterion and the Ahues & Tisseur (LAWN 122, 1997)
+ . criteria both be satisfied. The latter improves
+ . accuracy in some examples. Falling back on an
+ . alternate convergence criterion when TST1 or TST2
+ . is zero (as done here) is traditional but probably
+ . unnecessary. ====
+*/
+
+ i__5 = k + 1 + k * h_dim1;
+ if (h__[i__5].r != 0.f || h__[i__5].i != 0.f) {
+ i__5 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ tst1 = (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 = r_imag(&
+ h__[k + k * h_dim1]), dabs(r__2)) + ((r__3 = h__[
+ i__7].r, dabs(r__3)) + (r__4 = r_imag(&h__[k + 1
+ + (k + 1) * h_dim1]), dabs(r__4)));
+ if (tst1 == 0.f) {
+ if (k >= *ktop + 1) {
+ i__5 = k + (k - 1) * h_dim1;
+ tst1 += (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + (k - 1) * h_dim1]), dabs(
+ r__2));
+ }
+ if (k >= *ktop + 2) {
+ i__5 = k + (k - 2) * h_dim1;
+ tst1 += (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + (k - 2) * h_dim1]), dabs(
+ r__2));
+ }
+ if (k >= *ktop + 3) {
+ i__5 = k + (k - 3) * h_dim1;
+ tst1 += (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + (k - 3) * h_dim1]), dabs(
+ r__2));
+ }
+ if (k <= *kbot - 2) {
+ i__5 = k + 2 + (k + 1) * h_dim1;
+ tst1 += (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 2 + (k + 1) * h_dim1]),
+ dabs(r__2));
+ }
+ if (k <= *kbot - 3) {
+ i__5 = k + 3 + (k + 1) * h_dim1;
+ tst1 += (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 3 + (k + 1) * h_dim1]),
+ dabs(r__2));
+ }
+ if (k <= *kbot - 4) {
+ i__5 = k + 4 + (k + 1) * h_dim1;
+ tst1 += (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 4 + (k + 1) * h_dim1]),
+ dabs(r__2));
+ }
+ }
+ i__5 = k + 1 + k * h_dim1;
+/* Computing MAX */
+ r__3 = smlnum, r__4 = ulp * tst1;
+ if ((r__1 = h__[i__5].r, dabs(r__1)) + (r__2 = r_imag(&
+ h__[k + 1 + k * h_dim1]), dabs(r__2)) <= dmax(
+ r__3,r__4)) {
+/* Computing MAX */
+ i__5 = k + 1 + k * h_dim1;
+ i__7 = k + (k + 1) * h_dim1;
+ r__5 = (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 1 + k * h_dim1]), dabs(r__2)),
+ r__6 = (r__3 = h__[i__7].r, dabs(r__3)) + (
+ r__4 = r_imag(&h__[k + (k + 1) * h_dim1]),
+ dabs(r__4));
+ h12 = dmax(r__5,r__6);
+/* Computing MIN */
+ i__5 = k + 1 + k * h_dim1;
+ i__7 = k + (k + 1) * h_dim1;
+ r__5 = (r__1 = h__[i__5].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 1 + k * h_dim1]), dabs(r__2)),
+ r__6 = (r__3 = h__[i__7].r, dabs(r__3)) + (
+ r__4 = r_imag(&h__[k + (k + 1) * h_dim1]),
+ dabs(r__4));
+ h21 = dmin(r__5,r__6);
+ i__5 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ q__2.r = h__[i__5].r - h__[i__7].r, q__2.i = h__[i__5]
+ .i - h__[i__7].i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+/* Computing MAX */
+ i__6 = k + 1 + (k + 1) * h_dim1;
+ r__5 = (r__1 = h__[i__6].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 1 + (k + 1) * h_dim1]), dabs(
+ r__2)), r__6 = (r__3 = q__1.r, dabs(r__3)) + (
+ r__4 = r_imag(&q__1), dabs(r__4));
+ h11 = dmax(r__5,r__6);
+ i__5 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ q__2.r = h__[i__5].r - h__[i__7].r, q__2.i = h__[i__5]
+ .i - h__[i__7].i;
+ q__1.r = q__2.r, q__1.i = q__2.i;
+/* Computing MIN */
+ i__6 = k + 1 + (k + 1) * h_dim1;
+ r__5 = (r__1 = h__[i__6].r, dabs(r__1)) + (r__2 =
+ r_imag(&h__[k + 1 + (k + 1) * h_dim1]), dabs(
+ r__2)), r__6 = (r__3 = q__1.r, dabs(r__3)) + (
+ r__4 = r_imag(&q__1), dabs(r__4));
+ h22 = dmin(r__5,r__6);
+ scl = h11 + h12;
+ tst2 = h22 * (h11 / scl);
+
+/* Computing MAX */
+ r__1 = smlnum, r__2 = ulp * tst2;
+ if (tst2 == 0.f || h21 * (h12 / scl) <= dmax(r__1,
+ r__2)) {
+ i__5 = k + 1 + k * h_dim1;
+ h__[i__5].r = 0.f, h__[i__5].i = 0.f;
+ }
+ }
+ }
+/* L120: */
+ }
+
+/*
+ ==== Fill in the last row of each bulge. ====
+
+ Computing MIN
+*/
+ i__4 = nbmps, i__5 = (*kbot - krcol - 1) / 3;
+ mend = min(i__4,i__5);
+ i__4 = mend;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ i__5 = m * v_dim1 + 1;
+ i__7 = m * v_dim1 + 3;
+ q__2.r = v[i__5].r * v[i__7].r - v[i__5].i * v[i__7].i,
+ q__2.i = v[i__5].r * v[i__7].i + v[i__5].i * v[i__7]
+ .r;
+ i__6 = k + 4 + (k + 3) * h_dim1;
+ q__1.r = q__2.r * h__[i__6].r - q__2.i * h__[i__6].i, q__1.i =
+ q__2.r * h__[i__6].i + q__2.i * h__[i__6].r;
+ refsum.r = q__1.r, refsum.i = q__1.i;
+ i__5 = k + 4 + (k + 1) * h_dim1;
+ q__1.r = -refsum.r, q__1.i = -refsum.i;
+ h__[i__5].r = q__1.r, h__[i__5].i = q__1.i;
+ i__5 = k + 4 + (k + 2) * h_dim1;
+ q__2.r = -refsum.r, q__2.i = -refsum.i;
+ r_cnjg(&q__3, &v[m * v_dim1 + 2]);
+ q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = q__2.r *
+ q__3.i + q__2.i * q__3.r;
+ h__[i__5].r = q__1.r, h__[i__5].i = q__1.i;
+ i__5 = k + 4 + (k + 3) * h_dim1;
+ i__7 = k + 4 + (k + 3) * h_dim1;
+ r_cnjg(&q__3, &v[m * v_dim1 + 3]);
+ q__2.r = refsum.r * q__3.r - refsum.i * q__3.i, q__2.i =
+ refsum.r * q__3.i + refsum.i * q__3.r;
+ q__1.r = h__[i__7].r - q__2.r, q__1.i = h__[i__7].i - q__2.i;
+ h__[i__5].r = q__1.r, h__[i__5].i = q__1.i;
+/* L130: */
+ }
+
+/*
+ ==== End of near-the-diagonal bulge chase. ====
+
+ L140:
+*/
+ }
+
+/*
+ ==== Use U (if accumulated) to update far-from-diagonal
+ . entries in H. If required, use U to update Z as
+ . well. ====
+*/
+
+ if (accum) {
+ if (*wantt) {
+ jtop = 1;
+ jbot = *n;
+ } else {
+ jtop = *ktop;
+ jbot = *kbot;
+ }
+ if (! blk22 || incol < *ktop || ndcol > *kbot || ns <= 2) {
+
+/*
+ ==== Updates not exploiting the 2-by-2 block
+ . structure of U. K1 and NU keep track of
+ . the location and size of U in the special
+ . cases of introducing bulges and chasing
+ . bulges off the bottom. In these special
+ . cases and in case the number of shifts
+ . is NS = 2, there is no 2-by-2 block
+ . structure to exploit. ====
+
+ Computing MAX
+*/
+ i__3 = 1, i__4 = *ktop - incol;
+ k1 = max(i__3,i__4);
+/* Computing MAX */
+ i__3 = 0, i__4 = ndcol - *kbot;
+ nu = kdu - max(i__3,i__4) - k1 + 1;
+
+/* ==== Horizontal Multiply ==== */
+
+ i__3 = jbot;
+ i__4 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__4 < 0 ? jcol >= i__3 :
+ jcol <= i__3; jcol += i__4) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+ cgemm_("C", "N", &nu, &jlen, &nu, &c_b57, &u[k1 + k1 *
+ u_dim1], ldu, &h__[incol + k1 + jcol * h_dim1],
+ ldh, &c_b56, &wh[wh_offset], ldwh);
+ clacpy_("ALL", &nu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + k1 + jcol * h_dim1], ldh);
+/* L150: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__4 = max(*ktop,incol) - 1;
+ i__3 = *nv;
+ for (jrow = jtop; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(*ktop,incol) - jrow;
+ jlen = min(i__5,i__7);
+ cgemm_("N", "N", &jlen, &nu, &nu, &c_b57, &h__[jrow + (
+ incol + k1) * h_dim1], ldh, &u[k1 + k1 * u_dim1],
+ ldu, &c_b56, &wv[wv_offset], ldwv);
+ clacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + k1) * h_dim1], ldh);
+/* L160: */
+ }
+
+/* ==== Z multiply (also vertical) ==== */
+
+ if (*wantz) {
+ i__3 = *ihiz;
+ i__4 = *nv;
+ for (jrow = *iloz; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+ cgemm_("N", "N", &jlen, &nu, &nu, &c_b57, &z__[jrow +
+ (incol + k1) * z_dim1], ldz, &u[k1 + k1 *
+ u_dim1], ldu, &c_b56, &wv[wv_offset], ldwv);
+ clacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &z__[
+ jrow + (incol + k1) * z_dim1], ldz)
+ ;
+/* L170: */
+ }
+ }
+ } else {
+
+/*
+ ==== Updates exploiting U's 2-by-2 block structure.
+ . (I2, I4, J2, J4 are the last rows and columns
+ . of the blocks.) ====
+*/
+
+ i2 = (kdu + 1) / 2;
+ i4 = kdu;
+ j2 = i4 - i2;
+ j4 = kdu;
+
+/*
+ ==== KZS and KNZ deal with the band of zeros
+ . along the diagonal of one of the triangular
+ . blocks. ====
+*/
+
+ kzs = j4 - j2 - (ns + 1);
+ knz = ns + 1;
+
+/* ==== Horizontal multiply ==== */
+
+ i__4 = jbot;
+ i__3 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__3 < 0 ? jcol >= i__4 :
+ jcol <= i__4; jcol += i__3) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+
+/*
+                 ==== Copy bottom of H to top+KZS of scratch
+                 .    (The first KZS rows get multiplied by zero.) ====
+*/
+
+ clacpy_("ALL", &knz, &jlen, &h__[incol + 1 + j2 + jcol *
+ h_dim1], ldh, &wh[kzs + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ claset_("ALL", &kzs, &jlen, &c_b56, &c_b56, &wh[wh_offset]
+ , ldwh);
+ ctrmm_("L", "U", "C", "N", &knz, &jlen, &c_b57, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wh[kzs + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Multiply top of H by U11' ==== */
+
+ cgemm_("C", "N", &i2, &jlen, &j2, &c_b57, &u[u_offset],
+ ldu, &h__[incol + 1 + jcol * h_dim1], ldh, &c_b57,
+ &wh[wh_offset], ldwh);
+
+/* ==== Copy top of H to bottom of WH ==== */
+
+ clacpy_("ALL", &j2, &jlen, &h__[incol + 1 + jcol * h_dim1]
+ , ldh, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ ctrmm_("L", "L", "C", "N", &j2, &jlen, &c_b57, &u[(i2 + 1)
+ * u_dim1 + 1], ldu, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ cgemm_("C", "N", &i__5, &jlen, &i__7, &c_b57, &u[j2 + 1 +
+ (i2 + 1) * u_dim1], ldu, &h__[incol + 1 + j2 +
+ jcol * h_dim1], ldh, &c_b57, &wh[i2 + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Copy it back ==== */
+
+ clacpy_("ALL", &kdu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + 1 + jcol * h_dim1], ldh);
+/* L180: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__3 = max(incol,*ktop) - 1;
+ i__4 = *nv;
+ for (jrow = jtop; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(incol,*ktop) - jrow;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of H to scratch (the first KZS
+ . columns get multiplied by zero) ====
+*/
+
+ clacpy_("ALL", &jlen, &knz, &h__[jrow + (incol + 1 + j2) *
+ h_dim1], ldh, &wv[(kzs + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ claset_("ALL", &jlen, &kzs, &c_b56, &c_b56, &wv[wv_offset]
+ , ldwv);
+ ctrmm_("R", "U", "N", "N", &jlen, &knz, &c_b57, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ cgemm_("N", "N", &jlen, &i2, &j2, &c_b57, &h__[jrow + (
+ incol + 1) * h_dim1], ldh, &u[u_offset], ldu, &
+ c_b57, &wv[wv_offset], ldwv)
+ ;
+
+/* ==== Copy left of H to right of scratch ==== */
+
+ clacpy_("ALL", &jlen, &j2, &h__[jrow + (incol + 1) *
+ h_dim1], ldh, &wv[(i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ ctrmm_("R", "L", "N", "N", &jlen, &i__5, &c_b57, &u[(i2 +
+ 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) * wv_dim1 + 1]
+ , ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ cgemm_("N", "N", &jlen, &i__5, &i__7, &c_b57, &h__[jrow +
+ (incol + 1 + j2) * h_dim1], ldh, &u[j2 + 1 + (i2
+ + 1) * u_dim1], ldu, &c_b57, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Copy it back ==== */
+
+ clacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + 1) * h_dim1], ldh);
+/* L190: */
+ }
+
+/* ==== Multiply Z (also vertical) ==== */
+
+ if (*wantz) {
+ i__4 = *ihiz;
+ i__3 = *nv;
+ for (jrow = *iloz; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of Z to left of scratch (first
+ . KZS columns get multiplied by zero) ====
+*/
+
+ clacpy_("ALL", &jlen, &knz, &z__[jrow + (incol + 1 +
+ j2) * z_dim1], ldz, &wv[(kzs + 1) * wv_dim1 +
+ 1], ldwv);
+
+/* ==== Multiply by U12 ==== */
+
+ claset_("ALL", &jlen, &kzs, &c_b56, &c_b56, &wv[
+ wv_offset], ldwv);
+ ctrmm_("R", "U", "N", "N", &jlen, &knz, &c_b57, &u[j2
+ + 1 + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1)
+ * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ cgemm_("N", "N", &jlen, &i2, &j2, &c_b57, &z__[jrow +
+ (incol + 1) * z_dim1], ldz, &u[u_offset], ldu,
+ &c_b57, &wv[wv_offset], ldwv);
+
+/* ==== Copy left of Z to right of scratch ==== */
+
+ clacpy_("ALL", &jlen, &j2, &z__[jrow + (incol + 1) *
+ z_dim1], ldz, &wv[(i2 + 1) * wv_dim1 + 1],
+ ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ ctrmm_("R", "L", "N", "N", &jlen, &i__5, &c_b57, &u[(
+ i2 + 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ cgemm_("N", "N", &jlen, &i__5, &i__7, &c_b57, &z__[
+ jrow + (incol + 1 + j2) * z_dim1], ldz, &u[j2
+ + 1 + (i2 + 1) * u_dim1], ldu, &c_b57, &wv[(
+ i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Copy the result back to Z ==== */
+
+ clacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &
+ z__[jrow + (incol + 1) * z_dim1], ldz);
+/* L200: */
+ }
+ }
+ }
+ }
+/* L210: */
+ }
+
+/* ==== End of CLAQR5 ==== */
+
+ return 0;
+} /* claqr5_ */
+
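+/*
+   Editorial sketch (not part of the f2c translation): the "Horizontal
+   Multiply" loop of claqr5_ above updates far-from-diagonal entries of H
+   by forming WH := U' * H(slab) with cgemm_ and copying the result back
+   with clacpy_.  The fragment below isolates that pattern; all names and
+   dimensions are hypothetical, and the block is kept under #if 0 so it
+   does not affect compilation of this file.
+*/
+#if 0
+static void example_apply_accumulated_u(complex *u, integer ldu,
+       complex *h, integer ldh, complex *wh, integer ldwh,
+       integer nu, integer jlen)
+{
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+           integer *, complex *, complex *, integer *, complex *, integer *,
+           complex *, complex *, integer *);
+    extern /* Subroutine */ int clacpy_(char *, integer *, integer *,
+           complex *, integer *, complex *, integer *);
+    complex one, zero;
+
+    one.r = 1.f, one.i = 0.f;
+    zero.r = 0.f, zero.i = 0.f;
+
+    /* WH := U' * H(slab): conjugate-transpose GEMM into scratch */
+    cgemm_("C", "N", &nu, &jlen, &nu, &one, u, &ldu, h, &ldh, &zero,
+           wh, &ldwh);
+
+    /* Copy the updated slab back over H */
+    clacpy_("ALL", &nu, &jlen, wh, &ldwh, h, &ldh);
+}
+#endif
+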
+/* Subroutine */ int clarcm_(integer *m, integer *n, real *a, integer *lda,
+ complex *b, integer *ldb, complex *c__, integer *ldc, real *rwork)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5;
+ real r__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLARCM performs a very simple matrix-matrix multiplication:
+ C := A * B,
+ where A is M by M and real; B is M by N and complex;
+ C is M by N and complex.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A and of the matrix C.
+ M >= 0.
+
+ N (input) INTEGER
+          The number of columns of the matrix B and
+          the number of columns of the matrix C.
+ N >= 0.
+
+ A (input) REAL array, dimension (LDA, M)
+ A contains the M by M matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >=max(1,M).
+
+  B       (input) COMPLEX array, dimension (LDB, N)
+ B contains the M by N matrix B.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >=max(1,M).
+
+  C       (output) COMPLEX array, dimension (LDC, N)
+          On exit, C contains the M by N product A*B.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >=max(1,M).
+
+ RWORK (workspace) REAL array, dimension (2*M*N)
+
+ =====================================================================
+
+
+ Quick return if possible.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --rwork;
+
+ /* Function Body */
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ rwork[(j - 1) * *m + i__] = b[i__3].r;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ l = *m * *n + 1;
+ sgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = l + (j - 1) * *m + i__ - 1;
+ c__[i__3].r = rwork[i__4], c__[i__3].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ rwork[(j - 1) * *m + i__] = r_imag(&b[i__ + j * b_dim1]);
+/* L50: */
+ }
+/* L60: */
+ }
+ sgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ r__1 = c__[i__4].r;
+ i__5 = l + (j - 1) * *m + i__ - 1;
+ q__1.r = r__1, q__1.i = rwork[i__5];
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+
+ return 0;
+
+/* End of CLARCM */
+
+} /* clarcm_ */
+
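+/*
+   Editorial usage sketch (not part of the f2c translation): forms
+   C := A*B with clarcm_ above, where A is a 2-by-2 real matrix and B is a
+   2-by-2 complex matrix, both stored column by column.  The values are
+   hypothetical and the block is kept under #if 0 so it does not affect
+   compilation.
+*/
+#if 0
+static void example_clarcm(void)
+{
+    integer m = 2, n = 2, lda = 2, ldb = 2, ldc = 2;
+    real a[4] = { 1.f, 3.f, 2.f, 4.f };            /* A = [1 2; 3 4] */
+    complex b[4] = { {1.f, 1.f}, {0.f, 2.f},       /* column 1 of B */
+                     {2.f, 0.f}, {1.f, -1.f} };    /* column 2 of B */
+    complex c[4];
+    real rwork[8];                                 /* 2*M*N workspace */
+
+    clarcm_(&m, &n, a, &lda, b, &ldb, c, &ldc, rwork);
+    /* c now holds the complex product A*B, column by column. */
+}
+#endif
+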
+/* Subroutine */ int clarf_(char *side, integer *m, integer *n, complex *v,
+ integer *incv, complex *tau, complex *c__, integer *ldc, complex *
+ work)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, i__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__;
+ static logical applyleft;
+ extern /* Subroutine */ int cgerc_(integer *, integer *, complex *,
+ complex *, integer *, complex *, integer *, complex *, integer *),
+ cgemv_(char *, integer *, integer *, complex *, complex *,
+ integer *, complex *, integer *, complex *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ static integer lastc, lastv;
+ extern integer ilaclc_(integer *, integer *, complex *, integer *),
+ ilaclr_(integer *, integer *, complex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLARF applies a complex elementary reflector H to a complex M-by-N
+ matrix C, from either the left or the right. H is represented in the
+ form
+
+ H = I - tau * v * v'
+
+ where tau is a complex scalar and v is a complex vector.
+
+ If tau = 0, then H is taken to be the unit matrix.
+
+  To apply H' (the conjugate transpose of H), supply conjg(tau)
+  instead of tau.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': form H * C
+ = 'R': form C * H
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ V (input) COMPLEX array, dimension
+ (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+ or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+ The vector v in the representation of H. V is not used if
+ TAU = 0.
+
+ INCV (input) INTEGER
+ The increment between elements of v. INCV <> 0.
+
+ TAU (input) COMPLEX
+ The value tau in the representation of H.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+ or C * H if SIDE = 'R'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX array, dimension
+ (N) if SIDE = 'L'
+ or (M) if SIDE = 'R'
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --v;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ applyleft = lsame_(side, "L");
+ lastv = 0;
+ lastc = 0;
+ if (tau->r != 0.f || tau->i != 0.f) {
+/*
+ Set up variables for scanning V. LASTV begins pointing to the end
+ of V.
+*/
+ if (applyleft) {
+ lastv = *m;
+ } else {
+ lastv = *n;
+ }
+ if (*incv > 0) {
+ i__ = (lastv - 1) * *incv + 1;
+ } else {
+ i__ = 1;
+ }
+/* Look for the last non-zero row in V. */
+ for(;;) { /* while(complicated condition) */
+ i__1 = i__;
+ if (!(lastv > 0 && (v[i__1].r == 0.f && v[i__1].i == 0.f)))
+ break;
+ --lastv;
+ i__ -= *incv;
+ }
+ if (applyleft) {
+/* Scan for the last non-zero column in C(1:lastv,:). */
+ lastc = ilaclc_(&lastv, n, &c__[c_offset], ldc);
+ } else {
+/* Scan for the last non-zero row in C(:,1:lastv). */
+ lastc = ilaclr_(m, &lastv, &c__[c_offset], ldc);
+ }
+ }
+/*
+    Note that lastc == 0 renders the BLAS operations null; no special
+ case is needed at this level.
+*/
+ if (applyleft) {
+
+/* Form H * C */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastv,1:lastc)' * v(1:lastv,1) */
+
+ cgemv_("Conjugate transpose", &lastv, &lastc, &c_b57, &c__[
+ c_offset], ldc, &v[1], incv, &c_b56, &work[1], &c__1);
+
+/* C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)' */
+
+ q__1.r = -tau->r, q__1.i = -tau->i;
+ cgerc_(&lastv, &lastc, &q__1, &v[1], incv, &work[1], &c__1, &c__[
+ c_offset], ldc);
+ }
+ } else {
+
+/* Form C * H */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1) */
+
+ cgemv_("No transpose", &lastc, &lastv, &c_b57, &c__[c_offset],
+ ldc, &v[1], incv, &c_b56, &work[1], &c__1);
+
+/* C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)' */
+
+ q__1.r = -tau->r, q__1.i = -tau->i;
+ cgerc_(&lastc, &lastv, &q__1, &work[1], &c__1, &v[1], incv, &c__[
+ c_offset], ldc);
+ }
+ }
+ return 0;
+
+/* End of CLARF */
+
+} /* clarf_ */
+
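+/*
+   Editorial usage sketch (not part of the f2c translation): applies an
+   elementary reflector H = I - tau*v*v' from the left to a 3-by-2 complex
+   matrix with clarf_ above.  In practice v and tau come from clarfg_; the
+   values below are placeholders that only demonstrate the calling
+   sequence.  Kept under #if 0 so it does not affect compilation.
+*/
+#if 0
+static void example_clarf(void)
+{
+    integer m = 3, n = 2, incv = 1, ldc = 3;
+    complex v[3] = { {1.f, 0.f}, {.5f, 0.f}, {.25f, 0.f} };   /* v(1) = 1 */
+    complex tau = {1.2f, .3f};            /* placeholder scalar factor */
+    complex c[6] = { {1.f, 0.f}, {2.f, 0.f}, {3.f, 0.f},      /* column 1 */
+                     {0.f, 1.f}, {0.f, 2.f}, {0.f, 3.f} };    /* column 2 */
+    complex work[2];                      /* length N for SIDE = 'L' */
+
+    clarf_("L", &m, &n, v, &incv, &tau, c, &ldc, work);
+    /* c has been overwritten by H * C.  To apply H' instead, pass
+       conjg(tau), as documented above. */
+}
+#endif
+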
+/* Subroutine */ int clarfb_(char *side, char *trans, char *direct, char *
+ storev, integer *m, integer *n, integer *k, complex *v, integer *ldv,
+ complex *t, integer *ldt, complex *c__, integer *ldc, complex *work,
+ integer *ldwork)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1,
+ work_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ static integer lastc;
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *), ctrmm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *);
+ static integer lastv;
+ extern integer ilaclc_(integer *, integer *, complex *, integer *);
+ extern /* Subroutine */ int clacgv_(integer *, complex *, integer *);
+ extern integer ilaclr_(integer *, integer *, complex *, integer *);
+ static char transt[1];
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLARFB applies a complex block reflector H or its transpose H' to a
+ complex M-by-N matrix C, from either the left or the right.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply H or H' from the Left
+ = 'R': apply H or H' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply H (No transpose)
+ = 'C': apply H' (Conjugate transpose)
+
+ DIRECT (input) CHARACTER*1
+ Indicates how H is formed from a product of elementary
+ reflectors
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Indicates how the vectors which define the elementary
+ reflectors are stored:
+ = 'C': Columnwise
+ = 'R': Rowwise
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ K (input) INTEGER
+ The order of the matrix T (= the number of elementary
+ reflectors whose product defines the block reflector).
+
+ V (input) COMPLEX array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,M) if STOREV = 'R' and SIDE = 'L'
+ (LDV,N) if STOREV = 'R' and SIDE = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+ if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+ if STOREV = 'R', LDV >= K.
+
+ T (input) COMPLEX array, dimension (LDT,K)
+ The triangular K-by-K matrix T in the representation of the
+ block reflector.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by H*C or H'*C or C*H or C*H'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX array, dimension (LDWORK,K)
+
+ LDWORK (input) INTEGER
+ The leading dimension of the array WORK.
+ If SIDE = 'L', LDWORK >= max(1,N);
+ if SIDE = 'R', LDWORK >= max(1,M).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ work_dim1 = *ldwork;
+ work_offset = 1 + work_dim1;
+ work -= work_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(trans, "N")) {
+ *(unsigned char *)transt = 'C';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ if (lsame_(storev, "C")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 ) (first K rows)
+ ( V2 )
+ where V1 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+ clacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L10: */
+ }
+
+/* W := W * V1 */
+
+ ctrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2 */
+
+ i__1 = lastv - *k;
+ cgemm_("Conjugate transpose", "No transpose", &lastc, k, &
+ i__1, &c_b57, &c__[*k + 1 + c_dim1], ldc, &v[*k +
+ 1 + v_dim1], ldv, &c_b57, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ ctrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (*m > *k) {
+
+/* C2 := C2 - V2 * W' */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", &i__1, &
+ lastc, k, &q__1, &v[*k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork, &c_b57, &c__[*k + 1 +
+ c_dim1], ldc);
+ }
+
+/* W := W * V1' */
+
+ ctrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * c_dim1;
+ i__4 = j + i__ * c_dim1;
+ r_cnjg(&q__2, &work[i__ + j * work_dim1]);
+ q__1.r = c__[i__4].r - q__2.r, q__1.i = c__[i__4].i -
+ q__2.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L40: */
+ }
+
+/* W := W * V1 */
+
+ ctrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2 */
+
+ i__1 = lastv - *k;
+ cgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b57, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[*k +
+ 1 + v_dim1], ldv, &c_b57, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ctrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2' */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", &lastc, &
+ i__1, k, &q__1, &work[work_offset], ldwork, &v[*k
+ + 1 + v_dim1], ldv, &c_b57, &c__[(*k + 1) *
+ c_dim1 + 1], ldc);
+ }
+
+/* W := W * V1' */
+
+ ctrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ q__1.r = c__[i__4].r - work[i__5].r, q__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 )
+ ( V2 ) (last K rows)
+ where V2 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+ clacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L70: */
+ }
+
+/* W := W * V2 */
+
+ ctrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1 */
+
+ i__1 = lastv - *k;
+ cgemm_("Conjugate transpose", "No transpose", &lastc, k, &
+ i__1, &c_b57, &c__[c_offset], ldc, &v[v_offset],
+ ldv, &c_b57, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ ctrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1 * W' */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", &i__1, &
+ lastc, k, &q__1, &v[v_offset], ldv, &work[
+ work_offset], ldwork, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ ctrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &
+ work[work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = lastv - *k + j + i__ * c_dim1;
+ i__4 = lastv - *k + j + i__ * c_dim1;
+ r_cnjg(&q__2, &work[i__ + j * work_dim1]);
+ q__1.r = c__[i__4].r - q__2.r, q__1.i = c__[i__4].i -
+ q__2.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1,
+ &work[j * work_dim1 + 1], &c__1);
+/* L100: */
+ }
+
+/* W := W * V2 */
+
+ ctrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1 */
+
+ i__1 = lastv - *k;
+ cgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b57, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b57, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ctrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1' */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", &lastc, &
+ i__1, k, &q__1, &work[work_offset], ldwork, &v[
+ v_offset], ldv, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ ctrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &
+ work[work_offset], ldwork);
+
+/* C2 := C2 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + (lastv - *k + j) * c_dim1;
+ i__4 = i__ + (lastv - *k + j) * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ q__1.r = c__[i__4].r - work[i__5].r, q__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L110: */
+ }
+/* L120: */
+ }
+ }
+ }
+
+ } else if (lsame_(storev, "R")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 V2 ) (V1: first K columns)
+ where V1 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+ clacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L130: */
+ }
+
+/* W := W * V1' */
+
+ ctrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2' */
+
+ i__1 = lastv - *k;
+ cgemm_("Conjugate transpose", "Conjugate transpose", &
+ lastc, k, &i__1, &c_b57, &c__[*k + 1 + c_dim1],
+ ldc, &v[(*k + 1) * v_dim1 + 1], ldv, &c_b57, &
+ work[work_offset], ldwork)
+ ;
+ }
+
+/* W := W * T' or W * T */
+
+ ctrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - V2' * W' */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("Conjugate transpose", "Conjugate transpose", &
+ i__1, &lastc, k, &q__1, &v[(*k + 1) * v_dim1 + 1],
+ ldv, &work[work_offset], ldwork, &c_b57, &c__[*k
+ + 1 + c_dim1], ldc);
+ }
+
+/* W := W * V1 */
+
+ ctrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * c_dim1;
+ i__4 = j + i__ * c_dim1;
+ r_cnjg(&q__2, &work[i__ + j * work_dim1]);
+ q__1.r = c__[i__4].r - q__2.r, q__1.i = c__[i__4].i -
+ q__2.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L140: */
+ }
+/* L150: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L160: */
+ }
+
+/* W := W * V1' */
+
+ ctrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2' */
+
+ i__1 = lastv - *k;
+ cgemm_("No transpose", "Conjugate transpose", &lastc, k, &
+ i__1, &c_b57, &c__[(*k + 1) * c_dim1 + 1], ldc, &
+ v[(*k + 1) * v_dim1 + 1], ldv, &c_b57, &work[
+ work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ctrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2 */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ q__1, &work[work_offset], ldwork, &v[(*k + 1) *
+ v_dim1 + 1], ldv, &c_b57, &c__[(*k + 1) * c_dim1
+ + 1], ldc);
+ }
+
+/* W := W * V1 */
+
+ ctrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ q__1.r = c__[i__4].r - work[i__5].r, q__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L170: */
+ }
+/* L180: */
+ }
+
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 V2 ) (V2: last K columns)
+ where V2 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+ clacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L190: */
+ }
+
+/* W := W * V2' */
+
+ ctrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[(lastv - *k + 1) * v_dim1 + 1],
+ ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1' */
+
+ i__1 = lastv - *k;
+ cgemm_("Conjugate transpose", "Conjugate transpose", &
+ lastc, k, &i__1, &c_b57, &c__[c_offset], ldc, &v[
+ v_offset], ldv, &c_b57, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ ctrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1' * W' */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("Conjugate transpose", "Conjugate transpose", &
+ i__1, &lastc, k, &q__1, &v[v_offset], ldv, &work[
+ work_offset], ldwork, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ ctrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = lastv - *k + j + i__ * c_dim1;
+ i__4 = lastv - *k + j + i__ * c_dim1;
+ r_cnjg(&q__2, &work[i__ + j * work_dim1]);
+ q__1.r = c__[i__4].r - q__2.r, q__1.i = c__[i__4].i -
+ q__2.i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L200: */
+ }
+/* L210: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaclc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaclr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ ccopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1,
+ &work[j * work_dim1 + 1], &c__1);
+/* L220: */
+ }
+
+/* W := W * V2' */
+
+ ctrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[(lastv - *k + 1) * v_dim1 + 1],
+ ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1' */
+
+ i__1 = lastv - *k;
+ cgemm_("No transpose", "Conjugate transpose", &lastc, k, &
+ i__1, &c_b57, &c__[c_offset], ldc, &v[v_offset],
+ ldv, &c_b57, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ctrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1 */
+
+ i__1 = lastv - *k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ q__1, &work[work_offset], ldwork, &v[v_offset],
+ ldv, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ ctrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + (lastv - *k + j) * c_dim1;
+ i__4 = i__ + (lastv - *k + j) * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ q__1.r = c__[i__4].r - work[i__5].r, q__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = q__1.r, c__[i__3].i = q__1.i;
+/* L230: */
+ }
+/* L240: */
+ }
+
+ }
+
+ }
+ }
+
+ return 0;
+
+/* End of CLARFB */
+
+} /* clarfb_ */
+
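+/*
+   Editorial usage sketch (not part of the f2c translation): builds the
+   triangular factor T of a forward, columnwise block of k = 2 reflectors
+   with clarft_ (defined later in this file) and applies the block
+   reflector H = I - V*T*V' to a 5-by-3 matrix from the left with clarfb_
+   above.  V and TAU would normally come from a QR factorization (clarfg_
+   applied column by column); here they are left unfilled and the fragment
+   only shows the calling sequence.  Kept under #if 0.
+*/
+#if 0
+static void example_clarft_clarfb(void)
+{
+    extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *,
+           complex *, integer *, complex *, complex *, integer *);
+    integer n = 5, k = 2, m = 5, ncols = 3;
+    integer ldv = 5, ldt = 2, ldc = 5, ldwork = 3;
+    complex v[10];       /* 5-by-2, reflector vectors stored columnwise */
+    complex tau[2];      /* scalar factors, normally from clarfg_ */
+    complex t[4];        /* 2-by-2 upper triangular factor */
+    complex c[15];       /* 5-by-3 matrix to be updated */
+    complex work[6];     /* LDWORK-by-K scratch, LDWORK >= max(1,N) */
+
+    /* ... fill v, tau and c here (omitted) ... */
+
+    /* T := triangular factor of the block reflector */
+    clarft_("Forward", "Columnwise", &n, &k, v, &ldv, tau, t, &ldt);
+
+    /* C := H * C, applying all k reflectors at once */
+    clarfb_("Left", "No transpose", "Forward", "Columnwise", &m, &ncols,
+            &k, v, &ldv, t, &ldt, c, &ldc, work, &ldwork);
+}
+#endif
+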
+/* Subroutine */ int clarfg_(integer *n, complex *alpha, complex *x, integer *
+ incx, complex *tau)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer j, knt;
+ static real beta;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *);
+ static real alphi, alphr, xnorm;
+ extern doublereal scnrm2_(integer *, complex *, integer *), slapy3_(real *
+ , real *, real *);
+ extern /* Complex */ VOID cladiv_(complex *, complex *, complex *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
+ *);
+ static real safmin, rsafmn;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLARFG generates a complex elementary reflector H of order n, such
+ that
+
+ H' * ( alpha ) = ( beta ), H' * H = I.
+ ( x ) ( 0 )
+
+ where alpha and beta are scalars, with beta real, and x is an
+ (n-1)-element complex vector. H is represented in the form
+
+ H = I - tau * ( 1 ) * ( 1 v' ) ,
+ ( v )
+
+ where tau is a complex scalar and v is a complex (n-1)-element
+ vector. Note that H is not hermitian.
+
+ If the elements of x are all zero and alpha is real, then tau = 0
+ and H is taken to be the unit matrix.
+
+ Otherwise 1 <= real(tau) <= 2 and abs(tau-1) <= 1 .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the elementary reflector.
+
+ ALPHA (input/output) COMPLEX
+ On entry, the value alpha.
+ On exit, it is overwritten with the value beta.
+
+ X (input/output) COMPLEX array, dimension
+ (1+(N-2)*abs(INCX))
+ On entry, the vector x.
+ On exit, it is overwritten with the vector v.
+
+ INCX (input) INTEGER
+ The increment between elements of X. INCX > 0.
+
+ TAU (output) COMPLEX
+ The value tau.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n <= 0) {
+ tau->r = 0.f, tau->i = 0.f;
+ return 0;
+ }
+
+ i__1 = *n - 1;
+ xnorm = scnrm2_(&i__1, &x[1], incx);
+ alphr = alpha->r;
+ alphi = r_imag(alpha);
+
+ if (xnorm == 0.f && alphi == 0.f) {
+
+/* H = I */
+
+ tau->r = 0.f, tau->i = 0.f;
+ } else {
+
+/* general case */
+
+ r__1 = slapy3_(&alphr, &alphi, &xnorm);
+ beta = -r_sign(&r__1, &alphr);
+ safmin = slamch_("S") / slamch_("E");
+ rsafmn = 1.f / safmin;
+
+ knt = 0;
+ if (dabs(beta) < safmin) {
+
+/* XNORM, BETA may be inaccurate; scale X and recompute them */
+
+L10:
+ ++knt;
+ i__1 = *n - 1;
+ csscal_(&i__1, &rsafmn, &x[1], incx);
+ beta *= rsafmn;
+ alphi *= rsafmn;
+ alphr *= rsafmn;
+ if (dabs(beta) < safmin) {
+ goto L10;
+ }
+
+/* New BETA is at most 1, at least SAFMIN */
+
+ i__1 = *n - 1;
+ xnorm = scnrm2_(&i__1, &x[1], incx);
+ q__1.r = alphr, q__1.i = alphi;
+ alpha->r = q__1.r, alpha->i = q__1.i;
+ r__1 = slapy3_(&alphr, &alphi, &xnorm);
+ beta = -r_sign(&r__1, &alphr);
+ }
+ r__1 = (beta - alphr) / beta;
+ r__2 = -alphi / beta;
+ q__1.r = r__1, q__1.i = r__2;
+ tau->r = q__1.r, tau->i = q__1.i;
+ q__2.r = alpha->r - beta, q__2.i = alpha->i;
+ cladiv_(&q__1, &c_b57, &q__2);
+ alpha->r = q__1.r, alpha->i = q__1.i;
+ i__1 = *n - 1;
+ cscal_(&i__1, alpha, &x[1], incx);
+
+/* If ALPHA is subnormal, it may lose relative accuracy */
+
+ i__1 = knt;
+ for (j = 1; j <= i__1; ++j) {
+ beta *= safmin;
+/* L20: */
+ }
+ alpha->r = beta, alpha->i = 0.f;
+ }
+
+ return 0;
+
+/* End of CLARFG */
+
+} /* clarfg_ */
+
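+/*
+   Editorial usage sketch (not part of the f2c translation): uses clarfg_
+   above to build an elementary reflector that annihilates all but the
+   first entry of a 3-vector, the basic step of a Householder column
+   elimination.  The input values are hypothetical; kept under #if 0 so it
+   does not affect compilation.
+*/
+#if 0
+static void example_clarfg(void)
+{
+    integer n = 3, incx = 1;
+    complex x[3] = { {3.f, 4.f}, {1.f, 0.f}, {0.f, 2.f} };
+    complex tau;
+
+    /* On exit x[0] holds beta (real), x[1..2] hold v(2:3), and tau is the
+       scalar factor, so that H' * x_original = (beta, 0, 0)'. */
+    clarfg_(&n, &x[0], &x[1], &incx, &tau);
+}
+#endif
+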
+/* Subroutine */ int clarft_(char *direct, char *storev, integer *n, integer *
+ k, complex *v, integer *ldv, complex *tau, complex *t, integer *ldt)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, prevlastv;
+ static complex vii;
+ extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex *
+ , complex *, integer *, complex *, integer *, complex *, complex *
+ , integer *);
+ extern logical lsame_(char *, char *);
+ static integer lastv;
+ extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *,
+ complex *, integer *, complex *, integer *), clacgv_(integer *, complex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLARFT forms the triangular factor T of a complex block reflector H
+ of order n, which is defined as a product of k elementary reflectors.
+
+ If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+
+ If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+
+ If STOREV = 'C', the vector which defines the elementary reflector
+ H(i) is stored in the i-th column of the array V, and
+
+ H = I - V * T * V'
+
+ If STOREV = 'R', the vector which defines the elementary reflector
+ H(i) is stored in the i-th row of the array V, and
+
+ H = I - V' * T * V
+
+ Arguments
+ =========
+
+ DIRECT (input) CHARACTER*1
+ Specifies the order in which the elementary reflectors are
+ multiplied to form the block reflector:
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Specifies how the vectors which define the elementary
+ reflectors are stored (see also Further Details):
+ = 'C': columnwise
+ = 'R': rowwise
+
+ N (input) INTEGER
+ The order of the block reflector H. N >= 0.
+
+ K (input) INTEGER
+ The order of the triangular factor T (= the number of
+ elementary reflectors). K >= 1.
+
+ V (input/output) COMPLEX array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,N) if STOREV = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i).
+
+ T (output) COMPLEX array, dimension (LDT,K)
+ The k by k triangular factor T of the block reflector.
+ If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+ lower triangular. The rest of the array is not used.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ Further Details
+ ===============
+
+ The shape of the matrix V and the storage of the vectors which define
+ the H(i) is best illustrated by the following example with n = 5 and
+ k = 3. The elements equal to 1 are not stored; the corresponding
+ array elements are modified but restored on exit. The rest of the
+ array is not used.
+
+ DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+
+ V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+ ( v1 1 ) ( 1 v2 v2 v2 )
+ ( v1 v2 1 ) ( 1 v3 v3 )
+ ( v1 v2 v3 )
+ ( v1 v2 v3 )
+
+ DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+
+ V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+ ( v1 v2 v3 ) ( v2 v2 v2 1 )
+ ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+ ( 1 v3 )
+ ( 1 )
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ --tau;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (lsame_(direct, "F")) {
+ prevlastv = *n;
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ prevlastv = max(prevlastv,i__);
+ i__2 = i__;
+ if (tau[i__2].r == 0.f && tau[i__2].i == 0.f) {
+
+/* H(i) = I */
+
+ i__2 = i__;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = j + i__ * t_dim1;
+ t[i__3].r = 0.f, t[i__3].i = 0.f;
+/* L10: */
+ }
+ } else {
+
+/* general case */
+
+ i__2 = i__ + i__ * v_dim1;
+ vii.r = v[i__2].r, vii.i = v[i__2].i;
+ i__2 = i__ + i__ * v_dim1;
+ v[i__2].r = 1.f, v[i__2].i = 0.f;
+ if (lsame_(storev, "C")) {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ i__3 = lastv + i__ * v_dim1;
+ if (v[i__3].r != 0.f || v[i__3].i != 0.f) {
+ goto L15;
+ }
+ }
+L15:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)' * V(i:j,i) */
+
+ i__2 = j - i__ + 1;
+ i__3 = i__ - 1;
+ i__4 = i__;
+ q__1.r = -tau[i__4].r, q__1.i = -tau[i__4].i;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &q__1, &v[i__
+ + v_dim1], ldv, &v[i__ + i__ * v_dim1], &c__1, &
+ c_b56, &t[i__ * t_dim1 + 1], &c__1);
+ } else {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ i__3 = i__ + lastv * v_dim1;
+ if (v[i__3].r != 0.f || v[i__3].i != 0.f) {
+ goto L16;
+ }
+ }
+L16:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)' */
+
+ if (i__ < j) {
+ i__2 = j - i__;
+ clacgv_(&i__2, &v[i__ + (i__ + 1) * v_dim1], ldv);
+ }
+ i__2 = i__ - 1;
+ i__3 = j - i__ + 1;
+ i__4 = i__;
+ q__1.r = -tau[i__4].r, q__1.i = -tau[i__4].i;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &v[i__ *
+ v_dim1 + 1], ldv, &v[i__ + i__ * v_dim1], ldv, &
+ c_b56, &t[i__ * t_dim1 + 1], &c__1);
+ if (i__ < j) {
+ i__2 = j - i__;
+ clacgv_(&i__2, &v[i__ + (i__ + 1) * v_dim1], ldv);
+ }
+ }
+ i__2 = i__ + i__ * v_dim1;
+ v[i__2].r = vii.r, v[i__2].i = vii.i;
+
+/* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ ctrmv_("Upper", "No transpose", "Non-unit", &i__2, &t[
+ t_offset], ldt, &t[i__ * t_dim1 + 1], &c__1);
+ i__2 = i__ + i__ * t_dim1;
+ i__3 = i__;
+ t[i__2].r = tau[i__3].r, t[i__2].i = tau[i__3].i;
+ if (i__ > 1) {
+ prevlastv = max(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+/* L20: */
+ }
+ } else {
+ prevlastv = 1;
+ for (i__ = *k; i__ >= 1; --i__) {
+ i__1 = i__;
+ if (tau[i__1].r == 0.f && tau[i__1].i == 0.f) {
+
+/* H(i) = I */
+
+ i__1 = *k;
+ for (j = i__; j <= i__1; ++j) {
+ i__2 = j + i__ * t_dim1;
+ t[i__2].r = 0.f, t[i__2].i = 0.f;
+/* L30: */
+ }
+ } else {
+
+/* general case */
+
+ if (i__ < *k) {
+ if (lsame_(storev, "C")) {
+ i__1 = *n - *k + i__ + i__ * v_dim1;
+ vii.r = v[i__1].r, vii.i = v[i__1].i;
+ i__1 = *n - *k + i__ + i__ * v_dim1;
+ v[i__1].r = 1.f, v[i__1].i = 0.f;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ i__2 = lastv + i__ * v_dim1;
+ if (v[i__2].r != 0.f || v[i__2].i != 0.f) {
+ goto L35;
+ }
+ }
+L35:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(j:n-k+i,i+1:k)' * V(j:n-k+i,i)
+*/
+
+ i__1 = *n - *k + i__ - j + 1;
+ i__2 = *k - i__;
+ i__3 = i__;
+ q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i;
+ cgemv_("Conjugate transpose", &i__1, &i__2, &q__1, &v[
+ j + (i__ + 1) * v_dim1], ldv, &v[j + i__ *
+ v_dim1], &c__1, &c_b56, &t[i__ + 1 + i__ *
+ t_dim1], &c__1);
+ i__1 = *n - *k + i__ + i__ * v_dim1;
+ v[i__1].r = vii.r, v[i__1].i = vii.i;
+ } else {
+ i__1 = i__ + (*n - *k + i__) * v_dim1;
+ vii.r = v[i__1].r, vii.i = v[i__1].i;
+ i__1 = i__ + (*n - *k + i__) * v_dim1;
+ v[i__1].r = 1.f, v[i__1].i = 0.f;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ i__2 = i__ + lastv * v_dim1;
+ if (v[i__2].r != 0.f || v[i__2].i != 0.f) {
+ goto L36;
+ }
+ }
+L36:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)'
+*/
+
+ i__1 = *n - *k + i__ - 1 - j + 1;
+ clacgv_(&i__1, &v[i__ + j * v_dim1], ldv);
+ i__1 = *k - i__;
+ i__2 = *n - *k + i__ - j + 1;
+ i__3 = i__;
+ q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i;
+ cgemv_("No transpose", &i__1, &i__2, &q__1, &v[i__ +
+ 1 + j * v_dim1], ldv, &v[i__ + j * v_dim1],
+ ldv, &c_b56, &t[i__ + 1 + i__ * t_dim1], &
+ c__1);
+ i__1 = *n - *k + i__ - 1 - j + 1;
+ clacgv_(&i__1, &v[i__ + j * v_dim1], ldv);
+ i__1 = i__ + (*n - *k + i__) * v_dim1;
+ v[i__1].r = vii.r, v[i__1].i = vii.i;
+ }
+
+/* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) */
+
+ i__1 = *k - i__;
+ ctrmv_("Lower", "No transpose", "Non-unit", &i__1, &t[i__
+ + 1 + (i__ + 1) * t_dim1], ldt, &t[i__ + 1 + i__ *
+ t_dim1], &c__1)
+ ;
+ if (i__ > 1) {
+ prevlastv = min(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+ i__1 = i__ + i__ * t_dim1;
+ i__2 = i__;
+ t[i__1].r = tau[i__2].r, t[i__1].i = tau[i__2].i;
+ }
+/* L40: */
+ }
+ }
+ return 0;
+
+/* End of CLARFT */
+
+} /* clarft_ */
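+
+/*
+    Illustrative sketch only (not part of the translation, never compiled):
+    forming the 2-by-2 triangular factor T for two forward, columnwise
+    stored reflectors of order 4 through the CLARFT interface above.
+*/
+#if 0
+static void example_clarft(void)
+{
+    integer n = 4, k = 2, ldv = 4, ldt = 2;
+    complex v[8];    /* V is 4 x 2, reflector vectors stored columnwise */
+    complex tau[2];  /* scalar factors, e.g. from two prior clarfg_ calls */
+    complex t[4];    /* receives the upper triangular 2 x 2 factor T */
+
+    /* ... fill v and tau before this point ... */
+    clarft_("Forward", "Columnwise", &n, &k, v, &ldv, tau, t, &ldt);
+    /* The block reflector H = H(1)*H(2) can now be applied as I - V*T*V'. */
+}
+#endif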
+
+/* Subroutine */ int clartg_(complex *f, complex *g, real *cs, complex *sn,
+ complex *r__)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static real d__;
+ static integer i__;
+ static real f2, g2;
+ static complex ff;
+ static real di, dr;
+ static complex fs, gs;
+ static real f2s, g2s, eps, scale;
+ static integer count;
+ static real safmn2, safmx2;
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ static real safmin;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLARTG generates a plane rotation so that
+
+ [ CS SN ] [ F ] [ R ]
+ [ __ ] . [ ] = [ ] where CS**2 + |SN|**2 = 1.
+ [ -SN CS ] [ G ] [ 0 ]
+
+ This is a faster version of the BLAS1 routine CROTG, except for
+ the following differences:
+ F and G are unchanged on return.
+ If G=0, then CS=1 and SN=0.
+ If F=0, then CS=0 and SN is chosen so that R is real.
+
+ Arguments
+ =========
+
+ F (input) COMPLEX
+ The first component of vector to be rotated.
+
+ G (input) COMPLEX
+ The second component of vector to be rotated.
+
+ CS (output) REAL
+ The cosine of the rotation.
+
+ SN (output) COMPLEX
+ The sine of the rotation.
+
+ R (output) COMPLEX
+ The nonzero component of the rotated vector.
+
+ Further Details
+    ===============
+
+ 3-5-96 - Modified with a new algorithm by W. Kahan and J. Demmel
+
+ This version has a few statements commented out for thread safety
+ (machine parameters are computed on each entry). 10 feb 03, SJH.
+
+ =====================================================================
+
+ LOGICAL FIRST
+ SAVE FIRST, SAFMX2, SAFMIN, SAFMN2
+ DATA FIRST / .TRUE. /
+
+ IF( FIRST ) THEN
+*/
+ safmin = slamch_("S");
+ eps = slamch_("E");
+ r__1 = slamch_("B");
+ i__1 = (integer) (log(safmin / eps) / log(slamch_("B")) / 2.f);
+ safmn2 = pow_ri(&r__1, &i__1);
+ safmx2 = 1.f / safmn2;
+/*
+ FIRST = .FALSE.
+ END IF
+ Computing MAX
+ Computing MAX
+*/
+ r__7 = (r__1 = f->r, dabs(r__1)), r__8 = (r__2 = r_imag(f), dabs(r__2));
+/* Computing MAX */
+ r__9 = (r__3 = g->r, dabs(r__3)), r__10 = (r__4 = r_imag(g), dabs(r__4));
+ r__5 = dmax(r__7,r__8), r__6 = dmax(r__9,r__10);
+ scale = dmax(r__5,r__6);
+ fs.r = f->r, fs.i = f->i;
+ gs.r = g->r, gs.i = g->i;
+ count = 0;
+ if (scale >= safmx2) {
+L10:
+ ++count;
+ q__1.r = safmn2 * fs.r, q__1.i = safmn2 * fs.i;
+ fs.r = q__1.r, fs.i = q__1.i;
+ q__1.r = safmn2 * gs.r, q__1.i = safmn2 * gs.i;
+ gs.r = q__1.r, gs.i = q__1.i;
+ scale *= safmn2;
+ if (scale >= safmx2) {
+ goto L10;
+ }
+ } else if (scale <= safmn2) {
+ if (g->r == 0.f && g->i == 0.f) {
+ *cs = 1.f;
+ sn->r = 0.f, sn->i = 0.f;
+ r__->r = f->r, r__->i = f->i;
+ return 0;
+ }
+L20:
+ --count;
+ q__1.r = safmx2 * fs.r, q__1.i = safmx2 * fs.i;
+ fs.r = q__1.r, fs.i = q__1.i;
+ q__1.r = safmx2 * gs.r, q__1.i = safmx2 * gs.i;
+ gs.r = q__1.r, gs.i = q__1.i;
+ scale *= safmx2;
+ if (scale <= safmn2) {
+ goto L20;
+ }
+ }
+/* Computing 2nd power */
+ r__1 = fs.r;
+/* Computing 2nd power */
+ r__2 = r_imag(&fs);
+ f2 = r__1 * r__1 + r__2 * r__2;
+/* Computing 2nd power */
+ r__1 = gs.r;
+/* Computing 2nd power */
+ r__2 = r_imag(&gs);
+ g2 = r__1 * r__1 + r__2 * r__2;
+ if (f2 <= dmax(g2,1.f) * safmin) {
+
+/* This is a rare case: F is very small. */
+
+ if (f->r == 0.f && f->i == 0.f) {
+ *cs = 0.f;
+ r__2 = g->r;
+ r__3 = r_imag(g);
+ r__1 = slapy2_(&r__2, &r__3);
+ r__->r = r__1, r__->i = 0.f;
+/* Do complex/real division explicitly with two real divisions */
+ r__1 = gs.r;
+ r__2 = r_imag(&gs);
+ d__ = slapy2_(&r__1, &r__2);
+ r__1 = gs.r / d__;
+ r__2 = -r_imag(&gs) / d__;
+ q__1.r = r__1, q__1.i = r__2;
+ sn->r = q__1.r, sn->i = q__1.i;
+ return 0;
+ }
+ r__1 = fs.r;
+ r__2 = r_imag(&fs);
+ f2s = slapy2_(&r__1, &r__2);
+/*
+ G2 and G2S are accurate
+ G2 is at least SAFMIN, and G2S is at least SAFMN2
+*/
+ g2s = sqrt(g2);
+/*
+ Error in CS from underflow in F2S is at most
+ UNFL / SAFMN2 .lt. sqrt(UNFL*EPS) .lt. EPS
+ If MAX(G2,ONE)=G2, then F2 .lt. G2*SAFMIN,
+ and so CS .lt. sqrt(SAFMIN)
+ If MAX(G2,ONE)=ONE, then F2 .lt. SAFMIN
+ and so CS .lt. sqrt(SAFMIN)/SAFMN2 = sqrt(EPS)
+ Therefore, CS = F2S/G2S / sqrt( 1 + (F2S/G2S)**2 ) = F2S/G2S
+*/
+ *cs = f2s / g2s;
+/*
+ Make sure abs(FF) = 1
+ Do complex/real division explicitly with 2 real divisions
+ Computing MAX
+*/
+ r__3 = (r__1 = f->r, dabs(r__1)), r__4 = (r__2 = r_imag(f), dabs(r__2)
+ );
+ if (dmax(r__3,r__4) > 1.f) {
+ r__1 = f->r;
+ r__2 = r_imag(f);
+ d__ = slapy2_(&r__1, &r__2);
+ r__1 = f->r / d__;
+ r__2 = r_imag(f) / d__;
+ q__1.r = r__1, q__1.i = r__2;
+ ff.r = q__1.r, ff.i = q__1.i;
+ } else {
+ dr = safmx2 * f->r;
+ di = safmx2 * r_imag(f);
+ d__ = slapy2_(&dr, &di);
+ r__1 = dr / d__;
+ r__2 = di / d__;
+ q__1.r = r__1, q__1.i = r__2;
+ ff.r = q__1.r, ff.i = q__1.i;
+ }
+ r__1 = gs.r / g2s;
+ r__2 = -r_imag(&gs) / g2s;
+ q__2.r = r__1, q__2.i = r__2;
+ q__1.r = ff.r * q__2.r - ff.i * q__2.i, q__1.i = ff.r * q__2.i + ff.i
+ * q__2.r;
+ sn->r = q__1.r, sn->i = q__1.i;
+ q__2.r = *cs * f->r, q__2.i = *cs * f->i;
+ q__3.r = sn->r * g->r - sn->i * g->i, q__3.i = sn->r * g->i + sn->i *
+ g->r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ r__->r = q__1.r, r__->i = q__1.i;
+ } else {
+
+/*
+ This is the most common case.
+ Neither F2 nor F2/G2 are less than SAFMIN
+ F2S cannot overflow, and it is accurate
+*/
+
+ f2s = sqrt(g2 / f2 + 1.f);
+/* Do the F2S(real)*FS(complex) multiply with two real multiplies */
+ r__1 = f2s * fs.r;
+ r__2 = f2s * r_imag(&fs);
+ q__1.r = r__1, q__1.i = r__2;
+ r__->r = q__1.r, r__->i = q__1.i;
+ *cs = 1.f / f2s;
+ d__ = f2 + g2;
+/* Do complex/real division explicitly with two real divisions */
+ r__1 = r__->r / d__;
+ r__2 = r_imag(r__) / d__;
+ q__1.r = r__1, q__1.i = r__2;
+ sn->r = q__1.r, sn->i = q__1.i;
+ r_cnjg(&q__2, &gs);
+ q__1.r = sn->r * q__2.r - sn->i * q__2.i, q__1.i = sn->r * q__2.i +
+ sn->i * q__2.r;
+ sn->r = q__1.r, sn->i = q__1.i;
+ if (count != 0) {
+ if (count > 0) {
+ i__1 = count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ q__1.r = safmx2 * r__->r, q__1.i = safmx2 * r__->i;
+ r__->r = q__1.r, r__->i = q__1.i;
+/* L30: */
+ }
+ } else {
+ i__1 = -count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ q__1.r = safmn2 * r__->r, q__1.i = safmn2 * r__->i;
+ r__->r = q__1.r, r__->i = q__1.i;
+/* L40: */
+ }
+ }
+ }
+ }
+ return 0;
+
+/* End of CLARTG */
+
+} /* clartg_ */
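+
+/*
+    Illustrative sketch (not compiled): generating a plane rotation that
+    zeroes the second component of (f, g)^T, as described for CLARTG above.
+*/
+#if 0
+static void example_clartg(void)
+{
+    complex f = {3.f, 4.f};
+    complex g = {1.f, -2.f};
+    real cs;
+    complex sn, r;
+
+    clartg_(&f, &g, &cs, &sn, &r);
+    /* cs*f + sn*g == r and -conjg(sn)*f + cs*g == 0; f and g are unchanged. */
+}
+#endif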
+
+/* Subroutine */ int clascl_(char *type__, integer *kl, integer *ku, real *
+ cfrom, real *cto, integer *m, integer *n, complex *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, k1, k2, k3, k4;
+ static real mul, cto1;
+ static logical done;
+ static real ctoc;
+ extern logical lsame_(char *, char *);
+ static integer itype;
+ static real cfrom1;
+ extern doublereal slamch_(char *);
+ static real cfromc;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real bignum;
+ extern logical sisnan_(real *);
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLASCL multiplies the M by N complex matrix A by the real scalar
+ CTO/CFROM. This is done without over/underflow as long as the final
+ result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+ A may be full, upper triangular, lower triangular, upper Hessenberg,
+ or banded.
+
+ Arguments
+ =========
+
+ TYPE (input) CHARACTER*1
+            TYPE indicates the storage type of the input matrix.
+ = 'G': A is a full matrix.
+ = 'L': A is a lower triangular matrix.
+ = 'U': A is an upper triangular matrix.
+ = 'H': A is an upper Hessenberg matrix.
+            = 'B':  A is a symmetric band matrix with lower bandwidth KL
+                    and upper bandwidth KU and with only the lower
+                    half stored.
+            = 'Q':  A is a symmetric band matrix with lower bandwidth KL
+                    and upper bandwidth KU and with only the upper
+                    half stored.
+ = 'Z': A is a band matrix with lower bandwidth KL and upper
+ bandwidth KU.
+
+ KL (input) INTEGER
+ The lower bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ KU (input) INTEGER
+ The upper bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ CFROM (input) REAL
+ CTO (input) REAL
+ The matrix A is multiplied by CTO/CFROM. A(I,J) is computed
+ without over/underflow if the final result CTO*A(I,J)/CFROM
+ can be represented without over/underflow. CFROM must be
+ nonzero.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ The matrix to be multiplied by CTO/CFROM. See TYPE for the
+ storage type.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ INFO (output) INTEGER
+ 0 - successful exit
+ <0 - if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(type__, "G")) {
+ itype = 0;
+ } else if (lsame_(type__, "L")) {
+ itype = 1;
+ } else if (lsame_(type__, "U")) {
+ itype = 2;
+ } else if (lsame_(type__, "H")) {
+ itype = 3;
+ } else if (lsame_(type__, "B")) {
+ itype = 4;
+ } else if (lsame_(type__, "Q")) {
+ itype = 5;
+ } else if (lsame_(type__, "Z")) {
+ itype = 6;
+ } else {
+ itype = -1;
+ }
+
+ if (itype == -1) {
+ *info = -1;
+ } else if (*cfrom == 0.f || sisnan_(cfrom)) {
+ *info = -4;
+ } else if (sisnan_(cto)) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -6;
+ } else if (*n < 0 || itype == 4 && *n != *m || itype == 5 && *n != *m) {
+ *info = -7;
+ } else if (itype <= 3 && *lda < max(1,*m)) {
+ *info = -9;
+ } else if (itype >= 4) {
+/* Computing MAX */
+ i__1 = *m - 1;
+ if (*kl < 0 || *kl > max(i__1,0)) {
+ *info = -2;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *n - 1;
+ if (*ku < 0 || *ku > max(i__1,0) || (itype == 4 || itype == 5) &&
+ *kl != *ku) {
+ *info = -3;
+ } else if (itype == 4 && *lda < *kl + 1 || itype == 5 && *lda < *
+ ku + 1 || itype == 6 && *lda < (*kl << 1) + *ku + 1) {
+ *info = -9;
+ }
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLASCL", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *m == 0) {
+ return 0;
+ }
+
+/* Get machine parameters */
+
+ smlnum = slamch_("S");
+ bignum = 1.f / smlnum;
+
+ cfromc = *cfrom;
+ ctoc = *cto;
+
+L10:
+ cfrom1 = cfromc * smlnum;
+ if (cfrom1 == cfromc) {
+/*
+ CFROMC is an inf. Multiply by a correctly signed zero for
+ finite CTOC, or a NaN if CTOC is infinite.
+*/
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ cto1 = ctoc;
+ } else {
+ cto1 = ctoc / bignum;
+ if (cto1 == ctoc) {
+/*
+ CTOC is either 0 or an inf. In both cases, CTOC itself
+ serves as the correct multiplication factor.
+*/
+ mul = ctoc;
+ done = TRUE_;
+ cfromc = 1.f;
+ } else if (dabs(cfrom1) > dabs(ctoc) && ctoc != 0.f) {
+ mul = smlnum;
+ done = FALSE_;
+ cfromc = cfrom1;
+ } else if (dabs(cto1) > dabs(cfromc)) {
+ mul = bignum;
+ done = FALSE_;
+ ctoc = cto1;
+ } else {
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ }
+ }
+
+ if (itype == 0) {
+
+/* Full matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (itype == 1) {
+
+/* Lower triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L40: */
+ }
+/* L50: */
+ }
+
+ } else if (itype == 2) {
+
+/* Upper triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L60: */
+ }
+/* L70: */
+ }
+
+ } else if (itype == 3) {
+
+/* Upper Hessenberg matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j + 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (itype == 4) {
+
+/* Lower half of a symmetric band matrix */
+
+ k3 = *kl + 1;
+ k4 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = k3, i__4 = k4 - j;
+ i__2 = min(i__3,i__4);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L100: */
+ }
+/* L110: */
+ }
+
+ } else if (itype == 5) {
+
+/* Upper half of a symmetric band matrix */
+
+ k1 = *ku + 2;
+ k3 = *ku + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = k1 - j;
+ i__3 = k3;
+ for (i__ = max(i__2,1); i__ <= i__3; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L120: */
+ }
+/* L130: */
+ }
+
+ } else if (itype == 6) {
+
+/* Band matrix */
+
+ k1 = *kl + *ku + 2;
+ k2 = *kl + 1;
+ k3 = (*kl << 1) + *ku + 1;
+ k4 = *kl + *ku + 1 + *m;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__3 = k1 - j;
+/* Computing MIN */
+ i__4 = k3, i__5 = k4 - j;
+ i__2 = min(i__4,i__5);
+ for (i__ = max(i__3,k2); i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ q__1.r = mul * a[i__4].r, q__1.i = mul * a[i__4].i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L140: */
+ }
+/* L150: */
+ }
+
+ }
+
+ if (! done) {
+ goto L10;
+ }
+
+ return 0;
+
+/* End of CLASCL */
+
+} /* clascl_ */
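+
+/*
+    Illustrative sketch (not compiled): scaling a full 2-by-2 complex matrix
+    by CTO/CFROM with CLASCL, which avoids intermediate over/underflow.
+*/
+#if 0
+static void example_clascl(void)
+{
+    integer kl = 0, ku = 0, m = 2, n = 2, lda = 2, info;
+    real cfrom = 4.f, cto = 1.f;   /* multiply A by 1/4 */
+    complex a[4] = {{4.f, 0.f}, {8.f, 4.f}, {0.f, -4.f}, {12.f, 0.f}};
+
+    clascl_("General", &kl, &ku, &cfrom, &cto, &m, &n, a, &lda, &info);
+    /* info == 0 on success; a now holds A * (cto/cfrom). */
+}
+#endif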
+
+/* Subroutine */ int claset_(char *uplo, integer *m, integer *n, complex *
+ alpha, complex *beta, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLASET initializes a 2-D array A to BETA on the diagonal and
+ ALPHA on the offdiagonals.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be set.
+ = 'U': Upper triangular part is set. The lower triangle
+ is unchanged.
+ = 'L': Lower triangular part is set. The upper triangle
+ is unchanged.
+ Otherwise: All of the matrix A is set.
+
+ M (input) INTEGER
+ On entry, M specifies the number of rows of A.
+
+ N (input) INTEGER
+ On entry, N specifies the number of columns of A.
+
+ ALPHA (input) COMPLEX
+ All the offdiagonal array elements are set to ALPHA.
+
+ BETA (input) COMPLEX
+ All the diagonal array elements are set to BETA.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, A(i,j) = ALPHA, 1 <= i <= m, 1 <= j <= n, i.ne.j;
+ A(i,i) = BETA , 1 <= i <= min(m,n)
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+
+/*
+ Set the diagonal to BETA and the strictly upper triangular
+ part of the array to ALPHA.
+*/
+
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j - 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = alpha->r, a[i__3].i = alpha->i;
+/* L10: */
+ }
+/* L20: */
+ }
+ i__1 = min(*n,*m);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = beta->r, a[i__2].i = beta->i;
+/* L30: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+
+/*
+ Set the diagonal to BETA and the strictly lower triangular
+ part of the array to ALPHA.
+*/
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = alpha->r, a[i__3].i = alpha->i;
+/* L40: */
+ }
+/* L50: */
+ }
+ i__1 = min(*n,*m);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = beta->r, a[i__2].i = beta->i;
+/* L60: */
+ }
+
+ } else {
+
+/*
+ Set the array to BETA on the diagonal and ALPHA on the
+ offdiagonal.
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = alpha->r, a[i__3].i = alpha->i;
+/* L70: */
+ }
+/* L80: */
+ }
+ i__1 = min(*m,*n);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = beta->r, a[i__2].i = beta->i;
+/* L90: */
+ }
+ }
+
+ return 0;
+
+/* End of CLASET */
+
+} /* claset_ */
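+
+/*
+    Illustrative sketch (not compiled): initializing a 3-by-3 complex array
+    to the identity matrix with CLASET (off-diagonal ALPHA = 0, diagonal
+    BETA = 1).
+*/
+#if 0
+static void example_claset(void)
+{
+    integer m = 3, n = 3, lda = 3;
+    complex alpha = {0.f, 0.f}, beta = {1.f, 0.f};
+    complex a[9];
+
+    claset_("Full", &m, &n, &alpha, &beta, a, &lda);
+    /* UPLO is neither 'U' nor 'L', so the whole array is set; a now holds
+       the 3 x 3 identity in column-major order. */
+}
+#endif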
+
+/* Subroutine */ int clasr_(char *side, char *pivot, char *direct, integer *m,
+ integer *n, real *c__, real *s, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+ complex q__1, q__2, q__3;
+
+ /* Local variables */
+ static integer i__, j, info;
+ static complex temp;
+ extern logical lsame_(char *, char *);
+ static real ctemp, stemp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLASR applies a sequence of real plane rotations to a complex matrix
+ A, from either the left or the right.
+
+ When SIDE = 'L', the transformation takes the form
+
+ A := P*A
+
+ and when SIDE = 'R', the transformation takes the form
+
+ A := A*P**T
+
+ where P is an orthogonal matrix consisting of a sequence of z plane
+ rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+ and P**T is the transpose of P.
+
+ When DIRECT = 'F' (Forward sequence), then
+
+ P = P(z-1) * ... * P(2) * P(1)
+
+ and when DIRECT = 'B' (Backward sequence), then
+
+ P = P(1) * P(2) * ... * P(z-1)
+
+ where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+
+ R(k) = ( c(k) s(k) )
+ = ( -s(k) c(k) ).
+
+ When PIVOT = 'V' (Variable pivot), the rotation is performed
+ for the plane (k,k+1), i.e., P(k) has the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears as a rank-2 modification to the identity matrix in
+ rows and columns k and k+1.
+
+ When PIVOT = 'T' (Top pivot), the rotation is performed for the
+ plane (1,k+1), so P(k) has the form
+
+ P(k) = ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears in rows and columns 1 and k+1.
+
+ Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+ performed for the plane (k,z), giving P(k) the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+
+ where R(k) appears in rows and columns k and z. The rotations are
+ performed without ever forming P(k) explicitly.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ Specifies whether the plane rotation matrix P is applied to
+ A on the left or the right.
+ = 'L': Left, compute A := P*A
+            = 'R':  Right, compute A := A*P**T
+
+ PIVOT (input) CHARACTER*1
+ Specifies the plane for which P(k) is a plane rotation
+ matrix.
+ = 'V': Variable pivot, the plane (k,k+1)
+ = 'T': Top pivot, the plane (1,k+1)
+ = 'B': Bottom pivot, the plane (k,z)
+
+ DIRECT (input) CHARACTER*1
+ Specifies whether P is a forward or backward sequence of
+ plane rotations.
+ = 'F': Forward, P = P(z-1)*...*P(2)*P(1)
+ = 'B': Backward, P = P(1)*P(2)*...*P(z-1)
+
+ M (input) INTEGER
+ The number of rows of the matrix A. If m <= 1, an immediate
+ return is effected.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. If n <= 1, an
+ immediate return is effected.
+
+ C (input) REAL array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The cosines c(k) of the plane rotations.
+
+ S (input) REAL array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The sines s(k) of the plane rotations. The 2-by-2 plane
+ rotation part of the matrix P(k), R(k), has the form
+ R(k) = ( c(k) s(k) )
+ ( -s(k) c(k) ).
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+            The M-by-N matrix A.  On exit, A is overwritten by P*A  if
+            SIDE = 'L' or by A*P**T if SIDE = 'R'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --c__;
+ --s;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! (lsame_(side, "L") || lsame_(side, "R"))) {
+ info = 1;
+ } else if (! (lsame_(pivot, "V") || lsame_(pivot,
+ "T") || lsame_(pivot, "B"))) {
+ info = 2;
+ } else if (! (lsame_(direct, "F") || lsame_(direct,
+ "B"))) {
+ info = 3;
+ } else if (*m < 0) {
+ info = 4;
+ } else if (*n < 0) {
+ info = 5;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("CLASR ", &info);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+ if (lsame_(side, "L")) {
+
+/* Form P * A */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + 1 + i__ * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = j + 1 + i__ * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__4 = j + i__ * a_dim1;
+ q__3.r = stemp * a[i__4].r, q__3.i = stemp * a[
+ i__4].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ i__3 = j + i__ * a_dim1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__4 = j + i__ * a_dim1;
+ q__3.r = ctemp * a[i__4].r, q__3.i = ctemp * a[
+ i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = j + 1 + i__ * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = j + 1 + i__ * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__3 = j + i__ * a_dim1;
+ q__3.r = stemp * a[i__3].r, q__3.i = stemp * a[
+ i__3].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + i__ * a_dim1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__3 = j + i__ * a_dim1;
+ q__3.r = ctemp * a[i__3].r, q__3.i = ctemp * a[
+ i__3].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L30: */
+ }
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = j + i__ * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__4 = i__ * a_dim1 + 1;
+ q__3.r = stemp * a[i__4].r, q__3.i = stemp * a[
+ i__4].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ i__3 = i__ * a_dim1 + 1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__4 = i__ * a_dim1 + 1;
+ q__3.r = ctemp * a[i__4].r, q__3.i = ctemp * a[
+ i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = j + i__ * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = j + i__ * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__3 = i__ * a_dim1 + 1;
+ q__3.r = stemp * a[i__3].r, q__3.i = stemp * a[
+ i__3].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = i__ * a_dim1 + 1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__3 = i__ * a_dim1 + 1;
+ q__3.r = ctemp * a[i__3].r, q__3.i = ctemp * a[
+ i__3].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = j + i__ * a_dim1;
+ i__4 = *m + i__ * a_dim1;
+ q__2.r = stemp * a[i__4].r, q__2.i = stemp * a[
+ i__4].i;
+ q__3.r = ctemp * temp.r, q__3.i = ctemp * temp.i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ i__3 = *m + i__ * a_dim1;
+ i__4 = *m + i__ * a_dim1;
+ q__2.r = ctemp * a[i__4].r, q__2.i = ctemp * a[
+ i__4].i;
+ q__3.r = stemp * temp.r, q__3.i = stemp * temp.i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L90: */
+ }
+ }
+/* L100: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = j + i__ * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = j + i__ * a_dim1;
+ i__3 = *m + i__ * a_dim1;
+ q__2.r = stemp * a[i__3].r, q__2.i = stemp * a[
+ i__3].i;
+ q__3.r = ctemp * temp.r, q__3.i = ctemp * temp.i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = *m + i__ * a_dim1;
+ i__3 = *m + i__ * a_dim1;
+ q__2.r = ctemp * a[i__3].r, q__2.i = ctemp * a[
+ i__3].i;
+ q__3.r = stemp * temp.r, q__3.i = stemp * temp.i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+ }
+ }
+ } else if (lsame_(side, "R")) {
+
+/* Form A * P' */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + (j + 1) * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = i__ + (j + 1) * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__4 = i__ + j * a_dim1;
+ q__3.r = stemp * a[i__4].r, q__3.i = stemp * a[
+ i__4].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ i__3 = i__ + j * a_dim1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__4 = i__ + j * a_dim1;
+ q__3.r = ctemp * a[i__4].r, q__3.i = ctemp * a[
+ i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L130: */
+ }
+ }
+/* L140: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + (j + 1) * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = i__ + (j + 1) * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__3 = i__ + j * a_dim1;
+ q__3.r = stemp * a[i__3].r, q__3.i = stemp * a[
+ i__3].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = i__ + j * a_dim1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__3 = i__ + j * a_dim1;
+ q__3.r = ctemp * a[i__3].r, q__3.i = ctemp * a[
+ i__3].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L150: */
+ }
+ }
+/* L160: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = i__ + j * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__4 = i__ + a_dim1;
+ q__3.r = stemp * a[i__4].r, q__3.i = stemp * a[
+ i__4].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ i__3 = i__ + a_dim1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__4 = i__ + a_dim1;
+ q__3.r = ctemp * a[i__4].r, q__3.i = ctemp * a[
+ i__4].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L170: */
+ }
+ }
+/* L180: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = i__ + j * a_dim1;
+ q__2.r = ctemp * temp.r, q__2.i = ctemp * temp.i;
+ i__3 = i__ + a_dim1;
+ q__3.r = stemp * a[i__3].r, q__3.i = stemp * a[
+ i__3].i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = i__ + a_dim1;
+ q__2.r = stemp * temp.r, q__2.i = stemp * temp.i;
+ i__3 = i__ + a_dim1;
+ q__3.r = ctemp * a[i__3].r, q__3.i = ctemp * a[
+ i__3].i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L190: */
+ }
+ }
+/* L200: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + *n * a_dim1;
+ q__2.r = stemp * a[i__4].r, q__2.i = stemp * a[
+ i__4].i;
+ q__3.r = ctemp * temp.r, q__3.i = ctemp * temp.i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+ i__3 = i__ + *n * a_dim1;
+ i__4 = i__ + *n * a_dim1;
+ q__2.r = ctemp * a[i__4].r, q__2.i = ctemp * a[
+ i__4].i;
+ q__3.r = stemp * temp.r, q__3.i = stemp * temp.i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+/* L210: */
+ }
+ }
+/* L220: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__ + *n * a_dim1;
+ q__2.r = stemp * a[i__3].r, q__2.i = stemp * a[
+ i__3].i;
+ q__3.r = ctemp * temp.r, q__3.i = ctemp * temp.i;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i +
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = i__ + *n * a_dim1;
+ i__3 = i__ + *n * a_dim1;
+ q__2.r = ctemp * a[i__3].r, q__2.i = ctemp * a[
+ i__3].i;
+ q__3.r = stemp * temp.r, q__3.i = stemp * temp.i;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i -
+ q__3.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CLASR */
+
+} /* clasr_ */
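+
+/*
+    Illustrative sketch (not compiled): applying two real plane rotations
+    from the left (variable pivot, forward order) to a 3-by-2 complex
+    matrix with CLASR.
+*/
+#if 0
+static void example_clasr(void)
+{
+    integer m = 3, n = 2, lda = 3;
+    real c[2] = {1.f, 0.f};   /* cosines c(k), k = 1..m-1 */
+    real s[2] = {0.f, 1.f};   /* sines   s(k), k = 1..m-1 */
+    complex a[6] = {{1.f, 0.f}, {2.f, 0.f}, {3.f, 0.f},
+                    {4.f, 0.f}, {5.f, 0.f}, {6.f, 0.f}};
+
+    clasr_("Left", "Variable", "Forward", &m, &n, c, s, a, &lda);
+    /* a is overwritten by P*A with P = P(2)*P(1). */
+}
+#endif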
+
+/* Subroutine */ int classq_(integer *n, complex *x, integer *incx, real *
+ scale, real *sumsq)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ real r__1;
+
+ /* Local variables */
+ static integer ix;
+ static real temp1;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLASSQ returns the values scl and ssq such that
+
+ ( scl**2 )*ssq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+
+ where x( i ) = abs( X( 1 + ( i - 1 )*INCX ) ). The value of sumsq is
+ assumed to be at least unity and the value of ssq will then satisfy
+
+ 1.0 .le. ssq .le. ( sumsq + 2*n ).
+
+ scale is assumed to be non-negative and scl returns the value
+
+ scl = max( scale, abs( real( x( i ) ) ), abs( aimag( x( i ) ) ) ),
+ i
+
+ scale and sumsq must be supplied in SCALE and SUMSQ respectively.
+ SCALE and SUMSQ are overwritten by scl and ssq respectively.
+
+ The routine makes only one pass through the vector X.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of elements to be used from the vector X.
+
+ X (input) COMPLEX array, dimension (N)
+ The vector x as described above.
+ x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+
+ INCX (input) INTEGER
+ The increment between successive values of the vector X.
+ INCX > 0.
+
+ SCALE (input/output) REAL
+ On entry, the value scale in the equation above.
+ On exit, SCALE is overwritten with the value scl .
+
+ SUMSQ (input/output) REAL
+ On entry, the value sumsq in the equation above.
+ On exit, SUMSQ is overwritten with the value ssq .
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n > 0) {
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ i__3 = ix;
+ if (x[i__3].r != 0.f) {
+ i__3 = ix;
+ temp1 = (r__1 = x[i__3].r, dabs(r__1));
+ if (*scale < temp1) {
+/* Computing 2nd power */
+ r__1 = *scale / temp1;
+ *sumsq = *sumsq * (r__1 * r__1) + 1;
+ *scale = temp1;
+ } else {
+/* Computing 2nd power */
+ r__1 = temp1 / *scale;
+ *sumsq += r__1 * r__1;
+ }
+ }
+ if (r_imag(&x[ix]) != 0.f) {
+ temp1 = (r__1 = r_imag(&x[ix]), dabs(r__1));
+ if (*scale < temp1) {
+/* Computing 2nd power */
+ r__1 = *scale / temp1;
+ *sumsq = *sumsq * (r__1 * r__1) + 1;
+ *scale = temp1;
+ } else {
+/* Computing 2nd power */
+ r__1 = temp1 / *scale;
+ *sumsq += r__1 * r__1;
+ }
+ }
+/* L10: */
+ }
+ }
+
+ return 0;
+
+/* End of CLASSQ */
+
+} /* classq_ */
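+
+/*
+    Illustrative sketch (not compiled): accumulating the 2-norm of a complex
+    vector with CLASSQ in scaled (scl, ssq) form and recovering the norm as
+    scl*sqrt(ssq), the usual overflow-safe idiom.
+*/
+#if 0
+static void example_classq(void)
+{
+    integer n = 3, incx = 1;
+    complex x[3] = {{3.f, 0.f}, {0.f, 4.f}, {1.f, 1.f}};
+    real scale = 0.f, sumsq = 1.f;   /* standard starting values */
+
+    classq_(&n, x, &incx, &scale, &sumsq);
+    /* ||x||_2 == scale * sqrt(sumsq), computed in one pass over x. */
+}
+#endif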
+
+/* Subroutine */ int claswp_(integer *n, complex *a, integer *lda, integer *
+ k1, integer *k2, integer *ipiv, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+
+ /* Local variables */
+ static integer i__, j, k, i1, i2, n32, ip, ix, ix0, inc;
+ static complex temp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLASWP performs a series of row interchanges on the matrix A.
+ One row interchange is initiated for each of rows K1 through K2 of A.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the matrix of column dimension N to which the row
+ interchanges will be applied.
+ On exit, the permuted matrix.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+
+ K1 (input) INTEGER
+ The first element of IPIV for which a row interchange will
+ be done.
+
+ K2 (input) INTEGER
+ The last element of IPIV for which a row interchange will
+ be done.
+
+ IPIV (input) INTEGER array, dimension (K2*abs(INCX))
+ The vector of pivot indices. Only the elements in positions
+ K1 through K2 of IPIV are accessed.
+ IPIV(K) = L implies rows K and L are to be interchanged.
+
+ INCX (input) INTEGER
+            The increment between successive values of IPIV.  If INCX
+            is negative, the pivots are applied in reverse order.
+
+ Further Details
+ ===============
+
+ Modified by
+ R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA
+
+ =====================================================================
+
+
+ Interchange row I with row IPIV(I) for each of rows K1 through K2.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ if (*incx > 0) {
+ ix0 = *k1;
+ i1 = *k1;
+ i2 = *k2;
+ inc = 1;
+ } else if (*incx < 0) {
+ ix0 = (1 - *k2) * *incx + 1;
+ i1 = *k2;
+ i2 = *k1;
+ inc = -1;
+ } else {
+ return 0;
+ }
+
+ n32 = *n / 32 << 5;
+ if (n32 != 0) {
+ i__1 = n32;
+ for (j = 1; j <= i__1; j += 32) {
+ ix = ix0;
+ i__2 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__3)
+ {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__4 = j + 31;
+ for (k = j; k <= i__4; ++k) {
+ i__5 = i__ + k * a_dim1;
+ temp.r = a[i__5].r, temp.i = a[i__5].i;
+ i__5 = i__ + k * a_dim1;
+ i__6 = ip + k * a_dim1;
+ a[i__5].r = a[i__6].r, a[i__5].i = a[i__6].i;
+ i__5 = ip + k * a_dim1;
+ a[i__5].r = temp.r, a[i__5].i = temp.i;
+/* L10: */
+ }
+ }
+ ix += *incx;
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ if (n32 != *n) {
+ ++n32;
+ ix = ix0;
+ i__1 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__3) {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__2 = *n;
+ for (k = n32; k <= i__2; ++k) {
+ i__4 = i__ + k * a_dim1;
+ temp.r = a[i__4].r, temp.i = a[i__4].i;
+ i__4 = i__ + k * a_dim1;
+ i__5 = ip + k * a_dim1;
+ a[i__4].r = a[i__5].r, a[i__4].i = a[i__5].i;
+ i__4 = ip + k * a_dim1;
+ a[i__4].r = temp.r, a[i__4].i = temp.i;
+/* L40: */
+ }
+ }
+ ix += *incx;
+/* L50: */
+ }
+ }
+
+ return 0;
+
+/* End of CLASWP */
+
+} /* claswp_ */
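+
+/*
+    Illustrative sketch (not compiled): applying an LU-style pivot vector to
+    a 3-by-2 complex matrix with CLASWP (rows 1..3, unit IPIV stride).
+*/
+#if 0
+static void example_claswp(void)
+{
+    integer n = 2, lda = 3, k1 = 1, k2 = 3, incx = 1;
+    integer ipiv[3] = {2, 3, 3};   /* swap rows 1<->2, then 2<->3 */
+    complex a[6] = {{1.f, 0.f}, {2.f, 0.f}, {3.f, 0.f},
+                    {4.f, 0.f}, {5.f, 0.f}, {6.f, 0.f}};
+
+    claswp_(&n, a, &lda, &k1, &k2, ipiv, &incx);
+    /* The row interchanges recorded in ipiv have been applied to a. */
+}
+#endif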
+
+/* Subroutine */ int clatrd_(char *uplo, integer *n, integer *nb, complex *a,
+ integer *lda, real *e, complex *tau, complex *w, integer *ldw)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3;
+ real r__1;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__, iw;
+ static complex alpha;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *);
+ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer
+ *, complex *, integer *);
+ extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex *
+ , complex *, integer *, complex *, integer *, complex *, complex *
+ , integer *), chemv_(char *, integer *, complex *,
+ complex *, integer *, complex *, integer *, complex *, complex *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int caxpy_(integer *, complex *, complex *,
+ integer *, complex *, integer *), clarfg_(integer *, complex *,
+ complex *, integer *, complex *), clacgv_(integer *, complex *,
+ integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLATRD reduces NB rows and columns of a complex Hermitian matrix A to
+ Hermitian tridiagonal form by a unitary similarity
+ transformation Q' * A * Q, and returns the matrices V and W which are
+ needed to apply the transformation to the unreduced part of A.
+
+ If UPLO = 'U', CLATRD reduces the last NB rows and columns of a
+ matrix, of which the upper triangle is supplied;
+ if UPLO = 'L', CLATRD reduces the first NB rows and columns of a
+ matrix, of which the lower triangle is supplied.
+
+ This is an auxiliary routine called by CHETRD.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ Hermitian matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ NB (input) INTEGER
+ The number of rows and columns to be reduced.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit:
+ if UPLO = 'U', the last NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+ the diagonal elements of A; the elements above the diagonal
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors;
+ if UPLO = 'L', the first NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+ the diagonal elements of A; the elements below the diagonal
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ E (output) REAL array, dimension (N-1)
+ If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
+ elements of the last NB columns of the reduced matrix;
+ if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
+ the first NB columns of the reduced matrix.
+
+ TAU (output) COMPLEX array, dimension (N-1)
+ The scalar factors of the elementary reflectors, stored in
+ TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
+ See Further Details.
+
+ W (output) COMPLEX array, dimension (LDW,NB)
+ The n-by-nb matrix W required to update the unreduced part
+ of A.
+
+ LDW (input) INTEGER
+ The leading dimension of the array W. LDW >= max(1,N).
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n) H(n-1) . . . H(n-nb+1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
+ and tau in TAU(i-1).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
+ and tau in TAU(i).
+
+ The elements of the vectors v together form the n-by-nb matrix V
+ which is needed, with W, to apply the transformation to the unreduced
+ part of the matrix, using a Hermitian rank-2k update of the form:
+ A := A - V*W' - W*V'.
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5 and nb = 2:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( a a a v4 v5 ) ( d )
+ ( a a v4 v5 ) ( 1 d )
+ ( a 1 v5 ) ( v1 1 a )
+ ( d 1 ) ( v1 v2 a a )
+ ( d ) ( v1 v2 a a a )
+
+ where d denotes a diagonal element of the reduced matrix, a denotes
+ an element of the original matrix that is unchanged, and vi denotes
+ an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --e;
+ --tau;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(uplo, "U")) {
+
+/* Reduce last NB columns of upper triangle */
+
+ i__1 = *n - *nb + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ iw = i__ - *n + *nb;
+ if (i__ < *n) {
+
+/* Update A(1:i,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ i__2 = *n - i__;
+ clacgv_(&i__2, &w[i__ + (iw + 1) * w_dim1], ldw);
+ i__2 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__, &i__2, &q__1, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &w[i__ + (iw + 1) * w_dim1], ldw, &
+ c_b57, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ clacgv_(&i__2, &w[i__ + (iw + 1) * w_dim1], ldw);
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__, &i__2, &q__1, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b57, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ }
+ if (i__ > 1) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(1:i-2,i)
+*/
+
+ i__2 = i__ - 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = i__ - 1;
+ clarfg_(&i__2, &alpha, &a[i__ * a_dim1 + 1], &c__1, &tau[i__
+ - 1]);
+ i__2 = i__ - 1;
+ e[i__2] = alpha.r;
+ i__2 = i__ - 1 + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute W(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ chemv_("Upper", &i__2, &c_b57, &a[a_offset], lda, &a[i__ *
+ a_dim1 + 1], &c__1, &c_b56, &w[iw * w_dim1 + 1], &
+ c__1);
+ if (i__ < *n) {
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &w[(
+ iw + 1) * w_dim1 + 1], ldw, &a[i__ * a_dim1 + 1],
+ &c__1, &c_b56, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b57, &w[iw * w_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[(
+ i__ + 1) * a_dim1 + 1], lda, &a[i__ * a_dim1 + 1],
+ &c__1, &c_b56, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b57, &w[iw * w_dim1 + 1], &c__1);
+ }
+ i__2 = i__ - 1;
+ cscal_(&i__2, &tau[i__ - 1], &w[iw * w_dim1 + 1], &c__1);
+ q__3.r = -.5f, q__3.i = -0.f;
+ i__2 = i__ - 1;
+ q__2.r = q__3.r * tau[i__2].r - q__3.i * tau[i__2].i, q__2.i =
+ q__3.r * tau[i__2].i + q__3.i * tau[i__2].r;
+ i__3 = i__ - 1;
+ cdotc_(&q__4, &i__3, &w[iw * w_dim1 + 1], &c__1, &a[i__ *
+ a_dim1 + 1], &c__1);
+ q__1.r = q__2.r * q__4.r - q__2.i * q__4.i, q__1.i = q__2.r *
+ q__4.i + q__2.i * q__4.r;
+ alpha.r = q__1.r, alpha.i = q__1.i;
+ i__2 = i__ - 1;
+ caxpy_(&i__2, &alpha, &a[i__ * a_dim1 + 1], &c__1, &w[iw *
+ w_dim1 + 1], &c__1);
+ }
+
+/* L10: */
+ }
+ } else {
+
+/* Reduce first NB columns of lower triangle */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:n,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &w[i__ + w_dim1], ldw);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[i__ + a_dim1], lda,
+ &w[i__ + w_dim1], ldw, &c_b57, &a[i__ + i__ * a_dim1], &
+ c__1);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &w[i__ + w_dim1], ldw);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &w[i__ + w_dim1], ldw,
+ &a[i__ + a_dim1], lda, &c_b57, &a[i__ + i__ * a_dim1], &
+ c__1);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ r__1 = a[i__3].r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:n,i)
+*/
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ clarfg_(&i__2, &alpha, &a[min(i__3,*n) + i__ * a_dim1], &c__1,
+ &tau[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+
+/* Compute W(i+1:n,i) */
+
+ i__2 = *n - i__;
+ chemv_("Lower", &i__2, &c_b57, &a[i__ + 1 + (i__ + 1) *
+ a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &w[i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &w[i__ +
+ 1 + w_dim1], ldw, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &w[i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[i__ + 1 +
+ a_dim1], lda, &w[i__ * w_dim1 + 1], &c__1, &c_b57, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &w[i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &w[i__ + 1 +
+ w_dim1], ldw, &w[i__ * w_dim1 + 1], &c__1, &c_b57, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ cscal_(&i__2, &tau[i__], &w[i__ + 1 + i__ * w_dim1], &c__1);
+ q__3.r = -.5f, q__3.i = -0.f;
+ i__2 = i__;
+ q__2.r = q__3.r * tau[i__2].r - q__3.i * tau[i__2].i, q__2.i =
+ q__3.r * tau[i__2].i + q__3.i * tau[i__2].r;
+ i__3 = *n - i__;
+ cdotc_(&q__4, &i__3, &w[i__ + 1 + i__ * w_dim1], &c__1, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+ q__1.r = q__2.r * q__4.r - q__2.i * q__4.i, q__1.i = q__2.r *
+ q__4.i + q__2.i * q__4.r;
+ alpha.r = q__1.r, alpha.i = q__1.i;
+ i__2 = *n - i__;
+ caxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ }
+
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of CLATRD */
+
+} /* clatrd_ */
+
+/* Subroutine */ int clatrs_(char *uplo, char *trans, char *diag, char *
+ normin, integer *n, complex *a, integer *lda, complex *x, real *scale,
+ real *cnorm, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1, r__2, r__3, r__4;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__, j;
+ static real xj, rec, tjj;
+ static integer jinc;
+ static real xbnd;
+ static integer imax;
+ static real tmax;
+ static complex tjjs;
+ static real xmax, grow;
+ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer
+ *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static real tscal;
+ static complex uscal;
+ static integer jlast;
+ extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer
+ *, complex *, integer *);
+ static complex csumj;
+ extern /* Subroutine */ int caxpy_(integer *, complex *, complex *,
+ integer *, complex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *,
+ complex *, integer *, complex *, integer *), slabad_(real *, real *);
+ extern integer icamax_(integer *, complex *, integer *);
+ extern /* Complex */ VOID cladiv_(complex *, complex *, complex *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
+ *), xerbla_(char *, integer *);
+ static real bignum;
+ extern integer isamax_(integer *, real *, integer *);
+ extern doublereal scasum_(integer *, complex *, integer *);
+ static logical notran;
+ static integer jfirst;
+ static real smlnum;
+ static logical nounit;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLATRS solves one of the triangular systems
+
+ A * x = s*b, A**T * x = s*b, or A**H * x = s*b,
+
+ with scaling to prevent overflow. Here A is an upper or lower
+ triangular matrix, A**T denotes the transpose of A, A**H denotes the
+ conjugate transpose of A, x and b are n-element vectors, and s is a
+ scaling factor, usually less than or equal to 1, chosen so that the
+ components of x will be less than the overflow threshold. If the
+ unscaled problem will not cause overflow, the Level 2 BLAS routine
+ CTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+ then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the matrix A is upper or lower triangular.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ TRANS (input) CHARACTER*1
+ Specifies the operation applied to A.
+ = 'N': Solve A * x = s*b (No transpose)
+ = 'T': Solve A**T * x = s*b (Transpose)
+ = 'C': Solve A**H * x = s*b (Conjugate transpose)
+
+ DIAG (input) CHARACTER*1
+ Specifies whether or not the matrix A is unit triangular.
+ = 'N': Non-unit triangular
+ = 'U': Unit triangular
+
+ NORMIN (input) CHARACTER*1
+ Specifies whether CNORM has been set or not.
+ = 'Y': CNORM contains the column norms on entry
+ = 'N': CNORM is not set on entry. On exit, the norms will
+ be computed and stored in CNORM.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The triangular matrix A. If UPLO = 'U', the leading n by n
+ upper triangular part of the array A contains the upper
+ triangular matrix, and the strictly lower triangular part of
+ A is not referenced. If UPLO = 'L', the leading n by n lower
+ triangular part of the array A contains the lower triangular
+ matrix, and the strictly upper triangular part of A is not
+ referenced. If DIAG = 'U', the diagonal elements of A are
+ also not referenced and are assumed to be 1.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max (1,N).
+
+ X (input/output) COMPLEX array, dimension (N)
+ On entry, the right hand side b of the triangular system.
+ On exit, X is overwritten by the solution vector x.
+
+ SCALE (output) REAL
+ The scaling factor s for the triangular system
+ A * x = s*b, A**T * x = s*b, or A**H * x = s*b.
+ If SCALE = 0, the matrix A is singular or badly scaled, and
+ the vector x is an exact or approximate solution to A*x = 0.
+
+ CNORM (input or output) REAL array, dimension (N)
+
+ If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
+ contains the norm of the off-diagonal part of the j-th column
+ of A. If TRANS = 'N', CNORM(j) must be greater than or equal
+ to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
+ must be greater than or equal to the 1-norm.
+
+ If NORMIN = 'N', CNORM is an output argument and CNORM(j)
+ returns the 1-norm of the off-diagonal part of the j-th column
+ of A.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ Further Details
+ ======= =======
+
+ A rough bound on x is computed; if that is less than overflow, CTRSV
+ is called, otherwise, specific code is used which checks for possible
+ overflow or divide-by-zero at every operation.
+
+ A columnwise scheme is used for solving A*x = b. The basic algorithm
+ if A is lower triangular is
+
+ x[1:n] := b[1:n]
+ for j = 1, ..., n
+ x(j) := x(j) / A(j,j)
+ x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j]
+ end
+
+ Define bounds on the components of x after j iterations of the loop:
+ M(j) = bound on x[1:j]
+ G(j) = bound on x[j+1:n]
+ Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}.
+
+ Then for iteration j+1 we have
+ M(j+1) <= G(j) / | A(j+1,j+1) |
+ G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] |
+ <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | )
+
+ where CNORM(j+1) is greater than or equal to the infinity-norm of
+ column j+1 of A, not counting the diagonal. Hence
+
+ G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | )
+ 1<=i<=j
+ and
+
+ |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| )
+ 1<=i< j
+
+ Since |x(j)| <= M(j), we use the Level 2 BLAS routine CTRSV if the
+ reciprocal of the largest M(j), j=1,..,n, is larger than
+ max(underflow, 1/overflow).
+
+ The bound on x(j) is also used to determine when a step in the
+ columnwise method can be performed without fear of overflow. If
+ the computed bound is greater than a large constant, x is scaled to
+ prevent overflow, but if the bound overflows, x is set to 0, x(j) to
+ 1, and scale to 0, and a non-trivial solution to A*x = 0 is found.
+
+ Similarly, a row-wise scheme is used to solve A**T *x = b or
+ A**H *x = b. The basic algorithm for A upper triangular is
+
+ for j = 1, ..., n
+ x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j)
+ end
+
+ We simultaneously compute two bounds
+ G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j
+ M(j) = bound on x(i), 1<=i<=j
+
+ The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we
+ add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1.
+ Then the bound on x(j) is
+
+ M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) |
+
+ <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| )
+ 1<=i<=j
+
+ and we can safely call CTRSV if 1/M(n) and 1/G(n) are both greater
+ than max(underflow, 1/overflow).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --cnorm;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ notran = lsame_(trans, "N");
+ nounit = lsame_(diag, "N");
+
+/* Test the input parameters. */
+
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T") && !
+ lsame_(trans, "C")) {
+ *info = -2;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -3;
+ } else if (! lsame_(normin, "Y") && ! lsame_(normin,
+ "N")) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*lda < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLATRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine machine dependent parameters to control overflow. */
+
+ smlnum = slamch_("Safe minimum");
+ bignum = 1.f / smlnum;
+ slabad_(&smlnum, &bignum);
+ smlnum /= slamch_("Precision");
+ bignum = 1.f / smlnum;
+ *scale = 1.f;
+
+ if (lsame_(normin, "N")) {
+
+/* Compute the 1-norm of each column, not including the diagonal. */
+
+ if (upper) {
+
+/* A is upper triangular. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ cnorm[j] = scasum_(&i__2, &a[j * a_dim1 + 1], &c__1);
+/* L10: */
+ }
+ } else {
+
+/* A is lower triangular. */
+
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n - j;
+ cnorm[j] = scasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1);
+/* L20: */
+ }
+ cnorm[*n] = 0.f;
+ }
+ }
+
+/*
+ Scale the column norms by TSCAL if the maximum element in CNORM is
+ greater than BIGNUM/2.
+*/
+
+ imax = isamax_(n, &cnorm[1], &c__1);
+ tmax = cnorm[imax];
+ if (tmax <= bignum * .5f) {
+ tscal = 1.f;
+ } else {
+ tscal = .5f / (smlnum * tmax);
+ sscal_(n, &tscal, &cnorm[1], &c__1);
+ }
+
+/*
+ Compute a bound on the computed solution vector to see if the
+ Level 2 BLAS routine CTRSV can be used.
+*/
+
+ xmax = 0.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = j;
+ r__3 = xmax, r__4 = (r__1 = x[i__2].r / 2.f, dabs(r__1)) + (r__2 =
+ r_imag(&x[j]) / 2.f, dabs(r__2));
+ xmax = dmax(r__3,r__4);
+/* L30: */
+ }
+ xbnd = xmax;
+
+ if (notran) {
+
+/* Compute the growth in A * x = b. */
+
+ if (upper) {
+ jfirst = *n;
+ jlast = 1;
+ jinc = -1;
+ } else {
+ jfirst = 1;
+ jlast = *n;
+ jinc = 1;
+ }
+
+ if (tscal != 1.f) {
+ grow = 0.f;
+ goto L60;
+ }
+
+ if (nounit) {
+
+/*
+ A is non-unit triangular.
+
+ Compute GROW = 1/G(j) and XBND = 1/M(j).
+ Initially, G(0) = max{x(i), i=1,...,n}.
+*/
+
+ grow = .5f / dmax(xbnd,smlnum);
+ xbnd = grow;
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L60;
+ }
+
+ i__3 = j + j * a_dim1;
+ tjjs.r = a[i__3].r, tjjs.i = a[i__3].i;
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+
+ if (tjj >= smlnum) {
+
+/*
+ M(j) = G(j-1) / abs(A(j,j))
+
+ Computing MIN
+*/
+ r__1 = xbnd, r__2 = dmin(1.f,tjj) * grow;
+ xbnd = dmin(r__1,r__2);
+ } else {
+
+/* M(j) could overflow, set XBND to 0. */
+
+ xbnd = 0.f;
+ }
+
+ if (tjj + cnorm[j] >= smlnum) {
+
+/* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */
+
+ grow *= tjj / (tjj + cnorm[j]);
+ } else {
+
+/* G(j) could overflow, set GROW to 0. */
+
+ grow = 0.f;
+ }
+/* L40: */
+ }
+ grow = xbnd;
+ } else {
+
+/*
+ A is unit triangular.
+
+ Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}.
+
+ Computing MIN
+*/
+ r__1 = 1.f, r__2 = .5f / dmax(xbnd,smlnum);
+ grow = dmin(r__1,r__2);
+ i__2 = jlast;
+ i__1 = jinc;
+ for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L60;
+ }
+
+/* G(j) = G(j-1)*( 1 + CNORM(j) ) */
+
+ grow *= 1.f / (cnorm[j] + 1.f);
+/* L50: */
+ }
+ }
+L60:
+
+ ;
+ } else {
+
+/* Compute the growth in A**T * x = b or A**H * x = b. */
+
+ if (upper) {
+ jfirst = 1;
+ jlast = *n;
+ jinc = 1;
+ } else {
+ jfirst = *n;
+ jlast = 1;
+ jinc = -1;
+ }
+
+ if (tscal != 1.f) {
+ grow = 0.f;
+ goto L90;
+ }
+
+ if (nounit) {
+
+/*
+ A is non-unit triangular.
+
+ Compute GROW = 1/G(j) and XBND = 1/M(j).
+ Initially, M(0) = max{x(i), i=1,...,n}.
+*/
+
+ grow = .5f / dmax(xbnd,smlnum);
+ xbnd = grow;
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L90;
+ }
+
+/* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */
+
+ xj = cnorm[j] + 1.f;
+/* Computing MIN */
+ r__1 = grow, r__2 = xbnd / xj;
+ grow = dmin(r__1,r__2);
+
+ i__3 = j + j * a_dim1;
+ tjjs.r = a[i__3].r, tjjs.i = a[i__3].i;
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+
+ if (tjj >= smlnum) {
+
+/* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */
+
+ if (xj > tjj) {
+ xbnd *= tjj / xj;
+ }
+ } else {
+
+/* M(j) could overflow, set XBND to 0. */
+
+ xbnd = 0.f;
+ }
+/* L70: */
+ }
+ grow = dmin(grow,xbnd);
+ } else {
+
+/*
+ A is unit triangular.
+
+ Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}.
+
+ Computing MIN
+*/
+ r__1 = 1.f, r__2 = .5f / dmax(xbnd,smlnum);
+ grow = dmin(r__1,r__2);
+ i__2 = jlast;
+ i__1 = jinc;
+ for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L90;
+ }
+
+/* G(j) = ( 1 + CNORM(j) )*G(j-1) */
+
+ xj = cnorm[j] + 1.f;
+ grow /= xj;
+/* L80: */
+ }
+ }
+L90:
+ ;
+ }
+
+ if (grow * tscal > smlnum) {
+
+/*
+ Use the Level 2 BLAS solve if the reciprocal of the bound on
+ elements of X is not too small.
+*/
+
+ ctrsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1);
+ } else {
+
+/* Use a Level 1 BLAS solve, scaling intermediate results. */
+
+ if (xmax > bignum * .5f) {
+
+/*
+ Scale X so that its components are less than or equal to
+ BIGNUM in absolute value.
+*/
+
+ *scale = bignum * .5f / xmax;
+ csscal_(n, scale, &x[1], &c__1);
+ xmax = bignum;
+ } else {
+ xmax *= 2.f;
+ }
+
+ if (notran) {
+
+/* Solve A * x = b */
+
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */
+
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]),
+ dabs(r__2));
+ if (nounit) {
+ i__3 = j + j * a_dim1;
+ q__1.r = tscal * a[i__3].r, q__1.i = tscal * a[i__3].i;
+ tjjs.r = q__1.r, tjjs.i = q__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.f;
+ if (tscal == 1.f) {
+ goto L105;
+ }
+ }
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+ if (tjj > smlnum) {
+
+/* abs(A(j,j)) > SMLNUM: */
+
+ if (tjj < 1.f) {
+ if (xj > tjj * bignum) {
+
+/* Scale x by 1/b(j). */
+
+ rec = 1.f / xj;
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+ i__3 = j;
+ cladiv_(&q__1, &x[j], &tjjs);
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]
+ ), dabs(r__2));
+ } else if (tjj > 0.f) {
+
+/* 0 < abs(A(j,j)) <= SMLNUM: */
+
+ if (xj > tjj * bignum) {
+
+/*
+ Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM
+ to avoid overflow when dividing by A(j,j).
+*/
+
+ rec = tjj * bignum / xj;
+ if (cnorm[j] > 1.f) {
+
+/*
+ Scale by 1/CNORM(j) to avoid overflow when
+ multiplying x(j) times column j.
+*/
+
+ rec /= cnorm[j];
+ }
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ i__3 = j;
+ cladiv_(&q__1, &x[j], &tjjs);
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]
+ ), dabs(r__2));
+ } else {
+
+/*
+ A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and
+ scale = 0, and compute a solution to A*x = 0.
+*/
+
+ i__3 = *n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ x[i__4].r = 0.f, x[i__4].i = 0.f;
+/* L100: */
+ }
+ i__3 = j;
+ x[i__3].r = 1.f, x[i__3].i = 0.f;
+ xj = 1.f;
+ *scale = 0.f;
+ xmax = 0.f;
+ }
+L105:
+
+/*
+ Scale x if necessary to avoid overflow when adding a
+ multiple of column j of A.
+*/
+
+ if (xj > 1.f) {
+ rec = 1.f / xj;
+ if (cnorm[j] > (bignum - xmax) * rec) {
+
+/* Scale x by 1/(2*abs(x(j))). */
+
+ rec *= .5f;
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ }
+ } else if (xj * cnorm[j] > bignum - xmax) {
+
+/* Scale x by 1/2. */
+
+ csscal_(n, &c_b2435, &x[1], &c__1);
+ *scale *= .5f;
+ }
+
+ if (upper) {
+ if (j > 1) {
+
+/*
+ Compute the update
+ x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j)
+*/
+
+ i__3 = j - 1;
+ i__4 = j;
+ q__2.r = -x[i__4].r, q__2.i = -x[i__4].i;
+ q__1.r = tscal * q__2.r, q__1.i = tscal * q__2.i;
+ caxpy_(&i__3, &q__1, &a[j * a_dim1 + 1], &c__1, &x[1],
+ &c__1);
+ i__3 = j - 1;
+ i__ = icamax_(&i__3, &x[1], &c__1);
+ i__3 = i__;
+ xmax = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 =
+ r_imag(&x[i__]), dabs(r__2));
+ }
+ } else {
+ if (j < *n) {
+
+/*
+ Compute the update
+ x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j)
+*/
+
+ i__3 = *n - j;
+ i__4 = j;
+ q__2.r = -x[i__4].r, q__2.i = -x[i__4].i;
+ q__1.r = tscal * q__2.r, q__1.i = tscal * q__2.i;
+ caxpy_(&i__3, &q__1, &a[j + 1 + j * a_dim1], &c__1, &
+ x[j + 1], &c__1);
+ i__3 = *n - j;
+ i__ = j + icamax_(&i__3, &x[j + 1], &c__1);
+ i__3 = i__;
+ xmax = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 =
+ r_imag(&x[i__]), dabs(r__2));
+ }
+ }
+/* L110: */
+ }
+
+ } else if (lsame_(trans, "T")) {
+
+/* Solve A**T * x = b */
+
+ i__2 = jlast;
+ i__1 = jinc;
+ for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/*
+ Compute x(j) = b(j) - sum A(k,j)*x(k).
+ k<>j
+*/
+
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]),
+ dabs(r__2));
+ uscal.r = tscal, uscal.i = 0.f;
+ rec = 1.f / dmax(xmax,1.f);
+ if (cnorm[j] > (bignum - xj) * rec) {
+
+/* If x(j) could overflow, scale x by 1/(2*XMAX). */
+
+ rec *= .5f;
+ if (nounit) {
+ i__3 = j + j * a_dim1;
+ q__1.r = tscal * a[i__3].r, q__1.i = tscal * a[i__3]
+ .i;
+ tjjs.r = q__1.r, tjjs.i = q__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.f;
+ }
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+ if (tjj > 1.f) {
+
+/*
+ Divide by A(j,j) when scaling x if A(j,j) > 1.
+
+ Computing MIN
+*/
+ r__1 = 1.f, r__2 = rec * tjj;
+ rec = dmin(r__1,r__2);
+ cladiv_(&q__1, &uscal, &tjjs);
+ uscal.r = q__1.r, uscal.i = q__1.i;
+ }
+ if (rec < 1.f) {
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+
+ csumj.r = 0.f, csumj.i = 0.f;
+ if (uscal.r == 1.f && uscal.i == 0.f) {
+
+/*
+ If the scaling needed for A in the dot product is 1,
+ call CDOTU to perform the dot product.
+*/
+
+ if (upper) {
+ i__3 = j - 1;
+ cdotu_(&q__1, &i__3, &a[j * a_dim1 + 1], &c__1, &x[1],
+ &c__1);
+ csumj.r = q__1.r, csumj.i = q__1.i;
+ } else if (j < *n) {
+ i__3 = *n - j;
+ cdotu_(&q__1, &i__3, &a[j + 1 + j * a_dim1], &c__1, &
+ x[j + 1], &c__1);
+ csumj.r = q__1.r, csumj.i = q__1.i;
+ }
+ } else {
+
+/* Otherwise, use in-line code for the dot product. */
+
+ if (upper) {
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * a_dim1;
+ q__3.r = a[i__4].r * uscal.r - a[i__4].i *
+ uscal.i, q__3.i = a[i__4].r * uscal.i + a[
+ i__4].i * uscal.r;
+ i__5 = i__;
+ q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i,
+ q__2.i = q__3.r * x[i__5].i + q__3.i * x[
+ i__5].r;
+ q__1.r = csumj.r + q__2.r, q__1.i = csumj.i +
+ q__2.i;
+ csumj.r = q__1.r, csumj.i = q__1.i;
+/* L120: */
+ }
+ } else if (j < *n) {
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * a_dim1;
+ q__3.r = a[i__4].r * uscal.r - a[i__4].i *
+ uscal.i, q__3.i = a[i__4].r * uscal.i + a[
+ i__4].i * uscal.r;
+ i__5 = i__;
+ q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i,
+ q__2.i = q__3.r * x[i__5].i + q__3.i * x[
+ i__5].r;
+ q__1.r = csumj.r + q__2.r, q__1.i = csumj.i +
+ q__2.i;
+ csumj.r = q__1.r, csumj.i = q__1.i;
+/* L130: */
+ }
+ }
+ }
+
+ q__1.r = tscal, q__1.i = 0.f;
+ if (uscal.r == q__1.r && uscal.i == q__1.i) {
+
+/*
+ Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j)
+ was not used to scale the dot product.
+*/
+
+ i__3 = j;
+ i__4 = j;
+ q__1.r = x[i__4].r - csumj.r, q__1.i = x[i__4].i -
+ csumj.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]
+ ), dabs(r__2));
+ if (nounit) {
+ i__3 = j + j * a_dim1;
+ q__1.r = tscal * a[i__3].r, q__1.i = tscal * a[i__3]
+ .i;
+ tjjs.r = q__1.r, tjjs.i = q__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.f;
+ if (tscal == 1.f) {
+ goto L145;
+ }
+ }
+
+/* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
+
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+ if (tjj > smlnum) {
+
+/* abs(A(j,j)) > SMLNUM: */
+
+ if (tjj < 1.f) {
+ if (xj > tjj * bignum) {
+
+/* Scale X by 1/abs(x(j)). */
+
+ rec = 1.f / xj;
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+ i__3 = j;
+ cladiv_(&q__1, &x[j], &tjjs);
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ } else if (tjj > 0.f) {
+
+/* 0 < abs(A(j,j)) <= SMLNUM: */
+
+ if (xj > tjj * bignum) {
+
+/* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
+
+ rec = tjj * bignum / xj;
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ i__3 = j;
+ cladiv_(&q__1, &x[j], &tjjs);
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ } else {
+
+/*
+ A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and
+ scale = 0 and compute a solution to A**T *x = 0.
+*/
+
+ i__3 = *n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ x[i__4].r = 0.f, x[i__4].i = 0.f;
+/* L140: */
+ }
+ i__3 = j;
+ x[i__3].r = 1.f, x[i__3].i = 0.f;
+ *scale = 0.f;
+ xmax = 0.f;
+ }
+L145:
+ ;
+ } else {
+
+/*
+ Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot
+ product has already been divided by 1/A(j,j).
+*/
+
+ i__3 = j;
+ cladiv_(&q__2, &x[j], &tjjs);
+ q__1.r = q__2.r - csumj.r, q__1.i = q__2.i - csumj.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ }
+/* Computing MAX */
+ i__3 = j;
+ r__3 = xmax, r__4 = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 =
+ r_imag(&x[j]), dabs(r__2));
+ xmax = dmax(r__3,r__4);
+/* L150: */
+ }
+
+ } else {
+
+/* Solve A**H * x = b */
+
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/*
+ Compute x(j) = b(j) - sum A(k,j)*x(k).
+ k<>j
+*/
+
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]),
+ dabs(r__2));
+ uscal.r = tscal, uscal.i = 0.f;
+ rec = 1.f / dmax(xmax,1.f);
+ if (cnorm[j] > (bignum - xj) * rec) {
+
+/* If x(j) could overflow, scale x by 1/(2*XMAX). */
+
+ rec *= .5f;
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ q__1.r = tscal * q__2.r, q__1.i = tscal * q__2.i;
+ tjjs.r = q__1.r, tjjs.i = q__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.f;
+ }
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+ if (tjj > 1.f) {
+
+/*
+ Divide by A(j,j) when scaling x if A(j,j) > 1.
+
+ Computing MIN
+*/
+ r__1 = 1.f, r__2 = rec * tjj;
+ rec = dmin(r__1,r__2);
+ cladiv_(&q__1, &uscal, &tjjs);
+ uscal.r = q__1.r, uscal.i = q__1.i;
+ }
+ if (rec < 1.f) {
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+
+ csumj.r = 0.f, csumj.i = 0.f;
+ if (uscal.r == 1.f && uscal.i == 0.f) {
+
+/*
+ If the scaling needed for A in the dot product is 1,
+ call CDOTC to perform the dot product.
+*/
+
+ if (upper) {
+ i__3 = j - 1;
+ cdotc_(&q__1, &i__3, &a[j * a_dim1 + 1], &c__1, &x[1],
+ &c__1);
+ csumj.r = q__1.r, csumj.i = q__1.i;
+ } else if (j < *n) {
+ i__3 = *n - j;
+ cdotc_(&q__1, &i__3, &a[j + 1 + j * a_dim1], &c__1, &
+ x[j + 1], &c__1);
+ csumj.r = q__1.r, csumj.i = q__1.i;
+ }
+ } else {
+
+/* Otherwise, use in-line code for the dot product. */
+
+ if (upper) {
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ r_cnjg(&q__4, &a[i__ + j * a_dim1]);
+ q__3.r = q__4.r * uscal.r - q__4.i * uscal.i,
+ q__3.i = q__4.r * uscal.i + q__4.i *
+ uscal.r;
+ i__4 = i__;
+ q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i,
+ q__2.i = q__3.r * x[i__4].i + q__3.i * x[
+ i__4].r;
+ q__1.r = csumj.r + q__2.r, q__1.i = csumj.i +
+ q__2.i;
+ csumj.r = q__1.r, csumj.i = q__1.i;
+/* L160: */
+ }
+ } else if (j < *n) {
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ r_cnjg(&q__4, &a[i__ + j * a_dim1]);
+ q__3.r = q__4.r * uscal.r - q__4.i * uscal.i,
+ q__3.i = q__4.r * uscal.i + q__4.i *
+ uscal.r;
+ i__4 = i__;
+ q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i,
+ q__2.i = q__3.r * x[i__4].i + q__3.i * x[
+ i__4].r;
+ q__1.r = csumj.r + q__2.r, q__1.i = csumj.i +
+ q__2.i;
+ csumj.r = q__1.r, csumj.i = q__1.i;
+/* L170: */
+ }
+ }
+ }
+
+ q__1.r = tscal, q__1.i = 0.f;
+ if (uscal.r == q__1.r && uscal.i == q__1.i) {
+
+/*
+ Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j)
+ was not used to scale the dot product.
+*/
+
+ i__3 = j;
+ i__4 = j;
+ q__1.r = x[i__4].r - csumj.r, q__1.i = x[i__4].i -
+ csumj.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ i__3 = j;
+ xj = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 = r_imag(&x[j]
+ ), dabs(r__2));
+ if (nounit) {
+ r_cnjg(&q__2, &a[j + j * a_dim1]);
+ q__1.r = tscal * q__2.r, q__1.i = tscal * q__2.i;
+ tjjs.r = q__1.r, tjjs.i = q__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.f;
+ if (tscal == 1.f) {
+ goto L185;
+ }
+ }
+
+/* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
+
+ tjj = (r__1 = tjjs.r, dabs(r__1)) + (r__2 = r_imag(&tjjs),
+ dabs(r__2));
+ if (tjj > smlnum) {
+
+/* abs(A(j,j)) > SMLNUM: */
+
+ if (tjj < 1.f) {
+ if (xj > tjj * bignum) {
+
+/* Scale X by 1/abs(x(j)). */
+
+ rec = 1.f / xj;
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+ i__3 = j;
+ cladiv_(&q__1, &x[j], &tjjs);
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ } else if (tjj > 0.f) {
+
+/* 0 < abs(A(j,j)) <= SMLNUM: */
+
+ if (xj > tjj * bignum) {
+
+/* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
+
+ rec = tjj * bignum / xj;
+ csscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ i__3 = j;
+ cladiv_(&q__1, &x[j], &tjjs);
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ } else {
+
+/*
+ A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and
+ scale = 0 and compute a solution to A**H *x = 0.
+*/
+
+ i__3 = *n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ x[i__4].r = 0.f, x[i__4].i = 0.f;
+/* L180: */
+ }
+ i__3 = j;
+ x[i__3].r = 1.f, x[i__3].i = 0.f;
+ *scale = 0.f;
+ xmax = 0.f;
+ }
+L185:
+ ;
+ } else {
+
+/*
+ Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot
+ product has already been divided by 1/A(j,j).
+*/
+
+ i__3 = j;
+ cladiv_(&q__2, &x[j], &tjjs);
+ q__1.r = q__2.r - csumj.r, q__1.i = q__2.i - csumj.i;
+ x[i__3].r = q__1.r, x[i__3].i = q__1.i;
+ }
+/* Computing MAX */
+ i__3 = j;
+ r__3 = xmax, r__4 = (r__1 = x[i__3].r, dabs(r__1)) + (r__2 =
+ r_imag(&x[j]), dabs(r__2));
+ xmax = dmax(r__3,r__4);
+/* L190: */
+ }
+ }
+ *scale /= tscal;
+ }
+
+/* Scale the column norms by 1/TSCAL for return. */
+
+ if (tscal != 1.f) {
+ r__1 = 1.f / tscal;
+ sscal_(n, &r__1, &cnorm[1], &c__1);
+ }
+
+ return 0;
+
+/* End of CLATRS */
+
+} /* clatrs_ */
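+
+/*
+   Minimal usage sketch, kept out of the build with #if 0, of calling
+   CLATRS above on a small lower triangular system L*x = s*b.  The
+   helper name clatrs_example is hypothetical, and the f2c typedefs
+   integer/real/complex are assumed to come from the f2c.h header used
+   by the rest of this file.
+*/
+#if 0
+static void clatrs_example(void)
+{
+    integer n = 2, lda = 2, info = 0;
+    /* L = [ 1 0 ; 2 3 ] stored column-major, imaginary parts zero. */
+    complex a[4] = { {1.f,0.f}, {2.f,0.f}, {0.f,0.f}, {3.f,0.f} };
+    /* Right hand side b = ( 1, 8 ); overwritten with the solution. */
+    complex x[2] = { {1.f,0.f}, {8.f,0.f} };
+    real scale, cnorm[2];
+
+    /* NORMIN = "N": the column norms are computed and returned. */
+    clatrs_("L", "N", "N", "N", &n, a, &lda, x, &scale, cnorm, &info);
+
+    /* On exit x holds the scaled solution of L*x = scale*b; here
+       scale == 1 and x = ( 1, 2 ).  Whenever scale < 1 the caller
+       must remember that the true solution is x / scale. */
+}
+#endif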
+
+/* Subroutine */ int clauu2_(char *uplo, integer *n, complex *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ real r__1;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__;
+ static real aii;
+ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer
+ *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex *
+ , complex *, integer *, complex *, integer *, complex *, complex *
+ , integer *);
+ static logical upper;
+ extern /* Subroutine */ int clacgv_(integer *, complex *, integer *),
+ csscal_(integer *, real *, complex *, integer *), xerbla_(char *,
+ integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLAUU2 computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLAUU2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ aii = a[i__2].r;
+ if (i__ < *n) {
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = *n - i__;
+ cdotc_(&q__1, &i__3, &a[i__ + (i__ + 1) * a_dim1], lda, &a[
+ i__ + (i__ + 1) * a_dim1], lda);
+ r__1 = aii * aii + q__1.r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ q__1.r = aii, q__1.i = 0.f;
+ cgemv_("No transpose", &i__2, &i__3, &c_b57, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ q__1, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ clacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ } else {
+ csscal_(&i__, &aii, &a[i__ * a_dim1 + 1], &c__1);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ aii = a[i__2].r;
+ if (i__ < *n) {
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = *n - i__;
+ cdotc_(&q__1, &i__3, &a[i__ + 1 + i__ * a_dim1], &c__1, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+ r__1 = aii * aii + q__1.r;
+ a[i__2].r = r__1, a[i__2].i = 0.f;
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ q__1.r = aii, q__1.i = 0.f;
+ cgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ q__1, &a[i__ + a_dim1], lda);
+ i__2 = i__ - 1;
+ clacgv_(&i__2, &a[i__ + a_dim1], lda);
+ } else {
+ csscal_(&i__, &aii, &a[i__ + a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of CLAUU2 */
+
+} /* clauu2_ */
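+
+/*
+   Minimal sketch, kept out of the build with #if 0, of what CLAUU2
+   computes for a 2-by-2 upper triangular factor.  The helper name
+   clauu2_example is hypothetical; the f2c typedefs integer/complex
+   are assumed to come from f2c.h as elsewhere in this file.
+*/
+#if 0
+static void clauu2_example(void)
+{
+    integer n = 2, lda = 2, info = 0;
+    /* Upper triangular U = [ 1 2 ; 0 3 ], stored column-major. */
+    complex a[4] = { {1.f,0.f}, {0.f,0.f}, {2.f,0.f}, {3.f,0.f} };
+
+    clauu2_("U", &n, a, &lda, &info);
+
+    /* The upper triangle now holds U * U' = [ 5 6 ; . 9 ];
+       the strictly lower part of A is left untouched. */
+}
+#endif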
+
+/* Subroutine */ int clauum_(char *uplo, integer *n, complex *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, ib, nb;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *), cherk_(char *,
+ char *, integer *, integer *, real *, complex *, integer *, real *
+ , complex *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *);
+ static logical upper;
+ extern /* Subroutine */ int clauu2_(char *, integer *, complex *, integer
+ *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CLAUUM computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CLAUUM", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "CLAUUM", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ clauu2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ ctrmm_("Right", "Upper", "Conjugate transpose", "Non-unit", &
+ i__3, &ib, &c_b57, &a[i__ + i__ * a_dim1], lda, &a[
+ i__ * a_dim1 + 1], lda);
+ clauu2_("Upper", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ cgemm_("No transpose", "Conjugate transpose", &i__3, &ib,
+ &i__4, &c_b57, &a[(i__ + ib) * a_dim1 + 1], lda, &
+ a[i__ + (i__ + ib) * a_dim1], lda, &c_b57, &a[i__
+ * a_dim1 + 1], lda);
+ i__3 = *n - i__ - ib + 1;
+ cherk_("Upper", "No transpose", &ib, &i__3, &c_b1034, &a[
+ i__ + (i__ + ib) * a_dim1], lda, &c_b1034, &a[i__
+ + i__ * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ ctrmm_("Left", "Lower", "Conjugate transpose", "Non-unit", &
+ ib, &i__3, &c_b57, &a[i__ + i__ * a_dim1], lda, &a[
+ i__ + a_dim1], lda);
+ clauu2_("Lower", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ cgemm_("Conjugate transpose", "No transpose", &ib, &i__3,
+ &i__4, &c_b57, &a[i__ + ib + i__ * a_dim1], lda, &
+ a[i__ + ib + a_dim1], lda, &c_b57, &a[i__ +
+ a_dim1], lda);
+ i__3 = *n - i__ - ib + 1;
+ cherk_("Lower", "Conjugate transpose", &ib, &i__3, &
+ c_b1034, &a[i__ + ib + i__ * a_dim1], lda, &
+ c_b1034, &a[i__ + i__ * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CLAUUM */
+
+} /* clauum_ */
+
+/* Subroutine */ int cpotf2_(char *uplo, integer *n, complex *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ real r__1;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer j;
+ static real ajj;
+ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer
+ *, complex *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex *
+ , complex *, integer *, complex *, integer *, complex *, complex *
+ , integer *);
+ static logical upper;
+ extern /* Subroutine */ int clacgv_(integer *, complex *, integer *),
+ csscal_(integer *, real *, complex *, integer *), xerbla_(char *,
+ integer *);
+ extern logical sisnan_(real *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CPOTF2 computes the Cholesky factorization of a complex Hermitian
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U' * U , if UPLO = 'U', or
+ A = L * L', if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ Hermitian matrix A is stored.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ n by n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U'*U or A = L*L'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, the leading minor of order k is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CPOTF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute U(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j + j * a_dim1;
+ r__1 = a[i__2].r;
+ i__3 = j - 1;
+ cdotc_(&q__2, &i__3, &a[j * a_dim1 + 1], &c__1, &a[j * a_dim1 + 1]
+ , &c__1);
+ q__1.r = r__1 - q__2.r, q__1.i = -q__2.i;
+ ajj = q__1.r;
+ if (ajj <= 0.f || sisnan_(&ajj)) {
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.f;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.f;
+
+/* Compute elements J+1:N of row J. */
+
+ if (j < *n) {
+ i__2 = j - 1;
+ clacgv_(&i__2, &a[j * a_dim1 + 1], &c__1);
+ i__2 = j - 1;
+ i__3 = *n - j;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("Transpose", &i__2, &i__3, &q__1, &a[(j + 1) * a_dim1
+ + 1], lda, &a[j * a_dim1 + 1], &c__1, &c_b57, &a[j + (
+ j + 1) * a_dim1], lda);
+ i__2 = j - 1;
+ clacgv_(&i__2, &a[j * a_dim1 + 1], &c__1);
+ i__2 = *n - j;
+ r__1 = 1.f / ajj;
+ csscal_(&i__2, &r__1, &a[j + (j + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute L(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j + j * a_dim1;
+ r__1 = a[i__2].r;
+ i__3 = j - 1;
+ cdotc_(&q__2, &i__3, &a[j + a_dim1], lda, &a[j + a_dim1], lda);
+ q__1.r = r__1 - q__2.r, q__1.i = -q__2.i;
+ ajj = q__1.r;
+ if (ajj <= 0.f || sisnan_(&ajj)) {
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.f;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.f;
+
+/* Compute elements J+1:N of column J. */
+
+ if (j < *n) {
+ i__2 = j - 1;
+ clacgv_(&i__2, &a[j + a_dim1], lda);
+ i__2 = *n - j;
+ i__3 = j - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__2, &i__3, &q__1, &a[j + 1 + a_dim1]
+ , lda, &a[j + a_dim1], lda, &c_b57, &a[j + 1 + j *
+ a_dim1], &c__1);
+ i__2 = j - 1;
+ clacgv_(&i__2, &a[j + a_dim1], lda);
+ i__2 = *n - j;
+ r__1 = 1.f / ajj;
+ csscal_(&i__2, &r__1, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+ goto L40;
+
+L30:
+ *info = j;
+
+L40:
+ return 0;
+
+/* End of CPOTF2 */
+
+} /* cpotf2_ */
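+
+/*
+   Minimal sketch, kept out of the build with #if 0, of the unblocked
+   Cholesky factorization above on a 2-by-2 Hermitian positive definite
+   matrix.  The helper name cpotf2_example is hypothetical; the f2c
+   typedefs integer/complex are assumed from f2c.h.
+*/
+#if 0
+static void cpotf2_example(void)
+{
+    integer n = 2, lda = 2, info = 0;
+    /* A = [ 4 2 ; 2 3 ], column-major; only the upper triangle is
+       referenced when UPLO = "U". */
+    complex a[4] = { {4.f,0.f}, {2.f,0.f}, {2.f,0.f}, {3.f,0.f} };
+
+    cpotf2_("U", &n, a, &lda, &info);
+
+    /* info == 0 and the upper triangle now holds U with
+       U(1,1) = 2, U(1,2) = 1, U(2,2) = sqrt(2), so that A = U'*U. */
+}
+#endif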
+
+/* Subroutine */ int cpotrf_(char *uplo, integer *n, complex *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+ complex q__1;
+
+ /* Local variables */
+ static integer j, jb, nb;
+ extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *,
+ integer *, complex *, complex *, integer *, complex *, integer *,
+ complex *, complex *, integer *), cherk_(char *,
+ char *, integer *, integer *, real *, complex *, integer *, real *
+ , complex *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *);
+ static logical upper;
+ extern /* Subroutine */ int cpotf2_(char *, integer *, complex *, integer
+ *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CPOTRF computes the Cholesky factorization of a complex Hermitian
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U**H * U, if UPLO = 'U', or
+ A = L * L**H, if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the block version of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U**H*U or A = L*L**H.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the leading minor of order i is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CPOTRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "CPOTRF", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code. */
+
+ cpotf2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code. */
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ cherk_("Upper", "Conjugate transpose", &jb, &i__3, &c_b1276, &
+ a[j * a_dim1 + 1], lda, &c_b1034, &a[j + j * a_dim1],
+ lda);
+ cpotf2_("Upper", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block row. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("Conjugate transpose", "No transpose", &jb, &i__3,
+ &i__4, &q__1, &a[j * a_dim1 + 1], lda, &a[(j + jb)
+ * a_dim1 + 1], lda, &c_b57, &a[j + (j + jb) *
+ a_dim1], lda);
+ i__3 = *n - j - jb + 1;
+ ctrsm_("Left", "Upper", "Conjugate transpose", "Non-unit",
+ &jb, &i__3, &c_b57, &a[j + j * a_dim1], lda, &a[
+ j + (j + jb) * a_dim1], lda);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ cherk_("Lower", "No transpose", &jb, &i__3, &c_b1276, &a[j +
+ a_dim1], lda, &c_b1034, &a[j + j * a_dim1], lda);
+ cpotf2_("Lower", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block column. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemm_("No transpose", "Conjugate transpose", &i__3, &jb,
+ &i__4, &q__1, &a[j + jb + a_dim1], lda, &a[j +
+ a_dim1], lda, &c_b57, &a[j + jb + j * a_dim1],
+ lda);
+ i__3 = *n - j - jb + 1;
+ ctrsm_("Right", "Lower", "Conjugate transpose", "Non-unit"
+ , &i__3, &jb, &c_b57, &a[j + j * a_dim1], lda, &a[
+ j + jb + j * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+ goto L40;
+
+L30:
+ *info = *info + j - 1;
+
+L40:
+ return 0;
+
+/* End of CPOTRF */
+
+} /* cpotrf_ */
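+
+/*
+   Minimal sketch, kept out of the build with #if 0, of the INFO > 0
+   behaviour documented above: CPOTRF reports the order of the first
+   leading minor that is not positive definite.  The helper name
+   cpotrf_info_example is hypothetical; f2c typedefs assumed from f2c.h.
+*/
+#if 0
+static void cpotrf_info_example(void)
+{
+    integer n = 2, lda = 2, info = 0;
+    /* A = [ 1 2 ; 2 1 ] is Hermitian but indefinite (det = -3). */
+    complex a[4] = { {1.f,0.f}, {2.f,0.f}, {2.f,0.f}, {1.f,0.f} };
+
+    cpotrf_("U", &n, a, &lda, &info);
+
+    /* info == 2: the leading minor of order 2 is not positive
+       definite, so the factorization stops as documented above. */
+}
+#endif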
+
+/* Subroutine */ int cpotri_(char *uplo, integer *n, complex *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), clauum_(
+ char *, integer *, complex *, integer *, integer *),
+ ctrtri_(char *, char *, integer *, complex *, integer *, integer *
+ );
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CPOTRI computes the inverse of a complex Hermitian positive definite
+ matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+ computed by CPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the triangular factor U or L from the Cholesky
+ factorization A = U**H*U or A = L*L**H, as computed by
+ CPOTRF.
+ On exit, the upper or lower triangle of the (Hermitian)
+ inverse of A, overwriting the input factor U or L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the (i,i) element of the factor U or L is
+ zero, and the inverse could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CPOTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Invert the triangular Cholesky factor U or L. */
+
+ ctrtri_(uplo, "Non-unit", n, &a[a_offset], lda, info);
+ if (*info > 0) {
+ return 0;
+ }
+
+/* Form inv(U)*inv(U)' or inv(L)'*inv(L). */
+
+ clauum_(uplo, n, &a[a_offset], lda, info);
+
+ return 0;
+
+/* End of CPOTRI */
+
+} /* cpotri_ */
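+
+/*
+   Minimal sketch, kept out of the build with #if 0, of forming the
+   inverse of a Hermitian positive definite matrix with CPOTRF followed
+   by CPOTRI.  The helper name cpotri_example is hypothetical; f2c
+   typedefs assumed from f2c.h.
+*/
+#if 0
+static void cpotri_example(void)
+{
+    integer n = 2, lda = 2, info = 0;
+    /* A = [ 2 1 ; 1 2 ];  inv(A) = (1/3) * [ 2 -1 ; -1 2 ]. */
+    complex a[4] = { {2.f,0.f}, {1.f,0.f}, {1.f,0.f}, {2.f,0.f} };
+
+    cpotrf_("U", &n, a, &lda, &info);          /* factor A = U**H*U     */
+    if (info == 0) {
+        cpotri_("U", &n, a, &lda, &info);      /* overwrite with inv(A) */
+    }
+
+    /* On success a(1,1) ~ 2/3, a(1,2) ~ -1/3, a(2,2) ~ 2/3; only the
+       upper triangle is referenced and overwritten for UPLO = "U". */
+}
+#endif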
+
+/* Subroutine */ int cpotrs_(char *uplo, integer *n, integer *nrhs, complex *
+ a, integer *lda, complex *b, integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *);
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CPOTRS solves a system of linear equations A*X = B with a Hermitian
+ positive definite matrix A using the Cholesky factorization
+ A = U**H*U or A = L*L**H computed by CPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The triangular factor U or L from the Cholesky factorization
+ A = U**H*U or A = L*L**H, as computed by CPOTRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ B (input/output) COMPLEX array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CPOTRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/*
+ Solve A*X = B where A = U'*U.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ ctrsm_("Left", "Upper", "Conjugate transpose", "Non-unit", n, nrhs, &
+ c_b57, &a[a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ ctrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b57, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A*X = B where A = L*L'.
+
+ Solve L*X = B, overwriting B with X.
+*/
+
+ ctrsm_("Left", "Lower", "No transpose", "Non-unit", n, nrhs, &c_b57, &
+ a[a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ ctrsm_("Left", "Lower", "Conjugate transpose", "Non-unit", n, nrhs, &
+ c_b57, &a[a_offset], lda, &b[b_offset], ldb);
+ }
+
+ return 0;
+
+/* End of CPOTRS */
+
+} /* cpotrs_ */
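+
+/*
+   Minimal sketch, kept out of the build with #if 0, of the usual
+   pairing of CPOTRF and CPOTRS to solve A*x = b with a Hermitian
+   positive definite A.  The helper name cpotrf_cpotrs_example is
+   hypothetical; f2c typedefs assumed from f2c.h.
+*/
+#if 0
+static void cpotrf_cpotrs_example(void)
+{
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info = 0;
+    /* A = [ 2 1 ; 1 2 ], column-major. */
+    complex a[4] = { {2.f,0.f}, {1.f,0.f}, {1.f,0.f}, {2.f,0.f} };
+    /* Right hand side b = ( 3, 3 ); overwritten with the solution. */
+    complex b[2] = { {3.f,0.f}, {3.f,0.f} };
+
+    cpotrf_("U", &n, a, &lda, &info);                    /* A = U**H*U */
+    if (info == 0) {
+        cpotrs_("U", &n, &nrhs, a, &lda, b, &ldb, &info);
+    }
+
+    /* On success b now holds the solution x = ( 1, 1 ) of A*x = b. */
+}
+#endif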
+
+/* Subroutine */ int crot_(integer *n, complex *cx, integer *incx, complex *
+ cy, integer *incy, real *c__, complex *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static complex stemp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CROT applies a plane rotation, where the cos (C) is real and the
+ sin (S) is complex, and the vectors CX and CY are complex.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of elements in the vectors CX and CY.
+
+ CX (input/output) COMPLEX array, dimension (N)
+ On input, the vector X.
+ On output, CX is overwritten with C*X + S*Y.
+
+ INCX (input) INTEGER
+ The increment between successive values of CX. INCX <> 0.
+
+ CY (input/output) COMPLEX array, dimension (N)
+ On input, the vector Y.
+ On output, CY is overwritten with -CONJG(S)*X + C*Y.
+
+ INCY (input) INTEGER
+ The increment between successive values of CY. INCY <> 0.
+
+ C (input) REAL
+ S (input) COMPLEX
+ C and S define a rotation
+ [ C S ]
+ [ -conjg(S) C ]
+ where C*C + S*CONJG(S) = 1.0.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r, stemp.i = q__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ r_cnjg(&q__4, s);
+ i__4 = ix;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r *
+ cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r, stemp.i = q__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ r_cnjg(&q__4, s);
+ i__4 = i__;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r *
+ cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+/* L30: */
+ }
+ return 0;
+} /* crot_ */
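+
+/*
+   A minimal, hypothetical usage sketch for crot_, assuming the f2c
+   typedefs integer/real/complex from f2c.h; the example function name and
+   values are illustrative only, and the block is guarded by "#if 0" so it
+   is never compiled.  With C = 0 and S = 1 the rotation exchanges the
+   vectors up to sign: CX receives Y and CY receives -X.
+*/
+#if 0
+static void example_crot_usage(void)
+{
+    complex cx[2] = {{1.f, 0.f}, {2.f, 0.f}};
+    complex cy[2] = {{3.f, 0.f}, {4.f, 0.f}};
+    complex s = {1.f, 0.f};
+    real c = 0.f;
+    integer n = 2, incx = 1, incy = 1;
+
+    crot_(&n, cx, &incx, cy, &incy, &c, &s);
+    /* Now cx = (3, 4) and cy = (-1, -2). */
+}
+#endif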
+
+/* Subroutine */ int cstedc_(char *compz, integer *n, real *d__, real *e,
+ complex *z__, integer *ldz, complex *work, integer *lwork, real *
+ rwork, integer *lrwork, integer *iwork, integer *liwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, j, k, m;
+ static real p;
+ static integer ii, ll, lgn;
+ static real eps, tiny;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int cswap_(integer *, complex *, integer *,
+ complex *, integer *);
+ static integer lwmin;
+ extern /* Subroutine */ int claed0_(integer *, integer *, real *, real *,
+ complex *, integer *, complex *, integer *, real *, integer *,
+ integer *);
+ static integer start;
+ extern /* Subroutine */ int clacrm_(integer *, integer *, complex *,
+ integer *, real *, integer *, complex *, integer *, real *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex
+ *, integer *, complex *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer finish;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), sstedc_(char *, integer *, real *, real *, real *,
+ integer *, real *, integer *, integer *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *,
+ real *, integer *);
+ static integer liwmin, icompz;
+ extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *,
+ complex *, integer *, real *, integer *);
+ static real orgnrm;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *);
+ static integer lrwmin;
+ static logical lquery;
+ static integer smlsiz;
+ extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *,
+ real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CSTEDC computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the divide and conquer method.
+ The eigenvectors of a full or band complex Hermitian matrix can also
+ be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
+ matrix to tridiagonal form.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none. See SLAED3 for details.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'I': Compute eigenvectors of tridiagonal matrix also.
+ = 'V': Compute eigenvectors of original Hermitian matrix
+ also. On entry, Z contains the unitary matrix used
+ to reduce the original matrix to tridiagonal form.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the subdiagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) COMPLEX array, dimension (LDZ,N)
+ On entry, if COMPZ = 'V', then Z contains the unitary
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original Hermitian matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1.
+ If eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If COMPZ = 'N' or 'I', or N <= 1, LWORK must be at least 1.
+ If COMPZ = 'V' and N > 1, LWORK must be at least N*N.
+ Note that for COMPZ = 'V', if N is less than or
+ equal to the minimum divide size, usually 25, then LWORK need
+ only be 1.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal sizes of the WORK, RWORK and
+ IWORK arrays, returns these values as the first entries of
+ the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ RWORK (workspace/output) REAL array, dimension (MAX(1,LRWORK))
+ On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+
+ LRWORK (input) INTEGER
+ The dimension of the array RWORK.
+ If COMPZ = 'N' or N <= 1, LRWORK must be at least 1.
+ If COMPZ = 'V' and N > 1, LRWORK must be at least
+ 1 + 3*N + 2*N*lg N + 3*N**2 ,
+ where lg( N ) = smallest integer k such
+ that 2**k >= N.
+ If COMPZ = 'I' and N > 1, LRWORK must be at least
+ 1 + 4*N + 2*N**2 .
+ Note that for COMPZ = 'I' or 'V', if N is less than or
+ equal to the minimum divide size, usually 25, then LRWORK
+ need only be max(1,2*(N-1)).
+
+ If LRWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If COMPZ = 'N' or N <= 1, LIWORK must be at least 1.
+ If COMPZ = 'V' and N > 1, LIWORK must be at least
+ 6 + 6*N + 5*N*lg N.
+ If COMPZ = 'I' and N > 1, LIWORK must be at least
+ 3 + 5*N .
+ Note that for COMPZ = 'I' or 'V', if N is less than or
+ equal to the minimum divide size, usually 25, then LIWORK
+ need only be 1.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1 || *lrwork == -1 || *liwork == -1;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+
+ if (*info == 0) {
+
+/* Compute the workspace requirements */
+
+ smlsiz = ilaenv_(&c__9, "CSTEDC", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+ if (*n <= 1 || icompz == 0) {
+ lwmin = 1;
+ liwmin = 1;
+ lrwmin = 1;
+ } else if (*n <= smlsiz) {
+ lwmin = 1;
+ liwmin = 1;
+ lrwmin = *n - 1 << 1;
+ } else if (icompz == 1) {
+ lgn = (integer) (log((real) (*n)) / log(2.f));
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ lwmin = *n * *n;
+/* Computing 2nd power */
+ i__1 = *n;
+ lrwmin = *n * 3 + 1 + (*n << 1) * lgn + i__1 * i__1 * 3;
+ liwmin = *n * 6 + 6 + *n * 5 * lgn;
+ } else if (icompz == 2) {
+ lwmin = 1;
+/* Computing 2nd power */
+ i__1 = *n;
+ lrwmin = (*n << 2) + 1 + (i__1 * i__1 << 1);
+ liwmin = *n * 5 + 3;
+ }
+ work[1].r = (real) lwmin, work[1].i = 0.f;
+ rwork[1] = (real) lrwmin;
+ iwork[1] = liwmin;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*lrwork < lrwmin && ! lquery) {
+ *info = -10;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CSTEDC", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*n == 1) {
+ if (icompz != 0) {
+ i__1 = z_dim1 + 1;
+ z__[i__1].r = 1.f, z__[i__1].i = 0.f;
+ }
+ return 0;
+ }
+
+/*
+ If the following conditional clause is removed, then the routine
+ will use the Divide and Conquer routine to compute only the
+ eigenvalues, which requires (3N + 3N**2) real workspace and
+ (2 + 5N + 2N lg(N)) integer workspace.
+ Since on many architectures SSTERF is much faster than any other
+ algorithm for finding eigenvalues only, it is used here
+ as the default. If the conditional clause is removed, then
+ information on the size of workspace needs to be changed.
+
+ If COMPZ = 'N', use SSTERF to compute the eigenvalues.
+*/
+
+ if (icompz == 0) {
+ ssterf_(n, &d__[1], &e[1], info);
+ goto L70;
+ }
+
+/*
+ If N is smaller than the minimum divide size (SMLSIZ+1), then
+ solve the problem with another solver.
+*/
+
+ if (*n <= smlsiz) {
+
+ csteqr_(compz, n, &d__[1], &e[1], &z__[z_offset], ldz, &rwork[1],
+ info);
+
+ } else {
+
+/* If COMPZ = 'I', we simply call SSTEDC instead. */
+
+ if (icompz == 2) {
+ slaset_("Full", n, n, &c_b328, &c_b1034, &rwork[1], n);
+ ll = *n * *n + 1;
+ i__1 = *lrwork - ll + 1;
+ sstedc_("I", n, &d__[1], &e[1], &rwork[1], n, &rwork[ll], &i__1, &
+ iwork[1], liwork, info);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * z_dim1;
+ i__4 = (j - 1) * *n + i__;
+ z__[i__3].r = rwork[i__4], z__[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ goto L70;
+ }
+
+/*
+ From now on, only option left to be handled is COMPZ = 'V',
+ i.e. ICOMPZ = 1.
+
+ Scale.
+*/
+
+ orgnrm = slanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.f) {
+ goto L70;
+ }
+
+ eps = slamch_("Epsilon");
+
+ start = 1;
+
+/* while ( START <= N ) */
+
+L30:
+ if (start <= *n) {
+
+/*
+ Let FINISH be the position of the next subdiagonal entry
+ such that E( FINISH ) <= TINY or FINISH = N if no such
+ subdiagonal exists. The matrix identified by the elements
+ between START and FINISH constitutes an independent
+ sub-problem.
+*/
+
+ finish = start;
+L40:
+ if (finish < *n) {
+ tiny = eps * sqrt((r__1 = d__[finish], dabs(r__1))) * sqrt((
+ r__2 = d__[finish + 1], dabs(r__2)));
+ if ((r__1 = e[finish], dabs(r__1)) > tiny) {
+ ++finish;
+ goto L40;
+ }
+ }
+
+/* (Sub) Problem determined. Compute its size and solve it. */
+
+ m = finish - start + 1;
+ if (m > smlsiz) {
+
+/* Scale. */
+
+ orgnrm = slanst_("M", &m, &d__[start], &e[start]);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &m, &c__1, &d__[
+ start], &m, info);
+ i__1 = m - 1;
+ i__2 = m - 1;
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &i__1, &c__1, &
+ e[start], &i__2, info);
+
+ claed0_(n, &m, &d__[start], &e[start], &z__[start * z_dim1 +
+ 1], ldz, &work[1], n, &rwork[1], &iwork[1], info);
+ if (*info > 0) {
+ *info = (*info / (m + 1) + start - 1) * (*n + 1) + *info %
+ (m + 1) + start - 1;
+ goto L70;
+ }
+
+/* Scale back. */
+
+ slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, &m, &c__1, &d__[
+ start], &m, info);
+
+ } else {
+ ssteqr_("I", &m, &d__[start], &e[start], &rwork[1], &m, &
+ rwork[m * m + 1], info);
+ clacrm_(n, &m, &z__[start * z_dim1 + 1], ldz, &rwork[1], &m, &
+ work[1], n, &rwork[m * m + 1]);
+ clacpy_("A", n, &m, &work[1], n, &z__[start * z_dim1 + 1],
+ ldz);
+ if (*info > 0) {
+ *info = start * (*n + 1) + finish;
+ goto L70;
+ }
+ }
+
+ start = finish + 1;
+ goto L30;
+ }
+
+/*
+ endwhile
+
+ If the problem split any number of times, then the eigenvalues
+ will not be properly ordered. Here we permute the eigenvalues
+ (and the associated eigenvectors) into ascending order.
+*/
+
+ if (m != *n) {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L50: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ cswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k * z_dim1
+ + 1], &c__1);
+ }
+/* L60: */
+ }
+ }
+ }
+
+L70:
+ work[1].r = (real) lwmin, work[1].i = 0.f;
+ rwork[1] = (real) lrwmin;
+ iwork[1] = liwmin;
+
+ return 0;
+
+/* End of CSTEDC */
+
+} /* cstedc_ */
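+
+/*
+   A minimal, hypothetical usage sketch for cstedc_, assuming the f2c
+   typedefs integer/real/complex from f2c.h; the example function name and
+   values are illustrative only, and the block is guarded by "#if 0" so it
+   is never compiled.  With COMPZ = 'N' the routine only needs workspaces
+   of length 1 and defers to SSTERF internally, as noted in the comments
+   above.
+*/
+#if 0
+static void example_cstedc_usage(void)
+{
+    /* Symmetric tridiagonal matrix with diagonal (2, 2) and off-diagonal
+       (1); its eigenvalues are 1 and 3. */
+    real d[2] = {2.f, 2.f};
+    real e[1] = {1.f};
+    complex z[1];               /* not referenced when COMPZ = 'N' */
+    complex work[1];
+    real rwork[1];
+    integer iwork[1];
+    integer n = 2, ldz = 1, lwork = 1, lrwork = 1, liwork = 1, info = 0;
+
+    cstedc_("N", &n, d, e, z, &ldz, work, &lwork, rwork, &lrwork,
+            iwork, &liwork, &info);
+    /* On return info == 0 and d = (1, 3), in ascending order. */
+}
+#endif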
+
+/* Subroutine */ int csteqr_(char *compz, integer *n, real *d__, real *e,
+ complex *z__, integer *ldz, real *work, integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real b, c__, f, g;
+ static integer i__, j, k, l, m;
+ static real p, r__, s;
+ static integer l1, ii, mm, lm1, mm1, nm1;
+ static real rt1, rt2, eps;
+ static integer lsv;
+ static real tst, eps2;
+ static integer lend, jtot;
+ extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *)
+ ;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int clasr_(char *, char *, char *, integer *,
+ integer *, real *, real *, complex *, integer *);
+ static real anorm;
+ extern /* Subroutine */ int cswap_(integer *, complex *, integer *,
+ complex *, integer *);
+ static integer lendm1, lendp1;
+ extern /* Subroutine */ int slaev2_(real *, real *, real *, real *, real *
+ , real *, real *);
+ extern doublereal slapy2_(real *, real *);
+ static integer iscale;
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int claset_(char *, integer *, integer *, complex
+ *, complex *, complex *, integer *);
+ static real safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real safmax;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *);
+ static integer lendsv;
+ extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real *
+ );
+ static real ssfmin;
+ static integer nmaxit, icompz;
+ static real ssfmax;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CSTEQR computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the implicit QL or QR method.
+ The eigenvectors of a full or band complex Hermitian matrix can also
+ be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
+ matrix to tridiagonal form.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'V': Compute eigenvalues and eigenvectors of the original
+ Hermitian matrix. On entry, Z must contain the
+ unitary matrix used to reduce the original matrix
+ to tridiagonal form.
+ = 'I': Compute eigenvalues and eigenvectors of the
+ tridiagonal matrix. Z is initialized to the identity
+ matrix.
+
+ N (input) INTEGER
+ The order of the matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the (n-1) subdiagonal elements of the tridiagonal
+ matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) COMPLEX array, dimension (LDZ, N)
+ On entry, if COMPZ = 'V', then Z contains the unitary
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original Hermitian matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1, and if
+ eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace) REAL array, dimension (max(1,2*N-2))
+ If COMPZ = 'N', then WORK is not referenced.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm has failed to find all the eigenvalues in
+ a total of 30*N iterations; if INFO = i, then i
+ elements of E have not converged to zero; on exit, D
+ and E contain the elements of a symmetric tridiagonal
+ matrix which is unitarily similar to the original
+ matrix.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CSTEQR", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ if (icompz == 2) {
+ i__1 = z_dim1 + 1;
+ z__[i__1].r = 1.f, z__[i__1].i = 0.f;
+ }
+ return 0;
+ }
+
+/* Determine the unit roundoff and over/underflow thresholds. */
+
+ eps = slamch_("E");
+/* Computing 2nd power */
+ r__1 = eps;
+ eps2 = r__1 * r__1;
+ safmin = slamch_("S");
+ safmax = 1.f / safmin;
+ ssfmax = sqrt(safmax) / 3.f;
+ ssfmin = sqrt(safmin) / eps2;
+
+/*
+ Compute the eigenvalues and eigenvectors of the tridiagonal
+ matrix.
+*/
+
+ if (icompz == 2) {
+ claset_("Full", n, n, &c_b56, &c_b57, &z__[z_offset], ldz);
+ }
+
+ nmaxit = *n * 30;
+ jtot = 0;
+
+/*
+ Determine where the matrix splits and choose QL or QR iteration
+ for each block, according to whether top or bottom diagonal
+ element is smaller.
+*/
+
+ l1 = 1;
+ nm1 = *n - 1;
+
+L10:
+ if (l1 > *n) {
+ goto L160;
+ }
+ if (l1 > 1) {
+ e[l1 - 1] = 0.f;
+ }
+ if (l1 <= nm1) {
+ i__1 = nm1;
+ for (m = l1; m <= i__1; ++m) {
+ tst = (r__1 = e[m], dabs(r__1));
+ if (tst == 0.f) {
+ goto L30;
+ }
+ if (tst <= sqrt((r__1 = d__[m], dabs(r__1))) * sqrt((r__2 = d__[m
+ + 1], dabs(r__2))) * eps) {
+ e[m] = 0.f;
+ goto L30;
+ }
+/* L20: */
+ }
+ }
+ m = *n;
+
+L30:
+ l = l1;
+ lsv = l;
+ lend = m;
+ lendsv = lend;
+ l1 = m + 1;
+ if (lend == l) {
+ goto L10;
+ }
+
+/* Scale submatrix in rows and columns L to LEND */
+
+ i__1 = lend - l + 1;
+ anorm = slanst_("I", &i__1, &d__[l], &e[l]);
+ iscale = 0;
+ if (anorm == 0.f) {
+ goto L10;
+ }
+ if (anorm > ssfmax) {
+ iscale = 1;
+ i__1 = lend - l + 1;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n,
+ info);
+ } else if (anorm < ssfmin) {
+ iscale = 2;
+ i__1 = lend - l + 1;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n,
+ info);
+ }
+
+/* Choose between QL and QR iteration */
+
+ if ((r__1 = d__[lend], dabs(r__1)) < (r__2 = d__[l], dabs(r__2))) {
+ lend = lsv;
+ l = lendsv;
+ }
+
+ if (lend > l) {
+
+/*
+ QL Iteration
+
+ Look for small subdiagonal element.
+*/
+
+L40:
+ if (l != lend) {
+ lendm1 = lend - 1;
+ i__1 = lendm1;
+ for (m = l; m <= i__1; ++m) {
+/* Computing 2nd power */
+ r__2 = (r__1 = e[m], dabs(r__1));
+ tst = r__2 * r__2;
+ if (tst <= eps2 * (r__1 = d__[m], dabs(r__1)) * (r__2 = d__[m
+ + 1], dabs(r__2)) + safmin) {
+ goto L60;
+ }
+/* L50: */
+ }
+ }
+
+ m = lend;
+
+L60:
+ if (m < lend) {
+ e[m] = 0.f;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L80;
+ }
+
+/*
+ If remaining matrix is 2-by-2, use SLAE2 or SLAEV2
+ to compute its eigensystem.
+*/
+
+ if (m == l + 1) {
+ if (icompz > 0) {
+ slaev2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2, &c__, &s);
+ work[l] = c__;
+ work[*n - 1 + l] = s;
+ clasr_("R", "V", "B", n, &c__2, &work[l], &work[*n - 1 + l], &
+ z__[l * z_dim1 + 1], ldz);
+ } else {
+ slae2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2);
+ }
+ d__[l] = rt1;
+ d__[l + 1] = rt2;
+ e[l] = 0.f;
+ l += 2;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l + 1] - p) / (e[l] * 2.f);
+ r__ = slapy2_(&g, &c_b1034);
+ g = d__[m] - p + e[l] / (g + r_sign(&r__, &g));
+
+ s = 1.f;
+ c__ = 1.f;
+ p = 0.f;
+
+/* Inner loop */
+
+ mm1 = m - 1;
+ i__1 = l;
+ for (i__ = mm1; i__ >= i__1; --i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ slartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m - 1) {
+ e[i__ + 1] = r__;
+ }
+ g = d__[i__ + 1] - p;
+ r__ = (d__[i__] - g) * s + c__ * 2.f * b;
+ p = s * r__;
+ d__[i__ + 1] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = -s;
+ }
+
+/* L70: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = m - l + 1;
+ clasr_("R", "V", "B", n, &mm, &work[l], &work[*n - 1 + l], &z__[l
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[l] = g;
+ goto L40;
+
+/* Eigenvalue found. */
+
+L80:
+ d__[l] = p;
+
+ ++l;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+
+ } else {
+
+/*
+ QR Iteration
+
+ Look for small superdiagonal element.
+*/
+
+L90:
+ if (l != lend) {
+ lendp1 = lend + 1;
+ i__1 = lendp1;
+ for (m = l; m >= i__1; --m) {
+/* Computing 2nd power */
+ r__2 = (r__1 = e[m - 1], dabs(r__1));
+ tst = r__2 * r__2;
+ if (tst <= eps2 * (r__1 = d__[m], dabs(r__1)) * (r__2 = d__[m
+ - 1], dabs(r__2)) + safmin) {
+ goto L110;
+ }
+/* L100: */
+ }
+ }
+
+ m = lend;
+
+L110:
+ if (m > lend) {
+ e[m - 1] = 0.f;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L130;
+ }
+
+/*
+ If remaining matrix is 2-by-2, use SLAE2 or SLAEV2
+ to compute its eigensystem.
+*/
+
+ if (m == l - 1) {
+ if (icompz > 0) {
+ slaev2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2, &c__, &s)
+ ;
+ work[m] = c__;
+ work[*n - 1 + m] = s;
+ clasr_("R", "V", "F", n, &c__2, &work[m], &work[*n - 1 + m], &
+ z__[(l - 1) * z_dim1 + 1], ldz);
+ } else {
+ slae2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2);
+ }
+ d__[l - 1] = rt1;
+ d__[l] = rt2;
+ e[l - 1] = 0.f;
+ l += -2;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l - 1] - p) / (e[l - 1] * 2.f);
+ r__ = slapy2_(&g, &c_b1034);
+ g = d__[m] - p + e[l - 1] / (g + r_sign(&r__, &g));
+
+ s = 1.f;
+ c__ = 1.f;
+ p = 0.f;
+
+/* Inner loop */
+
+ lm1 = l - 1;
+ i__1 = lm1;
+ for (i__ = m; i__ <= i__1; ++i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ slartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m) {
+ e[i__ - 1] = r__;
+ }
+ g = d__[i__] - p;
+ r__ = (d__[i__ + 1] - g) * s + c__ * 2.f * b;
+ p = s * r__;
+ d__[i__] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = s;
+ }
+
+/* L120: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = l - m + 1;
+ clasr_("R", "V", "F", n, &mm, &work[m], &work[*n - 1 + m], &z__[m
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[lm1] = g;
+ goto L90;
+
+/* Eigenvalue found. */
+
+L130:
+ d__[l] = p;
+
+ --l;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+
+ }
+
+/* Undo scaling if necessary */
+
+L140:
+ if (iscale == 1) {
+ i__1 = lendsv - lsv + 1;
+ slascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ slascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ } else if (iscale == 2) {
+ i__1 = lendsv - lsv + 1;
+ slascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ slascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ }
+
+/*
+ Check for no convergence to an eigenvalue after a total
+ of N*MAXIT iterations.
+*/
+
+ if (jtot == nmaxit) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.f) {
+ ++(*info);
+ }
+/* L150: */
+ }
+ return 0;
+ }
+ goto L10;
+
+/* Order eigenvalues and eigenvectors. */
+
+L160:
+ if (icompz == 0) {
+
+/* Use Quick Sort */
+
+ slasrt_("I", n, &d__[1], info);
+
+ } else {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L170: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ cswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k * z_dim1 + 1],
+ &c__1);
+ }
+/* L180: */
+ }
+ }
+ return 0;
+
+/* End of CSTEQR */
+
+} /* csteqr_ */
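+
+/*
+   A minimal, hypothetical usage sketch for csteqr_, assuming the f2c
+   typedefs integer/real/complex from f2c.h; the example function name and
+   values are illustrative only, and the block is guarded by "#if 0" so it
+   is never compiled.  With COMPZ = 'N' neither Z nor WORK is referenced.
+*/
+#if 0
+static void example_csteqr_usage(void)
+{
+    /* Symmetric tridiagonal matrix with diagonal (2, 2) and off-diagonal
+       (1); its eigenvalues are 1 and 3. */
+    real d[2] = {2.f, 2.f};
+    real e[1] = {1.f};
+    complex z[1];               /* not referenced when COMPZ = 'N' */
+    real work[1];               /* not referenced when COMPZ = 'N' */
+    integer n = 2, ldz = 1, info = 0;
+
+    csteqr_("N", &n, d, e, z, &ldz, work, &info);
+    /* On return info == 0 and d = (1, 3), in ascending order. */
+}
+#endif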
+
+/* Subroutine */ int ctrevc_(char *side, char *howmny, logical *select,
+ integer *n, complex *t, integer *ldt, complex *vl, integer *ldvl,
+ complex *vr, integer *ldvr, integer *mm, integer *m, complex *work,
+ real *rwork, integer *info)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3, i__4, i__5;
+ real r__1, r__2, r__3;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j, k, ii, ki, is;
+ static real ulp;
+ static logical allv;
+ static real unfl, ovfl, smin;
+ static logical over;
+ static real scale;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex *
+ , complex *, integer *, complex *, integer *, complex *, complex *
+ , integer *);
+ static real remax;
+ extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+ complex *, integer *);
+ static logical leftv, bothv, somev;
+ extern /* Subroutine */ int slabad_(real *, real *);
+ extern integer icamax_(integer *, complex *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer
+ *), xerbla_(char *, integer *), clatrs_(char *, char *,
+ char *, char *, integer *, complex *, integer *, complex *, real *
+ , real *, integer *);
+ extern doublereal scasum_(integer *, complex *, integer *);
+ static logical rightv;
+ static real smlnum;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CTREVC computes some or all of the right and/or left eigenvectors of
+ a complex upper triangular matrix T.
+ Matrices of this type are produced by the Schur factorization of
+ a complex general matrix: A = Q*T*Q**H, as computed by CHSEQR.
+
+ The right eigenvector x and the left eigenvector y of T corresponding
+ to an eigenvalue w are defined by:
+
+ T*x = w*x, (y**H)*T = w*(y**H)
+
+ where y**H denotes the conjugate transpose of the vector y.
+ The eigenvalues are not input to this routine, but are read directly
+ from the diagonal of T.
+
+ This routine returns the matrices X and/or Y of right and left
+ eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+ input matrix. If Q is the unitary factor that reduces a matrix A to
+ Schur form T, then Q*X and Q*Y are the matrices of right and left
+ eigenvectors of A.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'R': compute right eigenvectors only;
+ = 'L': compute left eigenvectors only;
+ = 'B': compute both right and left eigenvectors.
+
+ HOWMNY (input) CHARACTER*1
+ = 'A': compute all right and/or left eigenvectors;
+ = 'B': compute all right and/or left eigenvectors,
+ backtransformed using the matrices supplied in
+ VR and/or VL;
+ = 'S': compute selected right and/or left eigenvectors,
+ as indicated by the logical array SELECT.
+
+ SELECT (input) LOGICAL array, dimension (N)
+ If HOWMNY = 'S', SELECT specifies the eigenvectors to be
+ computed.
+ The eigenvector corresponding to the j-th eigenvalue is
+ computed if SELECT(j) = .TRUE..
+ Not referenced if HOWMNY = 'A' or 'B'.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) COMPLEX array, dimension (LDT,N)
+ The upper triangular matrix T. T is modified, but restored
+ on exit.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ VL (input/output) COMPLEX array, dimension (LDVL,MM)
+ On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must
+ contain an N-by-N matrix Q (usually the unitary matrix Q of
+ Schur vectors returned by CHSEQR).
+ On exit, if SIDE = 'L' or 'B', VL contains:
+ if HOWMNY = 'A', the matrix Y of left eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*Y;
+ if HOWMNY = 'S', the left eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VL, in the same order as their
+ eigenvalues.
+ Not referenced if SIDE = 'R'.
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1, and if
+ SIDE = 'L' or 'B', LDVL >= N.
+
+ VR (input/output) COMPLEX array, dimension (LDVR,MM)
+ On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
+ contain an N-by-N matrix Q (usually the unitary matrix Q of
+ Schur vectors returned by CHSEQR).
+ On exit, if SIDE = 'R' or 'B', VR contains:
+ if HOWMNY = 'A', the matrix X of right eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*X;
+ if HOWMNY = 'S', the right eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VR, in the same order as their
+ eigenvalues.
+ Not referenced if SIDE = 'L'.
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1, and if
+ SIDE = 'R' or 'B', LDVR >= N.
+
+ MM (input) INTEGER
+ The number of columns in the arrays VL and/or VR. MM >= M.
+
+ M (output) INTEGER
+ The number of columns in the arrays VL and/or VR actually
+ used to store the eigenvectors. If HOWMNY = 'A' or 'B', M
+ is set to N. Each selected eigenvector occupies one
+ column.
+
+ WORK (workspace) COMPLEX array, dimension (2*N)
+
+ RWORK (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The algorithm used in this program is basically backward (forward)
+ substitution, with scaling to make the code robust against
+ possible overflow.
+
+ Each eigenvector is normalized so that the element of largest
+ magnitude has magnitude 1; here the magnitude of a complex number
+ (x,y) is taken to be |x| + |y|.
+
+ =====================================================================
+
+
+ Decode and test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --select;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+ --rwork;
+
+ /* Function Body */
+ bothv = lsame_(side, "B");
+ rightv = lsame_(side, "R") || bothv;
+ leftv = lsame_(side, "L") || bothv;
+
+ allv = lsame_(howmny, "A");
+ over = lsame_(howmny, "B");
+ somev = lsame_(howmny, "S");
+
+/*
+ Set M to the number of columns required to store the selected
+ eigenvectors.
+*/
+
+ if (somev) {
+ *m = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (select[j]) {
+ ++(*m);
+ }
+/* L10: */
+ }
+ } else {
+ *m = *n;
+ }
+
+ *info = 0;
+ if (! rightv && ! leftv) {
+ *info = -1;
+ } else if (! allv && ! over && ! somev) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ldt < max(1,*n)) {
+ *info = -6;
+ } else if (*ldvl < 1 || leftv && *ldvl < *n) {
+ *info = -8;
+ } else if (*ldvr < 1 || rightv && *ldvr < *n) {
+ *info = -10;
+ } else if (*mm < *m) {
+ *info = -11;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CTREVC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Set the constants to control overflow. */
+
+ unfl = slamch_("Safe minimum");
+ ovfl = 1.f / unfl;
+ slabad_(&unfl, &ovfl);
+ ulp = slamch_("Precision");
+ smlnum = unfl * (*n / ulp);
+
+/* Store the diagonal elements of T in working array WORK. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + *n;
+ i__3 = i__ + i__ * t_dim1;
+ work[i__2].r = t[i__3].r, work[i__2].i = t[i__3].i;
+/* L20: */
+ }
+
+/*
+ Compute 1-norm of each column of strictly upper triangular
+ part of T to control overflow in triangular solver.
+*/
+
+ rwork[1] = 0.f;
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ i__2 = j - 1;
+ rwork[j] = scasum_(&i__2, &t[j * t_dim1 + 1], &c__1);
+/* L30: */
+ }
+
+ if (rightv) {
+
+/* Compute right eigenvectors. */
+
+ is = *m;
+ for (ki = *n; ki >= 1; --ki) {
+
+ if (somev) {
+ if (! select[ki]) {
+ goto L80;
+ }
+ }
+/* Computing MAX */
+ i__1 = ki + ki * t_dim1;
+ r__3 = ulp * ((r__1 = t[i__1].r, dabs(r__1)) + (r__2 = r_imag(&t[
+ ki + ki * t_dim1]), dabs(r__2)));
+ smin = dmax(r__3,smlnum);
+
+ work[1].r = 1.f, work[1].i = 0.f;
+
+/* Form right-hand side. */
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k;
+ i__3 = k + ki * t_dim1;
+ q__1.r = -t[i__3].r, q__1.i = -t[i__3].i;
+ work[i__2].r = q__1.r, work[i__2].i = q__1.i;
+/* L40: */
+ }
+
+/*
+ Solve the triangular system:
+ (T(1:KI-1,1:KI-1) - T(KI,KI))*X = SCALE*WORK.
+*/
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k + k * t_dim1;
+ i__3 = k + k * t_dim1;
+ i__4 = ki + ki * t_dim1;
+ q__1.r = t[i__3].r - t[i__4].r, q__1.i = t[i__3].i - t[i__4]
+ .i;
+ t[i__2].r = q__1.r, t[i__2].i = q__1.i;
+ i__2 = k + k * t_dim1;
+ if ((r__1 = t[i__2].r, dabs(r__1)) + (r__2 = r_imag(&t[k + k *
+ t_dim1]), dabs(r__2)) < smin) {
+ i__3 = k + k * t_dim1;
+ t[i__3].r = smin, t[i__3].i = 0.f;
+ }
+/* L50: */
+ }
+
+ if (ki > 1) {
+ i__1 = ki - 1;
+ clatrs_("Upper", "No transpose", "Non-unit", "Y", &i__1, &t[
+ t_offset], ldt, &work[1], &scale, &rwork[1], info);
+ i__1 = ki;
+ work[i__1].r = scale, work[i__1].i = 0.f;
+ }
+
+/* Copy the vector x or Q*x to VR and normalize. */
+
+ if (! over) {
+ ccopy_(&ki, &work[1], &c__1, &vr[is * vr_dim1 + 1], &c__1);
+
+ ii = icamax_(&ki, &vr[is * vr_dim1 + 1], &c__1);
+ i__1 = ii + is * vr_dim1;
+ remax = 1.f / ((r__1 = vr[i__1].r, dabs(r__1)) + (r__2 =
+ r_imag(&vr[ii + is * vr_dim1]), dabs(r__2)));
+ csscal_(&ki, &remax, &vr[is * vr_dim1 + 1], &c__1);
+
+ i__1 = *n;
+ for (k = ki + 1; k <= i__1; ++k) {
+ i__2 = k + is * vr_dim1;
+ vr[i__2].r = 0.f, vr[i__2].i = 0.f;
+/* L60: */
+ }
+ } else {
+ if (ki > 1) {
+ i__1 = ki - 1;
+ q__1.r = scale, q__1.i = 0.f;
+ cgemv_("N", n, &i__1, &c_b57, &vr[vr_offset], ldvr, &work[
+ 1], &c__1, &q__1, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+
+ ii = icamax_(n, &vr[ki * vr_dim1 + 1], &c__1);
+ i__1 = ii + ki * vr_dim1;
+ remax = 1.f / ((r__1 = vr[i__1].r, dabs(r__1)) + (r__2 =
+ r_imag(&vr[ii + ki * vr_dim1]), dabs(r__2)));
+ csscal_(n, &remax, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+
+/* Set back the original diagonal elements of T. */
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k + k * t_dim1;
+ i__3 = k + *n;
+ t[i__2].r = work[i__3].r, t[i__2].i = work[i__3].i;
+/* L70: */
+ }
+
+ --is;
+L80:
+ ;
+ }
+ }
+
+ if (leftv) {
+
+/* Compute left eigenvectors. */
+
+ is = 1;
+ i__1 = *n;
+ for (ki = 1; ki <= i__1; ++ki) {
+
+ if (somev) {
+ if (! select[ki]) {
+ goto L130;
+ }
+ }
+/* Computing MAX */
+ i__2 = ki + ki * t_dim1;
+ r__3 = ulp * ((r__1 = t[i__2].r, dabs(r__1)) + (r__2 = r_imag(&t[
+ ki + ki * t_dim1]), dabs(r__2)));
+ smin = dmax(r__3,smlnum);
+
+ i__2 = *n;
+ work[i__2].r = 1.f, work[i__2].i = 0.f;
+
+/* Form right-hand side. */
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ i__3 = k;
+ r_cnjg(&q__2, &t[ki + k * t_dim1]);
+ q__1.r = -q__2.r, q__1.i = -q__2.i;
+ work[i__3].r = q__1.r, work[i__3].i = q__1.i;
+/* L90: */
+ }
+
+/*
+ Solve the triangular system:
+ (T(KI+1:N,KI+1:N) - T(KI,KI))'*X = SCALE*WORK.
+*/
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ i__3 = k + k * t_dim1;
+ i__4 = k + k * t_dim1;
+ i__5 = ki + ki * t_dim1;
+ q__1.r = t[i__4].r - t[i__5].r, q__1.i = t[i__4].i - t[i__5]
+ .i;
+ t[i__3].r = q__1.r, t[i__3].i = q__1.i;
+ i__3 = k + k * t_dim1;
+ if ((r__1 = t[i__3].r, dabs(r__1)) + (r__2 = r_imag(&t[k + k *
+ t_dim1]), dabs(r__2)) < smin) {
+ i__4 = k + k * t_dim1;
+ t[i__4].r = smin, t[i__4].i = 0.f;
+ }
+/* L100: */
+ }
+
+ if (ki < *n) {
+ i__2 = *n - ki;
+ clatrs_("Upper", "Conjugate transpose", "Non-unit", "Y", &
+ i__2, &t[ki + 1 + (ki + 1) * t_dim1], ldt, &work[ki +
+ 1], &scale, &rwork[1], info);
+ i__2 = ki;
+ work[i__2].r = scale, work[i__2].i = 0.f;
+ }
+
+/* Copy the vector x or Q*x to VL and normalize. */
+
+ if (! over) {
+ i__2 = *n - ki + 1;
+ ccopy_(&i__2, &work[ki], &c__1, &vl[ki + is * vl_dim1], &c__1)
+ ;
+
+ i__2 = *n - ki + 1;
+ ii = icamax_(&i__2, &vl[ki + is * vl_dim1], &c__1) + ki - 1;
+ i__2 = ii + is * vl_dim1;
+ remax = 1.f / ((r__1 = vl[i__2].r, dabs(r__1)) + (r__2 =
+ r_imag(&vl[ii + is * vl_dim1]), dabs(r__2)));
+ i__2 = *n - ki + 1;
+ csscal_(&i__2, &remax, &vl[ki + is * vl_dim1], &c__1);
+
+ i__2 = ki - 1;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + is * vl_dim1;
+ vl[i__3].r = 0.f, vl[i__3].i = 0.f;
+/* L110: */
+ }
+ } else {
+ if (ki < *n) {
+ i__2 = *n - ki;
+ q__1.r = scale, q__1.i = 0.f;
+ cgemv_("N", n, &i__2, &c_b57, &vl[(ki + 1) * vl_dim1 + 1],
+ ldvl, &work[ki + 1], &c__1, &q__1, &vl[ki *
+ vl_dim1 + 1], &c__1);
+ }
+
+ ii = icamax_(n, &vl[ki * vl_dim1 + 1], &c__1);
+ i__2 = ii + ki * vl_dim1;
+ remax = 1.f / ((r__1 = vl[i__2].r, dabs(r__1)) + (r__2 =
+ r_imag(&vl[ii + ki * vl_dim1]), dabs(r__2)));
+ csscal_(n, &remax, &vl[ki * vl_dim1 + 1], &c__1);
+ }
+
+/* Set back the original diagonal elements of T. */
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ i__3 = k + k * t_dim1;
+ i__4 = k + *n;
+ t[i__3].r = work[i__4].r, t[i__3].i = work[i__4].i;
+/* L120: */
+ }
+
+ ++is;
+L130:
+ ;
+ }
+ }
+
+ return 0;
+
+/* End of CTREVC */
+
+} /* ctrevc_ */
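+
+/*
+   A minimal, hypothetical usage sketch for ctrevc_, assuming the f2c
+   typedefs integer/real/complex/logical from f2c.h; the example function
+   name and values are illustrative only, and the block is guarded by
+   "#if 0" so it is never compiled.  SIDE = 'R' with HOWMNY = 'A' requests
+   all right eigenvectors, so SELECT and VL are not referenced.
+*/
+#if 0
+static void example_ctrevc_usage(void)
+{
+    /* Upper triangular T = [1 1; 0 2], stored column-major. */
+    complex t[4] = {{1.f, 0.f}, {0.f, 0.f}, {1.f, 0.f}, {2.f, 0.f}};
+    complex vr[4];              /* right eigenvectors, one per column */
+    complex vl[1];              /* not referenced when SIDE = 'R'     */
+    complex work[4];            /* workspace of size 2*N              */
+    real rwork[2];              /* workspace of size N                */
+    logical select[2];          /* not referenced when HOWMNY = 'A'   */
+    integer n = 2, ldt = 2, ldvl = 1, ldvr = 2, mm = 2, m = 0, info = 0;
+
+    ctrevc_("R", "A", select, &n, t, &ldt, vl, &ldvl, vr, &ldvr,
+            &mm, &m, work, rwork, &info);
+    /* On return info == 0, m == 2, and each column of vr holds a right
+       eigenvector of T scaled so its largest entry has magnitude 1. */
+}
+#endif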
+
+/* Subroutine */ int ctrexc_(char *compq, integer *n, complex *t, integer *
+ ldt, complex *q, integer *ldq, integer *ifst, integer *ilst, integer *
+ info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, t_dim1, t_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer k, m1, m2, m3;
+ static real cs;
+ static complex t11, t22, sn, temp;
+ extern /* Subroutine */ int crot_(integer *, complex *, integer *,
+ complex *, integer *, real *, complex *);
+ extern logical lsame_(char *, char *);
+ static logical wantq;
+ extern /* Subroutine */ int clartg_(complex *, complex *, real *, complex
+ *, complex *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CTREXC reorders the Schur factorization of a complex matrix
+ A = Q*T*Q**H, so that the diagonal element of T with row index IFST
+ is moved to row ILST.
+
+ The Schur form T is reordered by a unitary similarity transformation
+ Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
+ postmultiplying it with Z.
+
+ Arguments
+ =========
+
+ COMPQ (input) CHARACTER*1
+ = 'V': update the matrix Q of Schur vectors;
+ = 'N': do not update Q.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) COMPLEX array, dimension (LDT,N)
+ On entry, the upper triangular matrix T.
+ On exit, the reordered upper triangular matrix.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ Q (input/output) COMPLEX array, dimension (LDQ,N)
+ On entry, if COMPQ = 'V', the matrix Q of Schur vectors.
+ On exit, if COMPQ = 'V', Q has been postmultiplied by the
+ unitary transformation matrix Z which reorders T.
+ If COMPQ = 'N', Q is not referenced.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ IFST (input) INTEGER
+ ILST (input) INTEGER
+ Specify the reordering of the diagonal elements of T:
+ The element with row index IFST is moved to row ILST by a
+ sequence of transpositions between adjacent elements.
+ 1 <= IFST <= N; 1 <= ILST <= N.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Decode and test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(compq, "V");
+ if (! lsame_(compq, "N") && ! wantq) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldt < max(1,*n)) {
+ *info = -4;
+ } else if (*ldq < 1 || wantq && *ldq < max(1,*n)) {
+ *info = -6;
+ } else if (*ifst < 1 || *ifst > *n) {
+ *info = -7;
+ } else if (*ilst < 1 || *ilst > *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CTREXC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 1 || *ifst == *ilst) {
+ return 0;
+ }
+
+ if (*ifst < *ilst) {
+
+/* Move the IFST-th diagonal element forward down the diagonal. */
+
+ m1 = 0;
+ m2 = -1;
+ m3 = 1;
+ } else {
+
+/* Move the IFST-th diagonal element backward up the diagonal. */
+
+ m1 = -1;
+ m2 = 0;
+ m3 = -1;
+ }
+
+ i__1 = *ilst + m2;
+ i__2 = m3;
+ for (k = *ifst + m1; i__2 < 0 ? k >= i__1 : k <= i__1; k += i__2) {
+
+/* Interchange the k-th and (k+1)-th diagonal elements. */
+
+ i__3 = k + k * t_dim1;
+ t11.r = t[i__3].r, t11.i = t[i__3].i;
+ i__3 = k + 1 + (k + 1) * t_dim1;
+ t22.r = t[i__3].r, t22.i = t[i__3].i;
+
+/* Determine the transformation to perform the interchange. */
+
+ q__1.r = t22.r - t11.r, q__1.i = t22.i - t11.i;
+ clartg_(&t[k + (k + 1) * t_dim1], &q__1, &cs, &sn, &temp);
+
+/* Apply transformation to the matrix T. */
+
+ if (k + 2 <= *n) {
+ i__3 = *n - k - 1;
+ crot_(&i__3, &t[k + (k + 2) * t_dim1], ldt, &t[k + 1 + (k + 2) *
+ t_dim1], ldt, &cs, &sn);
+ }
+ i__3 = k - 1;
+ r_cnjg(&q__1, &sn);
+ crot_(&i__3, &t[k * t_dim1 + 1], &c__1, &t[(k + 1) * t_dim1 + 1], &
+ c__1, &cs, &q__1);
+
+ i__3 = k + k * t_dim1;
+ t[i__3].r = t22.r, t[i__3].i = t22.i;
+ i__3 = k + 1 + (k + 1) * t_dim1;
+ t[i__3].r = t11.r, t[i__3].i = t11.i;
+
+ if (wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ r_cnjg(&q__1, &sn);
+ crot_(n, &q[k * q_dim1 + 1], &c__1, &q[(k + 1) * q_dim1 + 1], &
+ c__1, &cs, &q__1);
+ }
+
+/* L10: */
+ }
+
+ return 0;
+
+/* End of CTREXC */
+
+} /* ctrexc_ */
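+
+/*
+   A minimal, hypothetical usage sketch for ctrexc_, assuming the f2c
+   typedefs integer/complex from f2c.h; the example function name and
+   values are illustrative only, and the block is guarded by "#if 0" so it
+   is never compiled.  With COMPQ = 'N' the matrix Q is not referenced.
+*/
+#if 0
+static void example_ctrexc_usage(void)
+{
+    /* Upper triangular T = [1 1; 0 2]; move the (1,1) entry to row 2. */
+    complex t[4] = {{1.f, 0.f}, {0.f, 0.f}, {1.f, 0.f}, {2.f, 0.f}};
+    complex q[1];               /* not referenced when COMPQ = 'N' */
+    integer n = 2, ldt = 2, ldq = 1, ifst = 1, ilst = 2, info = 0;
+
+    ctrexc_("N", &n, t, &ldt, q, &ldq, &ifst, &ilst, &info);
+    /* On return info == 0, T is still upper triangular and its diagonal
+       has been reordered to (2, 1). */
+}
+#endif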
+
+/* Subroutine */ int ctrti2_(char *uplo, char *diag, integer *n, complex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ complex q__1;
+
+ /* Local variables */
+ static integer j;
+ static complex ajj;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *,
+ complex *, integer *, complex *, integer *), xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CTRTI2 computes the inverse of a complex upper or lower triangular
+ matrix.
+
+ This is the Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the matrix A is upper or lower triangular.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ DIAG (input) CHARACTER*1
+ Specifies whether or not the matrix A is unit triangular.
+ = 'N': Non-unit triangular
+ = 'U': Unit triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading n by n upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CTRTI2", &i__1);
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ c_div(&q__1, &c_b57, &a[j + j * a_dim1]);
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + j * a_dim1;
+ q__1.r = -a[i__2].r, q__1.i = -a[i__2].i;
+ ajj.r = q__1.r, ajj.i = q__1.i;
+ } else {
+ q__1.r = -1.f, q__1.i = -0.f;
+ ajj.r = q__1.r, ajj.i = q__1.i;
+ }
+
+/* Compute elements 1:j-1 of j-th column. */
+
+ i__2 = j - 1;
+ ctrmv_("Upper", "No transpose", diag, &i__2, &a[a_offset], lda, &
+ a[j * a_dim1 + 1], &c__1);
+ i__2 = j - 1;
+ cscal_(&i__2, &ajj, &a[j * a_dim1 + 1], &c__1);
+/* L10: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix. */
+
+ for (j = *n; j >= 1; --j) {
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ c_div(&q__1, &c_b57, &a[j + j * a_dim1]);
+ a[i__1].r = q__1.r, a[i__1].i = q__1.i;
+ i__1 = j + j * a_dim1;
+ q__1.r = -a[i__1].r, q__1.i = -a[i__1].i;
+ ajj.r = q__1.r, ajj.i = q__1.i;
+ } else {
+ q__1.r = -1.f, q__1.i = -0.f;
+ ajj.r = q__1.r, ajj.i = q__1.i;
+ }
+ if (j < *n) {
+
+/* Compute elements j+1:n of j-th column. */
+
+ i__1 = *n - j;
+ ctrmv_("Lower", "No transpose", diag, &i__1, &a[j + 1 + (j +
+ 1) * a_dim1], lda, &a[j + 1 + j * a_dim1], &c__1);
+ i__1 = *n - j;
+ cscal_(&i__1, &ajj, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of CTRTI2 */
+
+} /* ctrti2_ */
+
+/* Subroutine */ int ctrtri_(char *uplo, char *diag, integer *n, complex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, i__1, i__2, i__3[2], i__4, i__5;
+ complex q__1;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer j, jb, nb, nn;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *,
+ integer *, integer *, complex *, complex *, integer *, complex *,
+ integer *), ctrsm_(char *, char *,
+ char *, char *, integer *, integer *, complex *, complex *,
+ integer *, complex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int ctrti2_(char *, char *, integer *, complex *,
+ integer *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CTRTRI computes the inverse of a complex upper or lower triangular
+ matrix A.
+
+ This is the Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': A is upper triangular;
+ = 'L': A is lower triangular.
+
+ DIAG (input) CHARACTER*1
+ = 'N': A is non-unit triangular;
+ = 'U': A is unit triangular.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, A(i,i) is exactly zero. The triangular
+ matrix is singular and its inverse cannot be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CTRTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Check for singularity if non-unit. */
+
+ if (nounit) {
+ i__1 = *n;
+ for (*info = 1; *info <= i__1; ++(*info)) {
+ i__2 = *info + *info * a_dim1;
+ if (a[i__2].r == 0.f && a[i__2].i == 0.f) {
+ return 0;
+ }
+/* L10: */
+ }
+ *info = 0;
+ }
+
+/*
+ Determine the block size for this environment.
+
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = uplo;
+ i__3[1] = 1, a__1[1] = diag;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "CTRTRI", ch__1, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)2);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ ctrti2_(uplo, diag, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *n - j + 1;
+ jb = min(i__4,i__5);
+
+/* Compute rows 1:j-1 of current block column */
+
+ i__4 = j - 1;
+ ctrmm_("Left", "Upper", "No transpose", diag, &i__4, &jb, &
+ c_b57, &a[a_offset], lda, &a[j * a_dim1 + 1], lda);
+ i__4 = j - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ ctrsm_("Right", "Upper", "No transpose", diag, &i__4, &jb, &
+ q__1, &a[j + j * a_dim1], lda, &a[j * a_dim1 + 1],
+ lda);
+
+/* Compute inverse of current diagonal block */
+
+ ctrti2_("Upper", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L20: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix */
+
+ nn = (*n - 1) / nb * nb + 1;
+ i__2 = -nb;
+ for (j = nn; i__2 < 0 ? j >= 1 : j <= 1; j += i__2) {
+/* Computing MIN */
+ i__1 = nb, i__4 = *n - j + 1;
+ jb = min(i__1,i__4);
+ if (j + jb <= *n) {
+
+/* Compute rows j+jb:n of current block column */
+
+ i__1 = *n - j - jb + 1;
+ ctrmm_("Left", "Lower", "No transpose", diag, &i__1, &jb,
+ &c_b57, &a[j + jb + (j + jb) * a_dim1], lda, &a[j
+ + jb + j * a_dim1], lda);
+ i__1 = *n - j - jb + 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ ctrsm_("Right", "Lower", "No transpose", diag, &i__1, &jb,
+ &q__1, &a[j + j * a_dim1], lda, &a[j + jb + j *
+ a_dim1], lda);
+ }
+
+/* Compute inverse of current diagonal block */
+
+ ctrti2_("Lower", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L30: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of CTRTRI */
+
+} /* ctrtri_ */
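+
+/*
+   A minimal, hypothetical usage sketch for ctrtri_, assuming the f2c
+   typedefs integer/complex from f2c.h; the example function name and
+   values are illustrative only, and the block is guarded by "#if 0" so it
+   is never compiled.  The unblocked routine ctrti2_ above takes the same
+   argument list, so the call looks identical for it.
+*/
+#if 0
+static void example_ctrtri_usage(void)
+{
+    /* Upper triangular A = [2 1; 0 4], stored column-major. */
+    complex a[4] = {{2.f, 0.f}, {0.f, 0.f}, {1.f, 0.f}, {4.f, 0.f}};
+    integer n = 2, lda = 2, info = 0;
+
+    ctrtri_("U", "N", &n, a, &lda, &info);
+    /* On return info == 0 and a holds the inverse [0.5 -0.125; 0 0.25]
+       in the same triangular storage. */
+}
+#endif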
+
+/* Subroutine */ int cung2r_(integer *m, integer *n, integer *k, complex *a,
+ integer *lda, complex *tau, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *), clarf_(char *, integer *, integer *, complex *,
+ integer *, complex *, complex *, integer *, complex *),
+ xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNG2R generates an m by n complex matrix Q with orthonormal columns,
+ which is defined as the first n columns of a product of k elementary
+ reflectors of order m
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by CGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by CGEQRF in the first k columns of its array
+ argument A.
+ On exit, the m by n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEQRF.
+
+ WORK (workspace) COMPLEX array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNG2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+/* Initialise columns k+1:n to columns of the unit matrix */
+
+ i__1 = *n;
+ for (j = *k + 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = l + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L10: */
+ }
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+/* L20: */
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i) to A(i:m,i:n) from the left */
+
+ if (i__ < *n) {
+ i__1 = i__ + i__ * a_dim1;
+ a[i__1].r = 1.f, a[i__1].i = 0.f;
+ i__1 = *m - i__ + 1;
+ i__2 = *n - i__;
+ clarf_("Left", &i__1, &i__2, &a[i__ + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ }
+ if (i__ < *m) {
+ i__1 = *m - i__;
+ i__2 = i__;
+ q__1.r = -tau[i__2].r, q__1.i = -tau[i__2].i;
+ cscal_(&i__1, &q__1, &a[i__ + 1 + i__ * a_dim1], &c__1);
+ }
+ i__1 = i__ + i__ * a_dim1;
+ i__2 = i__;
+ q__1.r = 1.f - tau[i__2].r, q__1.i = 0.f - tau[i__2].i;
+ a[i__1].r = q__1.r, a[i__1].i = q__1.i;
+
+/* Set A(1:i-1,i) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ i__2 = l + i__ * a_dim1;
+ a[i__2].r = 0.f, a[i__2].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of CUNG2R */
+
+} /* cung2r_ */
+
+/* Subroutine */ int cungbr_(char *vect, integer *m, integer *n, integer *k,
+ complex *a, integer *lda, complex *tau, complex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, nb, mn;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical wantq;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cunglq_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, integer *),
+ cungqr_(integer *, integer *, integer *, complex *, integer *,
+ complex *, complex *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNGBR generates one of the complex unitary matrices Q or P**H
+ determined by CGEBRD when reducing a complex matrix A to bidiagonal
+ form: A = Q * B * P**H. Q and P**H are defined as products of
+ elementary reflectors H(i) or G(i) respectively.
+
+ If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+ is of order M:
+ if m >= k, Q = H(1) H(2) . . . H(k) and CUNGBR returns the first n
+ columns of Q, where m >= n >= k;
+ if m < k, Q = H(1) H(2) . . . H(m-1) and CUNGBR returns Q as an
+ M-by-M matrix.
+
+ If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**H
+ is of order N:
+ if k < n, P**H = G(k) . . . G(2) G(1) and CUNGBR returns the first m
+ rows of P**H, where n >= m >= k;
+ if k >= n, P**H = G(n-1) . . . G(2) G(1) and CUNGBR returns P**H as
+ an N-by-N matrix.
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ Specifies whether the matrix Q or the matrix P**H is
+ required, as defined in the transformation applied by CGEBRD:
+ = 'Q': generate Q;
+ = 'P': generate P**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix Q or P**H to be returned.
+ M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q or P**H to be returned.
+ N >= 0.
+ If VECT = 'Q', M >= N >= min(M,K);
+ if VECT = 'P', N >= M >= min(N,K).
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original M-by-K
+ matrix reduced by CGEBRD.
+ If VECT = 'P', the number of rows in the original K-by-N
+ matrix reduced by CGEBRD.
+ K >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by CGEBRD.
+ On exit, the M-by-N matrix Q or P**H.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= M.
+
+ TAU (input) COMPLEX array, dimension
+ (min(M,K)) if VECT = 'Q'
+ (min(N,K)) if VECT = 'P'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i), which determines Q or P**H, as
+ returned by CGEBRD in its array argument TAUQ or TAUP.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,min(M,N)).
+ For optimum performance LWORK >= min(M,N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(vect, "Q");
+ mn = min(*m,*n);
+ lquery = *lwork == -1;
+ if (! wantq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0 || wantq && (*n > *m || *n < min(*m,*k)) || ! wantq && (
+ *m > *n || *m < min(*n,*k))) {
+ *info = -3;
+ } else if (*k < 0) {
+ *info = -4;
+ } else if (*lda < max(1,*m)) {
+ *info = -6;
+ } else if (*lwork < max(1,mn) && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+ if (wantq) {
+ nb = ilaenv_(&c__1, "CUNGQR", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ } else {
+ nb = ilaenv_(&c__1, "CUNGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ }
+ lwkopt = max(1,mn) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNGBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ if (wantq) {
+
+/*
+ Form Q, determined by a call to CGEBRD to reduce an m-by-k
+ matrix
+*/
+
+ if (*m >= *k) {
+
+/* If m >= k, assume m >= n >= k */
+
+ cungqr_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If m < k, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first row and column of Q
+ to those of the unit matrix
+*/
+
+ for (j = *m; j >= 2; --j) {
+ i__1 = j * a_dim1 + 1;
+ a[i__1].r = 0.f, a[i__1].i = 0.f;
+ i__1 = *m;
+ for (i__ = j + 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__ + (j - 1) * a_dim1;
+ a[i__2].r = a[i__3].r, a[i__2].i = a[i__3].i;
+/* L10: */
+ }
+/* L20: */
+ }
+ i__1 = a_dim1 + 1;
+ a[i__1].r = 1.f, a[i__1].i = 0.f;
+ i__1 = *m;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ i__2 = i__ + a_dim1;
+ a[i__2].r = 0.f, a[i__2].i = 0.f;
+/* L30: */
+ }
+ if (*m > 1) {
+
+/* Form Q(2:m,2:m) */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ cungqr_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ } else {
+
+/*
+ Form P', determined by a call to CGEBRD to reduce a k-by-n
+ matrix
+*/
+
+ if (*k < *n) {
+
+/* If k < n, assume k <= m <= n */
+
+ cunglq_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If k >= n, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ row downward, and set the first row and column of P' to
+ those of the unit matrix
+*/
+
+ i__1 = a_dim1 + 1;
+ a[i__1].r = 1.f, a[i__1].i = 0.f;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ i__2 = i__ + a_dim1;
+ a[i__2].r = 0.f, a[i__2].i = 0.f;
+/* L40: */
+ }
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ for (i__ = j - 1; i__ >= 2; --i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__ - 1 + j * a_dim1;
+ a[i__2].r = a[i__3].r, a[i__2].i = a[i__3].i;
+/* L50: */
+ }
+ i__2 = j * a_dim1 + 1;
+ a[i__2].r = 0.f, a[i__2].i = 0.f;
+/* L60: */
+ }
+ if (*n > 1) {
+
+/* Form P'(2:n,2:n) */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ cunglq_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNGBR */
+
+} /* cungbr_ */
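+
+/*
+   Illustrative sketch of the two CUNGBR modes documented above, assuming an
+   m-by-n matrix has already been reduced to bidiagonal form by CGEBRD (so the
+   reflectors and the scalar factors tauq/taup are available).  All names here
+   are caller-supplied placeholders; aq and ap are square work arrays, large
+   enough for the m-by-m and n-by-n results and pre-loaded with copies of the
+   CGEBRD output, since each call overwrites its input.  work/lwork follow the
+   LWORK conventions above (an LWORK = -1 query also works here).
+
+       // VECT = "Q": build the full m-by-m Q from the k = n column reflectors
+       cungbr_("Q", &m, &m, &n, aq, &ldaq, tauq, work, &lwork, &info);
+       // VECT = "P": build the full n-by-n P**H from the k = m row reflectors
+       cungbr_("P", &n, &n, &m, ap, &ldap, taup, work, &lwork, &info);
+*/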
+
+/* Subroutine */ int cunghr_(integer *n, integer *ilo, integer *ihi, complex *
+ a, integer *lda, complex *tau, complex *work, integer *lwork, integer
+ *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, nb, nh, iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cungqr_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNGHR generates a complex unitary matrix Q which is defined as the
+ product of IHI-ILO elementary reflectors of order N, as returned by
+ CGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix Q. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of CGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by CGEHRD.
+ On exit, the N-by-N unitary matrix Q.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (input) COMPLEX array, dimension (N-1)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEHRD.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= IHI-ILO.
+ For optimum performance LWORK >= (IHI-ILO)*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,nh) && ! lquery) {
+ *info = -8;
+ }
+
+ if (*info == 0) {
+ nb = ilaenv_(&c__1, "CUNGQR", " ", &nh, &nh, &nh, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ lwkopt = max(1,nh) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNGHR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+/*
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first ilo and the last n-ihi
+ rows and columns to those of the unit matrix
+*/
+
+ i__1 = *ilo + 1;
+ for (j = *ihi; j >= i__1; --j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L10: */
+ }
+ i__2 = *ihi;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + (j - 1) * a_dim1;
+ a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i;
+/* L20: */
+ }
+ i__2 = *n;
+ for (i__ = *ihi + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+ i__1 = *ilo;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L50: */
+ }
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+/* L60: */
+ }
+ i__1 = *n;
+ for (j = *ihi + 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L70: */
+ }
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+/* L80: */
+ }
+
+ if (nh > 0) {
+
+/* Generate Q(ilo+1:ihi,ilo+1:ihi) */
+
+ cungqr_(&nh, &nh, &nh, &a[*ilo + 1 + (*ilo + 1) * a_dim1], lda, &tau[*
+ ilo], &work[1], lwork, &iinfo);
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNGHR */
+
+} /* cunghr_ */
+
+/* Subroutine */ int cungl2_(integer *m, integer *n, integer *k, complex *a,
+ integer *lda, complex *tau, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ complex q__1, q__2;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int cscal_(integer *, complex *, complex *,
+ integer *), clarf_(char *, integer *, integer *, complex *,
+ integer *, complex *, complex *, integer *, complex *),
+ clacgv_(integer *, complex *, integer *), xerbla_(char *, integer
+ *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNGL2 generates an m-by-n complex matrix Q with orthonormal rows,
+ which is defined as the first m rows of a product of k elementary
+ reflectors of order n
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by CGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by CGELQF in the first k rows of its array argument A.
+ On exit, the m by n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGELQF.
+
+ WORK (workspace) COMPLEX array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNGL2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ return 0;
+ }
+
+ if (*k < *m) {
+
+/* Initialise rows k+1:m to rows of the unit matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = *k + 1; l <= i__2; ++l) {
+ i__3 = l + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L10: */
+ }
+ if (j > *k && j <= *m) {
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1.f, a[i__2].i = 0.f;
+ }
+/* L20: */
+ }
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i)' to A(i:m,i:n) from the right */
+
+ if (i__ < *n) {
+ i__1 = *n - i__;
+ clacgv_(&i__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ if (i__ < *m) {
+ i__1 = i__ + i__ * a_dim1;
+ a[i__1].r = 1.f, a[i__1].i = 0.f;
+ i__1 = *m - i__;
+ i__2 = *n - i__ + 1;
+ r_cnjg(&q__1, &tau[i__]);
+ clarf_("Right", &i__1, &i__2, &a[i__ + i__ * a_dim1], lda, &
+ q__1, &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__1 = *n - i__;
+ i__2 = i__;
+ q__1.r = -tau[i__2].r, q__1.i = -tau[i__2].i;
+ cscal_(&i__1, &q__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__1 = *n - i__;
+ clacgv_(&i__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+ i__1 = i__ + i__ * a_dim1;
+ r_cnjg(&q__2, &tau[i__]);
+ q__1.r = 1.f - q__2.r, q__1.i = 0.f - q__2.i;
+ a[i__1].r = q__1.r, a[i__1].i = q__1.i;
+
+/* Set A(i,1:i-1) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ i__2 = i__ + l * a_dim1;
+ a[i__2].r = 0.f, a[i__2].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of CUNGL2 */
+
+} /* cungl2_ */
+
+/* Subroutine */ int cunglq_(integer *m, integer *n, integer *k, complex *a,
+ integer *lda, complex *tau, complex *work, integer *lwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int cungl2_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *), clarfb_(
+ char *, char *, char *, char *, integer *, integer *, integer *,
+ complex *, integer *, complex *, integer *, complex *, integer *,
+ complex *, integer *), clarft_(
+ char *, char *, integer *, integer *, complex *, integer *,
+ complex *, complex *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNGLQ generates an M-by-N complex matrix Q with orthonormal rows,
+ which is defined as the first M rows of a product of K elementary
+ reflectors of order N
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by CGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by CGELQF in the first k rows of its array argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGELQF.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit;
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "CUNGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*m) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNGLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "CUNGLQ", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CUNGLQ", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk rows are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(kk+1:m,1:kk) to zero. */
+
+ i__1 = kk;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = kk + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *m) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ cungl2_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *n - i__ + 1;
+ clarft_("Forward", "Rowwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i+ib:m,i:n) from the right */
+
+ i__2 = *m - i__ - ib + 1;
+ i__3 = *n - i__ + 1;
+ clarfb_("Right", "Conjugate transpose", "Forward", "Rowwise",
+ &i__2, &i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[
+ 1], &ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[
+ ib + 1], &ldwork);
+ }
+
+/* Apply H' to columns i:n of current block */
+
+ i__2 = *n - i__ + 1;
+ cungl2_(&ib, &i__2, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set columns 1:i-1 of current block to zero */
+
+ i__2 = i__ - 1;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = i__ + ib - 1;
+ for (l = i__; l <= i__3; ++l) {
+ i__4 = l + j * a_dim1;
+ a[i__4].r = 0.f, a[i__4].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1].r = (real) iws, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNGLQ */
+
+} /* cunglq_ */
+
+/* Subroutine */ int cungqr_(integer *m, integer *n, integer *k, complex *a,
+ integer *lda, complex *tau, complex *work, integer *lwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int cung2r_(integer *, integer *, integer *,
+ complex *, integer *, complex *, complex *, integer *), clarfb_(
+ char *, char *, char *, char *, integer *, integer *, integer *,
+ complex *, integer *, complex *, integer *, complex *, integer *,
+ complex *, integer *), clarft_(
+ char *, char *, integer *, integer *, complex *, integer *,
+ complex *, complex *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNGQR generates an M-by-N complex matrix Q with orthonormal columns,
+ which is defined as the first N columns of a product of K elementary
+ reflectors of order M
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by CGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) COMPLEX array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by CGEQRF in the first k columns of its array
+ argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEQRF.
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "CUNGQR", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*n) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNGQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "CUNGQR", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CUNGQR", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk columns are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(1:kk,kk+1:n) to zero. */
+
+ i__1 = *n;
+ for (j = kk + 1; j <= i__1; ++j) {
+ i__2 = kk;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0.f, a[i__3].i = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *n) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ cung2r_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *m - i__ + 1;
+ clarft_("Forward", "Columnwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i:m,i+ib:n) from the left */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__ - ib + 1;
+ clarfb_("Left", "No transpose", "Forward", "Columnwise", &
+ i__2, &i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[
+ 1], &ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &
+ work[ib + 1], &ldwork);
+ }
+
+/* Apply H to rows i:m of current block */
+
+ i__2 = *m - i__ + 1;
+ cung2r_(&i__2, &ib, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set rows 1:i-1 of current block to zero */
+
+ i__2 = i__ + ib - 1;
+ for (j = i__; j <= i__2; ++j) {
+ i__3 = i__ - 1;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + j * a_dim1;
+ a[i__4].r = 0.f, a[i__4].i = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1].r = (real) iws, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNGQR */
+
+} /* cungqr_ */
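+
+/*
+   Usage sketch for the CGEQRF/CUNGQR pair documented above (illustrative
+   only).  cgeqrf_ is the standard LAPACK QR factorization, assumed to be
+   available from the same library; m, n, a and lda are placeholders for
+   caller-supplied data, and <stdlib.h> is assumed for malloc/free.  The
+   LWORK = -1 query returns the optimal workspace size in WORK(1), as
+   described in the comments above.
+
+       integer info, lwork;
+       complex wkopt, *work, *tau;
+       integer k = min(m,n);
+       tau = (complex *) malloc(k * sizeof(complex));
+       lwork = -1;
+       cgeqrf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);   // workspace query
+       lwork = (integer) wkopt.r;
+       work = (complex *) malloc(lwork * sizeof(complex));
+       cgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);     // A = Q*R
+       cungqr_(&m, &k, &k, a, &lda, tau, work, &lwork, &info); // A(:,1:k) := Q
+       free(work);
+       free(tau);
+*/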
+
+/* Subroutine */ int cunm2l_(char *side, char *trans, integer *m, integer *n,
+ integer *k, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, mi, ni, nq;
+ static complex aii;
+ static logical left;
+ static complex taui;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNM2L overwrites the general complex m-by-n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'C', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'C',
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by CGEQLF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q' (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ CGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEQLF.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the m-by-n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNM2L", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) or H(i)' is applied to C(1:m-k+i,1:n) */
+
+ mi = *m - *k + i__;
+ } else {
+
+/* H(i) or H(i)' is applied to C(1:m,1:n-k+i) */
+
+ ni = *n - *k + i__;
+ }
+
+/* Apply H(i) or H(i)' */
+
+ if (notran) {
+ i__3 = i__;
+ taui.r = tau[i__3].r, taui.i = tau[i__3].i;
+ } else {
+ r_cnjg(&q__1, &tau[i__]);
+ taui.r = q__1.r, taui.i = q__1.i;
+ }
+ i__3 = nq - *k + i__ + i__ * a_dim1;
+ aii.r = a[i__3].r, aii.i = a[i__3].i;
+ i__3 = nq - *k + i__ + i__ * a_dim1;
+ a[i__3].r = 1.f, a[i__3].i = 0.f;
+ clarf_(side, &mi, &ni, &a[i__ * a_dim1 + 1], &c__1, &taui, &c__[
+ c_offset], ldc, &work[1]);
+ i__3 = nq - *k + i__ + i__ * a_dim1;
+ a[i__3].r = aii.r, a[i__3].i = aii.i;
+/* L10: */
+ }
+ return 0;
+
+/* End of CUNM2L */
+
+} /* cunm2l_ */
+
+/* Subroutine */ int cunm2r_(char *side, char *trans, integer *m, integer *n,
+ integer *k, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static complex aii;
+ static logical left;
+ static complex taui;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNM2R overwrites the general complex m-by-n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'C', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'C',
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by CGEQRF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q' (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ CGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEQRF.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the m-by-n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNM2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) or H(i)' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) or H(i)' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) or H(i)' */
+
+ if (notran) {
+ i__3 = i__;
+ taui.r = tau[i__3].r, taui.i = tau[i__3].i;
+ } else {
+ r_cnjg(&q__1, &tau[i__]);
+ taui.r = q__1.r, taui.i = q__1.i;
+ }
+ i__3 = i__ + i__ * a_dim1;
+ aii.r = a[i__3].r, aii.i = a[i__3].i;
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = 1.f, a[i__3].i = 0.f;
+ clarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], &c__1, &taui, &c__[ic
+ + jc * c_dim1], ldc, &work[1]);
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = aii.r, a[i__3].i = aii.i;
+/* L10: */
+ }
+ return 0;
+
+/* End of CUNM2R */
+
+} /* cunm2r_ */
+
+/* Subroutine */ int cunmbr_(char *vect, char *side, char *trans, integer *m,
+ integer *n, integer *k, complex *a, integer *lda, complex *tau,
+ complex *c__, integer *ldc, complex *work, integer *lwork, integer *
+ info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2];
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cunmlq_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *);
+ static logical notran;
+ extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *);
+ static logical applyq;
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ If VECT = 'Q', CUNMBR overwrites the general complex M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ If VECT = 'P', CUNMBR overwrites the general complex M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': P * C C * P
+ TRANS = 'C': P**H * C C * P**H
+
+ Here Q and P**H are the unitary matrices determined by CGEBRD when
+ reducing a complex matrix A to bidiagonal form: A = Q * B * P**H. Q
+ and P**H are defined as products of elementary reflectors H(i) and
+ G(i) respectively.
+
+ Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+ order of the unitary matrix Q or P**H that is applied.
+
+ If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+ if nq >= k, Q = H(1) H(2) . . . H(k);
+ if nq < k, Q = H(1) H(2) . . . H(nq-1).
+
+ If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+ if k < nq, P = G(1) G(2) . . . G(k);
+ if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ = 'Q': apply Q or Q**H;
+ = 'P': apply P or P**H.
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q, Q**H, P or P**H from the Left;
+ = 'R': apply Q, Q**H, P or P**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q or P;
+ = 'C': Conjugate transpose, apply Q**H or P**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original
+ matrix reduced by CGEBRD.
+ If VECT = 'P', the number of rows in the original
+ matrix reduced by CGEBRD.
+ K >= 0.
+
+ A (input) COMPLEX array, dimension
+ (LDA,min(nq,K)) if VECT = 'Q'
+ (LDA,nq) if VECT = 'P'
+ The vectors which define the elementary reflectors H(i) and
+ G(i), whose products determine the matrices Q and P, as
+ returned by CGEBRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If VECT = 'Q', LDA >= max(1,nq);
+ if VECT = 'P', LDA >= max(1,min(nq,K)).
+
+ TAU (input) COMPLEX array, dimension (min(nq,K))
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i) which determines Q or P, as returned
+ by CGEBRD in the array argument TAUQ or TAUP.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q
+ or P*C or P**H*C or C*P or C*P**H.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M);
+ if N = 0 or M = 0, LWORK >= 1.
+ For optimum performance LWORK >= max(1,N*NB) if SIDE = 'L',
+ and LWORK >= max(1,M*NB) if SIDE = 'R', where NB is the
+ optimal blocksize. (NB = 0 if M = 0 or N = 0.)
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ applyq = lsame_(vect, "Q");
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q or P and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (*m == 0 || *n == 0) {
+ nw = 0;
+ }
+ if (! applyq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (! left && ! lsame_(side, "R")) {
+ *info = -2;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*k < 0) {
+ *info = -6;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = 1, i__2 = min(nq,*k);
+ if (applyq && *lda < max(1,nq) || ! applyq && *lda < max(i__1,i__2)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+ }
+
+ if (*info == 0) {
+ if (nw > 0) {
+ if (applyq) {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "CUNMQR", ch__1, &i__1, n, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "CUNMQR", ch__1, m, &i__1, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "CUNMLQ", ch__1, &i__1, n, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "CUNMLQ", ch__1, m, &i__1, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ }
+ }
+/* Computing MAX */
+ i__1 = 1, i__2 = nw * nb;
+ lwkopt = max(i__1,i__2);
+ } else {
+ lwkopt = 1;
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNMBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ if (applyq) {
+
+/* Apply Q */
+
+ if (nq >= *k) {
+
+/* Q was determined by a call to CGEBRD with nq >= k */
+
+ cunmqr_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* Q was determined by a call to CGEBRD with nq < k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ cunmqr_(side, trans, &mi, &ni, &i__1, &a[a_dim1 + 2], lda, &tau[1]
+ , &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ } else {
+
+/* Apply P */
+
+ if (notran) {
+ *(unsigned char *)transt = 'C';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+ if (nq > *k) {
+
+/* P was determined by a call to CGEBRD with nq > k */
+
+ cunmlq_(side, transt, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* P was determined by a call to CGEBRD with nq <= k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ cunmlq_(side, transt, &mi, &ni, &i__1, &a[(a_dim1 << 1) + 1], lda,
+ &tau[1], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &
+ iinfo);
+ }
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNMBR */
+
+} /* cunmbr_ */
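+
+/*
+   Sketch of the "apply without forming" alternative to CUNGBR described
+   above: CUNMBR multiplies an existing matrix by Q**H (or P**H) directly
+   from the CGEBRD reflectors, so the unitary factor never has to be built
+   explicitly.  All names are caller-supplied placeholders: a and tauq come
+   from a CGEBRD reduction of an m-by-k0 matrix, c is m-by-n, and work/lwork
+   follow the LWORK conventions above.
+
+       // c := Q**H * c, using the k0 reflectors stored in the columns of a
+       cunmbr_("Q", "L", "C", &m, &n, &k0, a, &lda, tauq, c, &ldc,
+               work, &lwork, &info);
+*/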
+
+/* Subroutine */ int cunmhr_(char *side, char *trans, integer *m, integer *n,
+ integer *ilo, integer *ihi, complex *a, integer *lda, complex *tau,
+ complex *c__, integer *ldc, complex *work, integer *lwork, integer *
+ info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, nh, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNMHR overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ IHI-ILO elementary reflectors, as returned by CGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q**H (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of CGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ If SIDE = 'L', then 1 <= ILO <= IHI <= M, if M > 0, and
+ ILO = 1 and IHI = 0, if M = 0;
+ if SIDE = 'R', then 1 <= ILO <= IHI <= N, if N > 0, and
+ ILO = 1 and IHI = 0, if N = 0.
+
+ A (input) COMPLEX array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by CGEHRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) COMPLEX array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEHRD.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ left = lsame_(side, "L");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ilo < 1 || *ilo > max(1,nq)) {
+ *info = -5;
+ } else if (*ihi < min(*ilo,nq) || *ihi > nq) {
+ *info = -6;
+ } else if (*lda < max(1,nq)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+
+ if (*info == 0) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "CUNMQR", ch__1, &nh, n, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "CUNMQR", ch__1, m, &nh, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("CUNMHR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nh == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ if (left) {
+ mi = nh;
+ ni = *n;
+ i1 = *ilo + 1;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = nh;
+ i1 = 1;
+ i2 = *ilo + 1;
+ }
+
+ cunmqr_(side, trans, &mi, &ni, &nh, &a[*ilo + 1 + *ilo * a_dim1], lda, &
+ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNMHR */
+
+} /* cunmhr_ */
+
+/* Subroutine */ int cunml2_(char *side, char *trans, integer *m, integer *n,
+ integer *k, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+ complex q__1;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static complex aii;
+ static logical left;
+ static complex taui;
+ extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+ , integer *, complex *, complex *, integer *, complex *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int clacgv_(integer *, complex *, integer *),
+ xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNML2 overwrites the general complex m-by-n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'C', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'C',
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by CGELQF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q' (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ CGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGELQF.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the m-by-n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNML2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) or H(i)' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) or H(i)' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) or H(i)' */
+
+ if (notran) {
+ r_cnjg(&q__1, &tau[i__]);
+ taui.r = q__1.r, taui.i = q__1.i;
+ } else {
+ i__3 = i__;
+ taui.r = tau[i__3].r, taui.i = tau[i__3].i;
+ }
+ if (i__ < nq) {
+ i__3 = nq - i__;
+ clacgv_(&i__3, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+ i__3 = i__ + i__ * a_dim1;
+ aii.r = a[i__3].r, aii.i = a[i__3].i;
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = 1.f, a[i__3].i = 0.f;
+ clarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], lda, &taui, &c__[ic +
+ jc * c_dim1], ldc, &work[1]);
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = aii.r, a[i__3].i = aii.i;
+ if (i__ < nq) {
+ i__3 = nq - i__;
+ clacgv_(&i__3, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of CUNML2 */
+
+} /* cunml2_ */
+
+/* Subroutine */ int cunmlq_(char *side, char *trans, integer *m, integer *n,
+ integer *k, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static complex t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int cunml2_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *), clarfb_(char *, char *,
+ char *, char *, integer *, integer *, integer *, complex *,
+ integer *, complex *, integer *, complex *, integer *, complex *,
+ integer *), clarft_(char *, char *
+ , integer *, integer *, complex *, integer *, complex *, complex *
+ , integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran;
+ static integer ldwork;
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNMLQ overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by CGELQF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ CGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGELQF.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "CUNMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNMLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CUNMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ cunml2_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ if (notran) {
+ *(unsigned char *)transt = 'C';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ clarft_("Forward", "Rowwise", &i__4, &ib, &a[i__ + i__ * a_dim1],
+ lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ clarfb_(side, transt, "Forward", "Rowwise", &mi, &ni, &ib, &a[i__
+ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc * c_dim1],
+ ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNMLQ */
+
+} /* cunmlq_ */
+
+/* Subroutine */ int cunmql_(char *side, char *trans, integer *m, integer *n,
+ integer *k, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static complex t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int cunm2l_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *), clarfb_(char *, char *,
+ char *, char *, integer *, integer *, integer *, complex *,
+ integer *, complex *, integer *, complex *, integer *, complex *,
+ integer *), clarft_(char *, char *
+ , integer *, integer *, complex *, integer *, complex *, complex *
+ , integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran;
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNMQL overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by CGEQLF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ CGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEQLF.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = max(1,*n);
+ } else {
+ nq = *n;
+ nw = max(1,*m);
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+
+ if (*info == 0) {
+ if (*m == 0 || *n == 0) {
+ lwkopt = 1;
+ } else {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where
+ NBMAX is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "CUNMQL", ch__1, m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = nw * nb;
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+
+ if (*lwork < nw && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNMQL", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CUNMQL", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ cunm2l_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i+ib-1) . . . H(i+1) H(i)
+*/
+
+ i__4 = nq - *k + i__ + ib - 1;
+ clarft_("Backward", "Columnwise", &i__4, &ib, &a[i__ * a_dim1 + 1]
+ , lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
+
+ mi = *m - *k + i__ + ib - 1;
+ } else {
+
+/* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
+
+ ni = *n - *k + i__ + ib - 1;
+ }
+
+/* Apply H or H' */
+
+ clarfb_(side, trans, "Backward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ * a_dim1 + 1], lda, t, &c__65, &c__[c_offset], ldc, &
+ work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNMQL */
+
+} /* cunmql_ */
+
+/* Subroutine */ int cunmqr_(char *side, char *trans, integer *m, integer *n,
+ integer *k, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static complex t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int cunm2r_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *), clarfb_(char *, char *,
+ char *, char *, integer *, integer *, integer *, complex *,
+ integer *, complex *, integer *, complex *, integer *, complex *,
+ integer *), clarft_(char *, char *
+ , integer *, integer *, complex *, integer *, complex *, complex *
+ , integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran;
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNMQR overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ CGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CGEQRF.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "CUNMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("CUNMQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "CUNMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ cunm2r_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ clarft_("Forward", "Columnwise", &i__4, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ clarfb_(side, trans, "Forward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc *
+ c_dim1], ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNMQR */
+
+} /* cunmqr_ */
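+
+/*
+   Editor's note: the block below is an illustrative, hedged usage sketch and
+   is not part of the generated LAPACK source. It shows the LWORK = -1
+   workspace-query pattern documented for CUNMQR above: one call with
+   lwork = -1 returns the optimal workspace size in work[0].r, after which
+   the caller allocates and repeats the call. It assumes the f2c.h typedefs
+   used throughout this file (integer, real, complex), that <stdlib.h> is
+   available for malloc/free, and that a, tau and c came from an earlier
+   cgeqrf_ factorization; error handling is omitted. Kept under #if 0 so it
+   is never compiled.
+*/
+#if 0
+static void apply_q_from_left(integer m, integer n, integer k,
+                              complex *a, integer lda, complex *tau,
+                              complex *c, integer ldc)
+{
+    complex query;                /* receives the optimal LWORK in query.r */
+    complex *work;
+    integer info, lwork = -1;
+
+    /* 1. Workspace query: nothing is computed, only WORK(1) is set. */
+    cunmqr_("L", "N", &m, &n, &k, a, &lda, tau, c, &ldc, &query, &lwork, &info);
+
+    /* 2. Allocate the recommended workspace and form C := Q * C. */
+    lwork = (integer) query.r;
+    work = (complex *) malloc((size_t) lwork * sizeof(complex));
+    cunmqr_("L", "N", &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info);
+    free(work);
+}
+#endif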
+
+/* Subroutine */ int cunmtr_(char *side, char *uplo, char *trans, integer *m,
+ integer *n, complex *a, integer *lda, complex *tau, complex *c__,
+ integer *ldc, complex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2, i__3;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cunmql_(char *, char *, integer *, integer *,
+ integer *, complex *, integer *, complex *, complex *, integer *,
+ complex *, integer *, integer *), cunmqr_(char *,
+ char *, integer *, integer *, integer *, complex *, integer *,
+ complex *, complex *, integer *, complex *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ CUNMTR overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ nq-1 elementary reflectors, as returned by CHETRD:
+
+ if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+
+ if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A contains elementary reflectors
+ from CHETRD;
+ = 'L': Lower triangle of A contains elementary reflectors
+ from CHETRD.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ A (input) COMPLEX array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by CHETRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) COMPLEX array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by CHETRD.
+
+ C (input/output) COMPLEX array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ *info = -2;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+ if (upper) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "CUNMQL", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "CUNMQL", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "CUNMQR", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "CUNMQR", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("CUNMTR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nq == 1) {
+ work[1].r = 1.f, work[1].i = 0.f;
+ return 0;
+ }
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ }
+
+ if (upper) {
+
+/* Q was determined by a call to CHETRD with UPLO = 'U' */
+
+ i__2 = nq - 1;
+ cunmql_(side, trans, &mi, &ni, &i__2, &a[(a_dim1 << 1) + 1], lda, &
+ tau[1], &c__[c_offset], ldc, &work[1], lwork, &iinfo);
+ } else {
+
+/* Q was determined by a call to CHETRD with UPLO = 'L' */
+
+ if (left) {
+ i1 = 2;
+ i2 = 1;
+ } else {
+ i1 = 1;
+ i2 = 2;
+ }
+ i__2 = nq - 1;
+ cunmqr_(side, trans, &mi, &ni, &i__2, &a[a_dim1 + 2], lda, &tau[1], &
+ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ work[1].r = (real) lwkopt, work[1].i = 0.f;
+ return 0;
+
+/* End of CUNMTR */
+
+} /* cunmtr_ */
+
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.f.patch b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.f.patch
new file mode 100644
index 00000000000..bcf7507baa7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_c_lapack.f.patch
@@ -0,0 +1,32 @@
+@@ -13163,5 +13163,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 15
+ END DO
++ 15 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -13175,5 +13176,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 16
+ END DO
++ 16 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -13223,5 +13225,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = 1, I-1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 35
+ END DO
++ 35 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
+@@ -13239,5 +13242,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 36
+ END DO
++ 36 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c
new file mode 100644
index 00000000000..2fe608227fa
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c
@@ -0,0 +1,2068 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static doublereal c_b32 = 0.;
+static real c_b66 = 0.f;
+
+doublereal dlamch_(char *cmach)
+{
+ /* Initialized data */
+
+ static logical first = TRUE_;
+
+ /* System generated locals */
+ integer i__1;
+ doublereal ret_val;
+
+ /* Local variables */
+ static doublereal t;
+ static integer it;
+ static doublereal rnd, eps, base;
+ static integer beta;
+ static doublereal emin, prec, emax;
+ static integer imin, imax;
+ static logical lrnd;
+ static doublereal rmin, rmax, rmach;
+ extern logical lsame_(char *, char *);
+ static doublereal small, sfmin;
+ extern /* Subroutine */ int dlamc2_(integer *, integer *, logical *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMCH determines double precision machine parameters.
+
+ Arguments
+ =========
+
+ CMACH (input) CHARACTER*1
+ Specifies the value to be returned by DLAMCH:
+ = 'E' or 'e', DLAMCH := eps
+ = 'S' or 's', DLAMCH := sfmin
+ = 'B' or 'b', DLAMCH := base
+ = 'P' or 'p', DLAMCH := eps*base
+ = 'N' or 'n', DLAMCH := t
+ = 'R' or 'r', DLAMCH := rnd
+ = 'M' or 'm', DLAMCH := emin
+ = 'U' or 'u', DLAMCH := rmin
+ = 'L' or 'l', DLAMCH := emax
+ = 'O' or 'o', DLAMCH := rmax
+
+ where
+
+ eps = relative machine precision
+ sfmin = safe minimum, such that 1/sfmin does not overflow
+ base = base of the machine
+ prec = eps*base
+ t = number of (base) digits in the mantissa
+ rnd = 1.0 when rounding occurs in addition, 0.0 otherwise
+ emin = minimum exponent before (gradual) underflow
+ rmin = underflow threshold - base**(emin-1)
+ emax = largest exponent before overflow
+ rmax = overflow threshold - (base**emax)*(1-eps)
+
+ =====================================================================
+*/
+
+
+ if (first) {
+ dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
+ base = (doublereal) beta;
+ t = (doublereal) it;
+ if (lrnd) {
+ rnd = 1.;
+ i__1 = 1 - it;
+ eps = pow_di(&base, &i__1) / 2;
+ } else {
+ rnd = 0.;
+ i__1 = 1 - it;
+ eps = pow_di(&base, &i__1);
+ }
+ prec = eps * base;
+ emin = (doublereal) imin;
+ emax = (doublereal) imax;
+ sfmin = rmin;
+ small = 1. / rmax;
+ if (small >= sfmin) {
+
+/*
+ Use SMALL plus a bit, to avoid the possibility of rounding
+ causing overflow when computing 1/sfmin.
+*/
+
+ sfmin = small * (eps + 1.);
+ }
+ }
+
+ if (lsame_(cmach, "E")) {
+ rmach = eps;
+ } else if (lsame_(cmach, "S")) {
+ rmach = sfmin;
+ } else if (lsame_(cmach, "B")) {
+ rmach = base;
+ } else if (lsame_(cmach, "P")) {
+ rmach = prec;
+ } else if (lsame_(cmach, "N")) {
+ rmach = t;
+ } else if (lsame_(cmach, "R")) {
+ rmach = rnd;
+ } else if (lsame_(cmach, "M")) {
+ rmach = emin;
+ } else if (lsame_(cmach, "U")) {
+ rmach = rmin;
+ } else if (lsame_(cmach, "L")) {
+ rmach = emax;
+ } else if (lsame_(cmach, "O")) {
+ rmach = rmax;
+ }
+
+ ret_val = rmach;
+ first = FALSE_;
+ return ret_val;
+
+/* End of DLAMCH */
+
+} /* dlamch_ */
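+
+/*
+   Editor's note: illustrative, hedged sketch, not part of the generated
+   source. dlamch_ dispatches on the first character of CMACH via lsame_,
+   which is why the EPSILON / SAFEMINIMUM / PRECISION / BASE macros near the
+   top of this file simply pass descriptive strings. A caller can query the
+   same parameters directly; printf would need <stdio.h>. Kept under #if 0.
+*/
+#if 0
+static void print_machine_parameters(void)
+{
+    doublereal eps   = dlamch_("Epsilon");       /* relative machine precision */
+    doublereal sfmin = dlamch_("Safe minimum");  /* 1/sfmin does not overflow  */
+    doublereal base  = dlamch_("Base");          /* radix of the arithmetic    */
+    printf("eps = %g, sfmin = %g, base = %g\n", eps, sfmin, base);
+}
+#endif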
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int dlamc1_(integer *beta, integer *t, logical *rnd, logical
+ *ieee1)
+{
+ /* Initialized data */
+
+ static logical first = TRUE_;
+
+ /* System generated locals */
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal a, b, c__, f, t1, t2;
+ static integer lt;
+ static doublereal one, qtr;
+ static logical lrnd;
+ static integer lbeta;
+ static doublereal savec;
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ static logical lieee1;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMC1 determines the machine parameters given by BETA, T, RND, and
+ IEEE1.
+
+ Arguments
+ =========
+
+ BETA (output) INTEGER
+ The base of the machine.
+
+ T (output) INTEGER
+ The number of ( BETA ) digits in the mantissa.
+
+ RND (output) LOGICAL
+ Specifies whether proper rounding ( RND = .TRUE. ) or
+ chopping ( RND = .FALSE. ) occurs in addition. This may not
+ be a reliable guide to the way in which the machine performs
+ its arithmetic.
+
+ IEEE1 (output) LOGICAL
+ Specifies whether rounding appears to be done in the IEEE
+ 'round to nearest' style.
+
+ Further Details
+ ===============
+
+ The routine is based on the routine ENVRON by Malcolm and
+ incorporates suggestions by Gentleman and Marovich. See
+
+ Malcolm M. A. (1972) Algorithms to reveal properties of
+ floating-point arithmetic. Comms. of the ACM, 15, 949-951.
+
+ Gentleman W. M. and Marovich S. B. (1974) More on algorithms
+ that reveal properties of floating point arithmetic units.
+ Comms. of the ACM, 17, 276-277.
+
+ =====================================================================
+*/
+
+
+ if (first) {
+ one = 1.;
+
+/*
+ LBETA, LIEEE1, LT and LRND are the local values of BETA,
+ IEEE1, T and RND.
+
+ Throughout this routine we use the function DLAMC3 to ensure
+ that relevant values are stored and not held in registers, or
+ are not affected by optimizers.
+
+ Compute a = 2.0**m with the smallest positive integer m such
+ that
+
+ fl( a + 1.0 ) = a.
+*/
+
+ a = 1.;
+ c__ = 1.;
+
+/* + WHILE( C.EQ.ONE )LOOP */
+L10:
+ if (c__ == one) {
+ a *= 2;
+ c__ = dlamc3_(&a, &one);
+ d__1 = -a;
+ c__ = dlamc3_(&c__, &d__1);
+ goto L10;
+ }
+/*
+ + END WHILE
+
+ Now compute b = 2.0**m with the smallest positive integer m
+ such that
+
+ fl( a + b ) .gt. a.
+*/
+
+ b = 1.;
+ c__ = dlamc3_(&a, &b);
+
+/* + WHILE( C.EQ.A )LOOP */
+L20:
+ if (c__ == a) {
+ b *= 2;
+ c__ = dlamc3_(&a, &b);
+ goto L20;
+ }
+/*
+ + END WHILE
+
+ Now compute the base. a and c are neighbouring floating point
+ numbers in the interval ( beta**t, beta**( t + 1 ) ) and so
+ their difference is beta. Adding 0.25 to c is to ensure that it
+ is truncated to beta and not ( beta - 1 ).
+*/
+
+ qtr = one / 4;
+ savec = c__;
+ d__1 = -a;
+ c__ = dlamc3_(&c__, &d__1);
+ lbeta = (integer) (c__ + qtr);
+
+/*
+ Now determine whether rounding or chopping occurs, by adding a
+ bit less than beta/2 and a bit more than beta/2 to a.
+*/
+
+ b = (doublereal) lbeta;
+ d__1 = b / 2;
+ d__2 = -b / 100;
+ f = dlamc3_(&d__1, &d__2);
+ c__ = dlamc3_(&f, &a);
+ if (c__ == a) {
+ lrnd = TRUE_;
+ } else {
+ lrnd = FALSE_;
+ }
+ d__1 = b / 2;
+ d__2 = b / 100;
+ f = dlamc3_(&d__1, &d__2);
+ c__ = dlamc3_(&f, &a);
+ if (lrnd && c__ == a) {
+ lrnd = FALSE_;
+ }
+
+/*
+ Try and decide whether rounding is done in the IEEE 'round to
+ nearest' style. B/2 is half a unit in the last place of the two
+ numbers A and SAVEC. Furthermore, A is even, i.e. has last bit
+ zero, and SAVEC is odd. Thus adding B/2 to A should not change
+ A, but adding B/2 to SAVEC should change SAVEC.
+*/
+
+ d__1 = b / 2;
+ t1 = dlamc3_(&d__1, &a);
+ d__1 = b / 2;
+ t2 = dlamc3_(&d__1, &savec);
+ lieee1 = t1 == a && t2 > savec && lrnd;
+
+/*
+ Now find the mantissa, t. It should be the integer part of
+ log to the base beta of a, however it is safer to determine t
+ by powering. So we find t as the smallest positive integer for
+ which
+
+ fl( beta**t + 1.0 ) = 1.0.
+*/
+
+ lt = 0;
+ a = 1.;
+ c__ = 1.;
+
+/* + WHILE( C.EQ.ONE )LOOP */
+L30:
+ if (c__ == one) {
+ ++lt;
+ a *= lbeta;
+ c__ = dlamc3_(&a, &one);
+ d__1 = -a;
+ c__ = dlamc3_(&c__, &d__1);
+ goto L30;
+ }
+/* + END WHILE */
+
+ }
+
+ *beta = lbeta;
+ *t = lt;
+ *rnd = lrnd;
+ *ieee1 = lieee1;
+ first = FALSE_;
+ return 0;
+
+/* End of DLAMC1 */
+
+} /* dlamc1_ */
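+
+/*
+   Editor's note: illustrative, hedged sketch, not part of the generated
+   source. It restates the Malcolm (1972) trick DLAMC1 uses above in plain C:
+   double a until fl(a + 1) == a, then double b until fl(a + b) > a; the gap
+   fl(a + b) - a is the base. The volatile qualifier is an assumption standing
+   in for DLAMC3's role of forcing intermediates out of extended-precision
+   registers. Kept under #if 0.
+*/
+#if 0
+static int detect_float_base(void)
+{
+    volatile double a = 1.0, b = 1.0, c = 2.0;
+
+    while (c - a == 1.0) {        /* stop once fl(a + 1.0) == a             */
+        a *= 2.0;
+        c = a + 1.0;
+    }
+    c = a + b;
+    while (c == a) {              /* smallest power of two with fl(a+b) > a */
+        b *= 2.0;
+        c = a + b;
+    }
+    return (int) (c - a + 0.25);  /* 2 on IEEE-754 hardware                 */
+}
+#endif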
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int dlamc2_(integer *beta, integer *t, logical *rnd,
+ doublereal *eps, integer *emin, doublereal *rmin, integer *emax,
+ doublereal *rmax)
+{
+ /* Initialized data */
+
+ static logical first = TRUE_;
+ static logical iwarn = FALSE_;
+
+ /* Format strings */
+ static char fmt_9999[] = "(//\002 WARNING. The value EMIN may be incorre"
+ "ct:-\002,\002 EMIN = \002,i8,/\002 If, after inspection, the va"
+ "lue EMIN looks\002,\002 acceptable please comment out \002,/\002"
+ " the IF block as marked within the code of routine\002,\002 DLAM"
+ "C2,\002,/\002 otherwise supply EMIN explicitly.\002,/)";
+
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2, d__3, d__4, d__5;
+
+ /* Local variables */
+ static doublereal a, b, c__;
+ static integer i__, lt;
+ static doublereal one, two;
+ static logical ieee;
+ static doublereal half;
+ static logical lrnd;
+ static doublereal leps, zero;
+ static integer lbeta;
+ static doublereal rbase;
+ static integer lemin, lemax, gnmin;
+ static doublereal small;
+ static integer gpmin;
+ static doublereal third, lrmin, lrmax, sixth;
+ extern /* Subroutine */ int dlamc1_(integer *, integer *, logical *,
+ logical *);
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ static logical lieee1;
+ extern /* Subroutine */ int dlamc4_(integer *, doublereal *, integer *),
+ dlamc5_(integer *, integer *, integer *, logical *, integer *,
+ doublereal *);
+ static integer ngnmin, ngpmin;
+
+ /* Fortran I/O blocks */
+ static cilist io___58 = { 0, 6, 0, fmt_9999, 0 };
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMC2 determines the machine parameters specified in its argument
+ list.
+
+ Arguments
+ =========
+
+ BETA (output) INTEGER
+ The base of the machine.
+
+ T (output) INTEGER
+ The number of ( BETA ) digits in the mantissa.
+
+ RND (output) LOGICAL
+ Specifies whether proper rounding ( RND = .TRUE. ) or
+ chopping ( RND = .FALSE. ) occurs in addition. This may not
+ be a reliable guide to the way in which the machine performs
+ its arithmetic.
+
+ EPS (output) DOUBLE PRECISION
+ The smallest positive number such that
+
+ fl( 1.0 - EPS ) .LT. 1.0,
+
+ where fl denotes the computed value.
+
+ EMIN (output) INTEGER
+ The minimum exponent before (gradual) underflow occurs.
+
+ RMIN (output) DOUBLE PRECISION
+ The smallest normalized number for the machine, given by
+ BASE**( EMIN - 1 ), where BASE is the floating point value
+ of BETA.
+
+ EMAX (output) INTEGER
+ The maximum exponent before overflow occurs.
+
+ RMAX (output) DOUBLE PRECISION
+ The largest positive number for the machine, given by
+ BASE**EMAX * ( 1 - EPS ), where BASE is the floating point
+ value of BETA.
+
+ Further Details
+ ===============
+
+ The computation of EPS is based on a routine PARANOIA by
+ W. Kahan of the University of California at Berkeley.
+
+ =====================================================================
+*/
+
+
+ if (first) {
+ zero = 0.;
+ one = 1.;
+ two = 2.;
+
+/*
+ LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of
+ BETA, T, RND, EPS, EMIN and RMIN.
+
+ Throughout this routine we use the function DLAMC3 to ensure
+ that relevant values are stored and not held in registers, or
+ are not affected by optimizers.
+
+ DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1.
+*/
+
+ dlamc1_(&lbeta, &lt, &lrnd, &lieee1);
+
+/* Start to find EPS. */
+
+ b = (doublereal) lbeta;
+ i__1 = -lt;
+ a = pow_di(&b, &i__1);
+ leps = a;
+
+/* Try some tricks to see whether or not this is the correct EPS. */
+
+ b = two / 3;
+ half = one / 2;
+ d__1 = -half;
+ sixth = dlamc3_(&b, &d__1);
+ third = dlamc3_(&sixth, &sixth);
+ d__1 = -half;
+ b = dlamc3_(&third, &d__1);
+ b = dlamc3_(&b, &sixth);
+ b = abs(b);
+ if (b < leps) {
+ b = leps;
+ }
+
+ leps = 1.;
+
+/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
+L10:
+ if (leps > b && b > zero) {
+ leps = b;
+ d__1 = half * leps;
+/* Computing 5th power */
+ d__3 = two, d__4 = d__3, d__3 *= d__3;
+/* Computing 2nd power */
+ d__5 = leps;
+ d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5);
+ c__ = dlamc3_(&d__1, &d__2);
+ d__1 = -c__;
+ c__ = dlamc3_(&half, &d__1);
+ b = dlamc3_(&half, &c__);
+ d__1 = -b;
+ c__ = dlamc3_(&half, &d__1);
+ b = dlamc3_(&half, &c__);
+ goto L10;
+ }
+/* + END WHILE */
+
+ if (a < leps) {
+ leps = a;
+ }
+
+/*
+ Computation of EPS complete.
+
+ Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)).
+ Keep dividing A by BETA until (gradual) underflow occurs. This
+ is detected when we cannot recover the previous A.
+*/
+
+ rbase = one / lbeta;
+ small = one;
+ for (i__ = 1; i__ <= 3; ++i__) {
+ d__1 = small * rbase;
+ small = dlamc3_(&d__1, &zero);
+/* L20: */
+ }
+ a = dlamc3_(&one, &small);
+ dlamc4_(&ngpmin, &one, &lbeta);
+ d__1 = -one;
+ dlamc4_(&ngnmin, &d__1, &lbeta);
+ dlamc4_(&gpmin, &a, &lbeta);
+ d__1 = -a;
+ dlamc4_(&gnmin, &d__1, &lbeta);
+ ieee = FALSE_;
+
+ if (ngpmin == ngnmin && gpmin == gnmin) {
+ if (ngpmin == gpmin) {
+ lemin = ngpmin;
+/*
+ ( Non twos-complement machines, no gradual underflow;
+ e.g., VAX )
+*/
+ } else if (gpmin - ngpmin == 3) {
+ lemin = ngpmin - 1 + lt;
+ ieee = TRUE_;
+/*
+ ( Non twos-complement machines, with gradual underflow;
+ e.g., IEEE standard followers )
+*/
+ } else {
+ lemin = min(ngpmin,gpmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+
+ } else if (ngpmin == gpmin && ngnmin == gnmin) {
+ if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
+ lemin = max(ngpmin,ngnmin);
+/*
+ ( Twos-complement machines, no gradual underflow;
+ e.g., CYBER 205 )
+*/
+ } else {
+ lemin = min(ngpmin,ngnmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+
+ } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
+ {
+ if (gpmin - min(ngpmin,ngnmin) == 3) {
+ lemin = max(ngpmin,ngnmin) - 1 + lt;
+/*
+ ( Twos-complement machines with gradual underflow;
+ no known machine )
+*/
+ } else {
+ lemin = min(ngpmin,ngnmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+
+ } else {
+/* Computing MIN */
+ i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
+ lemin = min(i__1,gnmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+ first = FALSE_;
+/*
+ **
+ Comment out this if block if EMIN is ok
+*/
+ if (iwarn) {
+ first = TRUE_;
+ s_wsfe(&io___58);
+ do_fio(&c__1, (char *)&lemin, (ftnlen)sizeof(integer));
+ e_wsfe();
+ }
+/*
+ **
+
+ Assume IEEE arithmetic if we found denormalised numbers above,
+ or if arithmetic seems to round in the IEEE style, determined
+ in routine DLAMC1. A true IEEE machine should have both things
+ true; however, faulty machines may have one or the other.
+*/
+
+ ieee = ieee || lieee1;
+
+/*
+ Compute RMIN by successive division by BETA. We could compute
+ RMIN as BASE**( EMIN - 1 ), but some machines underflow during
+ this computation.
+*/
+
+ lrmin = 1.;
+ i__1 = 1 - lemin;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__1 = lrmin * rbase;
+ lrmin = dlamc3_(&d__1, &zero);
+/* L30: */
+ }
+
+/* Finally, call DLAMC5 to compute EMAX and RMAX. */
+
+ dlamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
+ }
+
+ *beta = lbeta;
+ *t = lt;
+ *rnd = lrnd;
+ *eps = leps;
+ *emin = lemin;
+ *rmin = lrmin;
+ *emax = lemax;
+ *rmax = lrmax;
+
+ return 0;
+
+
+/* End of DLAMC2 */
+
+} /* dlamc2_ */
+
+
+/* *********************************************************************** */
+
+doublereal dlamc3_(doublereal *a, doublereal *b)
+{
+ /* System generated locals */
+ doublereal ret_val;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMC3 is intended to force A and B to be stored prior to doing
+ the addition of A and B , for use in situations where optimizers
+ might hold one of these in a register.
+
+ Arguments
+ =========
+
+ A (input) DOUBLE PRECISION
+ B (input) DOUBLE PRECISION
+ The values A and B.
+
+ =====================================================================
+*/
+
+
+ ret_val = *a + *b;
+
+ return ret_val;
+
+/* End of DLAMC3 */
+
+} /* dlamc3_ */
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int dlamc4_(integer *emin, doublereal *start, integer *base)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal a;
+ static integer i__;
+ static doublereal b1, b2, c1, c2, d1, d2, one, zero, rbase;
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMC4 is a service routine for DLAMC2.
+
+ Arguments
+ =========
+
+ EMIN (output) INTEGER
+ The minimum exponent before (gradual) underflow, computed by
+ setting A = START and dividing by BASE until the previous A
+ can not be recovered.
+
+ START (input) DOUBLE PRECISION
+ The starting point for determining EMIN.
+
+ BASE (input) INTEGER
+ The base of the machine.
+
+ =====================================================================
+*/
+
+
+ a = *start;
+ one = 1.;
+ rbase = one / *base;
+ zero = 0.;
+ *emin = 1;
+ d__1 = a * rbase;
+ b1 = dlamc3_(&d__1, &zero);
+ c1 = a;
+ c2 = a;
+ d1 = a;
+ d2 = a;
+/*
+ + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.
+ $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP
+*/
+L10:
+ if (c1 == a && c2 == a && d1 == a && d2 == a) {
+ --(*emin);
+ a = b1;
+ d__1 = a / *base;
+ b1 = dlamc3_(&d__1, &zero);
+ d__1 = b1 * *base;
+ c1 = dlamc3_(&d__1, &zero);
+ d1 = zero;
+ i__1 = *base;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d1 += b1;
+/* L20: */
+ }
+ d__1 = a * rbase;
+ b2 = dlamc3_(&d__1, &zero);
+ d__1 = b2 / rbase;
+ c2 = dlamc3_(&d__1, &zero);
+ d2 = zero;
+ i__1 = *base;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d2 += b2;
+/* L30: */
+ }
+ goto L10;
+ }
+/* + END WHILE */
+
+ return 0;
+
+/* End of DLAMC4 */
+
+} /* dlamc4_ */
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int dlamc5_(integer *beta, integer *p, integer *emin,
+ logical *ieee, integer *emax, doublereal *rmax)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal y, z__;
+ static integer try__, lexp;
+ static doublereal oldy;
+ static integer uexp, nbits;
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ static doublereal recbas;
+ static integer exbits, expsum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMC5 attempts to compute RMAX, the largest machine floating-point
+ number, without overflow. It assumes that EMAX + abs(EMIN) sum
+ approximately to a power of 2. It will fail on machines where this
+ assumption does not hold, for example, the Cyber 205 (EMIN = -28625,
+ EMAX = 28718). It will also fail if the value supplied for EMIN is
+ too large (i.e. too close to zero), probably with overflow.
+
+ Arguments
+ =========
+
+ BETA (input) INTEGER
+ The base of floating-point arithmetic.
+
+ P (input) INTEGER
+ The number of base BETA digits in the mantissa of a
+ floating-point value.
+
+ EMIN (input) INTEGER
+ The minimum exponent before (gradual) underflow.
+
+ IEEE (input) LOGICAL
+ A logical flag specifying whether or not the arithmetic
+ system is thought to comply with the IEEE standard.
+
+ EMAX (output) INTEGER
+ The largest exponent before overflow
+
+ RMAX (output) DOUBLE PRECISION
+ The largest machine floating-point number.
+
+ =====================================================================
+
+
+ First compute LEXP and UEXP, two powers of 2 that bound
+ abs(EMIN). We then assume that EMAX + abs(EMIN) will sum
+ approximately to the bound that is closest to abs(EMIN).
+ (EMAX is the exponent of the required number RMAX).
+*/
+
+ lexp = 1;
+ exbits = 1;
+L10:
+ try__ = lexp << 1;
+ if (try__ <= -(*emin)) {
+ lexp = try__;
+ ++exbits;
+ goto L10;
+ }
+ if (lexp == -(*emin)) {
+ uexp = lexp;
+ } else {
+ uexp = try__;
+ ++exbits;
+ }
+
+/*
+ Now -UEXP is less than or equal to EMIN, and -LEXP is greater
+ than or equal to EMIN. EXBITS is the number of bits needed to
+ store the exponent.
+*/
+
+ if (uexp + *emin > -lexp - *emin) {
+ expsum = lexp << 1;
+ } else {
+ expsum = uexp << 1;
+ }
+
+/*
+ EXPSUM is the exponent range, approximately equal to
+ EMAX - EMIN + 1 .
+*/
+
+ *emax = expsum + *emin - 1;
+ nbits = exbits + 1 + *p;
+
+/*
+ NBITS is the total number of bits needed to store a
+ floating-point number.
+*/
+
+ if (nbits % 2 == 1 && *beta == 2) {
+
+/*
+ Either there are an odd number of bits used to store a
+ floating-point number, which is unlikely, or some bits are
+ not used in the representation of numbers, which is possible,
+ (e.g. Cray machines) or the mantissa has an implicit bit,
+ (e.g. IEEE machines, Dec Vax machines), which is perhaps the
+ most likely. We have to assume the last alternative.
+ If this is true, then we need to reduce EMAX by one because
+ there must be some way of representing zero in an implicit-bit
+ system. On machines like Cray, we are reducing EMAX by one
+ unnecessarily.
+*/
+
+ --(*emax);
+ }
+
+ if (*ieee) {
+
+/*
+ Assume we are on an IEEE machine which reserves one exponent
+ for infinity and NaN.
+*/
+
+ --(*emax);
+ }
+
+/*
+ Now create RMAX, the largest machine number, which should
+ be equal to (1.0 - BETA**(-P)) * BETA**EMAX .
+
+ First compute 1.0 - BETA**(-P), being careful that the
+ result is less than 1.0 .
+*/
+
+ recbas = 1. / *beta;
+ z__ = *beta - 1.;
+ y = 0.;
+ i__1 = *p;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ z__ *= recbas;
+ if (y < 1.) {
+ oldy = y;
+ }
+ y = dlamc3_(&y, &z__);
+/* L20: */
+ }
+ if (y >= 1.) {
+ y = oldy;
+ }
+
+/* Now multiply by BETA**EMAX to get RMAX. */
+
+ i__1 = *emax;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__1 = y * *beta;
+ y = dlamc3_(&d__1, &c_b32);
+/* L30: */
+ }
+
+ *rmax = y;
+ return 0;
+
+/* End of DLAMC5 */
+
+} /* dlamc5_ */
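For illustration only (not part of the vendored source): the formula above, RMAX = (1.0 - BETA**(-P)) * BETA**EMAX, can be checked directly for IEEE-754 double precision, where BETA = 2, P = 53 and the adjusted EMAX is 1024. A minimal sketch assuming only the standard C library and IEEE arithmetic:

    #include <assert.h>
    #include <float.h>
    #include <math.h>

    /* Sketch: (1 - 2**-53) * 2**1024 should equal DBL_MAX on IEEE doubles. */
    int main(void)
    {
        double one_minus_ulp = 1.0 - ldexp(1.0, -53);  /* exact in double     */
        double rmax = ldexp(one_minus_ulp, 1024);      /* scale by BETA**EMAX */
        assert(rmax == DBL_MAX);
        return 0;
    }

Both intermediates are exactly representable, so the final comparison is exact rather than approximate.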
+
+logical lsame_(char *ca, char *cb)
+{
+ /* System generated locals */
+ logical ret_val;
+
+ /* Local variables */
+ static integer inta, intb, zcode;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ LSAME returns .TRUE. if CA is the same letter as CB regardless of
+ case.
+
+ Arguments
+ =========
+
+ CA (input) CHARACTER*1
+ CB (input) CHARACTER*1
+ CA and CB specify the single characters to be compared.
+
+ =====================================================================
+
+
+ Test if the characters are equal
+*/
+
+ ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
+ if (ret_val) {
+ return ret_val;
+ }
+
+/* Now test for equivalence if both characters are alphabetic. */
+
+ zcode = 'Z';
+
+/*
+ Use 'Z' rather than 'A' so that ASCII can be detected on Prime
+ machines, on which ICHAR returns a value with bit 8 set.
+ ICHAR('A') on Prime machines returns 193 which is the same as
+ ICHAR('A') on an EBCDIC machine.
+*/
+
+ inta = *(unsigned char *)ca;
+ intb = *(unsigned char *)cb;
+
+ if (zcode == 90 || zcode == 122) {
+
+/*
+ ASCII is assumed - ZCODE is the ASCII code of either lower or
+ upper case 'Z'.
+*/
+
+ if (inta >= 97 && inta <= 122) {
+ inta += -32;
+ }
+ if (intb >= 97 && intb <= 122) {
+ intb += -32;
+ }
+
+ } else if (zcode == 233 || zcode == 169) {
+
+/*
+ EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
+ upper case 'Z'.
+*/
+
+ if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta
+ >= 162 && inta <= 169) {
+ inta += 64;
+ }
+ if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb
+ >= 162 && intb <= 169) {
+ intb += 64;
+ }
+
+ } else if (zcode == 218 || zcode == 250) {
+
+/*
+ ASCII is assumed, on Prime machines - ZCODE is the ASCII code
+ plus 128 of either lower or upper case 'Z'.
+*/
+
+ if (inta >= 225 && inta <= 250) {
+ inta += -32;
+ }
+ if (intb >= 225 && intb <= 250) {
+ intb += -32;
+ }
+ }
+ ret_val = inta == intb;
+
+/*
+ RETURN
+
+ End of LSAME
+*/
+
+ return ret_val;
+} /* lsame_ */
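The character-code tests above let the same object code behave correctly on ASCII, EBCDIC and Prime character sets. On a hosted ASCII system the effect of LSAME is simply a case-insensitive comparison of two single characters; a minimal equivalent using <ctype.h> (an illustrative sketch with a hypothetical helper name, not what lapack_lite ships):

    #include <ctype.h>

    /* Case-insensitive single-character comparison, equivalent to LSAME on an
       ASCII machine: same_letter('u', 'U') == 1, same_letter('u', 'L') == 0. */
    static int same_letter(char ca, char cb)
    {
        return toupper((unsigned char) ca) == toupper((unsigned char) cb);
    }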
+
+doublereal slamch_(char *cmach)
+{
+ /* Initialized data */
+
+ static logical first = TRUE_;
+
+ /* System generated locals */
+ integer i__1;
+ real ret_val;
+
+ /* Local variables */
+ static real t;
+ static integer it;
+ static real rnd, eps, base;
+ static integer beta;
+ static real emin, prec, emax;
+ static integer imin, imax;
+ static logical lrnd;
+ static real rmin, rmax, rmach;
+ extern logical lsame_(char *, char *);
+ static real small, sfmin;
+ extern /* Subroutine */ int slamc2_(integer *, integer *, logical *, real
+ *, integer *, real *, integer *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMCH determines single precision machine parameters.
+
+ Arguments
+ =========
+
+ CMACH (input) CHARACTER*1
+ Specifies the value to be returned by SLAMCH:
+ = 'E' or 'e', SLAMCH := eps
+          = 'S' or 's',   SLAMCH := sfmin
+ = 'B' or 'b', SLAMCH := base
+ = 'P' or 'p', SLAMCH := eps*base
+ = 'N' or 'n', SLAMCH := t
+ = 'R' or 'r', SLAMCH := rnd
+ = 'M' or 'm', SLAMCH := emin
+ = 'U' or 'u', SLAMCH := rmin
+ = 'L' or 'l', SLAMCH := emax
+ = 'O' or 'o', SLAMCH := rmax
+
+ where
+
+ eps = relative machine precision
+ sfmin = safe minimum, such that 1/sfmin does not overflow
+ base = base of the machine
+ prec = eps*base
+ t = number of (base) digits in the mantissa
+ rnd = 1.0 when rounding occurs in addition, 0.0 otherwise
+ emin = minimum exponent before (gradual) underflow
+ rmin = underflow threshold - base**(emin-1)
+ emax = largest exponent before overflow
+ rmax = overflow threshold - (base**emax)*(1-eps)
+
+ =====================================================================
+*/
+
+
+ if (first) {
+ slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
+ base = (real) beta;
+ t = (real) it;
+ if (lrnd) {
+ rnd = 1.f;
+ i__1 = 1 - it;
+ eps = pow_ri(&base, &i__1) / 2;
+ } else {
+ rnd = 0.f;
+ i__1 = 1 - it;
+ eps = pow_ri(&base, &i__1);
+ }
+ prec = eps * base;
+ emin = (real) imin;
+ emax = (real) imax;
+ sfmin = rmin;
+ small = 1.f / rmax;
+ if (small >= sfmin) {
+
+/*
+ Use SMALL plus a bit, to avoid the possibility of rounding
+ causing overflow when computing 1/sfmin.
+*/
+
+ sfmin = small * (eps + 1.f);
+ }
+ }
+
+ if (lsame_(cmach, "E")) {
+ rmach = eps;
+ } else if (lsame_(cmach, "S")) {
+ rmach = sfmin;
+ } else if (lsame_(cmach, "B")) {
+ rmach = base;
+ } else if (lsame_(cmach, "P")) {
+ rmach = prec;
+ } else if (lsame_(cmach, "N")) {
+ rmach = t;
+ } else if (lsame_(cmach, "R")) {
+ rmach = rnd;
+ } else if (lsame_(cmach, "M")) {
+ rmach = emin;
+ } else if (lsame_(cmach, "U")) {
+ rmach = rmin;
+ } else if (lsame_(cmach, "L")) {
+ rmach = emax;
+ } else if (lsame_(cmach, "O")) {
+ rmach = rmax;
+ }
+
+ ret_val = rmach;
+ first = FALSE_;
+ return ret_val;
+
+/* End of SLAMCH */
+
+} /* slamch_ */
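On an IEEE-754 single-precision target the quantities SLAMCH probes for at run time correspond to <float.h> constants; note that with rounding (RND = 1.0) its eps is half of FLT_EPSILON. A small cross-check, added here for illustration and assuming IEEE single precision with a C99 <math.h>:

    #include <assert.h>
    #include <float.h>
    #include <math.h>

    /* Expected SLAMCH results on an IEEE-754 single-precision machine. */
    int main(void)
    {
        assert(FLT_RADIX == 2);                                   /* 'B' base   */
        assert(FLT_MANT_DIG == 24);                               /* 'N' digits */
        assert(ldexp(1.0, -24) == FLT_EPSILON / 2);               /* 'E' eps    */
        assert(FLT_MIN_EXP == -125);                              /* 'M' emin   */
        assert(FLT_MIN == ldexpf(1.0f, -126));                    /* 'U' rmin   */
        assert(FLT_MAX_EXP == 128);                               /* 'L' emax   */
        assert(FLT_MAX == ldexpf(1.0f - ldexpf(1.0f, -24), 128)); /* 'O' rmax   */
        return 0;
    }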
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int slamc1_(integer *beta, integer *t, logical *rnd, logical
+ *ieee1)
+{
+ /* Initialized data */
+
+ static logical first = TRUE_;
+
+ /* System generated locals */
+ real r__1, r__2;
+
+ /* Local variables */
+ static real a, b, c__, f, t1, t2;
+ static integer lt;
+ static real one, qtr;
+ static logical lrnd;
+ static integer lbeta;
+ static real savec;
+ static logical lieee1;
+ extern doublereal slamc3_(real *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMC1 determines the machine parameters given by BETA, T, RND, and
+ IEEE1.
+
+ Arguments
+ =========
+
+ BETA (output) INTEGER
+ The base of the machine.
+
+ T (output) INTEGER
+ The number of ( BETA ) digits in the mantissa.
+
+ RND (output) LOGICAL
+ Specifies whether proper rounding ( RND = .TRUE. ) or
+ chopping ( RND = .FALSE. ) occurs in addition. This may not
+ be a reliable guide to the way in which the machine performs
+ its arithmetic.
+
+ IEEE1 (output) LOGICAL
+ Specifies whether rounding appears to be done in the IEEE
+ 'round to nearest' style.
+
+ Further Details
+ ===============
+
+ The routine is based on the routine ENVRON by Malcolm and
+ incorporates suggestions by Gentleman and Marovich. See
+
+ Malcolm M. A. (1972) Algorithms to reveal properties of
+ floating-point arithmetic. Comms. of the ACM, 15, 949-951.
+
+ Gentleman W. M. and Marovich S. B. (1974) More on algorithms
+ that reveal properties of floating point arithmetic units.
+ Comms. of the ACM, 17, 276-277.
+
+ =====================================================================
+*/
+
+
+ if (first) {
+ one = 1.f;
+
+/*
+ LBETA, LIEEE1, LT and LRND are the local values of BETA,
+ IEEE1, T and RND.
+
+ Throughout this routine we use the function SLAMC3 to ensure
+ that relevant values are stored and not held in registers, or
+ are not affected by optimizers.
+
+ Compute a = 2.0**m with the smallest positive integer m such
+ that
+
+ fl( a + 1.0 ) = a.
+*/
+
+ a = 1.f;
+ c__ = 1.f;
+
+/* + WHILE( C.EQ.ONE )LOOP */
+L10:
+ if (c__ == one) {
+ a *= 2;
+ c__ = slamc3_(&a, &one);
+ r__1 = -a;
+ c__ = slamc3_(&c__, &r__1);
+ goto L10;
+ }
+/*
+ + END WHILE
+
+ Now compute b = 2.0**m with the smallest positive integer m
+ such that
+
+ fl( a + b ) .gt. a.
+*/
+
+ b = 1.f;
+ c__ = slamc3_(&a, &b);
+
+/* + WHILE( C.EQ.A )LOOP */
+L20:
+ if (c__ == a) {
+ b *= 2;
+ c__ = slamc3_(&a, &b);
+ goto L20;
+ }
+/*
+ + END WHILE
+
+ Now compute the base. a and c are neighbouring floating point
+ numbers in the interval ( beta**t, beta**( t + 1 ) ) and so
+ their difference is beta. Adding 0.25 to c is to ensure that it
+ is truncated to beta and not ( beta - 1 ).
+*/
+
+ qtr = one / 4;
+ savec = c__;
+ r__1 = -a;
+ c__ = slamc3_(&c__, &r__1);
+ lbeta = c__ + qtr;
+
+/*
+ Now determine whether rounding or chopping occurs, by adding a
+ bit less than beta/2 and a bit more than beta/2 to a.
+*/
+
+ b = (real) lbeta;
+ r__1 = b / 2;
+ r__2 = -b / 100;
+ f = slamc3_(&r__1, &r__2);
+ c__ = slamc3_(&f, &a);
+ if (c__ == a) {
+ lrnd = TRUE_;
+ } else {
+ lrnd = FALSE_;
+ }
+ r__1 = b / 2;
+ r__2 = b / 100;
+ f = slamc3_(&r__1, &r__2);
+ c__ = slamc3_(&f, &a);
+ if (lrnd && c__ == a) {
+ lrnd = FALSE_;
+ }
+
+/*
+ Try and decide whether rounding is done in the IEEE 'round to
+ nearest' style. B/2 is half a unit in the last place of the two
+ numbers A and SAVEC. Furthermore, A is even, i.e. has last bit
+ zero, and SAVEC is odd. Thus adding B/2 to A should not change
+ A, but adding B/2 to SAVEC should change SAVEC.
+*/
+
+ r__1 = b / 2;
+ t1 = slamc3_(&r__1, &a);
+ r__1 = b / 2;
+ t2 = slamc3_(&r__1, &savec);
+ lieee1 = t1 == a && t2 > savec && lrnd;
+
+/*
+ Now find the mantissa, t. It should be the integer part of
+ log to the base beta of a, however it is safer to determine t
+ by powering. So we find t as the smallest positive integer for
+ which
+
+       fl( beta**t + 1.0 ) = beta**t.
+*/
+
+ lt = 0;
+ a = 1.f;
+ c__ = 1.f;
+
+/* + WHILE( C.EQ.ONE )LOOP */
+L30:
+ if (c__ == one) {
+ ++lt;
+ a *= lbeta;
+ c__ = slamc3_(&a, &one);
+ r__1 = -a;
+ c__ = slamc3_(&c__, &r__1);
+ goto L30;
+ }
+/* + END WHILE */
+
+ }
+
+ *beta = lbeta;
+ *t = lt;
+ *rnd = lrnd;
+ *ieee1 = lieee1;
+ first = FALSE_;
+ return 0;
+
+/* End of SLAMC1 */
+
+} /* slamc1_ */
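The loops above are Malcolm's probe: grow A until fl(A + 1) == A, read the base off the spacing of neighbouring values near A, then count how many powers of the base fit before 1.0 is again lost in fl(A + 1). A compact restatement in plain C, added for illustration; it uses volatile stores in place of the SLAMC3 calls, assumes binary round-to-nearest arithmetic, and the helper name is hypothetical:

    /* Malcolm-style probe for the base and mantissa length (illustrative). */
    static void probe_machine(int *base, int *mant_digits)
    {
        volatile float a = 1.0f, b = 1.0f, c;

        /* Find a = 2**m with fl(a + 1) == a. */
        c = (float)(a + 1.0f) - a;
        while (c == 1.0f) {
            a *= 2.0f;
            c = (float)(a + 1.0f) - a;
        }

        /* Smallest b with fl(a + b) > a; the gap fl(a + b) - a is the base. */
        while ((float)(a + b) == a)
            b *= 2.0f;
        *base = (int)((float)(a + b) - a);

        /* Mantissa length: smallest t with fl(base**t + 1) == base**t. */
        *mant_digits = 0;
        a = 1.0f;
        c = 1.0f;
        while (c == 1.0f) {
            ++*mant_digits;
            a *= (float)*base;
            c = (float)(a + 1.0f) - a;
        }
        /* IEEE single precision yields *base == 2, *mant_digits == 24. */
    }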
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int slamc2_(integer *beta, integer *t, logical *rnd, real *
+ eps, integer *emin, real *rmin, integer *emax, real *rmax)
+{
+ /* Initialized data */
+
+ static logical first = TRUE_;
+ static logical iwarn = FALSE_;
+
+ /* Format strings */
+ static char fmt_9999[] = "(//\002 WARNING. The value EMIN may be incorre"
+ "ct:-\002,\002 EMIN = \002,i8,/\002 If, after inspection, the va"
+ "lue EMIN looks\002,\002 acceptable please comment out \002,/\002"
+ " the IF block as marked within the code of routine\002,\002 SLAM"
+ "C2,\002,/\002 otherwise supply EMIN explicitly.\002,/)";
+
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2, r__3, r__4, r__5;
+
+ /* Local variables */
+ static real a, b, c__;
+ static integer i__, lt;
+ static real one, two;
+ static logical ieee;
+ static real half;
+ static logical lrnd;
+ static real leps, zero;
+ static integer lbeta;
+ static real rbase;
+ static integer lemin, lemax, gnmin;
+ static real small;
+ static integer gpmin;
+ static real third, lrmin, lrmax, sixth;
+ static logical lieee1;
+ extern /* Subroutine */ int slamc1_(integer *, integer *, logical *,
+ logical *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int slamc4_(integer *, real *, integer *),
+ slamc5_(integer *, integer *, integer *, logical *, integer *,
+ real *);
+ static integer ngnmin, ngpmin;
+
+ /* Fortran I/O blocks */
+ static cilist io___144 = { 0, 6, 0, fmt_9999, 0 };
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMC2 determines the machine parameters specified in its argument
+ list.
+
+ Arguments
+ =========
+
+ BETA (output) INTEGER
+ The base of the machine.
+
+ T (output) INTEGER
+ The number of ( BETA ) digits in the mantissa.
+
+ RND (output) LOGICAL
+ Specifies whether proper rounding ( RND = .TRUE. ) or
+ chopping ( RND = .FALSE. ) occurs in addition. This may not
+ be a reliable guide to the way in which the machine performs
+ its arithmetic.
+
+ EPS (output) REAL
+ The smallest positive number such that
+
+ fl( 1.0 - EPS ) .LT. 1.0,
+
+ where fl denotes the computed value.
+
+ EMIN (output) INTEGER
+ The minimum exponent before (gradual) underflow occurs.
+
+ RMIN (output) REAL
+ The smallest normalized number for the machine, given by
+ BASE**( EMIN - 1 ), where BASE is the floating point value
+ of BETA.
+
+ EMAX (output) INTEGER
+ The maximum exponent before overflow occurs.
+
+ RMAX (output) REAL
+ The largest positive number for the machine, given by
+ BASE**EMAX * ( 1 - EPS ), where BASE is the floating point
+ value of BETA.
+
+ Further Details
+ ===============
+
+ The computation of EPS is based on a routine PARANOIA by
+ W. Kahan of the University of California at Berkeley.
+
+ =====================================================================
+*/
+
+
+ if (first) {
+ zero = 0.f;
+ one = 1.f;
+ two = 2.f;
+
+/*
+ LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of
+ BETA, T, RND, EPS, EMIN and RMIN.
+
+ Throughout this routine we use the function SLAMC3 to ensure
+ that relevant values are stored and not held in registers, or
+ are not affected by optimizers.
+
+ SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1.
+*/
+
+ slamc1_(&lbeta, &lt, &lrnd, &lieee1);
+
+/* Start to find EPS. */
+
+ b = (real) lbeta;
+ i__1 = -lt;
+ a = pow_ri(&b, &i__1);
+ leps = a;
+
+/* Try some tricks to see whether or not this is the correct EPS. */
+
+ b = two / 3;
+ half = one / 2;
+ r__1 = -half;
+ sixth = slamc3_(&b, &r__1);
+ third = slamc3_(&sixth, &sixth);
+ r__1 = -half;
+ b = slamc3_(&third, &r__1);
+ b = slamc3_(&b, &sixth);
+ b = dabs(b);
+ if (b < leps) {
+ b = leps;
+ }
+
+ leps = 1.f;
+
+/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
+L10:
+ if (leps > b && b > zero) {
+ leps = b;
+ r__1 = half * leps;
+/* Computing 5th power */
+ r__3 = two, r__4 = r__3, r__3 *= r__3;
+/* Computing 2nd power */
+ r__5 = leps;
+ r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5);
+ c__ = slamc3_(&r__1, &r__2);
+ r__1 = -c__;
+ c__ = slamc3_(&half, &r__1);
+ b = slamc3_(&half, &c__);
+ r__1 = -b;
+ c__ = slamc3_(&half, &r__1);
+ b = slamc3_(&half, &c__);
+ goto L10;
+ }
+/* + END WHILE */
+
+ if (a < leps) {
+ leps = a;
+ }
+
+/*
+ Computation of EPS complete.
+
+ Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)).
+ Keep dividing A by BETA until (gradual) underflow occurs. This
+ is detected when we cannot recover the previous A.
+*/
+
+ rbase = one / lbeta;
+ small = one;
+ for (i__ = 1; i__ <= 3; ++i__) {
+ r__1 = small * rbase;
+ small = slamc3_(&r__1, &zero);
+/* L20: */
+ }
+ a = slamc3_(&one, &small);
+ slamc4_(&ngpmin, &one, &lbeta);
+ r__1 = -one;
+ slamc4_(&ngnmin, &r__1, &lbeta);
+ slamc4_(&gpmin, &a, &lbeta);
+ r__1 = -a;
+ slamc4_(&gnmin, &r__1, &lbeta);
+ ieee = FALSE_;
+
+ if (ngpmin == ngnmin && gpmin == gnmin) {
+ if (ngpmin == gpmin) {
+ lemin = ngpmin;
+/*
+ ( Non twos-complement machines, no gradual underflow;
+ e.g., VAX )
+*/
+ } else if (gpmin - ngpmin == 3) {
+ lemin = ngpmin - 1 + lt;
+ ieee = TRUE_;
+/*
+ ( Non twos-complement machines, with gradual underflow;
+ e.g., IEEE standard followers )
+*/
+ } else {
+ lemin = min(ngpmin,gpmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+
+ } else if (ngpmin == gpmin && ngnmin == gnmin) {
+ if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
+ lemin = max(ngpmin,ngnmin);
+/*
+ ( Twos-complement machines, no gradual underflow;
+ e.g., CYBER 205 )
+*/
+ } else {
+ lemin = min(ngpmin,ngnmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+
+ } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
+ {
+ if (gpmin - min(ngpmin,ngnmin) == 3) {
+ lemin = max(ngpmin,ngnmin) - 1 + lt;
+/*
+ ( Twos-complement machines with gradual underflow;
+ no known machine )
+*/
+ } else {
+ lemin = min(ngpmin,ngnmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+
+ } else {
+/* Computing MIN */
+ i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
+ lemin = min(i__1,gnmin);
+/* ( A guess; no known machine ) */
+ iwarn = TRUE_;
+ }
+ first = FALSE_;
+/*
+ **
+ Comment out this if block if EMIN is ok
+*/
+ if (iwarn) {
+ first = TRUE_;
+ s_wsfe(&io___144);
+ do_fio(&c__1, (char *)&lemin, (ftnlen)sizeof(integer));
+ e_wsfe();
+ }
+/*
+ **
+
+ Assume IEEE arithmetic if we found denormalised numbers above,
+ or if arithmetic seems to round in the IEEE style, determined
+ in routine SLAMC1. A true IEEE machine should have both things
+ true; however, faulty machines may have one or the other.
+*/
+
+ ieee = ieee || lieee1;
+
+/*
+ Compute RMIN by successive division by BETA. We could compute
+ RMIN as BASE**( EMIN - 1 ), but some machines underflow during
+ this computation.
+*/
+
+ lrmin = 1.f;
+ i__1 = 1 - lemin;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r__1 = lrmin * rbase;
+ lrmin = slamc3_(&r__1, &zero);
+/* L30: */
+ }
+
+/* Finally, call SLAMC5 to compute EMAX and RMAX. */
+
+ slamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
+ }
+
+ *beta = lbeta;
+ *t = lt;
+ *rnd = lrnd;
+ *eps = leps;
+ *emin = lemin;
+ *rmin = lrmin;
+ *emax = lemax;
+ *rmax = lrmax;
+
+ return 0;
+
+
+/* End of SLAMC2 */
+
+} /* slamc2_ */
+
+
+/* *********************************************************************** */
+
+doublereal slamc3_(real *a, real *b)
+{
+ /* System generated locals */
+ real ret_val;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMC3 is intended to force A and B to be stored prior to doing
+ the addition of A and B , for use in situations where optimizers
+ might hold one of these in a register.
+
+ Arguments
+ =========
+
+ A (input) REAL
+ B (input) REAL
+ The values A and B.
+
+ =====================================================================
+*/
+
+
+ ret_val = *a + *b;
+
+ return ret_val;
+
+/* End of SLAMC3 */
+
+} /* slamc3_ */
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int slamc4_(integer *emin, real *start, integer *base)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1;
+
+ /* Local variables */
+ static real a;
+ static integer i__;
+ static real b1, b2, c1, c2, d1, d2, one, zero, rbase;
+ extern doublereal slamc3_(real *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMC4 is a service routine for SLAMC2.
+
+ Arguments
+ =========
+
+ EMIN (output) INTEGER
+ The minimum exponent before (gradual) underflow, computed by
+ setting A = START and dividing by BASE until the previous A
+ can not be recovered.
+
+ START (input) REAL
+ The starting point for determining EMIN.
+
+ BASE (input) INTEGER
+ The base of the machine.
+
+ =====================================================================
+*/
+
+
+ a = *start;
+ one = 1.f;
+ rbase = one / *base;
+ zero = 0.f;
+ *emin = 1;
+ r__1 = a * rbase;
+ b1 = slamc3_(&r__1, &zero);
+ c1 = a;
+ c2 = a;
+ d1 = a;
+ d2 = a;
+/*
+ + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.
+ $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP
+*/
+L10:
+ if (c1 == a && c2 == a && d1 == a && d2 == a) {
+ --(*emin);
+ a = b1;
+ r__1 = a / *base;
+ b1 = slamc3_(&r__1, &zero);
+ r__1 = b1 * *base;
+ c1 = slamc3_(&r__1, &zero);
+ d1 = zero;
+ i__1 = *base;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d1 += b1;
+/* L20: */
+ }
+ r__1 = a * rbase;
+ b2 = slamc3_(&r__1, &zero);
+ r__1 = b2 / rbase;
+ c2 = slamc3_(&r__1, &zero);
+ d2 = zero;
+ i__1 = *base;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d2 += b2;
+/* L30: */
+ }
+ goto L10;
+ }
+/* + END WHILE */
+
+ return 0;
+
+/* End of SLAMC4 */
+
+} /* slamc4_ */
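SLAMC4 walks A down by one power of the base per step and reports the first exponent at which the previous value can no longer be recovered; on IEEE machines that walk runs through the whole subnormal range. A minimal sketch of the same "divide until unrecoverable" probe, added for illustration and assuming IEEE single precision, default round-to-nearest and no flush-to-zero (the helper name is hypothetical):

    /* In the spirit of SLAMC4: keep halving x and stop once x/2 can no
       longer be doubled back to x.  Under the stated assumptions this
       returns -148, i.e. the walk only fails at the bottom of the
       subnormal range (2**-149), which is how SLAMC2 recognizes gradual
       underflow and IEEE-style arithmetic. */
    static int probe_emin(void)
    {
        volatile float x = 1.0f, y;
        int emin = 1;

        for (;;) {
            y = x / 2.0f;
            if ((float)(y * 2.0f) != x)   /* previous value not recoverable */
                break;
            x = y;
            --emin;
        }
        return emin;
    }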
+
+
+/* *********************************************************************** */
+
+/* Subroutine */ int slamc5_(integer *beta, integer *p, integer *emin,
+ logical *ieee, integer *emax, real *rmax)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1;
+
+ /* Local variables */
+ static integer i__;
+ static real y, z__;
+ static integer try__, lexp;
+ static real oldy;
+ static integer uexp, nbits;
+ extern doublereal slamc3_(real *, real *);
+ static real recbas;
+ static integer exbits, expsum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMC5 attempts to compute RMAX, the largest machine floating-point
+ number, without overflow. It assumes that EMAX + abs(EMIN) sum
+ approximately to a power of 2. It will fail on machines where this
+ assumption does not hold, for example, the Cyber 205 (EMIN = -28625,
+ EMAX = 28718). It will also fail if the value supplied for EMIN is
+ too large (i.e. too close to zero), probably with overflow.
+
+ Arguments
+ =========
+
+ BETA (input) INTEGER
+ The base of floating-point arithmetic.
+
+ P (input) INTEGER
+ The number of base BETA digits in the mantissa of a
+ floating-point value.
+
+ EMIN (input) INTEGER
+ The minimum exponent before (gradual) underflow.
+
+ IEEE (input) LOGICAL
+ A logical flag specifying whether or not the arithmetic
+ system is thought to comply with the IEEE standard.
+
+ EMAX (output) INTEGER
+ The largest exponent before overflow
+
+ RMAX (output) REAL
+ The largest machine floating-point number.
+
+ =====================================================================
+
+
+ First compute LEXP and UEXP, two powers of 2 that bound
+ abs(EMIN). We then assume that EMAX + abs(EMIN) will sum
+ approximately to the bound that is closest to abs(EMIN).
+ (EMAX is the exponent of the required number RMAX).
+*/
+
+ lexp = 1;
+ exbits = 1;
+L10:
+ try__ = lexp << 1;
+ if (try__ <= -(*emin)) {
+ lexp = try__;
+ ++exbits;
+ goto L10;
+ }
+ if (lexp == -(*emin)) {
+ uexp = lexp;
+ } else {
+ uexp = try__;
+ ++exbits;
+ }
+
+/*
+ Now -LEXP is less than or equal to EMIN, and -UEXP is greater
+ than or equal to EMIN. EXBITS is the number of bits needed to
+ store the exponent.
+*/
+
+ if (uexp + *emin > -lexp - *emin) {
+ expsum = lexp << 1;
+ } else {
+ expsum = uexp << 1;
+ }
+
+/*
+ EXPSUM is the exponent range, approximately equal to
+ EMAX - EMIN + 1 .
+*/
+
+ *emax = expsum + *emin - 1;
+ nbits = exbits + 1 + *p;
+
+/*
+ NBITS is the total number of bits needed to store a
+ floating-point number.
+*/
+
+ if (nbits % 2 == 1 && *beta == 2) {
+
+/*
+ Either there are an odd number of bits used to store a
+ floating-point number, which is unlikely, or some bits are
+ not used in the representation of numbers, which is possible,
+ (e.g. Cray machines) or the mantissa has an implicit bit,
+ (e.g. IEEE machines, Dec Vax machines), which is perhaps the
+ most likely. We have to assume the last alternative.
+ If this is true, then we need to reduce EMAX by one because
+ there must be some way of representing zero in an implicit-bit
+ system. On machines like Cray, we are reducing EMAX by one
+ unnecessarily.
+*/
+
+ --(*emax);
+ }
+
+ if (*ieee) {
+
+/*
+ Assume we are on an IEEE machine which reserves one exponent
+ for infinity and NaN.
+*/
+
+ --(*emax);
+ }
+
+/*
+ Now create RMAX, the largest machine number, which should
+ be equal to (1.0 - BETA**(-P)) * BETA**EMAX .
+
+ First compute 1.0 - BETA**(-P), being careful that the
+ result is less than 1.0 .
+*/
+
+ recbas = 1.f / *beta;
+ z__ = *beta - 1.f;
+ y = 0.f;
+ i__1 = *p;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ z__ *= recbas;
+ if (y < 1.f) {
+ oldy = y;
+ }
+ y = slamc3_(&y, &z__);
+/* L20: */
+ }
+ if (y >= 1.f) {
+ y = oldy;
+ }
+
+/* Now multiply by BETA**EMAX to get RMAX. */
+
+ i__1 = *emax;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r__1 = y * *beta;
+ y = slamc3_(&r__1, &c_b66);
+/* L30: */
+ }
+
+ *rmax = y;
+ return 0;
+
+/* End of SLAMC5 */
+
+} /* slamc5_ */
+
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c.patch b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c.patch
new file mode 100644
index 00000000000..4c43f8aa2a5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_config.c.patch
@@ -0,0 +1,18 @@
+@@ -696,7 +696,7 @@
+ doublereal dlamc3_(doublereal *a, doublereal *b)
+ {
+ /* System generated locals */
+- doublereal ret_val;
++ volatile doublereal ret_val;
+
+
+ /*
+@@ -1773,7 +1773,7 @@
+ doublereal slamc3_(real *a, real *b)
+ {
+ /* System generated locals */
+- real ret_val;
++ volatile real ret_val;
+
+
+ /*
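The patch above marks the return slot of DLAMC3/SLAMC3 as volatile. Those routines exist only so that A + B is rounded to the declared precision instead of being kept in a wider register or folded away once the call is inlined; without a forced store, an optimizing compiler can defeat the probe loops in DLAMC1/SLAMC1 and the derived machine parameters may come out wrong (or the loops may fail to terminate). The fragment below illustrates the failure mode and is explanatory only, assuming excess-precision evaluation such as x87 (FLT_EVAL_METHOD > 0); it is not part of the patch:

    /* The "bad" form may be evaluated entirely in a wide register, so
       (a + 1.0f) never rounds to float and a probe expecting
       fl(a + 1) == a for large a never sees it. */
    float bad_probe(float a)
    {
        return (a + 1.0f) - a;        /* may stay in extended precision   */
    }

    float good_probe(float a)
    {
        volatile float c = a + 1.0f;  /* forced to round to float storage */
        return c - a;                 /* 0.0f once a reaches 2**24        */
    }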
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.c
new file mode 100644
index 00000000000..1a6675ef119
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.c
@@ -0,0 +1,41864 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static integer c__9 = 9;
+static integer c__0 = 0;
+static doublereal c_b15 = 1.;
+static integer c__1 = 1;
+static doublereal c_b29 = 0.;
+static doublereal c_b94 = -.125;
+static doublereal c_b151 = -1.;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+static integer c__65 = 65;
+static integer c__6 = 6;
+static integer c__12 = 12;
+static integer c__49 = 49;
+static integer c__4 = 4;
+static logical c_false = FALSE_;
+static integer c__13 = 13;
+static integer c__15 = 15;
+static integer c__14 = 14;
+static integer c__16 = 16;
+static logical c_true = TRUE_;
+static integer c__10 = 10;
+static integer c__11 = 11;
+static doublereal c_b3192 = 2.;
+
+/* Subroutine */ int dbdsdc_(char *uplo, char *compq, integer *n, doublereal *
+ d__, doublereal *e, doublereal *u, integer *ldu, doublereal *vt,
+ integer *ldvt, doublereal *q, integer *iq, doublereal *work, integer *
+ iwork, integer *info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, vt_dim1, vt_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static doublereal p, r__;
+ static integer z__, ic, ii, kk;
+ static doublereal cs;
+ static integer is, iu;
+ static doublereal sn;
+ static integer nm1;
+ static doublereal eps;
+ static integer ivt, difl, difr, ierr, perm, mlvl, sqre;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *
+ , doublereal *, integer *), dswap_(integer *, doublereal *,
+ integer *, doublereal *, integer *);
+ static integer poles, iuplo, nsize, start;
+ extern /* Subroutine */ int dlasd0_(integer *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ integer *, integer *, doublereal *, integer *);
+
+ extern /* Subroutine */ int dlasda_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *), dlascl_(char *, integer *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *), dlasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlaset_(char *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static integer givcol;
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ static integer icompq;
+ static doublereal orgnrm;
+ static integer givnum, givptr, qstart, smlsiz, wstart, smlszp;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DBDSDC computes the singular value decomposition (SVD) of a real
+ N-by-N (upper or lower) bidiagonal matrix B: B = U * S * VT,
+ using a divide and conquer method, where S is a diagonal matrix
+ with non-negative diagonal elements (the singular values of B), and
+ U and VT are orthogonal matrices of left and right singular vectors,
+ respectively. DBDSDC can be used to compute all singular values,
+ and optionally, singular vectors or singular vectors in compact form.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none. See DLASD3 for details.
+
+ The code currently calls DLASDQ if singular values only are desired.
+ However, it can be slightly modified to compute singular values
+ using the divide and conquer method.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': B is upper bidiagonal.
+ = 'L': B is lower bidiagonal.
+
+ COMPQ (input) CHARACTER*1
+ Specifies whether singular vectors are to be computed
+ as follows:
+ = 'N': Compute singular values only;
+ = 'P': Compute singular values and compute singular
+ vectors in compact form;
+ = 'I': Compute singular values and singular vectors.
+
+ N (input) INTEGER
+ The order of the matrix B. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the n diagonal elements of the bidiagonal matrix B.
+ On exit, if INFO=0, the singular values of B.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the elements of E contain the offdiagonal
+ elements of the bidiagonal matrix whose SVD is desired.
+ On exit, E has been destroyed.
+
+ U (output) DOUBLE PRECISION array, dimension (LDU,N)
+ If COMPQ = 'I', then:
+ On exit, if INFO = 0, U contains the left singular vectors
+ of the bidiagonal matrix.
+ For other values of COMPQ, U is not referenced.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= 1.
+ If singular vectors are desired, then LDU >= max( 1, N ).
+
+ VT (output) DOUBLE PRECISION array, dimension (LDVT,N)
+ If COMPQ = 'I', then:
+ On exit, if INFO = 0, VT' contains the right singular
+ vectors of the bidiagonal matrix.
+ For other values of COMPQ, VT is not referenced.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= 1.
+ If singular vectors are desired, then LDVT >= max( 1, N ).
+
+ Q (output) DOUBLE PRECISION array, dimension (LDQ)
+ If COMPQ = 'P', then:
+ On exit, if INFO = 0, Q and IQ contain the left
+ and right singular vectors in a compact form,
+ requiring O(N log N) space instead of 2*N**2.
+ In particular, Q contains all the DOUBLE PRECISION data in
+ LDQ >= N*(11 + 2*SMLSIZ + 8*INT(LOG_2(N/(SMLSIZ+1))))
+ words of memory, where SMLSIZ is returned by ILAENV and
+ is equal to the maximum size of the subproblems at the
+ bottom of the computation tree (usually about 25).
+ For other values of COMPQ, Q is not referenced.
+
+ IQ (output) INTEGER array, dimension (LDIQ)
+ If COMPQ = 'P', then:
+ On exit, if INFO = 0, Q and IQ contain the left
+ and right singular vectors in a compact form,
+ requiring O(N log N) space instead of 2*N**2.
+ In particular, IQ contains all INTEGER data in
+ LDIQ >= N*(3 + 3*INT(LOG_2(N/(SMLSIZ+1))))
+ words of memory, where SMLSIZ is returned by ILAENV and
+ is equal to the maximum size of the subproblems at the
+ bottom of the computation tree (usually about 25).
+ For other values of COMPQ, IQ is not referenced.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ If COMPQ = 'N' then LWORK >= (4 * N).
+ If COMPQ = 'P' then LWORK >= (6 * N).
+ If COMPQ = 'I' then LWORK >= (3 * N**2 + 4 * N).
+
+ IWORK (workspace) INTEGER array, dimension (8*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute a singular value.
+ The update process of divide and conquer failed.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+ Changed dimension statement in comment describing E from (N) to
+ (N-1). Sven, 17 Feb 05.
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --q;
+ --iq;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ iuplo = 0;
+ if (lsame_(uplo, "U")) {
+ iuplo = 1;
+ }
+ if (lsame_(uplo, "L")) {
+ iuplo = 2;
+ }
+ if (lsame_(compq, "N")) {
+ icompq = 0;
+ } else if (lsame_(compq, "P")) {
+ icompq = 1;
+ } else if (lsame_(compq, "I")) {
+ icompq = 2;
+ } else {
+ icompq = -1;
+ }
+ if (iuplo == 0) {
+ *info = -1;
+ } else if (icompq < 0) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ldu < 1 || icompq == 2 && *ldu < *n) {
+ *info = -7;
+ } else if (*ldvt < 1 || icompq == 2 && *ldvt < *n) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DBDSDC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ smlsiz = ilaenv_(&c__9, "DBDSDC", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+ if (*n == 1) {
+ if (icompq == 1) {
+ q[1] = d_sign(&c_b15, &d__[1]);
+ q[smlsiz * *n + 1] = 1.;
+ } else if (icompq == 2) {
+ u[u_dim1 + 1] = d_sign(&c_b15, &d__[1]);
+ vt[vt_dim1 + 1] = 1.;
+ }
+ d__[1] = abs(d__[1]);
+ return 0;
+ }
+ nm1 = *n - 1;
+
+/*
+ If matrix lower bidiagonal, rotate to be upper bidiagonal
+ by applying Givens rotations on the left
+*/
+
+ wstart = 1;
+ qstart = 3;
+ if (icompq == 1) {
+ dcopy_(n, &d__[1], &c__1, &q[1], &c__1);
+ i__1 = *n - 1;
+ dcopy_(&i__1, &e[1], &c__1, &q[*n + 1], &c__1);
+ }
+ if (iuplo == 2) {
+ qstart = 5;
+ wstart = (*n << 1) - 1;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (icompq == 1) {
+ q[i__ + (*n << 1)] = cs;
+ q[i__ + *n * 3] = sn;
+ } else if (icompq == 2) {
+ work[i__] = cs;
+ work[nm1 + i__] = -sn;
+ }
+/* L10: */
+ }
+ }
+
+/* If ICOMPQ = 0, use DLASDQ to compute the singular values. */
+
+ if (icompq == 0) {
+ dlasdq_("U", &c__0, n, &c__0, &c__0, &c__0, &d__[1], &e[1], &vt[
+ vt_offset], ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[
+ wstart], info);
+ goto L40;
+ }
+
+/*
+ If N is smaller than the minimum divide size SMLSIZ, then solve
+ the problem with another solver.
+*/
+
+ if (*n <= smlsiz) {
+ if (icompq == 2) {
+ dlaset_("A", n, n, &c_b29, &c_b15, &u[u_offset], ldu);
+ dlaset_("A", n, n, &c_b29, &c_b15, &vt[vt_offset], ldvt);
+ dlasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &vt[vt_offset]
+ , ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[
+ wstart], info);
+ } else if (icompq == 1) {
+ iu = 1;
+ ivt = iu + *n;
+ dlaset_("A", n, n, &c_b29, &c_b15, &q[iu + (qstart - 1) * *n], n);
+ dlaset_("A", n, n, &c_b29, &c_b15, &q[ivt + (qstart - 1) * *n], n);
+ dlasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &q[ivt + (
+ qstart - 1) * *n], n, &q[iu + (qstart - 1) * *n], n, &q[
+ iu + (qstart - 1) * *n], n, &work[wstart], info);
+ }
+ goto L40;
+ }
+
+ if (icompq == 2) {
+ dlaset_("A", n, n, &c_b29, &c_b15, &u[u_offset], ldu);
+ dlaset_("A", n, n, &c_b29, &c_b15, &vt[vt_offset], ldvt);
+ }
+
+/* Scale. */
+
+ orgnrm = dlanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.) {
+ return 0;
+ }
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, &ierr);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1, &
+ ierr);
+
+ eps = EPSILON;
+
+ mlvl = (integer) (log((doublereal) (*n) / (doublereal) (smlsiz + 1)) /
+ log(2.)) + 1;
+ smlszp = smlsiz + 1;
+
+ if (icompq == 1) {
+ iu = 1;
+ ivt = smlsiz + 1;
+ difl = ivt + smlszp;
+ difr = difl + mlvl;
+ z__ = difr + (mlvl << 1);
+ ic = z__ + mlvl;
+ is = ic + 1;
+ poles = is + 1;
+ givnum = poles + (mlvl << 1);
+
+ k = 1;
+ givptr = 2;
+ perm = 3;
+ givcol = perm + mlvl;
+ }
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = d__[i__], abs(d__1)) < eps) {
+ d__[i__] = d_sign(&eps, &d__[i__]);
+ }
+/* L20: */
+ }
+
+ start = 1;
+ sqre = 0;
+
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = e[i__], abs(d__1)) < eps || i__ == nm1) {
+
+/*
+ Subproblem found. First determine its size and then
+ apply divide and conquer on it.
+*/
+
+ if (i__ < nm1) {
+
+/* A subproblem with E(I) small for I < NM1. */
+
+ nsize = i__ - start + 1;
+ } else if ((d__1 = e[i__], abs(d__1)) >= eps) {
+
+/* A subproblem with E(NM1) not too small but I = NM1. */
+
+ nsize = *n - start + 1;
+ } else {
+
+/*
+ A subproblem with E(NM1) small. This implies an
+ 1-by-1 subproblem at D(N). Solve this 1-by-1 problem
+ first.
+*/
+
+ nsize = i__ - start + 1;
+ if (icompq == 2) {
+ u[*n + *n * u_dim1] = d_sign(&c_b15, &d__[*n]);
+ vt[*n + *n * vt_dim1] = 1.;
+ } else if (icompq == 1) {
+ q[*n + (qstart - 1) * *n] = d_sign(&c_b15, &d__[*n]);
+ q[*n + (smlsiz + qstart - 1) * *n] = 1.;
+ }
+ d__[*n] = (d__1 = d__[*n], abs(d__1));
+ }
+ if (icompq == 2) {
+ dlasd0_(&nsize, &sqre, &d__[start], &e[start], &u[start +
+ start * u_dim1], ldu, &vt[start + start * vt_dim1],
+ ldvt, &smlsiz, &iwork[1], &work[wstart], info);
+ } else {
+ dlasda_(&icompq, &smlsiz, &nsize, &sqre, &d__[start], &e[
+ start], &q[start + (iu + qstart - 2) * *n], n, &q[
+ start + (ivt + qstart - 2) * *n], &iq[start + k * *n],
+ &q[start + (difl + qstart - 2) * *n], &q[start + (
+ difr + qstart - 2) * *n], &q[start + (z__ + qstart -
+ 2) * *n], &q[start + (poles + qstart - 2) * *n], &iq[
+ start + givptr * *n], &iq[start + givcol * *n], n, &
+ iq[start + perm * *n], &q[start + (givnum + qstart -
+ 2) * *n], &q[start + (ic + qstart - 2) * *n], &q[
+ start + (is + qstart - 2) * *n], &work[wstart], &
+ iwork[1], info);
+ }
+ if (*info != 0) {
+ return 0;
+ }
+ start = i__ + 1;
+ }
+/* L30: */
+ }
+
+/* Unscale */
+
+ dlascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, &ierr);
+L40:
+
+/* Use Selection Sort to minimize swaps of singular vectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ kk = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] > p) {
+ kk = j;
+ p = d__[j];
+ }
+/* L50: */
+ }
+ if (kk != i__) {
+ d__[kk] = d__[i__];
+ d__[i__] = p;
+ if (icompq == 1) {
+ iq[i__] = kk;
+ } else if (icompq == 2) {
+ dswap_(n, &u[i__ * u_dim1 + 1], &c__1, &u[kk * u_dim1 + 1], &
+ c__1);
+ dswap_(n, &vt[i__ + vt_dim1], ldvt, &vt[kk + vt_dim1], ldvt);
+ }
+ } else if (icompq == 1) {
+ iq[i__] = i__;
+ }
+/* L60: */
+ }
+
+/* If ICOMPQ = 1, use IQ(N,1) as the indicator for UPLO */
+
+ if (icompq == 1) {
+ if (iuplo == 1) {
+ iq[*n] = 1;
+ } else {
+ iq[*n] = 0;
+ }
+ }
+
+/*
+ If B is lower bidiagonal, update U by those Givens rotations
+ which rotated B to be upper bidiagonal
+*/
+
+ if (iuplo == 2 && icompq == 2) {
+ dlasr_("L", "V", "B", n, n, &work[1], &work[*n], &u[u_offset], ldu);
+ }
+
+ return 0;
+
+/* End of DBDSDC */
+
+} /* dbdsdc_ */
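As a usage illustration for the interface documented above: the fragment below asks DBDSDC for singular values only (COMPQ = 'N') of a small upper-bidiagonal matrix through the f2c-style entry point used throughout this file. It is a sketch, not part of the original sources; it assumes the f2c typedefs from "f2c.h" and linkage against a LAPACK build that provides dbdsdc_, and the workspace sizes follow the WORK/IWORK requirements listed in the comments:

    #include <stdio.h>
    #include "f2c.h"   /* integer, doublereal - the types used in this file */

    extern int dbdsdc_(char *uplo, char *compq, integer *n, doublereal *d__,
                       doublereal *e, doublereal *u, integer *ldu,
                       doublereal *vt, integer *ldvt, doublereal *q,
                       integer *iq, doublereal *work, integer *iwork,
                       integer *info);

    int main(void)
    {
        /* 3-by-3 upper bidiagonal matrix: diagonal d, superdiagonal e. */
        integer n = 3, ldu = 1, ldvt = 1, info = 0;
        doublereal d[3] = { 4., 3., 2. };
        doublereal e[2] = { 1., 1. };
        doublereal work[4 * 3];       /* COMPQ = 'N' needs LWORK >= 4*N */
        integer iwork[8 * 3];         /* IWORK is always 8*N            */
        doublereal udum = 0., vtdum = 0., qdum = 0.;
        integer iqdum = 0;

        dbdsdc_("U", "N", &n, d, e, &udum, &ldu, &vtdum, &ldvt,
                &qdum, &iqdum, work, iwork, &info);

        if (info == 0)   /* d now holds the singular values, largest first */
            printf("singular values: %g %g %g\n", d[0], d[1], d[2]);
        return (int) info;
    }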
+
+/* Subroutine */ int dbdsqr_(char *uplo, integer *n, integer *ncvt, integer *
+ nru, integer *ncc, doublereal *d__, doublereal *e, doublereal *vt,
+ integer *ldvt, doublereal *u, integer *ldu, doublereal *c__, integer *
+ ldc, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static doublereal f, g, h__;
+ static integer i__, j, m;
+ static doublereal r__, cs;
+ static integer ll;
+ static doublereal sn, mu;
+ static integer nm1, nm12, nm13, lll;
+ static doublereal eps, sll, tol, abse;
+ static integer idir;
+ static doublereal abss;
+ static integer oldm;
+ static doublereal cosl;
+ static integer isub, iter;
+ static doublereal unfl, sinl, cosr, smin, smax, sinr;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *), dlas2_(
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *), dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ static doublereal oldcs;
+ extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *);
+ static integer oldll;
+ static doublereal shift, sigmn, oldsn;
+ extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer maxit;
+ static doublereal sminl, sigmx;
+ static logical lower;
+ extern /* Subroutine */ int dlasq1_(integer *, doublereal *, doublereal *,
+ doublereal *, integer *), dlasv2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *);
+
+ extern /* Subroutine */ int dlartg_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *), xerbla_(char *,
+ integer *);
+ static doublereal sminoa, thresh;
+ static logical rotate;
+ static doublereal tolmul;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ January 2007
+
+
+ Purpose
+ =======
+
+ DBDSQR computes the singular values and, optionally, the right and/or
+ left singular vectors from the singular value decomposition (SVD) of
+ a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+ zero-shift QR algorithm. The SVD of B has the form
+
+ B = Q * S * P**T
+
+ where S is the diagonal matrix of singular values, Q is an orthogonal
+ matrix of left singular vectors, and P is an orthogonal matrix of
+ right singular vectors. If left singular vectors are requested, this
+ subroutine actually returns U*Q instead of Q, and, if right singular
+ vectors are requested, this subroutine returns P**T*VT instead of
+ P**T, for given real input matrices U and VT. When U and VT are the
+ orthogonal matrices that reduce a general matrix A to bidiagonal
+ form: A = U*B*VT, as computed by DGEBRD, then
+
+ A = (U*Q) * S * (P**T*VT)
+
+ is the SVD of A. Optionally, the subroutine may also compute Q**T*C
+ for a given real input matrix C.
+
+ See "Computing Small Singular Values of Bidiagonal Matrices With
+ Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+ LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+ no. 5, pp. 873-912, Sept 1990) and
+ "Accurate singular values and differential qd algorithms," by
+ B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+ Department, University of California at Berkeley, July 1992
+ for a detailed description of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': B is upper bidiagonal;
+ = 'L': B is lower bidiagonal.
+
+ N (input) INTEGER
+ The order of the matrix B. N >= 0.
+
+ NCVT (input) INTEGER
+ The number of columns of the matrix VT. NCVT >= 0.
+
+ NRU (input) INTEGER
+ The number of rows of the matrix U. NRU >= 0.
+
+ NCC (input) INTEGER
+ The number of columns of the matrix C. NCC >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the n diagonal elements of the bidiagonal matrix B.
+ On exit, if INFO=0, the singular values of B in decreasing
+ order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the N-1 offdiagonal elements of the bidiagonal
+ matrix B.
+ On exit, if INFO = 0, E is destroyed; if INFO > 0, D and E
+ will contain the diagonal and superdiagonal elements of a
+ bidiagonal matrix orthogonally equivalent to the one given
+ as input.
+
+ VT (input/output) DOUBLE PRECISION array, dimension (LDVT, NCVT)
+ On entry, an N-by-NCVT matrix VT.
+ On exit, VT is overwritten by P**T * VT.
+ Not referenced if NCVT = 0.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT.
+ LDVT >= max(1,N) if NCVT > 0; LDVT >= 1 if NCVT = 0.
+
+ U (input/output) DOUBLE PRECISION array, dimension (LDU, N)
+ On entry, an NRU-by-N matrix U.
+ On exit, U is overwritten by U * Q.
+ Not referenced if NRU = 0.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= max(1,NRU).
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC, NCC)
+ On entry, an N-by-NCC matrix C.
+ On exit, C is overwritten by Q**T * C.
+ Not referenced if NCC = 0.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C.
+ LDC >= max(1,N) if NCC > 0; LDC >=1 if NCC = 0.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: If INFO = -i, the i-th argument had an illegal value
+ > 0:
+ if NCVT = NRU = NCC = 0,
+ = 1, a split was marked by a positive value in E
+ = 2, current block of Z not diagonalized after 30*N
+ iterations (in inner while loop)
+ = 3, termination criterion of outer while loop not met
+ (program created more than N unreduced blocks)
+            else, i.e. NCVT, NRU, or NCC is nonzero,
+ the algorithm did not converge; D and E contain the
+ elements of a bidiagonal matrix which is orthogonally
+ similar to the input matrix B; if INFO = i, i
+ elements of E have not converged to zero.
+
+ Internal Parameters
+ ===================
+
+ TOLMUL DOUBLE PRECISION, default = max(10,min(100,EPS**(-1/8)))
+ TOLMUL controls the convergence criterion of the QR loop.
+ If it is positive, TOLMUL*EPS is the desired relative
+ precision in the computed singular values.
+ If it is negative, abs(TOLMUL*EPS*sigma_max) is the
+ desired absolute accuracy in the computed singular
+ values (corresponds to relative accuracy
+ abs(TOLMUL*EPS) in the largest singular value.
+ abs(TOLMUL) should be between 1 and 1/EPS, and preferably
+ between 10 (for fast convergence) and .1/EPS
+ (for there to be some accuracy in the results).
+ Default is to lose at either one eighth or 2 of the
+ available decimal digits in each computed singular value
+ (whichever is smaller).
+
+ MAXITR INTEGER, default = 6
+ MAXITR controls the maximum number of passes of the
+ algorithm through its inner loop. The algorithms stops
+ (and so fails to converge) if the number of passes
+ through the inner loop exceeds MAXITR*N**2.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ lower = lsame_(uplo, "L");
+ if (! lsame_(uplo, "U") && ! lower) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ncvt < 0) {
+ *info = -3;
+ } else if (*nru < 0) {
+ *info = -4;
+ } else if (*ncc < 0) {
+ *info = -5;
+ } else if (*ncvt == 0 && *ldvt < 1 || *ncvt > 0 && *ldvt < max(1,*n)) {
+ *info = -9;
+ } else if (*ldu < max(1,*nru)) {
+ *info = -11;
+ } else if (*ncc == 0 && *ldc < 1 || *ncc > 0 && *ldc < max(1,*n)) {
+ *info = -13;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DBDSQR", &i__1);
+ return 0;
+ }
+ if (*n == 0) {
+ return 0;
+ }
+ if (*n == 1) {
+ goto L160;
+ }
+
+/* ROTATE is true if any singular vectors desired, false otherwise */
+
+ rotate = *ncvt > 0 || *nru > 0 || *ncc > 0;
+
+/* If no singular vectors desired, use qd algorithm */
+
+ if (! rotate) {
+ dlasq1_(n, &d__[1], &e[1], &work[1], info);
+ return 0;
+ }
+
+ nm1 = *n - 1;
+ nm12 = nm1 + nm1;
+ nm13 = nm12 + nm1;
+ idir = 0;
+
+/* Get machine constants */
+
+ eps = EPSILON;
+ unfl = SAFEMINIMUM;
+
+/*
+ If matrix lower bidiagonal, rotate to be upper bidiagonal
+ by applying Givens rotations on the left
+*/
+
+ if (lower) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ work[i__] = cs;
+ work[nm1 + i__] = sn;
+/* L10: */
+ }
+
+/* Update singular vectors if desired */
+
+ if (*nru > 0) {
+ dlasr_("R", "V", "F", nru, n, &work[1], &work[*n], &u[u_offset],
+ ldu);
+ }
+ if (*ncc > 0) {
+ dlasr_("L", "V", "F", n, ncc, &work[1], &work[*n], &c__[c_offset],
+ ldc);
+ }
+ }
+
+/*
+ Compute singular values to relative accuracy TOL
+ (By setting TOL to be negative, algorithm will compute
+ singular values to absolute accuracy ABS(TOL)*norm(input matrix))
+
+ Computing MAX
+ Computing MIN
+*/
+ d__3 = 100., d__4 = pow_dd(&eps, &c_b94);
+ d__1 = 10., d__2 = min(d__3,d__4);
+ tolmul = max(d__1,d__2);
+ tol = tolmul * eps;
+
+/* Compute approximate maximum, minimum singular values */
+
+ smax = 0.;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__2 = smax, d__3 = (d__1 = d__[i__], abs(d__1));
+ smax = max(d__2,d__3);
+/* L20: */
+ }
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__2 = smax, d__3 = (d__1 = e[i__], abs(d__1));
+ smax = max(d__2,d__3);
+/* L30: */
+ }
+ sminl = 0.;
+ if (tol >= 0.) {
+
+/* Relative accuracy desired */
+
+ sminoa = abs(d__[1]);
+ if (sminoa == 0.) {
+ goto L50;
+ }
+ mu = sminoa;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ mu = (d__2 = d__[i__], abs(d__2)) * (mu / (mu + (d__1 = e[i__ - 1]
+ , abs(d__1))));
+ sminoa = min(sminoa,mu);
+ if (sminoa == 0.) {
+ goto L50;
+ }
+/* L40: */
+ }
+L50:
+ sminoa /= sqrt((doublereal) (*n));
+/* Computing MAX */
+ d__1 = tol * sminoa, d__2 = *n * 6 * *n * unfl;
+ thresh = max(d__1,d__2);
+ } else {
+
+/*
+ Absolute accuracy desired
+
+ Computing MAX
+*/
+ d__1 = abs(tol) * smax, d__2 = *n * 6 * *n * unfl;
+ thresh = max(d__1,d__2);
+ }
+
+/*
+ Prepare for main iteration loop for the singular values
+ (MAXIT is the maximum number of passes through the inner
+ loop permitted before nonconvergence signalled.)
+*/
+
+ maxit = *n * 6 * *n;
+ iter = 0;
+ oldll = -1;
+ oldm = -1;
+
+/* M points to last element of unconverged part of matrix */
+
+ m = *n;
+
+/* Begin main iteration loop */
+
+L60:
+
+/* Check for convergence or exceeding iteration count */
+
+ if (m <= 1) {
+ goto L160;
+ }
+ if (iter > maxit) {
+ goto L200;
+ }
+
+/* Find diagonal block of matrix to work on */
+
+ if (tol < 0. && (d__1 = d__[m], abs(d__1)) <= thresh) {
+ d__[m] = 0.;
+ }
+ smax = (d__1 = d__[m], abs(d__1));
+ smin = smax;
+ i__1 = m - 1;
+ for (lll = 1; lll <= i__1; ++lll) {
+ ll = m - lll;
+ abss = (d__1 = d__[ll], abs(d__1));
+ abse = (d__1 = e[ll], abs(d__1));
+ if (tol < 0. && abss <= thresh) {
+ d__[ll] = 0.;
+ }
+ if (abse <= thresh) {
+ goto L80;
+ }
+ smin = min(smin,abss);
+/* Computing MAX */
+ d__1 = max(smax,abss);
+ smax = max(d__1,abse);
+/* L70: */
+ }
+ ll = 0;
+ goto L90;
+L80:
+ e[ll] = 0.;
+
+/* Matrix splits since E(LL) = 0 */
+
+ if (ll == m - 1) {
+
+/* Convergence of bottom singular value, return to top of loop */
+
+ --m;
+ goto L60;
+ }
+L90:
+ ++ll;
+
+/* E(LL) through E(M-1) are nonzero, E(LL-1) is zero */
+
+ if (ll == m - 1) {
+
+/* 2 by 2 block, handle separately */
+
+ dlasv2_(&d__[m - 1], &e[m - 1], &d__[m], &sigmn, &sigmx, &sinr, &cosr,
+ &sinl, &cosl);
+ d__[m - 1] = sigmx;
+ e[m - 1] = 0.;
+ d__[m] = sigmn;
+
+/* Compute singular vectors, if desired */
+
+ if (*ncvt > 0) {
+ drot_(ncvt, &vt[m - 1 + vt_dim1], ldvt, &vt[m + vt_dim1], ldvt, &
+ cosr, &sinr);
+ }
+ if (*nru > 0) {
+ drot_(nru, &u[(m - 1) * u_dim1 + 1], &c__1, &u[m * u_dim1 + 1], &
+ c__1, &cosl, &sinl);
+ }
+ if (*ncc > 0) {
+ drot_(ncc, &c__[m - 1 + c_dim1], ldc, &c__[m + c_dim1], ldc, &
+ cosl, &sinl);
+ }
+ m += -2;
+ goto L60;
+ }
+
+/*
+ If working on new submatrix, choose shift direction
+ (from larger end diagonal element towards smaller)
+*/
+
+ if (ll > oldm || m < oldll) {
+ if ((d__1 = d__[ll], abs(d__1)) >= (d__2 = d__[m], abs(d__2))) {
+
+/* Chase bulge from top (big end) to bottom (small end) */
+
+ idir = 1;
+ } else {
+
+/* Chase bulge from bottom (big end) to top (small end) */
+
+ idir = 2;
+ }
+ }
+
+/* Apply convergence tests */
+
+ if (idir == 1) {
+
+/*
+ Run convergence test in forward direction
+ First apply standard test to bottom of matrix
+*/
+
+ if ((d__2 = e[m - 1], abs(d__2)) <= abs(tol) * (d__1 = d__[m], abs(
+ d__1)) || tol < 0. && (d__3 = e[m - 1], abs(d__3)) <= thresh)
+ {
+ e[m - 1] = 0.;
+ goto L60;
+ }
+
+ if (tol >= 0.) {
+
+/*
+ If relative accuracy desired,
+ apply convergence criterion forward
+*/
+
+ mu = (d__1 = d__[ll], abs(d__1));
+ sminl = mu;
+ i__1 = m - 1;
+ for (lll = ll; lll <= i__1; ++lll) {
+ if ((d__1 = e[lll], abs(d__1)) <= tol * mu) {
+ e[lll] = 0.;
+ goto L60;
+ }
+ mu = (d__2 = d__[lll + 1], abs(d__2)) * (mu / (mu + (d__1 = e[
+ lll], abs(d__1))));
+ sminl = min(sminl,mu);
+/* L100: */
+ }
+ }
+
+ } else {
+
+/*
+ Run convergence test in backward direction
+ First apply standard test to top of matrix
+*/
+
+ if ((d__2 = e[ll], abs(d__2)) <= abs(tol) * (d__1 = d__[ll], abs(d__1)
+ ) || tol < 0. && (d__3 = e[ll], abs(d__3)) <= thresh) {
+ e[ll] = 0.;
+ goto L60;
+ }
+
+ if (tol >= 0.) {
+
+/*
+ If relative accuracy desired,
+ apply convergence criterion backward
+*/
+
+ mu = (d__1 = d__[m], abs(d__1));
+ sminl = mu;
+ i__1 = ll;
+ for (lll = m - 1; lll >= i__1; --lll) {
+ if ((d__1 = e[lll], abs(d__1)) <= tol * mu) {
+ e[lll] = 0.;
+ goto L60;
+ }
+ mu = (d__2 = d__[lll], abs(d__2)) * (mu / (mu + (d__1 = e[lll]
+ , abs(d__1))));
+ sminl = min(sminl,mu);
+/* L110: */
+ }
+ }
+ }
+ oldll = ll;
+ oldm = m;
+
+/*
+ Compute shift. First, test if shifting would ruin relative
+ accuracy, and if so set the shift to zero.
+
+ Computing MAX
+*/
+ d__1 = eps, d__2 = tol * .01;
+ if (tol >= 0. && *n * tol * (sminl / smax) <= max(d__1,d__2)) {
+
+/* Use a zero shift to avoid loss of relative accuracy */
+
+ shift = 0.;
+ } else {
+
+/* Compute the shift from 2-by-2 block at end of matrix */
+
+ if (idir == 1) {
+ sll = (d__1 = d__[ll], abs(d__1));
+ dlas2_(&d__[m - 1], &e[m - 1], &d__[m], &shift, &r__);
+ } else {
+ sll = (d__1 = d__[m], abs(d__1));
+ dlas2_(&d__[ll], &e[ll], &d__[ll + 1], &shift, &r__);
+ }
+
+/* Test if shift negligible, and if so set to zero */
+
+ if (sll > 0.) {
+/* Computing 2nd power */
+ d__1 = shift / sll;
+ if (d__1 * d__1 < eps) {
+ shift = 0.;
+ }
+ }
+ }
+
+/* Increment iteration count */
+
+ iter = iter + m - ll;
+
+/* If SHIFT = 0, do simplified QR iteration */
+
+ if (shift == 0.) {
+ if (idir == 1) {
+
+/*
+ Chase bulge from top to bottom
+ Save cosines and sines for later singular vector updates
+*/
+
+ cs = 1.;
+ oldcs = 1.;
+ i__1 = m - 1;
+ for (i__ = ll; i__ <= i__1; ++i__) {
+ d__1 = d__[i__] * cs;
+ dlartg_(&d__1, &e[i__], &cs, &sn, &r__);
+ if (i__ > ll) {
+ e[i__ - 1] = oldsn * r__;
+ }
+ d__1 = oldcs * r__;
+ d__2 = d__[i__ + 1] * sn;
+ dlartg_(&d__1, &d__2, &oldcs, &oldsn, &d__[i__]);
+ work[i__ - ll + 1] = cs;
+ work[i__ - ll + 1 + nm1] = sn;
+ work[i__ - ll + 1 + nm12] = oldcs;
+ work[i__ - ll + 1 + nm13] = oldsn;
+/* L120: */
+ }
+ h__ = d__[m] * cs;
+ d__[m] = h__ * oldcs;
+ e[m - 1] = h__ * oldsn;
+
+/* Update singular vectors */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "F", &i__1, ncvt, &work[1], &work[*n], &vt[
+ ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("R", "V", "F", nru, &i__1, &work[nm12 + 1], &work[nm13
+ + 1], &u[ll * u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "F", &i__1, ncc, &work[nm12 + 1], &work[nm13
+ + 1], &c__[ll + c_dim1], ldc);
+ }
+
+/* Test convergence */
+
+ if ((d__1 = e[m - 1], abs(d__1)) <= thresh) {
+ e[m - 1] = 0.;
+ }
+
+ } else {
+
+/*
+ Chase bulge from bottom to top
+ Save cosines and sines for later singular vector updates
+*/
+
+ cs = 1.;
+ oldcs = 1.;
+ i__1 = ll + 1;
+ for (i__ = m; i__ >= i__1; --i__) {
+ d__1 = d__[i__] * cs;
+ dlartg_(&d__1, &e[i__ - 1], &cs, &sn, &r__);
+ if (i__ < m) {
+ e[i__] = oldsn * r__;
+ }
+ d__1 = oldcs * r__;
+ d__2 = d__[i__ - 1] * sn;
+ dlartg_(&d__1, &d__2, &oldcs, &oldsn, &d__[i__]);
+ work[i__ - ll] = cs;
+ work[i__ - ll + nm1] = -sn;
+ work[i__ - ll + nm12] = oldcs;
+ work[i__ - ll + nm13] = -oldsn;
+/* L130: */
+ }
+ h__ = d__[ll] * cs;
+ d__[ll] = h__ * oldcs;
+ e[ll] = h__ * oldsn;
+
+/* Update singular vectors */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "B", &i__1, ncvt, &work[nm12 + 1], &work[
+ nm13 + 1], &vt[ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("R", "V", "B", nru, &i__1, &work[1], &work[*n], &u[ll *
+ u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "B", &i__1, ncc, &work[1], &work[*n], &c__[
+ ll + c_dim1], ldc);
+ }
+
+/* Test convergence */
+
+ if ((d__1 = e[ll], abs(d__1)) <= thresh) {
+ e[ll] = 0.;
+ }
+ }
+ } else {
+
+/* Use nonzero shift */
+
+ if (idir == 1) {
+
+/*
+ Chase bulge from top to bottom
+ Save cosines and sines for later singular vector updates
+*/
+
+ f = ((d__1 = d__[ll], abs(d__1)) - shift) * (d_sign(&c_b15, &d__[
+ ll]) + shift / d__[ll]);
+ g = e[ll];
+ i__1 = m - 1;
+ for (i__ = ll; i__ <= i__1; ++i__) {
+ dlartg_(&f, &g, &cosr, &sinr, &r__);
+ if (i__ > ll) {
+ e[i__ - 1] = r__;
+ }
+ f = cosr * d__[i__] + sinr * e[i__];
+ e[i__] = cosr * e[i__] - sinr * d__[i__];
+ g = sinr * d__[i__ + 1];
+ d__[i__ + 1] = cosr * d__[i__ + 1];
+ dlartg_(&f, &g, &cosl, &sinl, &r__);
+ d__[i__] = r__;
+ f = cosl * e[i__] + sinl * d__[i__ + 1];
+ d__[i__ + 1] = cosl * d__[i__ + 1] - sinl * e[i__];
+ if (i__ < m - 1) {
+ g = sinl * e[i__ + 1];
+ e[i__ + 1] = cosl * e[i__ + 1];
+ }
+ work[i__ - ll + 1] = cosr;
+ work[i__ - ll + 1 + nm1] = sinr;
+ work[i__ - ll + 1 + nm12] = cosl;
+ work[i__ - ll + 1 + nm13] = sinl;
+/* L140: */
+ }
+ e[m - 1] = f;
+
+/* Update singular vectors */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "F", &i__1, ncvt, &work[1], &work[*n], &vt[
+ ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("R", "V", "F", nru, &i__1, &work[nm12 + 1], &work[nm13
+ + 1], &u[ll * u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "F", &i__1, ncc, &work[nm12 + 1], &work[nm13
+ + 1], &c__[ll + c_dim1], ldc);
+ }
+
+/* Test convergence */
+
+ if ((d__1 = e[m - 1], abs(d__1)) <= thresh) {
+ e[m - 1] = 0.;
+ }
+
+ } else {
+
+/*
+ Chase bulge from bottom to top
+ Save cosines and sines for later singular vector updates
+*/
+
+ f = ((d__1 = d__[m], abs(d__1)) - shift) * (d_sign(&c_b15, &d__[m]
+ ) + shift / d__[m]);
+ g = e[m - 1];
+ i__1 = ll + 1;
+ for (i__ = m; i__ >= i__1; --i__) {
+ dlartg_(&f, &g, &cosr, &sinr, &r__);
+ if (i__ < m) {
+ e[i__] = r__;
+ }
+ f = cosr * d__[i__] + sinr * e[i__ - 1];
+ e[i__ - 1] = cosr * e[i__ - 1] - sinr * d__[i__];
+ g = sinr * d__[i__ - 1];
+ d__[i__ - 1] = cosr * d__[i__ - 1];
+ dlartg_(&f, &g, &cosl, &sinl, &r__);
+ d__[i__] = r__;
+ f = cosl * e[i__ - 1] + sinl * d__[i__ - 1];
+ d__[i__ - 1] = cosl * d__[i__ - 1] - sinl * e[i__ - 1];
+ if (i__ > ll + 1) {
+ g = sinl * e[i__ - 2];
+ e[i__ - 2] = cosl * e[i__ - 2];
+ }
+ work[i__ - ll] = cosr;
+ work[i__ - ll + nm1] = -sinr;
+ work[i__ - ll + nm12] = cosl;
+ work[i__ - ll + nm13] = -sinl;
+/* L150: */
+ }
+ e[ll] = f;
+
+/* Test convergence */
+
+ if ((d__1 = e[ll], abs(d__1)) <= thresh) {
+ e[ll] = 0.;
+ }
+
+/* Update singular vectors if desired */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "B", &i__1, ncvt, &work[nm12 + 1], &work[
+ nm13 + 1], &vt[ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("R", "V", "B", nru, &i__1, &work[1], &work[*n], &u[ll *
+ u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ dlasr_("L", "V", "B", &i__1, ncc, &work[1], &work[*n], &c__[
+ ll + c_dim1], ldc);
+ }
+ }
+ }
+
+/* QR iteration finished, go back and check convergence */
+
+ goto L60;
+
+/* All singular values converged, so make them positive */
+
+L160:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (d__[i__] < 0.) {
+ d__[i__] = -d__[i__];
+
+/* Change sign of singular vectors, if desired */
+
+ if (*ncvt > 0) {
+ dscal_(ncvt, &c_b151, &vt[i__ + vt_dim1], ldvt);
+ }
+ }
+/* L170: */
+ }
+
+/*
+ Sort the singular values into decreasing order (insertion sort on
+ singular values, but only one transposition per singular vector)
+*/
+
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Scan for smallest D(I) */
+
+ isub = 1;
+ smin = d__[1];
+ i__2 = *n + 1 - i__;
+ for (j = 2; j <= i__2; ++j) {
+ if (d__[j] <= smin) {
+ isub = j;
+ smin = d__[j];
+ }
+/* L180: */
+ }
+ if (isub != *n + 1 - i__) {
+
+/* Swap singular values and vectors */
+
+ d__[isub] = d__[*n + 1 - i__];
+ d__[*n + 1 - i__] = smin;
+ if (*ncvt > 0) {
+ dswap_(ncvt, &vt[isub + vt_dim1], ldvt, &vt[*n + 1 - i__ +
+ vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ dswap_(nru, &u[isub * u_dim1 + 1], &c__1, &u[(*n + 1 - i__) *
+ u_dim1 + 1], &c__1);
+ }
+ if (*ncc > 0) {
+ dswap_(ncc, &c__[isub + c_dim1], ldc, &c__[*n + 1 - i__ +
+ c_dim1], ldc);
+ }
+ }
+/* L190: */
+ }
+ goto L220;
+
+/* Maximum number of iterations exceeded, failure to converge */
+
+L200:
+ *info = 0;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.) {
+ ++(*info);
+ }
+/* L210: */
+ }
+L220:
+ return 0;
+
+/* End of DBDSQR */
+
+} /* dbdsqr_ */
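+
+/*
+   A minimal illustrative sketch (not from the reference LAPACK text): the
+   bulge-chasing sweeps above are assembled from plane rotations produced by
+   dlartg_ and applied in batches by dlasr_.  Assuming the f2c typedefs
+   (doublereal) declared earlier in this file, the DLARTG contract is:
+
+       doublereal f = 3., g = 4., cs, sn, r;
+       dlartg_(&f, &g, &cs, &sn, &r);
+       --  now   cs*f + sn*g == r     (here r == 5.)
+       --  and  -sn*f + cs*g == 0,
+       --  i.e. [ cs  sn; -sn  cs ] rotates (f, g) onto the first axis.
+
+   The cosines and sines saved in WORK during each sweep are exactly these
+   (cs, sn) pairs; dlasr_ later applies the same rotation sequence to the
+   rows of VT and C and to the columns of U, so the singular vectors stay
+   consistent with the updated bidiagonal matrix.
+*/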
+
+/* Subroutine */ int dgebak_(char *job, char *side, integer *n, integer *ilo,
+ integer *ihi, doublereal *scale, integer *m, doublereal *v, integer *
+ ldv, integer *info)
+{
+ /* System generated locals */
+ integer v_dim1, v_offset, i__1;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal s;
+ static integer ii;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static logical leftv;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical rightv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGEBAK forms the right or left eigenvectors of a real general matrix
+ by backward transformation on the computed eigenvectors of the
+ balanced matrix output by DGEBAL.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the type of backward transformation required:
+ = 'N', do nothing, return immediately;
+ = 'P', do backward transformation for permutation only;
+ = 'S', do backward transformation for scaling only;
+ = 'B', do backward transformations for both permutation and
+ scaling.
+ JOB must be the same as the argument JOB supplied to DGEBAL.
+
+ SIDE (input) CHARACTER*1
+ = 'R': V contains right eigenvectors;
+ = 'L': V contains left eigenvectors.
+
+ N (input) INTEGER
+ The number of rows of the matrix V. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ The integers ILO and IHI determined by DGEBAL.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ SCALE (input) DOUBLE PRECISION array, dimension (N)
+ Details of the permutation and scaling factors, as returned
+ by DGEBAL.
+
+ M (input) INTEGER
+ The number of columns of the matrix V. M >= 0.
+
+ V (input/output) DOUBLE PRECISION array, dimension (LDV,M)
+ On entry, the matrix of right or left eigenvectors to be
+ transformed, as returned by DHSEIN or DTREVC.
+ On exit, V is overwritten by the transformed eigenvectors.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V. LDV >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Decode and Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --scale;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+
+ /* Function Body */
+ rightv = lsame_(side, "R");
+ leftv = lsame_(side, "L");
+
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (! rightv && ! leftv) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -7;
+ } else if (*ldv < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEBAK", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*m == 0) {
+ return 0;
+ }
+ if (lsame_(job, "N")) {
+ return 0;
+ }
+
+ if (*ilo == *ihi) {
+ goto L30;
+ }
+
+/* Backward balance */
+
+ if (lsame_(job, "S") || lsame_(job, "B")) {
+
+ if (rightv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = scale[i__];
+ dscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L10: */
+ }
+ }
+
+ if (leftv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = 1. / scale[i__];
+ dscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L20: */
+ }
+ }
+
+ }
+
+/*
+ Backward permutation
+
+ For I = ILO-1 step -1 until 1,
+ IHI+1 step 1 until N do --
+*/
+
+L30:
+ if (lsame_(job, "P") || lsame_(job, "B")) {
+ if (rightv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L40;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = (integer) scale[i__];
+ if (k == i__) {
+ goto L40;
+ }
+ dswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L40:
+ ;
+ }
+ }
+
+ if (leftv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L50;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = (integer) scale[i__];
+ if (k == i__) {
+ goto L50;
+ }
+ dswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L50:
+ ;
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DGEBAK */
+
+} /* dgebak_ */
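+
+/*
+   A minimal illustrative call sketch for dgebak_, assuming the f2c typedefs
+   used in this file; the names n, ilo, ihi, scale, m, v, ldv and info are
+   hypothetical caller-side variables, not part of this source:
+
+       dgebak_("B", "R", &n, &ilo, &ihi, scale, &m, v, &ldv, &info);
+
+   JOB ("B" here) must match the JOB passed to dgebal_, SIDE selects right
+   ('R') or left ('L') eigenvectors, and V holds the eigenvectors of the
+   balanced matrix on entry and the back-transformed eigenvectors on exit.
+   dgeev_ later in this file uses exactly this pattern to undo the
+   balancing of VL and VR.
+*/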
+
+/* Subroutine */ int dgebal_(char *job, integer *n, doublereal *a, integer *
+ lda, integer *ilo, integer *ihi, doublereal *scale, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal c__, f, g;
+ static integer i__, j, k, l, m;
+ static doublereal r__, s, ca, ra;
+ static integer ica, ira, iexc;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static doublereal sfmin1, sfmin2, sfmax1, sfmax2;
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern logical disnan_(doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical noconv;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DGEBAL balances a general real matrix A. This involves, first,
+ permuting A by a similarity transformation to isolate eigenvalues
+ in the first 1 to ILO-1 and last IHI+1 to N elements on the
+ diagonal; and second, applying a diagonal similarity transformation
+ to rows and columns ILO to IHI to make the rows and columns as
+ close in norm as possible. Both steps are optional.
+
+ Balancing may reduce the 1-norm of the matrix, and improve the
+ accuracy of the computed eigenvalues and/or eigenvectors.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the operations to be performed on A:
+ = 'N': none: simply set ILO = 1, IHI = N, SCALE(I) = 1.0
+ for i = 1,...,N;
+ = 'P': permute only;
+ = 'S': scale only;
+ = 'B': both permute and scale.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the input matrix A.
+ On exit, A is overwritten by the balanced matrix.
+ If JOB = 'N', A is not referenced.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ ILO (output) INTEGER
+ IHI (output) INTEGER
+ ILO and IHI are set to integers such that on exit
+          A(i,j) = 0 if i > j and j = 1,...,ILO-1 or i = IHI+1,...,N.
+ If JOB = 'N' or 'S', ILO = 1 and IHI = N.
+
+ SCALE (output) DOUBLE PRECISION array, dimension (N)
+ Details of the permutations and scaling factors applied to
+ A. If P(j) is the index of the row and column interchanged
+ with row and column j and D(j) is the scaling factor
+ applied to row and column j, then
+ SCALE(j) = P(j) for j = 1,...,ILO-1
+ = D(j) for j = ILO,...,IHI
+ = P(j) for j = IHI+1,...,N.
+ The order in which the interchanges are made is N to IHI+1,
+ then 1 to ILO-1.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The permutations consist of row and column interchanges which put
+ the matrix in the form
+
+ ( T1 X Y )
+ P A P = ( 0 B Z )
+ ( 0 0 T2 )
+
+ where T1 and T2 are upper triangular matrices whose eigenvalues lie
+ along the diagonal. The column indices ILO and IHI mark the starting
+ and ending columns of the submatrix B. Balancing consists of applying
+ a diagonal similarity transformation inv(D) * B * D to make the
+ 1-norms of each row of B and its corresponding column nearly equal.
+ The output matrix is
+
+ ( T1 X*D Y )
+ ( 0 inv(D)*B*D inv(D)*Z ).
+ ( 0 0 T2 )
+
+ Information about the permutations P and the diagonal matrix D is
+ returned in the vector SCALE.
+
+ This subroutine is based on the EISPACK routine BALANC.
+
+ Modified by Tzu-Yi Chen, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --scale;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEBAL", &i__1);
+ return 0;
+ }
+
+ k = 1;
+ l = *n;
+
+ if (*n == 0) {
+ goto L210;
+ }
+
+ if (lsame_(job, "N")) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scale[i__] = 1.;
+/* L10: */
+ }
+ goto L210;
+ }
+
+ if (lsame_(job, "S")) {
+ goto L120;
+ }
+
+/* Permutation to isolate eigenvalues if possible */
+
+ goto L50;
+
+/* Row and column exchange. */
+
+L20:
+ scale[m] = (doublereal) j;
+ if (j == m) {
+ goto L30;
+ }
+
+ dswap_(&l, &a[j * a_dim1 + 1], &c__1, &a[m * a_dim1 + 1], &c__1);
+ i__1 = *n - k + 1;
+ dswap_(&i__1, &a[j + k * a_dim1], lda, &a[m + k * a_dim1], lda);
+
+L30:
+ switch (iexc) {
+ case 1: goto L40;
+ case 2: goto L80;
+ }
+
+/* Search for rows isolating an eigenvalue and push them down. */
+
+L40:
+ if (l == 1) {
+ goto L210;
+ }
+ --l;
+
+L50:
+ for (j = l; j >= 1; --j) {
+
+ i__1 = l;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ == j) {
+ goto L60;
+ }
+ if (a[j + i__ * a_dim1] != 0.) {
+ goto L70;
+ }
+L60:
+ ;
+ }
+
+ m = l;
+ iexc = 1;
+ goto L20;
+L70:
+ ;
+ }
+
+ goto L90;
+
+/* Search for columns isolating an eigenvalue and push them left. */
+
+L80:
+ ++k;
+
+L90:
+ i__1 = l;
+ for (j = k; j <= i__1; ++j) {
+
+ i__2 = l;
+ for (i__ = k; i__ <= i__2; ++i__) {
+ if (i__ == j) {
+ goto L100;
+ }
+ if (a[i__ + j * a_dim1] != 0.) {
+ goto L110;
+ }
+L100:
+ ;
+ }
+
+ m = k;
+ iexc = 2;
+ goto L20;
+L110:
+ ;
+ }
+
+L120:
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ scale[i__] = 1.;
+/* L130: */
+ }
+
+ if (lsame_(job, "P")) {
+ goto L210;
+ }
+
+/*
+ Balance the submatrix in rows K to L.
+
+ Iterative loop for norm reduction
+*/
+
+ sfmin1 = SAFEMINIMUM / PRECISION;
+ sfmax1 = 1. / sfmin1;
+ sfmin2 = sfmin1 * 2.;
+ sfmax2 = 1. / sfmin2;
+L140:
+ noconv = FALSE_;
+
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ c__ = 0.;
+ r__ = 0.;
+
+ i__2 = l;
+ for (j = k; j <= i__2; ++j) {
+ if (j == i__) {
+ goto L150;
+ }
+ c__ += (d__1 = a[j + i__ * a_dim1], abs(d__1));
+ r__ += (d__1 = a[i__ + j * a_dim1], abs(d__1));
+L150:
+ ;
+ }
+ ica = idamax_(&l, &a[i__ * a_dim1 + 1], &c__1);
+ ca = (d__1 = a[ica + i__ * a_dim1], abs(d__1));
+ i__2 = *n - k + 1;
+ ira = idamax_(&i__2, &a[i__ + k * a_dim1], lda);
+ ra = (d__1 = a[i__ + (ira + k - 1) * a_dim1], abs(d__1));
+
+/* Guard against zero C or R due to underflow. */
+
+ if (c__ == 0. || r__ == 0.) {
+ goto L200;
+ }
+ g = r__ / 2.;
+ f = 1.;
+ s = c__ + r__;
+L160:
+/* Computing MAX */
+ d__1 = max(f,c__);
+/* Computing MIN */
+ d__2 = min(r__,g);
+ if (c__ >= g || max(d__1,ca) >= sfmax2 || min(d__2,ra) <= sfmin2) {
+ goto L170;
+ }
+ d__1 = c__ + f + ca + r__ + g + ra;
+ if (disnan_(&d__1)) {
+
+/* Exit if NaN to avoid infinite loop */
+
+ *info = -3;
+ i__2 = -(*info);
+ xerbla_("DGEBAL", &i__2);
+ return 0;
+ }
+ f *= 2.;
+ c__ *= 2.;
+ ca *= 2.;
+ r__ /= 2.;
+ g /= 2.;
+ ra /= 2.;
+ goto L160;
+
+L170:
+ g = c__ / 2.;
+L180:
+/* Computing MIN */
+ d__1 = min(f,c__), d__1 = min(d__1,g);
+ if (g < r__ || max(r__,ra) >= sfmax2 || min(d__1,ca) <= sfmin2) {
+ goto L190;
+ }
+ f /= 2.;
+ c__ /= 2.;
+ g /= 2.;
+ ca /= 2.;
+ r__ *= 2.;
+ ra *= 2.;
+ goto L180;
+
+/* Now balance. */
+
+L190:
+ if (c__ + r__ >= s * .95) {
+ goto L200;
+ }
+ if (f < 1. && scale[i__] < 1.) {
+ if (f * scale[i__] <= sfmin1) {
+ goto L200;
+ }
+ }
+ if (f > 1. && scale[i__] > 1.) {
+ if (scale[i__] >= sfmax1 / f) {
+ goto L200;
+ }
+ }
+ g = 1. / f;
+ scale[i__] *= f;
+ noconv = TRUE_;
+
+ i__2 = *n - k + 1;
+ dscal_(&i__2, &g, &a[i__ + k * a_dim1], lda);
+ dscal_(&l, &f, &a[i__ * a_dim1 + 1], &c__1);
+
+L200:
+ ;
+ }
+
+ if (noconv) {
+ goto L140;
+ }
+
+L210:
+ *ilo = k;
+ *ihi = l;
+
+ return 0;
+
+/* End of DGEBAL */
+
+} /* dgebal_ */
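+
+/*
+   A minimal illustrative pairing of dgebal_ and dgebak_, assuming the f2c
+   typedefs declared earlier in this file; a, v, scale and the scalars are
+   hypothetical caller-side names.  This mirrors the sequence used by
+   dgeev_ later in this file (balance, reduce and iterate, then undo the
+   balancing on the eigenvectors):
+
+       integer n = 4, lda = 4, ldv = 4, m = 4, ilo, ihi, info;
+       doublereal a[16], v[16], scale[4];
+
+       ... fill a (column-major) ...
+       dgebal_("B", &n, a, &lda, &ilo, &ihi, scale, &info);
+       ... compute eigenvectors of the balanced matrix into v ...
+       dgebak_("B", "R", &n, &ilo, &ihi, scale, &m, v, &ldv, &info);
+
+   On exit from dgebal_, SCALE(j) records the permutation index for
+   j < ILO or j > IHI and the diagonal scaling factor for ILO <= j <= IHI,
+   which is exactly the information dgebak_ needs to reverse both steps.
+*/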
+
+/* Subroutine */ int dgebd2_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal *
+ taup, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *), dlarfg_(integer *, doublereal *,
+ doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGEBD2 reduces a real general m by n matrix A to upper or lower
+ bidiagonal form B by an orthogonal transformation: Q' * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the orthogonal matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the orthogonal matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+ TAUQ (output) DOUBLE PRECISION array dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (max(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
+ u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("DGEBD2", &i__1);
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ dlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ *
+ a_dim1], &c__1, &tauq[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.;
+
+/* Apply H(i) to A(i:m,i+1:n) from the left */
+
+ if (i__ < *n) {
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ dlarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &
+ tauq[i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]
+ );
+ }
+ a[i__ + i__ * a_dim1] = d__[i__];
+
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector G(i) to annihilate
+ A(i,i+2:n)
+*/
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + (i__ + 1) * a_dim1], &a[i__ + min(
+ i__3,*n) * a_dim1], lda, &taup[i__]);
+ e[i__] = a[i__ + (i__ + 1) * a_dim1];
+ a[i__ + (i__ + 1) * a_dim1] = 1.;
+
+/* Apply G(i) to A(i+1:m,i+1:n) from the right */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ dlarf_("Right", &i__2, &i__3, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &taup[i__], &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &work[1]);
+ a[i__ + (i__ + 1) * a_dim1] = e[i__];
+ } else {
+ taup[i__] = 0.;
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector G(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ dlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3,*n) *
+ a_dim1], lda, &taup[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.;
+
+/* Apply G(i) to A(i+1:m,i:n) from the right */
+
+ if (i__ < *m) {
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ dlarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &
+ taup[i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ a[i__ + i__ * a_dim1] = d__[i__];
+
+ if (i__ < *m) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:m,i)
+*/
+
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*m) +
+ i__ * a_dim1], &c__1, &tauq[i__]);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.;
+
+/* Apply H(i) to A(i+1:m,i+1:n) from the left */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ dlarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &
+ c__1, &tauq[i__], &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &work[1]);
+ a[i__ + 1 + i__ * a_dim1] = e[i__];
+ } else {
+ tauq[i__] = 0.;
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of DGEBD2 */
+
+} /* dgebd2_ */
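+
+/*
+   A minimal illustrative call sketch for dgebd2_ with m = 6, n = 5; the
+   shapes follow the Arguments section above, the f2c typedefs come from
+   elsewhere in this file, and the array names are hypothetical:
+
+       integer m = 6, n = 5, lda = 6, info;
+       doublereal a[30];                 -- 6 x 5, column-major, lda = 6
+       doublereal d[5], e[4];            -- min(m,n) and min(m,n)-1
+       doublereal tauq[5], taup[5];      -- min(m,n) each
+       doublereal work[6];               -- max(m,n)
+
+       ... fill a ...
+       dgebd2_(&m, &n, a, &lda, d, e, tauq, taup, work, &info);
+
+   Afterwards d and e hold the bidiagonal matrix B, while the strictly
+   lower and upper parts of a, together with tauq and taup, encode the
+   elementary reflectors of Q and P as described under Further Details.
+*/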
+
+/* Subroutine */ int dgebrd_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal *
+ taup, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, nb, nx;
+ static doublereal ws;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer nbmin, iinfo, minmn;
+ extern /* Subroutine */ int dgebd2_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *), dlabrd_(integer *, integer *, integer *
+ , doublereal *, integer *, doublereal *, doublereal *, doublereal
+ *, doublereal *, doublereal *, integer *, doublereal *, integer *)
+ , xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwrkx, ldwrky, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGEBRD reduces a general real M-by-N matrix A to upper or lower
+ bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the M-by-N general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the orthogonal matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the orthogonal matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+ TAUQ (output) DOUBLE PRECISION array dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,M,N).
+ For optimum performance LWORK >= (M+N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
+ u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MAX */
+ i__1 = 1, i__2 = ilaenv_(&c__1, "DGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = max(i__1,i__2);
+ lwkopt = (*m + *n) * nb;
+ work[1] = (doublereal) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = max(1,*m);
+ if (*lwork < max(i__1,*n) && ! lquery) {
+ *info = -10;
+ }
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("DGEBRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ minmn = min(*m,*n);
+ if (minmn == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ ws = (doublereal) max(*m,*n);
+ ldwrkx = *m;
+ ldwrky = *n;
+
+ if (nb > 1 && nb < minmn) {
+
+/*
+ Set the crossover point NX.
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "DGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+
+/* Determine when to switch from blocked to unblocked code. */
+
+ if (nx < minmn) {
+ ws = (doublereal) ((*m + *n) * nb);
+ if ((doublereal) (*lwork) < ws) {
+
+/*
+ Not enough work space for the optimal NB, consider using
+ a smaller block size.
+*/
+
+ nbmin = ilaenv_(&c__2, "DGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ if (*lwork >= (*m + *n) * nbmin) {
+ nb = *lwork / (*m + *n);
+ } else {
+ nb = 1;
+ nx = minmn;
+ }
+ }
+ }
+ } else {
+ nx = minmn;
+ }
+
+ i__1 = minmn - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+
+/*
+ Reduce rows and columns i:i+nb-1 to bidiagonal form and return
+ the matrices X and Y which are needed to update the unreduced
+ part of the matrix
+*/
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ + 1;
+ dlabrd_(&i__3, &i__4, &nb, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[
+ i__], &tauq[i__], &taup[i__], &work[1], &ldwrkx, &work[ldwrkx
+ * nb + 1], &ldwrky);
+
+/*
+ Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
+ of the form A := A - V*Y' - X*U'
+*/
+
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ dgemm_("No transpose", "Transpose", &i__3, &i__4, &nb, &c_b151, &a[
+ i__ + nb + i__ * a_dim1], lda, &work[ldwrkx * nb + nb + 1], &
+ ldwrky, &c_b15, &a[i__ + nb + (i__ + nb) * a_dim1], lda);
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ dgemm_("No transpose", "No transpose", &i__3, &i__4, &nb, &c_b151, &
+ work[nb + 1], &ldwrkx, &a[i__ + (i__ + nb) * a_dim1], lda, &
+ c_b15, &a[i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/* Copy diagonal and off-diagonal elements of B back into A */
+
+ if (*m >= *n) {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j + j * a_dim1] = d__[j];
+ a[j + (j + 1) * a_dim1] = e[j];
+/* L10: */
+ }
+ } else {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j + j * a_dim1] = d__[j];
+ a[j + 1 + j * a_dim1] = e[j];
+/* L20: */
+ }
+ }
+/* L30: */
+ }
+
+/* Use unblocked code to reduce the remainder of the matrix */
+
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ dgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], &
+ tauq[i__], &taup[i__], &work[1], &iinfo);
+ work[1] = ws;
+ return 0;
+
+/* End of DGEBRD */
+
+} /* dgebrd_ */
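+
+/*
+   A minimal illustrative sketch of the LWORK = -1 workspace-query pattern
+   described in the Arguments section of dgebrd_ above.  The f2c typedefs
+   come from this file; m, n, lda, a, d, e, tauq and taup are hypothetical
+   caller-side names with the shapes given in the Arguments section, and
+   malloc/free come from <stdlib.h>:
+
+       integer lwork = -1, info;
+       doublereal wkopt, *work;
+
+       -- 1) query: the optimal LWORK is returned in WORK(1)
+       dgebrd_(&m, &n, a, &lda, d, e, tauq, taup, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+
+       -- 2) actual blocked reduction
+       dgebrd_(&m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info);
+       free(work);
+
+   With enough workspace ((M+N)*NB) the routine takes the blocked dlabrd_
+   path above; otherwise it reduces the block size or falls back to the
+   unblocked dgebd2_.
+*/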
+
+/* Subroutine */ int dgeev_(char *jobvl, char *jobvr, integer *n, doublereal *
+ a, integer *lda, doublereal *wr, doublereal *wi, doublereal *vl,
+ integer *ldvl, doublereal *vr, integer *ldvr, doublereal *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal r__, cs, sn;
+ static integer ihi;
+ static doublereal scl;
+ static integer ilo;
+ static doublereal dum[1], eps;
+ static integer ibal;
+ static char side[1];
+ static doublereal anrm;
+ static integer ierr, itau;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static integer iwrk, nout;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern doublereal dlapy2_(doublereal *, doublereal *);
+ extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dgebak_(
+ char *, char *, integer *, integer *, integer *, doublereal *,
+ integer *, doublereal *, integer *, integer *),
+ dgebal_(char *, integer *, doublereal *, integer *, integer *,
+ integer *, doublereal *, integer *);
+ static logical scalea;
+
+ static doublereal cscale;
+ extern doublereal dlange_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *);
+ extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *), dlascl_(char *, integer *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *);
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dlacpy_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *),
+ dlartg_(doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *), xerbla_(char *, integer *);
+ static logical select[1];
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static doublereal bignum;
+ extern /* Subroutine */ int dorghr_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *), dhseqr_(char *, char *, integer *, integer *, integer
+ *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, integer *), dtrevc_(char *, char *, logical *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, doublereal *, integer *);
+ static integer minwrk, maxwrk;
+ static logical wantvl;
+ static doublereal smlnum;
+ static integer hswork;
+ static logical lquery, wantvr;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGEEV computes for an N-by-N real nonsymmetric matrix A, the
+ eigenvalues and, optionally, the left and/or right eigenvectors.
+
+ The right eigenvector v(j) of A satisfies
+ A * v(j) = lambda(j) * v(j)
+ where lambda(j) is its eigenvalue.
+ The left eigenvector u(j) of A satisfies
+ u(j)**H * A = lambda(j) * u(j)**H
+ where u(j)**H denotes the conjugate transpose of u(j).
+
+ The computed eigenvectors are normalized to have Euclidean norm
+ equal to 1 and largest component real.
+
+ Arguments
+ =========
+
+ JOBVL (input) CHARACTER*1
+ = 'N': left eigenvectors of A are not computed;
+ = 'V': left eigenvectors of A are computed.
+
+ JOBVR (input) CHARACTER*1
+ = 'N': right eigenvectors of A are not computed;
+ = 'V': right eigenvectors of A are computed.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the N-by-N matrix A.
+ On exit, A has been overwritten.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ WR (output) DOUBLE PRECISION array, dimension (N)
+ WI (output) DOUBLE PRECISION array, dimension (N)
+ WR and WI contain the real and imaginary parts,
+ respectively, of the computed eigenvalues. Complex
+ conjugate pairs of eigenvalues appear consecutively
+ with the eigenvalue having the positive imaginary part
+ first.
+
+ VL (output) DOUBLE PRECISION array, dimension (LDVL,N)
+ If JOBVL = 'V', the left eigenvectors u(j) are stored one
+ after another in the columns of VL, in the same order
+ as their eigenvalues.
+ If JOBVL = 'N', VL is not referenced.
+ If the j-th eigenvalue is real, then u(j) = VL(:,j),
+ the j-th column of VL.
+ If the j-th and (j+1)-st eigenvalues form a complex
+ conjugate pair, then u(j) = VL(:,j) + i*VL(:,j+1) and
+ u(j+1) = VL(:,j) - i*VL(:,j+1).
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1; if
+ JOBVL = 'V', LDVL >= N.
+
+ VR (output) DOUBLE PRECISION array, dimension (LDVR,N)
+ If JOBVR = 'V', the right eigenvectors v(j) are stored one
+ after another in the columns of VR, in the same order
+ as their eigenvalues.
+ If JOBVR = 'N', VR is not referenced.
+ If the j-th eigenvalue is real, then v(j) = VR(:,j),
+ the j-th column of VR.
+ If the j-th and (j+1)-st eigenvalues form a complex
+ conjugate pair, then v(j) = VR(:,j) + i*VR(:,j+1) and
+ v(j+1) = VR(:,j) - i*VR(:,j+1).
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1; if
+ JOBVR = 'V', LDVR >= N.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,3*N), and
+ if JOBVL = 'V' or JOBVR = 'V', LWORK >= 4*N. For good
+ performance, LWORK must generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = i, the QR algorithm failed to compute all the
+ eigenvalues, and no eigenvectors have been computed;
+ elements i+1:N of WR and WI contain eigenvalues which
+ have converged.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --wr;
+ --wi;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1;
+ wantvl = lsame_(jobvl, "V");
+ wantvr = lsame_(jobvr, "V");
+ if (! wantvl && ! lsame_(jobvl, "N")) {
+ *info = -1;
+ } else if (! wantvr && ! lsame_(jobvr, "N")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldvl < 1 || wantvl && *ldvl < *n) {
+ *info = -9;
+ } else if (*ldvr < 1 || wantvr && *ldvr < *n) {
+ *info = -11;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.
+ HSWORK refers to the workspace preferred by DHSEQR, as
+ calculated below. HSWORK is computed assuming ILO=1 and IHI=N,
+ the worst case.)
+*/
+
+ if (*info == 0) {
+ if (*n == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ } else {
+ maxwrk = (*n << 1) + *n * ilaenv_(&c__1, "DGEHRD", " ", n, &c__1,
+ n, &c__0, (ftnlen)6, (ftnlen)1);
+ if (wantvl) {
+ minwrk = *n << 2;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+ "DORGHR", " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ dhseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &wr[1], &wi[
+ 1], &vl[vl_offset], ldvl, &work[1], &c_n1, info);
+ hswork = (integer) work[1];
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + 1, i__1 = max(i__1,i__2), i__2 = *
+ n + hswork;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n << 2;
+ maxwrk = max(i__1,i__2);
+ } else if (wantvr) {
+ minwrk = *n << 2;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+ "DORGHR", " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ dhseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &wr[1], &wi[
+ 1], &vr[vr_offset], ldvr, &work[1], &c_n1, info);
+ hswork = (integer) work[1];
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + 1, i__1 = max(i__1,i__2), i__2 = *
+ n + hswork;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n << 2;
+ maxwrk = max(i__1,i__2);
+ } else {
+ minwrk = *n * 3;
+ dhseqr_("E", "N", n, &c__1, n, &a[a_offset], lda, &wr[1], &wi[
+ 1], &vr[vr_offset], ldvr, &work[1], &c_n1, info);
+ hswork = (integer) work[1];
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + 1, i__1 = max(i__1,i__2), i__2 = *
+ n + hswork;
+ maxwrk = max(i__1,i__2);
+ }
+ maxwrk = max(maxwrk,minwrk);
+ }
+ work[1] = (doublereal) maxwrk;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -13;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEEV ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = PRECISION;
+ smlnum = SAFEMINIMUM;
+ bignum = 1. / smlnum;
+ dlabad_(&smlnum, &bignum);
+ smlnum = sqrt(smlnum) / eps;
+ bignum = 1. / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = dlange_("M", n, n, &a[a_offset], lda, dum);
+ scalea = FALSE_;
+ if (anrm > 0. && anrm < smlnum) {
+ scalea = TRUE_;
+ cscale = smlnum;
+ } else if (anrm > bignum) {
+ scalea = TRUE_;
+ cscale = bignum;
+ }
+ if (scalea) {
+ dlascl_("G", &c__0, &c__0, &anrm, &cscale, n, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+/*
+ Balance the matrix
+ (Workspace: need N)
+*/
+
+ ibal = 1;
+ dgebal_("B", n, &a[a_offset], lda, &ilo, &ihi, &work[ibal], &ierr);
+
+/*
+ Reduce to upper Hessenberg form
+ (Workspace: need 3*N, prefer 2*N+N*NB)
+*/
+
+ itau = ibal + *n;
+ iwrk = itau + *n;
+ i__1 = *lwork - iwrk + 1;
+ dgehrd_(n, &ilo, &ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1,
+ &ierr);
+
+ if (wantvl) {
+
+/*
+ Want left eigenvectors
+ Copy Householder vectors to VL
+*/
+
+ *(unsigned char *)side = 'L';
+	dlacpy_("L", n, n, &a[a_offset], lda, &vl[vl_offset], ldvl);
+
+/*
+ Generate orthogonal matrix in VL
+ (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ dorghr_(n, &ilo, &ihi, &vl[vl_offset], ldvl, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VL
+ (Workspace: need N+1, prefer N+HSWORK (see comments) )
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ dhseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &wr[1], &wi[1], &
+ vl[vl_offset], ldvl, &work[iwrk], &i__1, info);
+
+ if (wantvr) {
+
+/*
+ Want left and right eigenvectors
+ Copy Schur vectors to VR
+*/
+
+ *(unsigned char *)side = 'B';
+ dlacpy_("F", n, n, &vl[vl_offset], ldvl, &vr[vr_offset], ldvr);
+ }
+
+ } else if (wantvr) {
+
+/*
+ Want right eigenvectors
+ Copy Householder vectors to VR
+*/
+
+ *(unsigned char *)side = 'R';
+	dlacpy_("L", n, n, &a[a_offset], lda, &vr[vr_offset], ldvr);
+
+/*
+ Generate orthogonal matrix in VR
+ (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ dorghr_(n, &ilo, &ihi, &vr[vr_offset], ldvr, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VR
+ (Workspace: need N+1, prefer N+HSWORK (see comments) )
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ dhseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &wr[1], &wi[1], &
+ vr[vr_offset], ldvr, &work[iwrk], &i__1, info);
+
+ } else {
+
+/*
+ Compute eigenvalues only
+ (Workspace: need N+1, prefer N+HSWORK (see comments) )
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ dhseqr_("E", "N", n, &ilo, &ihi, &a[a_offset], lda, &wr[1], &wi[1], &
+ vr[vr_offset], ldvr, &work[iwrk], &i__1, info);
+ }
+
+/* If INFO > 0 from DHSEQR, then quit */
+
+ if (*info > 0) {
+ goto L50;
+ }
+
+ if (wantvl || wantvr) {
+
+/*
+ Compute left and/or right eigenvectors
+ (Workspace: need 4*N)
+*/
+
+ dtrevc_(side, "B", select, n, &a[a_offset], lda, &vl[vl_offset], ldvl,
+ &vr[vr_offset], ldvr, n, &nout, &work[iwrk], &ierr);
+ }
+
+ if (wantvl) {
+
+/*
+ Undo balancing of left eigenvectors
+ (Workspace: need N)
+*/
+
+ dgebak_("B", "L", n, &ilo, &ihi, &work[ibal], n, &vl[vl_offset], ldvl,
+ &ierr);
+
+/* Normalize left eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (wi[i__] == 0.) {
+ scl = 1. / dnrm2_(n, &vl[i__ * vl_dim1 + 1], &c__1);
+ dscal_(n, &scl, &vl[i__ * vl_dim1 + 1], &c__1);
+ } else if (wi[i__] > 0.) {
+ d__1 = dnrm2_(n, &vl[i__ * vl_dim1 + 1], &c__1);
+ d__2 = dnrm2_(n, &vl[(i__ + 1) * vl_dim1 + 1], &c__1);
+ scl = 1. / dlapy2_(&d__1, &d__2);
+ dscal_(n, &scl, &vl[i__ * vl_dim1 + 1], &c__1);
+ dscal_(n, &scl, &vl[(i__ + 1) * vl_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+/* Computing 2nd power */
+ d__1 = vl[k + i__ * vl_dim1];
+/* Computing 2nd power */
+ d__2 = vl[k + (i__ + 1) * vl_dim1];
+ work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
+/* L10: */
+ }
+ k = idamax_(n, &work[iwrk], &c__1);
+ dlartg_(&vl[k + i__ * vl_dim1], &vl[k + (i__ + 1) * vl_dim1],
+ &cs, &sn, &r__);
+ drot_(n, &vl[i__ * vl_dim1 + 1], &c__1, &vl[(i__ + 1) *
+ vl_dim1 + 1], &c__1, &cs, &sn);
+ vl[k + (i__ + 1) * vl_dim1] = 0.;
+ }
+/* L20: */
+ }
+ }
+
+ if (wantvr) {
+
+/*
+ Undo balancing of right eigenvectors
+ (Workspace: need N)
+*/
+
+ dgebak_("B", "R", n, &ilo, &ihi, &work[ibal], n, &vr[vr_offset], ldvr,
+ &ierr);
+
+/* Normalize right eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (wi[i__] == 0.) {
+ scl = 1. / dnrm2_(n, &vr[i__ * vr_dim1 + 1], &c__1);
+ dscal_(n, &scl, &vr[i__ * vr_dim1 + 1], &c__1);
+ } else if (wi[i__] > 0.) {
+ d__1 = dnrm2_(n, &vr[i__ * vr_dim1 + 1], &c__1);
+ d__2 = dnrm2_(n, &vr[(i__ + 1) * vr_dim1 + 1], &c__1);
+ scl = 1. / dlapy2_(&d__1, &d__2);
+ dscal_(n, &scl, &vr[i__ * vr_dim1 + 1], &c__1);
+ dscal_(n, &scl, &vr[(i__ + 1) * vr_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+/* Computing 2nd power */
+ d__1 = vr[k + i__ * vr_dim1];
+/* Computing 2nd power */
+ d__2 = vr[k + (i__ + 1) * vr_dim1];
+ work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
+/* L30: */
+ }
+ k = idamax_(n, &work[iwrk], &c__1);
+ dlartg_(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1],
+ &cs, &sn, &r__);
+ drot_(n, &vr[i__ * vr_dim1 + 1], &c__1, &vr[(i__ + 1) *
+ vr_dim1 + 1], &c__1, &cs, &sn);
+ vr[k + (i__ + 1) * vr_dim1] = 0.;
+ }
+/* L40: */
+ }
+ }
+
+/* Undo scaling if necessary */
+
+L50:
+ if (scalea) {
+ i__1 = *n - *info;
+/* Computing MAX */
+ i__3 = *n - *info;
+ i__2 = max(i__3,1);
+ dlascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wr[*info +
+ 1], &i__2, &ierr);
+ i__1 = *n - *info;
+/* Computing MAX */
+ i__3 = *n - *info;
+ i__2 = max(i__3,1);
+ dlascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wi[*info +
+ 1], &i__2, &ierr);
+ if (*info > 0) {
+ i__1 = ilo - 1;
+ dlascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wr[1],
+ n, &ierr);
+ i__1 = ilo - 1;
+ dlascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wi[1],
+ n, &ierr);
+ }
+ }
+
+ work[1] = (doublereal) maxwrk;
+ return 0;
+
+/* End of DGEEV */
+
+} /* dgeev_ */
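+
+/*
+   A minimal illustrative driver-level sketch for dgeev_, following the
+   Arguments section above; the f2c typedefs come from this file, all
+   variable names are hypothetical caller-side names, and malloc/free come
+   from <stdlib.h>:
+
+       integer n = 3, lda = 3, ldvl = 1, ldvr = 3, lwork = -1, info;
+       doublereal a[9];                  -- 3 x 3, column-major
+       doublereal wr[3], wi[3];          -- real/imaginary eigenvalue parts
+       doublereal vl_dummy[1], vr[9];    -- VL unreferenced when JOBVL = 'N'
+       doublereal wkopt, *work;
+
+       ... fill a ...
+       -- workspace query, then the actual call
+       dgeev_("N", "V", &n, a, &lda, wr, wi, vl_dummy, &ldvl,
+              vr, &ldvr, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+       dgeev_("N", "V", &n, a, &lda, wr, wi, vl_dummy, &ldvl,
+              vr, &ldvr, work, &lwork, &info);
+       free(work);
+
+   Complex conjugate eigenvalue pairs appear consecutively in (wr, wi) with
+   the positive imaginary part first, and the corresponding eigenvectors are
+   stored as VR(:,j) +/- i*VR(:,j+1) as described above.
+*/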
+
+/* Subroutine */ int dgehd2_(integer *n, integer *ilo, integer *ihi,
+ doublereal *a, integer *lda, doublereal *tau, doublereal *work,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal aii;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *), dlarfg_(integer *, doublereal *,
+ doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGEHD2 reduces a real general matrix A to upper Hessenberg form H by
+ an orthogonal similarity transformation: Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to DGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= max(1,N).
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the n by n general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the orthogonal matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) DOUBLE PRECISION array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEHD2", &i__1);
+ return 0;
+ }
+
+ i__1 = *ihi - 1;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+
+/* Compute elementary reflector H(i) to annihilate A(i+2:ihi,i) */
+
+ i__2 = *ihi - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &tau[i__]);
+ aii = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.;
+
+/* Apply H(i) to A(1:ihi,i+1:ihi) from the right */
+
+ i__2 = *ihi - i__;
+ dlarf_("Right", ihi, &i__2, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &a[(i__ + 1) * a_dim1 + 1], lda, &work[1]);
+
+/* Apply H(i) to A(i+1:ihi,i+1:n) from the left */
+
+ i__2 = *ihi - i__;
+ i__3 = *n - i__;
+ dlarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + 1 + (i__ + 1) * a_dim1], lda, &work[1]);
+
+ a[i__ + 1 + i__ * a_dim1] = aii;
+/* L10: */
+ }
+
+ return 0;
+
+/* End of DGEHD2 */
+
+} /* dgehd2_ */
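+
+/*
+   The reflectors produced by dgehd2_ (and by dgehrd_ below) have the form
+   H(i) = I - tau * v * v' described under Further Details.  A minimal
+   sketch of applying one such reflector to a vector x of length len,
+   assuming the f2c typedefs from this file (v, x, tau and len are
+   hypothetical names):
+
+       integer k;
+       doublereal vtx = 0.;
+       for (k = 0; k < len; ++k) {
+           vtx += v[k] * x[k];           -- v' * x
+       }
+       for (k = 0; k < len; ++k) {
+           x[k] -= tau * vtx * v[k];     -- x := (I - tau*v*v') * x
+       }
+
+   dlarf_ performs the matrix version of this update, which is how H(i)
+   is applied to blocks of A in the loops above.
+*/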
+
+/* Subroutine */ int dgehrd_(integer *n, integer *ilo, integer *ihi,
+ doublereal *a, integer *lda, doublereal *tau, doublereal *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal t[4160] /* was [65][64] */;
+ static integer ib;
+ static doublereal ei;
+ static integer nb, nh, nx, iws;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *), daxpy_(
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *), dgehd2_(integer *, integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *), dlahr2_(
+ integer *, integer *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *),
+ dlarfb_(char *, char *, char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+ DGEHRD reduces a real general matrix A to upper Hessenberg form H by
+ an orthogonal similarity transformation: Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to DGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the N-by-N general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the orthogonal matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) DOUBLE PRECISION array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
+ zero.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This file is a slight modification of LAPACK-3.0's DGEHRD
+ subroutine incorporating improvements proposed by Quintana-Orti and
+ Van de Geijn (2006). (See DLAHR2.)
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
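+
+/*
+   Illustrative calling sketch (not part of LAPACK; the caller-side
+   declarations of n, ilo, ihi, a, lda, tau and work are assumed, and
+   malloc requires <stdlib.h>).  A typical caller first performs a
+   workspace query with LWORK = -1, then allocates WORK of the returned
+   size and runs the reduction:
+
+       doublereal wkopt;
+       integer lwork = -1, info;
+       dgehrd_(&n, &ilo, &ihi, a, &lda, tau, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+       dgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+
+   On success (info == 0) the Hessenberg matrix H overwrites the upper
+   triangle and first subdiagonal of A, as described above.
+*/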
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MIN */
+ i__1 = 64, i__2 = ilaenv_(&c__1, "DGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ lwkopt = *n * nb;
+ work[1] = (doublereal) lwkopt;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEHRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
+
+ i__1 = *ilo - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ tau[i__] = 0.;
+/* L10: */
+ }
+ i__1 = *n - 1;
+ for (i__ = max(1,*ihi); i__ <= i__1; ++i__) {
+ tau[i__] = 0.;
+/* L20: */
+ }
+
+/* Quick return if possible */
+
+ nh = *ihi - *ilo + 1;
+ if (nh <= 1) {
+ work[1] = 1.;
+ return 0;
+ }
+
+/*
+ Determine the block size
+
+ Computing MIN
+*/
+ i__1 = 64, i__2 = ilaenv_(&c__1, "DGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ nbmin = 2;
+ iws = 1;
+ if (nb > 1 && nb < nh) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code)
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "DGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < nh) {
+
+/* Determine if workspace is large enough for blocked code */
+
+ iws = *n * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code
+
+ Computing MAX
+*/
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DGEHRD", " ", n, ilo, ihi, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ if (*lwork >= *n * nbmin) {
+ nb = *lwork / *n;
+ } else {
+ nb = 1;
+ }
+ }
+ }
+ }
+ ldwork = *n;
+
+ if (nb < nbmin || nb >= nh) {
+
+/* Use unblocked code below */
+
+ i__ = *ilo;
+
+ } else {
+
+/* Use blocked code */
+
+ i__1 = *ihi - 1 - nx;
+ i__2 = nb;
+ for (i__ = *ilo; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *ihi - i__;
+ ib = min(i__3,i__4);
+
+/*
+ Reduce columns i:i+ib-1 to Hessenberg form, returning the
+ matrices V and T of the block reflector H = I - V*T*V'
+ which performs the reduction, and also the matrix Y = A*V*T
+*/
+
+ dlahr2_(ihi, &i__, &ib, &a[i__ * a_dim1 + 1], lda, &tau[i__], t, &
+ c__65, &work[1], &ldwork);
+
+/*
+ Apply the block reflector H to A(1:ihi,i+ib:ihi) from the
+ right, computing A := A - Y * V'. V(i+ib,ib-1) must be set
+ to 1
+*/
+
+ ei = a[i__ + ib + (i__ + ib - 1) * a_dim1];
+ a[i__ + ib + (i__ + ib - 1) * a_dim1] = 1.;
+ i__3 = *ihi - i__ - ib + 1;
+ dgemm_("No transpose", "Transpose", ihi, &i__3, &ib, &c_b151, &
+ work[1], &ldwork, &a[i__ + ib + i__ * a_dim1], lda, &
+ c_b15, &a[(i__ + ib) * a_dim1 + 1], lda);
+ a[i__ + ib + (i__ + ib - 1) * a_dim1] = ei;
+
+/*
+ Apply the block reflector H to A(1:i,i+1:i+ib-1) from the
+ right
+*/
+
+ i__3 = ib - 1;
+ dtrmm_("Right", "Lower", "Transpose", "Unit", &i__, &i__3, &c_b15,
+ &a[i__ + 1 + i__ * a_dim1], lda, &work[1], &ldwork);
+ i__3 = ib - 2;
+ for (j = 0; j <= i__3; ++j) {
+ daxpy_(&i__, &c_b151, &work[ldwork * j + 1], &c__1, &a[(i__ +
+ j + 1) * a_dim1 + 1], &c__1);
+/* L30: */
+ }
+
+/*
+ Apply the block reflector H to A(i+1:ihi,i+ib:n) from the
+ left
+*/
+
+ i__3 = *ihi - i__;
+ i__4 = *n - i__ - ib + 1;
+ dlarfb_("Left", "Transpose", "Forward", "Columnwise", &i__3, &
+ i__4, &ib, &a[i__ + 1 + i__ * a_dim1], lda, t, &c__65, &a[
+ i__ + 1 + (i__ + ib) * a_dim1], lda, &work[1], &ldwork);
+/* L40: */
+ }
+ }
+
+/* Use unblocked code to reduce the rest of the matrix */
+
+ dgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo);
+ work[1] = (doublereal) iws;
+
+ return 0;
+
+/* End of DGEHRD */
+
+} /* dgehrd_ */
+
+/* Subroutine */ int dgelq2_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublereal *tau, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal aii;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *), dlarfg_(integer *, doublereal *,
+ doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DGELQ2 computes an LQ factorization of a real m by n matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m by min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k) . . . H(2) H(1), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
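+
+/*
+   Illustrative calling sketch (not part of LAPACK; m, n, a, lda, tau
+   and a WORK array of length at least M are assumed caller-side
+   declarations).  This unblocked routine takes no LWORK argument and
+   supports no workspace query:
+
+       integer info;
+       dgelq2_(&m, &n, a, &lda, tau, work, &info);
+
+   The blocked driver DGELQF below calls this routine on each block;
+   calling it directly is normally only worthwhile for small matrices.
+*/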
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGELQ2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ dlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3,*n) * a_dim1]
+ , lda, &tau[i__]);
+ if (i__ < *m) {
+
+/* Apply H(i) to A(i+1:m,i:n) from the right */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.;
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ dlarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[
+ i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of DGELQ2 */
+
+} /* dgelq2_ */
+
+/* Subroutine */ int dgelqf_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int dgelq2_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *), dlarfb_(char *,
+ char *, char *, char *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGELQF computes an LQ factorization of a real M-by-N matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m-by-min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k) . . . H(2) H(1), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
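+
+/*
+   Illustrative calling sketch (not part of LAPACK; m, n, a, lda, tau
+   and work are assumed caller-side declarations, and malloc requires
+   <stdlib.h>).  As with the other blocked factorizations, a query with
+   LWORK = -1 returns the optimal workspace size in WORK(1):
+
+       doublereal wkopt;
+       integer lwork = -1, info;
+       dgelqf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+       dgelqf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+
+   Afterwards L occupies the lower trapezoid of A and Q is stored
+   implicitly as the elementary reflectors described above.
+*/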
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "DGELQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *m * nb;
+ work[1] = (doublereal) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGELQF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "DGELQF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DGELQF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the LQ factorization of the current block
+ A(i:i+ib-1,i:n)
+*/
+
+ i__3 = *n - i__ + 1;
+ dgelq2_(&ib, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *n - i__ + 1;
+ dlarft_("Forward", "Rowwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i+ib:m,i:n) from the right */
+
+ i__3 = *m - i__ - ib + 1;
+ i__4 = *n - i__ + 1;
+ dlarfb_("Right", "No transpose", "Forward", "Rowwise", &i__3,
+ &i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib +
+ 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ dgelq2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1] = (doublereal) iws;
+ return 0;
+
+/* End of DGELQF */
+
+} /* dgelqf_ */
+
+/* Subroutine */ int dgelsd_(integer *m, integer *n, integer *nrhs,
+ doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *
+ s, doublereal *rcond, integer *rank, doublereal *work, integer *lwork,
+ integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer ie, il, mm;
+ static doublereal eps, anrm, bnrm;
+ static integer itau, nlvl, iascl, ibscl;
+ static doublereal sfmin;
+ static integer minmn, maxmn, itaup, itauq, mnthr, nwork;
+ extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dgebrd_(
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *);
+ extern doublereal dlamch_(char *), dlange_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *);
+ extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *, integer *),
+ dlalsd_(char *, integer *, integer *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *, integer *), dlascl_(char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ integer *, doublereal *, integer *, integer *), dgeqrf_(
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *), dlacpy_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static doublereal bignum;
+ extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, integer *);
+ static integer wlalsd;
+ extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *);
+ static integer liwork, minwrk, maxwrk;
+ static doublereal smlnum;
+ static logical lquery;
+ static integer smlsiz;
+
+
+/*
+ -- LAPACK driver routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DGELSD computes the minimum-norm solution to a real linear least
+ squares problem:
+ minimize 2-norm(| b - A*x |)
+ using the singular value decomposition (SVD) of A. A is an M-by-N
+ matrix which may be rank-deficient.
+
+ Several right hand side vectors b and solution vectors x can be
+ handled in a single call; they are stored as the columns of the
+ M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+ matrix X.
+
+ The problem is solved in three steps:
+ (1) Reduce the coefficient matrix A to bidiagonal form with
+ Householder transformations, reducing the original problem
+ into a "bidiagonal least squares problem" (BLS)
+ (2) Solve the BLS using a divide and conquer approach.
+    (3) Apply back all the Householder transformations to solve
+ the original least squares problem.
+
+ The effective rank of A is determined by treating as zero those
+ singular values which are less than RCOND times the largest singular
+ value.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrices B and X. NRHS >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, A has been destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
+ On entry, the M-by-NRHS right hand side matrix B.
+ On exit, B is overwritten by the N-by-NRHS solution
+ matrix X. If m >= n and RANK = n, the residual
+ sum-of-squares for the solution in the i-th column is given
+ by the sum of squares of elements n+1:m in that column.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,max(M,N)).
+
+ S (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The singular values of A in decreasing order.
+ The condition number of A in the 2-norm = S(1)/S(min(m,n)).
+
+ RCOND (input) DOUBLE PRECISION
+ RCOND is used to determine the effective rank of A.
+ Singular values S(i) <= RCOND*S(1) are treated as zero.
+ If RCOND < 0, machine precision is used instead.
+
+ RANK (output) INTEGER
+ The effective rank of A, i.e., the number of singular values
+ which are greater than RCOND*S(1).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK must be at least 1.
+ The exact minimum amount of workspace needed depends on M,
+ N and NRHS. As long as LWORK is at least
+ 12*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2,
+ if M is greater than or equal to N or
+ 12*M + 2*M*SMLSIZ + 8*M*NLVL + M*NRHS + (SMLSIZ+1)**2,
+ if M is less than N, the code will execute correctly.
+ SMLSIZ is returned by ILAENV and is equal to the maximum
+ size of the subproblems at the bottom of the computation
+ tree (usually about 25), and
+ NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+ For good performance, LWORK should generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ IWORK (workspace) INTEGER array, dimension (MAX(1,LIWORK))
+ LIWORK >= max(1, 3 * MINMN * NLVL + 11 * MINMN),
+ where MINMN = MIN( M,N ).
+ On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: the algorithm for computing the SVD failed to converge;
+ if INFO = i, i off-diagonal elements of an intermediate
+ bidiagonal form did not converge to zero.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input arguments.
+*/
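+
+/*
+   Illustrative calling sketch (not part of LAPACK; m, n, nrhs, a, lda,
+   b, ldb, s, work and an IWORK array sized as documented above are
+   assumed caller-side declarations).  Passing RCOND < 0 selects
+   machine precision as the rank threshold:
+
+       doublereal wkopt, rcond = -1.;
+       integer lwork = -1, rank, info;
+       dgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, &rank,
+               &wkopt, &lwork, iwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+       dgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, &rank,
+               work, &lwork, iwork, &info);
+
+   On exit the first N rows of B hold the minimum-norm solution X and
+   S holds the singular values of A.
+*/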
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --s;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ maxmn = max(*m,*n);
+ mnthr = ilaenv_(&c__6, "DGELSD", " ", m, n, nrhs, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldb < max(1,maxmn)) {
+ *info = -7;
+ }
+
+ smlsiz = ilaenv_(&c__9, "DGELSD", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+
+/*
+ Compute workspace.
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.)
+*/
+
+ minwrk = 1;
+ liwork = 1;
+ minmn = max(1,minmn);
+/* Computing MAX */
+ i__1 = (integer) (log((doublereal) minmn / (doublereal) (smlsiz + 1)) /
+ log(2.)) + 1;
+ nlvl = max(i__1,0);
+
+ if (*info == 0) {
+ maxwrk = 0;
+ liwork = minmn * 3 * nlvl + minmn * 11;
+ mm = *m;
+ if (*m >= *n && *m >= mnthr) {
+
+/* Path 1a - overdetermined, with many more rows than columns. */
+
+ mm = *n;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + *n * ilaenv_(&c__1, "DGEQRF", " ", m,
+ n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + *nrhs * ilaenv_(&c__1, "DORMQR", "LT",
+ m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+ }
+ if (*m >= *n) {
+
+/*
+ Path 1 - overdetermined or exactly determined.
+
+ Computing MAX
+*/
+ i__1 = maxwrk, i__2 = *n * 3 + (mm + *n) * ilaenv_(&c__1, "DGEBRD"
+ , " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * 3 + *nrhs * ilaenv_(&c__1, "DORMBR",
+ "QLT", &mm, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * 3 + (*n - 1) * ilaenv_(&c__1, "DORMBR",
+ "PLN", n, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing 2nd power */
+ i__1 = smlsiz + 1;
+ wlalsd = *n * 9 + (*n << 1) * smlsiz + (*n << 3) * nlvl + *n * *
+ nrhs + i__1 * i__1;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * 3 + wlalsd;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = *n * 3 + mm, i__2 = *n * 3 + *nrhs, i__1 = max(i__1,i__2),
+ i__2 = *n * 3 + wlalsd;
+ minwrk = max(i__1,i__2);
+ }
+ if (*n > *m) {
+/* Computing 2nd power */
+ i__1 = smlsiz + 1;
+ wlalsd = *m * 9 + (*m << 1) * smlsiz + (*m << 3) * nlvl + *m * *
+ nrhs + i__1 * i__1;
+ if (*n >= mnthr) {
+
+/*
+ Path 2a - underdetermined, with many more columns
+ than rows.
+*/
+
+ maxwrk = *m + *m * ilaenv_(&c__1, "DGELQF", " ", m, n, &c_n1,
+ &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) *
+ ilaenv_(&c__1, "DGEBRD", " ", m, m, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs * ilaenv_(&
+ c__1, "DORMBR", "QLT", m, nrhs, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) *
+ ilaenv_(&c__1, "DORMBR", "PLN", m, nrhs, m, &c_n1, (
+ ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ if (*nrhs > 1) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+ } else {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 1);
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m + *nrhs * ilaenv_(&c__1, "DORMLQ",
+ "LT", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + wlalsd;
+ maxwrk = max(i__1,i__2);
+/*
+ XXX: Ensure the Path 2a case below is triggered. The workspace
+ calculation should use queries for all routines eventually.
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+ max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4);
+ maxwrk = max(i__1,i__2);
+ } else {
+
+/* Path 2 - remaining underdetermined cases. */
+
+ maxwrk = *m * 3 + (*n + *m) * ilaenv_(&c__1, "DGEBRD", " ", m,
+ n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * 3 + *nrhs * ilaenv_(&c__1, "DORMBR"
+ , "QLT", m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR",
+ "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * 3 + wlalsd;
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = *m * 3 + *nrhs, i__2 = *m * 3 + *m, i__1 = max(i__1,i__2),
+ i__2 = *m * 3 + wlalsd;
+ minwrk = max(i__1,i__2);
+ }
+ minwrk = min(minwrk,maxwrk);
+ work[1] = (doublereal) maxwrk;
+ iwork[1] = liwork;
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGELSD", &i__1);
+ return 0;
+ } else if (lquery) {
+ goto L10;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ *rank = 0;
+ return 0;
+ }
+
+/* Get machine parameters. */
+
+ eps = PRECISION;
+ sfmin = SAFEMINIMUM;
+ smlnum = sfmin / eps;
+ bignum = 1. / smlnum;
+ dlabad_(&smlnum, &bignum);
+
+/* Scale A if max entry outside range [SMLNUM,BIGNUM]. */
+
+ anrm = dlange_("M", m, n, &a[a_offset], lda, &work[1]);
+ iascl = 0;
+ if (anrm > 0. && anrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM. */
+
+ dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 1;
+ } else if (anrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 2;
+ } else if (anrm == 0.) {
+
+/* Matrix all zero. Return zero solution. */
+
+ i__1 = max(*m,*n);
+ dlaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+ dlaset_("F", &minmn, &c__1, &c_b29, &c_b29, &s[1], &c__1);
+ *rank = 0;
+ goto L10;
+ }
+
+/* Scale B if max entry outside range [SMLNUM,BIGNUM]. */
+
+ bnrm = dlange_("M", m, nrhs, &b[b_offset], ldb, &work[1]);
+ ibscl = 0;
+ if (bnrm > 0. && bnrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM. */
+
+ dlascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 1;
+ } else if (bnrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ dlascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 2;
+ }
+
+/* If M < N make sure certain entries of B are zero. */
+
+ if (*m < *n) {
+ i__1 = *n - *m;
+ dlaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1], ldb);
+ }
+
+/* Overdetermined case. */
+
+ if (*m >= *n) {
+
+/* Path 1 - overdetermined or exactly determined. */
+
+ mm = *m;
+ if (*m >= mnthr) {
+
+/* Path 1a - overdetermined, with many more rows than columns. */
+
+ mm = *n;
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R.
+ (Workspace: need 2*N, prefer N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+
+/*
+ Multiply B by transpose(Q).
+ (Workspace: need N+NRHS, prefer N+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormqr_("L", "T", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below R. */
+
+ if (*n > 1) {
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ dlaset_("L", &i__1, &i__2, &c_b29, &c_b29, &a[a_dim1 + 2],
+ lda);
+ }
+ }
+
+ ie = 1;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A.
+ (Workspace: need 3*N+MM, prefer 3*N+(MM+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(&mm, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of R.
+ (Workspace: need 3*N+NRHS, prefer 3*N+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "T", &mm, nrhs, n, &a[a_offset], lda, &work[itauq],
+ &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ dlalsd_("U", &smlsiz, n, nrhs, &s[1], &work[ie], &b[b_offset], ldb,
+ rcond, rank, &work[nwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of R. */
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], &
+ b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max(
+ i__1,*nrhs), i__2 = *n - *m * 3, i__1 = max(i__1,i__2);
+ if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,wlalsd)) {
+
+/*
+ Path 2a - underdetermined, with many more columns than rows
+ and sufficient workspace for an efficient algorithm.
+*/
+
+ ldwork = *m;
+/*
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+ max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda +
+ *m + *m * *nrhs, i__1 = max(i__1,i__2), i__2 = (*m << 2)
+ + *m * *lda + wlalsd;
+ if (*lwork >= max(i__1,i__2)) {
+ ldwork = *lda;
+ }
+ itau = 1;
+ nwork = *m + 1;
+
+/*
+ Compute A=L*Q.
+ (Workspace: need 2*M, prefer M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+ il = nwork;
+
+/* Copy L to WORK(IL), zeroing out above its diagonal. */
+
+ dlacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ dlaset_("U", &i__1, &i__2, &c_b29, &c_b29, &work[il + ldwork], &
+ ldwork);
+ ie = il + ldwork * *m;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL).
+ (Workspace: need M*M+5*M, prefer M*M+4*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(m, m, &work[il], &ldwork, &s[1], &work[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of L.
+ (Workspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "T", m, nrhs, m, &work[il], &ldwork, &work[
+ itauq], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ dlalsd_("U", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of L. */
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[
+ itaup], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below first M rows of B. */
+
+ i__1 = *n - *m;
+ dlaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1],
+ ldb);
+ nwork = itau + *m;
+
+/*
+ Multiply transpose(Q) by B.
+ (Workspace: need M+NRHS, prefer M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormlq_("L", "T", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else {
+
+/* Path 2 - remaining underdetermined cases. */
+
+ ie = 1;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A.
+ (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors.
+ (Workspace: need 3*M+NRHS, prefer 3*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "T", m, nrhs, n, &a[a_offset], lda, &work[itauq]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ dlalsd_("L", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of A. */
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ }
+ }
+
+/* Undo scaling. */
+
+ if (iascl == 1) {
+ dlascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb,
+ info);
+ dlascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ } else if (iascl == 2) {
+ dlascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb,
+ info);
+ dlascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ }
+ if (ibscl == 1) {
+ dlascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ } else if (ibscl == 2) {
+ dlascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ }
+
+L10:
+ work[1] = (doublereal) maxwrk;
+ iwork[1] = liwork;
+ return 0;
+
+/* End of DGELSD */
+
+} /* dgelsd_ */
+
+/* Subroutine */ int dgeqr2_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublereal *tau, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal aii;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *), dlarfg_(integer *, doublereal *,
+ doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DGEQR2 computes a QR factorization of a real m by n matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(m,n) by n upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
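+
+/*
+   Illustrative calling sketch (not part of LAPACK; m, n, a, lda, tau
+   and a WORK array of length at least N are assumed caller-side
+   declarations).  Like DGELQ2, this unblocked routine has no LWORK
+   argument:
+
+       integer info;
+       dgeqr2_(&m, &n, a, &lda, tau, work, &info);
+
+   The blocked driver DGEQRF below calls this routine on each panel;
+   direct use is normally only worthwhile for small matrices.
+*/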
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEQR2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ dlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
+ , &c__1, &tau[i__]);
+ if (i__ < *n) {
+
+/* Apply H(i) to A(i:m,i+1:n) from the left */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.;
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ dlarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of DGEQR2 */
+
+} /* dgeqr2_ */
+
+/* Subroutine */ int dgeqrf_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int dgeqr2_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *), dlarfb_(char *,
+ char *, char *, char *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGEQRF computes a QR factorization of a real M-by-N matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of min(m,n) elementary reflectors (see Further
+ Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
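+
+/*
+   Illustrative calling sketch (not part of LAPACK; m, n, a, lda, tau
+   and work are assumed caller-side declarations).  The usual pattern
+   is a workspace query followed by the factorization; Q can afterwards
+   be formed explicitly with DORGQR or applied with DORMQR:
+
+       doublereal wkopt;
+       integer lwork = -1, info;
+       dgeqrf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+       dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+
+   R is then stored in the upper triangle of A and the reflectors
+   defining Q below the diagonal, as described above.
+*/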
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "DGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *n * nb;
+ work[1] = (doublereal) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGEQRF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQRF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQRF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the QR factorization of the current block
+ A(i:m,i:i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ dgeqr2_(&i__3, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ dlarft_("Forward", "Columnwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i:m,i+ib:n) from the left */
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ - ib + 1;
+ dlarfb_("Left", "Transpose", "Forward", "Columnwise", &i__3, &
+ i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &work[ib
+ + 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ dgeqr2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1] = (doublereal) iws;
+ return 0;
+
+/* End of DGEQRF */
+
+} /* dgeqrf_ */
+
+/* Subroutine */ int dgesdd_(char *jobz, integer *m, integer *n, doublereal *
+ a, integer *lda, doublereal *s, doublereal *u, integer *ldu,
+ doublereal *vt, integer *ldvt, doublereal *work, integer *lwork,
+ integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ie, il, ir, iu, blk;
+ static doublereal dum[1], eps;
+ static integer ivt, iscl;
+ static doublereal anrm;
+ static integer idum[1], ierr, itau;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ extern logical lsame_(char *, char *);
+ static integer chunk, minmn, wrkbl, itaup, itauq, mnthr;
+ static logical wntqa;
+ static integer nwork;
+ static logical wntqn, wntqo, wntqs;
+ extern /* Subroutine */ int dbdsdc_(char *, char *, integer *, doublereal
+ *, doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *, integer *), dgebrd_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *);
+ extern doublereal dlamch_(char *), dlange_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *);
+ static integer bdspac;
+ extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *, integer *),
+ dlascl_(char *, integer *, integer *, doublereal *, doublereal *,
+ integer *, integer *, doublereal *, integer *, integer *),
+ dgeqrf_(integer *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *, integer *), dlacpy_(char *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ integer *), dlaset_(char *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *),
+ xerbla_(char *, integer *), dorgbr_(char *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static doublereal bignum;
+ extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, integer *), dorglq_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *), dorgqr_(integer *, integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *, integer *);
+ static integer ldwrkl, ldwrkr, minwrk, ldwrku, maxwrk, ldwkvt;
+ static doublereal smlnum;
+ static logical wntqas, lquery;
+
+
+/*
+ -- LAPACK driver routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ March 2009
+
+
+ Purpose
+ =======
+
+ DGESDD computes the singular value decomposition (SVD) of a real
+ M-by-N matrix A, optionally computing the left and right singular
+ vectors. If singular vectors are desired, it uses a
+ divide-and-conquer algorithm.
+
+ The SVD is written
+
+ A = U * SIGMA * transpose(V)
+
+ where SIGMA is an M-by-N matrix which is zero except for its
+ min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+ V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA
+ are the singular values of A; they are real and non-negative, and
+ are returned in descending order. The first min(m,n) columns of
+ U and V are the left and right singular vectors of A.
+
+ Note that the routine returns VT = V**T, not V.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ Specifies options for computing all or part of the matrix U:
+ = 'A': all M columns of U and all N rows of V**T are
+ returned in the arrays U and VT;
+ = 'S': the first min(M,N) columns of U and the first
+ min(M,N) rows of V**T are returned in the arrays U
+ and VT;
+ = 'O': If M >= N, the first N columns of U are overwritten
+ on the array A and all rows of V**T are returned in
+ the array VT;
+ otherwise, all columns of U are returned in the
+ array U and the first M rows of V**T are overwritten
+ in the array A;
+ = 'N': no columns of U or rows of V**T are computed.
+
+ M (input) INTEGER
+ The number of rows of the input matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the input matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit,
+ if JOBZ = 'O', A is overwritten with the first N columns
+ of U (the left singular vectors, stored
+ columnwise) if M >= N;
+ A is overwritten with the first M rows
+ of V**T (the right singular vectors, stored
+ rowwise) otherwise.
+ if JOBZ .ne. 'O', the contents of A are destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ S (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The singular values of A, sorted so that S(i) >= S(i+1).
+
+ U (output) DOUBLE PRECISION array, dimension (LDU,UCOL)
+ UCOL = M if JOBZ = 'A' or JOBZ = 'O' and M < N;
+ UCOL = min(M,N) if JOBZ = 'S'.
+ If JOBZ = 'A' or JOBZ = 'O' and M < N, U contains the M-by-M
+ orthogonal matrix U;
+ if JOBZ = 'S', U contains the first min(M,N) columns of U
+ (the left singular vectors, stored columnwise);
+ if JOBZ = 'O' and M >= N, or JOBZ = 'N', U is not referenced.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= 1; if
+ JOBZ = 'S' or 'A' or JOBZ = 'O' and M < N, LDU >= M.
+
+ VT (output) DOUBLE PRECISION array, dimension (LDVT,N)
+ If JOBZ = 'A' or JOBZ = 'O' and M >= N, VT contains the
+ N-by-N orthogonal matrix V**T;
+ if JOBZ = 'S', VT contains the first min(M,N) rows of
+ V**T (the right singular vectors, stored rowwise);
+ if JOBZ = 'O' and M < N, or JOBZ = 'N', VT is not referenced.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= 1; if
+ JOBZ = 'A' or JOBZ = 'O' and M >= N, LDVT >= N;
+ if JOBZ = 'S', LDVT >= min(M,N).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ If JOBZ = 'N',
+ LWORK >= 3*min(M,N) + max(max(M,N),7*min(M,N)).
+ If JOBZ = 'O',
+ LWORK >= 3*min(M,N) +
+ max(max(M,N),5*min(M,N)*min(M,N)+4*min(M,N)).
+ If JOBZ = 'S' or 'A'
+ LWORK >= 3*min(M,N) +
+ max(max(M,N),4*min(M,N)*min(M,N)+4*min(M,N)).
+ For good performance, LWORK should generally be larger.
+ If LWORK = -1 but other input arguments are legal, WORK(1)
+ returns the optimal LWORK.
+
+ IWORK (workspace) INTEGER array, dimension (8*min(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: DBDSDC did not converge, updating process failed.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
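+
+/*
+   Illustrative calling sketch (not part of LAPACK; m, n, a, lda, s, u,
+   ldu, vt, ldvt, work and an IWORK array of length 8*min(m,n) are
+   assumed caller-side declarations).  For the full SVD (JOBZ = 'A'):
+
+       doublereal wkopt;
+       integer lwork = -1, info;
+       dgesdd_("A", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+               &wkopt, &lwork, iwork, &info);
+       lwork = (integer) wkopt;
+       work = (doublereal *) malloc(lwork * sizeof(doublereal));
+       dgesdd_("A", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+               work, &lwork, iwork, &info);
+
+   A positive info value means DBDSDC failed to converge, as noted in
+   the INFO description above.
+*/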
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --s;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ wntqa = lsame_(jobz, "A");
+ wntqs = lsame_(jobz, "S");
+ wntqas = wntqa || wntqs;
+ wntqo = lsame_(jobz, "O");
+ wntqn = lsame_(jobz, "N");
+ lquery = *lwork == -1;
+
+ if (! (wntqa || wntqs || wntqo || wntqn)) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldu < 1 || wntqas && *ldu < *m || wntqo && *m < *n && *ldu < *
+ m) {
+ *info = -8;
+ } else if (*ldvt < 1 || wntqa && *ldvt < *n || wntqs && *ldvt < minmn ||
+ wntqo && *m >= *n && *ldvt < *n) {
+ *info = -10;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ if (*m >= *n && minmn > 0) {
+
+/* Compute space needed for DBDSDC */
+
+ mnthr = (integer) (minmn * 11. / 6.);
+ if (wntqn) {
+ bdspac = *n * 7;
+ } else {
+ bdspac = *n * 3 * *n + (*n << 2);
+ }
+ if (*m >= mnthr) {
+ if (wntqn) {
+
+/* Path 1 (M much larger than N, JOBZ='N') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "DGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n;
+ maxwrk = max(i__1,i__2);
+ minwrk = bdspac + *n;
+ } else if (wntqo) {
+
+/* Path 2 (M much larger than N, JOBZ='O') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "DGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "DORGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "QLN", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + (*n << 1) * *n;
+ minwrk = bdspac + (*n << 1) * *n + *n * 3;
+ } else if (wntqs) {
+
+/* Path 3 (M much larger than N, JOBZ='S') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "DGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "DORGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "QLN", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *n * *n;
+ minwrk = bdspac + *n * *n + *n * 3;
+ } else if (wntqa) {
+
+/* Path 4 (M much larger than N, JOBZ='A') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "DGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *m * ilaenv_(&c__1, "DORGQR",
+ " ", m, m, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "QLN", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *n * *n;
+ minwrk = bdspac + *n * *n + *n * 3;
+ }
+ } else {
+
+/* Path 5 (M at least N, but not much larger) */
+
+ wrkbl = *n * 3 + (*m + *n) * ilaenv_(&c__1, "DGEBRD", " ", m,
+ n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ if (wntqn) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3 + max(*m,bdspac);
+ } else if (wntqo) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *n;
+/* Computing MAX */
+ i__1 = *m, i__2 = *n * *n + bdspac;
+ minwrk = *n * 3 + max(i__1,i__2);
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3 + max(*m,bdspac);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = bdspac + *n * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3 + max(*m,bdspac);
+ }
+ }
+ } else if (minmn > 0) {
+
+/* Compute space needed for DBDSDC */
+
+ mnthr = (integer) (minmn * 11. / 6.);
+ if (wntqn) {
+ bdspac = *m * 7;
+ } else {
+ bdspac = *m * 3 * *m + (*m << 2);
+ }
+ if (*n >= mnthr) {
+ if (wntqn) {
+
+/* Path 1t (N much larger than M, JOBZ='N') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "DGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m;
+ maxwrk = max(i__1,i__2);
+ minwrk = bdspac + *m;
+ } else if (wntqo) {
+
+/* Path 2t (N much larger than M, JOBZ='O') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "DGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "DORGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "PRT", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + (*m << 1) * *m;
+ minwrk = bdspac + (*m << 1) * *m + *m * 3;
+ } else if (wntqs) {
+
+/* Path 3t (N much larger than M, JOBZ='S') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "DGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "DORGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "PRT", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *m;
+ minwrk = bdspac + *m * *m + *m * 3;
+ } else if (wntqa) {
+
+/* Path 4t (N much larger than M, JOBZ='A') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "DGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *n * ilaenv_(&c__1, "DORGLQ",
+ " ", n, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "DGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "PRT", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *m;
+ minwrk = bdspac + *m * *m + *m * 3;
+ }
+ } else {
+
+/* Path 5t (N greater than M, but not much larger) */
+
+ wrkbl = *m * 3 + (*m + *n) * ilaenv_(&c__1, "DGEBRD", " ", m,
+ n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ if (wntqn) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3 + max(*n,bdspac);
+ } else if (wntqo) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "PRT", m, n, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *n;
+/* Computing MAX */
+ i__1 = *n, i__2 = *m * *m + bdspac;
+ minwrk = *m * 3 + max(i__1,i__2);
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "PRT", m, n, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3 + max(*n,bdspac);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "DORMBR"
+ , "PRT", n, n, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3 + max(*n,bdspac);
+ }
+ }
+ }
+ maxwrk = max(maxwrk,minwrk);
+ work[1] = (doublereal) maxwrk;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGESDD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = PRECISION;
+ smlnum = sqrt(SAFEMINIMUM) / eps;
+ bignum = 1. / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = dlange_("M", m, n, &a[a_offset], lda, dum);
+ iscl = 0;
+ if (anrm > 0. && anrm < smlnum) {
+ iscl = 1;
+ dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, &
+ ierr);
+ } else if (anrm > bignum) {
+ iscl = 1;
+ dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+ if (*m >= *n) {
+
+/*
+ A has at least as many rows as columns. If A has sufficiently
+ more rows than columns, first reduce using the QR
+ decomposition (if sufficient workspace available)
+*/
+
+ if (*m >= mnthr) {
+
+ if (wntqn) {
+
+/*
+ Path 1 (M much larger than N, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (Workspace: need 2*N, prefer N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Zero out below R */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ dlaset_("L", &i__1, &i__2, &c_b29, &c_b29, &a[a_dim1 + 2],
+ lda);
+ ie = 1;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (Workspace: need 4*N, prefer 3*N+2*N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(n, n, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+ nwork = ie + *n;
+
+/*
+ Perform bidiagonal SVD, computing singular values only
+ (Workspace: need N+BDSPAC)
+*/
+
+ dbdsdc_("U", "N", n, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2 (M much larger than N, JOBZ = 'O')
+ N left singular vectors to be overwritten on A and
+ N right singular vectors to be computed in VT
+*/
+
+ ir = 1;
+
+/* WORK(IR) is LDWRKR by N */
+
+ if (*lwork >= *lda * *n + *n * *n + *n * 3 + bdspac) {
+ ldwrkr = *lda;
+ } else {
+ ldwrkr = (*lwork - *n * *n - *n * 3 - bdspac) / *n;
+ }
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy R to WORK(IR), zeroing out below it */
+
+ dlacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ dlaset_("L", &i__1, &i__2, &c_b29, &c_b29, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dorgqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = itau;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in VT, copying result to WORK(IR)
+ (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/* WORK(IU) is N by N */
+
+ iu = nwork;
+ nwork = iu + *n * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in WORK(IU) and computing right
+ singular vectors of bidiagonal matrix in VT
+ (Workspace: need N+N*N+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &work[ie], &work[iu], n, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite WORK(IU) by left singular vectors of R
+ and VT by right singular vectors of R
+ (Workspace: need 2*N*N+3*N, prefer 2*N*N+2*N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &work[iu], n, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IU), storing result in WORK(IR) and copying to A
+ (Workspace: need 2*N*N, prefer N*N+M*N)
+*/
+
+ i__1 = *m;
+ i__2 = ldwrkr;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrkr);
+ dgemm_("N", "N", &chunk, n, n, &c_b15, &a[i__ + a_dim1],
+ lda, &work[iu], n, &c_b29, &work[ir], &ldwrkr);
+ dlacpy_("F", &chunk, n, &work[ir], &ldwrkr, &a[i__ +
+ a_dim1], lda);
+/* L10: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3 (M much larger than N, JOBZ='S')
+ N left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ ir = 1;
+
+/* WORK(IR) is N by N */
+
+ ldwrkr = *n;
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy R to WORK(IR), zeroing out below it */
+
+ dlacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ dlaset_("L", &i__2, &i__1, &c_b29, &c_b29, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dorgqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = itau;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in WORK(IR)
+ (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+             of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need N+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of R and VT
+ by right singular vectors of R
+ (Workspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IR), storing result in U
+ (Workspace: need N*N)
+*/
+
+ dlacpy_("F", n, n, &u[u_offset], ldu, &work[ir], &ldwrkr);
+ dgemm_("N", "N", m, n, n, &c_b15, &a[a_offset], lda, &work[ir]
+ , &ldwrkr, &c_b29, &u[u_offset], ldu);
+
+ } else if (wntqa) {
+
+/*
+ Path 4 (M much larger than N, JOBZ='A')
+ M left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ iu = 1;
+
+/* WORK(IU) is N by N */
+
+ ldwrku = *n;
+ itau = iu + ldwrku * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R, copying result to U
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+ dlacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Generate Q in U
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+ i__2 = *lwork - nwork + 1;
+ dorgqr_(m, m, n, &u[u_offset], ldu, &work[itau], &work[nwork],
+ &i__2, &ierr);
+
+/* Produce R in A, zeroing out other entries */
+
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ dlaset_("L", &i__2, &i__1, &c_b29, &c_b29, &a[a_dim1 + 2],
+ lda);
+ ie = itau;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgebrd_(n, n, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in WORK(IU) and computing right
+ singular vectors of bidiagonal matrix in VT
+ (Workspace: need N+N*N+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &work[ie], &work[iu], n, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite WORK(IU) by left singular vectors of R and VT
+ by right singular vectors of R
+ (Workspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", n, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__2, &
+ ierr);
+ i__2 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in U by left singular vectors of R in
+ WORK(IU), storing result in A
+ (Workspace: need N*N)
+*/
+
+ dgemm_("N", "N", m, n, n, &c_b15, &u[u_offset], ldu, &work[iu]
+ , &ldwrku, &c_b29, &a[a_offset], lda);
+
+/* Copy left singular vectors of A from A to U */
+
+ dlacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+ }
+
+ } else {
+
+/*
+ M .LT. MNTHR
+
+ Path 5 (M at least N, but not much larger)
+ Reduce to bidiagonal form without QR decomposition
+*/
+
+ ie = 1;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize A
+ (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Perform bidiagonal SVD, only computing singular values
+ (Workspace: need N+BDSPAC)
+*/
+
+ dbdsdc_("U", "N", n, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+ } else if (wntqo) {
+ iu = nwork;
+ if (*lwork >= *m * *n + *n * 3 + bdspac) {
+
+/* WORK( IU ) is M by N */
+
+ ldwrku = *m;
+ nwork = iu + ldwrku * *n;
+ dlaset_("F", m, n, &c_b29, &c_b29, &work[iu], &ldwrku);
+ } else {
+
+/* WORK( IU ) is N by N */
+
+ ldwrku = *n;
+ nwork = iu + ldwrku * *n;
+
+/* WORK(IR) is LDWRKR by N */
+
+ ir = nwork;
+ ldwrkr = (*lwork - *n * *n - *n * 3) / *n;
+ }
+ nwork = iu + ldwrku * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in WORK(IU) and computing right
+ singular vectors of bidiagonal matrix in VT
+ (Workspace: need N+N*N+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &work[ie], &work[iu], &ldwrku, &
+ vt[vt_offset], ldvt, dum, idum, &work[nwork], &iwork[
+ 1], info);
+
+/*
+ Overwrite VT by right singular vectors of A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+ if (*lwork >= *m * *n + *n * 3 + bdspac) {
+
+/*
+ Overwrite WORK(IU) by left singular vectors of A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__2, &
+ ierr);
+
+/* Copy left singular vectors of A from WORK(IU) to A */
+
+ dlacpy_("F", m, n, &work[iu], &ldwrku, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate Q in A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dorgbr_("Q", m, n, n, &a[a_offset], lda, &work[itauq], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Multiply Q in A by left singular vectors of
+ bidiagonal matrix in WORK(IU), storing result in
+ WORK(IR) and copying to A
+ (Workspace: need 2*N*N, prefer N*N+M*N)
+*/
+
+ i__2 = *m;
+ i__1 = ldwrkr;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrkr);
+ dgemm_("N", "N", &chunk, n, n, &c_b15, &a[i__ +
+ a_dim1], lda, &work[iu], &ldwrku, &c_b29, &
+ work[ir], &ldwrkr);
+ dlacpy_("F", &chunk, n, &work[ir], &ldwrkr, &a[i__ +
+ a_dim1], lda);
+/* L20: */
+ }
+ }
+
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need N+BDSPAC)
+*/
+
+ dlaset_("F", m, n, &c_b29, &c_b29, &u[u_offset], ldu);
+ dbdsdc_("U", "I", n, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need 3*N, prefer 2*N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ } else if (wntqa) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need N+BDSPAC)
+*/
+
+ dlaset_("F", m, m, &c_b29, &c_b29, &u[u_offset], ldu);
+ dbdsdc_("U", "I", n, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/* Set the right corner of U to identity matrix */
+
+ if (*m > *n) {
+ i__1 = *m - *n;
+ i__2 = *m - *n;
+ dlaset_("F", &i__1, &i__2, &c_b29, &c_b15, &u[*n + 1 + (*
+ n + 1) * u_dim1], ldu);
+ }
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need N*N+2*N+M, prefer N*N+2*N+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ }
+
+ }
+
+ } else {
+
+/*
+ A has more columns than rows. If A has sufficiently more
+ columns than rows, first reduce using the LQ decomposition (if
+ sufficient workspace available)
+*/
+
+ if (*n >= mnthr) {
+
+ if (wntqn) {
+
+/*
+ Path 1t (N much larger than M, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (Workspace: need 2*M, prefer M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Zero out above L */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ dlaset_("U", &i__1, &i__2, &c_b29, &c_b29, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = 1;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (Workspace: need 4*M, prefer 3*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(m, m, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+ nwork = ie + *m;
+
+/*
+ Perform bidiagonal SVD, computing singular values only
+ (Workspace: need M+BDSPAC)
+*/
+
+ dbdsdc_("U", "N", m, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2t (N much larger than M, JOBZ='O')
+ M right singular vectors to be overwritten on A and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+
+/* IVT is M by M */
+
+ il = ivt + *m * *m;
+ if (*lwork >= *m * *n + *m * *m + *m * 3 + bdspac) {
+
+/* WORK(IL) is M by N */
+
+ ldwrkl = *m;
+ chunk = *n;
+ } else {
+ ldwrkl = *m;
+ chunk = (*lwork - *m * *m) / *m;
+ }
+ itau = il + ldwrkl * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ dlacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ dlaset_("U", &i__1, &i__2, &c_b29, &c_b29, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dorglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = itau;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL)
+ (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dgebrd_(m, m, &work[il], &ldwrkl, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U, and computing right singular
+ vectors of bidiagonal matrix in WORK(IVT)
+ (Workspace: need M+M*M+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &
+ work[ivt], m, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of L and WORK(IVT)
+ by right singular vectors of L
+ (Workspace: need 2*M*M+3*M, prefer 2*M*M+2*M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &work[ivt], m, &work[nwork], &i__1, &ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IVT) by Q
+ in A, storing result in WORK(IL) and copying to A
+ (Workspace: need 2*M*M, prefer M*M+M*N)
+*/
+
+ i__1 = *n;
+ i__2 = chunk;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ dgemm_("N", "N", m, &blk, m, &c_b15, &work[ivt], m, &a[
+ i__ * a_dim1 + 1], lda, &c_b29, &work[il], &
+ ldwrkl);
+ dlacpy_("F", m, &blk, &work[il], &ldwrkl, &a[i__ * a_dim1
+ + 1], lda);
+/* L30: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3t (N much larger than M, JOBZ='S')
+ M right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ il = 1;
+
+/* WORK(IL) is M by M */
+
+ ldwrkl = *m;
+ itau = il + ldwrkl * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ dlacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ dlaset_("U", &i__2, &i__1, &c_b29, &c_b29, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dorglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = itau;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+          Bidiagonalize L in WORK(IL)
+ (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgebrd_(m, m, &work[il], &ldwrkl, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need M+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of L and VT
+ by right singular vectors of L
+ (Workspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+ i__2 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IL) by
+ Q in A, storing result in VT
+ (Workspace: need M*M)
+*/
+
+ dlacpy_("F", m, m, &vt[vt_offset], ldvt, &work[il], &ldwrkl);
+ dgemm_("N", "N", m, n, m, &c_b15, &work[il], &ldwrkl, &a[
+ a_offset], lda, &c_b29, &vt[vt_offset], ldvt);
+
+ } else if (wntqa) {
+
+/*
+ Path 4t (N much larger than M, JOBZ='A')
+ N right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+
+/* WORK(IVT) is M by M */
+
+ ldwkvt = *m;
+ itau = ivt + ldwkvt * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q, copying result to VT
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+ dlacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Generate Q in VT
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dorglq_(n, n, m, &vt[vt_offset], ldvt, &work[itau], &work[
+ nwork], &i__2, &ierr);
+
+/* Produce L in A, zeroing out other entries */
+
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ dlaset_("U", &i__2, &i__1, &c_b29, &c_b29, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = itau;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgebrd_(m, m, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in WORK(IVT)
+ (Workspace: need M+M*M+BDSPAC)
+*/
+
+ dbdsdc_("U", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &
+ work[ivt], &ldwkvt, dum, idum, &work[nwork], &iwork[1]
+ , info);
+
+/*
+ Overwrite U by left singular vectors of L and WORK(IVT)
+ by right singular vectors of L
+ (Workspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, m, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+ i__2 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", m, m, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IVT) by
+ Q in VT, storing result in A
+ (Workspace: need M*M)
+*/
+
+ dgemm_("N", "N", m, n, m, &c_b15, &work[ivt], &ldwkvt, &vt[
+ vt_offset], ldvt, &c_b29, &a[a_offset], lda);
+
+/* Copy right singular vectors of A from A to VT */
+
+ dlacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+ }
+
+ } else {
+
+/*
+ N .LT. MNTHR
+
+ Path 5t (N greater than M, but not much larger)
+ Reduce to bidiagonal form without LQ decomposition
+*/
+
+ ie = 1;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A
+ (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Perform bidiagonal SVD, only computing singular values
+ (Workspace: need M+BDSPAC)
+*/
+
+ dbdsdc_("L", "N", m, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+ } else if (wntqo) {
+ ldwkvt = *m;
+ ivt = nwork;
+ if (*lwork >= *m * *n + *m * 3 + bdspac) {
+
+/* WORK( IVT ) is M by N */
+
+ dlaset_("F", m, n, &c_b29, &c_b29, &work[ivt], &ldwkvt);
+ nwork = ivt + ldwkvt * *n;
+ } else {
+
+/* WORK( IVT ) is M by M */
+
+ nwork = ivt + ldwkvt * *m;
+ il = nwork;
+
+/* WORK(IL) is M by CHUNK */
+
+ chunk = (*lwork - *m * *m - *m * 3) / *m;
+ }
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in WORK(IVT)
+ (Workspace: need M*M+BDSPAC)
+*/
+
+ dbdsdc_("L", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &
+ work[ivt], &ldwkvt, dum, idum, &work[nwork], &iwork[1]
+ , info);
+
+/*
+ Overwrite U by left singular vectors of A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+ if (*lwork >= *m * *n + *m * 3 + bdspac) {
+
+/*
+                Overwrite WORK(IVT) by right singular vectors of A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", m, n, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2,
+ &ierr);
+
+/* Copy right singular vectors of A from WORK(IVT) to A */
+
+ dlacpy_("F", m, n, &work[ivt], &ldwkvt, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate P**T in A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ dorgbr_("P", m, n, m, &a[a_offset], lda, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Multiply Q in A by right singular vectors of
+ bidiagonal matrix in WORK(IVT), storing result in
+ WORK(IL) and copying to A
+ (Workspace: need 2*M*M, prefer M*M+M*N)
+*/
+
+ i__2 = *n;
+ i__1 = chunk;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ dgemm_("N", "N", m, &blk, m, &c_b15, &work[ivt], &
+ ldwkvt, &a[i__ * a_dim1 + 1], lda, &c_b29, &
+ work[il], m);
+ dlacpy_("F", m, &blk, &work[il], m, &a[i__ * a_dim1 +
+ 1], lda);
+/* L40: */
+ }
+ }
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need M+BDSPAC)
+*/
+
+ dlaset_("F", m, n, &c_b29, &c_b29, &vt[vt_offset], ldvt);
+ dbdsdc_("L", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need 3*M, prefer 2*M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", m, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ } else if (wntqa) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need M+BDSPAC)
+*/
+
+ dlaset_("F", n, n, &c_b29, &c_b29, &vt[vt_offset], ldvt);
+ dbdsdc_("L", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/* Set the right corner of VT to identity matrix */
+
+ if (*n > *m) {
+ i__1 = *n - *m;
+ i__2 = *n - *m;
+ dlaset_("F", &i__1, &i__2, &c_b29, &c_b15, &vt[*m + 1 + (*
+ m + 1) * vt_dim1], ldvt);
+ }
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need 2*M+N, prefer 2*M+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ dormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ dormbr_("P", "R", "T", n, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ }
+
+ }
+
+ }
+
+/* Undo scaling if necessary */
+
+ if (iscl == 1) {
+ if (anrm > bignum) {
+ dlascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ if (anrm < smlnum) {
+ dlascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ }
+
+/* Return optimal workspace in WORK(1) */
+
+ work[1] = (doublereal) maxwrk;
+
+ return 0;
+
+/* End of DGESDD */
+
+} /* dgesdd_ */
+
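+/*
+    Illustrative sketch (not part of the LAPACK translation, and never
+    called from this file): the usual way to drive dgesdd_ is to issue a
+    workspace query first (LWORK = -1), read the recommended size back
+    from WORK(1), and only then perform the factorization, as described
+    in the LWORK notes above.  The sketch assumes the dgesdd_ argument
+    list defined above and f2c's column-major convention; the 3x3 data
+    and the fixed-size buffers are hypothetical and chosen so that the
+    documented minimum workspace certainly fits.
+*/
+static int example_dgesdd_usage(void)
+{
+    /* 3x3 test matrix, stored column by column. */
+    doublereal a[9] = { 4., 2., 0.,   2., 3., 1.,   0., 1., 5. };
+    doublereal s[3], u[9], vt[9];
+    doublereal work[256];           /* comfortably above the minimum    */
+    integer iwork[24];              /* 8*min(M,N), as documented above  */
+    integer m = 3, n = 3, lda = 3, ldu = 3, ldvt = 3;
+    integer lwork, info;
+
+    /* Workspace query: LWORK = -1 returns the optimal size in WORK(1). */
+    lwork = -1;
+    dgesdd_("A", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork,
+            iwork, &info);
+    if (info != 0) {
+        return (int) info;
+    }
+    lwork = (integer) work[0];
+    if (lwork > 256) {
+        lwork = 256;                /* stack buffer is already enough   */
+    }
+
+    /* Actual decomposition: A = U * diag(S) * VT; A is overwritten. */
+    dgesdd_("A", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork,
+            iwork, &info);
+    return (int) info;              /* 0 on success, >0 if DBDSDC fails */
+}
+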
+/* Subroutine */ int dgesv_(integer *n, integer *nrhs, doublereal *a, integer
+ *lda, integer *ipiv, doublereal *b, integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern /* Subroutine */ int dgetrf_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), xerbla_(char *, integer *), dgetrs_(char *, integer *, integer *, doublereal *,
+ integer *, integer *, doublereal *, integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGESV computes the solution to a real system of linear equations
+ A * X = B,
+ where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+
+ The LU decomposition with partial pivoting and row interchanges is
+ used to factor A as
+ A = P * L * U,
+ where P is a permutation matrix, L is unit lower triangular, and U is
+ upper triangular. The factored form of A is then used to solve the
+ system of equations A * X = B.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of linear equations, i.e., the order of the
+ matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the N-by-N coefficient matrix A.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (output) INTEGER array, dimension (N)
+ The pivot indices that define the permutation matrix P;
+ row i of the matrix was interchanged with row IPIV(i).
+
+ B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
+          On entry, the N-by-NRHS right hand side matrix B.
+ On exit, if INFO = 0, the N-by-NRHS solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, so the solution could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*nrhs < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGESV ", &i__1);
+ return 0;
+ }
+
+/* Compute the LU factorization of A. */
+
+ dgetrf_(n, n, &a[a_offset], lda, &ipiv[1], info);
+ if (*info == 0) {
+
+/* Solve the system A*X = B, overwriting B with X. */
+
+ dgetrs_("No transpose", n, nrhs, &a[a_offset], lda, &ipiv[1], &b[
+ b_offset], ldb, info);
+ }
+ return 0;
+
+/* End of DGESV */
+
+} /* dgesv_ */
+
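+/*
+    Illustrative sketch (not part of the LAPACK translation, and never
+    called from this file): solving a small linear system with dgesv_ as
+    documented above.  A is overwritten by its P*L*U factors and B by the
+    solution X, both in column-major storage; the 2x2 data is hypothetical.
+*/
+static int example_dgesv_usage(void)
+{
+    /* A = [ 3  1 ; 1  2 ], stored column by column. */
+    doublereal a[4] = { 3., 1.,   1., 2. };
+    /* One right hand side b = (9, 8); overwritten with x = (2, 3). */
+    doublereal b[2] = { 9., 8. };
+    integer ipiv[2];
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info;
+
+    dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
+
+    /* info == 0: b holds the solution; info > 0: U is exactly singular. */
+    return (int) info;
+}
+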
+/* Subroutine */ int dgetf2_(integer *m, integer *n, doublereal *a, integer *
+ lda, integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, jp;
+ extern /* Subroutine */ int dger_(integer *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *), dscal_(integer *, doublereal *, doublereal *, integer
+ *);
+ static doublereal sfmin;
+ extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGETF2 computes an LU factorization of a general m-by-n matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the m by n matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, U(k,k) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGETF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Compute machine safe minimum */
+
+ sfmin = SAFEMINIMUM;
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+
+/* Find pivot and test for singularity. */
+
+ i__2 = *m - j + 1;
+ jp = j - 1 + idamax_(&i__2, &a[j + j * a_dim1], &c__1);
+ ipiv[j] = jp;
+ if (a[jp + j * a_dim1] != 0.) {
+
+/* Apply the interchange to columns 1:N. */
+
+ if (jp != j) {
+ dswap_(n, &a[j + a_dim1], lda, &a[jp + a_dim1], lda);
+ }
+
+/* Compute elements J+1:M of J-th column. */
+
+ if (j < *m) {
+ if ((d__1 = a[j + j * a_dim1], abs(d__1)) >= sfmin) {
+ i__2 = *m - j;
+ d__1 = 1. / a[j + j * a_dim1];
+ dscal_(&i__2, &d__1, &a[j + 1 + j * a_dim1], &c__1);
+ } else {
+ i__2 = *m - j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[j + i__ + j * a_dim1] /= a[j + j * a_dim1];
+/* L20: */
+ }
+ }
+ }
+
+ } else if (*info == 0) {
+
+ *info = j;
+ }
+
+ if (j < min(*m,*n)) {
+
+/* Update trailing submatrix. */
+
+ i__2 = *m - j;
+ i__3 = *n - j;
+ dger_(&i__2, &i__3, &c_b151, &a[j + 1 + j * a_dim1], &c__1, &a[j
+ + (j + 1) * a_dim1], lda, &a[j + 1 + (j + 1) * a_dim1],
+ lda);
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of DGETF2 */
+
+} /* dgetf2_ */
+
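+/*
+    Illustrative sketch (not part of the LAPACK translation, and never
+    called from this file): dgetf2_ is the unblocked Level 2 BLAS kernel
+    that the blocked driver dgetrf_ below applies to each panel.  Calling
+    it directly, as sketched here on hypothetical 2x2 data, only makes
+    sense for matrices too small for blocking to pay off.
+*/
+static int example_dgetf2_usage(void)
+{
+    /* A = [ 1  2 ; 4  3 ], stored column by column. */
+    doublereal a[4] = { 1., 4.,   2., 3. };
+    integer ipiv[2];
+    integer m = 2, n = 2, lda = 2, info;
+
+    dgetf2_(&m, &n, a, &lda, ipiv, &info);
+
+    /* On success a holds the L (unit diagonal implied) and U factors and
+       ipiv the row interchanges; here ipiv[0] = 2 because |4| > |1|. */
+    return (int) info;
+}
+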
+/* Subroutine */ int dgetrf_(integer *m, integer *n, doublereal *a, integer *
+ lda, integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+
+ /* Local variables */
+ static integer i__, j, jb, nb;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer iinfo;
+ extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *), dgetf2_(
+ integer *, integer *, doublereal *, integer *, integer *, integer
+ *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dlaswp_(integer *, doublereal *, integer *,
+ integer *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGETRF computes an LU factorization of a general M-by-N matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the M-by-N matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGETRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "DGETRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ if (nb <= 1 || nb >= min(*m,*n)) {
+
+/* Use unblocked code. */
+
+ dgetf2_(m, n, &a[a_offset], lda, &ipiv[1], info);
+ } else {
+
+/* Use blocked code. */
+
+ i__1 = min(*m,*n);
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+/* Computing MIN */
+ i__3 = min(*m,*n) - j + 1;
+ jb = min(i__3,nb);
+
+/*
+ Factor diagonal and subdiagonal blocks and test for exact
+ singularity.
+*/
+
+ i__3 = *m - j + 1;
+ dgetf2_(&i__3, &jb, &a[j + j * a_dim1], lda, &ipiv[j], &iinfo);
+
+/* Adjust INFO and the pivot indices. */
+
+ if (*info == 0 && iinfo > 0) {
+ *info = iinfo + j - 1;
+ }
+/* Computing MIN */
+ i__4 = *m, i__5 = j + jb - 1;
+ i__3 = min(i__4,i__5);
+ for (i__ = j; i__ <= i__3; ++i__) {
+ ipiv[i__] = j - 1 + ipiv[i__];
+/* L10: */
+ }
+
+/* Apply interchanges to columns 1:J-1. */
+
+ i__3 = j - 1;
+ i__4 = j + jb - 1;
+ dlaswp_(&i__3, &a[a_offset], lda, &j, &i__4, &ipiv[1], &c__1);
+
+ if (j + jb <= *n) {
+
+/* Apply interchanges to columns J+JB:N. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j + jb - 1;
+ dlaswp_(&i__3, &a[(j + jb) * a_dim1 + 1], lda, &j, &i__4, &
+ ipiv[1], &c__1);
+
+/* Compute block row of U. */
+
+ i__3 = *n - j - jb + 1;
+ dtrsm_("Left", "Lower", "No transpose", "Unit", &jb, &i__3, &
+ c_b15, &a[j + j * a_dim1], lda, &a[j + (j + jb) *
+ a_dim1], lda);
+ if (j + jb <= *m) {
+
+/* Update trailing submatrix. */
+
+ i__3 = *m - j - jb + 1;
+ i__4 = *n - j - jb + 1;
+ dgemm_("No transpose", "No transpose", &i__3, &i__4, &jb,
+ &c_b151, &a[j + jb + j * a_dim1], lda, &a[j + (j
+ + jb) * a_dim1], lda, &c_b15, &a[j + jb + (j + jb)
+ * a_dim1], lda);
+ }
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of DGETRF */
+
+} /* dgetrf_ */
+
+/* Subroutine */ int dgetrs_(char *trans, integer *n, integer *nrhs,
+ doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer *
+ ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *), xerbla_(
+ char *, integer *), dlaswp_(integer *, doublereal *,
+ integer *, integer *, integer *, integer *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DGETRS solves a system of linear equations
+ A * X = B or A' * X = B
+ with a general N-by-N matrix A using the LU factorization computed
+ by DGETRF.
+
+ Arguments
+ =========
+
+ TRANS (input) CHARACTER*1
+ Specifies the form of the system of equations:
+ = 'N': A * X = B (No transpose)
+ = 'T': A'* X = B (Transpose)
+ = 'C': A'* X = B (Conjugate transpose = Transpose)
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The factors L and U from the factorization A = P*L*U
+ as computed by DGETRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (input) INTEGER array, dimension (N)
+ The pivot indices from DGETRF; for 1<=i<=N, row i of the
+ matrix was interchanged with row IPIV(i).
+
+ B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ notran = lsame_(trans, "N");
+ if (! notran && ! lsame_(trans, "T") && ! lsame_(
+ trans, "C")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DGETRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (notran) {
+
+/*
+ Solve A * X = B.
+
+ Apply row interchanges to the right hand sides.
+*/
+
+ dlaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c__1);
+
+/* Solve L*X = B, overwriting B with X. */
+
+ dtrsm_("Left", "Lower", "No transpose", "Unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ dtrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b15, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A' * X = B.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ dtrsm_("Left", "Upper", "Transpose", "Non-unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ dtrsm_("Left", "Lower", "Transpose", "Unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Apply row interchanges to the solution vectors. */
+
+ dlaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1);
+ }
+
+ return 0;
+
+/* End of DGETRS */
+
+} /* dgetrs_ */
+
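+/*
+    Illustrative sketch (not part of the LAPACK translation, and never
+    called from this file): the factor-once / solve-many pattern that
+    dgetrf_ and dgetrs_ are designed for.  The matrix is factored a single
+    time and the P*L*U factors are then reused for several right hand
+    sides, including a transposed solve; the 2x2 data is hypothetical.
+*/
+static int example_dgetrf_dgetrs_usage(void)
+{
+    /* A = [ 2  1 ; 0  3 ], stored column by column. */
+    doublereal a[4] = { 2., 0.,   1., 3. };
+    doublereal b1[2] = { 5., 9. };      /* A  * x = b1  ->  x = (1, 3) */
+    doublereal b2[2] = { 2., 7. };      /* A' * x = b2  ->  x = (1, 2) */
+    integer ipiv[2];
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info;
+
+    /* Factor A = P*L*U once. */
+    dgetrf_(&n, &n, a, &lda, ipiv, &info);
+    if (info != 0) {
+        return (int) info;              /* info > 0: exactly singular  */
+    }
+
+    /* Reuse the factors for as many right hand sides as needed. */
+    dgetrs_("N", &n, &nrhs, a, &lda, ipiv, b1, &ldb, &info);
+    dgetrs_("T", &n, &nrhs, a, &lda, ipiv, b2, &ldb, &info);
+    return (int) info;
+}
+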
+/* Subroutine */ int dhseqr_(char *job, char *compz, integer *n, integer *ilo,
+ integer *ihi, doublereal *h__, integer *ldh, doublereal *wr,
+ doublereal *wi, doublereal *z__, integer *ldz, doublereal *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2[2], i__3;
+ doublereal d__1;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublereal hl[2401] /* was [49][49] */;
+ static integer kbot, nmin;
+ extern logical lsame_(char *, char *);
+ static logical initz;
+ static doublereal workl[49];
+ static logical wantt, wantz;
+ extern /* Subroutine */ int dlaqr0_(logical *, logical *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ doublereal *, integer *, integer *), dlahqr_(logical *, logical *,
+ integer *, integer *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlacpy_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *),
+ dlaset_(char *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical lquery;
+
+
+/*
+ -- LAPACK computational routine (version 3.2.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ June 2010
+
+ Purpose
+ =======
+
+ DHSEQR computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+ Schur form), and Z is the orthogonal matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input orthogonal
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ = 'E': compute eigenvalues only;
+ = 'S': compute eigenvalues and the Schur form T.
+
+ COMPZ (input) CHARACTER*1
+ = 'N': no Schur vectors are computed;
+ = 'I': Z is initialized to the unit matrix and the matrix Z
+ of Schur vectors of H is returned;
+ = 'V': Z must contain an orthogonal matrix Q on entry, and
+ the product Q*Z is returned.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to DGEBAL, and then passed to DGEHRD
+ when the matrix output by DGEBAL is reduced to Hessenberg
+ form. Otherwise ILO and IHI should be set to 1 and N
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) DOUBLE PRECISION array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and JOB = 'S', then H contains the
+ upper quasi-triangular matrix T from the Schur decomposition
+ (the Schur form); 2-by-2 diagonal blocks (corresponding to
+ complex conjugate pairs of eigenvalues) are returned in
+ standard form, with H(i,i) = H(i+1,i+1) and
+ H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the
+ contents of H are unspecified on exit. (The output value of
+ H when INFO.GT.0 is given under the description of INFO
+ below.)
+
+ Unlike earlier versions of DHSEQR, this subroutine may
+          explicitly set H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+ or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ WR (output) DOUBLE PRECISION array, dimension (N)
+ WI (output) DOUBLE PRECISION array, dimension (N)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues. If two eigenvalues are computed as a complex
+ conjugate pair, they are stored in consecutive elements of
+ WR and WI, say the i-th and (i+1)th, with WI(i) .GT. 0 and
+ WI(i+1) .LT. 0. If JOB = 'S', the eigenvalues are stored in
+ the same order as on the diagonal of the Schur form returned
+ in H, with WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2
+ diagonal block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
+ WI(i+1) = -WI(i).
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,N)
+ If COMPZ = 'N', Z is not referenced.
+ If COMPZ = 'I', on entry Z need not be set and on exit,
+ if INFO = 0, Z contains the orthogonal matrix Z of the Schur
+ vectors of H. If COMPZ = 'V', on entry Z must contain an
+ N-by-N matrix Q, which is assumed to be equal to the unit
+ matrix except for the submatrix Z(ILO:IHI,ILO:IHI). On exit,
+ if INFO = 0, Z contains Q*Z.
+ Normally Q is the orthogonal matrix generated by DORGHR
+ after the call to DGEHRD which formed the Hessenberg matrix
+ H. (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. If COMPZ = 'I' or
+ COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient and delivers very good and sometimes
+ optimal performance. However, LWORK as large as 11*N
+ may be required for optimal performance. A workspace
+ query is recommended to determine the optimal workspace
+ size.
+
+ If LWORK = -1, then DHSEQR does a workspace query.
+ In this case, DHSEQR checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z is accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .LT. 0: if INFO = -i, the i-th argument had an illegal
+ value
+ .GT. 0: if INFO = i, DHSEQR failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
+ and WI contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and JOB = 'E', then on exit, the
+ remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and JOB = 'S', then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and quasi-triangular
+ in rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and COMPZ = 'V', then on exit
+
+ (final value of Z) = (initial value of Z)*U
+
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'I', then on exit
+ (final value of Z) = U
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'N', then Z is not
+ accessed.
+
+ ================================================================
+ Default values supplied by
+ ILAENV(ISPEC,'DHSEQR',JOB(:1)//COMPZ(:1),N,ILO,IHI,LWORK).
+ It is suggested that these defaults be adjusted in order
+ to attain best performance in each particular
+ computational environment.
+
+ ISPEC=12: The DLAHQR vs DLAQR0 crossover point.
+ Default: 75. (Must be at least 11.)
+
+ ISPEC=13: Recommended deflation window size.
+ This depends on ILO, IHI and NS. NS is the
+ number of simultaneous shifts returned
+ by ILAENV(ISPEC=15). (See ISPEC=15 below.)
+ The default for (IHI-ILO+1).LE.500 is NS.
+ The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+
+ ISPEC=14: Nibble crossover point. (See IPARMQ for
+ details.) Default: 14% of deflation window
+ size.
+
+ ISPEC=15: Number of simultaneous shifts in a multishift
+ QR iteration.
+
+ If IHI-ILO+1 is ...
+
+ greater than ...but less ... the
+ or equal to ... than default is
+
+ 1 30 NS = 2(+)
+ 30 60 NS = 4(+)
+ 60 150 NS = 10(+)
+ 150 590 NS = **
+ 590 3000 NS = 64
+ 3000 6000 NS = 128
+ 6000 infinity NS = 256
+
+ (+) By default some or all matrices of this order
+ are passed to the implicit double shift routine
+ DLAHQR and this parameter is ignored. See
+ ISPEC=12 above and comments in IPARMQ for
+ details.
+
+ (**) The asterisks (**) indicate an ad-hoc
+ function of N increasing from 10 to 64.
+
+ ISPEC=16: Select structured matrix multiply.
+ If the number of simultaneous shifts (specified
+ by ISPEC=15) is less than 14, then the default
+ for ISPEC=16 is 0. Otherwise the default for
+ ISPEC=16 is 2.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . DLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== NL allocates some local workspace to help small matrices
+ . through a rare DLAHQR failure. NL .GT. NTINY = 11 is
+ . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+ . mended. (The default value of NMIN is 75.) Using NL = 49
+ . allows up to six simultaneous shifts and a 16-by-16
+ . deflation window. ====
+
+ ==== Decode and check the input parameters. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ wantt = lsame_(job, "S");
+ initz = lsame_(compz, "I");
+ wantz = initz || lsame_(compz, "V");
+ work[1] = (doublereal) max(1,*n);
+ lquery = *lwork == -1;
+
+ *info = 0;
+ if (! lsame_(job, "E") && ! wantt) {
+ *info = -1;
+ } else if (! lsame_(compz, "N") && ! wantz) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*ldh < max(1,*n)) {
+ *info = -7;
+ } else if (*ldz < 1 || wantz && *ldz < max(1,*n)) {
+ *info = -11;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -13;
+ }
+
+ if (*info != 0) {
+
+/* ==== Quick return in case of invalid argument. ==== */
+
+ i__1 = -(*info);
+ xerbla_("DHSEQR", &i__1);
+ return 0;
+
+ } else if (*n == 0) {
+
+/* ==== Quick return in case N = 0; nothing to do. ==== */
+
+ return 0;
+
+ } else if (lquery) {
+
+/* ==== Quick return in case of a workspace query ==== */
+
+ dlaqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1], &wi[
+ 1], ilo, ihi, &z__[z_offset], ldz, &work[1], lwork, info);
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+ Computing MAX
+*/
+ d__1 = (doublereal) max(1,*n);
+ work[1] = max(d__1,work[1]);
+ return 0;
+
+ } else {
+
+/* ==== copy eigenvalues isolated by DGEBAL ==== */
+
+ i__1 = *ilo - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ wr[i__] = h__[i__ + i__ * h_dim1];
+ wi[i__] = 0.;
+/* L10: */
+ }
+ i__1 = *n;
+ for (i__ = *ihi + 1; i__ <= i__1; ++i__) {
+ wr[i__] = h__[i__ + i__ * h_dim1];
+ wi[i__] = 0.;
+/* L20: */
+ }
+
+/* ==== Initialize Z, if requested ==== */
+
+ if (initz) {
+ dlaset_("A", n, n, &c_b29, &c_b15, &z__[z_offset], ldz)
+ ;
+ }
+
+/* ==== Quick return if possible ==== */
+
+ if (*ilo == *ihi) {
+ wr[*ilo] = h__[*ilo + *ilo * h_dim1];
+ wi[*ilo] = 0.;
+ return 0;
+ }
+
+/*
+ ==== DLAHQR/DLAQR0 crossover point ====
+
+ Writing concatenation
+*/
+ i__2[0] = 1, a__1[0] = job;
+ i__2[1] = 1, a__1[1] = compz;
+ s_cat(ch__1, a__1, i__2, &c__2, (ftnlen)2);
+ nmin = ilaenv_(&c__12, "DHSEQR", ch__1, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== DLAQR0 for big matrices; DLAHQR for small ones ==== */
+
+ if (*n > nmin) {
+ dlaqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1],
+ &wi[1], ilo, ihi, &z__[z_offset], ldz, &work[1], lwork,
+ info);
+ } else {
+
+/* ==== Small matrix ==== */
+
+ dlahqr_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1],
+ &wi[1], ilo, ihi, &z__[z_offset], ldz, info);
+
+ if (*info > 0) {
+
+/*
+ ==== A rare DLAHQR failure! DLAQR0 sometimes succeeds
+ . when DLAHQR fails. ====
+*/
+
+ kbot = *info;
+
+ if (*n >= 49) {
+
+/*
+ ==== Larger matrices have enough subdiagonal scratch
+ . space to call DLAQR0 directly. ====
+*/
+
+ dlaqr0_(&wantt, &wantz, n, ilo, &kbot, &h__[h_offset],
+ ldh, &wr[1], &wi[1], ilo, ihi, &z__[z_offset],
+ ldz, &work[1], lwork, info);
+
+ } else {
+
+/*
+ ==== Tiny matrices don't have enough subdiagonal
+ . scratch space to benefit from DLAQR0. Hence,
+ . tiny matrices must be copied into a larger
+ . array before calling DLAQR0. ====
+*/
+
+ dlacpy_("A", n, n, &h__[h_offset], ldh, hl, &c__49);
+ hl[*n + 1 + *n * 49 - 50] = 0.;
+ i__1 = 49 - *n;
+ dlaset_("A", &c__49, &i__1, &c_b29, &c_b29, &hl[(*n + 1) *
+ 49 - 49], &c__49);
+ dlaqr0_(&wantt, &wantz, &c__49, ilo, &kbot, hl, &c__49, &
+ wr[1], &wi[1], ilo, ihi, &z__[z_offset], ldz,
+ workl, &c__49, info);
+ if (wantt || *info != 0) {
+ dlacpy_("A", n, n, hl, &c__49, &h__[h_offset], ldh);
+ }
+ }
+ }
+ }
+
+/* ==== Clear out the trash, if necessary. ==== */
+
+ if ((wantt || *info != 0) && *n > 2) {
+ i__1 = *n - 2;
+ i__3 = *n - 2;
+ dlaset_("L", &i__1, &i__3, &c_b29, &c_b29, &h__[h_dim1 + 3], ldh);
+ }
+
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+
+ Computing MAX
+*/
+ d__1 = (doublereal) max(1,*n);
+ work[1] = max(d__1,work[1]);
+ }
+
+/* ==== End of DHSEQR ==== */
+
+ return 0;
+} /* dhseqr_ */
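+
+/*
+   Illustrative usage sketch, not part of LAPACK: computes the eigenvalues
+   only (JOB='E', COMPZ='N') of a small upper Hessenberg matrix.  The
+   argument order follows the Arguments section above; the prototype is
+   assumed to omit trailing ftnlen arguments for the CHARACTER dummies,
+   matching the convention used by dlacpy_/dlaqr0_ in this file.  The
+   matrix entries and the 11*N workspace size are arbitrary example choices.
+*/
+static int dhseqr_example_sketch(doublereal *wr, doublereal *wi)
+{
+    /* 3-by-3 upper Hessenberg matrix, stored column-major with LDH = 3. */
+    doublereal h[9] = {4., 1., 0.,  3., 2., 1.,  2., 1., 3.};
+    doublereal z = 0., work[33];
+    integer n = 3, ilo = 1, ihi = 3, ldh = 3, ldz = 1, lwork = 33, info;
+
+    /* COMPZ='N': Z is not referenced, so a dummy scalar with LDZ = 1 is
+       sufficient.  WR and WI must each provide room for N eigenvalues. */
+    dhseqr_("E", "N", &n, &ilo, &ihi, h, &ldh, wr, wi, &z, &ldz,
+            work, &lwork, &info);
+    return (int) info;   /* 0 on success */
+}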
+
+logical disnan_(doublereal *din)
+{
+ /* System generated locals */
+ logical ret_val;
+
+ /* Local variables */
+ extern logical dlaisnan_(doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DISNAN returns .TRUE. if its argument is NaN, and .FALSE.
+ otherwise. To be replaced by the Fortran 2003 intrinsic in the
+ future.
+
+ Arguments
+ =========
+
+ DIN (input) DOUBLE PRECISION
+ Input to test for NaN.
+
+ =====================================================================
+*/
+
+ ret_val = dlaisnan_(din, din);
+ return ret_val;
+} /* disnan_ */
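+
+/*
+   Illustrative sketch, not part of LAPACK: DLAISNAN realizes the test by
+   comparing its argument against itself (see the call above), so DISNAN is
+   equivalent to the usual IEEE-754 self-comparison trick shown here --
+   NaN is the only value that compares unequal to itself.
+*/
+static logical isnan_sketch(doublereal x)
+{
+    return (logical) (x != x);
+}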
+
+/* Subroutine */ int dlabad_(doublereal *small, doublereal *large)
+{
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLABAD takes as input the values computed by DLAMCH for underflow and
+ overflow, and returns the square root of each of these values if the
+ log of LARGE is sufficiently large. This subroutine is intended to
+ identify machines with a large exponent range, such as the Crays, and
+ redefine the underflow and overflow limits to be the square roots of
+ the values computed by DLAMCH. This subroutine is needed because
+ DLAMCH does not compensate for poor arithmetic in the upper half of
+ the exponent range, as is found on a Cray.
+
+ Arguments
+ =========
+
+ SMALL (input/output) DOUBLE PRECISION
+ On entry, the underflow threshold as computed by DLAMCH.
+ On exit, if LOG10(LARGE) is sufficiently large, the square
+ root of SMALL, otherwise unchanged.
+
+ LARGE (input/output) DOUBLE PRECISION
+ On entry, the overflow threshold as computed by DLAMCH.
+ On exit, if LOG10(LARGE) is sufficiently large, the square
+ root of LARGE, otherwise unchanged.
+
+ =====================================================================
+
+
+ If it looks like we're on a Cray, take the square root of
+ SMALL and LARGE to avoid overflow and underflow problems.
+*/
+
+ if (d_lg10(large) > 2e3) {
+ *small = sqrt(*small);
+ *large = sqrt(*large);
+ }
+
+ return 0;
+
+/* End of DLABAD */
+
+} /* dlabad_ */
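+
+/*
+   Illustrative usage sketch, not part of LAPACK: with IEEE-754 double
+   limits, log10(LARGE) is roughly 308, far below the 2e3 cutoff tested
+   above, so DLABAD leaves both thresholds unchanged.  Only arithmetics
+   with an extreme exponent range (historically, Cray) would trigger the
+   square-root rescaling.
+*/
+static void dlabad_example_sketch(void)
+{
+    doublereal small_val = 2.2250738585072014e-308;   /* ~ DBL_MIN */
+    doublereal large_val = 1.7976931348623157e+308;   /* ~ DBL_MAX */
+    dlabad_(&small_val, &large_val);
+    /* small_val and large_val are unchanged on IEEE-754 hardware. */
+}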
+
+/* Subroutine */ int dlabrd_(integer *m, integer *n, integer *nb, doublereal *
+ a, integer *lda, doublereal *d__, doublereal *e, doublereal *tauq,
+ doublereal *taup, doublereal *x, integer *ldx, doublereal *y, integer
+ *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+
+ /* Local variables */
+ static integer i__;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *), dgemv_(char *, integer *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *), dlarfg_(integer *, doublereal *,
+ doublereal *, integer *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLABRD reduces the first NB rows and columns of a real general
+ m by n matrix A to upper or lower bidiagonal form by an orthogonal
+ transformation Q' * A * P, and returns the matrices X and Y which
+ are needed to apply the transformation to the unreduced part of A.
+
+ If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+ bidiagonal form.
+
+ This is an auxiliary routine called by DGEBRD
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A.
+
+ N (input) INTEGER
+ The number of columns in the matrix A.
+
+ NB (input) INTEGER
+ The number of leading rows and columns of A to be reduced.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit, the first NB rows and columns of the matrix are
+ overwritten; the rest of the array is unchanged.
+ If m >= n, elements on and below the diagonal in the first NB
+ columns, with the array TAUQ, represent the orthogonal
+ matrix Q as a product of elementary reflectors; and
+ elements above the diagonal in the first NB rows, with the
+ array TAUP, represent the orthogonal matrix P as a product
+ of elementary reflectors.
+ If m < n, elements below the diagonal in the first NB
+ columns, with the array TAUQ, represent the orthogonal
+ matrix Q as a product of elementary reflectors, and
+ elements on and above the diagonal in the first NB rows,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) DOUBLE PRECISION array, dimension (NB)
+ The diagonal elements of the first NB rows and columns of
+ the reduced matrix. D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (NB)
+ The off-diagonal elements of the first NB rows and columns of
+ the reduced matrix.
+
+ TAUQ (output) DOUBLE PRECISION array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) DOUBLE PRECISION array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ X (output) DOUBLE PRECISION array, dimension (LDX,NB)
+ The m-by-nb matrix X required to update the unreduced part
+ of A.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. LDX >= M.
+
+ Y (output) DOUBLE PRECISION array, dimension (LDY,NB)
+ The n-by-nb matrix Y required to update the unreduced part
+ of A.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors.
+
+ If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
+ A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The elements of the vectors v and u together form the m-by-nb matrix
+ V and the nb-by-n matrix U' which are needed, with X and Y, to apply
+ the transformation to the unreduced part of the matrix, using a block
+ update of the form: A := A - V*Y' - X*U'.
+
+ The contents of A on exit are illustrated by the following examples
+ with nb = 2:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 )
+ ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 )
+ ( v1 v2 a a a ) ( v1 1 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix which is unchanged,
+ vi denotes an element of the vector defining H(i), and ui an element
+ of the vector defining G(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:m,i) */
+
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + a_dim1],
+ lda, &y[i__ + y_dim1], ldy, &c_b15, &a[i__ + i__ * a_dim1]
+ , &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &x[i__ + x_dim1],
+ ldx, &a[i__ * a_dim1 + 1], &c__1, &c_b15, &a[i__ + i__ *
+ a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ dlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ *
+ a_dim1], &c__1, &tauq[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ if (i__ < *n) {
+ a[i__ + i__ * a_dim1] = 1.;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + (i__ + 1) *
+ a_dim1], lda, &a[i__ + i__ * a_dim1], &c__1, &c_b29,
+ &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + a_dim1],
+ lda, &a[i__ + i__ * a_dim1], &c__1, &c_b29, &y[i__ *
+ y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b15, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &x[i__ + x_dim1],
+ ldx, &a[i__ + i__ * a_dim1], &c__1, &c_b29, &y[i__ *
+ y_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__3, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &c_b15,
+ &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ dscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+
+/* Update A(i,i+1:n) */
+
+ i__2 = *n - i__;
+ dgemv_("No transpose", &i__2, &i__, &c_b151, &y[i__ + 1 +
+ y_dim1], ldy, &a[i__ + a_dim1], lda, &c_b15, &a[i__ +
+ (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__3, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, &c_b15, &a[
+ i__ + (i__ + 1) * a_dim1], lda);
+
+/* Generate reflection P(i) to annihilate A(i,i+2:n) */
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + (i__ + 1) * a_dim1], &a[i__ + min(
+ i__3,*n) * a_dim1], lda, &taup[i__]);
+ e[i__] = a[i__ + (i__ + 1) * a_dim1];
+ a[i__ + (i__ + 1) * a_dim1] = 1.;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ dgemv_("No transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + (
+ i__ + 1) * a_dim1], lda, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &c_b29, &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__, &c_b15, &y[i__ + 1 + y_dim1],
+ ldy, &a[i__ + (i__ + 1) * a_dim1], lda, &c_b29, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ dgemv_("No transpose", &i__2, &i__, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("No transpose", &i__2, &i__3, &c_b15, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b29, &x[i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ dscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i,i:n) */
+
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &y[i__ + y_dim1],
+ ldy, &a[i__ + a_dim1], lda, &c_b15, &a[i__ + i__ * a_dim1]
+ , lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b151, &a[i__ * a_dim1 + 1],
+ lda, &x[i__ + x_dim1], ldx, &c_b15, &a[i__ + i__ * a_dim1]
+ , lda);
+
+/* Generate reflection P(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ dlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3,*n) *
+ a_dim1], lda, &taup[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ if (i__ < *m) {
+ a[i__ + i__ * a_dim1] = 1.;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + i__
+ * a_dim1], lda, &a[i__ + i__ * a_dim1], lda, &c_b29, &
+ x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &y[i__ + y_dim1],
+ ldy, &a[i__ + i__ * a_dim1], lda, &c_b29, &x[i__ *
+ x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b15, &a[i__ * a_dim1
+ + 1], lda, &a[i__ + i__ * a_dim1], lda, &c_b29, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ dscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+
+/* Update A(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &y[i__ + y_dim1], ldy, &c_b15, &a[i__ +
+ 1 + i__ * a_dim1], &c__1);
+ i__2 = *m - i__;
+ dgemv_("No transpose", &i__2, &i__, &c_b151, &x[i__ + 1 +
+ x_dim1], ldx, &a[i__ * a_dim1 + 1], &c__1, &c_b15, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+2:m,i) */
+
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*m) +
+ i__ * a_dim1], &c__1, &tauq[i__]);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + (i__ +
+ 1) * a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1,
+ &c_b29, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + a_dim1]
+ , lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &y[
+ i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b15, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__;
+ dgemv_("Transpose", &i__2, &i__, &c_b15, &x[i__ + 1 + x_dim1],
+ ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &y[
+ i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ dgemv_("Transpose", &i__, &i__2, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &c_b15,
+ &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ dscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of DLABRD */
+
+} /* dlabrd_ */
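+
+/*
+   Illustrative usage sketch, not part of LAPACK: reduces the leading
+   NB = 2 rows and columns of a 4-by-3 column-major matrix.  The matrix
+   entries are arbitrary; the workspace shapes follow the Arguments
+   section above (X is M-by-NB with LDX = M, Y is N-by-NB with LDY = N).
+*/
+static void dlabrd_example_sketch(void)
+{
+    doublereal a[12] = {4., 1., 2., 3.,  1., 5., 1., 2.,  2., 1., 6., 1.};
+    doublereal d[2], e[2], tauq[2], taup[2], x[8], y[6];
+    integer m = 4, n = 3, nb = 2, lda = 4, ldx = 4, ldy = 3;
+
+    dlabrd_(&m, &n, &nb, a, &lda, d, e, tauq, taup, x, &ldx, y, &ldy);
+    /* d[0..1] and e[0..1] now hold the leading bidiagonal elements;
+       the reflectors are stored in a[] with scalar factors in tauq/taup. */
+}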
+
+/* Subroutine */ int dlacpy_(char *uplo, integer *m, integer *n, doublereal *
+ a, integer *lda, doublereal *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLACPY copies all or part of a two-dimensional matrix A to another
+ matrix B.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be copied to B.
+ = 'U': Upper triangular part
+ = 'L': Lower triangular part
+ Otherwise: All of the matrix A
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The m by n matrix A. If UPLO = 'U', only the upper triangle
+ or trapezoid is accessed; if UPLO = 'L', only the lower
+ triangle or trapezoid is accessed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (output) DOUBLE PRECISION array, dimension (LDB,N)
+ On exit, B = A in the locations specified by UPLO.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
+/* L10: */
+ }
+/* L20: */
+ }
+ } else if (lsame_(uplo, "L")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+ return 0;
+
+/* End of DLACPY */
+
+} /* dlacpy_ */
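+
+/*
+   Illustrative usage sketch, not part of LAPACK: copies only the lower
+   triangle of a 3-by-3 column-major matrix; the strict upper triangle of
+   the destination is left untouched (still zero here).
+*/
+static void dlacpy_example_sketch(void)
+{
+    doublereal a[9] = {1., 2., 3.,  4., 5., 6.,  7., 8., 9.};
+    doublereal b[9] = {0., 0., 0.,  0., 0., 0.,  0., 0., 0.};
+    integer m = 3, n = 3, lda = 3, ldb = 3;
+
+    dlacpy_("L", &m, &n, a, &lda, b, &ldb);
+}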
+
+/* Subroutine */ int dladiv_(doublereal *a, doublereal *b, doublereal *c__,
+ doublereal *d__, doublereal *p, doublereal *q)
+{
+ static doublereal e, f;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLADIV performs complex division in real arithmetic
+
+ a + i*b
+ p + i*q = ---------
+ c + i*d
+
+ The algorithm is due to Robert L. Smith and can be found
+ in D. Knuth, The Art of Computer Programming, Vol.2, p.195
+
+ Arguments
+ =========
+
+ A (input) DOUBLE PRECISION
+ B (input) DOUBLE PRECISION
+ C (input) DOUBLE PRECISION
+ D (input) DOUBLE PRECISION
+ The scalars a, b, c, and d in the above expression.
+
+ P (output) DOUBLE PRECISION
+ Q (output) DOUBLE PRECISION
+ The scalars p and q in the above expression.
+
+ =====================================================================
+*/
+
+
+ if (abs(*d__) < abs(*c__)) {
+ e = *d__ / *c__;
+ f = *c__ + *d__ * e;
+ *p = (*a + *b * e) / f;
+ *q = (*b - *a * e) / f;
+ } else {
+ e = *c__ / *d__;
+ f = *d__ + *c__ * e;
+ *p = (*b + *a * e) / f;
+ *q = (-(*a) + *b * e) / f;
+ }
+
+ return 0;
+
+/* End of DLADIV */
+
+} /* dladiv_ */
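+
+/*
+   Illustrative usage sketch, not part of LAPACK: (1 + 2i) / (3 + 4i)
+   equals (11 + 2i)/25, i.e. p = 0.44 and q = 0.08 (up to rounding).
+   Because |d| >= |c| here, the second branch above is taken, with
+   e = c/d = 0.75 and f = d + c*e = 6.25.
+*/
+static void dladiv_example_sketch(doublereal *p, doublereal *q)
+{
+    doublereal a = 1., b = 2., c = 3., d = 4.;
+    dladiv_(&a, &b, &c, &d, p, q);
+}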
+
+/* Subroutine */ int dlae2_(doublereal *a, doublereal *b, doublereal *c__,
+ doublereal *rt1, doublereal *rt2)
+{
+ /* System generated locals */
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal ab, df, tb, sm, rt, adf, acmn, acmx;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAE2 computes the eigenvalues of a 2-by-2 symmetric matrix
+ [ A B ]
+ [ B C ].
+ On return, RT1 is the eigenvalue of larger absolute value, and RT2
+ is the eigenvalue of smaller absolute value.
+
+ Arguments
+ =========
+
+ A (input) DOUBLE PRECISION
+ The (1,1) element of the 2-by-2 matrix.
+
+ B (input) DOUBLE PRECISION
+ The (1,2) and (2,1) elements of the 2-by-2 matrix.
+
+ C (input) DOUBLE PRECISION
+ The (2,2) element of the 2-by-2 matrix.
+
+ RT1 (output) DOUBLE PRECISION
+ The eigenvalue of larger absolute value.
+
+ RT2 (output) DOUBLE PRECISION
+ The eigenvalue of smaller absolute value.
+
+ Further Details
+ ===============
+
+ RT1 is accurate to a few ulps barring over/underflow.
+
+ RT2 may be inaccurate if there is massive cancellation in the
+ determinant A*C-B*B; higher precision or correctly rounded or
+ correctly truncated arithmetic would be needed to compute RT2
+ accurately in all cases.
+
+ Overflow is possible only if RT1 is within a factor of 5 of overflow.
+ Underflow is harmless if the input data is 0 or exceeds
+ underflow_threshold / macheps.
+
+ =====================================================================
+
+
+ Compute the eigenvalues
+*/
+
+ sm = *a + *c__;
+ df = *a - *c__;
+ adf = abs(df);
+ tb = *b + *b;
+ ab = abs(tb);
+ if (abs(*a) > abs(*c__)) {
+ acmx = *a;
+ acmn = *c__;
+ } else {
+ acmx = *c__;
+ acmn = *a;
+ }
+ if (adf > ab) {
+/* Computing 2nd power */
+ d__1 = ab / adf;
+ rt = adf * sqrt(d__1 * d__1 + 1.);
+ } else if (adf < ab) {
+/* Computing 2nd power */
+ d__1 = adf / ab;
+ rt = ab * sqrt(d__1 * d__1 + 1.);
+ } else {
+
+/* Includes case AB=ADF=0 */
+
+ rt = ab * sqrt(2.);
+ }
+ if (sm < 0.) {
+ *rt1 = (sm - rt) * .5;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else if (sm > 0.) {
+ *rt1 = (sm + rt) * .5;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else {
+
+/* Includes case RT1 = RT2 = 0 */
+
+ *rt1 = rt * .5;
+ *rt2 = rt * -.5;
+ }
+ return 0;
+
+/* End of DLAE2 */
+
+} /* dlae2_ */
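+
+/*
+   Illustrative usage sketch, not part of LAPACK: for the symmetric
+   2-by-2 matrix [ 2 1; 1 2 ] the exact eigenvalues are (a+c)/2 +/- |b|,
+   i.e. 3 and 1.  RT1 receives the value of larger magnitude, RT2 the
+   smaller one (up to rounding).
+*/
+static void dlae2_example_sketch(doublereal *rt1, doublereal *rt2)
+{
+    doublereal a = 2., b = 1., c = 2.;
+    dlae2_(&a, &b, &c, rt1, rt2);   /* expect *rt1 == 3., *rt2 == 1. */
+}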
+
+/* Subroutine */ int dlaed0_(integer *icompq, integer *qsiz, integer *n,
+ doublereal *d__, doublereal *e, doublereal *q, integer *ldq,
+ doublereal *qstore, integer *ldqs, doublereal *work, integer *iwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, qstore_dim1, qstore_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, k, iq, lgn, msd2, smm1, spm1, spm2;
+ static doublereal temp;
+ static integer curr;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer iperm;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer indxq, iwrem;
+ extern /* Subroutine */ int dlaed1_(integer *, doublereal *, doublereal *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *);
+ static integer iqptr;
+ extern /* Subroutine */ int dlaed7_(integer *, integer *, integer *,
+ integer *, integer *, integer *, doublereal *, doublereal *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, integer *, integer *, doublereal
+ *, doublereal *, integer *, integer *);
+ static integer tlvls;
+ extern /* Subroutine */ int dlacpy_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *);
+ static integer igivcl;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer igivnm, submat, curprb, subpbs, igivpt;
+ extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *);
+ static integer curlvl, matsiz, iprmpt, smlsiz;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAED0 computes all eigenvalues and corresponding eigenvectors of a
+ symmetric tridiagonal matrix using the divide and conquer method.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ = 0: Compute eigenvalues only.
+ = 1: Compute eigenvectors of original dense symmetric matrix
+ also. On entry, Q contains the orthogonal matrix used
+ to reduce the original matrix to tridiagonal form.
+ = 2: Compute eigenvalues and eigenvectors of tridiagonal
+ matrix.
+
+ QSIZ (input) INTEGER
+ The dimension of the orthogonal matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the main diagonal of the tridiagonal matrix.
+ On exit, its eigenvalues.
+
+ E (input) DOUBLE PRECISION array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ, N)
+ On entry, Q must contain an N-by-N orthogonal matrix.
+ If ICOMPQ = 0 Q is not referenced.
+ If ICOMPQ = 1 On entry, Q is a subset of the columns of the
+ orthogonal matrix used to reduce the full
+ matrix to tridiagonal form corresponding to
+ the subset of the full matrix which is being
+ decomposed at this time.
+ If ICOMPQ = 2 On entry, Q will be the identity matrix.
+ On exit, Q contains the eigenvectors of the
+ tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. If eigenvectors are
+ desired, then LDQ >= max(1,N). In any case, LDQ >= 1.
+
+ QSTORE (workspace) DOUBLE PRECISION array, dimension (LDQS, N)
+ Referenced only when ICOMPQ = 1. Used to store parts of
+ the eigenvector matrix when the updating matrix multiplies
+ take place.
+
+ LDQS (input) INTEGER
+ The leading dimension of the array QSTORE. If ICOMPQ = 1,
+ then LDQS >= max(1,N). In any case, LDQS >= 1.
+
+ WORK (workspace) DOUBLE PRECISION array,
+ If ICOMPQ = 0 or 1, the dimension of WORK must be at least
+ 1 + 3*N + 2*N*lg N + 2*N**2
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+ If ICOMPQ = 2, the dimension of WORK must be at least
+ 4*N + N**2.
+
+ IWORK (workspace) INTEGER array,
+ If ICOMPQ = 0 or 1, the dimension of IWORK must be at least
+ 6 + 6*N + 5*N*lg N.
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+ If ICOMPQ = 2, the dimension of IWORK must be at least
+ 3 + 5*N.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ qstore_dim1 = *ldqs;
+ qstore_offset = 1 + qstore_dim1;
+ qstore -= qstore_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 2) {
+ *info = -1;
+ } else if (*icompq == 1 && *qsiz < max(0,*n)) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ldq < max(1,*n)) {
+ *info = -7;
+ } else if (*ldqs < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED0", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ smlsiz = ilaenv_(&c__9, "DLAED0", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+
+/*
+ Determine the size and placement of the submatrices, and save in
+ the leading elements of IWORK.
+*/
+
+ iwork[1] = *n;
+ subpbs = 1;
+ tlvls = 0;
+L10:
+ if (iwork[subpbs] > smlsiz) {
+ for (j = subpbs; j >= 1; --j) {
+ iwork[j * 2] = (iwork[j] + 1) / 2;
+ iwork[(j << 1) - 1] = iwork[j] / 2;
+/* L20: */
+ }
+ ++tlvls;
+ subpbs <<= 1;
+ goto L10;
+ }
+ i__1 = subpbs;
+ for (j = 2; j <= i__1; ++j) {
+ iwork[j] += iwork[j - 1];
+/* L30: */
+ }
+
+/*
+ Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1
+ using rank-1 modifications (cuts).
+*/
+
+ spm1 = subpbs - 1;
+ i__1 = spm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ submat = iwork[i__] + 1;
+ smm1 = submat - 1;
+ d__[smm1] -= (d__1 = e[smm1], abs(d__1));
+ d__[submat] -= (d__1 = e[smm1], abs(d__1));
+/* L40: */
+ }
+
+ indxq = (*n << 2) + 3;
+ if (*icompq != 2) {
+
+/*
+ Set up workspaces for eigenvalues only/accumulate new vectors
+ routine
+*/
+
+ temp = log((doublereal) (*n)) / log(2.);
+ lgn = (integer) temp;
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ iprmpt = indxq + *n + 1;
+ iperm = iprmpt + *n * lgn;
+ iqptr = iperm + *n * lgn;
+ igivpt = iqptr + *n + 2;
+ igivcl = igivpt + *n * lgn;
+
+ igivnm = 1;
+ iq = igivnm + (*n << 1) * lgn;
+/* Computing 2nd power */
+ i__1 = *n;
+ iwrem = iq + i__1 * i__1 + 1;
+
+/* Initialize pointers */
+
+ i__1 = subpbs;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ iwork[iprmpt + i__] = 1;
+ iwork[igivpt + i__] = 1;
+/* L50: */
+ }
+ iwork[iqptr] = 1;
+ }
+
+/*
+ Solve each submatrix eigenproblem at the bottom of the divide and
+ conquer tree.
+*/
+
+ curr = 0;
+ i__1 = spm1;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[1];
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 1] - iwork[i__];
+ }
+ if (*icompq == 2) {
+ dsteqr_("I", &matsiz, &d__[submat], &e[submat], &q[submat +
+ submat * q_dim1], ldq, &work[1], info);
+ if (*info != 0) {
+ goto L130;
+ }
+ } else {
+ dsteqr_("I", &matsiz, &d__[submat], &e[submat], &work[iq - 1 +
+ iwork[iqptr + curr]], &matsiz, &work[1], info);
+ if (*info != 0) {
+ goto L130;
+ }
+ if (*icompq == 1) {
+ dgemm_("N", "N", qsiz, &matsiz, &matsiz, &c_b15, &q[submat *
+ q_dim1 + 1], ldq, &work[iq - 1 + iwork[iqptr + curr]],
+ &matsiz, &c_b29, &qstore[submat * qstore_dim1 + 1],
+ ldqs);
+ }
+/* Computing 2nd power */
+ i__2 = matsiz;
+ iwork[iqptr + curr + 1] = iwork[iqptr + curr] + i__2 * i__2;
+ ++curr;
+ }
+ k = 1;
+ i__2 = iwork[i__ + 1];
+ for (j = submat; j <= i__2; ++j) {
+ iwork[indxq + j] = k;
+ ++k;
+/* L60: */
+ }
+/* L70: */
+ }
+
+/*
+ Successively merge eigensystems of adjacent submatrices
+ into eigensystem for the corresponding larger matrix.
+
+ while ( SUBPBS > 1 )
+*/
+
+ curlvl = 1;
+L80:
+ if (subpbs > 1) {
+ spm2 = subpbs - 2;
+ i__1 = spm2;
+ for (i__ = 0; i__ <= i__1; i__ += 2) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[2];
+ msd2 = iwork[1];
+ curprb = 0;
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 2] - iwork[i__];
+ msd2 = matsiz / 2;
+ ++curprb;
+ }
+
+/*
+ Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2)
+ into an eigensystem of size MATSIZ.
+ DLAED1 is used only for the full eigensystem of a tridiagonal
+ matrix.
+ DLAED7 handles the cases in which eigenvalues only or eigenvalues
+ and eigenvectors of a full symmetric matrix (which was reduced to
+ tridiagonal form) are desired.
+*/
+
+ if (*icompq == 2) {
+ dlaed1_(&matsiz, &d__[submat], &q[submat + submat * q_dim1],
+ ldq, &iwork[indxq + submat], &e[submat + msd2 - 1], &
+ msd2, &work[1], &iwork[subpbs + 1], info);
+ } else {
+ dlaed7_(icompq, &matsiz, qsiz, &tlvls, &curlvl, &curprb, &d__[
+ submat], &qstore[submat * qstore_dim1 + 1], ldqs, &
+ iwork[indxq + submat], &e[submat + msd2 - 1], &msd2, &
+ work[iq], &iwork[iqptr], &iwork[iprmpt], &iwork[iperm]
+ , &iwork[igivpt], &iwork[igivcl], &work[igivnm], &
+ work[iwrem], &iwork[subpbs + 1], info);
+ }
+ if (*info != 0) {
+ goto L130;
+ }
+ iwork[i__ / 2 + 1] = iwork[i__ + 2];
+/* L90: */
+ }
+ subpbs /= 2;
+ ++curlvl;
+ goto L80;
+ }
+
+/*
+ end while
+
+ Re-merge the eigenvalues/vectors which were deflated at the final
+ merge step.
+*/
+
+ if (*icompq == 1) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ work[i__] = d__[j];
+ dcopy_(qsiz, &qstore[j * qstore_dim1 + 1], &c__1, &q[i__ * q_dim1
+ + 1], &c__1);
+/* L100: */
+ }
+ dcopy_(n, &work[1], &c__1, &d__[1], &c__1);
+ } else if (*icompq == 2) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ work[i__] = d__[j];
+ dcopy_(n, &q[j * q_dim1 + 1], &c__1, &work[*n * i__ + 1], &c__1);
+/* L110: */
+ }
+ dcopy_(n, &work[1], &c__1, &d__[1], &c__1);
+ dlacpy_("A", n, n, &work[*n + 1], n, &q[q_offset], ldq);
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ work[i__] = d__[j];
+/* L120: */
+ }
+ dcopy_(n, &work[1], &c__1, &d__[1], &c__1);
+ }
+ goto L140;
+
+L130:
+ *info = submat * (*n + 1) + submat + matsiz - 1;
+
+L140:
+ return 0;
+
+/* End of DLAED0 */
+
+} /* dlaed0_ */
+
+/* Subroutine */ int dlaed1_(integer *n, doublereal *d__, doublereal *q,
+ integer *ldq, integer *indxq, doublereal *rho, integer *cutpnt,
+ doublereal *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, k, n1, n2, is, iw, iz, iq2, zpp1, indx, indxc;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer indxp;
+ extern /* Subroutine */ int dlaed2_(integer *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *, integer *, integer *, integer *), dlaed3_(integer *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *, integer *,
+ doublereal *, doublereal *, integer *);
+ static integer idlmda;
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), xerbla_(char *, integer *);
+ static integer coltyp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAED1 computes the updated eigensystem of a diagonal
+ matrix after modification by a rank-one symmetric matrix. This
+ routine is used only for the eigenproblem which requires all
+ eigenvalues and eigenvectors of a tridiagonal matrix. DLAED7 handles
+ the case in which eigenvalues only or eigenvalues and eigenvectors
+ of a full symmetric matrix (which was reduced to tridiagonal form)
+ are desired.
+
+ T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out)
+
+ where Z = Q'u, u is a vector of length N with ones in the
+ CUTPNT-th and (CUTPNT + 1)-th elements and zeros elsewhere.
+
+ The eigenvectors of the original matrix are stored in Q, and the
+ eigenvalues are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple eigenvalues or if there is a zero in
+ the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine DLAED2.
+
+ The second stage consists of calculating the updated
+ eigenvalues. This is done by finding the roots of the secular
+ equation via the routine DLAED4 (as called by DLAED3).
+ This routine also calculates the eigenvectors of the current
+ problem.
+
+ The final stage consists of computing the updated eigenvectors
+ directly using the updated eigenvalues. The eigenvectors for
+ the current problem are multiplied with the eigenvectors from
+ the overall problem.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the eigenvalues of the rank-1-perturbed matrix.
+ On exit, the eigenvalues of the repaired matrix.
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ,N)
+ On entry, the eigenvectors of the rank-1-perturbed matrix.
+ On exit, the eigenvectors of the repaired tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (input/output) INTEGER array, dimension (N)
+ On entry, the permutation which separately sorts the two
+ subproblems in D into ascending order.
+ On exit, the permutation which will reintegrate the
+ subproblems back into sorted order,
+ i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.
+
+ RHO (input) DOUBLE PRECISION
+ The subdiagonal entry used to create the rank-1 modification.
+
+ CUTPNT (input) INTEGER
+ The location of the last eigenvalue in the leading sub-matrix.
+ min(1,N) <= CUTPNT <= N/2.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (4*N + N**2)
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ldq < max(1,*n)) {
+ *info = -4;
+ } else /* if(complicated condition) */ {
+/* Computing MIN */
+ i__1 = 1, i__2 = *n / 2;
+ if (min(i__1,i__2) > *cutpnt || *n / 2 < *cutpnt) {
+ *info = -7;
+ }
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED1", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/*
+ The following values are integer pointers which indicate
+ the portion of the workspace
+ used by a particular array in DLAED2 and DLAED3.
+*/
+
+ iz = 1;
+ idlmda = iz + *n;
+ iw = idlmda + *n;
+ iq2 = iw + *n;
+
+ indx = 1;
+ indxc = indx + *n;
+ coltyp = indxc + *n;
+ indxp = coltyp + *n;
+
+
+/*
+ Form the z-vector which consists of the last row of Q_1 and the
+ first row of Q_2.
+*/
+
+ dcopy_(cutpnt, &q[*cutpnt + q_dim1], ldq, &work[iz], &c__1);
+ zpp1 = *cutpnt + 1;
+ i__1 = *n - *cutpnt;
+ dcopy_(&i__1, &q[zpp1 + zpp1 * q_dim1], ldq, &work[iz + *cutpnt], &c__1);
+
+/* Deflate eigenvalues. */
+
+ dlaed2_(&k, n, cutpnt, &d__[1], &q[q_offset], ldq, &indxq[1], rho, &work[
+ iz], &work[idlmda], &work[iw], &work[iq2], &iwork[indx], &iwork[
+ indxc], &iwork[indxp], &iwork[coltyp], info);
+
+ if (*info != 0) {
+ goto L20;
+ }
+
+/* Solve Secular Equation. */
+
+ if (k != 0) {
+ is = (iwork[coltyp] + iwork[coltyp + 1]) * *cutpnt + (iwork[coltyp +
+ 1] + iwork[coltyp + 2]) * (*n - *cutpnt) + iq2;
+ dlaed3_(&k, n, cutpnt, &d__[1], &q[q_offset], ldq, rho, &work[idlmda],
+ &work[iq2], &iwork[indxc], &iwork[coltyp], &work[iw], &work[
+ is], info);
+ if (*info != 0) {
+ goto L20;
+ }
+
+/* Prepare the INDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = *n - k;
+ dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &indxq[1]);
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indxq[i__] = i__;
+/* L10: */
+ }
+ }
+
+L20:
+ return 0;
+
+/* End of DLAED1 */
+
+} /* dlaed1_ */
+
+/* Subroutine */ int dlaed2_(integer *k, integer *n, integer *n1, doublereal *
+ d__, doublereal *q, integer *ldq, integer *indxq, doublereal *rho,
+ doublereal *z__, doublereal *dlamda, doublereal *w, doublereal *q2,
+ integer *indx, integer *indxc, integer *indxp, integer *coltyp,
+ integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static doublereal c__;
+ static integer i__, j;
+ static doublereal s, t;
+ static integer k2, n2, ct, nj, pj, js, iq1, iq2, n1p1;
+ static doublereal eps, tau, tol;
+ static integer psm[4], imax, jmax;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static integer ctot[4];
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *), dcopy_(integer *, doublereal *, integer *, doublereal
+ *, integer *);
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), dlacpy_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAED2 merges the two sets of eigenvalues together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ eigenvalues are close together or if there is a tiny entry in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ Arguments
+ =========
+
+ K (output) INTEGER
+ The number of non-deflated eigenvalues, and the order of the
+ related secular equation. 0 <= K <= N.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ N1 (input) INTEGER
+ The location of the last eigenvalue in the leading sub-matrix.
+ min(1,N) <= N1 <= N/2.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, D contains the eigenvalues of the two submatrices to
+ be combined.
+ On exit, D contains the trailing (N-K) updated eigenvalues
+ (those which were deflated) sorted into increasing order.
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ, N)
+ On entry, Q contains the eigenvectors of two submatrices in
+ the two square blocks with corners at (1,1), (N1,N1)
+ and (N1+1, N1+1), (N,N).
+ On exit, Q contains the trailing (N-K) updated eigenvectors
+ (those which were deflated) in its last N-K columns.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (input/output) INTEGER array, dimension (N)
+ The permutation which separately sorts the two sub-problems
+ in D into ascending order. Note that elements in the second
+ half of this permutation must first have N1 added to their
+ values. Destroyed on exit.
+
+ RHO (input/output) DOUBLE PRECISION
+ On entry, the off-diagonal element associated with the rank-1
+ cut which originally split the two submatrices which are now
+ being recombined.
+ On exit, RHO has been modified to the value required by
+ DLAED3.
+
+ Z (input) DOUBLE PRECISION array, dimension (N)
+ On entry, Z contains the updating vector (the last
+ row of the first sub-eigenvector matrix and the first row of
+ the second sub-eigenvector matrix).
+ On exit, the contents of Z have been destroyed by the updating
+ process.
+
+ DLAMDA (output) DOUBLE PRECISION array, dimension (N)
+ A copy of the first K eigenvalues which will be used by
+ DLAED3 to form the secular equation.
+
+ W (output) DOUBLE PRECISION array, dimension (N)
+ The first k values of the final deflation-altered z-vector
+ which will be passed to DLAED3.
+
+ Q2 (output) DOUBLE PRECISION array, dimension (N1**2+(N-N1)**2)
+ A copy of the first K eigenvectors which will be used by
+ DLAED3 in a matrix multiply (DGEMM) to solve for the new
+ eigenvectors.
+
+ INDX (workspace) INTEGER array, dimension (N)
+ The permutation used to sort the contents of DLAMDA into
+ ascending order.
+
+ INDXC (output) INTEGER array, dimension (N)
+ The permutation used to arrange the columns of the deflated
+ Q matrix into three groups: the first group contains non-zero
+ elements only at and above N1, the second contains
+ non-zero elements only below N1, and the third is dense.
+
+ INDXP (workspace) INTEGER array, dimension (N)
+ The permutation used to place deflated values of D at the end
+ of the array. INDXP(1:K) points to the nondeflated D-values
+ and INDXP(K+1:N) points to the deflated eigenvalues.
+
+ COLTYP (workspace/output) INTEGER array, dimension (N)
+ During execution, a label which will indicate which of the
+ following types a column in the Q2 matrix is:
+ 1 : non-zero in the upper half only;
+ 2 : dense;
+ 3 : non-zero in the lower half only;
+ 4 : deflated.
+ On exit, COLTYP(i) is the number of columns of type i,
+ for i=1 to 4 only.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --z__;
+ --dlamda;
+ --w;
+ --q2;
+ --indx;
+ --indxc;
+ --indxp;
+ --coltyp;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -2;
+ } else if (*ldq < max(1,*n)) {
+ *info = -6;
+ } else /* if(complicated condition) */ {
+/* Computing MIN */
+ i__1 = 1, i__2 = *n / 2;
+ if (min(i__1,i__2) > *n1 || *n / 2 < *n1) {
+ *info = -3;
+ }
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ n2 = *n - *n1;
+ n1p1 = *n1 + 1;
+
+ if (*rho < 0.) {
+ dscal_(&n2, &c_b151, &z__[n1p1], &c__1);
+ }
+
+/*
+ Normalize z so that norm(z) = 1. Since z is the concatenation of
+ two normalized vectors, norm2(z) = sqrt(2).
+*/
+
+ t = 1. / sqrt(2.);
+ dscal_(n, &t, &z__[1], &c__1);
+
+/* RHO = ABS( norm(z)**2 * RHO ) */
+
+ *rho = (d__1 = *rho * 2., abs(d__1));
+
+/* Sort the eigenvalues into increasing order */
+
+ i__1 = *n;
+ for (i__ = n1p1; i__ <= i__1; ++i__) {
+ indxq[i__] += *n1;
+/* L10: */
+ }
+
+/* re-integrate the deflated parts from the last pass */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = d__[indxq[i__]];
+/* L20: */
+ }
+ dlamrg_(n1, &n2, &dlamda[1], &c__1, &c__1, &indxc[1]);
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indx[i__] = indxq[indxc[i__]];
+/* L30: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ imax = idamax_(n, &z__[1], &c__1);
+ jmax = idamax_(n, &d__[1], &c__1);
+ eps = EPSILON;
+/* Computing MAX */
+ d__3 = (d__1 = d__[jmax], abs(d__1)), d__4 = (d__2 = z__[imax], abs(d__2))
+ ;
+ tol = eps * 8. * max(d__3,d__4);
+
+/*
+ If the rank-1 modifier is small enough, no more needs to be done
+ except to reorganize Q so that its columns correspond with the
+ elements in D.
+*/
+
+ if (*rho * (d__1 = z__[imax], abs(d__1)) <= tol) {
+ *k = 0;
+ iq2 = 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__ = indx[j];
+ dcopy_(n, &q[i__ * q_dim1 + 1], &c__1, &q2[iq2], &c__1);
+ dlamda[j] = d__[i__];
+ iq2 += *n;
+/* L40: */
+ }
+ dlacpy_("A", n, n, &q2[1], n, &q[q_offset], ldq);
+ dcopy_(n, &dlamda[1], &c__1, &d__[1], &c__1);
+ goto L190;
+ }
+
+/*
+ If there are multiple eigenvalues then the problem deflates. Here
+ the number of equal eigenvalues is found. As each equal
+ eigenvalue is found, an elementary reflector is computed to rotate
+ the corresponding eigensubspace so that the corresponding
+ components of Z are zero in this new basis.
+*/
+
+ i__1 = *n1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ coltyp[i__] = 1;
+/* L50: */
+ }
+ i__1 = *n;
+ for (i__ = n1p1; i__ <= i__1; ++i__) {
+ coltyp[i__] = 3;
+/* L60: */
+ }
+
+
+ *k = 0;
+ k2 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ nj = indx[j];
+ if (*rho * (d__1 = z__[nj], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ coltyp[nj] = 4;
+ indxp[k2] = nj;
+ if (j == *n) {
+ goto L100;
+ }
+ } else {
+ pj = nj;
+ goto L80;
+ }
+/* L70: */
+ }
+L80:
+ ++j;
+ nj = indx[j];
+ if (j > *n) {
+ goto L100;
+ }
+ if (*rho * (d__1 = z__[nj], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ coltyp[nj] = 4;
+ indxp[k2] = nj;
+ } else {
+
+/* Check if eigenvalues are close enough to allow deflation. */
+
+ s = z__[pj];
+ c__ = z__[nj];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = dlapy2_(&c__, &s);
+ t = d__[nj] - d__[pj];
+ c__ /= tau;
+ s = -s / tau;
+ if ((d__1 = t * c__ * s, abs(d__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ z__[nj] = tau;
+ z__[pj] = 0.;
+ if (coltyp[nj] != coltyp[pj]) {
+ coltyp[nj] = 2;
+ }
+ coltyp[pj] = 4;
+ drot_(n, &q[pj * q_dim1 + 1], &c__1, &q[nj * q_dim1 + 1], &c__1, &
+ c__, &s);
+/* Computing 2nd power */
+ d__1 = c__;
+/* Computing 2nd power */
+ d__2 = s;
+ t = d__[pj] * (d__1 * d__1) + d__[nj] * (d__2 * d__2);
+/* Computing 2nd power */
+ d__1 = s;
+/* Computing 2nd power */
+ d__2 = c__;
+ d__[nj] = d__[pj] * (d__1 * d__1) + d__[nj] * (d__2 * d__2);
+ d__[pj] = t;
+ --k2;
+ i__ = 1;
+L90:
+ if (k2 + i__ <= *n) {
+ if (d__[pj] < d__[indxp[k2 + i__]]) {
+ indxp[k2 + i__ - 1] = indxp[k2 + i__];
+ indxp[k2 + i__] = pj;
+ ++i__;
+ goto L90;
+ } else {
+ indxp[k2 + i__ - 1] = pj;
+ }
+ } else {
+ indxp[k2 + i__ - 1] = pj;
+ }
+ pj = nj;
+ } else {
+ ++(*k);
+ dlamda[*k] = d__[pj];
+ w[*k] = z__[pj];
+ indxp[*k] = pj;
+ pj = nj;
+ }
+ }
+ goto L80;
+L100:
+
+/* Record the last eigenvalue. */
+
+ ++(*k);
+ dlamda[*k] = d__[pj];
+ w[*k] = z__[pj];
+ indxp[*k] = pj;
+
+/*
+ Count up the total number of the various types of columns, then
+ form a permutation which positions the four column types into
+ four uniform groups (although one or more of these groups may be
+ empty).
+*/
+
+ for (j = 1; j <= 4; ++j) {
+ ctot[j - 1] = 0;
+/* L110: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ ct = coltyp[j];
+ ++ctot[ct - 1];
+/* L120: */
+ }
+
+/* PSM(*) = Position in SubMatrix (of types 1 through 4) */
+
+ psm[0] = 1;
+ psm[1] = ctot[0] + 1;
+ psm[2] = psm[1] + ctot[1];
+ psm[3] = psm[2] + ctot[2];
+ *k = *n - ctot[3];
+
+/*
+ Fill out the INDXC array so that the permutation which it induces
+ will place all type-1 columns first, all type-2 columns next,
+ then all type-3's, and finally all type-4's.
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ js = indxp[j];
+ ct = coltyp[js];
+ indx[psm[ct - 1]] = js;
+ indxc[psm[ct - 1]] = j;
+ ++psm[ct - 1];
+/* L130: */
+ }
+
+/*
+ Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+ and Q2 respectively. The eigenvalues/vectors which were not
+ deflated go into the first K slots of DLAMDA and Q2 respectively,
+ while those which were deflated go into the last N - K slots.
+*/
+
+ i__ = 1;
+ iq1 = 1;
+ iq2 = (ctot[0] + ctot[1]) * *n1 + 1;
+ i__1 = ctot[0];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ dcopy_(n1, &q[js * q_dim1 + 1], &c__1, &q2[iq1], &c__1);
+ z__[i__] = d__[js];
+ ++i__;
+ iq1 += *n1;
+/* L140: */
+ }
+
+ i__1 = ctot[1];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ dcopy_(n1, &q[js * q_dim1 + 1], &c__1, &q2[iq1], &c__1);
+ dcopy_(&n2, &q[*n1 + 1 + js * q_dim1], &c__1, &q2[iq2], &c__1);
+ z__[i__] = d__[js];
+ ++i__;
+ iq1 += *n1;
+ iq2 += n2;
+/* L150: */
+ }
+
+ i__1 = ctot[2];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ dcopy_(&n2, &q[*n1 + 1 + js * q_dim1], &c__1, &q2[iq2], &c__1);
+ z__[i__] = d__[js];
+ ++i__;
+ iq2 += n2;
+/* L160: */
+ }
+
+ iq1 = iq2;
+ i__1 = ctot[3];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ dcopy_(n, &q[js * q_dim1 + 1], &c__1, &q2[iq2], &c__1);
+ iq2 += *n;
+ z__[i__] = d__[js];
+ ++i__;
+/* L170: */
+ }
+
+/*
+ The deflated eigenvalues and their corresponding vectors go back
+ into the last N - K slots of D and Q respectively.
+*/
+
+ dlacpy_("A", n, &ctot[3], &q2[iq1], n, &q[(*k + 1) * q_dim1 + 1], ldq);
+ i__1 = *n - *k;
+ dcopy_(&i__1, &z__[*k + 1], &c__1, &d__[*k + 1], &c__1);
+
+/* Copy CTOT into COLTYP for referencing in DLAED3. */
+
+ for (j = 1; j <= 4; ++j) {
+ coltyp[j] = ctot[j - 1];
+/* L180: */
+ }
+
+L190:
+ return 0;
+
+/* End of DLAED2 */
+
+} /* dlaed2_ */
+
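+/*
+    Illustrative sketch (not part of LAPACK; the helper name and calling
+    convention are invented here). The deflation test applied in the loop
+    above (labels L80/L90) can be read in isolation: two eigenvalues whose
+    Givens-rotated coupling term |t*c*s| falls below the tolerance
+    TOL = 8*EPS*max(|D|,|Z|) are merged, leaving a zero z-component that
+    joins the deflated set. hypot() stands in for dlapy2_, and the matching
+    rotation of the two Q columns (the drot_ call above) is omitted.
+
+    #include <math.h>
+
+    // Try to deflate the eigenvalue pair (dp, dn) with z-components
+    // (zp, zn) against tolerance tol, using the same test as DLAED2.
+    // On success the 2x2 rotation is applied to the pair in place and
+    // 1 is returned; otherwise nothing changes and 0 is returned.
+    static int try_deflate_pair(double *dp, double *dn,
+                                double *zp, double *zn, double tol)
+    {
+        double s = *zp, c = *zn;
+        double tau = hypot(c, s);     // sqrt(c*c + s*s) without overflow
+        double t = *dn - *dp;
+        double cc, ss;
+        if (tau == 0.)
+            return 0;                 // both components already negligible
+        cc = c / tau;
+        ss = -s / tau;
+        if (fabs(t * cc * ss) > tol)
+            return 0;                 // eigenvalues not close enough
+        *zn = tau;                    // rotated z: all weight in the nj slot
+        *zp = 0.;                     // ... and none in pj, which deflates
+        t   = *dp * cc * cc + *dn * ss * ss;
+        *dn = *dp * ss * ss + *dn * cc * cc;
+        *dp = t;
+        return 1;
+    }
+*/
+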
+/* Subroutine */ int dlaed3_(integer *k, integer *n, integer *n1, doublereal *
+ d__, doublereal *q, integer *ldq, doublereal *rho, doublereal *dlamda,
+ doublereal *q2, integer *indx, integer *ctot, doublereal *w,
+ doublereal *s, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, n2, n12, ii, n23, iq2;
+ static doublereal temp;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *),
+ dcopy_(integer *, doublereal *, integer *, doublereal *, integer
+ *), dlaed4_(integer *, integer *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, integer *);
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ extern /* Subroutine */ int dlacpy_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *),
+ dlaset_(char *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAED3 finds the roots of the secular equation, as defined by the
+ values in D, W, and RHO, between 1 and K. It makes the
+ appropriate calls to DLAED4 and then updates the eigenvectors by
+ multiplying the matrix of eigenvectors of the pair of eigensystems
+ being combined by the matrix of eigenvectors of the K-by-K system
+ which is solved here.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ K (input) INTEGER
+ The number of terms in the rational function to be solved by
+ DLAED4. K >= 0.
+
+ N (input) INTEGER
+ The number of rows and columns in the Q matrix.
+ N >= K (deflation may result in N>K).
+
+ N1 (input) INTEGER
+ The location of the last eigenvalue in the leading submatrix.
+ min(1,N) <= N1 <= N/2.
+
+ D (output) DOUBLE PRECISION array, dimension (N)
+ D(I) contains the updated eigenvalues for
+ 1 <= I <= K.
+
+ Q (output) DOUBLE PRECISION array, dimension (LDQ,N)
+ Initially the first K columns are used as workspace.
+ On output the columns 1 to K contain
+ the updated eigenvectors.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ RHO (input) DOUBLE PRECISION
+ The value of the parameter in the rank one update equation.
+ RHO >= 0 required.
+
+ DLAMDA (input/output) DOUBLE PRECISION array, dimension (K)
+ The first K elements of this array contain the old roots
+ of the deflated updating problem. These are the poles
+ of the secular equation. May be changed on output by
+ having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
+ Cray-2, or Cray C-90, as described above.
+
+ Q2 (input) DOUBLE PRECISION array, dimension (LDQ2, N)
+ The first K columns of this matrix contain the non-deflated
+ eigenvectors for the split problem.
+
+ INDX (input) INTEGER array, dimension (N)
+ The permutation used to arrange the columns of the deflated
+ Q matrix into three groups (see DLAED2).
+ The rows of the eigenvectors found by DLAED4 must be likewise
+ permuted before the matrix multiply can take place.
+
+ CTOT (input) INTEGER array, dimension (4)
+ A count of the total number of the various types of columns
+ in Q, as described in INDX. The fourth column type is any
+ column which has been deflated.
+
+ W (input/output) DOUBLE PRECISION array, dimension (K)
+ The first K elements of this array contain the components
+ of the deflation-adjusted updating vector. Destroyed on
+ output.
+
+ S (workspace) DOUBLE PRECISION array, dimension (N1 + 1)*K
+ Will contain the eigenvectors of the repaired matrix which
+ will be multiplied by the previously accumulated eigenvectors
+ to update the system.
+
+ LDS (input) INTEGER
+ The leading dimension of S. LDS >= max(1,K).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --dlamda;
+ --q2;
+ --indx;
+ --ctot;
+ --w;
+ --s;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*k < 0) {
+ *info = -1;
+ } else if (*n < *k) {
+ *info = -2;
+ } else if (*ldq < max(1,*n)) {
+ *info = -6;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED3", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 0) {
+ return 0;
+ }
+
+/*
+ Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DLAMDA(I) if it is 1; this makes the subsequent
+ subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DLAMDA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DLAMDA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+ 2*DLAMDA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = dlamc3_(&dlamda[i__], &dlamda[i__]) - dlamda[i__];
+/* L10: */
+ }
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dlaed4_(k, &j, &dlamda[1], &w[1], &q[j * q_dim1 + 1], rho, &d__[j],
+ info);
+
+/* If the zero finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ goto L120;
+ }
+/* L20: */
+ }
+
+ if (*k == 1) {
+ goto L110;
+ }
+ if (*k == 2) {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ w[1] = q[j * q_dim1 + 1];
+ w[2] = q[j * q_dim1 + 2];
+ ii = indx[1];
+ q[j * q_dim1 + 1] = w[ii];
+ ii = indx[2];
+ q[j * q_dim1 + 2] = w[ii];
+/* L30: */
+ }
+ goto L110;
+ }
+
+/* Compute updated W. */
+
+ dcopy_(k, &w[1], &c__1, &s[1], &c__1);
+
+/* Initialize W(I) = Q(I,I) */
+
+ i__1 = *ldq + 1;
+ dcopy_(k, &q[q_offset], &i__1, &w[1], &c__1);
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L40: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L50: */
+ }
+/* L60: */
+ }
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__1 = sqrt(-w[i__]);
+ w[i__] = d_sign(&d__1, &s[i__]);
+/* L70: */
+ }
+
+/* Compute eigenvectors of the modified rank-1 modification. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ s[i__] = w[i__] / q[i__ + j * q_dim1];
+/* L80: */
+ }
+ temp = dnrm2_(k, &s[1], &c__1);
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ ii = indx[i__];
+ q[i__ + j * q_dim1] = s[ii] / temp;
+/* L90: */
+ }
+/* L100: */
+ }
+
+/* Compute the updated eigenvectors. */
+
+L110:
+
+ n2 = *n - *n1;
+ n12 = ctot[1] + ctot[2];
+ n23 = ctot[2] + ctot[3];
+
+ dlacpy_("A", &n23, k, &q[ctot[1] + 1 + q_dim1], ldq, &s[1], &n23);
+ iq2 = *n1 * n12 + 1;
+ if (n23 != 0) {
+ dgemm_("N", "N", &n2, k, &n23, &c_b15, &q2[iq2], &n2, &s[1], &n23, &
+ c_b29, &q[*n1 + 1 + q_dim1], ldq);
+ } else {
+ dlaset_("A", &n2, k, &c_b29, &c_b29, &q[*n1 + 1 + q_dim1], ldq);
+ }
+
+ dlacpy_("A", &n12, k, &q[q_offset], ldq, &s[1], &n12);
+ if (n12 != 0) {
+ dgemm_("N", "N", n1, k, &n12, &c_b15, &q2[1], n1, &s[1], &n12, &c_b29,
+ &q[q_offset], ldq);
+ } else {
+ dlaset_("A", n1, k, &c_b29, &c_b29, &q[q_dim1 + 1], ldq);
+ }
+
+
+L120:
+ return 0;
+
+/* End of DLAED3 */
+
+} /* dlaed3_ */
+
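+/*
+    Illustrative sketch (not part of LAPACK; the helper name and calling
+    convention are invented here). The loops labelled L40/L50/L70 above
+    rebuild the updating vector W from the computed roots via the
+    Gu/Eisenstat identity
+        rho * w(i)^2 = prod_j (lambda(j) - dlamda(i))
+                       / prod_{j != i} (dlamda(j) - dlamda(i)),
+    which makes the eigenvectors formed afterwards numerically orthogonal.
+    DLAED3 itself evaluates the products from the DELTA values returned by
+    DLAED4 (safer than forming lambda - dlamda directly) and leaves the
+    harmless common factor sqrt(rho) in W, since every eigenvector column
+    is normalized later. The bare algebra:
+
+    #include <math.h>
+
+    static void rebuild_w(int k, double rho, const double *dlamda,
+                          const double *lambda, const double *zsign,
+                          double *w)
+    {
+        int i, j;
+        for (i = 0; i < k; ++i) {
+            double prod = lambda[i] - dlamda[i];   // j == i factor, > 0 by interlacing
+            for (j = 0; j < k; ++j)
+                if (j != i)
+                    prod *= (lambda[j] - dlamda[i]) / (dlamda[j] - dlamda[i]);
+            // prod = rho * w[i]^2 in exact arithmetic; take the root and
+            // restore the sign of the original deflation-adjusted z[i]
+            w[i] = copysign(sqrt(fabs(prod / rho)), zsign[i]);
+        }
+    }
+*/
+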
+/* Subroutine */ int dlaed4_(integer *n, integer *i__, doublereal *d__,
+ doublereal *z__, doublereal *delta, doublereal *rho, doublereal *dlam,
+ integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal a, b, c__;
+ static integer j;
+ static doublereal w;
+ static integer ii;
+ static doublereal dw, zz[3];
+ static integer ip1;
+ static doublereal del, eta, phi, eps, tau, psi;
+ static integer iim1, iip1;
+ static doublereal dphi, dpsi;
+ static integer iter;
+ static doublereal temp, prew, temp1, dltlb, dltub, midpt;
+ static integer niter;
+ static logical swtch;
+ extern /* Subroutine */ int dlaed5_(integer *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *), dlaed6_(integer *,
+ logical *, doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *);
+ static logical swtch3;
+
+ static logical orgati;
+ static doublereal erretm, rhoinv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the I-th updated eigenvalue of a symmetric
+ rank-one modification to a diagonal matrix whose elements are
+ given in the array d. It is assumed that
+
+ D(i) < D(j) for i < j
+
+ and that RHO > 0. This is arranged by the calling routine, and
+ entails no loss of generality. The rank-one modified system is thus
+
+ diag( D ) + RHO * Z * Z_transpose.
+
+ where we assume the Euclidean norm of Z is 1.
+
+ The method consists of approximating the rational functions in the
+ secular equation by simpler interpolating rational functions.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The length of all arrays.
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. 1 <= I <= N.
+
+ D (input) DOUBLE PRECISION array, dimension (N)
+ The original eigenvalues. It is assumed that they are in
+ order, D(I) < D(J) for I < J.
+
+ Z (input) DOUBLE PRECISION array, dimension (N)
+ The components of the updating vector.
+
+ DELTA (output) DOUBLE PRECISION array, dimension (N)
+ If N .GT. 2, DELTA contains (D(j) - lambda_I) in its j-th
+ component. If N = 1, then DELTA(1) = 1. If N = 2, see DLAED5
+ for detail. The vector DELTA contains the information necessary
+ to construct the eigenvectors by DLAED3 and DLAED9.
+
+ RHO (input) DOUBLE PRECISION
+ The scalar in the symmetric updating formula.
+
+ DLAM (output) DOUBLE PRECISION
+ The computed lambda_I, the I-th updated eigenvalue.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ > 0: if INFO = 1, the updating process failed.
+
+ Internal Parameters
+ ===================
+
+ Logical variable ORGATI (origin-at-i?) is used for distinguishing
+ whether D(i) or D(i+1) is treated as the origin.
+
+ ORGATI = .true. origin at i
+ ORGATI = .false. origin at i+1
+
+ Logical variable SWTCH3 (switch-for-3-poles?) is for noting
+ if we are working with THREE poles!
+
+ MAXIT is the maximum number of iterations allowed for each
+ eigenvalue.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Since this routine is called in an inner loop, we do no argument
+ checking.
+
+ Quick return for N=1 and 2.
+*/
+
+ /* Parameter adjustments */
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ if (*n == 1) {
+
+/* Presumably, I=1 upon entry */
+
+ *dlam = d__[1] + *rho * z__[1] * z__[1];
+ delta[1] = 1.;
+ return 0;
+ }
+ if (*n == 2) {
+ dlaed5_(i__, &d__[1], &z__[1], &delta[1], rho, dlam);
+ return 0;
+ }
+
+/* Compute machine epsilon */
+
+ eps = EPSILON;
+ rhoinv = 1. / *rho;
+
+/* The case I = N */
+
+ if (*i__ == *n) {
+
+/* Initialize some basic variables */
+
+ ii = *n - 1;
+ niter = 1;
+
+/* Calculate initial guess */
+
+ midpt = *rho / 2.;
+
+/*
+ If ||Z||_2 is not one, then TEMP should be set to
+ RHO * ||Z||_2^2 / TWO
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - midpt;
+/* L10: */
+ }
+
+ psi = 0.;
+ i__1 = *n - 2;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / delta[j];
+/* L20: */
+ }
+
+ c__ = rhoinv + psi;
+ w = c__ + z__[ii] * z__[ii] / delta[ii] + z__[*n] * z__[*n] / delta[*
+ n];
+
+ if (w <= 0.) {
+ temp = z__[*n - 1] * z__[*n - 1] / (d__[*n] - d__[*n - 1] + *rho)
+ + z__[*n] * z__[*n] / *rho;
+ if (c__ <= temp) {
+ tau = *rho;
+ } else {
+ del = d__[*n] - d__[*n - 1];
+ a = -c__ * del + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*n]
+ ;
+ b = z__[*n] * z__[*n] * del;
+ if (a < 0.) {
+ tau = b * 2. / (sqrt(a * a + b * 4. * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4. * c__)) / (c__ * 2.);
+ }
+ }
+
+/*
+ It can be proved that
+ D(N)+RHO/2 <= LAMBDA(N) < D(N)+TAU <= D(N)+RHO
+*/
+
+ dltlb = midpt;
+ dltub = *rho;
+ } else {
+ del = d__[*n] - d__[*n - 1];
+ a = -c__ * del + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*n];
+ b = z__[*n] * z__[*n] * del;
+ if (a < 0.) {
+ tau = b * 2. / (sqrt(a * a + b * 4. * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4. * c__)) / (c__ * 2.);
+ }
+
+/*
+ It can be proved that
+ D(N) < D(N)+TAU < LAMBDA(N) < D(N)+RHO/2
+*/
+
+ dltlb = 0.;
+ dltub = midpt;
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - tau;
+/* L30: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L40: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / delta[*n];
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8. + erretm - phi + rhoinv + abs(tau) * (dpsi
+ + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ *dlam = d__[*i__] + tau;
+ goto L250;
+ }
+
+ if (w <= 0.) {
+ dltlb = max(dltlb,tau);
+ } else {
+ dltub = min(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ c__ = w - delta[*n - 1] * dpsi - delta[*n] * dphi;
+ a = (delta[*n - 1] + delta[*n]) * w - delta[*n - 1] * delta[*n] * (
+ dpsi + dphi);
+ b = delta[*n - 1] * delta[*n] * w;
+ if (c__ < 0.) {
+ c__ = abs(c__);
+ }
+ if (c__ == 0.) {
+/*
+ ETA = B/A
+ ETA = RHO - TAU
+*/
+ eta = dltub - tau;
+ } else if (a >= 0.) {
+ eta = (a + sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (c__
+ * 2.);
+ } else {
+ eta = b * 2. / (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))
+ );
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.) {
+ eta = (dltub - tau) / 2.;
+ } else {
+ eta = (dltlb - tau) / 2.;
+ }
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L50: */
+ }
+
+ tau += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L60: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / delta[*n];
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8. + erretm - phi + rhoinv + abs(tau) * (dpsi
+ + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Main loop to update the values of the array DELTA */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 30; ++niter) {
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ *dlam = d__[*i__] + tau;
+ goto L250;
+ }
+
+ if (w <= 0.) {
+ dltlb = max(dltlb,tau);
+ } else {
+ dltub = min(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ c__ = w - delta[*n - 1] * dpsi - delta[*n] * dphi;
+ a = (delta[*n - 1] + delta[*n]) * w - delta[*n - 1] * delta[*n] *
+ (dpsi + dphi);
+ b = delta[*n - 1] * delta[*n] * w;
+ if (a >= 0.) {
+ eta = (a + sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ } else {
+ eta = b * 2. / (a - sqrt((d__1 = a * a - b * 4. * c__, abs(
+ d__1))));
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.) {
+ eta = (dltub - tau) / 2.;
+ } else {
+ eta = (dltlb - tau) / 2.;
+ }
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L70: */
+ }
+
+ tau += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L80: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / delta[*n];
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8. + erretm - phi + rhoinv + abs(tau) * (
+ dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+/* L90: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+ *dlam = d__[*i__] + tau;
+ goto L250;
+
+/* End for the case I = N */
+
+ } else {
+
+/* The case for I < N */
+
+ niter = 1;
+ ip1 = *i__ + 1;
+
+/* Calculate initial guess */
+
+ del = d__[ip1] - d__[*i__];
+ midpt = del / 2.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - midpt;
+/* L100: */
+ }
+
+ psi = 0.;
+ i__1 = *i__ - 1;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / delta[j];
+/* L110: */
+ }
+
+ phi = 0.;
+ i__1 = *i__ + 2;
+ for (j = *n; j >= i__1; --j) {
+ phi += z__[j] * z__[j] / delta[j];
+/* L120: */
+ }
+ c__ = rhoinv + psi + phi;
+ w = c__ + z__[*i__] * z__[*i__] / delta[*i__] + z__[ip1] * z__[ip1] /
+ delta[ip1];
+
+ if (w > 0.) {
+
+/*
+ d(i)< the ith eigenvalue < (d(i)+d(i+1))/2
+
+ We choose d(i) as origin.
+*/
+
+ orgati = TRUE_;
+ a = c__ * del + z__[*i__] * z__[*i__] + z__[ip1] * z__[ip1];
+ b = z__[*i__] * z__[*i__] * del;
+ if (a > 0.) {
+ tau = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__, abs(
+ d__1))));
+ } else {
+ tau = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ }
+ dltlb = 0.;
+ dltub = midpt;
+ } else {
+
+/*
+ (d(i)+d(i+1))/2 <= the ith eigenvalue < d(i+1)
+
+ We choose d(i+1) as origin.
+*/
+
+ orgati = FALSE_;
+ a = c__ * del - z__[*i__] * z__[*i__] - z__[ip1] * z__[ip1];
+ b = z__[ip1] * z__[ip1] * del;
+ if (a < 0.) {
+ tau = b * 2. / (a - sqrt((d__1 = a * a + b * 4. * c__, abs(
+ d__1))));
+ } else {
+ tau = -(a + sqrt((d__1 = a * a + b * 4. * c__, abs(d__1)))) /
+ (c__ * 2.);
+ }
+ dltlb = -midpt;
+ dltub = 0.;
+ }
+
+ if (orgati) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - tau;
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[ip1] - tau;
+/* L140: */
+ }
+ }
+ if (orgati) {
+ ii = *i__;
+ } else {
+ ii = *i__ + 1;
+ }
+ iim1 = ii - 1;
+ iip1 = ii + 1;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L150: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.;
+ phi = 0.;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / delta[j];
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L160: */
+ }
+
+ w = rhoinv + phi + psi;
+
+/*
+ W is the value of the secular function with
+ its ii-th element removed.
+*/
+
+ swtch3 = FALSE_;
+ if (orgati) {
+ if (w < 0.) {
+ swtch3 = TRUE_;
+ }
+ } else {
+ if (w > 0.) {
+ swtch3 = TRUE_;
+ }
+ }
+ if (ii == 1 || ii == *n) {
+ swtch3 = FALSE_;
+ }
+
+ temp = z__[ii] / delta[ii];
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w += temp;
+ erretm = (phi - psi) * 8. + erretm + rhoinv * 2. + abs(temp) * 3. +
+ abs(tau) * dw;
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ if (orgati) {
+ *dlam = d__[*i__] + tau;
+ } else {
+ *dlam = d__[ip1] + tau;
+ }
+ goto L250;
+ }
+
+ if (w <= 0.) {
+ dltlb = max(dltlb,tau);
+ } else {
+ dltub = min(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ if (! swtch3) {
+ if (orgati) {
+/* Computing 2nd power */
+ d__1 = z__[*i__] / delta[*i__];
+ c__ = w - delta[ip1] * dw - (d__[*i__] - d__[ip1]) * (d__1 *
+ d__1);
+ } else {
+/* Computing 2nd power */
+ d__1 = z__[ip1] / delta[ip1];
+ c__ = w - delta[*i__] * dw - (d__[ip1] - d__[*i__]) * (d__1 *
+ d__1);
+ }
+ a = (delta[*i__] + delta[ip1]) * w - delta[*i__] * delta[ip1] *
+ dw;
+ b = delta[*i__] * delta[ip1] * w;
+ if (c__ == 0.) {
+ if (a == 0.) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + delta[ip1] * delta[ip1] *
+ (dpsi + dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + delta[*i__] * delta[*i__] *
+ (dpsi + dphi);
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.) {
+ eta = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ } else {
+ eta = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__, abs(
+ d__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ temp = rhoinv + psi + phi;
+ if (orgati) {
+ temp1 = z__[iim1] / delta[iim1];
+ temp1 *= temp1;
+ c__ = temp - delta[iip1] * (dpsi + dphi) - (d__[iim1] - d__[
+ iip1]) * temp1;
+ zz[0] = z__[iim1] * z__[iim1];
+ zz[2] = delta[iip1] * delta[iip1] * (dpsi - temp1 + dphi);
+ } else {
+ temp1 = z__[iip1] / delta[iip1];
+ temp1 *= temp1;
+ c__ = temp - delta[iim1] * (dpsi + dphi) - (d__[iip1] - d__[
+ iim1]) * temp1;
+ zz[0] = delta[iim1] * delta[iim1] * (dpsi + (dphi - temp1));
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ zz[1] = z__[ii] * z__[ii];
+ dlaed6_(&niter, &orgati, &c__, &delta[iim1], zz, &w, &eta, info);
+ if (*info != 0) {
+ goto L250;
+ }
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta >= 0.) {
+ eta = -w / dw;
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.) {
+ eta = (dltub - tau) / 2.;
+ } else {
+ eta = (dltlb - tau) / 2.;
+ }
+ }
+
+ prew = w;
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L180: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L190: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.;
+ phi = 0.;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / delta[j];
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L200: */
+ }
+
+ temp = z__[ii] / delta[ii];
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8. + erretm + rhoinv * 2. + abs(temp) * 3. + (
+ d__1 = tau + eta, abs(d__1)) * dw;
+
+ swtch = FALSE_;
+ if (orgati) {
+ if (-w > abs(prew) / 10.) {
+ swtch = TRUE_;
+ }
+ } else {
+ if (w > abs(prew) / 10.) {
+ swtch = TRUE_;
+ }
+ }
+
+ tau += eta;
+
+/* Main loop to update the values of the array DELTA */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 30; ++niter) {
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ if (orgati) {
+ *dlam = d__[*i__] + tau;
+ } else {
+ *dlam = d__[ip1] + tau;
+ }
+ goto L250;
+ }
+
+ if (w <= 0.) {
+ dltlb = max(dltlb,tau);
+ } else {
+ dltub = min(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ if (! swtch3) {
+ if (! swtch) {
+ if (orgati) {
+/* Computing 2nd power */
+ d__1 = z__[*i__] / delta[*i__];
+ c__ = w - delta[ip1] * dw - (d__[*i__] - d__[ip1]) * (
+ d__1 * d__1);
+ } else {
+/* Computing 2nd power */
+ d__1 = z__[ip1] / delta[ip1];
+ c__ = w - delta[*i__] * dw - (d__[ip1] - d__[*i__]) *
+ (d__1 * d__1);
+ }
+ } else {
+ temp = z__[ii] / delta[ii];
+ if (orgati) {
+ dpsi += temp * temp;
+ } else {
+ dphi += temp * temp;
+ }
+ c__ = w - delta[*i__] * dpsi - delta[ip1] * dphi;
+ }
+ a = (delta[*i__] + delta[ip1]) * w - delta[*i__] * delta[ip1]
+ * dw;
+ b = delta[*i__] * delta[ip1] * w;
+ if (c__ == 0.) {
+ if (a == 0.) {
+ if (! swtch) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + delta[ip1] *
+ delta[ip1] * (dpsi + dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + delta[*i__] * delta[
+ *i__] * (dpsi + dphi);
+ }
+ } else {
+ a = delta[*i__] * delta[*i__] * dpsi + delta[ip1]
+ * delta[ip1] * dphi;
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.) {
+ eta = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1))))
+ / (c__ * 2.);
+ } else {
+ eta = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__,
+ abs(d__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ temp = rhoinv + psi + phi;
+ if (swtch) {
+ c__ = temp - delta[iim1] * dpsi - delta[iip1] * dphi;
+ zz[0] = delta[iim1] * delta[iim1] * dpsi;
+ zz[2] = delta[iip1] * delta[iip1] * dphi;
+ } else {
+ if (orgati) {
+ temp1 = z__[iim1] / delta[iim1];
+ temp1 *= temp1;
+ c__ = temp - delta[iip1] * (dpsi + dphi) - (d__[iim1]
+ - d__[iip1]) * temp1;
+ zz[0] = z__[iim1] * z__[iim1];
+ zz[2] = delta[iip1] * delta[iip1] * (dpsi - temp1 +
+ dphi);
+ } else {
+ temp1 = z__[iip1] / delta[iip1];
+ temp1 *= temp1;
+ c__ = temp - delta[iim1] * (dpsi + dphi) - (d__[iip1]
+ - d__[iim1]) * temp1;
+ zz[0] = delta[iim1] * delta[iim1] * (dpsi + (dphi -
+ temp1));
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ }
+ dlaed6_(&niter, &orgati, &c__, &delta[iim1], zz, &w, &eta,
+ info);
+ if (*info != 0) {
+ goto L250;
+ }
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta >= 0.) {
+ eta = -w / dw;
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.) {
+ eta = (dltub - tau) / 2.;
+ } else {
+ eta = (dltlb - tau) / 2.;
+ }
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L210: */
+ }
+
+ tau += eta;
+ prew = w;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L220: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.;
+ phi = 0.;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / delta[j];
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L230: */
+ }
+
+ temp = z__[ii] / delta[ii];
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8. + erretm + rhoinv * 2. + abs(temp) * 3.
+ + abs(tau) * dw;
+ if (w * prew > 0. && abs(w) > abs(prew) / 10.) {
+ swtch = ! swtch;
+ }
+
+/* L240: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+ if (orgati) {
+ *dlam = d__[*i__] + tau;
+ } else {
+ *dlam = d__[ip1] + tau;
+ }
+
+ }
+
+L250:
+
+ return 0;
+
+/* End of DLAED4 */
+
+} /* dlaed4_ */
+
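+/*
+    Illustrative sketch (not part of LAPACK; hypothetical helper).
+    The quantity DLAED4 drives to zero is the secular function
+        f(lam) = 1/rho + sum_j z(j)^2 / (d(j) - lam),
+    whose derivative sum_j z(j)^2 / (d(j) - lam)^2 is the DPSI + DPHI
+    accumulated above. DLAED4 never forms d(j) - lam directly; it keeps
+    the differences in DELTA relative to a chosen origin (d(i) or d(i+1))
+    to avoid cancellation. The naive evaluation, for reference only:
+
+    #include <math.h>
+
+    static double secular(int n, const double *d, const double *z,
+                          double rho, double lam, double *fprime)
+    {
+        int j;
+        double f = 1. / rho, df = 0.;
+        for (j = 0; j < n; ++j) {
+            double t = z[j] / (d[j] - lam);
+            f  += z[j] * t;            // z[j]^2 / (d[j] - lam)
+            df += t * t;               // z[j]^2 / (d[j] - lam)^2
+        }
+        *fprime = df;                  // f'(lam)
+        return f;
+    }
+*/
+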
+/* Subroutine */ int dlaed5_(integer *i__, doublereal *d__, doublereal *z__,
+ doublereal *delta, doublereal *rho, doublereal *dlam)
+{
+ /* System generated locals */
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal b, c__, w, del, tau, temp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the I-th eigenvalue of a symmetric rank-one
+ modification of a 2-by-2 diagonal matrix
+
+ diag( D ) + RHO * Z * transpose(Z) .
+
+ The diagonal elements in the array D are assumed to satisfy
+
+ D(i) < D(j) for i < j .
+
+ We also assume RHO > 0 and that the Euclidean norm of the vector
+ Z is one.
+
+ Arguments
+ =========
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. I = 1 or I = 2.
+
+ D (input) DOUBLE PRECISION array, dimension (2)
+ The original eigenvalues. We assume D(1) < D(2).
+
+ Z (input) DOUBLE PRECISION array, dimension (2)
+ The components of the updating vector.
+
+ DELTA (output) DOUBLE PRECISION array, dimension (2)
+ The vector DELTA contains the information necessary
+ to construct the eigenvectors.
+
+ RHO (input) DOUBLE PRECISION
+ The scalar in the symmetric updating formula.
+
+ DLAM (output) DOUBLE PRECISION
+ The computed lambda_I, the I-th updated eigenvalue.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ del = d__[2] - d__[1];
+ if (*i__ == 1) {
+ w = *rho * 2. * (z__[2] * z__[2] - z__[1] * z__[1]) / del + 1.;
+ if (w > 0.) {
+ b = del + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[1] * z__[1] * del;
+
+/* B > ZERO, always */
+
+ tau = c__ * 2. / (b + sqrt((d__1 = b * b - c__ * 4., abs(d__1))));
+ *dlam = d__[1] + tau;
+ delta[1] = -z__[1] / tau;
+ delta[2] = z__[2] / (del - tau);
+ } else {
+ b = -del + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * del;
+ if (b > 0.) {
+ tau = c__ * -2. / (b + sqrt(b * b + c__ * 4.));
+ } else {
+ tau = (b - sqrt(b * b + c__ * 4.)) / 2.;
+ }
+ *dlam = d__[2] + tau;
+ delta[1] = -z__[1] / (del + tau);
+ delta[2] = -z__[2] / tau;
+ }
+ temp = sqrt(delta[1] * delta[1] + delta[2] * delta[2]);
+ delta[1] /= temp;
+ delta[2] /= temp;
+ } else {
+
+/* Now I=2 */
+
+ b = -del + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * del;
+ if (b > 0.) {
+ tau = (b + sqrt(b * b + c__ * 4.)) / 2.;
+ } else {
+ tau = c__ * 2. / (-b + sqrt(b * b + c__ * 4.));
+ }
+ *dlam = d__[2] + tau;
+ delta[1] = -z__[1] / (del + tau);
+ delta[2] = -z__[2] / tau;
+ temp = sqrt(delta[1] * delta[1] + delta[2] * delta[2]);
+ delta[1] /= temp;
+ delta[2] /= temp;
+ }
+ return 0;
+
+/* End of DLAED5 */
+
+} /* dlaed5_ */
+
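+/*
+    Illustrative sketch (not part of LAPACK; hypothetical helper).
+    For the 2-by-2 case the secular equation reduces to a quadratic: with
+    del = d(2) - d(1), b = -del + rho*(z(1)^2 + z(2)^2) and
+    c = rho*z(2)^2*del, the larger updated eigenvalue is d(2) + tau,
+    where tau is the positive root of tau^2 - b*tau - c = 0. DLAED5
+    (the I = 2 branch above) picks whichever of the two algebraically
+    equivalent expressions for that root avoids cancellation. A direct
+    restatement, assuming d(1) < d(2), rho > 0 and norm(z) = 1:
+
+    #include <math.h>
+
+    static double larger_eig_2x2(const double d[2], const double z[2],
+                                 double rho)
+    {
+        double del = d[1] - d[0];
+        double b = -del + rho * (z[0] * z[0] + z[1] * z[1]);
+        double c = rho * z[1] * z[1] * del;
+        double tau;
+        if (b > 0.)
+            tau = (b + sqrt(b * b + 4. * c)) / 2.;    // no cancellation when b > 0
+        else
+            tau = 2. * c / (-b + sqrt(b * b + 4. * c));
+        return d[1] + tau;
+    }
+*/
+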
+/* Subroutine */ int dlaed6_(integer *kniter, logical *orgati, doublereal *
+ rho, doublereal *d__, doublereal *z__, doublereal *finit, doublereal *
+ tau, integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static doublereal a, b, c__, f;
+ static integer i__;
+ static doublereal fc, df, ddf, lbd, eta, ubd, eps, base;
+ static integer iter;
+ static doublereal temp, temp1, temp2, temp3, temp4;
+ static logical scale;
+ static integer niter;
+ static doublereal small1, small2, sminv1, sminv2;
+
+ static doublereal dscale[3], sclfac, zscale[3], erretm, sclinv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ February 2007
+
+
+ Purpose
+ =======
+
+ DLAED6 computes the positive or negative root (closest to the origin)
+ of
+
+ f(x) = rho + z(1)/(d(1)-x) + z(2)/(d(2)-x) + z(3)/(d(3)-x)
+
+ It is assumed that
+
+ if ORGATI = .true. the root is between d(2) and d(3);
+ otherwise it is between d(1) and d(2)
+
+ This routine will be called by DLAED4 when necessary. In most cases,
+ the root sought is the smallest in magnitude, though it might not be
+ in some extremely rare situations.
+
+ Arguments
+ =========
+
+ KNITER (input) INTEGER
+ Refer to DLAED4 for its significance.
+
+ ORGATI (input) LOGICAL
+ If ORGATI is true, the needed root is between d(2) and
+ d(3); otherwise it is between d(1) and d(2). See
+ DLAED4 for further details.
+
+ RHO (input) DOUBLE PRECISION
+ Refer to the equation f(x) above.
+
+ D (input) DOUBLE PRECISION array, dimension (3)
+ D satisfies d(1) < d(2) < d(3).
+
+ Z (input) DOUBLE PRECISION array, dimension (3)
+ Each of the elements in z must be positive.
+
+ FINIT (input) DOUBLE PRECISION
+ The value of f at 0. It is more accurate than the value that
+ would be obtained by re-evaluating f at 0 inside this
+ routine.
+
+ TAU (output) DOUBLE PRECISION
+ The root of the equation f(x).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ > 0: if INFO = 1, failure to converge
+
+ Further Details
+ ===============
+
+ 30/06/99: Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ 10/02/03: This version has a few statements commented out for thread
+ safety (machine parameters are computed on each entry). SJH.
+
+ 05/10/06: Modified from a new version of Ren-Cang Li, use
+ Gragg-Thornton-Warner cubic convergent scheme for better stability.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*orgati) {
+ lbd = d__[2];
+ ubd = d__[3];
+ } else {
+ lbd = d__[1];
+ ubd = d__[2];
+ }
+ if (*finit < 0.) {
+ lbd = 0.;
+ } else {
+ ubd = 0.;
+ }
+
+ niter = 1;
+ *tau = 0.;
+ if (*kniter == 2) {
+ if (*orgati) {
+ temp = (d__[3] - d__[2]) / 2.;
+ c__ = *rho + z__[1] / (d__[1] - d__[2] - temp);
+ a = c__ * (d__[2] + d__[3]) + z__[2] + z__[3];
+ b = c__ * d__[2] * d__[3] + z__[2] * d__[3] + z__[3] * d__[2];
+ } else {
+ temp = (d__[1] - d__[2]) / 2.;
+ c__ = *rho + z__[3] / (d__[3] - d__[2] - temp);
+ a = c__ * (d__[1] + d__[2]) + z__[1] + z__[2];
+ b = c__ * d__[1] * d__[2] + z__[1] * d__[2] + z__[2] * d__[1];
+ }
+/* Computing MAX */
+ d__1 = abs(a), d__2 = abs(b), d__1 = max(d__1,d__2), d__2 = abs(c__);
+ temp = max(d__1,d__2);
+ a /= temp;
+ b /= temp;
+ c__ /= temp;
+ if (c__ == 0.) {
+ *tau = b / a;
+ } else if (a <= 0.) {
+ *tau = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ } else {
+ *tau = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__, abs(d__1))
+ ));
+ }
+ if (*tau < lbd || *tau > ubd) {
+ *tau = (lbd + ubd) / 2.;
+ }
+ if (d__[1] == *tau || d__[2] == *tau || d__[3] == *tau) {
+ *tau = 0.;
+ } else {
+ temp = *finit + *tau * z__[1] / (d__[1] * (d__[1] - *tau)) + *tau
+ * z__[2] / (d__[2] * (d__[2] - *tau)) + *tau * z__[3] / (
+ d__[3] * (d__[3] - *tau));
+ if (temp <= 0.) {
+ lbd = *tau;
+ } else {
+ ubd = *tau;
+ }
+ if (abs(*finit) <= abs(temp)) {
+ *tau = 0.;
+ }
+ }
+ }
+
+/*
+ get machine parameters for possible scaling to avoid overflow
+
+ modified by Sven: parameters SMALL1, SMINV1, SMALL2,
+ SMINV2, EPS are no longer SAVEd between one call and the
+ next but are recomputed at each call
+*/
+
+ eps = EPSILON;
+ base = BASE;
+ i__1 = (integer) (log(SAFEMINIMUM) / log(base) / 3.);
+ small1 = pow_di(&base, &i__1);
+ sminv1 = 1. / small1;
+ small2 = small1 * small1;
+ sminv2 = sminv1 * sminv1;
+
+/*
+ Determine if scaling of inputs necessary to avoid overflow
+ when computing 1/TEMP**3
+*/
+
+ if (*orgati) {
+/* Computing MIN */
+ d__3 = (d__1 = d__[2] - *tau, abs(d__1)), d__4 = (d__2 = d__[3] - *
+ tau, abs(d__2));
+ temp = min(d__3,d__4);
+ } else {
+/* Computing MIN */
+ d__3 = (d__1 = d__[1] - *tau, abs(d__1)), d__4 = (d__2 = d__[2] - *
+ tau, abs(d__2));
+ temp = min(d__3,d__4);
+ }
+ scale = FALSE_;
+ if (temp <= small1) {
+ scale = TRUE_;
+ if (temp <= small2) {
+
+/* Scale up by power of radix nearest 1/SAFMIN**(2/3) */
+
+ sclfac = sminv2;
+ sclinv = small2;
+ } else {
+
+/* Scale up by power of radix nearest 1/SAFMIN**(1/3) */
+
+ sclfac = sminv1;
+ sclinv = small1;
+ }
+
+/* Scaling up safe because D, Z, TAU scaled elsewhere to be O(1) */
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ dscale[i__ - 1] = d__[i__] * sclfac;
+ zscale[i__ - 1] = z__[i__] * sclfac;
+/* L10: */
+ }
+ *tau *= sclfac;
+ lbd *= sclfac;
+ ubd *= sclfac;
+ } else {
+
+/* Copy D and Z to DSCALE and ZSCALE */
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ dscale[i__ - 1] = d__[i__];
+ zscale[i__ - 1] = z__[i__];
+/* L20: */
+ }
+ }
+
+ fc = 0.;
+ df = 0.;
+ ddf = 0.;
+ for (i__ = 1; i__ <= 3; ++i__) {
+ temp = 1. / (dscale[i__ - 1] - *tau);
+ temp1 = zscale[i__ - 1] * temp;
+ temp2 = temp1 * temp;
+ temp3 = temp2 * temp;
+ fc += temp1 / dscale[i__ - 1];
+ df += temp2;
+ ddf += temp3;
+/* L30: */
+ }
+ f = *finit + *tau * fc;
+
+ if (abs(f) <= 0.) {
+ goto L60;
+ }
+ if (f <= 0.) {
+ lbd = *tau;
+ } else {
+ ubd = *tau;
+ }
+
+/*
+ Iteration begins -- Use Gragg-Thornton-Warner cubic convergent
+ scheme
+
+ It is not hard to see that
+
+ 1) Iterations will go up monotonically
+ if FINIT < 0;
+
+ 2) Iterations will go down monotonically
+ if FINIT > 0.
+*/
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 40; ++niter) {
+
+ if (*orgati) {
+ temp1 = dscale[1] - *tau;
+ temp2 = dscale[2] - *tau;
+ } else {
+ temp1 = dscale[0] - *tau;
+ temp2 = dscale[1] - *tau;
+ }
+ a = (temp1 + temp2) * f - temp1 * temp2 * df;
+ b = temp1 * temp2 * f;
+ c__ = f - (temp1 + temp2) * df + temp1 * temp2 * ddf;
+/* Computing MAX */
+ d__1 = abs(a), d__2 = abs(b), d__1 = max(d__1,d__2), d__2 = abs(c__);
+ temp = max(d__1,d__2);
+ a /= temp;
+ b /= temp;
+ c__ /= temp;
+ if (c__ == 0.) {
+ eta = b / a;
+ } else if (a <= 0.) {
+ eta = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (c__
+ * 2.);
+ } else {
+ eta = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))
+ );
+ }
+ if (f * eta >= 0.) {
+ eta = -f / df;
+ }
+
+ *tau += eta;
+ if (*tau < lbd || *tau > ubd) {
+ *tau = (lbd + ubd) / 2.;
+ }
+
+ fc = 0.;
+ erretm = 0.;
+ df = 0.;
+ ddf = 0.;
+ for (i__ = 1; i__ <= 3; ++i__) {
+ temp = 1. / (dscale[i__ - 1] - *tau);
+ temp1 = zscale[i__ - 1] * temp;
+ temp2 = temp1 * temp;
+ temp3 = temp2 * temp;
+ temp4 = temp1 / dscale[i__ - 1];
+ fc += temp4;
+ erretm += abs(temp4);
+ df += temp2;
+ ddf += temp3;
+/* L40: */
+ }
+ f = *finit + *tau * fc;
+ erretm = (abs(*finit) + abs(*tau) * erretm) * 8. + abs(*tau) * df;
+ if (abs(f) <= eps * erretm) {
+ goto L60;
+ }
+ if (f <= 0.) {
+ lbd = *tau;
+ } else {
+ ubd = *tau;
+ }
+/* L50: */
+ }
+ *info = 1;
+L60:
+
+/* Undo scaling */
+
+ if (scale) {
+ *tau *= sclinv;
+ }
+ return 0;
+
+/* End of DLAED6 */
+
+} /* dlaed6_ */
+
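+/*
+    Illustrative sketch (not part of LAPACK; hypothetical helper).
+    Each pass of the loop above refreshes the first derivative (DF) and
+    half the second derivative (DDF) of the three-pole rational function
+    named in the Purpose section,
+        f(x) = rho + z(1)/(d(1)-x) + z(2)/(d(2)-x) + z(3)/(d(3)-x);
+    f itself is rebuilt as FINIT + TAU*FC for accuracy, and D and Z may
+    be rescaled to dodge overflow in 1/(d - tau)^3. The plain,
+    unprotected evaluation is just:
+
+    #include <math.h>
+
+    static double three_pole(const double d[3], const double z[3],
+                             double rho, double x,
+                             double *df, double *ddf)
+    {
+        int i;
+        double f = rho, d1 = 0., d2 = 0.;
+        for (i = 0; i < 3; ++i) {
+            double t  = 1. / (d[i] - x);
+            double t1 = z[i] * t;      // z[i]/(d[i]-x)
+            f  += t1;
+            d1 += t1 * t;              // z[i]/(d[i]-x)^2, a term of f'(x)
+            d2 += t1 * t * t;          // z[i]/(d[i]-x)^3, a term of f''(x)/2
+        }
+        *df  = d1;                     // f'(x)
+        *ddf = d2;                     // f''(x) divided by 2
+        return f;
+    }
+*/
+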
+/* Subroutine */ int dlaed7_(integer *icompq, integer *n, integer *qsiz,
+ integer *tlvls, integer *curlvl, integer *curpbm, doublereal *d__,
+ doublereal *q, integer *ldq, integer *indxq, doublereal *rho, integer
+ *cutpnt, doublereal *qstore, integer *qptr, integer *prmptr, integer *
+ perm, integer *givptr, integer *givcol, doublereal *givnum,
+ doublereal *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, k, n1, n2, is, iw, iz, iq2, ptr, ldq2, indx, curr;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer indxc, indxp;
+ extern /* Subroutine */ int dlaed8_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *, integer *,
+ doublereal *, integer *, integer *, integer *), dlaed9_(integer *,
+ integer *, integer *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ integer *, integer *), dlaeda_(integer *, integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *, doublereal
+ *, doublereal *, integer *, doublereal *, doublereal *, integer *)
+ ;
+ static integer idlmda;
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), xerbla_(char *, integer *);
+ static integer coltyp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAED7 computes the updated eigensystem of a diagonal
+ matrix after modification by a rank-one symmetric matrix. This
+ routine is used only for the eigenproblem which requires all
+ eigenvalues and optionally eigenvectors of a dense symmetric matrix
+ that has been reduced to tridiagonal form. DLAED1 handles
+ the case in which all eigenvalues and eigenvectors of a symmetric
+ tridiagonal matrix are desired.
+
+ T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out)
+
+ where Z = Q'u, u is a vector of length N with ones in the
+ CUTPNT-th and (CUTPNT + 1)-th elements and zeros elsewhere.
+
+ The eigenvectors of the original matrix are stored in Q, and the
+ eigenvalues are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple eigenvalues or if there is a zero in
+ the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine DLAED8.
+
+ The second stage consists of calculating the updated
+ eigenvalues. This is done by finding the roots of the secular
+ equation via the routine DLAED4 (as called by DLAED9).
+ This routine also calculates the eigenvectors of the current
+ problem.
+
+ The final stage consists of computing the updated eigenvectors
+ directly using the updated eigenvalues. The eigenvectors for
+ the current problem are multiplied with the eigenvectors from
+ the overall problem.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ = 0: Compute eigenvalues only.
+ = 1: Compute eigenvectors of original dense symmetric matrix
+ also. On entry, Q contains the orthogonal matrix used
+ to reduce the original matrix to tridiagonal form.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ QSIZ (input) INTEGER
+ The dimension of the orthogonal matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ TLVLS (input) INTEGER
+ The total number of merging levels in the overall divide and
+ conquer tree.
+
+ CURLVL (input) INTEGER
+ The current level in the overall merge routine,
+ 0 <= CURLVL <= TLVLS.
+
+ CURPBM (input) INTEGER
+ The current problem in the current level in the overall
+ merge routine (counting from upper left to lower right).
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the eigenvalues of the rank-1-perturbed matrix.
+ On exit, the eigenvalues of the repaired matrix.
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ, N)
+ On entry, the eigenvectors of the rank-1-perturbed matrix.
+ On exit, the eigenvectors of the repaired tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (output) INTEGER array, dimension (N)
+ The permutation which will reintegrate the subproblem just
+ solved back into sorted order, i.e., D( INDXQ( I = 1, N ) )
+ will be in ascending order.
+
+ RHO (input) DOUBLE PRECISION
+ The subdiagonal element used to create the rank-1
+ modification.
+
+ CUTPNT (input) INTEGER
+ Contains the location of the last eigenvalue in the leading
+ sub-matrix. min(1,N) <= CUTPNT <= N.
+
+ QSTORE (input/output) DOUBLE PRECISION array, dimension (N**2+1)
+ Stores eigenvectors of submatrices encountered during
+ divide and conquer, packed together. QPTR points to
+ beginning of the submatrices.
+
+ QPTR (input/output) INTEGER array, dimension (N+2)
+ List of indices pointing to beginning of submatrices stored
+ in QSTORE. The submatrices are numbered starting at the
+ bottom left of the divide and conquer tree, from left to
+ right and bottom to top.
+
+ PRMPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in PERM a
+ level's permutation is stored. PRMPTR(i+1) - PRMPTR(i)
+ indicates the size of the permutation and also the size of
+ the full, non-deflated problem.
+
+ PERM (input) INTEGER array, dimension (N lg N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in GIVCOL a
+ level's Givens rotations are stored. GIVPTR(i+1) - GIVPTR(i)
+ indicates the number of Givens rotations.
+
+ GIVCOL (input) INTEGER array, dimension (2, N lg N)
+ Each pair of numbers indicates a pair of columns that take part
+ in a Givens rotation.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension (2, N lg N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (3*N+QSIZ*N)
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --qstore;
+ --qptr;
+ --prmptr;
+ --perm;
+ --givptr;
+ givcol -= 3;
+ givnum -= 3;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*icompq == 1 && *qsiz < *n) {
+ *info = -4;
+ } else if (*ldq < max(1,*n)) {
+ *info = -9;
+ } else if (min(1,*n) > *cutpnt || *n < *cutpnt) {
+ *info = -12;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED7", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in DLAED8 and DLAED9.
+*/
+
+ if (*icompq == 1) {
+ ldq2 = *qsiz;
+ } else {
+ ldq2 = *n;
+ }
+
+ iz = 1;
+ idlmda = iz + *n;
+ iw = idlmda + *n;
+ iq2 = iw + *n;
+ is = iq2 + *n * ldq2;
+
+ indx = 1;
+ indxc = indx + *n;
+ coltyp = indxc + *n;
+ indxp = coltyp + *n;
+
+/*
+ Form the z-vector which consists of the last row of Q_1 and the
+ first row of Q_2.
+*/
+
+ ptr = pow_ii(&c__2, tlvls) + 1;
+ i__1 = *curlvl - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *tlvls - i__;
+ ptr += pow_ii(&c__2, &i__2);
+/* L10: */
+ }
+ curr = ptr + *curpbm;
+ dlaeda_(n, tlvls, curlvl, curpbm, &prmptr[1], &perm[1], &givptr[1], &
+ givcol[3], &givnum[3], &qstore[1], &qptr[1], &work[iz], &work[iz
+ + *n], info);
+
+/*
+ When solving the final problem, we no longer need the stored data,
+ so we will overwrite the data from this level onto the previously
+ used storage space.
+*/
+
+ if (*curlvl == *tlvls) {
+ qptr[curr] = 1;
+ prmptr[curr] = 1;
+ givptr[curr] = 1;
+ }
+
+/* Sort and Deflate eigenvalues. */
+
+ dlaed8_(icompq, &k, n, qsiz, &d__[1], &q[q_offset], ldq, &indxq[1], rho,
+ cutpnt, &work[iz], &work[idlmda], &work[iq2], &ldq2, &work[iw], &
+ perm[prmptr[curr]], &givptr[curr + 1], &givcol[(givptr[curr] << 1)
+ + 1], &givnum[(givptr[curr] << 1) + 1], &iwork[indxp], &iwork[
+ indx], info);
+ prmptr[curr + 1] = prmptr[curr] + *n;
+ givptr[curr + 1] += givptr[curr];
+
+/* Solve Secular Equation. */
+
+ if (k != 0) {
+ dlaed9_(&k, &c__1, &k, n, &d__[1], &work[is], &k, rho, &work[idlmda],
+ &work[iw], &qstore[qptr[curr]], &k, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (*icompq == 1) {
+ dgemm_("N", "N", qsiz, &k, &k, &c_b15, &work[iq2], &ldq2, &qstore[
+ qptr[curr]], &k, &c_b29, &q[q_offset], ldq);
+ }
+/* Computing 2nd power */
+ i__1 = k;
+ qptr[curr + 1] = qptr[curr] + i__1 * i__1;
+
+/* Prepare the INDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = *n - k;
+ dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &indxq[1]);
+ } else {
+ qptr[curr + 1] = qptr[curr];
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indxq[i__] = i__;
+/* L20: */
+ }
+ }
+
+L30:
+ return 0;
+
+/* End of DLAED7 */
+
+} /* dlaed7_ */
+
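+/*
+    Illustrative sketch (not part of LAPACK; hypothetical helper).
+    The pointer arithmetic above (PTR, CURR) locates the bookkeeping slot
+    of the current merge problem inside QPTR/PRMPTR/GIVPTR: the
+    submatrices are numbered bottom-up and left-to-right (see the QPTR
+    description above), so the search starts just past the 2^TLVLS leaf
+    problems and then skips the 2^(TLVLS-i) nodes of each merge level
+    already completed. As a standalone function:
+
+    static int dc_tree_slot(int tlvls, int curlvl, int curpbm)
+    {
+        int i, ptr = (1 << tlvls) + 1;    // first slot after the leaf problems
+        for (i = 1; i <= curlvl - 1; ++i)
+            ptr += 1 << (tlvls - i);      // skip the nodes of level i
+        return ptr + curpbm;              // offset of this problem in its level
+    }
+*/
+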
+/* Subroutine */ int dlaed8_(integer *icompq, integer *k, integer *n, integer
+ *qsiz, doublereal *d__, doublereal *q, integer *ldq, integer *indxq,
+ doublereal *rho, integer *cutpnt, doublereal *z__, doublereal *dlamda,
+ doublereal *q2, integer *ldq2, doublereal *w, integer *perm, integer
+ *givptr, integer *givcol, doublereal *givnum, integer *indxp, integer
+ *indx, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, q2_dim1, q2_offset, i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal c__;
+ static integer i__, j;
+ static doublereal s, t;
+ static integer k2, n1, n2, jp, n1p1;
+ static doublereal eps, tau, tol;
+ static integer jlam, imax, jmax;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *), dscal_(
+ integer *, doublereal *, doublereal *, integer *), dcopy_(integer
+ *, doublereal *, integer *, doublereal *, integer *);
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), dlacpy_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLAED8 merges the two sets of eigenvalues together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ eigenvalues are close together or if there is a tiny element in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ = 0: Compute eigenvalues only.
+ = 1: Compute eigenvectors of original dense symmetric matrix
+ also. On entry, Q contains the orthogonal matrix used
+ to reduce the original matrix to tridiagonal form.
+
+ K (output) INTEGER
+ The number of non-deflated eigenvalues, and the order of the
+ related secular equation.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ QSIZ (input) INTEGER
+ The dimension of the orthogonal matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the eigenvalues of the two submatrices to be
+ combined. On exit, the trailing (N-K) updated eigenvalues
+ (those which were deflated) sorted into increasing order.
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ,N)
+ If ICOMPQ = 0, Q is not referenced. Otherwise,
+ on entry, Q contains the eigenvectors of the partially solved
+ system which has been previously updated in matrix
+ multiplies with other partially solved eigensystems.
+ On exit, Q contains the trailing (N-K) updated eigenvectors
+ (those which were deflated) in its last N-K columns.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (input) INTEGER array, dimension (N)
+ The permutation which separately sorts the two sub-problems
+ in D into ascending order. Note that elements in the second
+ half of this permutation must first have CUTPNT added to
+ their values in order to be accurate.
+
+ RHO (input/output) DOUBLE PRECISION
+ On entry, the off-diagonal element associated with the rank-1
+ cut which originally split the two submatrices which are now
+ being recombined.
+ On exit, RHO has been modified to the value required by
+ DLAED3.
+
+ CUTPNT (input) INTEGER
+ The location of the last eigenvalue in the leading
+ sub-matrix. min(1,N) <= CUTPNT <= N.
+
+ Z (input) DOUBLE PRECISION array, dimension (N)
+ On entry, Z contains the updating vector (the last row of
+ the first sub-eigenvector matrix and the first row of the
+ second sub-eigenvector matrix).
+ On exit, the contents of Z are destroyed by the updating
+ process.
+
+ DLAMDA (output) DOUBLE PRECISION array, dimension (N)
+ A copy of the first K eigenvalues which will be used by
+ DLAED3 to form the secular equation.
+
+ Q2 (output) DOUBLE PRECISION array, dimension (LDQ2,N)
+ If ICOMPQ = 0, Q2 is not referenced. Otherwise,
+ a copy of the first K eigenvectors which will be used by
+ DLAED7 in a matrix multiply (DGEMM) to update the new
+ eigenvectors.
+
+ LDQ2 (input) INTEGER
+ The leading dimension of the array Q2. LDQ2 >= max(1,N).
+
+ W (output) DOUBLE PRECISION array, dimension (N)
+           The first K values of the final deflation-altered z-vector,
+           which will be passed to DLAED3.
+
+ PERM (output) INTEGER array, dimension (N)
+ The permutations (from deflation and sorting) to be applied
+ to each eigenblock.
+
+ GIVPTR (output) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem.
+
+ GIVCOL (output) INTEGER array, dimension (2, N)
+           Each pair of numbers indicates a pair of columns that take part
+           in a Givens rotation.
+
+ GIVNUM (output) DOUBLE PRECISION array, dimension (2, N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ INDXP (workspace) INTEGER array, dimension (N)
+ The permutation used to place deflated values of D at the end
+ of the array. INDXP(1:K) points to the nondeflated D-values
+ and INDXP(K+1:N) points to the deflated eigenvalues.
+
+ INDX (workspace) INTEGER array, dimension (N)
+ The permutation used to sort the contents of D into ascending
+ order.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --z__;
+ --dlamda;
+ q2_dim1 = *ldq2;
+ q2_offset = 1 + q2_dim1;
+ q2 -= q2_offset;
+ --w;
+ --perm;
+ givcol -= 3;
+ givnum -= 3;
+ --indxp;
+ --indx;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*icompq == 1 && *qsiz < *n) {
+ *info = -4;
+ } else if (*ldq < max(1,*n)) {
+ *info = -7;
+ } else if (*cutpnt < min(1,*n) || *cutpnt > *n) {
+ *info = -10;
+ } else if (*ldq2 < max(1,*n)) {
+ *info = -14;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED8", &i__1);
+ return 0;
+ }
+
+/*
+    Need to initialize GIVPTR to 0 here in case of a quick exit
+    to prevent unspecified behavior (usually a segfault) when the
+    IWORK array on entry to *stedc is not zeroed
+    (or at least those IWORK entries which are used in *laed7 for GIVPTR).
+*/
+
+ *givptr = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ n1 = *cutpnt;
+ n2 = *n - n1;
+ n1p1 = n1 + 1;
+
+ if (*rho < 0.) {
+ dscal_(&n2, &c_b151, &z__[n1p1], &c__1);
+ }
+
+/* Normalize z so that norm(z) = 1 */
+
+ t = 1. / sqrt(2.);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ indx[j] = j;
+/* L10: */
+ }
+ dscal_(n, &t, &z__[1], &c__1);
+ *rho = (d__1 = *rho * 2., abs(d__1));
+
+/* Sort the eigenvalues into increasing order */
+
+ i__1 = *n;
+ for (i__ = *cutpnt + 1; i__ <= i__1; ++i__) {
+ indxq[i__] += *cutpnt;
+/* L20: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = d__[indxq[i__]];
+ w[i__] = z__[indxq[i__]];
+/* L30: */
+ }
+ i__ = 1;
+ j = *cutpnt + 1;
+ dlamrg_(&n1, &n2, &dlamda[1], &c__1, &c__1, &indx[1]);
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = dlamda[indx[i__]];
+ z__[i__] = w[indx[i__]];
+/* L40: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ imax = idamax_(n, &z__[1], &c__1);
+ jmax = idamax_(n, &d__[1], &c__1);
+ eps = EPSILON;
+ tol = eps * 8. * (d__1 = d__[jmax], abs(d__1));
+
+/*
+ If the rank-1 modifier is small enough, no more needs to be done
+ except to reorganize Q so that its columns correspond with the
+ elements in D.
+*/
+
+ if (*rho * (d__1 = z__[imax], abs(d__1)) <= tol) {
+ *k = 0;
+ if (*icompq == 0) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ perm[j] = indxq[indx[j]];
+/* L50: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ perm[j] = indxq[indx[j]];
+ dcopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1
+ + 1], &c__1);
+/* L60: */
+ }
+ dlacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq);
+ }
+ return 0;
+ }
+
+/*
+ If there are multiple eigenvalues then the problem deflates. Here
+    the number of equal eigenvalues is found.  As each equal
+ eigenvalue is found, an elementary reflector is computed to rotate
+ the corresponding eigensubspace so that the corresponding
+ components of Z are zero in this new basis.
+*/
+
+ *k = 0;
+ k2 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*rho * (d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ if (j == *n) {
+ goto L110;
+ }
+ } else {
+ jlam = j;
+ goto L80;
+ }
+/* L70: */
+ }
+L80:
+ ++j;
+ if (j > *n) {
+ goto L100;
+ }
+ if (*rho * (d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ } else {
+
+/* Check if eigenvalues are close enough to allow deflation. */
+
+ s = z__[jlam];
+ c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = dlapy2_(&c__, &s);
+ t = d__[j] - d__[jlam];
+ c__ /= tau;
+ s = -s / tau;
+ if ((d__1 = t * c__ * s, abs(d__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ z__[j] = tau;
+ z__[jlam] = 0.;
+
+/* Record the appropriate Givens rotation */
+
+ ++(*givptr);
+ givcol[(*givptr << 1) + 1] = indxq[indx[jlam]];
+ givcol[(*givptr << 1) + 2] = indxq[indx[j]];
+ givnum[(*givptr << 1) + 1] = c__;
+ givnum[(*givptr << 1) + 2] = s;
+ if (*icompq == 1) {
+ drot_(qsiz, &q[indxq[indx[jlam]] * q_dim1 + 1], &c__1, &q[
+ indxq[indx[j]] * q_dim1 + 1], &c__1, &c__, &s);
+ }
+ t = d__[jlam] * c__ * c__ + d__[j] * s * s;
+ d__[j] = d__[jlam] * s * s + d__[j] * c__ * c__;
+ d__[jlam] = t;
+ --k2;
+ i__ = 1;
+L90:
+ if (k2 + i__ <= *n) {
+ if (d__[jlam] < d__[indxp[k2 + i__]]) {
+ indxp[k2 + i__ - 1] = indxp[k2 + i__];
+ indxp[k2 + i__] = jlam;
+ ++i__;
+ goto L90;
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ jlam = j;
+ } else {
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+ jlam = j;
+ }
+ }
+ goto L80;
+L100:
+
+/* Record the last eigenvalue. */
+
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+
+L110:
+
+/*
+ Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+ and Q2 respectively. The eigenvalues/vectors which were not
+ deflated go into the first K slots of DLAMDA and Q2 respectively,
+ while those which were deflated go into the last N - K slots.
+*/
+
+ if (*icompq == 0) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ jp = indxp[j];
+ dlamda[j] = d__[jp];
+ perm[j] = indxq[indx[jp]];
+/* L120: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ jp = indxp[j];
+ dlamda[j] = d__[jp];
+ perm[j] = indxq[indx[jp]];
+ dcopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1 + 1]
+ , &c__1);
+/* L130: */
+ }
+ }
+
+/*
+ The deflated eigenvalues and their corresponding vectors go back
+ into the last N - K slots of D and Q respectively.
+*/
+
+ if (*k < *n) {
+ if (*icompq == 0) {
+ i__1 = *n - *k;
+ dcopy_(&i__1, &dlamda[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ } else {
+ i__1 = *n - *k;
+ dcopy_(&i__1, &dlamda[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ i__1 = *n - *k;
+ dlacpy_("A", qsiz, &i__1, &q2[(*k + 1) * q2_dim1 + 1], ldq2, &q[(*
+ k + 1) * q_dim1 + 1], ldq);
+ }
+ }
+
+ return 0;
+
+/* End of DLAED8 */
+
+} /* dlaed8_ */
+
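+/*
+    Editorial sketch (not part of the f2c-translated LAPACK source): the
+    merge step inside DLAED8 above relies on DLAMRG to build a permutation
+    that merges the two individually sorted halves of D into one ascending
+    order.  The fragment below illustrates that behaviour on a tiny array,
+    assuming the f2c typedefs (integer, doublereal) already used by this
+    translation unit.  The guard macro DLAPACK_LITE_EXAMPLES is hypothetical
+    and never defined, so the example does not affect the build.
+*/
+#ifdef DLAPACK_LITE_EXAMPLES
+static void example_dlamrg_(void)
+{
+    extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+            integer *, integer *, integer *);
+    /* The first three entries and the last three entries are each sorted
+       in ascending order. */
+    doublereal a[6] = { 1., 4., 9., 2., 3., 10. };
+    integer indx[6];
+    integer n1 = 3, n2 = 3, one = 1;
+    dlamrg_(&n1, &n2, a, &one, &one, indx);
+    /* indx now holds the 1-based merge permutation, expected {1,4,5,2,3,6},
+       i.e. a[indx[i]-1] runs in ascending order. */
+}
+#endif /* DLAPACK_LITE_EXAMPLES */
+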
+/* Subroutine */ int dlaed9_(integer *k, integer *kstart, integer *kstop,
+ integer *n, doublereal *d__, doublereal *q, integer *ldq, doublereal *
+ rho, doublereal *dlamda, doublereal *w, doublereal *s, integer *lds,
+ integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, s_dim1, s_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal temp;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *), dlaed4_(integer *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *);
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAED9 finds the roots of the secular equation, as defined by the
+ values in D, Z, and RHO, between KSTART and KSTOP. It makes the
+ appropriate calls to DLAED4 and then stores the new matrix of
+ eigenvectors for use in calculating the next level of Z vectors.
+
+ Arguments
+ =========
+
+ K (input) INTEGER
+ The number of terms in the rational function to be solved by
+ DLAED4. K >= 0.
+
+ KSTART (input) INTEGER
+ KSTOP (input) INTEGER
+ The updated eigenvalues Lambda(I), KSTART <= I <= KSTOP
+ are to be computed. 1 <= KSTART <= KSTOP <= K.
+
+ N (input) INTEGER
+ The number of rows and columns in the Q matrix.
+           N >= K (deflation may result in N > K).
+
+ D (output) DOUBLE PRECISION array, dimension (N)
+ D(I) contains the updated eigenvalues
+ for KSTART <= I <= KSTOP.
+
+ Q (workspace) DOUBLE PRECISION array, dimension (LDQ,N)
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max( 1, N ).
+
+ RHO (input) DOUBLE PRECISION
+ The value of the parameter in the rank one update equation.
+ RHO >= 0 required.
+
+ DLAMDA (input) DOUBLE PRECISION array, dimension (K)
+ The first K elements of this array contain the old roots
+ of the deflated updating problem. These are the poles
+ of the secular equation.
+
+ W (input) DOUBLE PRECISION array, dimension (K)
+ The first K elements of this array contain the components
+ of the deflation-adjusted updating vector.
+
+ S (output) DOUBLE PRECISION array, dimension (LDS, K)
+ Will contain the eigenvectors of the repaired matrix which
+ will be stored for subsequent Z vector calculation and
+ multiplied by the previously accumulated eigenvectors
+ to update the system.
+
+ LDS (input) INTEGER
+ The leading dimension of S. LDS >= max( 1, K ).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --dlamda;
+ --w;
+ s_dim1 = *lds;
+ s_offset = 1 + s_dim1;
+ s -= s_offset;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*k < 0) {
+ *info = -1;
+ } else if (*kstart < 1 || *kstart > max(1,*k)) {
+ *info = -2;
+ } else if (max(1,*kstop) < *kstart || *kstop > max(1,*k)) {
+ *info = -3;
+ } else if (*n < *k) {
+ *info = -4;
+ } else if (*ldq < max(1,*k)) {
+ *info = -7;
+ } else if (*lds < max(1,*k)) {
+ *info = -12;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAED9", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 0) {
+ return 0;
+ }
+
+/*
+ Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DLAMDA(I) if it is 1; this makes the subsequent
+ subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DLAMDA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DLAMDA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+    2*DLAMDA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = dlamc3_(&dlamda[i__], &dlamda[i__]) - dlamda[i__];
+/* L10: */
+ }
+
+ i__1 = *kstop;
+ for (j = *kstart; j <= i__1; ++j) {
+ dlaed4_(k, &j, &dlamda[1], &w[1], &q[j * q_dim1 + 1], rho, &d__[j],
+ info);
+
+/* If the zero finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ goto L120;
+ }
+/* L20: */
+ }
+
+ if (*k == 1 || *k == 2) {
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *k;
+ for (j = 1; j <= i__2; ++j) {
+ s[j + i__ * s_dim1] = q[j + i__ * q_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ goto L120;
+ }
+
+/* Compute updated W. */
+
+ dcopy_(k, &w[1], &c__1, &s[s_offset], &c__1);
+
+/* Initialize W(I) = Q(I,I) */
+
+ i__1 = *ldq + 1;
+ dcopy_(k, &q[q_offset], &i__1, &w[1], &c__1);
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L50: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L60: */
+ }
+/* L70: */
+ }
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__1 = sqrt(-w[i__]);
+ w[i__] = d_sign(&d__1, &s[i__ + s_dim1]);
+/* L80: */
+ }
+
+/* Compute eigenvectors of the modified rank-1 modification. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ q[i__ + j * q_dim1] = w[i__] / q[i__ + j * q_dim1];
+/* L90: */
+ }
+ temp = dnrm2_(k, &q[j * q_dim1 + 1], &c__1);
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ s[i__ + j * s_dim1] = q[i__ + j * q_dim1] / temp;
+/* L100: */
+ }
+/* L110: */
+ }
+
+L120:
+ return 0;
+
+/* End of DLAED9 */
+
+} /* dlaed9_ */
+
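+/*
+    Editorial sketch (not part of the f2c-translated LAPACK source): DLAED9
+    above obtains each updated eigenvalue by calling DLAED4 on the secular
+    equation built from the poles DLAMDA, the weights W and the scalar RHO.
+    The fragment below shows a single DLAED4 call for a 2-by-2 problem,
+    diag(0,1) + rho*z*z' with a unit-norm z; the expected roots are roughly
+    0.293 and 1.707.  The guard macro DLAPACK_LITE_EXAMPLES is hypothetical
+    and never defined, so the example does not affect the build.
+*/
+#ifdef DLAPACK_LITE_EXAMPLES
+static void example_dlaed4_(void)
+{
+    extern /* Subroutine */ int dlaed4_(integer *, integer *, doublereal *,
+            doublereal *, doublereal *, doublereal *, doublereal *,
+            integer *);
+    doublereal d[2] = { 0., 1. };                 /* poles, ascending */
+    doublereal z[2] = { .70710678118654752, .70710678118654752 };
+    doublereal delta[2], dlam, rho = 1.;
+    integer n = 2, i, info;
+    for (i = 1; i <= 2; ++i) {
+        dlaed4_(&n, &i, d, z, delta, &rho, &dlam, &info);
+        /* dlam holds the i-th root of the secular equation; a nonzero
+           info would indicate a convergence failure. */
+    }
+}
+#endif /* DLAPACK_LITE_EXAMPLES */
+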
+/* Subroutine */ int dlaeda_(integer *n, integer *tlvls, integer *curlvl,
+ integer *curpbm, integer *prmptr, integer *perm, integer *givptr,
+ integer *givcol, doublereal *givnum, doublereal *q, integer *qptr,
+ doublereal *z__, doublereal *ztemp, integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k, mid, ptr;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static integer curr, bsiz1, bsiz2, psiz1, psiz2, zptr1;
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *), dcopy_(integer *,
+ doublereal *, integer *, doublereal *, integer *), xerbla_(char *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLAEDA computes the Z vector corresponding to the merge step in the
+ CURLVLth step of the merge process with TLVLS steps for the CURPBMth
+ problem.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ TLVLS (input) INTEGER
+ The total number of merging levels in the overall divide and
+ conquer tree.
+
+ CURLVL (input) INTEGER
+ The current level in the overall merge routine,
+ 0 <= curlvl <= tlvls.
+
+ CURPBM (input) INTEGER
+ The current problem in the current level in the overall
+ merge routine (counting from upper left to lower right).
+
+ PRMPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in PERM a
+ level's permutation is stored. PRMPTR(i+1) - PRMPTR(i)
+ indicates the size of the permutation and incidentally the
+ size of the full, non-deflated problem.
+
+ PERM (input) INTEGER array, dimension (N lg N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in GIVCOL a
+ level's Givens rotations are stored. GIVPTR(i+1) - GIVPTR(i)
+ indicates the number of Givens rotations.
+
+ GIVCOL (input) INTEGER array, dimension (2, N lg N)
+           Each pair of numbers indicates a pair of columns that take part
+           in a Givens rotation.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension (2, N lg N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ Q (input) DOUBLE PRECISION array, dimension (N**2)
+ Contains the square eigenblocks from previous levels, the
+ starting positions for blocks are given by QPTR.
+
+ QPTR (input) INTEGER array, dimension (N+2)
+ Contains a list of pointers which indicate where in Q an
+ eigenblock is stored. SQRT( QPTR(i+1) - QPTR(i) ) indicates
+ the size of the block.
+
+ Z (output) DOUBLE PRECISION array, dimension (N)
+ On output this vector contains the updating vector (the last
+ row of the first sub-eigenvector matrix and the first row of
+ the second sub-eigenvector matrix).
+
+ ZTEMP (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --ztemp;
+ --z__;
+ --qptr;
+ --q;
+ givnum -= 3;
+ givcol -= 3;
+ --givptr;
+ --perm;
+ --prmptr;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -1;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAEDA", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine location of first number in second half. */
+
+ mid = *n / 2 + 1;
+
+/* Gather last/first rows of appropriate eigenblocks into center of Z */
+
+ ptr = 1;
+
+/*
+ Determine location of lowest level subproblem in the full storage
+ scheme
+*/
+
+ i__1 = *curlvl - 1;
+ curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1;
+
+/*
+ Determine size of these matrices. We add HALF to the value of
+ the SQRT in case the machine underestimates one of these square
+ roots.
+*/
+
+ bsiz1 = (integer) (sqrt((doublereal) (qptr[curr + 1] - qptr[curr])) + .5);
+ bsiz2 = (integer) (sqrt((doublereal) (qptr[curr + 2] - qptr[curr + 1])) +
+ .5);
+ i__1 = mid - bsiz1 - 1;
+ for (k = 1; k <= i__1; ++k) {
+ z__[k] = 0.;
+/* L10: */
+ }
+ dcopy_(&bsiz1, &q[qptr[curr] + bsiz1 - 1], &bsiz1, &z__[mid - bsiz1], &
+ c__1);
+ dcopy_(&bsiz2, &q[qptr[curr + 1]], &bsiz2, &z__[mid], &c__1);
+ i__1 = *n;
+ for (k = mid + bsiz2; k <= i__1; ++k) {
+ z__[k] = 0.;
+/* L20: */
+ }
+
+/*
+ Loop through remaining levels 1 -> CURLVL applying the Givens
+ rotations and permutation and then multiplying the center matrices
+ against the current Z.
+*/
+
+ ptr = pow_ii(&c__2, tlvls) + 1;
+ i__1 = *curlvl - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = *curlvl - k;
+ i__3 = *curlvl - k - 1;
+ curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) -
+ 1;
+ psiz1 = prmptr[curr + 1] - prmptr[curr];
+ psiz2 = prmptr[curr + 2] - prmptr[curr + 1];
+ zptr1 = mid - psiz1;
+
+/* Apply Givens at CURR and CURR+1 */
+
+ i__2 = givptr[curr + 1] - 1;
+ for (i__ = givptr[curr]; i__ <= i__2; ++i__) {
+ drot_(&c__1, &z__[zptr1 + givcol[(i__ << 1) + 1] - 1], &c__1, &
+ z__[zptr1 + givcol[(i__ << 1) + 2] - 1], &c__1, &givnum[(
+ i__ << 1) + 1], &givnum[(i__ << 1) + 2]);
+/* L30: */
+ }
+ i__2 = givptr[curr + 2] - 1;
+ for (i__ = givptr[curr + 1]; i__ <= i__2; ++i__) {
+ drot_(&c__1, &z__[mid - 1 + givcol[(i__ << 1) + 1]], &c__1, &z__[
+ mid - 1 + givcol[(i__ << 1) + 2]], &c__1, &givnum[(i__ <<
+ 1) + 1], &givnum[(i__ << 1) + 2]);
+/* L40: */
+ }
+ psiz1 = prmptr[curr + 1] - prmptr[curr];
+ psiz2 = prmptr[curr + 2] - prmptr[curr + 1];
+ i__2 = psiz1 - 1;
+ for (i__ = 0; i__ <= i__2; ++i__) {
+ ztemp[i__ + 1] = z__[zptr1 + perm[prmptr[curr] + i__] - 1];
+/* L50: */
+ }
+ i__2 = psiz2 - 1;
+ for (i__ = 0; i__ <= i__2; ++i__) {
+ ztemp[psiz1 + i__ + 1] = z__[mid + perm[prmptr[curr + 1] + i__] -
+ 1];
+/* L60: */
+ }
+
+/*
+ Multiply Blocks at CURR and CURR+1
+
+ Determine size of these matrices. We add HALF to the value of
+ the SQRT in case the machine underestimates one of these
+ square roots.
+*/
+
+ bsiz1 = (integer) (sqrt((doublereal) (qptr[curr + 1] - qptr[curr])) +
+ .5);
+ bsiz2 = (integer) (sqrt((doublereal) (qptr[curr + 2] - qptr[curr + 1])
+ ) + .5);
+ if (bsiz1 > 0) {
+ dgemv_("T", &bsiz1, &bsiz1, &c_b15, &q[qptr[curr]], &bsiz1, &
+ ztemp[1], &c__1, &c_b29, &z__[zptr1], &c__1);
+ }
+ i__2 = psiz1 - bsiz1;
+ dcopy_(&i__2, &ztemp[bsiz1 + 1], &c__1, &z__[zptr1 + bsiz1], &c__1);
+ if (bsiz2 > 0) {
+ dgemv_("T", &bsiz2, &bsiz2, &c_b15, &q[qptr[curr + 1]], &bsiz2, &
+ ztemp[psiz1 + 1], &c__1, &c_b29, &z__[mid], &c__1);
+ }
+ i__2 = psiz2 - bsiz2;
+ dcopy_(&i__2, &ztemp[psiz1 + bsiz2 + 1], &c__1, &z__[mid + bsiz2], &
+ c__1);
+
+ i__2 = *tlvls - k;
+ ptr += pow_ii(&c__2, &i__2);
+/* L70: */
+ }
+
+ return 0;
+
+/* End of DLAEDA */
+
+} /* dlaeda_ */
+
+/* Subroutine */ int dlaev2_(doublereal *a, doublereal *b, doublereal *c__,
+ doublereal *rt1, doublereal *rt2, doublereal *cs1, doublereal *sn1)
+{
+ /* System generated locals */
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal ab, df, cs, ct, tb, sm, tn, rt, adf, acs;
+ static integer sgn1, sgn2;
+ static doublereal acmn, acmx;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAEV2 computes the eigendecomposition of a 2-by-2 symmetric matrix
+ [ A B ]
+ [ B C ].
+ On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+ eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+ eigenvector for RT1, giving the decomposition
+
+ [ CS1 SN1 ] [ A B ] [ CS1 -SN1 ] = [ RT1 0 ]
+ [-SN1 CS1 ] [ B C ] [ SN1 CS1 ] [ 0 RT2 ].
+
+ Arguments
+ =========
+
+ A (input) DOUBLE PRECISION
+ The (1,1) element of the 2-by-2 matrix.
+
+ B (input) DOUBLE PRECISION
+ The (1,2) element and the conjugate of the (2,1) element of
+ the 2-by-2 matrix.
+
+ C (input) DOUBLE PRECISION
+ The (2,2) element of the 2-by-2 matrix.
+
+ RT1 (output) DOUBLE PRECISION
+ The eigenvalue of larger absolute value.
+
+ RT2 (output) DOUBLE PRECISION
+ The eigenvalue of smaller absolute value.
+
+ CS1 (output) DOUBLE PRECISION
+ SN1 (output) DOUBLE PRECISION
+ The vector (CS1, SN1) is a unit right eigenvector for RT1.
+
+ Further Details
+ ===============
+
+ RT1 is accurate to a few ulps barring over/underflow.
+
+ RT2 may be inaccurate if there is massive cancellation in the
+ determinant A*C-B*B; higher precision or correctly rounded or
+ correctly truncated arithmetic would be needed to compute RT2
+ accurately in all cases.
+
+ CS1 and SN1 are accurate to a few ulps barring over/underflow.
+
+ Overflow is possible only if RT1 is within a factor of 5 of overflow.
+ Underflow is harmless if the input data is 0 or exceeds
+ underflow_threshold / macheps.
+
+ =====================================================================
+
+
+ Compute the eigenvalues
+*/
+
+ sm = *a + *c__;
+ df = *a - *c__;
+ adf = abs(df);
+ tb = *b + *b;
+ ab = abs(tb);
+ if (abs(*a) > abs(*c__)) {
+ acmx = *a;
+ acmn = *c__;
+ } else {
+ acmx = *c__;
+ acmn = *a;
+ }
+ if (adf > ab) {
+/* Computing 2nd power */
+ d__1 = ab / adf;
+ rt = adf * sqrt(d__1 * d__1 + 1.);
+ } else if (adf < ab) {
+/* Computing 2nd power */
+ d__1 = adf / ab;
+ rt = ab * sqrt(d__1 * d__1 + 1.);
+ } else {
+
+/* Includes case AB=ADF=0 */
+
+ rt = ab * sqrt(2.);
+ }
+ if (sm < 0.) {
+ *rt1 = (sm - rt) * .5;
+ sgn1 = -1;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else if (sm > 0.) {
+ *rt1 = (sm + rt) * .5;
+ sgn1 = 1;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else {
+
+/* Includes case RT1 = RT2 = 0 */
+
+ *rt1 = rt * .5;
+ *rt2 = rt * -.5;
+ sgn1 = 1;
+ }
+
+/* Compute the eigenvector */
+
+ if (df >= 0.) {
+ cs = df + rt;
+ sgn2 = 1;
+ } else {
+ cs = df - rt;
+ sgn2 = -1;
+ }
+ acs = abs(cs);
+ if (acs > ab) {
+ ct = -tb / cs;
+ *sn1 = 1. / sqrt(ct * ct + 1.);
+ *cs1 = ct * *sn1;
+ } else {
+ if (ab == 0.) {
+ *cs1 = 1.;
+ *sn1 = 0.;
+ } else {
+ tn = -cs / tb;
+ *cs1 = 1. / sqrt(tn * tn + 1.);
+ *sn1 = tn * *cs1;
+ }
+ }
+ if (sgn1 == sgn2) {
+ tn = *cs1;
+ *cs1 = -(*sn1);
+ *sn1 = tn;
+ }
+ return 0;
+
+/* End of DLAEV2 */
+
+} /* dlaev2_ */
+
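+/*
+    Editorial sketch (not part of the f2c-translated LAPACK source): a
+    minimal call of DLAEV2 on the symmetric 2-by-2 matrix [2 1; 1 2],
+    assuming the f2c typedef doublereal already used by this translation
+    unit.  The expected results are RT1 ~ 3, RT2 ~ 1 and (CS1,SN1) ~
+    (1/sqrt(2), 1/sqrt(2)).  The guard macro DLAPACK_LITE_EXAMPLES is
+    hypothetical and keeps the example out of the normal build.
+*/
+#ifdef DLAPACK_LITE_EXAMPLES
+static void example_dlaev2_(void)
+{
+    doublereal a = 2., b = 1., c = 2.;    /* the matrix [ a b; b c ] */
+    doublereal rt1, rt2, cs1, sn1;
+    dlaev2_(&a, &b, &c, &rt1, &rt2, &cs1, &sn1);
+    /* rt1 is the eigenvalue of larger absolute value and (cs1,sn1) is its
+       unit right eigenvector. */
+}
+#endif /* DLAPACK_LITE_EXAMPLES */
+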
+/* Subroutine */ int dlaexc_(logical *wantq, integer *n, doublereal *t,
+ integer *ldt, doublereal *q, integer *ldq, integer *j1, integer *n1,
+ integer *n2, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, t_dim1, t_offset, i__1;
+ doublereal d__1, d__2, d__3;
+
+ /* Local variables */
+ static doublereal d__[16] /* was [4][4] */;
+ static integer k;
+ static doublereal u[3], x[4] /* was [2][2] */;
+ static integer j2, j3, j4;
+ static doublereal u1[3], u2[3];
+ static integer nd;
+ static doublereal cs, t11, t22, t33, sn, wi1, wi2, wr1, wr2, eps, tau,
+ tau1, tau2;
+ static integer ierr;
+ static doublereal temp;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static doublereal scale, dnorm, xnorm;
+ extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *), dlasy2_(
+ logical *, logical *, integer *, integer *, integer *, doublereal
+ *, integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *);
+ extern doublereal dlamch_(char *), dlange_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *);
+ extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *), dlacpy_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *),
+ dlartg_(doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *), dlarfx_(char *, integer *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *);
+ static doublereal thresh, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLAEXC swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in
+ an upper quasi-triangular matrix T by an orthogonal similarity
+ transformation.
+
+ T must be in Schur canonical form, that is, block upper triangular
+ with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block
+    has its diagonal elements equal and its off-diagonal elements of
+ opposite sign.
+
+ Arguments
+ =========
+
+ WANTQ (input) LOGICAL
+ = .TRUE. : accumulate the transformation in the matrix Q;
+ = .FALSE.: do not accumulate the transformation.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) DOUBLE PRECISION array, dimension (LDT,N)
+ On entry, the upper quasi-triangular matrix T, in Schur
+ canonical form.
+ On exit, the updated matrix T, again in Schur canonical form.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ,N)
+ On entry, if WANTQ is .TRUE., the orthogonal matrix Q.
+ On exit, if WANTQ is .TRUE., the updated matrix Q.
+ If WANTQ is .FALSE., Q is not referenced.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q.
+ LDQ >= 1; and if WANTQ is .TRUE., LDQ >= N.
+
+ J1 (input) INTEGER
+ The index of the first row of the first block T11.
+
+ N1 (input) INTEGER
+ The order of the first block T11. N1 = 0, 1 or 2.
+
+ N2 (input) INTEGER
+ The order of the second block T22. N2 = 0, 1 or 2.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ = 1: the transformed matrix T would be too far from Schur
+ form; the blocks are not swapped and T and Q are
+ unchanged.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0 || *n1 == 0 || *n2 == 0) {
+ return 0;
+ }
+ if (*j1 + *n1 > *n) {
+ return 0;
+ }
+
+ j2 = *j1 + 1;
+ j3 = *j1 + 2;
+ j4 = *j1 + 3;
+
+ if (*n1 == 1 && *n2 == 1) {
+
+/* Swap two 1-by-1 blocks. */
+
+ t11 = t[*j1 + *j1 * t_dim1];
+ t22 = t[j2 + j2 * t_dim1];
+
+/* Determine the transformation to perform the interchange. */
+
+ d__1 = t22 - t11;
+ dlartg_(&t[*j1 + j2 * t_dim1], &d__1, &cs, &sn, &temp);
+
+/* Apply transformation to the matrix T. */
+
+ if (j3 <= *n) {
+ i__1 = *n - *j1 - 1;
+ drot_(&i__1, &t[*j1 + j3 * t_dim1], ldt, &t[j2 + j3 * t_dim1],
+ ldt, &cs, &sn);
+ }
+ i__1 = *j1 - 1;
+ drot_(&i__1, &t[*j1 * t_dim1 + 1], &c__1, &t[j2 * t_dim1 + 1], &c__1,
+ &cs, &sn);
+
+ t[*j1 + *j1 * t_dim1] = t22;
+ t[j2 + j2 * t_dim1] = t11;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ drot_(n, &q[*j1 * q_dim1 + 1], &c__1, &q[j2 * q_dim1 + 1], &c__1,
+ &cs, &sn);
+ }
+
+ } else {
+
+/*
+ Swapping involves at least one 2-by-2 block.
+
+ Copy the diagonal block of order N1+N2 to the local array D
+ and compute its norm.
+*/
+
+ nd = *n1 + *n2;
+ dlacpy_("Full", &nd, &nd, &t[*j1 + *j1 * t_dim1], ldt, d__, &c__4);
+ dnorm = dlange_("Max", &nd, &nd, d__, &c__4, &work[1]);
+
+/*
+ Compute machine-dependent threshold for test for accepting
+ swap.
+*/
+
+ eps = PRECISION;
+ smlnum = SAFEMINIMUM / eps;
+/* Computing MAX */
+ d__1 = eps * 10. * dnorm;
+ thresh = max(d__1,smlnum);
+
+/* Solve T11*X - X*T22 = scale*T12 for X. */
+
+ dlasy2_(&c_false, &c_false, &c_n1, n1, n2, d__, &c__4, &d__[*n1 + 1 +
+ (*n1 + 1 << 2) - 5], &c__4, &d__[(*n1 + 1 << 2) - 4], &c__4, &
+ scale, x, &c__2, &xnorm, &ierr);
+
+/* Swap the adjacent diagonal blocks. */
+
+ k = *n1 + *n1 + *n2 - 3;
+ switch (k) {
+ case 1: goto L10;
+ case 2: goto L20;
+ case 3: goto L30;
+ }
+
+L10:
+
+/*
+ N1 = 1, N2 = 2: generate elementary reflector H so that:
+
+ ( scale, X11, X12 ) H = ( 0, 0, * )
+*/
+
+ u[0] = scale;
+ u[1] = x[0];
+ u[2] = x[2];
+ dlarfg_(&c__3, &u[2], u, &c__1, &tau);
+ u[2] = 1.;
+ t11 = t[*j1 + *j1 * t_dim1];
+
+/* Perform swap provisionally on diagonal block in D. */
+
+ dlarfx_("L", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+ dlarfx_("R", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+
+/*
+ Test whether to reject swap.
+
+ Computing MAX
+*/
+ d__2 = abs(d__[2]), d__3 = abs(d__[6]), d__2 = max(d__2,d__3), d__3 =
+ (d__1 = d__[10] - t11, abs(d__1));
+ if (max(d__2,d__3) > thresh) {
+ goto L50;
+ }
+
+/* Accept swap: apply transformation to the entire matrix T. */
+
+ i__1 = *n - *j1 + 1;
+ dlarfx_("L", &c__3, &i__1, u, &tau, &t[*j1 + *j1 * t_dim1], ldt, &
+ work[1]);
+ dlarfx_("R", &j2, &c__3, u, &tau, &t[*j1 * t_dim1 + 1], ldt, &work[1]);
+
+ t[j3 + *j1 * t_dim1] = 0.;
+ t[j3 + j2 * t_dim1] = 0.;
+ t[j3 + j3 * t_dim1] = t11;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ dlarfx_("R", n, &c__3, u, &tau, &q[*j1 * q_dim1 + 1], ldq, &work[
+ 1]);
+ }
+ goto L40;
+
+L20:
+
+/*
+ N1 = 2, N2 = 1: generate elementary reflector H so that:
+
+ H ( -X11 ) = ( * )
+ ( -X21 ) = ( 0 )
+ ( scale ) = ( 0 )
+*/
+
+ u[0] = -x[0];
+ u[1] = -x[1];
+ u[2] = scale;
+ dlarfg_(&c__3, u, &u[1], &c__1, &tau);
+ u[0] = 1.;
+ t33 = t[j3 + j3 * t_dim1];
+
+/* Perform swap provisionally on diagonal block in D. */
+
+ dlarfx_("L", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+ dlarfx_("R", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+
+/*
+ Test whether to reject swap.
+
+ Computing MAX
+*/
+ d__2 = abs(d__[1]), d__3 = abs(d__[2]), d__2 = max(d__2,d__3), d__3 =
+ (d__1 = d__[0] - t33, abs(d__1));
+ if (max(d__2,d__3) > thresh) {
+ goto L50;
+ }
+
+/* Accept swap: apply transformation to the entire matrix T. */
+
+ dlarfx_("R", &j3, &c__3, u, &tau, &t[*j1 * t_dim1 + 1], ldt, &work[1]);
+ i__1 = *n - *j1;
+ dlarfx_("L", &c__3, &i__1, u, &tau, &t[*j1 + j2 * t_dim1], ldt, &work[
+ 1]);
+
+ t[*j1 + *j1 * t_dim1] = t33;
+ t[j2 + *j1 * t_dim1] = 0.;
+ t[j3 + *j1 * t_dim1] = 0.;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ dlarfx_("R", n, &c__3, u, &tau, &q[*j1 * q_dim1 + 1], ldq, &work[
+ 1]);
+ }
+ goto L40;
+
+L30:
+
+/*
+ N1 = 2, N2 = 2: generate elementary reflectors H(1) and H(2) so
+ that:
+
+ H(2) H(1) ( -X11 -X12 ) = ( * * )
+ ( -X21 -X22 ) ( 0 * )
+ ( scale 0 ) ( 0 0 )
+ ( 0 scale ) ( 0 0 )
+*/
+
+ u1[0] = -x[0];
+ u1[1] = -x[1];
+ u1[2] = scale;
+ dlarfg_(&c__3, u1, &u1[1], &c__1, &tau1);
+ u1[0] = 1.;
+
+ temp = -tau1 * (x[2] + u1[1] * x[3]);
+ u2[0] = -temp * u1[1] - x[3];
+ u2[1] = -temp * u1[2];
+ u2[2] = scale;
+ dlarfg_(&c__3, u2, &u2[1], &c__1, &tau2);
+ u2[0] = 1.;
+
+/* Perform swap provisionally on diagonal block in D. */
+
+ dlarfx_("L", &c__3, &c__4, u1, &tau1, d__, &c__4, &work[1])
+ ;
+ dlarfx_("R", &c__4, &c__3, u1, &tau1, d__, &c__4, &work[1])
+ ;
+ dlarfx_("L", &c__3, &c__4, u2, &tau2, &d__[1], &c__4, &work[1]);
+ dlarfx_("R", &c__4, &c__3, u2, &tau2, &d__[4], &c__4, &work[1]);
+
+/*
+ Test whether to reject swap.
+
+ Computing MAX
+*/
+ d__1 = abs(d__[2]), d__2 = abs(d__[6]), d__1 = max(d__1,d__2), d__2 =
+ abs(d__[3]), d__1 = max(d__1,d__2), d__2 = abs(d__[7]);
+ if (max(d__1,d__2) > thresh) {
+ goto L50;
+ }
+
+/* Accept swap: apply transformation to the entire matrix T. */
+
+ i__1 = *n - *j1 + 1;
+ dlarfx_("L", &c__3, &i__1, u1, &tau1, &t[*j1 + *j1 * t_dim1], ldt, &
+ work[1]);
+ dlarfx_("R", &j4, &c__3, u1, &tau1, &t[*j1 * t_dim1 + 1], ldt, &work[
+ 1]);
+ i__1 = *n - *j1 + 1;
+ dlarfx_("L", &c__3, &i__1, u2, &tau2, &t[j2 + *j1 * t_dim1], ldt, &
+ work[1]);
+ dlarfx_("R", &j4, &c__3, u2, &tau2, &t[j2 * t_dim1 + 1], ldt, &work[1]
+ );
+
+ t[j3 + *j1 * t_dim1] = 0.;
+ t[j3 + j2 * t_dim1] = 0.;
+ t[j4 + *j1 * t_dim1] = 0.;
+ t[j4 + j2 * t_dim1] = 0.;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ dlarfx_("R", n, &c__3, u1, &tau1, &q[*j1 * q_dim1 + 1], ldq, &
+ work[1]);
+ dlarfx_("R", n, &c__3, u2, &tau2, &q[j2 * q_dim1 + 1], ldq, &work[
+ 1]);
+ }
+
+L40:
+
+ if (*n2 == 2) {
+
+/* Standardize new 2-by-2 block T11 */
+
+ dlanv2_(&t[*j1 + *j1 * t_dim1], &t[*j1 + j2 * t_dim1], &t[j2 + *
+ j1 * t_dim1], &t[j2 + j2 * t_dim1], &wr1, &wi1, &wr2, &
+ wi2, &cs, &sn);
+ i__1 = *n - *j1 - 1;
+ drot_(&i__1, &t[*j1 + (*j1 + 2) * t_dim1], ldt, &t[j2 + (*j1 + 2)
+ * t_dim1], ldt, &cs, &sn);
+ i__1 = *j1 - 1;
+ drot_(&i__1, &t[*j1 * t_dim1 + 1], &c__1, &t[j2 * t_dim1 + 1], &
+ c__1, &cs, &sn);
+ if (*wantq) {
+ drot_(n, &q[*j1 * q_dim1 + 1], &c__1, &q[j2 * q_dim1 + 1], &
+ c__1, &cs, &sn);
+ }
+ }
+
+ if (*n1 == 2) {
+
+/* Standardize new 2-by-2 block T22 */
+
+ j3 = *j1 + *n2;
+ j4 = j3 + 1;
+ dlanv2_(&t[j3 + j3 * t_dim1], &t[j3 + j4 * t_dim1], &t[j4 + j3 *
+ t_dim1], &t[j4 + j4 * t_dim1], &wr1, &wi1, &wr2, &wi2, &
+ cs, &sn);
+ if (j3 + 2 <= *n) {
+ i__1 = *n - j3 - 1;
+ drot_(&i__1, &t[j3 + (j3 + 2) * t_dim1], ldt, &t[j4 + (j3 + 2)
+ * t_dim1], ldt, &cs, &sn);
+ }
+ i__1 = j3 - 1;
+ drot_(&i__1, &t[j3 * t_dim1 + 1], &c__1, &t[j4 * t_dim1 + 1], &
+ c__1, &cs, &sn);
+ if (*wantq) {
+ drot_(n, &q[j3 * q_dim1 + 1], &c__1, &q[j4 * q_dim1 + 1], &
+ c__1, &cs, &sn);
+ }
+ }
+
+ }
+ return 0;
+
+/* Exit with INFO = 1 if swap was rejected. */
+
+L50:
+ *info = 1;
+ return 0;
+
+/* End of DLAEXC */
+
+} /* dlaexc_ */
+
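+/*
+    Editorial sketch (not part of the f2c-translated LAPACK source): DLAEXC
+    applied to the simplest case, swapping two adjacent 1-by-1 diagonal
+    blocks of an upper triangular T without accumulating Q.  After the call
+    the diagonal of T reads (2, 1) instead of (1, 2) and T stays upper
+    triangular.  The guard macro DLAPACK_LITE_EXAMPLES is hypothetical and
+    never defined.
+*/
+#ifdef DLAPACK_LITE_EXAMPLES
+static void example_dlaexc_(void)
+{
+    /* T = [ 1 3; 0 2 ] stored column-major. */
+    doublereal t[4] = { 1., 0., 3., 2. };
+    doublereal q_dummy = 0., work[2];
+    logical wantq = 0;                    /* .FALSE.: do not touch Q */
+    integer n = 2, ldt = 2, ldq = 1, j1 = 1, n1 = 1, n2 = 1, info;
+    dlaexc_(&wantq, &n, t, &ldt, &q_dummy, &ldq, &j1, &n1, &n2, work, &info);
+    /* info == 0 on success; info == 1 would mean the swap was rejected. */
+}
+#endif /* DLAPACK_LITE_EXAMPLES */
+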
+/* Subroutine */ int dlahqr_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal
+ *wr, doublereal *wi, integer *iloz, integer *ihiz, doublereal *z__,
+ integer *ldz, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static integer i__, j, k, l, m;
+ static doublereal s, v[3];
+ static integer i1, i2;
+ static doublereal t1, t2, t3, v2, v3, aa, ab, ba, bb, h11, h12, h21, h22,
+ cs;
+ static integer nh;
+ static doublereal sn;
+ static integer nr;
+ static doublereal tr;
+ static integer nz;
+ static doublereal det, h21s;
+ static integer its;
+ static doublereal ulp, sum, tst, rt1i, rt2i, rt1r, rt2r;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *), dcopy_(
+ integer *, doublereal *, integer *, doublereal *, integer *),
+ dlanv2_(doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *), dlabad_(doublereal *, doublereal *);
+
+ extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *);
+ static doublereal safmin, safmax, rtdisc, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAHQR is an auxiliary routine called by DHSEQR to update the
+ eigenvalues and Schur decomposition already computed by DHSEQR, by
+ dealing with the Hessenberg submatrix in rows and columns ILO to
+ IHI.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper quasi-triangular in
+ rows and columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless
+ ILO = 1). DLAHQR works primarily with the Hessenberg
+ submatrix in rows and columns ILO to IHI, but applies
+ transformations to all of H if WANTT is .TRUE..
+ 1 <= ILO <= max(1,IHI); IHI <= N.
+
+ H (input/output) DOUBLE PRECISION array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO is zero and if WANTT is .TRUE., H is upper
+ quasi-triangular in rows and columns ILO:IHI, with any
+ 2-by-2 diagonal blocks in standard form. If INFO is zero
+ and WANTT is .FALSE., the contents of H are unspecified on
+ exit. The output state of H if INFO is nonzero is given
+ below under the description of INFO.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH >= max(1,N).
+
+ WR (output) DOUBLE PRECISION array, dimension (N)
+ WI (output) DOUBLE PRECISION array, dimension (N)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues ILO to IHI are stored in the corresponding
+ elements of WR and WI. If two eigenvalues are computed as a
+ complex conjugate pair, they are stored in consecutive
+ elements of WR and WI, say the i-th and (i+1)th, with
+ WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the
+ eigenvalues are stored in the same order as on the diagonal
+ of the Schur form returned in H, with WR(i) = H(i,i), and, if
+ H(i:i+1,i:i+1) is a 2-by-2 diagonal block,
+ WI(i) = sqrt(H(i+1,i)*H(i,i+1)) and WI(i+1) = -WI(i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,N)
+ If WANTZ is .TRUE., on entry Z must contain the current
+ matrix Z of transformations accumulated by DHSEQR, and on
+ exit Z has been updated; transformations are applied only to
+ the submatrix Z(ILOZ:IHIZ,ILO:IHI).
+ If WANTZ is .FALSE., Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: If INFO = i, DLAHQR failed to compute all the
+ eigenvalues ILO to IHI in a total of 30 iterations
+ per eigenvalue; elements i+1:ihi of WR and WI
+ contain those eigenvalues which have been
+ successfully computed.
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the
+          eigenvalues of the upper Hessenberg matrix in rows
+          and columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+ (*) (initial value of H)*U = U*(final value of H)
+          where U is an orthogonal matrix.   The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+ (final value of Z) = (initial value of Z)*U
+ where U is the orthogonal matrix in (*)
+ (regardless of the value of WANTT.)
+
+ Further Details
+ ===============
+
+ 02-96 Based on modifications by
+ David Day, Sandia National Laboratory, USA
+
+ 12-04 Further modifications by
+ Ralph Byers, University of Kansas, USA
+ This is a modified version of DLAHQR from LAPACK version 3.0.
+ It is (1) more robust against overflow and underflow and
+ (2) adopts the more conservative Ahues & Tisseur stopping
+ criterion (LAWN 122, 1997).
+
+ =========================================================
+*/
+
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*ilo == *ihi) {
+ wr[*ilo] = h__[*ilo + *ilo * h_dim1];
+ wi[*ilo] = 0.;
+ return 0;
+ }
+
+/* ==== clear out the trash ==== */
+ i__1 = *ihi - 3;
+ for (j = *ilo; j <= i__1; ++j) {
+ h__[j + 2 + j * h_dim1] = 0.;
+ h__[j + 3 + j * h_dim1] = 0.;
+/* L10: */
+ }
+ if (*ilo <= *ihi - 2) {
+ h__[*ihi + (*ihi - 2) * h_dim1] = 0.;
+ }
+
+ nh = *ihi - *ilo + 1;
+ nz = *ihiz - *iloz + 1;
+
+/* Set machine-dependent constants for the stopping criterion. */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) nh / ulp);
+
+/*
+ I1 and I2 are the indices of the first row and last column of H
+ to which transformations must be applied. If eigenvalues only are
+ being computed, I1 and I2 are set inside the main loop.
+*/
+
+ if (*wantt) {
+ i1 = 1;
+ i2 = *n;
+ }
+
+/*
+ The main loop begins here. I is the loop index and decreases from
+ IHI to ILO in steps of 1 or 2. Each iteration of the loop works
+ with the active submatrix in rows and columns L to I.
+ Eigenvalues I+1 to IHI have already converged. Either L = ILO or
+ H(L,L-1) is negligible so that the matrix splits.
+*/
+
+ i__ = *ihi;
+L20:
+ l = *ilo;
+ if (i__ < *ilo) {
+ goto L160;
+ }
+
+/*
+ Perform QR iterations on rows and columns ILO to I until a
+ submatrix of order 1 or 2 splits off at the bottom because a
+ subdiagonal element has become negligible.
+*/
+
+ for (its = 0; its <= 30; ++its) {
+
+/* Look for a single small subdiagonal element. */
+
+ i__1 = l + 1;
+ for (k = i__; k >= i__1; --k) {
+ if ((d__1 = h__[k + (k - 1) * h_dim1], abs(d__1)) <= smlnum) {
+ goto L40;
+ }
+ tst = (d__1 = h__[k - 1 + (k - 1) * h_dim1], abs(d__1)) + (d__2 =
+ h__[k + k * h_dim1], abs(d__2));
+ if (tst == 0.) {
+ if (k - 2 >= *ilo) {
+ tst += (d__1 = h__[k - 1 + (k - 2) * h_dim1], abs(d__1));
+ }
+ if (k + 1 <= *ihi) {
+ tst += (d__1 = h__[k + 1 + k * h_dim1], abs(d__1));
+ }
+ }
+/*
+ ==== The following is a conservative small subdiagonal
+ . deflation criterion due to Ahues & Tisseur (LAWN 122,
+ . 1997). It has better mathematical foundation and
+ . improves accuracy in some cases. ====
+*/
+ if ((d__1 = h__[k + (k - 1) * h_dim1], abs(d__1)) <= ulp * tst) {
+/* Computing MAX */
+ d__3 = (d__1 = h__[k + (k - 1) * h_dim1], abs(d__1)), d__4 = (
+ d__2 = h__[k - 1 + k * h_dim1], abs(d__2));
+ ab = max(d__3,d__4);
+/* Computing MIN */
+ d__3 = (d__1 = h__[k + (k - 1) * h_dim1], abs(d__1)), d__4 = (
+ d__2 = h__[k - 1 + k * h_dim1], abs(d__2));
+ ba = min(d__3,d__4);
+/* Computing MAX */
+ d__3 = (d__1 = h__[k + k * h_dim1], abs(d__1)), d__4 = (d__2 =
+ h__[k - 1 + (k - 1) * h_dim1] - h__[k + k * h_dim1],
+ abs(d__2));
+ aa = max(d__3,d__4);
+/* Computing MIN */
+ d__3 = (d__1 = h__[k + k * h_dim1], abs(d__1)), d__4 = (d__2 =
+ h__[k - 1 + (k - 1) * h_dim1] - h__[k + k * h_dim1],
+ abs(d__2));
+ bb = min(d__3,d__4);
+ s = aa + ab;
+/* Computing MAX */
+ d__1 = smlnum, d__2 = ulp * (bb * (aa / s));
+ if (ba * (ab / s) <= max(d__1,d__2)) {
+ goto L40;
+ }
+ }
+/* L30: */
+ }
+L40:
+ l = k;
+ if (l > *ilo) {
+
+/* H(L,L-1) is negligible */
+
+ h__[l + (l - 1) * h_dim1] = 0.;
+ }
+
+/* Exit from loop if a submatrix of order 1 or 2 has split off. */
+
+ if (l >= i__ - 1) {
+ goto L150;
+ }
+
+/*
+ Now the active submatrix is in rows and columns L to I. If
+ eigenvalues only are being computed, only the active submatrix
+ need be transformed.
+*/
+
+ if (! (*wantt)) {
+ i1 = l;
+ i2 = i__;
+ }
+
+ if (its == 10) {
+
+/* Exceptional shift. */
+
+ s = (d__1 = h__[l + 1 + l * h_dim1], abs(d__1)) + (d__2 = h__[l +
+ 2 + (l + 1) * h_dim1], abs(d__2));
+ h11 = s * .75 + h__[l + l * h_dim1];
+ h12 = s * -.4375;
+ h21 = s;
+ h22 = h11;
+ } else if (its == 20) {
+
+/* Exceptional shift. */
+
+ s = (d__1 = h__[i__ + (i__ - 1) * h_dim1], abs(d__1)) + (d__2 =
+ h__[i__ - 1 + (i__ - 2) * h_dim1], abs(d__2));
+ h11 = s * .75 + h__[i__ + i__ * h_dim1];
+ h12 = s * -.4375;
+ h21 = s;
+ h22 = h11;
+ } else {
+
+/*
+ Prepare to use Francis' double shift
+ (i.e. 2nd degree generalized Rayleigh quotient)
+*/
+
+ h11 = h__[i__ - 1 + (i__ - 1) * h_dim1];
+ h21 = h__[i__ + (i__ - 1) * h_dim1];
+ h12 = h__[i__ - 1 + i__ * h_dim1];
+ h22 = h__[i__ + i__ * h_dim1];
+ }
+ s = abs(h11) + abs(h12) + abs(h21) + abs(h22);
+ if (s == 0.) {
+ rt1r = 0.;
+ rt1i = 0.;
+ rt2r = 0.;
+ rt2i = 0.;
+ } else {
+ h11 /= s;
+ h21 /= s;
+ h12 /= s;
+ h22 /= s;
+ tr = (h11 + h22) / 2.;
+ det = (h11 - tr) * (h22 - tr) - h12 * h21;
+ rtdisc = sqrt((abs(det)));
+ if (det >= 0.) {
+
+/* ==== complex conjugate shifts ==== */
+
+ rt1r = tr * s;
+ rt2r = rt1r;
+ rt1i = rtdisc * s;
+ rt2i = -rt1i;
+ } else {
+
+/* ==== real shifts (use only one of them) ==== */
+
+ rt1r = tr + rtdisc;
+ rt2r = tr - rtdisc;
+ if ((d__1 = rt1r - h22, abs(d__1)) <= (d__2 = rt2r - h22, abs(
+ d__2))) {
+ rt1r *= s;
+ rt2r = rt1r;
+ } else {
+ rt2r *= s;
+ rt1r = rt2r;
+ }
+ rt1i = 0.;
+ rt2i = 0.;
+ }
+ }
+
+/* Look for two consecutive small subdiagonal elements. */
+
+ i__1 = l;
+ for (m = i__ - 2; m >= i__1; --m) {
+/*
+ Determine the effect of starting the double-shift QR
+ iteration at row M, and see if this would make H(M,M-1)
+ negligible. (The following uses scaling to avoid
+ overflows and most underflows.)
+*/
+
+ h21s = h__[m + 1 + m * h_dim1];
+ s = (d__1 = h__[m + m * h_dim1] - rt2r, abs(d__1)) + abs(rt2i) +
+ abs(h21s);
+ h21s = h__[m + 1 + m * h_dim1] / s;
+ v[0] = h21s * h__[m + (m + 1) * h_dim1] + (h__[m + m * h_dim1] -
+ rt1r) * ((h__[m + m * h_dim1] - rt2r) / s) - rt1i * (rt2i
+ / s);
+ v[1] = h21s * (h__[m + m * h_dim1] + h__[m + 1 + (m + 1) * h_dim1]
+ - rt1r - rt2r);
+ v[2] = h21s * h__[m + 2 + (m + 1) * h_dim1];
+ s = abs(v[0]) + abs(v[1]) + abs(v[2]);
+ v[0] /= s;
+ v[1] /= s;
+ v[2] /= s;
+ if (m == l) {
+ goto L60;
+ }
+ if ((d__1 = h__[m + (m - 1) * h_dim1], abs(d__1)) * (abs(v[1]) +
+ abs(v[2])) <= ulp * abs(v[0]) * ((d__2 = h__[m - 1 + (m -
+ 1) * h_dim1], abs(d__2)) + (d__3 = h__[m + m * h_dim1],
+ abs(d__3)) + (d__4 = h__[m + 1 + (m + 1) * h_dim1], abs(
+ d__4)))) {
+ goto L60;
+ }
+/* L50: */
+ }
+L60:
+
+/* Double-shift QR step */
+
+ i__1 = i__ - 1;
+ for (k = m; k <= i__1; ++k) {
+
+/*
+ The first iteration of this loop determines a reflection G
+ from the vector V and applies it from left and right to H,
+ thus creating a nonzero bulge below the subdiagonal.
+
+ Each subsequent iteration determines a reflection G to
+ restore the Hessenberg form in the (K-1)th column, and thus
+ chases the bulge one step toward the bottom of the active
+ submatrix. NR is the order of G.
+
+ Computing MIN
+*/
+ i__2 = 3, i__3 = i__ - k + 1;
+ nr = min(i__2,i__3);
+ if (k > m) {
+ dcopy_(&nr, &h__[k + (k - 1) * h_dim1], &c__1, v, &c__1);
+ }
+ dlarfg_(&nr, v, &v[1], &c__1, &t1);
+ if (k > m) {
+ h__[k + (k - 1) * h_dim1] = v[0];
+ h__[k + 1 + (k - 1) * h_dim1] = 0.;
+ if (k < i__ - 1) {
+ h__[k + 2 + (k - 1) * h_dim1] = 0.;
+ }
+ } else if (m > l) {
+/*
+ ==== Use the following instead of
+ . H( K, K-1 ) = -H( K, K-1 ) to
+ . avoid a bug when v(2) and v(3)
+ . underflow. ====
+*/
+ h__[k + (k - 1) * h_dim1] *= 1. - t1;
+ }
+ v2 = v[1];
+ t2 = t1 * v2;
+ if (nr == 3) {
+ v3 = v[2];
+ t3 = t1 * v3;
+
+/*
+ Apply G from the left to transform the rows of the matrix
+ in columns K to I2.
+*/
+
+ i__2 = i2;
+ for (j = k; j <= i__2; ++j) {
+ sum = h__[k + j * h_dim1] + v2 * h__[k + 1 + j * h_dim1]
+ + v3 * h__[k + 2 + j * h_dim1];
+ h__[k + j * h_dim1] -= sum * t1;
+ h__[k + 1 + j * h_dim1] -= sum * t2;
+ h__[k + 2 + j * h_dim1] -= sum * t3;
+/* L70: */
+ }
+
+/*
+ Apply G from the right to transform the columns of the
+ matrix in rows I1 to min(K+3,I).
+
+ Computing MIN
+*/
+ i__3 = k + 3;
+ i__2 = min(i__3,i__);
+ for (j = i1; j <= i__2; ++j) {
+ sum = h__[j + k * h_dim1] + v2 * h__[j + (k + 1) * h_dim1]
+ + v3 * h__[j + (k + 2) * h_dim1];
+ h__[j + k * h_dim1] -= sum * t1;
+ h__[j + (k + 1) * h_dim1] -= sum * t2;
+ h__[j + (k + 2) * h_dim1] -= sum * t3;
+/* L80: */
+ }
+
+ if (*wantz) {
+
+/* Accumulate transformations in the matrix Z */
+
+ i__2 = *ihiz;
+ for (j = *iloz; j <= i__2; ++j) {
+ sum = z__[j + k * z_dim1] + v2 * z__[j + (k + 1) *
+ z_dim1] + v3 * z__[j + (k + 2) * z_dim1];
+ z__[j + k * z_dim1] -= sum * t1;
+ z__[j + (k + 1) * z_dim1] -= sum * t2;
+ z__[j + (k + 2) * z_dim1] -= sum * t3;
+/* L90: */
+ }
+ }
+ } else if (nr == 2) {
+
+/*
+ Apply G from the left to transform the rows of the matrix
+ in columns K to I2.
+*/
+
+ i__2 = i2;
+ for (j = k; j <= i__2; ++j) {
+ sum = h__[k + j * h_dim1] + v2 * h__[k + 1 + j * h_dim1];
+ h__[k + j * h_dim1] -= sum * t1;
+ h__[k + 1 + j * h_dim1] -= sum * t2;
+/* L100: */
+ }
+
+/*
+ Apply G from the right to transform the columns of the
+ matrix in rows I1 to min(K+3,I).
+*/
+
+ i__2 = i__;
+ for (j = i1; j <= i__2; ++j) {
+ sum = h__[j + k * h_dim1] + v2 * h__[j + (k + 1) * h_dim1]
+ ;
+ h__[j + k * h_dim1] -= sum * t1;
+ h__[j + (k + 1) * h_dim1] -= sum * t2;
+/* L110: */
+ }
+
+ if (*wantz) {
+
+/* Accumulate transformations in the matrix Z */
+
+ i__2 = *ihiz;
+ for (j = *iloz; j <= i__2; ++j) {
+ sum = z__[j + k * z_dim1] + v2 * z__[j + (k + 1) *
+ z_dim1];
+ z__[j + k * z_dim1] -= sum * t1;
+ z__[j + (k + 1) * z_dim1] -= sum * t2;
+/* L120: */
+ }
+ }
+ }
+/* L130: */
+ }
+
+/* L140: */
+ }
+
+/* Failure to converge in remaining number of iterations */
+
+ *info = i__;
+ return 0;
+
+L150:
+
+ if (l == i__) {
+
+/* H(I,I-1) is negligible: one eigenvalue has converged. */
+
+ wr[i__] = h__[i__ + i__ * h_dim1];
+ wi[i__] = 0.;
+ } else if (l == i__ - 1) {
+
+/*
+ H(I-1,I-2) is negligible: a pair of eigenvalues have converged.
+
+ Transform the 2-by-2 submatrix to standard Schur form,
+ and compute and store the eigenvalues.
+*/
+
+ dlanv2_(&h__[i__ - 1 + (i__ - 1) * h_dim1], &h__[i__ - 1 + i__ *
+ h_dim1], &h__[i__ + (i__ - 1) * h_dim1], &h__[i__ + i__ *
+ h_dim1], &wr[i__ - 1], &wi[i__ - 1], &wr[i__], &wi[i__], &cs,
+ &sn);
+
+ if (*wantt) {
+
+/* Apply the transformation to the rest of H. */
+
+ if (i2 > i__) {
+ i__1 = i2 - i__;
+ drot_(&i__1, &h__[i__ - 1 + (i__ + 1) * h_dim1], ldh, &h__[
+ i__ + (i__ + 1) * h_dim1], ldh, &cs, &sn);
+ }
+ i__1 = i__ - i1 - 1;
+ drot_(&i__1, &h__[i1 + (i__ - 1) * h_dim1], &c__1, &h__[i1 + i__ *
+ h_dim1], &c__1, &cs, &sn);
+ }
+ if (*wantz) {
+
+/* Apply the transformation to Z. */
+
+ drot_(&nz, &z__[*iloz + (i__ - 1) * z_dim1], &c__1, &z__[*iloz +
+ i__ * z_dim1], &c__1, &cs, &sn);
+ }
+ }
+
+/* return to start of the main loop with new value of I. */
+
+ i__ = l - 1;
+ goto L20;
+
+L160:
+ return 0;
+
+/* End of DLAHQR */
+
+} /* dlahqr_ */
+
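+/*
+    Editorial sketch (not part of the f2c-translated LAPACK source): a
+    minimal eigenvalue-only call of DLAHQR on a small upper Hessenberg
+    matrix, assuming the f2c typedefs (integer, doublereal, logical) already
+    used by this translation unit.  With WANTT = WANTZ = .FALSE. neither the
+    Schur form nor Z is accumulated, and Z is not referenced.  The guard
+    macro DLAPACK_LITE_EXAMPLES is hypothetical and never defined.
+*/
+#ifdef DLAPACK_LITE_EXAMPLES
+static void example_dlahqr_(void)
+{
+    /* 3-by-3 upper Hessenberg matrix, stored column-major:
+           [ 1 2 3 ]
+           [ 4 5 6 ]
+           [ 0 7 8 ]                                            */
+    doublereal h[9] = { 1., 4., 0.,  2., 5., 7.,  3., 6., 8. };
+    doublereal wr[3], wi[3], z_dummy = 0.;
+    logical wantt = 0, wantz = 0;
+    integer n = 3, ilo = 1, ihi = 3, ldh = 3, iloz = 1, ihiz = 3, ldz = 1,
+            info;
+    dlahqr_(&wantt, &wantz, &n, &ilo, &ihi, h, &ldh, wr, wi, &iloz, &ihiz,
+            &z_dummy, &ldz, &info);
+    /* On success info == 0 and (wr[i], wi[i]) hold the computed
+       eigenvalues, conjugate pairs stored in consecutive entries. */
+}
+#endif /* DLAPACK_LITE_EXAMPLES */
+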
+/* Subroutine */ int dlahr2_(integer *n, integer *k, integer *nb, doublereal *
+ a, integer *lda, doublereal *tau, doublereal *t, integer *ldt,
+ doublereal *y, integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal ei;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *), dgemm_(char *, char *, integer *, integer *, integer *
+ , doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *), dgemv_(
+ char *, integer *, integer *, doublereal *, doublereal *, integer
+ *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *,
+ integer *), dtrmm_(char *, char *, char *, char *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *), daxpy_(integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *),
+ dtrmv_(char *, char *, char *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlarfg_(
+ integer *, doublereal *, doublereal *, integer *, doublereal *),
+ dlacpy_(char *, integer *, integer *, doublereal *, integer *,
+ doublereal *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+   DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
+   matrix A so that elements below the k-th subdiagonal are zero. The
+ reduction is performed by an orthogonal similarity transformation
+ Q' * A * Q. The routine returns the matrices V and T which determine
+ Q as a block reflector I - V*T*V', and also the matrix Y = A * V * T.
+
+ This is an auxiliary routine called by DGEHRD.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ K (input) INTEGER
+ The offset for the reduction. Elements below the k-th
+ subdiagonal in the first NB columns are reduced to zero.
+ K < N.
+
+ NB (input) INTEGER
+ The number of columns to be reduced.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N-K+1)
+ On entry, the n-by-(n-k+1) general matrix A.
+ On exit, the elements on and above the k-th subdiagonal in
+ the first NB columns are overwritten with the corresponding
+ elements of the reduced matrix; the elements below the k-th
+ subdiagonal, with the array TAU, represent the matrix Q as a
+ product of elementary reflectors. The other columns of A are
+ unchanged. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) DOUBLE PRECISION array, dimension (NB)
+ The scalar factors of the elementary reflectors. See Further
+ Details.
+
+ T (output) DOUBLE PRECISION array, dimension (LDT,NB)
+ The upper triangular matrix T.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= NB.
+
+ Y (output) DOUBLE PRECISION array, dimension (LDY,NB)
+ The n-by-nb matrix Y.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of nb elementary reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
+ A(i+k+1:n,i), and tau in TAU(i).
+
+ The elements of the vectors v together form the (n-k+1)-by-nb matrix
+ V which is needed, with T and Y, to apply the transformation to the
+ unreduced part of the matrix, using an update of the form:
+ A := (I - V*T*V') * (A - Y*V').
+
+ The contents of A on exit are illustrated by the following example
+ with n = 7, k = 3 and nb = 2:
+
+ ( a a a a a )
+ ( a a a a a )
+ ( a a a a a )
+ ( h h a a a )
+ ( v1 h a a a )
+ ( v1 v2 a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This subroutine is a slight modification of LAPACK-3.0's DLAHRD
+   incorporating improvements proposed by Quintana-Orti and Van de
+   Geijn. Note that the entries of A(1:K,2:NB) differ from those
+ returned by the original LAPACK-3.0's DLAHRD routine. (This
+ subroutine is not backward compatible with LAPACK-3.0's DLAHRD.)
+
+ References
+ ==========
+
+ Gregorio Quintana-Orti and Robert van de Geijn, "Improving the
+ performance of reduction to Hessenberg form," ACM Transactions on
+ Mathematical Software, 32(2):180-194, June 2006.
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ --tau;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*n <= 1) {
+ return 0;
+ }
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ > 1) {
+
+/*
+ Update A(K+1:N,I)
+
+ Update I-th column of A - Y * V'
+*/
+
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ dgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b151, &y[*k + 1 + y_dim1],
+ ldy, &a[*k + i__ - 1 + a_dim1], lda, &c_b15, &a[*k + 1 +
+ i__ * a_dim1], &c__1);
+
+/*
+ Apply I - V * T' * V' to this column (call it b) from the
+ left, using the last column of T as workspace
+
+ Let V = ( V1 ) and b = ( b1 ) (first I-1 rows)
+ ( V2 ) ( b2 )
+
+ where V1 is unit lower triangular
+
+ w := V1' * b1
+*/
+
+ i__2 = i__ - 1;
+ dcopy_(&i__2, &a[*k + 1 + i__ * a_dim1], &c__1, &t[*nb * t_dim1 +
+ 1], &c__1);
+ i__2 = i__ - 1;
+ dtrmv_("Lower", "Transpose", "UNIT", &i__2, &a[*k + 1 + a_dim1],
+ lda, &t[*nb * t_dim1 + 1], &c__1);
+
+/* w := w + V2'*b2 */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[*k + i__ + a_dim1],
+ lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b15, &t[*nb *
+ t_dim1 + 1], &c__1);
+
+/* w := T'*w */
+
+ i__2 = i__ - 1;
+ dtrmv_("Upper", "Transpose", "NON-UNIT", &i__2, &t[t_offset], ldt,
+ &t[*nb * t_dim1 + 1], &c__1);
+
+/* b2 := b2 - V2*w */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b151, &a[*k + i__ +
+ a_dim1], lda, &t[*nb * t_dim1 + 1], &c__1, &c_b15, &a[*k
+ + i__ + i__ * a_dim1], &c__1);
+
+/* b1 := b1 - V1*w */
+
+ i__2 = i__ - 1;
+ dtrmv_("Lower", "NO TRANSPOSE", "UNIT", &i__2, &a[*k + 1 + a_dim1]
+ , lda, &t[*nb * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ daxpy_(&i__2, &c_b151, &t[*nb * t_dim1 + 1], &c__1, &a[*k + 1 +
+ i__ * a_dim1], &c__1);
+
+ a[*k + i__ - 1 + (i__ - 1) * a_dim1] = ei;
+ }
+
+/*
+ Generate the elementary reflector H(I) to annihilate
+ A(K+I+1:N,I)
+*/
+
+ i__2 = *n - *k - i__ + 1;
+/* Computing MIN */
+ i__3 = *k + i__ + 1;
+ dlarfg_(&i__2, &a[*k + i__ + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &tau[i__]);
+ ei = a[*k + i__ + i__ * a_dim1];
+ a[*k + i__ + i__ * a_dim1] = 1.;
+
+/* Compute Y(K+1:N,I) */
+
+ i__2 = *n - *k;
+ i__3 = *n - *k - i__ + 1;
+ dgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b15, &a[*k + 1 + (i__ + 1) *
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b29, &y[*
+ k + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[*k + i__ + a_dim1], lda,
+ &a[*k + i__ + i__ * a_dim1], &c__1, &c_b29, &t[i__ * t_dim1 +
+ 1], &c__1);
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ dgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b151, &y[*k + 1 + y_dim1],
+ ldy, &t[i__ * t_dim1 + 1], &c__1, &c_b15, &y[*k + 1 + i__ *
+ y_dim1], &c__1);
+ i__2 = *n - *k;
+ dscal_(&i__2, &tau[i__], &y[*k + 1 + i__ * y_dim1], &c__1);
+
+/* Compute T(1:I,I) */
+
+ i__2 = i__ - 1;
+ d__1 = -tau[i__];
+ dscal_(&i__2, &d__1, &t[i__ * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ dtrmv_("Upper", "No Transpose", "NON-UNIT", &i__2, &t[t_offset], ldt,
+ &t[i__ * t_dim1 + 1], &c__1)
+ ;
+ t[i__ + i__ * t_dim1] = tau[i__];
+
+/* L10: */
+ }
+ a[*k + *nb + *nb * a_dim1] = ei;
+
+/* Compute Y(1:K,1:NB) */
+
+ dlacpy_("ALL", k, nb, &a[(a_dim1 << 1) + 1], lda, &y[y_offset], ldy);
+ dtrmm_("RIGHT", "Lower", "NO TRANSPOSE", "UNIT", k, nb, &c_b15, &a[*k + 1
+ + a_dim1], lda, &y[y_offset], ldy);
+ if (*n > *k + *nb) {
+ i__1 = *n - *k - *nb;
+ dgemm_("NO TRANSPOSE", "NO TRANSPOSE", k, nb, &i__1, &c_b15, &a[(*nb
+ + 2) * a_dim1 + 1], lda, &a[*k + 1 + *nb + a_dim1], lda, &
+ c_b15, &y[y_offset], ldy);
+ }
+ dtrmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b15, &t[
+ t_offset], ldt, &y[y_offset], ldy);
+
+ return 0;
+
+/* End of DLAHR2 */
+
+} /* dlahr2_ */
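+
+/*
+   Editor's illustrative sketch, not part of the original LAPACK source:
+   the effect of one elementary reflector H(i) = I - tau*v*v' on a
+   vector x of length n, as described in the Further Details section of
+   DLAHR2 above.  The f2c typedefs (integer, doublereal) are assumed to
+   be in scope from earlier in this file.
+*/
+static void example_apply_reflector(integer n, doublereal tau,
+                                    const doublereal *v, doublereal *x)
+{
+    integer i;
+    doublereal dot = 0.;
+
+    /* dot = v' * x */
+    for (i = 0; i < n; ++i) {
+        dot += v[i] * x[i];
+    }
+    /* x := x - tau * v * (v' * x), i.e. x := H(i) * x */
+    for (i = 0; i < n; ++i) {
+        x[i] -= tau * v[i] * dot;
+    }
+}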
+
+logical dlaisnan_(doublereal *din1, doublereal *din2)
+{
+ /* System generated locals */
+ logical ret_val;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ This routine is not for general use. It exists solely to avoid
+ over-optimization in DISNAN.
+
+ DLAISNAN checks for NaNs by comparing its two arguments for
+ inequality. NaN is the only floating-point value where NaN != NaN
+ returns .TRUE. To check for NaNs, pass the same variable as both
+ arguments.
+
+ A compiler must assume that the two arguments are
+ not the same variable, and the test will not be optimized away.
+ Interprocedural or whole-program optimization may delete this
+ test. The ISNAN functions will be replaced by the correct
+   Fortran 2003 intrinsic once the intrinsic is widely available.
+
+ Arguments
+ =========
+
+ DIN1 (input) DOUBLE PRECISION
+
+ DIN2 (input) DOUBLE PRECISION
+ Two numbers to compare for inequality.
+
+ =====================================================================
+*/
+
+ ret_val = *din1 != *din2;
+ return ret_val;
+} /* dlaisnan_ */
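+
+/*
+   Editor's illustrative sketch, not part of the original LAPACK source:
+   a caller tests a value for NaN by passing the same variable as both
+   arguments of DLAISNAN, exactly as the comment above prescribes.
+   Assumes the f2c typedefs (logical, doublereal) from earlier in this
+   file.
+*/
+static logical example_isnan(doublereal x)
+{
+    /* NaN is the only value for which x != x is true. */
+    return dlaisnan_(&x, &x);
+}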
+
+/* Subroutine */ int dlaln2_(logical *ltrans, integer *na, integer *nw,
+ doublereal *smin, doublereal *ca, doublereal *a, integer *lda,
+ doublereal *d1, doublereal *d2, doublereal *b, integer *ldb,
+ doublereal *wr, doublereal *wi, doublereal *x, integer *ldx,
+ doublereal *scale, doublereal *xnorm, integer *info)
+{
+ /* Initialized data */
+
+ static logical zswap[4] = { FALSE_,FALSE_,TRUE_,TRUE_ };
+ static logical rswap[4] = { FALSE_,TRUE_,FALSE_,TRUE_ };
+ static integer ipivot[16] /* was [4][4] */ = { 1,2,3,4,2,1,4,3,3,4,1,2,
+ 4,3,2,1 };
+
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+ static doublereal equiv_0[4], equiv_1[4];
+
+ /* Local variables */
+ static integer j;
+#define ci (equiv_0)
+#define cr (equiv_1)
+ static doublereal bi1, bi2, br1, br2, xi1, xi2, xr1, xr2, ci21, ci22,
+ cr21, cr22, li21, csi, ui11, lr21, ui12, ui22;
+#define civ (equiv_0)
+ static doublereal csr, ur11, ur12, ur22;
+#define crv (equiv_1)
+ static doublereal bbnd, cmax, ui11r, ui12s, temp, ur11r, ur12s, u22abs;
+ static integer icmax;
+ static doublereal bnorm, cnorm, smini;
+
+ extern /* Subroutine */ int dladiv_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *);
+ static doublereal bignum, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLALN2 solves a system of the form (ca A - w D ) X = s B
+ or (ca A' - w D) X = s B with possible scaling ("s") and
+ perturbation of A. (A' means A-transpose.)
+
+ A is an NA x NA real matrix, ca is a real scalar, D is an NA x NA
+ real diagonal matrix, w is a real or complex value, and X and B are
+ NA x 1 matrices -- real if w is real, complex if w is complex. NA
+ may be 1 or 2.
+
+ If w is complex, X and B are represented as NA x 2 matrices,
+ the first column of each being the real part and the second
+ being the imaginary part.
+
+ "s" is a scaling factor (.LE. 1), computed by DLALN2, which is
+ so chosen that X can be computed without overflow. X is further
+ scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
+ than overflow.
+
+ If both singular values of (ca A - w D) are less than SMIN,
+ SMIN*identity will be used instead of (ca A - w D). If only one
+ singular value is less than SMIN, one element of (ca A - w D) will be
+ perturbed enough to make the smallest singular value roughly SMIN.
+ If both singular values are at least SMIN, (ca A - w D) will not be
+ perturbed. In any case, the perturbation will be at most some small
+ multiple of max( SMIN, ulp*norm(ca A - w D) ). The singular values
+ are computed by infinity-norm approximations, and thus will only be
+ correct to a factor of 2 or so.
+
+ Note: all input quantities are assumed to be smaller than overflow
+ by a reasonable factor. (See BIGNUM.)
+
+ Arguments
+ ==========
+
+ LTRANS (input) LOGICAL
+ =.TRUE.: A-transpose will be used.
+ =.FALSE.: A will be used (not transposed.)
+
+ NA (input) INTEGER
+ The size of the matrix A. It may (only) be 1 or 2.
+
+ NW (input) INTEGER
+ 1 if "w" is real, 2 if "w" is complex. It may only be 1
+ or 2.
+
+ SMIN (input) DOUBLE PRECISION
+ The desired lower bound on the singular values of A. This
+ should be a safe distance away from underflow or overflow,
+ say, between (underflow/machine precision) and (machine
+ precision * overflow ). (See BIGNUM and ULP.)
+
+ CA (input) DOUBLE PRECISION
+ The coefficient c, which A is multiplied by.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,NA)
+ The NA x NA matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of A. It must be at least NA.
+
+ D1 (input) DOUBLE PRECISION
+ The 1,1 element in the diagonal matrix D.
+
+ D2 (input) DOUBLE PRECISION
+ The 2,2 element in the diagonal matrix D. Not used if NW=1.
+
+ B (input) DOUBLE PRECISION array, dimension (LDB,NW)
+ The NA x NW matrix B (right-hand side). If NW=2 ("w" is
+ complex), column 1 contains the real part of B and column 2
+ contains the imaginary part.
+
+ LDB (input) INTEGER
+ The leading dimension of B. It must be at least NA.
+
+ WR (input) DOUBLE PRECISION
+ The real part of the scalar "w".
+
+ WI (input) DOUBLE PRECISION
+ The imaginary part of the scalar "w". Not used if NW=1.
+
+ X (output) DOUBLE PRECISION array, dimension (LDX,NW)
+ The NA x NW matrix X (unknowns), as computed by DLALN2.
+ If NW=2 ("w" is complex), on exit, column 1 will contain
+ the real part of X and column 2 will contain the imaginary
+ part.
+
+ LDX (input) INTEGER
+ The leading dimension of X. It must be at least NA.
+
+ SCALE (output) DOUBLE PRECISION
+ The scale factor that B must be multiplied by to insure
+ that overflow does not occur when computing X. Thus,
+ (ca A - w D) X will be SCALE*B, not B (ignoring
+ perturbations of A.) It will be at most 1.
+
+ XNORM (output) DOUBLE PRECISION
+ The infinity-norm of X, when X is regarded as an NA x NW
+ real matrix.
+
+ INFO (output) INTEGER
+ An error flag. It will be set to zero if no error occurs,
+ a negative number if an argument is in error, or a positive
+ number if ca A - w D had to be perturbed.
+ The possible values are:
+ = 0: No error occurred, and (ca A - w D) did not have to be
+ perturbed.
+ = 1: (ca A - w D) had to be perturbed to make its smallest
+ (or only) singular value greater than SMIN.
+ NOTE: In the interests of speed, this routine does not
+ check the inputs for errors.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+
+ /* Function Body */
+
+/* Compute BIGNUM */
+
+ smlnum = 2. * SAFEMINIMUM;
+ bignum = 1. / smlnum;
+ smini = max(*smin,smlnum);
+
+/* Don't check for input errors */
+
+ *info = 0;
+
+/* Standard Initializations */
+
+ *scale = 1.;
+
+ if (*na == 1) {
+
+/* 1 x 1 (i.e., scalar) system C X = B */
+
+ if (*nw == 1) {
+
+/*
+ Real 1x1 system.
+
+ C = ca A - w D
+*/
+
+ csr = *ca * a[a_dim1 + 1] - *wr * *d1;
+ cnorm = abs(csr);
+
+/* If | C | < SMINI, use C = SMINI */
+
+ if (cnorm < smini) {
+ csr = smini;
+ cnorm = smini;
+ *info = 1;
+ }
+
+/* Check scaling for X = B / C */
+
+ bnorm = (d__1 = b[b_dim1 + 1], abs(d__1));
+ if (cnorm < 1. && bnorm > 1.) {
+ if (bnorm > bignum * cnorm) {
+ *scale = 1. / bnorm;
+ }
+ }
+
+/* Compute X */
+
+ x[x_dim1 + 1] = b[b_dim1 + 1] * *scale / csr;
+ *xnorm = (d__1 = x[x_dim1 + 1], abs(d__1));
+ } else {
+
+/*
+ Complex 1x1 system (w is complex)
+
+ C = ca A - w D
+*/
+
+ csr = *ca * a[a_dim1 + 1] - *wr * *d1;
+ csi = -(*wi) * *d1;
+ cnorm = abs(csr) + abs(csi);
+
+/* If | C | < SMINI, use C = SMINI */
+
+ if (cnorm < smini) {
+ csr = smini;
+ csi = 0.;
+ cnorm = smini;
+ *info = 1;
+ }
+
+/* Check scaling for X = B / C */
+
+ bnorm = (d__1 = b[b_dim1 + 1], abs(d__1)) + (d__2 = b[(b_dim1 <<
+ 1) + 1], abs(d__2));
+ if (cnorm < 1. && bnorm > 1.) {
+ if (bnorm > bignum * cnorm) {
+ *scale = 1. / bnorm;
+ }
+ }
+
+/* Compute X */
+
+ d__1 = *scale * b[b_dim1 + 1];
+ d__2 = *scale * b[(b_dim1 << 1) + 1];
+ dladiv_(&d__1, &d__2, &csr, &csi, &x[x_dim1 + 1], &x[(x_dim1 << 1)
+ + 1]);
+ *xnorm = (d__1 = x[x_dim1 + 1], abs(d__1)) + (d__2 = x[(x_dim1 <<
+ 1) + 1], abs(d__2));
+ }
+
+ } else {
+
+/*
+ 2x2 System
+
+ Compute the real part of C = ca A - w D (or ca A' - w D )
+*/
+
+ cr[0] = *ca * a[a_dim1 + 1] - *wr * *d1;
+ cr[3] = *ca * a[(a_dim1 << 1) + 2] - *wr * *d2;
+ if (*ltrans) {
+ cr[2] = *ca * a[a_dim1 + 2];
+ cr[1] = *ca * a[(a_dim1 << 1) + 1];
+ } else {
+ cr[1] = *ca * a[a_dim1 + 2];
+ cr[2] = *ca * a[(a_dim1 << 1) + 1];
+ }
+
+ if (*nw == 1) {
+
+/*
+ Real 2x2 system (w is real)
+
+ Find the largest element in C
+*/
+
+ cmax = 0.;
+ icmax = 0;
+
+ for (j = 1; j <= 4; ++j) {
+ if ((d__1 = crv[j - 1], abs(d__1)) > cmax) {
+ cmax = (d__1 = crv[j - 1], abs(d__1));
+ icmax = j;
+ }
+/* L10: */
+ }
+
+/* If norm(C) < SMINI, use SMINI*identity. */
+
+ if (cmax < smini) {
+/* Computing MAX */
+ d__3 = (d__1 = b[b_dim1 + 1], abs(d__1)), d__4 = (d__2 = b[
+ b_dim1 + 2], abs(d__2));
+ bnorm = max(d__3,d__4);
+ if (smini < 1. && bnorm > 1.) {
+ if (bnorm > bignum * smini) {
+ *scale = 1. / bnorm;
+ }
+ }
+ temp = *scale / smini;
+ x[x_dim1 + 1] = temp * b[b_dim1 + 1];
+ x[x_dim1 + 2] = temp * b[b_dim1 + 2];
+ *xnorm = temp * bnorm;
+ *info = 1;
+ return 0;
+ }
+
+/* Gaussian elimination with complete pivoting. */
+
+ ur11 = crv[icmax - 1];
+ cr21 = crv[ipivot[(icmax << 2) - 3] - 1];
+ ur12 = crv[ipivot[(icmax << 2) - 2] - 1];
+ cr22 = crv[ipivot[(icmax << 2) - 1] - 1];
+ ur11r = 1. / ur11;
+ lr21 = ur11r * cr21;
+ ur22 = cr22 - ur12 * lr21;
+
+/* If smaller pivot < SMINI, use SMINI */
+
+ if (abs(ur22) < smini) {
+ ur22 = smini;
+ *info = 1;
+ }
+ if (rswap[icmax - 1]) {
+ br1 = b[b_dim1 + 2];
+ br2 = b[b_dim1 + 1];
+ } else {
+ br1 = b[b_dim1 + 1];
+ br2 = b[b_dim1 + 2];
+ }
+ br2 -= lr21 * br1;
+/* Computing MAX */
+ d__2 = (d__1 = br1 * (ur22 * ur11r), abs(d__1)), d__3 = abs(br2);
+ bbnd = max(d__2,d__3);
+ if (bbnd > 1. && abs(ur22) < 1.) {
+ if (bbnd >= bignum * abs(ur22)) {
+ *scale = 1. / bbnd;
+ }
+ }
+
+ xr2 = br2 * *scale / ur22;
+ xr1 = *scale * br1 * ur11r - xr2 * (ur11r * ur12);
+ if (zswap[icmax - 1]) {
+ x[x_dim1 + 1] = xr2;
+ x[x_dim1 + 2] = xr1;
+ } else {
+ x[x_dim1 + 1] = xr1;
+ x[x_dim1 + 2] = xr2;
+ }
+/* Computing MAX */
+ d__1 = abs(xr1), d__2 = abs(xr2);
+ *xnorm = max(d__1,d__2);
+
+/* Further scaling if norm(A) norm(X) > overflow */
+
+ if (*xnorm > 1. && cmax > 1.) {
+ if (*xnorm > bignum / cmax) {
+ temp = cmax / bignum;
+ x[x_dim1 + 1] = temp * x[x_dim1 + 1];
+ x[x_dim1 + 2] = temp * x[x_dim1 + 2];
+ *xnorm = temp * *xnorm;
+ *scale = temp * *scale;
+ }
+ }
+ } else {
+
+/*
+ Complex 2x2 system (w is complex)
+
+ Find the largest element in C
+*/
+
+ ci[0] = -(*wi) * *d1;
+ ci[1] = 0.;
+ ci[2] = 0.;
+ ci[3] = -(*wi) * *d2;
+ cmax = 0.;
+ icmax = 0;
+
+ for (j = 1; j <= 4; ++j) {
+ if ((d__1 = crv[j - 1], abs(d__1)) + (d__2 = civ[j - 1], abs(
+ d__2)) > cmax) {
+ cmax = (d__1 = crv[j - 1], abs(d__1)) + (d__2 = civ[j - 1]
+ , abs(d__2));
+ icmax = j;
+ }
+/* L20: */
+ }
+
+/* If norm(C) < SMINI, use SMINI*identity. */
+
+ if (cmax < smini) {
+/* Computing MAX */
+ d__5 = (d__1 = b[b_dim1 + 1], abs(d__1)) + (d__2 = b[(b_dim1
+ << 1) + 1], abs(d__2)), d__6 = (d__3 = b[b_dim1 + 2],
+ abs(d__3)) + (d__4 = b[(b_dim1 << 1) + 2], abs(d__4));
+ bnorm = max(d__5,d__6);
+ if (smini < 1. && bnorm > 1.) {
+ if (bnorm > bignum * smini) {
+ *scale = 1. / bnorm;
+ }
+ }
+ temp = *scale / smini;
+ x[x_dim1 + 1] = temp * b[b_dim1 + 1];
+ x[x_dim1 + 2] = temp * b[b_dim1 + 2];
+ x[(x_dim1 << 1) + 1] = temp * b[(b_dim1 << 1) + 1];
+ x[(x_dim1 << 1) + 2] = temp * b[(b_dim1 << 1) + 2];
+ *xnorm = temp * bnorm;
+ *info = 1;
+ return 0;
+ }
+
+/* Gaussian elimination with complete pivoting. */
+
+ ur11 = crv[icmax - 1];
+ ui11 = civ[icmax - 1];
+ cr21 = crv[ipivot[(icmax << 2) - 3] - 1];
+ ci21 = civ[ipivot[(icmax << 2) - 3] - 1];
+ ur12 = crv[ipivot[(icmax << 2) - 2] - 1];
+ ui12 = civ[ipivot[(icmax << 2) - 2] - 1];
+ cr22 = crv[ipivot[(icmax << 2) - 1] - 1];
+ ci22 = civ[ipivot[(icmax << 2) - 1] - 1];
+ if (icmax == 1 || icmax == 4) {
+
+/* Code when off-diagonals of pivoted C are real */
+
+ if (abs(ur11) > abs(ui11)) {
+ temp = ui11 / ur11;
+/* Computing 2nd power */
+ d__1 = temp;
+ ur11r = 1. / (ur11 * (d__1 * d__1 + 1.));
+ ui11r = -temp * ur11r;
+ } else {
+ temp = ur11 / ui11;
+/* Computing 2nd power */
+ d__1 = temp;
+ ui11r = -1. / (ui11 * (d__1 * d__1 + 1.));
+ ur11r = -temp * ui11r;
+ }
+ lr21 = cr21 * ur11r;
+ li21 = cr21 * ui11r;
+ ur12s = ur12 * ur11r;
+ ui12s = ur12 * ui11r;
+ ur22 = cr22 - ur12 * lr21;
+ ui22 = ci22 - ur12 * li21;
+ } else {
+
+/* Code when diagonals of pivoted C are real */
+
+ ur11r = 1. / ur11;
+ ui11r = 0.;
+ lr21 = cr21 * ur11r;
+ li21 = ci21 * ur11r;
+ ur12s = ur12 * ur11r;
+ ui12s = ui12 * ur11r;
+ ur22 = cr22 - ur12 * lr21 + ui12 * li21;
+ ui22 = -ur12 * li21 - ui12 * lr21;
+ }
+ u22abs = abs(ur22) + abs(ui22);
+
+/* If smaller pivot < SMINI, use SMINI */
+
+ if (u22abs < smini) {
+ ur22 = smini;
+ ui22 = 0.;
+ *info = 1;
+ }
+ if (rswap[icmax - 1]) {
+ br2 = b[b_dim1 + 1];
+ br1 = b[b_dim1 + 2];
+ bi2 = b[(b_dim1 << 1) + 1];
+ bi1 = b[(b_dim1 << 1) + 2];
+ } else {
+ br1 = b[b_dim1 + 1];
+ br2 = b[b_dim1 + 2];
+ bi1 = b[(b_dim1 << 1) + 1];
+ bi2 = b[(b_dim1 << 1) + 2];
+ }
+ br2 = br2 - lr21 * br1 + li21 * bi1;
+ bi2 = bi2 - li21 * br1 - lr21 * bi1;
+/* Computing MAX */
+ d__1 = (abs(br1) + abs(bi1)) * (u22abs * (abs(ur11r) + abs(ui11r))
+ ), d__2 = abs(br2) + abs(bi2);
+ bbnd = max(d__1,d__2);
+ if (bbnd > 1. && u22abs < 1.) {
+ if (bbnd >= bignum * u22abs) {
+ *scale = 1. / bbnd;
+ br1 = *scale * br1;
+ bi1 = *scale * bi1;
+ br2 = *scale * br2;
+ bi2 = *scale * bi2;
+ }
+ }
+
+ dladiv_(&br2, &bi2, &ur22, &ui22, &xr2, &xi2);
+ xr1 = ur11r * br1 - ui11r * bi1 - ur12s * xr2 + ui12s * xi2;
+ xi1 = ui11r * br1 + ur11r * bi1 - ui12s * xr2 - ur12s * xi2;
+ if (zswap[icmax - 1]) {
+ x[x_dim1 + 1] = xr2;
+ x[x_dim1 + 2] = xr1;
+ x[(x_dim1 << 1) + 1] = xi2;
+ x[(x_dim1 << 1) + 2] = xi1;
+ } else {
+ x[x_dim1 + 1] = xr1;
+ x[x_dim1 + 2] = xr2;
+ x[(x_dim1 << 1) + 1] = xi1;
+ x[(x_dim1 << 1) + 2] = xi2;
+ }
+/* Computing MAX */
+ d__1 = abs(xr1) + abs(xi1), d__2 = abs(xr2) + abs(xi2);
+ *xnorm = max(d__1,d__2);
+
+/* Further scaling if norm(A) norm(X) > overflow */
+
+ if (*xnorm > 1. && cmax > 1.) {
+ if (*xnorm > bignum / cmax) {
+ temp = cmax / bignum;
+ x[x_dim1 + 1] = temp * x[x_dim1 + 1];
+ x[x_dim1 + 2] = temp * x[x_dim1 + 2];
+ x[(x_dim1 << 1) + 1] = temp * x[(x_dim1 << 1) + 1];
+ x[(x_dim1 << 1) + 2] = temp * x[(x_dim1 << 1) + 2];
+ *xnorm = temp * *xnorm;
+ *scale = temp * *scale;
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DLALN2 */
+
+} /* dlaln2_ */
+
+#undef crv
+#undef civ
+#undef cr
+#undef ci
+
+
+/* Subroutine */ int dlals0_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, integer *nrhs, doublereal *b, integer *ldb, doublereal
+ *bx, integer *ldbx, integer *perm, integer *givptr, integer *givcol,
+ integer *ldgcol, doublereal *givnum, integer *ldgnum, doublereal *
+ poles, doublereal *difl, doublereal *difr, doublereal *z__, integer *
+ k, doublereal *c__, doublereal *s, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, b_dim1, b_offset, bx_dim1, bx_offset,
+ difr_dim1, difr_offset, givnum_dim1, givnum_offset, poles_dim1,
+ poles_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, m, n;
+ static doublereal dj;
+ static integer nlp1;
+ static doublereal temp;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ static doublereal diflj, difrj, dsigj;
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *), dcopy_(integer *,
+ doublereal *, integer *, doublereal *, integer *);
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlacpy_(char *, integer *, integer
+ *, doublereal *, integer *, doublereal *, integer *),
+ xerbla_(char *, integer *);
+ static doublereal dsigjp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLALS0 applies back the multiplying factors of either the left or the
+ right singular vector matrix of a diagonal matrix appended by a row
+ to the right hand side matrix B in solving the least squares problem
+ using the divide-and-conquer SVD approach.
+
+ For the left singular vector matrix, three types of orthogonal
+ matrices are involved:
+
+ (1L) Givens rotations: the number of such rotations is GIVPTR; the
+ pairs of columns/rows they were applied to are stored in GIVCOL;
+ and the C- and S-values of these rotations are stored in GIVNUM.
+
+ (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+ row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+ J-th row.
+
+ (3L) The left singular vector matrix of the remaining matrix.
+
+ For the right singular vector matrix, four types of orthogonal
+ matrices are involved:
+
+ (1R) The right singular vector matrix of the remaining matrix.
+
+ (2R) If SQRE = 1, one extra Givens rotation to generate the right
+ null space.
+
+ (3R) The inverse transformation of (2L).
+
+ (4R) The inverse transformation of (1L).
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form:
+ = 0: Left singular vector matrix.
+ = 1: Right singular vector matrix.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) DOUBLE PRECISION array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M. On output, B contains
+ the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B. LDB must be at least
+ max(1,MAX( M, N ) ).
+
+ BX (workspace) DOUBLE PRECISION array, dimension ( LDBX, NRHS )
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ PERM (input) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) applied
+ to the two blocks.
+
+ GIVPTR (input) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 )
+ Each pair of numbers indicates a pair of rows/columns
+ involved in a Givens rotation.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value used in the
+ corresponding Givens rotation.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of arrays DIFR, POLES and
+ GIVNUM, must be at least K.
+
+ POLES (input) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ On entry, POLES(1:K, 1) contains the new singular
+ values obtained from solving the secular equation, and
+ POLES(1:K, 2) is an array containing the poles in the secular
+ equation.
+
+ DIFL (input) DOUBLE PRECISION array, dimension ( K ).
+ On entry, DIFL(I) is the distance between I-th updated
+ (undeflated) singular value and the I-th (undeflated) old
+ singular value.
+
+ DIFR (input) DOUBLE PRECISION array, dimension ( LDGNUM, 2 ).
+ On entry, DIFR(I, 1) contains the distances between I-th
+ updated (undeflated) singular value and the I+1-th
+ (undeflated) old singular value. And DIFR(I, 2) is the
+ normalizing factor for the I-th right singular vector.
+
+ Z (input) DOUBLE PRECISION array, dimension ( K )
+ Contain the components of the deflation-adjusted updating row
+ vector.
+
+ K (input) INTEGER
+          Contains the dimension of the non-deflated matrix.
+          This is the order of the related secular equation. 1 <= K <= N.
+
+ C (input) DOUBLE PRECISION
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (input) DOUBLE PRECISION
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension ( K )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ difr_dim1 = *ldgnum;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ poles_dim1 = *ldgnum;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ --difl;
+ --z__;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ }
+
+ n = *nl + *nr + 1;
+
+ if (*nrhs < 1) {
+ *info = -5;
+ } else if (*ldb < n) {
+ *info = -7;
+ } else if (*ldbx < n) {
+ *info = -9;
+ } else if (*givptr < 0) {
+ *info = -11;
+ } else if (*ldgcol < n) {
+ *info = -13;
+ } else if (*ldgnum < n) {
+ *info = -15;
+ } else if (*k < 1) {
+ *info = -20;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLALS0", &i__1);
+ return 0;
+ }
+
+ m = n + *sqre;
+ nlp1 = *nl + 1;
+
+ if (*icompq == 0) {
+
+/*
+ Apply back orthogonal transformations from the left.
+
+ Step (1L): apply back the Givens rotations performed.
+*/
+
+ i__1 = *givptr;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ drot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]);
+/* L10: */
+ }
+
+/* Step (2L): permute rows of B. */
+
+ dcopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx);
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ dcopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1],
+ ldbx);
+/* L20: */
+ }
+
+/*
+ Step (3L): apply the inverse of the left singular vector
+ matrix to BX.
+*/
+
+ if (*k == 1) {
+ dcopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb);
+ if (z__[1] < 0.) {
+ dscal_(nrhs, &c_b151, &b[b_offset], ldb);
+ }
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ diflj = difl[j];
+ dj = poles[j + poles_dim1];
+ dsigj = -poles[j + (poles_dim1 << 1)];
+ if (j < *k) {
+ difrj = -difr[j + difr_dim1];
+ dsigjp = -poles[j + 1 + (poles_dim1 << 1)];
+ }
+ if (z__[j] == 0. || poles[j + (poles_dim1 << 1)] == 0.) {
+ work[j] = 0.;
+ } else {
+ work[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj /
+ (poles[j + (poles_dim1 << 1)] + dj);
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0. || poles[i__ + (poles_dim1 << 1)] ==
+ 0.) {
+ work[i__] = 0.;
+ } else {
+ work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (dlamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigj) - diflj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L30: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0. || poles[i__ + (poles_dim1 << 1)] ==
+ 0.) {
+ work[i__] = 0.;
+ } else {
+ work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (dlamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigjp) + difrj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L40: */
+ }
+ work[1] = -1.;
+ temp = dnrm2_(k, &work[1], &c__1);
+ dgemv_("T", k, nrhs, &c_b15, &bx[bx_offset], ldbx, &work[1], &
+ c__1, &c_b29, &b[j + b_dim1], ldb);
+ dlascl_("G", &c__0, &c__0, &temp, &c_b15, &c__1, nrhs, &b[j +
+ b_dim1], ldb, info);
+/* L50: */
+ }
+ }
+
+/* Move the deflated rows of BX to B also. */
+
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ dlacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1
+ + b_dim1], ldb);
+ }
+ } else {
+
+/*
+ Apply back the right orthogonal transformations.
+
+ Step (1R): apply back the new right singular vector matrix
+ to B.
+*/
+
+ if (*k == 1) {
+ dcopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx);
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dsigj = poles[j + (poles_dim1 << 1)];
+ if (z__[j] == 0.) {
+ work[j] = 0.;
+ } else {
+ work[j] = -z__[j] / difl[j] / (dsigj + poles[j +
+ poles_dim1]) / difr[j + (difr_dim1 << 1)];
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.) {
+ work[i__] = 0.;
+ } else {
+ d__1 = -poles[i__ + 1 + (poles_dim1 << 1)];
+ work[i__] = z__[j] / (dlamc3_(&dsigj, &d__1) - difr[
+ i__ + difr_dim1]) / (dsigj + poles[i__ +
+ poles_dim1]) / difr[i__ + (difr_dim1 << 1)];
+ }
+/* L60: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.) {
+ work[i__] = 0.;
+ } else {
+ d__1 = -poles[i__ + (poles_dim1 << 1)];
+ work[i__] = z__[j] / (dlamc3_(&dsigj, &d__1) - difl[
+ i__]) / (dsigj + poles[i__ + poles_dim1]) /
+ difr[i__ + (difr_dim1 << 1)];
+ }
+/* L70: */
+ }
+ dgemv_("T", k, nrhs, &c_b15, &b[b_offset], ldb, &work[1], &
+ c__1, &c_b29, &bx[j + bx_dim1], ldbx);
+/* L80: */
+ }
+ }
+
+/*
+ Step (2R): if SQRE = 1, apply back the rotation that is
+ related to the right null space of the subproblem.
+*/
+
+ if (*sqre == 1) {
+ dcopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx);
+ drot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__,
+ s);
+ }
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ dlacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 +
+ bx_dim1], ldbx);
+ }
+
+/* Step (3R): permute rows of B. */
+
+ dcopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb);
+ if (*sqre == 1) {
+ dcopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb);
+ }
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ dcopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1],
+ ldb);
+/* L90: */
+ }
+
+/* Step (4R): apply back the Givens rotations performed. */
+
+ for (i__ = *givptr; i__ >= 1; --i__) {
+ d__1 = -givnum[i__ + givnum_dim1];
+ drot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &d__1);
+/* L100: */
+ }
+ }
+
+ return 0;
+
+/* End of DLALS0 */
+
+} /* dlals0_ */
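+
+/*
+   Editor's illustrative sketch, not part of the original LAPACK source:
+   step (1L) above applies plane (Givens) rotations; this shows the
+   effect of one rotation with cosine c and sine s on a single pair of
+   entries, which is what each drot_ call above does column by column.
+*/
+static void example_givens_rotation(doublereal c, doublereal s,
+                                    doublereal *x, doublereal *y)
+{
+    doublereal t1 = *x, t2 = *y;
+
+    *x = c * t1 + s * t2;    /* rotated first entry  */
+    *y = c * t2 - s * t1;    /* rotated second entry */
+}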
+
+/* Subroutine */ int dlalsa_(integer *icompq, integer *smlsiz, integer *n,
+ integer *nrhs, doublereal *b, integer *ldb, doublereal *bx, integer *
+ ldbx, doublereal *u, integer *ldu, doublereal *vt, integer *k,
+ doublereal *difl, doublereal *difr, doublereal *z__, doublereal *
+ poles, integer *givptr, integer *givcol, integer *ldgcol, integer *
+ perm, doublereal *givnum, doublereal *c__, doublereal *s, doublereal *
+ work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, b_dim1,
+ b_offset, bx_dim1, bx_offset, difl_dim1, difl_offset, difr_dim1,
+ difr_offset, givnum_dim1, givnum_offset, poles_dim1, poles_offset,
+ u_dim1, u_offset, vt_dim1, vt_offset, z_dim1, z_offset, i__1,
+ i__2;
+
+ /* Local variables */
+ static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl,
+ ndb1, nlp1, lvl2, nrp1, nlvl, sqre;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer inode, ndiml, ndimr;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *), dlals0_(integer *, integer *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *), dlasdt_(integer *, integer *, integer *, integer *,
+ integer *, integer *, integer *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+   DLALSA is an intermediate step in solving the least squares problem
+   by computing the SVD of the coefficient matrix in compact form (the
+   singular vectors are computed as products of simple orthogonal
+   matrices).
+
+ If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector
+ matrix of an upper bidiagonal matrix to the right hand side; and if
+ ICOMPQ = 1, DLALSA applies the right singular vector matrix to the
+ right hand side. The singular vector matrices were generated in
+   compact form by DLASDA.
+
+ Arguments
+ =========
+
+
+ ICOMPQ (input) INTEGER
+ Specifies whether the left or the right singular vector
+ matrix is involved.
+ = 0: Left singular vector matrix
+ = 1: Right singular vector matrix
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The row and column dimensions of the upper bidiagonal matrix.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) DOUBLE PRECISION array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M.
+ On output, B contains the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,MAX( M, N ) ).
+
+ BX (output) DOUBLE PRECISION array, dimension ( LDBX, NRHS )
+ On exit, the result of applying the left or right singular
+ vector matrix to B.
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ U (input) DOUBLE PRECISION array, dimension ( LDU, SMLSIZ ).
+ On entry, U contains the left singular vector matrices of all
+ subproblems at the bottom level.
+
+   LDU (input) INTEGER, LDU >= N.
+ The leading dimension of arrays U, VT, DIFL, DIFR,
+ POLES, GIVNUM, and Z.
+
+ VT (input) DOUBLE PRECISION array, dimension ( LDU, SMLSIZ+1 ).
+ On entry, VT' contains the right singular vector matrices of
+ all subproblems at the bottom level.
+
+ K (input) INTEGER array, dimension ( N ).
+
+ DIFL (input) DOUBLE PRECISION array, dimension ( LDU, NLVL ).
+ where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1.
+
+ DIFR (input) DOUBLE PRECISION array, dimension ( LDU, 2 * NLVL ).
+          On entry, DIFL(*, I) and DIFR(*, 2 * I - 1) record
+          distances between singular values on the I-th level and
+          singular values on the (I-1)-th level, and DIFR(*, 2 * I)
+          records the normalizing factors of the right singular vector
+          matrices of subproblems on the I-th level.
+
+ Z (input) DOUBLE PRECISION array, dimension ( LDU, NLVL ).
+ On entry, Z(1, I) contains the components of the deflation-
+ adjusted updating row vector for subproblems on the I-th
+ level.
+
+ POLES (input) DOUBLE PRECISION array, dimension ( LDU, 2 * NLVL ).
+ On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old
+ singular values involved in the secular equations on the I-th
+ level.
+
+ GIVPTR (input) INTEGER array, dimension ( N ).
+ On entry, GIVPTR( I ) records the number of Givens
+ rotations performed on the I-th problem on the computation
+ tree.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ).
+ On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the
+ locations of Givens rotations performed on the I-th level on
+ the computation tree.
+
+   LDGCOL (input) INTEGER, LDGCOL >= N.
+ The leading dimension of arrays GIVCOL and PERM.
+
+ PERM (input) INTEGER array, dimension ( LDGCOL, NLVL ).
+ On entry, PERM(*, I) records permutations done on the I-th
+ level of the computation tree.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension ( LDU, 2 * NLVL ).
+ On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S-
+ values of Givens rotations performed on the I-th level on the
+ computation tree.
+
+ C (input) DOUBLE PRECISION array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ C( I ) contains the C-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ S (input) DOUBLE PRECISION array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ S( I ) contains the S-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ WORK (workspace) DOUBLE PRECISION array.
+ The dimension must be at least N.
+
+ IWORK (workspace) INTEGER array.
+          The dimension must be at least 3 * N.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ givnum_dim1 = *ldu;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ poles_dim1 = *ldu;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ z_dim1 = *ldu;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ difr_dim1 = *ldu;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ difl_dim1 = *ldu;
+ difl_offset = 1 + difl_dim1;
+ difl -= difl_offset;
+ vt_dim1 = *ldu;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ --k;
+ --givptr;
+ perm_dim1 = *ldgcol;
+ perm_offset = 1 + perm_dim1;
+ perm -= perm_offset;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ --c__;
+ --s;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*smlsiz < 3) {
+ *info = -2;
+ } else if (*n < *smlsiz) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < *n) {
+ *info = -6;
+ } else if (*ldbx < *n) {
+ *info = -8;
+ } else if (*ldu < *n) {
+ *info = -10;
+ } else if (*ldgcol < *n) {
+ *info = -19;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLALSA", &i__1);
+ return 0;
+ }
+
+/* Book-keeping and setting up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+
+ dlasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ The following code applies back the left singular vector factors.
+ For applying back the right singular vector factors, go to 50.
+*/
+
+ if (*icompq == 1) {
+ goto L50;
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by DLASDQ. The corresponding left and right singular vector
+ matrices are in explicit form. First apply back the left
+ singular vector matrices.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ dgemm_("T", "N", &nl, nrhs, &nl, &c_b15, &u[nlf + u_dim1], ldu, &b[
+ nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx);
+ dgemm_("T", "N", &nr, nrhs, &nr, &c_b15, &u[nrf + u_dim1], ldu, &b[
+ nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx);
+/* L10: */
+ }
+
+/*
+ Next copy the rows of B that correspond to unchanged rows
+ in the bidiagonal matrix to BX.
+*/
+
+ i__1 = nd;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ic = iwork[inode + i__ - 1];
+ dcopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx);
+/* L20: */
+ }
+
+/*
+ Finally go through the left singular vector matrices of all
+ the other subproblems bottom-up on the tree.
+*/
+
+ j = pow_ii(&c__2, &nlvl);
+ sqre = 0;
+
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ find the first node LF and last node LL on
+ the current level LVL
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ --j;
+ dlals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, &
+ b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &work[1], info);
+/* L30: */
+ }
+/* L40: */
+ }
+ goto L90;
+
+/* ICOMPQ = 1: applying back the right singular vector factors. */
+
+L50:
+
+/*
+ First now go through the right singular vector matrices of all
+ the tree nodes top-down.
+*/
+
+ j = 0;
+ i__1 = nlvl;
+ for (lvl = 1; lvl <= i__1; ++lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ Find the first node LF and last node LL on
+ the current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__2 = lvl - 1;
+ lf = pow_ii(&c__2, &i__2);
+ ll = (lf << 1) - 1;
+ }
+ i__2 = lf;
+ for (i__ = ll; i__ >= i__2; --i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ if (i__ == ll) {
+ sqre = 0;
+ } else {
+ sqre = 1;
+ }
+ ++j;
+ dlals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[
+ nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &work[1], info);
+/* L60: */
+ }
+/* L70: */
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by DLASDQ. The corresponding right singular vector
+ matrices are in explicit form. Apply them back.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlp1 = nl + 1;
+ if (i__ == nd) {
+ nrp1 = nr;
+ } else {
+ nrp1 = nr + 1;
+ }
+ nlf = ic - nl;
+ nrf = ic + 1;
+ dgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b15, &vt[nlf + vt_dim1], ldu,
+ &b[nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx);
+ dgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b15, &vt[nrf + vt_dim1], ldu,
+ &b[nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx);
+/* L80: */
+ }
+
+L90:
+
+ return 0;
+
+/* End of DLALSA */
+
+} /* dlalsa_ */
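+
+/*
+   Editor's illustrative sketch, not part of the original LAPACK source:
+   the depth NLVL of the DLALSA computation tree, computed with the same
+   formula quoted in the DIFL/DIFR argument descriptions above,
+   NLVL = INT(log_2(N/(SMLSIZ+1))) + 1.  Assumes log() from <math.h>,
+   which this file already relies on.
+*/
+static integer example_nlvl(integer n, integer smlsiz)
+{
+    return (integer) (log((doublereal) n / (doublereal) (smlsiz + 1)) /
+            log(2.)) + 1;
+}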
+
+/* Subroutine */ int dlalsd_(char *uplo, integer *smlsiz, integer *n, integer
+ *nrhs, doublereal *d__, doublereal *e, doublereal *b, integer *ldb,
+ doublereal *rcond, integer *rank, doublereal *work, integer *iwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer b_dim1, b_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer c__, i__, j, k;
+ static doublereal r__;
+ static integer s, u, z__;
+ static doublereal cs;
+ static integer bx;
+ static doublereal sn;
+ static integer st, vt, nm1, st1;
+ static doublereal eps;
+ static integer iwk;
+ static doublereal tol;
+ static integer difl, difr;
+ static doublereal rcnd;
+ static integer perm, nsub;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static integer nlvl, sqre, bxst;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *),
+ dcopy_(integer *, doublereal *, integer *, doublereal *, integer
+ *);
+ static integer poles, sizei, nsize, nwork, icmpq1, icmpq2;
+
+ extern /* Subroutine */ int dlasda_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *), dlalsa_(integer *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, doublereal *, integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ integer *, integer *), dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *);
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dlasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlacpy_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *), dlaset_(char *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *),
+ xerbla_(char *, integer *);
+ static integer givcol;
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *,
+ integer *);
+ static doublereal orgnrm;
+ static integer givnum, givptr, smlszp;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLALSD uses the singular value decomposition of A to solve the least
+ squares problem of finding X to minimize the Euclidean norm of each
+ column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+ are N-by-NRHS. The solution X overwrites B.
+
+ The singular values of A smaller than RCOND times the largest
+ singular value are treated as zero in solving the least squares
+ problem; in this case a minimum norm solution is returned.
+ The actual singular values are returned in D in ascending order.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': D and E define an upper bidiagonal matrix.
+ = 'L': D and E define a lower bidiagonal matrix.
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The dimension of the bidiagonal matrix. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of columns of B. NRHS must be at least 1.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry D contains the main diagonal of the bidiagonal
+ matrix. On exit, if INFO = 0, D contains its singular values.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ Contains the super-diagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
+ On input, B contains the right hand sides of the least
+ squares problem. On output, B contains the solution X.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,N).
+
+ RCOND (input) DOUBLE PRECISION
+ The singular values of A less than or equal to RCOND times
+ the largest singular value are treated as zero in solving
+ the least squares problem. If RCOND is negative,
+ machine precision is used instead.
+ For example, if diag(S)*X=B were the least squares problem,
+ where diag(S) is a diagonal matrix of singular values, the
+ solution would be X(i) = B(i) / S(i) if S(i) is greater than
+ RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to
+ RCOND*max(S).
+
+ RANK (output) INTEGER
+ The number of singular values of A greater than RCOND times
+ the largest singular value.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension at least
+ (9*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2),
+ where NLVL = max(0, INT(log_2 (N/(SMLSIZ+1))) + 1).
+
+ IWORK (workspace) INTEGER array, dimension at least
+ (3*N*NLVL + 11*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute a singular value while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through MOD(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < 1 || *ldb < *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLALSD", &i__1);
+ return 0;
+ }
+
+ eps = EPSILON;
+
+/* Set up the tolerance. */
+
+ if (*rcond <= 0. || *rcond >= 1.) {
+ rcnd = eps;
+ } else {
+ rcnd = *rcond;
+ }
+
+ *rank = 0;
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+ if (d__[1] == 0.) {
+ dlaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+ } else {
+ *rank = 1;
+ dlascl_("G", &c__0, &c__0, &d__[1], &c_b15, &c__1, nrhs, &b[
+ b_offset], ldb, info);
+ d__[1] = abs(d__[1]);
+ }
+ return 0;
+ }
+
+/* Rotate the matrix if it is lower bidiagonal. */
+
+ if (*(unsigned char *)uplo == 'L') {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (*nrhs == 1) {
+ drot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], &
+ c__1, &cs, &sn);
+ } else {
+ work[(i__ << 1) - 1] = cs;
+ work[i__ * 2] = sn;
+ }
+/* L10: */
+ }
+ if (*nrhs > 1) {
+ i__1 = *nrhs;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *n - 1;
+ for (j = 1; j <= i__2; ++j) {
+ cs = work[(j << 1) - 1];
+ sn = work[j * 2];
+ drot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__ *
+ b_dim1], &c__1, &cs, &sn);
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ }
+
+/* Scale. */
+
+ nm1 = *n - 1;
+ orgnrm = dlanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.) {
+ dlaset_("A", n, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+ return 0;
+ }
+
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, info);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1,
+ info);
+
+/*
+ If N is smaller than the minimum divide size SMLSIZ, then solve
+ the problem with another solver.
+*/
+
+ if (*n <= *smlsiz) {
+ nwork = *n * *n + 1;
+ dlaset_("A", n, n, &c_b29, &c_b15, &work[1], n);
+ dlasdq_("U", &c__0, n, n, &c__0, nrhs, &d__[1], &e[1], &work[1], n, &
+ work[1], n, &b[b_offset], ldb, &work[nwork], info);
+ if (*info != 0) {
+ return 0;
+ }
+ tol = rcnd * (d__1 = d__[idamax_(n, &d__[1], &c__1)], abs(d__1));
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (d__[i__] <= tol) {
+ dlaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[i__ + b_dim1],
+ ldb);
+ } else {
+ dlascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &b[
+ i__ + b_dim1], ldb, info);
+ ++(*rank);
+ }
+/* L40: */
+ }
+ dgemm_("T", "N", n, nrhs, n, &c_b15, &work[1], n, &b[b_offset], ldb, &
+ c_b29, &work[nwork], n);
+ dlacpy_("A", n, nrhs, &work[nwork], n, &b[b_offset], ldb);
+
+/* Unscale. */
+
+ dlascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n,
+ info);
+ dlasrt_("D", n, &d__[1], info);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset],
+ ldb, info);
+
+ return 0;
+ }
+
+/* Book-keeping and setting up some constants. */
+
+ nlvl = (integer) (log((doublereal) (*n) / (doublereal) (*smlsiz + 1)) /
+ log(2.)) + 1;
+
+ smlszp = *smlsiz + 1;
+
+ u = 1;
+ vt = *smlsiz * *n + 1;
+ difl = vt + smlszp * *n;
+ difr = difl + nlvl * *n;
+ z__ = difr + (nlvl * *n << 1);
+ c__ = z__ + nlvl * *n;
+ s = c__ + *n;
+ poles = s + *n;
+ givnum = poles + (nlvl << 1) * *n;
+ bx = givnum + (nlvl << 1) * *n;
+ nwork = bx + *n * *nrhs;
+
+ sizei = *n + 1;
+ k = sizei + *n;
+ givptr = k + *n;
+ perm = givptr + *n;
+ givcol = perm + nlvl * *n;
+ iwk = givcol + (nlvl * *n << 1);
+
+ st = 1;
+ sqre = 0;
+ icmpq1 = 1;
+ icmpq2 = 0;
+ nsub = 0;
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = d__[i__], abs(d__1)) < eps) {
+ d__[i__] = d_sign(&eps, &d__[i__]);
+ }
+/* L50: */
+ }
+
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = e[i__], abs(d__1)) < eps || i__ == nm1) {
+ ++nsub;
+ iwork[nsub] = st;
+
+/*
+ Subproblem found. First determine its size and then
+ apply divide and conquer on it.
+*/
+
+ if (i__ < nm1) {
+
+/* A subproblem with E(I) small for I < NM1. */
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else if ((d__1 = e[i__], abs(d__1)) >= eps) {
+
+/* A subproblem with E(NM1) not too small but I = NM1. */
+
+ nsize = *n - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else {
+
+/*
+ A subproblem with E(NM1) small. This implies an
+ 1-by-1 subproblem at D(N), which is not solved
+ explicitly.
+*/
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ ++nsub;
+ iwork[nsub] = *n;
+ iwork[sizei + nsub - 1] = 1;
+ dcopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n);
+ }
+ st1 = st - 1;
+ if (nsize == 1) {
+
+/*
+ This is a 1-by-1 subproblem and is not solved
+ explicitly.
+*/
+
+ dcopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n);
+ } else if (nsize <= *smlsiz) {
+
+/* This is a small subproblem and is solved by DLASDQ. */
+
+ dlaset_("A", &nsize, &nsize, &c_b29, &c_b15, &work[vt + st1],
+ n);
+ dlasdq_("U", &c__0, &nsize, &nsize, &c__0, nrhs, &d__[st], &e[
+ st], &work[vt + st1], n, &work[nwork], n, &b[st +
+ b_dim1], ldb, &work[nwork], info);
+ if (*info != 0) {
+ return 0;
+ }
+ dlacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx +
+ st1], n);
+ } else {
+
+/* A large problem. Solve it using divide and conquer. */
+
+ dlasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], &
+ work[u + st1], n, &work[vt + st1], &iwork[k + st1], &
+ work[difl + st1], &work[difr + st1], &work[z__ + st1],
+ &work[poles + st1], &iwork[givptr + st1], &iwork[
+ givcol + st1], n, &iwork[perm + st1], &work[givnum +
+ st1], &work[c__ + st1], &work[s + st1], &work[nwork],
+ &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ bxst = bx + st1;
+ dlalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, &
+ work[bxst], n, &work[u + st1], n, &work[vt + st1], &
+ iwork[k + st1], &work[difl + st1], &work[difr + st1],
+ &work[z__ + st1], &work[poles + st1], &iwork[givptr +
+ st1], &iwork[givcol + st1], n, &iwork[perm + st1], &
+ work[givnum + st1], &work[c__ + st1], &work[s + st1],
+ &work[nwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+ st = i__ + 1;
+ }
+/* L60: */
+ }
+
+/* Apply the singular values and treat the tiny ones as zero. */
+
+ tol = rcnd * (d__1 = d__[idamax_(n, &d__[1], &c__1)], abs(d__1));
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Some of the elements in D can be negative because 1-by-1
+ subproblems were not solved explicitly.
+*/
+
+ if ((d__1 = d__[i__], abs(d__1)) <= tol) {
+ dlaset_("A", &c__1, nrhs, &c_b29, &c_b29, &work[bx + i__ - 1], n);
+ } else {
+ ++(*rank);
+ dlascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &work[
+ bx + i__ - 1], n, info);
+ }
+ d__[i__] = (d__1 = d__[i__], abs(d__1));
+/* L70: */
+ }
+
+/* Now apply back the right singular vectors. */
+
+ icmpq2 = 1;
+ i__1 = nsub;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ st = iwork[i__];
+ st1 = st - 1;
+ nsize = iwork[sizei + i__ - 1];
+ bxst = bx + st1;
+ if (nsize == 1) {
+ dcopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb);
+ } else if (nsize <= *smlsiz) {
+ dgemm_("T", "N", &nsize, nrhs, &nsize, &c_b15, &work[vt + st1], n,
+ &work[bxst], n, &c_b29, &b[st + b_dim1], ldb);
+ } else {
+ dlalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st +
+ b_dim1], ldb, &work[u + st1], n, &work[vt + st1], &iwork[
+ k + st1], &work[difl + st1], &work[difr + st1], &work[z__
+ + st1], &work[poles + st1], &iwork[givptr + st1], &iwork[
+ givcol + st1], n, &iwork[perm + st1], &work[givnum + st1],
+ &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[
+ iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+/* L80: */
+ }
+
+/* Unscale and sort the singular values. */
+
+ dlascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, info);
+ dlasrt_("D", n, &d__[1], info);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset], ldb,
+ info);
+
+ return 0;
+
+/* End of DLALSD */
+
+} /* dlalsd_ */
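+
+/*
+   Illustrative sketch (not part of the LAPACK/f2c sources): the RCOND
+   thresholding rule documented above for DLALSD, applied to an already
+   diagonal problem diag(S)*X = B.  The helper name and the values are
+   hypothetical; the block is compiled out.
+*/
+#if 0
+static void dlalsd_threshold_example(void)
+{
+    doublereal s[3] = {5., 1., 1e-12};   /* singular values             */
+    doublereal b[3] = {10., 2., 3.};     /* right hand side             */
+    doublereal x[3];
+    doublereal rcond = 1e-8, smax = 5., tol = rcond * smax;
+    int i;
+    for (i = 0; i < 3; ++i) {
+        /* X(i) = B(i)/S(i) if S(i) > RCOND*max(S), else 0 */
+        x[i] = (s[i] > tol) ? b[i] / s[i] : 0.;
+    }
+    /* x = {2., 2., 0.} and the effective rank is 2 */
+}
+#endif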
+
+/* Subroutine */ int dlamrg_(integer *n1, integer *n2, doublereal *a, integer
+ *dtrd1, integer *dtrd2, integer *index)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, ind1, ind2, n1sv, n2sv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAMRG will create a permutation list which will merge the elements
+ of A (which is composed of two independently sorted sets) into a
+ single set which is sorted in ascending order.
+
+ Arguments
+ =========
+
+ N1 (input) INTEGER
+ N2 (input) INTEGER
+ These arguments contain the respective lengths of the two
+ sorted lists to be merged.
+
+ A (input) DOUBLE PRECISION array, dimension (N1+N2)
+ The first N1 elements of A contain a list of numbers which
+ are sorted in either ascending or descending order. Likewise
+ for the final N2 elements.
+
+ DTRD1 (input) INTEGER
+ DTRD2 (input) INTEGER
+ These are the strides to be taken through the array A.
+ Allowable strides are 1 and -1. They indicate whether a
+ subset of A is sorted in ascending (DTRDx = 1) or descending
+ (DTRDx = -1) order.
+
+ INDEX (output) INTEGER array, dimension (N1+N2)
+ On exit this array will contain a permutation such that
+ if B( I ) = A( INDEX( I ) ) for I=1,N1+N2, then B will be
+ sorted in ascending order.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --index;
+ --a;
+
+ /* Function Body */
+ n1sv = *n1;
+ n2sv = *n2;
+ if (*dtrd1 > 0) {
+ ind1 = 1;
+ } else {
+ ind1 = *n1;
+ }
+ if (*dtrd2 > 0) {
+ ind2 = *n1 + 1;
+ } else {
+ ind2 = *n1 + *n2;
+ }
+ i__ = 1;
+/* while ( (N1SV > 0) & (N2SV > 0) ) */
+L10:
+ if (n1sv > 0 && n2sv > 0) {
+ if (a[ind1] <= a[ind2]) {
+ index[i__] = ind1;
+ ++i__;
+ ind1 += *dtrd1;
+ --n1sv;
+ } else {
+ index[i__] = ind2;
+ ++i__;
+ ind2 += *dtrd2;
+ --n2sv;
+ }
+ goto L10;
+ }
+/* end while */
+ if (n1sv == 0) {
+ i__1 = n2sv;
+ for (n1sv = 1; n1sv <= i__1; ++n1sv) {
+ index[i__] = ind2;
+ ++i__;
+ ind2 += *dtrd2;
+/* L20: */
+ }
+ } else {
+/* N2SV .EQ. 0 */
+ i__1 = n1sv;
+ for (n2sv = 1; n2sv <= i__1; ++n2sv) {
+ index[i__] = ind1;
+ ++i__;
+ ind1 += *dtrd1;
+/* L30: */
+ }
+ }
+
+ return 0;
+
+/* End of DLAMRG */
+
+} /* dlamrg_ */
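+
+/*
+   Illustrative usage sketch (not part of the LAPACK/f2c sources): merging
+   two ascending lists of length 2 with DLAMRG.  The helper name and the
+   values are hypothetical; the block is compiled out.
+*/
+#if 0
+static void dlamrg_usage_example(void)
+{
+    integer n1 = 2, n2 = 2, dtrd1 = 1, dtrd2 = 1, indx[4];
+    doublereal a[4] = {1., 3., 2., 4.};  /* {1,3} followed by {2,4}      */
+    dlamrg_(&n1, &n2, a, &dtrd1, &dtrd2, indx);
+    /* indx = {1, 3, 2, 4}: a[indx[i]-1] visits 1, 2, 3, 4 in order      */
+}
+#endif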
+
+doublereal dlange_(char *norm, integer *m, integer *n, doublereal *a, integer
+ *lda, doublereal *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal ret_val, d__1, d__2, d__3;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal sum, scale;
+ extern logical lsame_(char *, char *);
+ static doublereal value;
+ extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *,
+ doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLANGE returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ real matrix A.
+
+ Description
+ ===========
+
+ DLANGE returns the value
+
+ DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in DLANGE as described
+ above.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0. When M = 0,
+ DLANGE is set to zero.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0. When N = 0,
+ DLANGE is set to zero.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(M,1).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)),
+ where LWORK >= M when NORM = 'I'; otherwise, WORK is not
+ referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (min(*m,*n) == 0) {
+ value = 0.;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ d__2 = value, d__3 = (d__1 = a[i__ + j * a_dim1], abs(d__1));
+ value = max(d__2,d__3);
+/* L10: */
+ }
+/* L20: */
+ }
+ } else if (lsame_(norm, "O") || *(unsigned char *)
+ norm == '1') {
+
+/* Find norm1(A). */
+
+ value = 0.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ sum += (d__1 = a[i__ + j * a_dim1], abs(d__1));
+/* L30: */
+ }
+ value = max(value,sum);
+/* L40: */
+ }
+ } else if (lsame_(norm, "I")) {
+
+/* Find normI(A). */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.;
+/* L50: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[i__] += (d__1 = a[i__ + j * a_dim1], abs(d__1));
+/* L60: */
+ }
+/* L70: */
+ }
+ value = 0.;
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = work[i__];
+ value = max(d__1,d__2);
+/* L80: */
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.;
+ sum = 1.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ dlassq_(m, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L90: */
+ }
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of DLANGE */
+
+} /* dlange_ */
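+
+/*
+   Illustrative usage sketch (not part of the LAPACK/f2c sources): the four
+   norms DLANGE computes for the column-major 2-by-2 matrix [[1,-2],[3,4]].
+   The helper name and the values are hypothetical; the block is compiled
+   out.
+*/
+#if 0
+static void dlange_usage_example(void)
+{
+    integer m = 2, n = 2, lda = 2;
+    doublereal a[4] = {1., 3., -2., 4.};   /* columns (1,3) and (-2,4)   */
+    doublereal work[2];                    /* only used for NORM = 'I'   */
+    doublereal vm = dlange_("M", &m, &n, a, &lda, work);   /* 4.0        */
+    doublereal v1 = dlange_("O", &m, &n, a, &lda, work);   /* 6.0        */
+    doublereal vi = dlange_("I", &m, &n, a, &lda, work);   /* 7.0        */
+    doublereal vf = dlange_("F", &m, &n, a, &lda, work);   /* sqrt(30)   */
+}
+#endif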
+
+doublereal dlanst_(char *norm, integer *n, doublereal *d__, doublereal *e)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal ret_val, d__1, d__2, d__3, d__4, d__5;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal sum, scale;
+ extern logical lsame_(char *, char *);
+ static doublereal anorm;
+ extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *,
+ doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLANST returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ real symmetric tridiagonal matrix A.
+
+ Description
+ ===========
+
+ DLANST returns the value
+
+ DLANST = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in DLANST as described
+ above.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0. When N = 0, DLANST is
+ set to zero.
+
+ D (input) DOUBLE PRECISION array, dimension (N)
+ The diagonal elements of A.
+
+ E (input) DOUBLE PRECISION array, dimension (N-1)
+ The (n-1) sub-diagonal or super-diagonal elements of A.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --e;
+ --d__;
+
+ /* Function Body */
+ if (*n <= 0) {
+ anorm = 0.;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ anorm = (d__1 = d__[*n], abs(d__1));
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__2 = anorm, d__3 = (d__1 = d__[i__], abs(d__1));
+ anorm = max(d__2,d__3);
+/* Computing MAX */
+ d__2 = anorm, d__3 = (d__1 = e[i__], abs(d__1));
+ anorm = max(d__2,d__3);
+/* L10: */
+ }
+ } else if (lsame_(norm, "O") || *(unsigned char *)
+ norm == '1' || lsame_(norm, "I")) {
+
+/* Find norm1(A). */
+
+ if (*n == 1) {
+ anorm = abs(d__[1]);
+ } else {
+/* Computing MAX */
+ d__3 = abs(d__[1]) + abs(e[1]), d__4 = (d__1 = e[*n - 1], abs(
+ d__1)) + (d__2 = d__[*n], abs(d__2));
+ anorm = max(d__3,d__4);
+ i__1 = *n - 1;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__4 = anorm, d__5 = (d__1 = d__[i__], abs(d__1)) + (d__2 = e[
+ i__], abs(d__2)) + (d__3 = e[i__ - 1], abs(d__3));
+ anorm = max(d__4,d__5);
+/* L20: */
+ }
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.;
+ sum = 1.;
+ if (*n > 1) {
+ i__1 = *n - 1;
+ dlassq_(&i__1, &e[1], &c__1, &scale, &sum);
+ sum *= 2;
+ }
+ dlassq_(n, &d__[1], &c__1, &scale, &sum);
+ anorm = scale * sqrt(sum);
+ }
+
+ ret_val = anorm;
+ return ret_val;
+
+/* End of DLANST */
+
+} /* dlanst_ */
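+
+/*
+   Illustrative usage sketch (not part of the LAPACK/f2c sources): norms of
+   the symmetric tridiagonal matrix with diagonal {2,3} and off-diagonal
+   {1}.  The helper name and the values are hypothetical; the block is
+   compiled out.
+*/
+#if 0
+static void dlanst_usage_example(void)
+{
+    integer n = 2;
+    doublereal d[2] = {2., 3.};
+    doublereal e[1] = {1.};
+    doublereal vm = dlanst_("M", &n, d, e);   /* 3.0                     */
+    doublereal v1 = dlanst_("O", &n, d, e);   /* max(2+1, 1+3) = 4.0     */
+    doublereal vf = dlanst_("F", &n, d, e);   /* sqrt(4+9+2*1) = sqrt(15) */
+}
+#endif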
+
+doublereal dlansy_(char *norm, char *uplo, integer *n, doublereal *a, integer
+ *lda, doublereal *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal ret_val, d__1, d__2, d__3;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal sum, absa, scale;
+ extern logical lsame_(char *, char *);
+ static doublereal value;
+ extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *,
+ doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLANSY returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ real symmetric matrix A.
+
+ Description
+ ===========
+
+ DLANSY returns the value
+
+ DLANSY = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in DLANSY as described
+ above.
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is to be referenced.
+ = 'U': Upper triangular part of A is referenced
+ = 'L': Lower triangular part of A is referenced
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0. When N = 0, DLANSY is
+ set to zero.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The symmetric matrix A. If UPLO = 'U', the leading n by n
+ upper triangular part of A contains the upper triangular part
+ of the matrix A, and the strictly lower triangular part of A
+ is not referenced. If UPLO = 'L', the leading n by n lower
+ triangular part of A contains the lower triangular part of
+ the matrix A, and the strictly upper triangular part of A is
+ not referenced.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(N,1).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)),
+ where LWORK >= N when NORM = 'I' or '1' or 'O'; otherwise,
+ WORK is not referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (*n == 0) {
+ value = 0.;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ d__2 = value, d__3 = (d__1 = a[i__ + j * a_dim1], abs(
+ d__1));
+ value = max(d__2,d__3);
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ d__2 = value, d__3 = (d__1 = a[i__ + j * a_dim1], abs(
+ d__1));
+ value = max(d__2,d__3);
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(norm, "I") || lsame_(norm, "O") || *(unsigned char *)norm == '1') {
+
+/* Find normI(A) ( = norm1(A), since A is symmetric). */
+
+ value = 0.;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ absa = (d__1 = a[i__ + j * a_dim1], abs(d__1));
+ sum += absa;
+ work[i__] += absa;
+/* L50: */
+ }
+ work[j] = sum + (d__1 = a[j + j * a_dim1], abs(d__1));
+/* L60: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = work[i__];
+ value = max(d__1,d__2);
+/* L70: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.;
+/* L80: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = work[j] + (d__1 = a[j + j * a_dim1], abs(d__1));
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ absa = (d__1 = a[i__ + j * a_dim1], abs(d__1));
+ sum += absa;
+ work[i__] += absa;
+/* L90: */
+ }
+ value = max(value,sum);
+/* L100: */
+ }
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.;
+ sum = 1.;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ i__2 = j - 1;
+ dlassq_(&i__2, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L110: */
+ }
+ } else {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n - j;
+ dlassq_(&i__2, &a[j + 1 + j * a_dim1], &c__1, &scale, &sum);
+/* L120: */
+ }
+ }
+ sum *= 2;
+ i__1 = *lda + 1;
+ dlassq_(n, &a[a_offset], &i__1, &scale, &sum);
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of DLANSY */
+
+} /* dlansy_ */
+
+/* Subroutine */ int dlanv2_(doublereal *a, doublereal *b, doublereal *c__,
+ doublereal *d__, doublereal *rt1r, doublereal *rt1i, doublereal *rt2r,
+ doublereal *rt2i, doublereal *cs, doublereal *sn)
+{
+ /* System generated locals */
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal p, z__, aa, bb, cc, dd, cs1, sn1, sab, sac, eps, tau,
+ temp, scale, bcmax, bcmis, sigma;
+
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLANV2 computes the Schur factorization of a real 2-by-2 nonsymmetric
+ matrix in standard form:
+
+ [ A B ] = [ CS -SN ] [ AA BB ] [ CS SN ]
+ [ C D ] [ SN CS ] [ CC DD ] [-SN CS ]
+
+ where either
+ 1) CC = 0 so that AA and DD are real eigenvalues of the matrix, or
+ 2) AA = DD and BB*CC < 0, so that AA + or - sqrt(BB*CC) are complex
+ conjugate eigenvalues.
+
+ Arguments
+ =========
+
+ A (input/output) DOUBLE PRECISION
+ B (input/output) DOUBLE PRECISION
+ C (input/output) DOUBLE PRECISION
+ D (input/output) DOUBLE PRECISION
+ On entry, the elements of the input matrix.
+ On exit, they are overwritten by the elements of the
+ standardised Schur form.
+
+ RT1R (output) DOUBLE PRECISION
+ RT1I (output) DOUBLE PRECISION
+ RT2R (output) DOUBLE PRECISION
+ RT2I (output) DOUBLE PRECISION
+ The real and imaginary parts of the eigenvalues. If the
+ eigenvalues are a complex conjugate pair, RT1I > 0.
+
+ CS (output) DOUBLE PRECISION
+ SN (output) DOUBLE PRECISION
+ Parameters of the rotation matrix.
+
+ Further Details
+ ===============
+
+ Modified by V. Sima, Research Institute for Informatics, Bucharest,
+ Romania, to reduce the risk of cancellation errors,
+ when computing real eigenvalues, and to ensure, if possible, that
+ abs(RT1R) >= abs(RT2R).
+
+ =====================================================================
+*/
+
+
+ eps = PRECISION;
+ if (*c__ == 0.) {
+ *cs = 1.;
+ *sn = 0.;
+ goto L10;
+
+ } else if (*b == 0.) {
+
+/* Swap rows and columns */
+
+ *cs = 0.;
+ *sn = 1.;
+ temp = *d__;
+ *d__ = *a;
+ *a = temp;
+ *b = -(*c__);
+ *c__ = 0.;
+ goto L10;
+ } else if (*a - *d__ == 0. && d_sign(&c_b15, b) != d_sign(&c_b15, c__)) {
+ *cs = 1.;
+ *sn = 0.;
+ goto L10;
+ } else {
+
+ temp = *a - *d__;
+ p = temp * .5;
+/* Computing MAX */
+ d__1 = abs(*b), d__2 = abs(*c__);
+ bcmax = max(d__1,d__2);
+/* Computing MIN */
+ d__1 = abs(*b), d__2 = abs(*c__);
+ bcmis = min(d__1,d__2) * d_sign(&c_b15, b) * d_sign(&c_b15, c__);
+/* Computing MAX */
+ d__1 = abs(p);
+ scale = max(d__1,bcmax);
+ z__ = p / scale * p + bcmax / scale * bcmis;
+
+/*
+ If Z is of the order of the machine accuracy, postpone the
+ decision on the nature of eigenvalues
+*/
+
+ if (z__ >= eps * 4.) {
+
+/* Real eigenvalues. Compute A and D. */
+
+ d__1 = sqrt(scale) * sqrt(z__);
+ z__ = p + d_sign(&d__1, &p);
+ *a = *d__ + z__;
+ *d__ -= bcmax / z__ * bcmis;
+
+/* Compute B and the rotation matrix */
+
+ tau = dlapy2_(c__, &z__);
+ *cs = z__ / tau;
+ *sn = *c__ / tau;
+ *b -= *c__;
+ *c__ = 0.;
+ } else {
+
+/*
+ Complex eigenvalues, or real (almost) equal eigenvalues.
+ Make diagonal elements equal.
+*/
+
+ sigma = *b + *c__;
+ tau = dlapy2_(&sigma, &temp);
+ *cs = sqrt((abs(sigma) / tau + 1.) * .5);
+ *sn = -(p / (tau * *cs)) * d_sign(&c_b15, &sigma);
+
+/*
+ Compute [ AA BB ] = [ A B ] [ CS -SN ]
+ [ CC DD ] [ C D ] [ SN CS ]
+*/
+
+ aa = *a * *cs + *b * *sn;
+ bb = -(*a) * *sn + *b * *cs;
+ cc = *c__ * *cs + *d__ * *sn;
+ dd = -(*c__) * *sn + *d__ * *cs;
+
+/*
+ Compute [ A B ] = [ CS SN ] [ AA BB ]
+ [ C D ] [-SN CS ] [ CC DD ]
+*/
+
+ *a = aa * *cs + cc * *sn;
+ *b = bb * *cs + dd * *sn;
+ *c__ = -aa * *sn + cc * *cs;
+ *d__ = -bb * *sn + dd * *cs;
+
+ temp = (*a + *d__) * .5;
+ *a = temp;
+ *d__ = temp;
+
+ if (*c__ != 0.) {
+ if (*b != 0.) {
+ if (d_sign(&c_b15, b) == d_sign(&c_b15, c__)) {
+
+/* Real eigenvalues: reduce to upper triangular form */
+
+ sab = sqrt((abs(*b)));
+ sac = sqrt((abs(*c__)));
+ d__1 = sab * sac;
+ p = d_sign(&d__1, c__);
+ tau = 1. / sqrt((d__1 = *b + *c__, abs(d__1)));
+ *a = temp + p;
+ *d__ = temp - p;
+ *b -= *c__;
+ *c__ = 0.;
+ cs1 = sab * tau;
+ sn1 = sac * tau;
+ temp = *cs * cs1 - *sn * sn1;
+ *sn = *cs * sn1 + *sn * cs1;
+ *cs = temp;
+ }
+ } else {
+ *b = -(*c__);
+ *c__ = 0.;
+ temp = *cs;
+ *cs = -(*sn);
+ *sn = temp;
+ }
+ }
+ }
+
+ }
+
+L10:
+
+/* Store eigenvalues in (RT1R,RT1I) and (RT2R,RT2I). */
+
+ *rt1r = *a;
+ *rt2r = *d__;
+ if (*c__ == 0.) {
+ *rt1i = 0.;
+ *rt2i = 0.;
+ } else {
+ *rt1i = sqrt((abs(*b))) * sqrt((abs(*c__)));
+ *rt2i = -(*rt1i);
+ }
+ return 0;
+
+/* End of DLANV2 */
+
+} /* dlanv2_ */
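+
+/*
+   Illustrative usage sketch (not part of the LAPACK/f2c sources): the
+   2-by-2 matrix [[0,1],[-1,0]] is already in standard Schur form, so
+   DLANV2 returns the identity rotation and the eigenvalues +/- i.  The
+   helper name is hypothetical; the block is compiled out.
+*/
+#if 0
+static void dlanv2_usage_example(void)
+{
+    doublereal a = 0., b = 1., c = -1., d = 0.;
+    doublereal rt1r, rt1i, rt2r, rt2i, cs, sn;
+    dlanv2_(&a, &b, &c, &d, &rt1r, &rt1i, &rt2r, &rt2i, &cs, &sn);
+    /* cs = 1, sn = 0, rt1r = rt2r = 0, rt1i = 1, rt2i = -1 */
+}
+#endif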
+
+doublereal dlapy2_(doublereal *x, doublereal *y)
+{
+ /* System generated locals */
+ doublereal ret_val, d__1;
+
+ /* Local variables */
+ static doublereal w, z__, xabs, yabs;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAPY2 returns sqrt(x**2+y**2), taking care not to cause unnecessary
+ overflow.
+
+ Arguments
+ =========
+
+ X (input) DOUBLE PRECISION
+ Y (input) DOUBLE PRECISION
+ X and Y specify the values x and y.
+
+ =====================================================================
+*/
+
+
+ xabs = abs(*x);
+ yabs = abs(*y);
+ w = max(xabs,yabs);
+ z__ = min(xabs,yabs);
+ if (z__ == 0.) {
+ ret_val = w;
+ } else {
+/* Computing 2nd power */
+ d__1 = z__ / w;
+ ret_val = w * sqrt(d__1 * d__1 + 1.);
+ }
+ return ret_val;
+
+/* End of DLAPY2 */
+
+} /* dlapy2_ */
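+
+/*
+   Illustrative sketch (not part of the LAPACK/f2c sources): DLAPY2 factors
+   out the larger magnitude, so sqrt(x**2 + y**2) is computed even when
+   x*x or y*y alone would overflow.  The helper name and the values are
+   hypothetical; the block is compiled out.
+*/
+#if 0
+static void dlapy2_usage_example(void)
+{
+    doublereal x = 3e300, y = 4e300;
+    doublereal r = dlapy2_(&x, &y);
+    /* r = 5e300, although x*x and y*y both overflow in double precision */
+}
+#endif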
+
+doublereal dlapy3_(doublereal *x, doublereal *y, doublereal *z__)
+{
+ /* System generated locals */
+ doublereal ret_val, d__1, d__2, d__3;
+
+ /* Local variables */
+ static doublereal w, xabs, yabs, zabs;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAPY3 returns sqrt(x**2+y**2+z**2), taking care not to cause
+ unnecessary overflow.
+
+ Arguments
+ =========
+
+ X (input) DOUBLE PRECISION
+ Y (input) DOUBLE PRECISION
+ Z (input) DOUBLE PRECISION
+ X, Y and Z specify the values x, y and z.
+
+ =====================================================================
+*/
+
+
+ xabs = abs(*x);
+ yabs = abs(*y);
+ zabs = abs(*z__);
+/* Computing MAX */
+ d__1 = max(xabs,yabs);
+ w = max(d__1,zabs);
+ if (w == 0.) {
+/*
+ W can be zero for max(0,nan,0)
+ adding all three entries together will make sure
+ NaN will not disappear.
+*/
+ ret_val = xabs + yabs + zabs;
+ } else {
+/* Computing 2nd power */
+ d__1 = xabs / w;
+/* Computing 2nd power */
+ d__2 = yabs / w;
+/* Computing 2nd power */
+ d__3 = zabs / w;
+ ret_val = w * sqrt(d__1 * d__1 + d__2 * d__2 + d__3 * d__3);
+ }
+ return ret_val;
+
+/* End of DLAPY3 */
+
+} /* dlapy3_ */
+
+/* Subroutine */ int dlaqr0_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal
+ *wr, doublereal *wi, integer *iloz, integer *ihiz, doublereal *z__,
+ integer *ldz, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal aa, bb, cc, dd;
+ static integer ld;
+ static doublereal cs;
+ static integer nh, it, ks, kt;
+ static doublereal sn;
+ static integer ku, kv, ls, ns;
+ static doublereal ss;
+ static integer nw, inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl,
+ kbot, nmin;
+ static doublereal swap;
+ static integer ktop;
+ static doublereal zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *), dlaqr3_(
+ logical *, logical *, integer *, integer *, integer *, integer *,
+ doublereal *, integer *, integer *, integer *, doublereal *,
+ integer *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *),
+ dlaqr4_(logical *, logical *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ integer *), dlaqr5_(logical *, logical *, integer *, integer *,
+ integer *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *, integer *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ integer *, doublereal *, integer *, integer *, doublereal *,
+ integer *);
+ static integer nibble;
+ extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *), dlacpy_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ static integer nwupbd;
+ static logical sorted;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAQR0 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+ Schur form), and Z is the orthogonal matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input orthogonal
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to DGEBAL, and then passed to DGEHRD when the
+ matrix output by DGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) DOUBLE PRECISION array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H contains
+ the upper quasi-triangular matrix T from the Schur
+ decomposition (the Schur form); 2-by-2 diagonal blocks
+ (corresponding to complex conjugate pairs of eigenvalues)
+ are returned in standard form, with H(i,i) = H(i+1,i+1)
+ and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ WR (output) DOUBLE PRECISION array, dimension (IHI)
+ WI (output) DOUBLE PRECISION array, dimension (IHI)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues of H(ILO:IHI,ILO:IHI) are stored in WR(ILO:IHI)
+ and WI(ILO:IHI). If two eigenvalues are computed as a
+ complex conjugate pair, they are stored in consecutive
+ elements of WR and WI, say the i-th and (i+1)th, with
+ WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+ the eigenvalues are stored in the same order as on the
+ diagonal of the Schur form returned in H, with
+ WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
+ block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
+ WI(i+1) = -WI(i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if WANTZ is .TRUE.
+ then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient, but a value of LWORK as large as 6*N may
+ be required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then DLAQR0 does a workspace query.
+ In this case, DLAQR0 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, DLAQR0 failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
+ and WI contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and quasi-triangular
+ in rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+ (final value of Z(ILO:IHI,ILOZ:IHIZ)
+ = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
+
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . DLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constants WILK1 and WILK2 are used to form the
+ . exceptional shifts. ====
+*/
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use DLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ dlahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1], &
+ wi[1], iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "DLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+ . At this point N .GT. NTINY = 11, so there is
+ . enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "DLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to DLAQR3 ====
+*/
+
+ i__1 = nwr + 1;
+ dlaqr3_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1], &h__[
+ h_offset], ldh, n, &h__[h_offset], ldh, n, &h__[h_offset],
+ ldh, &work[1], &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(DLAQR5, DLAQR3) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1];
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (doublereal) lwkopt;
+ return 0;
+ }
+
+/* ==== DLAHQR/DLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "DLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "DLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+ ==== Accumulate reflections during ttswp? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "DLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L90;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (h__[k + (k - 1) * h_dim1] == 0.) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+ . size. At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ if ((d__1 = h__[kwtop + (kwtop - 1) * h_dim1], abs(d__1))
+ > (d__2 = h__[kwtop - 1 + (kwtop - 2) * h_dim1],
+ abs(d__2))) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+ . - an at-least-NW-but-more-is-better (NHV-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ dlaqr3_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1],
+ &h__[kv + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if DLAQR3
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . DLAQR3 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+/* Computing MAX */
+ i__3 = ks + 1, i__4 = ktop + 2;
+ i__2 = max(i__3,i__4);
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ ss = (d__1 = h__[i__ + (i__ - 1) * h_dim1], abs(d__1))
+ + (d__2 = h__[i__ - 1 + (i__ - 2) * h_dim1],
+ abs(d__2));
+ aa = ss * .75 + h__[i__ + i__ * h_dim1];
+ bb = ss;
+ cc = ss * -.4375;
+ dd = aa;
+ dlanv2_(&aa, &bb, &cc, &dd, &wr[i__ - 1], &wi[i__ - 1]
+ , &wr[i__], &wi[i__], &cs, &sn);
+/* L30: */
+ }
+ if (ks == ktop) {
+ wr[ks + 1] = h__[ks + 1 + (ks + 1) * h_dim1];
+ wi[ks + 1] = 0.;
+ wr[ks] = wr[ks + 1];
+ wi[ks] = wi[ks + 1];
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use DLAQR4 or
+ . DLAHQR on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ dlacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ if (ns > nmin) {
+ dlaqr4_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &wr[ks], &wi[ks], &
+ c__1, &c__1, zdum, &c__1, &work[1], lwork,
+ &inf);
+ } else {
+ dlahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &wr[ks], &wi[ks], &
+ c__1, &c__1, zdum, &c__1, &inf);
+ }
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. ====
+*/
+
+ if (ks >= kbot) {
+ aa = h__[kbot - 1 + (kbot - 1) * h_dim1];
+ cc = h__[kbot + (kbot - 1) * h_dim1];
+ bb = h__[kbot - 1 + kbot * h_dim1];
+ dd = h__[kbot + kbot * h_dim1];
+ dlanv2_(&aa, &bb, &cc, &dd, &wr[kbot - 1], &wi[
+ kbot - 1], &wr[kbot], &wi[kbot], &cs, &sn)
+ ;
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/*
+ ==== Sort the shifts (Helps a little)
+ . Bubble sort keeps complex conjugate
+ . pairs together. ====
+*/
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ if ((d__1 = wr[i__], abs(d__1)) + (d__2 = wi[
+ i__], abs(d__2)) < (d__3 = wr[i__ + 1]
+ , abs(d__3)) + (d__4 = wi[i__ + 1],
+ abs(d__4))) {
+ sorted = FALSE_;
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ + 1];
+ wr[i__ + 1] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ + 1];
+ wi[i__ + 1] = swap;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+
+/*
+ ==== Shuffle shifts into pairs of real shifts
+ . and pairs of complex conjugate shifts
+ . assuming complex conjugate shifts are
+ . already adjacent to one another. (Yes,
+ . they are.) ====
+*/
+
+ i__2 = ks + 2;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ if (wi[i__] != -wi[i__ - 1]) {
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ - 1];
+ wr[i__ - 1] = wr[i__ - 2];
+ wr[i__ - 2] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ - 1];
+ wi[i__ - 1] = wi[i__ - 2];
+ wi[i__ - 2] = swap;
+ }
+/* L70: */
+ }
+ }
+
+/*
+ ==== If there are only two shifts and both are
+ . real, then use only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ if (wi[kbot] == 0.) {
+ if ((d__1 = wr[kbot] - h__[kbot + kbot * h_dim1], abs(
+ d__1)) < (d__2 = wr[kbot - 1] - h__[kbot +
+ kbot * h_dim1], abs(d__2))) {
+ wr[kbot - 1] = wr[kbot];
+ } else {
+ wr[kbot] = wr[kbot - 1];
+ }
+ }
+ }
+
+/*
+ ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+ . (KDU-by-NHO) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+ . (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ dlaqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &wr[ks],
+ &wi[ks], &h__[h_offset], ldh, iloz, ihiz, &z__[
+ z_offset], ldz, &work[1], &c__3, &h__[ku + h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &nho, &h__[ku +
+ kwh * h_dim1], ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L80:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L90:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ work[1] = (doublereal) lwkopt;
+
+/* ==== End of DLAQR0 ==== */
+
+ return 0;
+} /* dlaqr0_ */
+
+/* Subroutine */ int dlaqr1_(integer *n, doublereal *h__, integer *ldh,
+ doublereal *sr1, doublereal *si1, doublereal *sr2, doublereal *si2,
+ doublereal *v)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset;
+ doublereal d__1, d__2, d__3;
+
+ /* Local variables */
+ static doublereal s, h21s, h31s;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Given a 2-by-2 or 3-by-3 matrix H, DLAQR1 sets v to a
+ scalar multiple of the first column of the product
+
+ (*) K = (H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I)
+
+ scaling to avoid overflows and most underflows. It
+ is assumed that either
+
+ 1) sr1 = sr2 and si1 = -si2
+ or
+ 2) si1 = si2 = 0.
+
+ This is useful for starting double implicit shift bulges
+ in the QR algorithm.
+
+
+ N (input) integer
+ Order of the matrix H. N must be either 2 or 3.
+
+ H (input) DOUBLE PRECISION array of dimension (LDH,N)
+ The 2-by-2 or 3-by-3 matrix H in (*).
+
+ LDH (input) integer
+ The leading dimension of H as declared in
+ the calling procedure. LDH.GE.N
+
+ SR1 (input) DOUBLE PRECISION
+ SI1 The shifts in (*).
+ SR2
+ SI2
+
+ V (output) DOUBLE PRECISION array of dimension N
+ A scalar multiple of the first column of the
+ matrix K in (*).
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --v;
+
+ /* Function Body */
+ if (*n == 2) {
+ s = (d__1 = h__[h_dim1 + 1] - *sr2, abs(d__1)) + abs(*si2) + (d__2 =
+ h__[h_dim1 + 2], abs(d__2));
+ if (s == 0.) {
+ v[1] = 0.;
+ v[2] = 0.;
+ } else {
+ h21s = h__[h_dim1 + 2] / s;
+ v[1] = h21s * h__[(h_dim1 << 1) + 1] + (h__[h_dim1 + 1] - *sr1) *
+ ((h__[h_dim1 + 1] - *sr2) / s) - *si1 * (*si2 / s);
+ v[2] = h21s * (h__[h_dim1 + 1] + h__[(h_dim1 << 1) + 2] - *sr1 - *
+ sr2);
+ }
+ } else {
+ s = (d__1 = h__[h_dim1 + 1] - *sr2, abs(d__1)) + abs(*si2) + (d__2 =
+ h__[h_dim1 + 2], abs(d__2)) + (d__3 = h__[h_dim1 + 3], abs(
+ d__3));
+ if (s == 0.) {
+ v[1] = 0.;
+ v[2] = 0.;
+ v[3] = 0.;
+ } else {
+ h21s = h__[h_dim1 + 2] / s;
+ h31s = h__[h_dim1 + 3] / s;
+ v[1] = (h__[h_dim1 + 1] - *sr1) * ((h__[h_dim1 + 1] - *sr2) / s)
+ - *si1 * (*si2 / s) + h__[(h_dim1 << 1) + 1] * h21s + h__[
+ h_dim1 * 3 + 1] * h31s;
+ v[2] = h21s * (h__[h_dim1 + 1] + h__[(h_dim1 << 1) + 2] - *sr1 - *
+ sr2) + h__[h_dim1 * 3 + 2] * h31s;
+ v[3] = h31s * (h__[h_dim1 + 1] + h__[h_dim1 * 3 + 3] - *sr1 - *
+ sr2) + h21s * h__[(h_dim1 << 1) + 3];
+ }
+ }
+ return 0;
+} /* dlaqr1_ */
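+
+/*
+   Illustrative usage sketch (not part of the LAPACK/f2c sources): for the
+   2-by-2 matrix H = [[2,1],[1,2]] and the real double shift s1 = s2 = 1,
+   K = (H - I)*(H - I) has first column (2,2)**T, and DLAQR1 returns the
+   scalar multiple v = (1,1)**T.  The helper name is hypothetical; the
+   block is compiled out.
+*/
+#if 0
+static void dlaqr1_usage_example(void)
+{
+    integer n = 2, ldh = 2;
+    doublereal h[4] = {2., 1., 1., 2.};  /* column-major 2-by-2          */
+    doublereal sr1 = 1., si1 = 0., sr2 = 1., si2 = 0., v[2];
+    dlaqr1_(&n, h, &ldh, &sr1, &si1, &sr2, &si2, v);
+    /* v = {1., 1.} */
+}
+#endif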
+
+/* Subroutine */ int dlaqr2_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, doublereal *h__, integer *
+ ldh, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz,
+ integer *ns, integer *nd, doublereal *sr, doublereal *si, doublereal *
+ v, integer *ldv, integer *nh, doublereal *t, integer *ldt, integer *
+ nv, doublereal *wv, integer *ldwv, doublereal *work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static doublereal s, aa, bb, cc, dd, cs, sn;
+ static integer jw;
+ static doublereal evi, evk, foo;
+ static integer kln;
+ static doublereal tau, ulp;
+ static integer lwk1, lwk2;
+ static doublereal beta;
+ static integer kend, kcol, info, ifst, ilst, ltop, krow;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *), dgemm_(char *, char *, integer *, integer *
+ , integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static logical bulge;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer infqr, kwtop;
+ extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *), dlabad_(
+ doublereal *, doublereal *);
+
+ extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *), dlarfg_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *), dlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *), dlacpy_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *, integer *);
+ static doublereal safmin;
+ extern /* Subroutine */ int dlaset_(char *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *);
+ static doublereal safmax;
+ extern /* Subroutine */ int dtrexc_(char *, integer *, doublereal *,
+ integer *, doublereal *, integer *, integer *, integer *,
+ doublereal *, integer *), dormhr_(char *, char *, integer
+ *, integer *, integer *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ integer *);
+ static logical sorted;
+ static doublereal smlnum;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- June 2010 --
+
+
+ This subroutine is identical to DLAQR3 except that it avoids
+ recursion by calling DLAHQR instead of DLAQR4.
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+ H and performs an orthogonal similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+ an orthogonal similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the quasi-triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the orthogonal matrix Z is updated so
+ that the orthogonal Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the orthogonal matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) DOUBLE PRECISION array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by an orthogonal
+ similarity transformation, perturbed, and the returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the orthogonal
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+ The number of unconverged (i.e., approximate) eigenvalues
+ returned in SR and SI that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SR (output) DOUBLE PRECISION array, dimension (KBOT)
+ SI (output) DOUBLE PRECISION array, dimension (KBOT)
+ On output, the real and imaginary parts of approximate
+ eigenvalues that may be used for shifts are stored in
+ SR(KBOT-ND-NS+1) through SR(KBOT-ND) and
+ SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively.
+ The real and imaginary parts of converged eigenvalues
+ are stored in SR(KBOT-ND+1) through SR(KBOT) and
+ SI(KBOT-ND+1) through SI(KBOT), respectively.
+
+ V (workspace) DOUBLE PRECISION array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) DOUBLE PRECISION array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) DOUBLE PRECISION array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+ The leading dimension of WV just as declared in the
+ calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (LWORK)
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; DLAQR2
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z is accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
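+
+/*
+   ==== Illustrative caller sketch (not from the LAPACK documentation
+   . above; all variable names are hypothetical).  LWORK = -1 is the
+   . standard LAPACK workspace query: call once with LWORK = -1 to
+   . obtain the optimal size in WORK(1), then call again with a
+   . workspace at least that large, e.g.
+   .
+   .     doublereal wkopt;
+   .     doublereal *work;
+   .     integer lwork = -1;
+   .     dlaqr2_(&wantt, &wantz, &n, &ktop, &kbot, &nw, h, &ldh,
+   .             &iloz, &ihiz, z, &ldz, &ns, &nd, sr, si, v, &ldv,
+   .             &nh, t, &ldt, &nv, wv, &ldwv, &wkopt, &lwork);
+   .     lwork = (integer) wkopt;
+   .     if (lwork < 2 * nw) lwork = 2 * nw;
+   .     work = (doublereal *) malloc(lwork * sizeof(doublereal));
+   .     dlaqr2_(&wantt, &wantz, &n, &ktop, &kbot, &nw, h, &ldh,
+   .             &iloz, &ihiz, z, &ldz, &ns, &nd, sr, si, v, &ldv,
+   .             &nh, t, &ldt, &nv, wv, &ldwv, work, &lwork);
+   .
+   . The guard against 2*NW follows the LWORK guarantee documented
+   . above. ====
+*/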
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sr;
+ --si;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to DGEHRD ==== */
+
+ i__1 = jw - 1;
+ dgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1];
+
+/* ==== Workspace query call to DORMHR ==== */
+
+ i__1 = jw - 1;
+ dormhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1];
+
+/* ==== Optimal workspace ==== */
+
+ lwkopt = jw + max(lwk1,lwk2);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (doublereal) lwkopt;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1] = 1.;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s = 0.;
+ } else {
+ s = h__[kwtop + (kwtop - 1) * h_dim1];
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ sr[kwtop] = h__[kwtop + kwtop * h_dim1];
+ si[kwtop] = 0.;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ d__2 = smlnum, d__3 = ulp * (d__1 = h__[kwtop + kwtop * h_dim1], abs(
+ d__1));
+ if (abs(s) <= max(d__2,d__3)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = 0.;
+ }
+ }
+ work[1] = 1.;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ dlacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ dcopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ dlaset_("A", &jw, &jw, &c_b29, &c_b15, &v[v_offset], ldv);
+ dlahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[kwtop],
+ &si[kwtop], &c__1, &jw, &v[v_offset], ldv, &infqr);
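+
+/*
+   ==== At this point DLAHQR has reduced the deflation window
+   . H(KWTOP:KBOT,KWTOP:KBOT) to real Schur form T, with the
+   . accumulated orthogonal transformation in V.  In that basis the
+   . coupling H(KWTOP,KWTOP-1) = s becomes the "spike" s*V(1,1:JW),
+   . and the deflation tests below examine its trailing entries
+   . s*V(1,NS). ====
+*/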
+
+/* ==== DTREXC needs a clean margin near the diagonal ==== */
+
+ i__1 = jw - 3;
+ for (j = 1; j <= i__1; ++j) {
+ t[j + 2 + j * t_dim1] = 0.;
+ t[j + 3 + j * t_dim1] = 0.;
+/* L10: */
+ }
+ if (jw > 2) {
+ t[jw + (jw - 2) * t_dim1] = 0.;
+ }
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+L20:
+ if (ilst <= *ns) {
+ if (*ns == 1) {
+ bulge = FALSE_;
+ } else {
+ bulge = t[*ns + (*ns - 1) * t_dim1] != 0.;
+ }
+
+/* ==== Small spike tip test for deflation ==== */
+
+ if (! bulge) {
+
+/* ==== Real eigenvalue ==== */
+
+ foo = (d__1 = t[*ns + *ns * t_dim1], abs(d__1));
+ if (foo == 0.) {
+ foo = abs(s);
+ }
+/* Computing MAX */
+ d__2 = smlnum, d__3 = ulp * foo;
+ if ((d__1 = s * v[*ns * v_dim1 + 1], abs(d__1)) <= max(d__2,d__3))
+ {
+
+/* ==== Deflatable ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== Undeflatable. Move it up out of the way.
+ . (DTREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ dtrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ++ilst;
+ }
+ } else {
+
+/* ==== Complex conjugate pair ==== */
+
+ foo = (d__3 = t[*ns + *ns * t_dim1], abs(d__3)) + sqrt((d__1 = t[*
+ ns + (*ns - 1) * t_dim1], abs(d__1))) * sqrt((d__2 = t[*
+ ns - 1 + *ns * t_dim1], abs(d__2)));
+ if (foo == 0.) {
+ foo = abs(s);
+ }
+/* Computing MAX */
+ d__3 = (d__1 = s * v[*ns * v_dim1 + 1], abs(d__1)), d__4 = (d__2 =
+ s * v[(*ns - 1) * v_dim1 + 1], abs(d__2));
+/* Computing MAX */
+ d__5 = smlnum, d__6 = ulp * foo;
+ if (max(d__3,d__4) <= max(d__5,d__6)) {
+
+/* ==== Deflatable ==== */
+
+ *ns += -2;
+ } else {
+
+/*
+ ==== Undeflatable. Move them up out of the way.
+ . Fortunately, DTREXC does the right thing with
+ . ILST in case of a rare exchange failure. ====
+*/
+
+ ifst = *ns;
+ dtrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ilst += 2;
+ }
+ }
+
+/* ==== End deflation detection loop ==== */
+
+ goto L20;
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s = 0.;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting diagonal blocks of T improves accuracy for
+ . graded matrices. Bubble sort deals well with
+ . exchange failures. ====
+*/
+
+ sorted = FALSE_;
+ i__ = *ns + 1;
+L30:
+ if (sorted) {
+ goto L50;
+ }
+ sorted = TRUE_;
+
+ kend = i__ - 1;
+ i__ = infqr + 1;
+ if (i__ == *ns) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+L40:
+ if (k <= kend) {
+ if (k == i__ + 1) {
+ evi = (d__1 = t[i__ + i__ * t_dim1], abs(d__1));
+ } else {
+ evi = (d__3 = t[i__ + i__ * t_dim1], abs(d__3)) + sqrt((d__1 =
+ t[i__ + 1 + i__ * t_dim1], abs(d__1))) * sqrt((d__2 =
+ t[i__ + (i__ + 1) * t_dim1], abs(d__2)));
+ }
+
+ if (k == kend) {
+ evk = (d__1 = t[k + k * t_dim1], abs(d__1));
+ } else if (t[k + 1 + k * t_dim1] == 0.) {
+ evk = (d__1 = t[k + k * t_dim1], abs(d__1));
+ } else {
+ evk = (d__3 = t[k + k * t_dim1], abs(d__3)) + sqrt((d__1 = t[
+ k + 1 + k * t_dim1], abs(d__1))) * sqrt((d__2 = t[k +
+ (k + 1) * t_dim1], abs(d__2)));
+ }
+
+ if (evi >= evk) {
+ i__ = k;
+ } else {
+ sorted = FALSE_;
+ ifst = i__;
+ ilst = k;
+ dtrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ if (info == 0) {
+ i__ = ilst;
+ } else {
+ i__ = k;
+ }
+ }
+ if (i__ == kend) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+ goto L40;
+ }
+ goto L30;
+L50:
+ ;
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__ = jw;
+L60:
+ if (i__ >= infqr + 1) {
+ if (i__ == infqr + 1) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.;
+ --i__;
+ } else if (t[i__ + (i__ - 1) * t_dim1] == 0.) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.;
+ --i__;
+ } else {
+ aa = t[i__ - 1 + (i__ - 1) * t_dim1];
+ cc = t[i__ + (i__ - 1) * t_dim1];
+ bb = t[i__ - 1 + i__ * t_dim1];
+ dd = t[i__ + i__ * t_dim1];
+ dlanv2_(&aa, &bb, &cc, &dd, &sr[kwtop + i__ - 2], &si[kwtop + i__
+ - 2], &sr[kwtop + i__ - 1], &si[kwtop + i__ - 1], &cs, &
+ sn);
+ i__ += -2;
+ }
+ goto L60;
+ }
+
+ if (*ns < jw || s == 0.) {
+ if (*ns > 1 && s != 0.) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ dcopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ beta = work[1];
+ dlarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1] = 1.;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ dlaset_("L", &i__1, &i__2, &c_b29, &c_b29, &t[t_dim1 + 3], ldt);
+
+ dlarf_("L", ns, &jw, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ dlarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ dlarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ dgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = s * v[v_dim1 + 1];
+ }
+ dlacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ dcopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+ ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && s != 0.) {
+ i__1 = *lwork - jw;
+ dormhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ dgemm_("N", "N", &kln, &jw, &jw, &c_b15, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b29, &wv[wv_offset],
+ ldwv);
+ dlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L70: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ dgemm_("C", "N", &jw, &kln, &jw, &c_b15, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b29, &t[t_offset],
+ ldt);
+ dlacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L80: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ dgemm_("N", "N", &kln, &jw, &jw, &c_b15, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b29, &wv[
+ wv_offset], ldwv);
+ dlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L90: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ work[1] = (doublereal) lwkopt;
+
+/* ==== End of DLAQR2 ==== */
+
+ return 0;
+} /* dlaqr2_ */
+
+/* Subroutine */ int dlaqr3_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, doublereal *h__, integer *
+ ldh, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz,
+ integer *ns, integer *nd, doublereal *sr, doublereal *si, doublereal *
+ v, integer *ldv, integer *nh, doublereal *t, integer *ldt, integer *
+ nv, doublereal *wv, integer *ldwv, doublereal *work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static doublereal s, aa, bb, cc, dd, cs, sn;
+ static integer jw;
+ static doublereal evi, evk, foo;
+ static integer kln;
+ static doublereal tau, ulp;
+ static integer lwk1, lwk2, lwk3;
+ static doublereal beta;
+ static integer kend, kcol, info, nmin, ifst, ilst, ltop, krow;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *), dgemm_(char *, char *, integer *, integer *
+ , integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static logical bulge;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer infqr, kwtop;
+ extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *), dlaqr4_(
+ logical *, logical *, integer *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, integer *),
+ dlabad_(doublereal *, doublereal *);
+
+ extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *), dlarfg_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *), dlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *), dlacpy_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *, integer *);
+ static doublereal safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static doublereal safmax;
+ extern /* Subroutine */ int dlaset_(char *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *),
+ dtrexc_(char *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, doublereal *, integer *),
+ dormhr_(char *, char *, integer *, integer *, integer *, integer
+ *, doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, integer *);
+ static logical sorted;
+ static doublereal smlnum;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- June 2010 --
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+ H and performs an orthogonal similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+ an orthogonal similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the quasi-triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the orthogonal matrix Z is updated so
+ that the orthogonal Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the orthogonal matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) DOUBLE PRECISION array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by an orthogonal
+ similarity transformation, perturbed, and then returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the orthogonal
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+ The number of unconverged (i.e., approximate) eigenvalues
+ returned in SR and SI that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SR (output) DOUBLE PRECISION array, dimension (KBOT)
+ SI (output) DOUBLE PRECISION array, dimension (KBOT)
+ On output, the real and imaginary parts of approximate
+ eigenvalues that may be used for shifts are stored in
+ SR(KBOT-ND-NS+1) through SR(KBOT-ND) and
+ SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively.
+ The real and imaginary parts of converged eigenvalues
+ are stored in SR(KBOT-ND+1) through SR(KBOT) and
+ SI(KBOT-ND+1) through SI(KBOT), respectively.
+
+ V (workspace) DOUBLE PRECISION array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) DOUBLE PRECISION array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) DOUBLE PRECISION array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+ The leading dimension of WV just as declared in the
+ calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (LWORK)
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; DLAQR3
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z is accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sr;
+ --si;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to DGEHRD ==== */
+
+ i__1 = jw - 1;
+ dgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1];
+
+/* ==== Workspace query call to DORMHR ==== */
+
+ i__1 = jw - 1;
+ dormhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1];
+
+/* ==== Workspace query call to DLAQR4 ==== */
+
+ dlaqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[1],
+ &si[1], &c__1, &jw, &v[v_offset], ldv, &work[1], &c_n1, &
+ infqr);
+ lwk3 = (integer) work[1];
+
+/*
+ ==== Optimal workspace ====
+
+ Computing MAX
+*/
+ i__1 = jw + max(lwk1,lwk2);
+ lwkopt = max(i__1,lwk3);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (doublereal) lwkopt;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1] = 1.;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s = 0.;
+ } else {
+ s = h__[kwtop + (kwtop - 1) * h_dim1];
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ sr[kwtop] = h__[kwtop + kwtop * h_dim1];
+ si[kwtop] = 0.;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ d__2 = smlnum, d__3 = ulp * (d__1 = h__[kwtop + kwtop * h_dim1], abs(
+ d__1));
+ if (abs(s) <= max(d__2,d__3)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = 0.;
+ }
+ }
+ work[1] = 1.;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ dlacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ dcopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ dlaset_("A", &jw, &jw, &c_b29, &c_b15, &v[v_offset], ldv);
+ nmin = ilaenv_(&c__12, "DLAQR3", "SV", &jw, &c__1, &jw, lwork, (ftnlen)6,
+ (ftnlen)2);
+ if (jw > nmin) {
+ dlaqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[
+ kwtop], &si[kwtop], &c__1, &jw, &v[v_offset], ldv, &work[1],
+ lwork, &infqr);
+ } else {
+ dlahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[
+ kwtop], &si[kwtop], &c__1, &jw, &v[v_offset], ldv, &infqr);
+ }
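+
+/*
+   ==== NMIN from ILAENV (ISPEC = 12) is the DLAHQR/DLAQR4 crossover
+   . point: windows larger than NMIN use the multi-shift code DLAQR4,
+   . smaller ones fall back on DLAHQR.  Calling DLAQR4 here (rather
+   . than DLAQR0) keeps the recursion bounded, since DLAQR4 performs
+   . its aggressive early deflation with DLAQR2, which only calls
+   . DLAHQR. ====
+*/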
+
+/* ==== DTREXC needs a clean margin near the diagonal ==== */
+
+ i__1 = jw - 3;
+ for (j = 1; j <= i__1; ++j) {
+ t[j + 2 + j * t_dim1] = 0.;
+ t[j + 3 + j * t_dim1] = 0.;
+/* L10: */
+ }
+ if (jw > 2) {
+ t[jw + (jw - 2) * t_dim1] = 0.;
+ }
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+L20:
+ if (ilst <= *ns) {
+ if (*ns == 1) {
+ bulge = FALSE_;
+ } else {
+ bulge = t[*ns + (*ns - 1) * t_dim1] != 0.;
+ }
+
+/* ==== Small spike tip test for deflation ==== */
+
+ if (! bulge) {
+
+/* ==== Real eigenvalue ==== */
+
+ foo = (d__1 = t[*ns + *ns * t_dim1], abs(d__1));
+ if (foo == 0.) {
+ foo = abs(s);
+ }
+/* Computing MAX */
+ d__2 = smlnum, d__3 = ulp * foo;
+ if ((d__1 = s * v[*ns * v_dim1 + 1], abs(d__1)) <= max(d__2,d__3))
+ {
+
+/* ==== Deflatable ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== Undeflatable. Move it up out of the way.
+ . (DTREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ dtrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ++ilst;
+ }
+ } else {
+
+/* ==== Complex conjugate pair ==== */
+
+ foo = (d__3 = t[*ns + *ns * t_dim1], abs(d__3)) + sqrt((d__1 = t[*
+ ns + (*ns - 1) * t_dim1], abs(d__1))) * sqrt((d__2 = t[*
+ ns - 1 + *ns * t_dim1], abs(d__2)));
+ if (foo == 0.) {
+ foo = abs(s);
+ }
+/* Computing MAX */
+ d__3 = (d__1 = s * v[*ns * v_dim1 + 1], abs(d__1)), d__4 = (d__2 =
+ s * v[(*ns - 1) * v_dim1 + 1], abs(d__2));
+/* Computing MAX */
+ d__5 = smlnum, d__6 = ulp * foo;
+ if (max(d__3,d__4) <= max(d__5,d__6)) {
+
+/* ==== Deflatable ==== */
+
+ *ns += -2;
+ } else {
+
+/*
+ ==== Undeflatable. Move them up out of the way.
+ . Fortunately, DTREXC does the right thing with
+ . ILST in case of a rare exchange failure. ====
+*/
+
+ ifst = *ns;
+ dtrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ilst += 2;
+ }
+ }
+
+/* ==== End deflation detection loop ==== */
+
+ goto L20;
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s = 0.;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting diagonal blocks of T improves accuracy for
+ . graded matrices. Bubble sort deals well with
+ . exchange failures. ====
+*/
+
+ sorted = FALSE_;
+ i__ = *ns + 1;
+L30:
+ if (sorted) {
+ goto L50;
+ }
+ sorted = TRUE_;
+
+ kend = i__ - 1;
+ i__ = infqr + 1;
+ if (i__ == *ns) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+L40:
+ if (k <= kend) {
+ if (k == i__ + 1) {
+ evi = (d__1 = t[i__ + i__ * t_dim1], abs(d__1));
+ } else {
+ evi = (d__3 = t[i__ + i__ * t_dim1], abs(d__3)) + sqrt((d__1 =
+ t[i__ + 1 + i__ * t_dim1], abs(d__1))) * sqrt((d__2 =
+ t[i__ + (i__ + 1) * t_dim1], abs(d__2)));
+ }
+
+ if (k == kend) {
+ evk = (d__1 = t[k + k * t_dim1], abs(d__1));
+ } else if (t[k + 1 + k * t_dim1] == 0.) {
+ evk = (d__1 = t[k + k * t_dim1], abs(d__1));
+ } else {
+ evk = (d__3 = t[k + k * t_dim1], abs(d__3)) + sqrt((d__1 = t[
+ k + 1 + k * t_dim1], abs(d__1))) * sqrt((d__2 = t[k +
+ (k + 1) * t_dim1], abs(d__2)));
+ }
+
+ if (evi >= evk) {
+ i__ = k;
+ } else {
+ sorted = FALSE_;
+ ifst = i__;
+ ilst = k;
+ dtrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ if (info == 0) {
+ i__ = ilst;
+ } else {
+ i__ = k;
+ }
+ }
+ if (i__ == kend) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+ goto L40;
+ }
+ goto L30;
+L50:
+ ;
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__ = jw;
+L60:
+ if (i__ >= infqr + 1) {
+ if (i__ == infqr + 1) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.;
+ --i__;
+ } else if (t[i__ + (i__ - 1) * t_dim1] == 0.) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.;
+ --i__;
+ } else {
+ aa = t[i__ - 1 + (i__ - 1) * t_dim1];
+ cc = t[i__ + (i__ - 1) * t_dim1];
+ bb = t[i__ - 1 + i__ * t_dim1];
+ dd = t[i__ + i__ * t_dim1];
+ dlanv2_(&aa, &bb, &cc, &dd, &sr[kwtop + i__ - 2], &si[kwtop + i__
+ - 2], &sr[kwtop + i__ - 1], &si[kwtop + i__ - 1], &cs, &
+ sn);
+ i__ += -2;
+ }
+ goto L60;
+ }
+
+ if (*ns < jw || s == 0.) {
+ if (*ns > 1 && s != 0.) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ dcopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ beta = work[1];
+ dlarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1] = 1.;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ dlaset_("L", &i__1, &i__2, &c_b29, &c_b29, &t[t_dim1 + 3], ldt);
+
+ dlarf_("L", ns, &jw, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ dlarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ dlarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ dgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = s * v[v_dim1 + 1];
+ }
+ dlacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ dcopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+ ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && s != 0.) {
+ i__1 = *lwork - jw;
+ dormhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ dgemm_("N", "N", &kln, &jw, &jw, &c_b15, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b29, &wv[wv_offset],
+ ldwv);
+ dlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L70: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ dgemm_("C", "N", &jw, &kln, &jw, &c_b15, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b29, &t[t_offset],
+ ldt);
+ dlacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L80: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ dgemm_("N", "N", &kln, &jw, &jw, &c_b15, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b29, &wv[
+ wv_offset], ldwv);
+ dlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L90: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ work[1] = (doublereal) lwkopt;
+
+/* ==== End of DLAQR3 ==== */
+
+ return 0;
+} /* dlaqr3_ */
+
+/* Subroutine */ int dlaqr4_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal
+ *wr, doublereal *wi, integer *iloz, integer *ihiz, doublereal *z__,
+ integer *ldz, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal aa, bb, cc, dd;
+ static integer ld;
+ static doublereal cs;
+ static integer nh, it, ks, kt;
+ static doublereal sn;
+ static integer ku, kv, ls, ns;
+ static doublereal ss;
+ static integer nw, inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl,
+ kbot, nmin;
+ static doublereal swap;
+ static integer ktop;
+ static doublereal zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int dlaqr2_(logical *, logical *, integer *,
+ integer *, integer *, integer *, doublereal *, integer *, integer
+ *, integer *, doublereal *, integer *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *, integer *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlanv2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *), dlaqr5_(
+ logical *, logical *, integer *, integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *, doublereal *, integer *);
+ static integer nibble;
+ extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *), dlacpy_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ static integer nwupbd;
+ static logical sorted;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ This subroutine implements one level of recursion for DLAQR0.
+ It is a complete implementation of the small bulge multi-shift
+ QR algorithm. It may be called by DLAQR0 and, for large enough
+ deflation window size, it may be called by DLAQR3. This
+ subroutine is identical to DLAQR0 except that it calls DLAQR2
+ instead of DLAQR3.
+
+ Purpose
+ =======
+
+ DLAQR4 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+ Schur form), and Z is the orthogonal matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input orthogonal
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to DGEBAL, and then passed to DGEHRD when the
+ matrix output by DGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) DOUBLE PRECISION array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H contains
+ the upper quasi-triangular matrix T from the Schur
+ decomposition (the Schur form); 2-by-2 diagonal blocks
+ (corresponding to complex conjugate pairs of eigenvalues)
+ are returned in standard form, with H(i,i) = H(i+1,i+1)
+ and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ WR (output) DOUBLE PRECISION array, dimension (IHI)
+ WI (output) DOUBLE PRECISION array, dimension (IHI)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues of H(ILO:IHI,ILO:IHI) are stored in WR(ILO:IHI)
+ and WI(ILO:IHI). If two eigenvalues are computed as a
+ complex conjugate pair, they are stored in consecutive
+ elements of WR and WI, say the i-th and (i+1)th, with
+ WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+ the eigenvalues are stored in the same order as on the
+ diagonal of the Schur form returned in H, with
+ WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
+ block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
+ WI(i+1) = -WI(i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if WANTZ is .TRUE.
+ then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient, but LWORK as large as 6*N is typically
+ required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then DLAQR4 does a workspace query.
+ In this case, DLAQR4 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z is accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, DLAQR4 failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
+ and WI contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and quasi-triangular
+ in rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+ (final value of Z(ILO:IHI,ILOZ:IHIZ))
+ = (initial value of Z(ILO:IHI,ILOZ:IHIZ))*U
+
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . DLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constants WILK1 and WILK2 are used to form the
+ . exceptional shifts. ====
+*/
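+
+/*
+   ==== Worked example for the WR/WI convention above (illustrative;
+   . not part of the LAPACK documentation): a standardized 2-by-2
+   . diagonal block such as
+   .
+   .        [  2   5 ]
+   .        [ -1   2 ]
+   .
+   . has H(i,i) = H(i+1,i+1) = 2 and H(i+1,i)*H(i,i+1) = -5 .LT. 0,
+   . so its complex conjugate eigenvalues 2 +- sqrt(5)*i are reported
+   . as WR(i) = WR(i+1) = 2, WI(i) = sqrt(5), WI(i+1) = -sqrt(5). ====
+*/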
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use DLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ dlahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1], &
+ wi[1], iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "DLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+ . At this point N .GT. NTINY = 11, so there is
+ . enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "DLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to DLAQR2 ====
+*/
+
+ i__1 = nwr + 1;
+ dlaqr2_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1], &h__[
+ h_offset], ldh, n, &h__[h_offset], ldh, n, &h__[h_offset],
+ ldh, &work[1], &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1];
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (doublereal) lwkopt;
+ return 0;
+ }
+
+/* ==== DLAHQR/DLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "DLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "DLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+ ==== Accumulate reflections during the QR sweep? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "DLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L90;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (h__[k + (k - 1) * h_dim1] == 0.) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+ . size. At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ if ((d__1 = h__[kwtop + (kwtop - 1) * h_dim1], abs(d__1))
+ > (d__2 = h__[kwtop - 1 + (kwtop - 2) * h_dim1],
+ abs(d__2))) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+ . - an at-least-NW-but-more-is-better (NVE-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ dlaqr2_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1],
+ &h__[kv + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if DLAQR2
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . DLAQR2 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
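+
+/*
+   ==== The exceptional-shift branch below forms, for each trailing
+   . pair, SS = |H(I,I-1)| + |H(I-1,I-2)| and takes the eigenvalues
+   . of the 2-by-2 matrix
+   .
+   .        [ WILK1*SS + H(I,I)    SS                 ]
+   .        [ WILK2*SS             WILK1*SS + H(I,I)  ]
+   .
+   . via DLANV2, with WILK1 = 0.75 and WILK2 = -0.4375 (the constants
+   . named in the header above, inlined as literals in this
+   . translation); the modulus 6 plays the role of KEXSH. ====
+*/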
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+/* Computing MAX */
+ i__3 = ks + 1, i__4 = ktop + 2;
+ i__2 = max(i__3,i__4);
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ ss = (d__1 = h__[i__ + (i__ - 1) * h_dim1], abs(d__1))
+ + (d__2 = h__[i__ - 1 + (i__ - 2) * h_dim1],
+ abs(d__2));
+ aa = ss * .75 + h__[i__ + i__ * h_dim1];
+ bb = ss;
+ cc = ss * -.4375;
+ dd = aa;
+ dlanv2_(&aa, &bb, &cc, &dd, &wr[i__ - 1], &wi[i__ - 1]
+ , &wr[i__], &wi[i__], &cs, &sn);
+/* L30: */
+ }
+ if (ks == ktop) {
+ wr[ks + 1] = h__[ks + 1 + (ks + 1) * h_dim1];
+ wi[ks + 1] = 0.;
+ wr[ks] = wr[ks + 1];
+ wi[ks] = wi[ks + 1];
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use DLAHQR
+ . on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ dlacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ dlahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[kt
+ + h_dim1], ldh, &wr[ks], &wi[ks], &c__1, &
+ c__1, zdum, &c__1, &inf);
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. ====
+*/
+
+ if (ks >= kbot) {
+ aa = h__[kbot - 1 + (kbot - 1) * h_dim1];
+ cc = h__[kbot + (kbot - 1) * h_dim1];
+ bb = h__[kbot - 1 + kbot * h_dim1];
+ dd = h__[kbot + kbot * h_dim1];
+ dlanv2_(&aa, &bb, &cc, &dd, &wr[kbot - 1], &wi[
+ kbot - 1], &wr[kbot], &wi[kbot], &cs, &sn)
+ ;
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/*
+ ==== Sort the shifts (Helps a little)
+ . Bubble sort keeps complex conjugate
+ . pairs together. ====
+*/
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ if ((d__1 = wr[i__], abs(d__1)) + (d__2 = wi[
+ i__], abs(d__2)) < (d__3 = wr[i__ + 1]
+ , abs(d__3)) + (d__4 = wi[i__ + 1],
+ abs(d__4))) {
+ sorted = FALSE_;
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ + 1];
+ wr[i__ + 1] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ + 1];
+ wi[i__ + 1] = swap;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+
+/*
+ ==== Shuffle shifts into pairs of real shifts
+ . and pairs of complex conjugate shifts
+ . assuming complex conjugate shifts are
+ . already adjacent to one another. (Yes,
+ . they are.) ====
+*/
+
+ i__2 = ks + 2;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ if (wi[i__] != -wi[i__ - 1]) {
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ - 1];
+ wr[i__ - 1] = wr[i__ - 2];
+ wr[i__ - 2] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ - 1];
+ wi[i__ - 1] = wi[i__ - 2];
+ wi[i__ - 2] = swap;
+ }
+/* L70: */
+ }
+ }
+
+/*
+ ==== If there are only two shifts and both are
+ . real, then use only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ if (wi[kbot] == 0.) {
+ if ((d__1 = wr[kbot] - h__[kbot + kbot * h_dim1], abs(
+ d__1)) < (d__2 = wr[kbot - 1] - h__[kbot +
+ kbot * h_dim1], abs(d__2))) {
+ wr[kbot - 1] = wr[kbot];
+ } else {
+ wr[kbot] = wr[kbot - 1];
+ }
+ }
+ }
+
+/*
+ ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+ . (KDU-by-NHO) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+ . (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ dlaqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &wr[ks],
+ &wi[ks], &h__[h_offset], ldh, iloz, ihiz, &z__[
+ z_offset], ldz, &work[1], &c__3, &h__[ku + h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &nho, &h__[ku +
+ kwh * h_dim1], ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L80:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L90:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ work[1] = (doublereal) lwkopt;
+
+/* ==== End of DLAQR4 ==== */
+
+ return 0;
+} /* dlaqr4_ */
+
+/* Subroutine */ int dlaqr5_(logical *wantt, logical *wantz, integer *kacc22,
+ integer *n, integer *ktop, integer *kbot, integer *nshfts, doublereal
+ *sr, doublereal *si, doublereal *h__, integer *ldh, integer *iloz,
+ integer *ihiz, doublereal *z__, integer *ldz, doublereal *v, integer *
+ ldv, doublereal *u, integer *ldu, integer *nv, doublereal *wv,
+ integer *ldwv, integer *nh, doublereal *wh, integer *ldwh)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, u_dim1, u_offset, v_dim1, v_offset, wh_dim1,
+ wh_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3,
+ i__4, i__5, i__6, i__7;
+ doublereal d__1, d__2, d__3, d__4, d__5;
+
+ /* Local variables */
+ static integer i__, j, k, m, i2, j2, i4, j4, k1;
+ static doublereal h11, h12, h21, h22;
+ static integer m22, ns, nu;
+ static doublereal vt[3], scl;
+ static integer kdu, kms;
+ static doublereal ulp;
+ static integer knz, kzs;
+ static doublereal tst1, tst2, beta;
+ static logical blk22, bmp22;
+ static integer mend, jcol, jlen, jbot, mbot;
+ static doublereal swap;
+ static integer jtop, jrow, mtop;
+ static doublereal alpha;
+ static logical accum;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer ndcol, incol, krcol, nbmps;
+ extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *), dlaqr1_(
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *), dlabad_(doublereal *,
+ doublereal *);
+
+ extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *), dlacpy_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *);
+ static doublereal safmin;
+ extern /* Subroutine */ int dlaset_(char *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *);
+ static doublereal safmax, refsum;
+ static integer mstart;
+ static doublereal smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ This auxiliary subroutine called by DLAQR0 performs a
+ single small-bulge multi-shift QR sweep.
+
+ WANTT (input) logical scalar
+ WANTT = .true. if the quasi-triangular Schur factor
+ is being computed. WANTT is set to .false. otherwise.
+
+ WANTZ (input) logical scalar
+ WANTZ = .true. if the orthogonal Schur factor is being
+ computed. WANTZ is set to .false. otherwise.
+
+ KACC22 (input) integer with value 0, 1, or 2.
+ Specifies the computation mode of far-from-diagonal
+ orthogonal updates.
+ = 0: DLAQR5 does not accumulate reflections and does not
+ use matrix-matrix multiply to update far-from-diagonal
+ matrix entries.
+ = 1: DLAQR5 accumulates reflections and uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries.
+ = 2: DLAQR5 accumulates reflections, uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries,
+ and takes advantage of 2-by-2 block structure during
+ matrix multiplies.
+
+ N (input) integer scalar
+ N is the order of the Hessenberg matrix H upon which this
+ subroutine operates.
+
+ KTOP (input) integer scalar
+ KBOT (input) integer scalar
+ These are the first and last rows and columns of an
+ isolated diagonal block upon which the QR sweep is to be
+ applied. It is assumed without a check that
+ either KTOP = 1 or H(KTOP,KTOP-1) = 0
+ and
+ either KBOT = N or H(KBOT+1,KBOT) = 0.
+
+ NSHFTS (input) integer scalar
+ NSHFTS gives the number of simultaneous shifts. NSHFTS
+ must be positive and even.
+
+ SR (input/output) DOUBLE PRECISION array of size (NSHFTS)
+ SI (input/output) DOUBLE PRECISION array of size (NSHFTS)
+ SR contains the real parts and SI contains the imaginary
+ parts of the NSHFTS shifts of origin that define the
+ multi-shift QR sweep. On output SR and SI may be
+ reordered.
+
+ H (input/output) DOUBLE PRECISION array of size (LDH,N)
+ On input H contains a Hessenberg matrix. On output a
+ multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied
+ to the isolated diagonal block in rows and columns KTOP
+ through KBOT.
+
+ LDH (input) integer scalar
+ LDH is the leading dimension of H just as declared in the
+ calling procedure. LDH.GE.MAX(1,N).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+
+ Z (input/output) DOUBLE PRECISION array of size (LDZ,IHI)
+ If WANTZ = .TRUE., then the QR Sweep orthogonal
+ similarity transformation is accumulated into
+ Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ = .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer scalar
+           LDZ is the leading dimension of Z just as declared in
+           the calling procedure. LDZ.GE.N.
+
+ V (workspace) DOUBLE PRECISION array of size (LDV,NSHFTS/2)
+
+ LDV (input) integer scalar
+ LDV is the leading dimension of V as declared in the
+ calling procedure. LDV.GE.3.
+
+ U (workspace) DOUBLE PRECISION array of size
+ (LDU,3*NSHFTS-3)
+
+ LDU (input) integer scalar
+           LDU is the leading dimension of U just as declared in the
+           calling subroutine. LDU.GE.3*NSHFTS-3.
+
+ NH (input) integer scalar
+ NH is the number of columns in array WH available for
+ workspace. NH.GE.1.
+
+ WH (workspace) DOUBLE PRECISION array of size (LDWH,NH)
+
+ LDWH (input) integer scalar
+ Leading dimension of WH just as declared in the
+ calling procedure. LDWH.GE.3*NSHFTS-3.
+
+ NV (input) integer scalar
+           NV is the number of rows in WV available for workspace.
+           NV.GE.1.
+
+ WV (workspace) DOUBLE PRECISION array of size
+ (LDWV,3*NSHFTS-3)
+
+ LDWV (input) integer scalar
+           LDWV is the leading dimension of WV as declared in the
+           calling subroutine. LDWV.GE.NV.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ Reference:
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and
+     Level 3 Performance, SIAM Journal on Matrix Analysis and
+     Applications, volume 23, pages 929--947, 2002.
+
+ ================================================================
+
+
+ ==== If there are no shifts, then there is nothing to do. ====
+*/
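+
+/*
+   ==== Outline of the code below: each pair of shifts
+   .    (SR(2M-1) + i*SI(2M-1), SR(2M) + i*SI(2M)) is turned into a
+   .    3-by-3 bulge (introduced with DLAQR1/DLARFG) and chased down
+   .    the diagonal of the active block with DLARFG reflections.
+   .    When KACC22 is 1 or 2 the reflections are also accumulated
+   .    into the orthogonal slab U, and the far-from-diagonal parts
+   .    of H (and, if WANTZ, of Z) are updated afterwards with DGEMM
+   .    (plus DTRMM in the KACC22 = 2 path). ====
+*/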
+
+ /* Parameter adjustments */
+ --sr;
+ --si;
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ wh_dim1 = *ldwh;
+ wh_offset = 1 + wh_dim1;
+ wh -= wh_offset;
+
+ /* Function Body */
+ if (*nshfts < 2) {
+ return 0;
+ }
+
+/*
+ ==== If the active block is empty or 1-by-1, then there
+ . is nothing to do. ====
+*/
+
+ if (*ktop >= *kbot) {
+ return 0;
+ }
+
+/*
+ ==== Shuffle shifts into pairs of real shifts and pairs
+ . of complex conjugate shifts assuming complex
+ . conjugate shifts are already adjacent to one
+ . another. ====
+*/
+
+ i__1 = *nshfts - 2;
+ for (i__ = 1; i__ <= i__1; i__ += 2) {
+ if (si[i__] != -si[i__ + 1]) {
+
+ swap = sr[i__];
+ sr[i__] = sr[i__ + 1];
+ sr[i__ + 1] = sr[i__ + 2];
+ sr[i__ + 2] = swap;
+
+ swap = si[i__];
+ si[i__] = si[i__ + 1];
+ si[i__ + 1] = si[i__ + 2];
+ si[i__ + 2] = swap;
+ }
+/* L10: */
+ }
+
+/*
+ ==== NSHFTS is supposed to be even, but if it is odd,
+ . then simply reduce it by one. The shuffle above
+ . ensures that the dropped shift is real and that
+ . the remaining shifts are paired. ====
+*/
+
+ ns = *nshfts - *nshfts % 2;
+
+/* ==== Machine constants for deflation ==== */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) (*n) / ulp);
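+
+/*
+   ==== These thresholds are used by the vigilant deflation test
+   .    further down: a subdiagonal entry H(K+1,K) is set to zero
+   .    only after it passes both the small-compared-to-nearby-
+   .    diagonals test against MAX(SMLNUM, ULP*TST1) and the
+   .    Ahues/Tisseur test against MAX(SMLNUM, ULP*TST2). ====
+*/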
+
+/*
+ ==== Use accumulated reflections to update far-from-diagonal
+ . entries ? ====
+*/
+
+ accum = *kacc22 == 1 || *kacc22 == 2;
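+
+/*
+   ==== When KACC22 = 0, U is never formed: the reflections are
+   .    applied directly to H (and, if WANTZ, to Z) inside the
+   .    bulge-chase loops below instead of via a level-3 BLAS
+   .    update at the end of each pass. ====
+*/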
+
+/* ==== If so, exploit the 2-by-2 block structure? ==== */
+
+ blk22 = ns > 2 && *kacc22 == 2;
+
+/* ==== clear trash ==== */
+
+ if (*ktop + 2 <= *kbot) {
+ h__[*ktop + 2 + *ktop * h_dim1] = 0.;
+ }
+
+/* ==== NBMPS = number of 2-shift bulges in the chain ==== */
+
+ nbmps = ns / 2;
+
+/* ==== KDU = width of slab ==== */
+
+ kdu = nbmps * 6 - 3;
+
+/* ==== Create and chase chains of NBMPS bulges ==== */
+
+ i__1 = *kbot - 2;
+ i__2 = nbmps * 3 - 2;
+ for (incol = (1 - nbmps) * 3 + *ktop - 1; i__2 < 0 ? incol >= i__1 :
+ incol <= i__1; incol += i__2) {
+ ndcol = incol + kdu;
+ if (accum) {
+ dlaset_("ALL", &kdu, &kdu, &c_b29, &c_b15, &u[u_offset], ldu);
+ }
+
+/*
+ ==== Near-the-diagonal bulge chase. The following loop
+ . performs the near-the-diagonal part of a small bulge
+ . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal
+ . chunk extends from column INCOL to column NDCOL
+ . (including both column INCOL and column NDCOL). The
+ . following loop chases a 3*NBMPS column long chain of
+ . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL
+      .    may be less than KTOP and NDCOL may be greater than
+ . KBOT indicating phantom columns from which to chase
+ . bulges before they are actually introduced or to which
+ . to chase bulges beyond column KBOT.) ====
+
+ Computing MIN
+*/
+ i__4 = incol + nbmps * 3 - 3, i__5 = *kbot - 2;
+ i__3 = min(i__4,i__5);
+ for (krcol = incol; krcol <= i__3; ++krcol) {
+
+/*
+ ==== Bulges number MTOP to MBOT are active double implicit
+          .    shift bulges.  There may or may not also be a small
+ . 2-by-2 bulge, if there is room. The inactive bulges
+ . (if any) must wait until the active bulges have moved
+ . down the diagonal to make room. The phantom matrix
+ . paradigm described above helps keep track. ====
+
+ Computing MAX
+*/
+ i__4 = 1, i__5 = (*ktop - 1 - krcol + 2) / 3 + 1;
+ mtop = max(i__4,i__5);
+/* Computing MIN */
+ i__4 = nbmps, i__5 = (*kbot - krcol) / 3;
+ mbot = min(i__4,i__5);
+ m22 = mbot + 1;
+ bmp22 = mbot < nbmps && krcol + (m22 - 1) * 3 == *kbot - 2;
+
+/*
+ ==== Generate reflections to chase the chain right
+ . one column. (The minimum value of K is KTOP-1.) ====
+*/
+
+ i__4 = mbot;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ if (k == *ktop - 1) {
+ dlaqr1_(&c__3, &h__[*ktop + *ktop * h_dim1], ldh, &sr[(m
+ << 1) - 1], &si[(m << 1) - 1], &sr[m * 2], &si[m *
+ 2], &v[m * v_dim1 + 1]);
+ alpha = v[m * v_dim1 + 1];
+ dlarfg_(&c__3, &alpha, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+ } else {
+ beta = h__[k + 1 + k * h_dim1];
+ v[m * v_dim1 + 2] = h__[k + 2 + k * h_dim1];
+ v[m * v_dim1 + 3] = h__[k + 3 + k * h_dim1];
+ dlarfg_(&c__3, &beta, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+
+/*
+ ==== A Bulge may collapse because of vigilant
+ . deflation or destructive underflow. In the
+ . underflow case, try the two-small-subdiagonals
+ . trick to try to reinflate the bulge. ====
+*/
+
+ if (h__[k + 3 + k * h_dim1] != 0. || h__[k + 3 + (k + 1) *
+ h_dim1] != 0. || h__[k + 3 + (k + 2) * h_dim1] ==
+ 0.) {
+
+/* ==== Typical case: not collapsed (yet). ==== */
+
+ h__[k + 1 + k * h_dim1] = beta;
+ h__[k + 2 + k * h_dim1] = 0.;
+ h__[k + 3 + k * h_dim1] = 0.;
+ } else {
+
+/*
+ ==== Atypical case: collapsed. Attempt to
+ . reintroduce ignoring H(K+1,K) and H(K+2,K).
+ . If the fill resulting from the new
+ . reflector is too large, then abandon it.
+ . Otherwise, use the new one. ====
+*/
+
+ dlaqr1_(&c__3, &h__[k + 1 + (k + 1) * h_dim1], ldh, &
+ sr[(m << 1) - 1], &si[(m << 1) - 1], &sr[m *
+ 2], &si[m * 2], vt);
+ alpha = vt[0];
+ dlarfg_(&c__3, &alpha, &vt[1], &c__1, vt);
+ refsum = vt[0] * (h__[k + 1 + k * h_dim1] + vt[1] *
+ h__[k + 2 + k * h_dim1]);
+
+ if ((d__1 = h__[k + 2 + k * h_dim1] - refsum * vt[1],
+ abs(d__1)) + (d__2 = refsum * vt[2], abs(d__2)
+ ) > ulp * ((d__3 = h__[k + k * h_dim1], abs(
+ d__3)) + (d__4 = h__[k + 1 + (k + 1) * h_dim1]
+ , abs(d__4)) + (d__5 = h__[k + 2 + (k + 2) *
+ h_dim1], abs(d__5)))) {
+
+/*
+ ==== Starting a new bulge here would
+ . create non-negligible fill. Use
+ . the old one with trepidation. ====
+*/
+
+ h__[k + 1 + k * h_dim1] = beta;
+ h__[k + 2 + k * h_dim1] = 0.;
+ h__[k + 3 + k * h_dim1] = 0.;
+ } else {
+
+/*
+                           ==== Starting a new bulge here would
+ . create only negligible fill.
+ . Replace the old reflector with
+ . the new one. ====
+*/
+
+ h__[k + 1 + k * h_dim1] -= refsum;
+ h__[k + 2 + k * h_dim1] = 0.;
+ h__[k + 3 + k * h_dim1] = 0.;
+ v[m * v_dim1 + 1] = vt[0];
+ v[m * v_dim1 + 2] = vt[1];
+ v[m * v_dim1 + 3] = vt[2];
+ }
+ }
+ }
+/* L20: */
+ }
+
+/* ==== Generate a 2-by-2 reflection, if needed. ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ if (bmp22) {
+ if (k == *ktop - 1) {
+ dlaqr1_(&c__2, &h__[k + 1 + (k + 1) * h_dim1], ldh, &sr[(
+ m22 << 1) - 1], &si[(m22 << 1) - 1], &sr[m22 * 2],
+ &si[m22 * 2], &v[m22 * v_dim1 + 1]);
+ beta = v[m22 * v_dim1 + 1];
+ dlarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ } else {
+ beta = h__[k + 1 + k * h_dim1];
+ v[m22 * v_dim1 + 2] = h__[k + 2 + k * h_dim1];
+ dlarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ h__[k + 1 + k * h_dim1] = beta;
+ h__[k + 2 + k * h_dim1] = 0.;
+ }
+ }
+
+/* ==== Multiply H by reflections from the left ==== */
+
+ if (accum) {
+ jbot = min(ndcol,*kbot);
+ } else if (*wantt) {
+ jbot = *n;
+ } else {
+ jbot = *kbot;
+ }
+ i__4 = jbot;
+ for (j = max(*ktop,krcol); j <= i__4; ++j) {
+/* Computing MIN */
+ i__5 = mbot, i__6 = (j - krcol + 2) / 3;
+ mend = min(i__5,i__6);
+ i__5 = mend;
+ for (m = mtop; m <= i__5; ++m) {
+ k = krcol + (m - 1) * 3;
+ refsum = v[m * v_dim1 + 1] * (h__[k + 1 + j * h_dim1] + v[
+ m * v_dim1 + 2] * h__[k + 2 + j * h_dim1] + v[m *
+ v_dim1 + 3] * h__[k + 3 + j * h_dim1]);
+ h__[k + 1 + j * h_dim1] -= refsum;
+ h__[k + 2 + j * h_dim1] -= refsum * v[m * v_dim1 + 2];
+ h__[k + 3 + j * h_dim1] -= refsum * v[m * v_dim1 + 3];
+/* L30: */
+ }
+/* L40: */
+ }
+ if (bmp22) {
+ k = krcol + (m22 - 1) * 3;
+/* Computing MAX */
+ i__4 = k + 1;
+ i__5 = jbot;
+ for (j = max(i__4,*ktop); j <= i__5; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (h__[k + 1 + j * h_dim1] +
+ v[m22 * v_dim1 + 2] * h__[k + 2 + j * h_dim1]);
+ h__[k + 1 + j * h_dim1] -= refsum;
+ h__[k + 2 + j * h_dim1] -= refsum * v[m22 * v_dim1 + 2];
+/* L50: */
+ }
+ }
+
+/*
+ ==== Multiply H by reflections from the right.
+ . Delay filling in the last row until the
+ . vigilant deflation check is complete. ====
+*/
+
+ if (accum) {
+ jtop = max(*ktop,incol);
+ } else if (*wantt) {
+ jtop = 1;
+ } else {
+ jtop = *ktop;
+ }
+ i__5 = mbot;
+ for (m = mtop; m <= i__5; ++m) {
+ if (v[m * v_dim1 + 1] != 0.) {
+ k = krcol + (m - 1) * 3;
+/* Computing MIN */
+ i__6 = *kbot, i__7 = k + 3;
+ i__4 = min(i__6,i__7);
+ for (j = jtop; j <= i__4; ++j) {
+ refsum = v[m * v_dim1 + 1] * (h__[j + (k + 1) *
+ h_dim1] + v[m * v_dim1 + 2] * h__[j + (k + 2)
+ * h_dim1] + v[m * v_dim1 + 3] * h__[j + (k +
+ 3) * h_dim1]);
+ h__[j + (k + 1) * h_dim1] -= refsum;
+ h__[j + (k + 2) * h_dim1] -= refsum * v[m * v_dim1 +
+ 2];
+ h__[j + (k + 3) * h_dim1] -= refsum * v[m * v_dim1 +
+ 3];
+/* L60: */
+ }
+
+ if (accum) {
+
+/*
+                   ==== Accumulate U. (If necessary, update Z later
+                   .    with an efficient matrix-matrix
+                   .    multiply.) ====
+*/
+
+ kms = k - incol;
+/* Computing MAX */
+ i__4 = 1, i__6 = *ktop - incol;
+ i__7 = kdu;
+ for (j = max(i__4,i__6); j <= i__7; ++j) {
+ refsum = v[m * v_dim1 + 1] * (u[j + (kms + 1) *
+ u_dim1] + v[m * v_dim1 + 2] * u[j + (kms
+ + 2) * u_dim1] + v[m * v_dim1 + 3] * u[j
+ + (kms + 3) * u_dim1]);
+ u[j + (kms + 1) * u_dim1] -= refsum;
+ u[j + (kms + 2) * u_dim1] -= refsum * v[m *
+ v_dim1 + 2];
+ u[j + (kms + 3) * u_dim1] -= refsum * v[m *
+ v_dim1 + 3];
+/* L70: */
+ }
+ } else if (*wantz) {
+
+/*
+ ==== U is not accumulated, so update Z
+ . now by multiplying by reflections
+ . from the right. ====
+*/
+
+ i__7 = *ihiz;
+ for (j = *iloz; j <= i__7; ++j) {
+ refsum = v[m * v_dim1 + 1] * (z__[j + (k + 1) *
+ z_dim1] + v[m * v_dim1 + 2] * z__[j + (k
+ + 2) * z_dim1] + v[m * v_dim1 + 3] * z__[
+ j + (k + 3) * z_dim1]);
+ z__[j + (k + 1) * z_dim1] -= refsum;
+ z__[j + (k + 2) * z_dim1] -= refsum * v[m *
+ v_dim1 + 2];
+ z__[j + (k + 3) * z_dim1] -= refsum * v[m *
+ v_dim1 + 3];
+/* L80: */
+ }
+ }
+ }
+/* L90: */
+ }
+
+/* ==== Special case: 2-by-2 reflection (if needed) ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ if (bmp22 && v[m22 * v_dim1 + 1] != 0.) {
+/* Computing MIN */
+ i__7 = *kbot, i__4 = k + 3;
+ i__5 = min(i__7,i__4);
+ for (j = jtop; j <= i__5; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (h__[j + (k + 1) * h_dim1]
+ + v[m22 * v_dim1 + 2] * h__[j + (k + 2) * h_dim1])
+ ;
+ h__[j + (k + 1) * h_dim1] -= refsum;
+ h__[j + (k + 2) * h_dim1] -= refsum * v[m22 * v_dim1 + 2];
+/* L100: */
+ }
+
+ if (accum) {
+ kms = k - incol;
+/* Computing MAX */
+ i__5 = 1, i__7 = *ktop - incol;
+ i__4 = kdu;
+ for (j = max(i__5,i__7); j <= i__4; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (u[j + (kms + 1) *
+ u_dim1] + v[m22 * v_dim1 + 2] * u[j + (kms +
+ 2) * u_dim1]);
+ u[j + (kms + 1) * u_dim1] -= refsum;
+ u[j + (kms + 2) * u_dim1] -= refsum * v[m22 * v_dim1
+ + 2];
+/* L110: */
+ }
+ } else if (*wantz) {
+ i__4 = *ihiz;
+ for (j = *iloz; j <= i__4; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (z__[j + (k + 1) *
+ z_dim1] + v[m22 * v_dim1 + 2] * z__[j + (k +
+ 2) * z_dim1]);
+ z__[j + (k + 1) * z_dim1] -= refsum;
+ z__[j + (k + 2) * z_dim1] -= refsum * v[m22 * v_dim1
+ + 2];
+/* L120: */
+ }
+ }
+ }
+
+/* ==== Vigilant deflation check ==== */
+
+ mstart = mtop;
+ if (krcol + (mstart - 1) * 3 < *ktop) {
+ ++mstart;
+ }
+ mend = mbot;
+ if (bmp22) {
+ ++mend;
+ }
+ if (krcol == *kbot - 2) {
+ ++mend;
+ }
+ i__4 = mend;
+ for (m = mstart; m <= i__4; ++m) {
+/* Computing MIN */
+ i__5 = *kbot - 1, i__7 = krcol + (m - 1) * 3;
+ k = min(i__5,i__7);
+
+/*
+ ==== The following convergence test requires that
+               .    the traditional small-compared-to-nearby-diagonals
+ . criterion and the Ahues & Tisseur (LAWN 122, 1997)
+ . criteria both be satisfied. The latter improves
+ . accuracy in some examples. Falling back on an
+ . alternate convergence criterion when TST1 or TST2
+ . is zero (as done here) is traditional but probably
+ . unnecessary. ====
+*/
+
+ if (h__[k + 1 + k * h_dim1] != 0.) {
+ tst1 = (d__1 = h__[k + k * h_dim1], abs(d__1)) + (d__2 =
+ h__[k + 1 + (k + 1) * h_dim1], abs(d__2));
+ if (tst1 == 0.) {
+ if (k >= *ktop + 1) {
+ tst1 += (d__1 = h__[k + (k - 1) * h_dim1], abs(
+ d__1));
+ }
+ if (k >= *ktop + 2) {
+ tst1 += (d__1 = h__[k + (k - 2) * h_dim1], abs(
+ d__1));
+ }
+ if (k >= *ktop + 3) {
+ tst1 += (d__1 = h__[k + (k - 3) * h_dim1], abs(
+ d__1));
+ }
+ if (k <= *kbot - 2) {
+ tst1 += (d__1 = h__[k + 2 + (k + 1) * h_dim1],
+ abs(d__1));
+ }
+ if (k <= *kbot - 3) {
+ tst1 += (d__1 = h__[k + 3 + (k + 1) * h_dim1],
+ abs(d__1));
+ }
+ if (k <= *kbot - 4) {
+ tst1 += (d__1 = h__[k + 4 + (k + 1) * h_dim1],
+ abs(d__1));
+ }
+ }
+/* Computing MAX */
+ d__2 = smlnum, d__3 = ulp * tst1;
+ if ((d__1 = h__[k + 1 + k * h_dim1], abs(d__1)) <= max(
+ d__2,d__3)) {
+/* Computing MAX */
+ d__3 = (d__1 = h__[k + 1 + k * h_dim1], abs(d__1)),
+ d__4 = (d__2 = h__[k + (k + 1) * h_dim1], abs(
+ d__2));
+ h12 = max(d__3,d__4);
+/* Computing MIN */
+ d__3 = (d__1 = h__[k + 1 + k * h_dim1], abs(d__1)),
+ d__4 = (d__2 = h__[k + (k + 1) * h_dim1], abs(
+ d__2));
+ h21 = min(d__3,d__4);
+/* Computing MAX */
+ d__3 = (d__1 = h__[k + 1 + (k + 1) * h_dim1], abs(
+ d__1)), d__4 = (d__2 = h__[k + k * h_dim1] -
+ h__[k + 1 + (k + 1) * h_dim1], abs(d__2));
+ h11 = max(d__3,d__4);
+/* Computing MIN */
+ d__3 = (d__1 = h__[k + 1 + (k + 1) * h_dim1], abs(
+ d__1)), d__4 = (d__2 = h__[k + k * h_dim1] -
+ h__[k + 1 + (k + 1) * h_dim1], abs(d__2));
+ h22 = min(d__3,d__4);
+ scl = h11 + h12;
+ tst2 = h22 * (h11 / scl);
+
+/* Computing MAX */
+ d__1 = smlnum, d__2 = ulp * tst2;
+ if (tst2 == 0. || h21 * (h12 / scl) <= max(d__1,d__2))
+ {
+ h__[k + 1 + k * h_dim1] = 0.;
+ }
+ }
+ }
+/* L130: */
+ }
+
+/*
+ ==== Fill in the last row of each bulge. ====
+
+ Computing MIN
+*/
+ i__4 = nbmps, i__5 = (*kbot - krcol - 1) / 3;
+ mend = min(i__4,i__5);
+ i__4 = mend;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ refsum = v[m * v_dim1 + 1] * v[m * v_dim1 + 3] * h__[k + 4 + (
+ k + 3) * h_dim1];
+ h__[k + 4 + (k + 1) * h_dim1] = -refsum;
+ h__[k + 4 + (k + 2) * h_dim1] = -refsum * v[m * v_dim1 + 2];
+ h__[k + 4 + (k + 3) * h_dim1] -= refsum * v[m * v_dim1 + 3];
+/* L140: */
+ }
+
+/*
+ ==== End of near-the-diagonal bulge chase. ====
+
+ L150:
+*/
+ }
+
+/*
+ ==== Use U (if accumulated) to update far-from-diagonal
+ . entries in H. If required, use U to update Z as
+ . well. ====
+*/
+
+ if (accum) {
+ if (*wantt) {
+ jtop = 1;
+ jbot = *n;
+ } else {
+ jtop = *ktop;
+ jbot = *kbot;
+ }
+ if (! blk22 || incol < *ktop || ndcol > *kbot || ns <= 2) {
+
+/*
+ ==== Updates not exploiting the 2-by-2 block
+ . structure of U. K1 and NU keep track of
+ . the location and size of U in the special
+ . cases of introducing bulges and chasing
+ . bulges off the bottom. In these special
+ . cases and in case the number of shifts
+ . is NS = 2, there is no 2-by-2 block
+ . structure to exploit. ====
+
+ Computing MAX
+*/
+ i__3 = 1, i__4 = *ktop - incol;
+ k1 = max(i__3,i__4);
+/* Computing MAX */
+ i__3 = 0, i__4 = ndcol - *kbot;
+ nu = kdu - max(i__3,i__4) - k1 + 1;
+
+/* ==== Horizontal Multiply ==== */
+
+ i__3 = jbot;
+ i__4 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__4 < 0 ? jcol >= i__3 :
+ jcol <= i__3; jcol += i__4) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+ dgemm_("C", "N", &nu, &jlen, &nu, &c_b15, &u[k1 + k1 *
+ u_dim1], ldu, &h__[incol + k1 + jcol * h_dim1],
+ ldh, &c_b29, &wh[wh_offset], ldwh);
+ dlacpy_("ALL", &nu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + k1 + jcol * h_dim1], ldh);
+/* L160: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__4 = max(*ktop,incol) - 1;
+ i__3 = *nv;
+ for (jrow = jtop; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(*ktop,incol) - jrow;
+ jlen = min(i__5,i__7);
+ dgemm_("N", "N", &jlen, &nu, &nu, &c_b15, &h__[jrow + (
+ incol + k1) * h_dim1], ldh, &u[k1 + k1 * u_dim1],
+ ldu, &c_b29, &wv[wv_offset], ldwv);
+ dlacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + k1) * h_dim1], ldh);
+/* L170: */
+ }
+
+/* ==== Z multiply (also vertical) ==== */
+
+ if (*wantz) {
+ i__3 = *ihiz;
+ i__4 = *nv;
+ for (jrow = *iloz; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+ dgemm_("N", "N", &jlen, &nu, &nu, &c_b15, &z__[jrow +
+ (incol + k1) * z_dim1], ldz, &u[k1 + k1 *
+ u_dim1], ldu, &c_b29, &wv[wv_offset], ldwv);
+ dlacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &z__[
+ jrow + (incol + k1) * z_dim1], ldz)
+ ;
+/* L180: */
+ }
+ }
+ } else {
+
+/*
+ ==== Updates exploiting U's 2-by-2 block structure.
+ . (I2, I4, J2, J4 are the last rows and columns
+ . of the blocks.) ====
+*/
+
+ i2 = (kdu + 1) / 2;
+ i4 = kdu;
+ j2 = i4 - i2;
+ j4 = kdu;
+
+/*
+ ==== KZS and KNZ deal with the band of zeros
+ . along the diagonal of one of the triangular
+ . blocks. ====
+*/
+
+ kzs = j4 - j2 - (ns + 1);
+ knz = ns + 1;
+
+/* ==== Horizontal multiply ==== */
+
+ i__4 = jbot;
+ i__3 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__3 < 0 ? jcol >= i__4 :
+ jcol <= i__4; jcol += i__3) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy bottom of H to top+KZS of scratch ====
+ (The first KZS rows get multiplied by zero.) ====
+*/
+
+ dlacpy_("ALL", &knz, &jlen, &h__[incol + 1 + j2 + jcol *
+ h_dim1], ldh, &wh[kzs + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ dlaset_("ALL", &kzs, &jlen, &c_b29, &c_b29, &wh[wh_offset]
+ , ldwh);
+ dtrmm_("L", "U", "C", "N", &knz, &jlen, &c_b15, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wh[kzs + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Multiply top of H by U11' ==== */
+
+ dgemm_("C", "N", &i2, &jlen, &j2, &c_b15, &u[u_offset],
+ ldu, &h__[incol + 1 + jcol * h_dim1], ldh, &c_b15,
+ &wh[wh_offset], ldwh);
+
+/* ==== Copy top of H to bottom of WH ==== */
+
+ dlacpy_("ALL", &j2, &jlen, &h__[incol + 1 + jcol * h_dim1]
+ , ldh, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ dtrmm_("L", "L", "C", "N", &j2, &jlen, &c_b15, &u[(i2 + 1)
+ * u_dim1 + 1], ldu, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ dgemm_("C", "N", &i__5, &jlen, &i__7, &c_b15, &u[j2 + 1 +
+ (i2 + 1) * u_dim1], ldu, &h__[incol + 1 + j2 +
+ jcol * h_dim1], ldh, &c_b15, &wh[i2 + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Copy it back ==== */
+
+ dlacpy_("ALL", &kdu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + 1 + jcol * h_dim1], ldh);
+/* L190: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__3 = max(incol,*ktop) - 1;
+ i__4 = *nv;
+ for (jrow = jtop; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(incol,*ktop) - jrow;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of H to scratch (the first KZS
+ . columns get multiplied by zero) ====
+*/
+
+ dlacpy_("ALL", &jlen, &knz, &h__[jrow + (incol + 1 + j2) *
+ h_dim1], ldh, &wv[(kzs + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ dlaset_("ALL", &jlen, &kzs, &c_b29, &c_b29, &wv[wv_offset]
+ , ldwv);
+ dtrmm_("R", "U", "N", "N", &jlen, &knz, &c_b15, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ dgemm_("N", "N", &jlen, &i2, &j2, &c_b15, &h__[jrow + (
+ incol + 1) * h_dim1], ldh, &u[u_offset], ldu, &
+ c_b15, &wv[wv_offset], ldwv)
+ ;
+
+/* ==== Copy left of H to right of scratch ==== */
+
+ dlacpy_("ALL", &jlen, &j2, &h__[jrow + (incol + 1) *
+ h_dim1], ldh, &wv[(i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ dtrmm_("R", "L", "N", "N", &jlen, &i__5, &c_b15, &u[(i2 +
+ 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) * wv_dim1 + 1]
+ , ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ dgemm_("N", "N", &jlen, &i__5, &i__7, &c_b15, &h__[jrow +
+ (incol + 1 + j2) * h_dim1], ldh, &u[j2 + 1 + (i2
+ + 1) * u_dim1], ldu, &c_b15, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Copy it back ==== */
+
+ dlacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + 1) * h_dim1], ldh);
+/* L200: */
+ }
+
+/* ==== Multiply Z (also vertical) ==== */
+
+ if (*wantz) {
+ i__4 = *ihiz;
+ i__3 = *nv;
+ for (jrow = *iloz; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of Z to left of scratch (first
+ . KZS columns get multiplied by zero) ====
+*/
+
+ dlacpy_("ALL", &jlen, &knz, &z__[jrow + (incol + 1 +
+ j2) * z_dim1], ldz, &wv[(kzs + 1) * wv_dim1 +
+ 1], ldwv);
+
+/* ==== Multiply by U12 ==== */
+
+ dlaset_("ALL", &jlen, &kzs, &c_b29, &c_b29, &wv[
+ wv_offset], ldwv);
+ dtrmm_("R", "U", "N", "N", &jlen, &knz, &c_b15, &u[j2
+ + 1 + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1)
+ * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ dgemm_("N", "N", &jlen, &i2, &j2, &c_b15, &z__[jrow +
+ (incol + 1) * z_dim1], ldz, &u[u_offset], ldu,
+ &c_b15, &wv[wv_offset], ldwv);
+
+/* ==== Copy left of Z to right of scratch ==== */
+
+ dlacpy_("ALL", &jlen, &j2, &z__[jrow + (incol + 1) *
+ z_dim1], ldz, &wv[(i2 + 1) * wv_dim1 + 1],
+ ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ dtrmm_("R", "L", "N", "N", &jlen, &i__5, &c_b15, &u[(
+ i2 + 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ dgemm_("N", "N", &jlen, &i__5, &i__7, &c_b15, &z__[
+ jrow + (incol + 1 + j2) * z_dim1], ldz, &u[j2
+ + 1 + (i2 + 1) * u_dim1], ldu, &c_b15, &wv[(
+ i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Copy the result back to Z ==== */
+
+ dlacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &
+ z__[jrow + (incol + 1) * z_dim1], ldz);
+/* L210: */
+ }
+ }
+ }
+ }
+/* L220: */
+ }
+
+/* ==== End of DLAQR5 ==== */
+
+ return 0;
+} /* dlaqr5_ */
+
+/* Subroutine */ int dlarf_(char *side, integer *m, integer *n, doublereal *v,
+ integer *incv, doublereal *tau, doublereal *c__, integer *ldc,
+ doublereal *work)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__;
+ static logical applyleft;
+ extern /* Subroutine */ int dger_(integer *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *);
+ static integer lastc, lastv;
+ extern integer iladlc_(integer *, integer *, doublereal *, integer *),
+ iladlr_(integer *, integer *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLARF applies a real elementary reflector H to a real m by n matrix
+ C, from either the left or the right. H is represented in the form
+
+ H = I - tau * v * v'
+
+ where tau is a real scalar and v is a real vector.
+
+ If tau = 0, then H is taken to be the unit matrix.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': form H * C
+ = 'R': form C * H
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ V (input) DOUBLE PRECISION array, dimension
+ (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+ or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+ The vector v in the representation of H. V is not used if
+ TAU = 0.
+
+ INCV (input) INTEGER
+ The increment between elements of v. INCV <> 0.
+
+ TAU (input) DOUBLE PRECISION
+ The value tau in the representation of H.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+ or C * H if SIDE = 'R'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension
+ (N) if SIDE = 'L'
+ or (M) if SIDE = 'R'
+
+ =====================================================================
+*/
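+
+/*
+   In outline, for SIDE = 'L' the code below computes
+
+      w(1:lastc) := C(1:lastv,1:lastc)' * v(1:lastv)        (DGEMV)
+      C(1:lastv,1:lastc) := C(...) - tau * v * w'           (DGER)
+
+   where LASTV and LASTC skip trailing zeros in v and in C so that the
+   BLAS calls only touch the part of C that can actually change;
+   SIDE = 'R' is the mirror image.
+*/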
+
+
+ /* Parameter adjustments */
+ --v;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ applyleft = lsame_(side, "L");
+ lastv = 0;
+ lastc = 0;
+ if (*tau != 0.) {
+/*
+ Set up variables for scanning V. LASTV begins pointing to the end
+ of V.
+*/
+ if (applyleft) {
+ lastv = *m;
+ } else {
+ lastv = *n;
+ }
+ if (*incv > 0) {
+ i__ = (lastv - 1) * *incv + 1;
+ } else {
+ i__ = 1;
+ }
+/* Look for the last non-zero row in V. */
+ while(lastv > 0 && v[i__] == 0.) {
+ --lastv;
+ i__ -= *incv;
+ }
+ if (applyleft) {
+/* Scan for the last non-zero column in C(1:lastv,:). */
+ lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+ } else {
+/* Scan for the last non-zero row in C(:,1:lastv). */
+ lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+ }
+ }
+/*
+ Note that lastc.eq.0 renders the BLAS operations null; no special
+ case is needed at this level.
+*/
+ if (applyleft) {
+
+/* Form H * C */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastv,1:lastc)' * v(1:lastv,1) */
+
+ dgemv_("Transpose", &lastv, &lastc, &c_b15, &c__[c_offset], ldc, &
+ v[1], incv, &c_b29, &work[1], &c__1);
+
+/* C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)' */
+
+ d__1 = -(*tau);
+ dger_(&lastv, &lastc, &d__1, &v[1], incv, &work[1], &c__1, &c__[
+ c_offset], ldc);
+ }
+ } else {
+
+/* Form C * H */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1) */
+
+ dgemv_("No transpose", &lastc, &lastv, &c_b15, &c__[c_offset],
+ ldc, &v[1], incv, &c_b29, &work[1], &c__1);
+
+/* C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)' */
+
+ d__1 = -(*tau);
+ dger_(&lastc, &lastv, &d__1, &work[1], &c__1, &v[1], incv, &c__[
+ c_offset], ldc);
+ }
+ }
+ return 0;
+
+/* End of DLARF */
+
+} /* dlarf_ */
+
+/* Subroutine */ int dlarfb_(char *side, char *trans, char *direct, char *
+ storev, integer *m, integer *n, integer *k, doublereal *v, integer *
+ ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc,
+ doublereal *work, integer *ldwork)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1,
+ work_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ extern logical lsame_(char *, char *);
+ static integer lastc;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *), dtrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer lastv;
+ extern integer iladlc_(integer *, integer *, doublereal *, integer *),
+ iladlr_(integer *, integer *, doublereal *, integer *);
+ static char transt[1];
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLARFB applies a real block reflector H or its transpose H' to a
+ real m by n matrix C, from either the left or the right.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply H or H' from the Left
+ = 'R': apply H or H' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply H (No transpose)
+ = 'T': apply H' (Transpose)
+
+ DIRECT (input) CHARACTER*1
+ Indicates how H is formed from a product of elementary
+ reflectors
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Indicates how the vectors which define the elementary
+ reflectors are stored:
+ = 'C': Columnwise
+ = 'R': Rowwise
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ K (input) INTEGER
+ The order of the matrix T (= the number of elementary
+ reflectors whose product defines the block reflector).
+
+ V (input) DOUBLE PRECISION array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,M) if STOREV = 'R' and SIDE = 'L'
+ (LDV,N) if STOREV = 'R' and SIDE = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+ if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+ if STOREV = 'R', LDV >= K.
+
+ T (input) DOUBLE PRECISION array, dimension (LDT,K)
+ The triangular k by k matrix T in the representation of the
+ block reflector.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by H*C or H'*C or C*H or C*H'.
+
+ LDC (input) INTEGER
+            The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (LDWORK,K)
+
+ LDWORK (input) INTEGER
+ The leading dimension of the array WORK.
+ If SIDE = 'L', LDWORK >= max(1,N);
+ if SIDE = 'R', LDWORK >= max(1,M).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
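+
+/*
+   In outline, each of the eight STOREV/DIRECT/SIDE branches below
+   forms the k-column work matrix W = C'*V (or C*V, or the variants
+   with V') using DCOPY for the triangular part, DTRMM for the
+   unit-triangular blocks of V and DGEMM for the rest, multiplies W
+   by T or T', and finally updates C := C - V*W' (or C - W*V').
+*/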
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ work_dim1 = *ldwork;
+ work_offset = 1 + work_dim1;
+ work -= work_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(trans, "N")) {
+ *(unsigned char *)transt = 'T';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ if (lsame_(storev, "C")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 ) (first K rows)
+ ( V2 )
+ where V1 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+/* L10: */
+ }
+
+/* W := W * V1 */
+
+ dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2 */
+
+ i__1 = lastv - *k;
+ dgemm_("Transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[*k + 1 + c_dim1], ldc, &v[*k + 1 +
+ v_dim1], ldv, &c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ dtrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - V2 * W' */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[*k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork, &c_b15, &c__[*k + 1 +
+ c_dim1], ldc);
+ }
+
+/* W := W * V1' */
+
+ dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L40: */
+ }
+
+/* W := W * V1 */
+
+ dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2 */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[*k +
+ 1 + v_dim1], ldv, &c_b15, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ dtrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2' */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "Transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[*k + 1 +
+ v_dim1], ldv, &c_b15, &c__[(*k + 1) * c_dim1 + 1],
+ ldc);
+ }
+
+/* W := W * V1' */
+
+ dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] -= work[i__ + j * work_dim1];
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 )
+ ( V2 ) (last K rows)
+ where V2 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+/* L70: */
+ }
+
+/* W := W * V2 */
+
+ dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1 */
+
+ i__1 = lastv - *k;
+ dgemm_("Transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ dtrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1 * W' */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[v_offset], ldv, &work[work_offset],
+ ldwork, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[lastv - *k + j + i__ * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[(*n - *k + j) * c_dim1 + 1], &c__1, &
+ work[j * work_dim1 + 1], &c__1);
+/* L100: */
+ }
+
+/* W := W * V2 */
+
+ dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1 */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ dtrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1' */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "Transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[v_offset],
+ ldv, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + (lastv - *k + j) * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L110: */
+ }
+/* L120: */
+ }
+ }
+ }
+
+ } else if (lsame_(storev, "R")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 V2 ) (V1: first K columns)
+ where V1 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+/* L130: */
+ }
+
+/* W := W * V1' */
+
+ dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2' */
+
+ i__1 = lastv - *k;
+ dgemm_("Transpose", "Transpose", &lastc, k, &i__1, &c_b15,
+ &c__[*k + 1 + c_dim1], ldc, &v[(*k + 1) * v_dim1
+ + 1], ldv, &c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ dtrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - V2' * W' */
+
+ i__1 = lastv - *k;
+ dgemm_("Transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[(*k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork, &c_b15, &c__[*k + 1 +
+ c_dim1], ldc);
+ }
+
+/* W := W * V1 */
+
+ dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];
+/* L140: */
+ }
+/* L150: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L160: */
+ }
+
+/* W := W * V1' */
+
+ dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2' */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "Transpose", &lastc, k, &i__1, &
+ c_b15, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[(*k +
+ 1) * v_dim1 + 1], ldv, &c_b15, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ dtrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2 */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[(*k + 1) *
+ v_dim1 + 1], ldv, &c_b15, &c__[(*k + 1) * c_dim1
+ + 1], ldc);
+ }
+
+/* W := W * V1 */
+
+ dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] -= work[i__ + j * work_dim1];
+/* L170: */
+ }
+/* L180: */
+ }
+
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 V2 ) (V2: last K columns)
+ where V2 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+/* L190: */
+ }
+
+/* W := W * V2' */
+
+ dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1' */
+
+ i__1 = lastv - *k;
+ dgemm_("Transpose", "Transpose", &lastc, k, &i__1, &c_b15,
+ &c__[c_offset], ldc, &v[v_offset], ldv, &c_b15, &
+ work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ dtrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1' * W' */
+
+ i__1 = lastv - *k;
+ dgemm_("Transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[v_offset], ldv, &work[work_offset],
+ ldwork, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[lastv - *k + j + i__ * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L200: */
+ }
+/* L210: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = iladlc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dcopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1,
+ &work[j * work_dim1 + 1], &c__1);
+/* L220: */
+ }
+
+/* W := W * V2' */
+
+ dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1' */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "Transpose", &lastc, k, &i__1, &
+ c_b15, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ dtrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1 */
+
+ i__1 = lastv - *k;
+ dgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[v_offset],
+ ldv, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + (lastv - *k + j) * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L230: */
+ }
+/* L240: */
+ }
+
+ }
+
+ }
+ }
+
+ return 0;
+
+/* End of DLARFB */
+
+} /* dlarfb_ */
+
+/* Subroutine */ int dlarfg_(integer *n, doublereal *alpha, doublereal *x,
+ integer *incx, doublereal *tau)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer j, knt;
+ static doublereal beta;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ static doublereal xnorm;
+
+ static doublereal safmin, rsafmn;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLARFG generates a real elementary reflector H of order n, such
+ that
+
+ H * ( alpha ) = ( beta ), H' * H = I.
+ ( x ) ( 0 )
+
+ where alpha and beta are scalars, and x is an (n-1)-element real
+ vector. H is represented in the form
+
+ H = I - tau * ( 1 ) * ( 1 v' ) ,
+ ( v )
+
+ where tau is a real scalar and v is a real (n-1)-element
+ vector.
+
+ If the elements of x are all zero, then tau = 0 and H is taken to be
+ the unit matrix.
+
+ Otherwise 1 <= tau <= 2.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the elementary reflector.
+
+ ALPHA (input/output) DOUBLE PRECISION
+ On entry, the value alpha.
+ On exit, it is overwritten with the value beta.
+
+ X (input/output) DOUBLE PRECISION array, dimension
+ (1+(N-2)*abs(INCX))
+ On entry, the vector x.
+ On exit, it is overwritten with the vector v.
+
+ INCX (input) INTEGER
+ The increment between elements of X. INCX > 0.
+
+ TAU (output) DOUBLE PRECISION
+ The value tau.
+
+ =====================================================================
+*/
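+
+/*
+   A small worked example (for illustration only): with n = 3,
+   incx = 1, alpha = 3 and x = (4, 0)', the code below computes
+
+      xnorm = 4,   beta = -sign(3) * sqrt(3^2 + 4^2) = -5,
+      tau   = (beta - alpha) / beta = 8/5,
+      v     = x / (alpha - beta) = (1/2, 0)',
+
+   and H = I - tau * (1, v')' * (1, v') maps (3, 4, 0)' to
+   (beta, 0, 0)' = (-5, 0, 0)'.
+*/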
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n <= 1) {
+ *tau = 0.;
+ return 0;
+ }
+
+ i__1 = *n - 1;
+ xnorm = dnrm2_(&i__1, &x[1], incx);
+
+ if (xnorm == 0.) {
+
+/* H = I */
+
+ *tau = 0.;
+ } else {
+
+/* general case */
+
+ d__1 = dlapy2_(alpha, &xnorm);
+ beta = -d_sign(&d__1, alpha);
+ safmin = SAFEMINIMUM / EPSILON;
+ knt = 0;
+ if (abs(beta) < safmin) {
+
+/* XNORM, BETA may be inaccurate; scale X and recompute them */
+
+ rsafmn = 1. / safmin;
+L10:
+ ++knt;
+ i__1 = *n - 1;
+ dscal_(&i__1, &rsafmn, &x[1], incx);
+ beta *= rsafmn;
+ *alpha *= rsafmn;
+ if (abs(beta) < safmin) {
+ goto L10;
+ }
+
+/* New BETA is at most 1, at least SAFMIN */
+
+ i__1 = *n - 1;
+ xnorm = dnrm2_(&i__1, &x[1], incx);
+ d__1 = dlapy2_(alpha, &xnorm);
+ beta = -d_sign(&d__1, alpha);
+ }
+ *tau = (beta - *alpha) / beta;
+ i__1 = *n - 1;
+ d__1 = 1. / (*alpha - beta);
+ dscal_(&i__1, &d__1, &x[1], incx);
+
+/* If ALPHA is subnormal, it may lose relative accuracy */
+
+ i__1 = knt;
+ for (j = 1; j <= i__1; ++j) {
+ beta *= safmin;
+/* L20: */
+ }
+ *alpha = beta;
+ }
+
+ return 0;
+
+/* End of DLARFG */
+
+} /* dlarfg_ */
+
+/* Subroutine */ int dlarft_(char *direct, char *storev, integer *n, integer *
+ k, doublereal *v, integer *ldv, doublereal *tau, doublereal *t,
+ integer *ldt)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2, i__3;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, prevlastv;
+ static doublereal vii;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *);
+ static integer lastv;
+ extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *,
+ doublereal *, integer *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLARFT forms the triangular factor T of a real block reflector H
+ of order n, which is defined as a product of k elementary reflectors.
+
+ If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+
+ If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+
+ If STOREV = 'C', the vector which defines the elementary reflector
+ H(i) is stored in the i-th column of the array V, and
+
+ H = I - V * T * V'
+
+ If STOREV = 'R', the vector which defines the elementary reflector
+ H(i) is stored in the i-th row of the array V, and
+
+ H = I - V' * T * V
+
+ Arguments
+ =========
+
+ DIRECT (input) CHARACTER*1
+ Specifies the order in which the elementary reflectors are
+ multiplied to form the block reflector:
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Specifies how the vectors which define the elementary
+ reflectors are stored (see also Further Details):
+ = 'C': columnwise
+ = 'R': rowwise
+
+ N (input) INTEGER
+ The order of the block reflector H. N >= 0.
+
+ K (input) INTEGER
+ The order of the triangular factor T (= the number of
+ elementary reflectors). K >= 1.
+
+ V (input/output) DOUBLE PRECISION array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,N) if STOREV = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i).
+
+ T (output) DOUBLE PRECISION array, dimension (LDT,K)
+ The k by k triangular factor T of the block reflector.
+ If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+ lower triangular. The rest of the array is not used.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ Further Details
+ ===============
+
+ The shape of the matrix V and the storage of the vectors which define
+ the H(i) is best illustrated by the following example with n = 5 and
+ k = 3. The elements equal to 1 are not stored; the corresponding
+ array elements are modified but restored on exit. The rest of the
+ array is not used.
+
+ DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+
+ V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+ ( v1 1 ) ( 1 v2 v2 v2 )
+ ( v1 v2 1 ) ( 1 v3 v3 )
+ ( v1 v2 v3 )
+ ( v1 v2 v3 )
+
+ DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+
+ V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+ ( v1 v2 v3 ) ( v2 v2 v2 1 )
+ ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+ ( 1 v3 )
+ ( 1 )
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
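+
+/*
+   In outline, for DIRECT = 'F' and STOREV = 'C' the loop below builds
+   T one column at a time:
+
+      T(1:i-1,i) := -tau(i) * V(:,1:i-1)' * V(:,i)        (DGEMV)
+      T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i)           (DTRMV)
+      T(i,i)     := tau(i)
+
+   restricted to the nonzero rows of V; STOREV = 'R' uses the
+   transposed products and DIRECT = 'B' builds the lower triangular T
+   from the last reflector backwards.
+*/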
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ --tau;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (lsame_(direct, "F")) {
+ prevlastv = *n;
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ prevlastv = max(i__,prevlastv);
+ if (tau[i__] == 0.) {
+
+/* H(i) = I */
+
+ i__2 = i__;
+ for (j = 1; j <= i__2; ++j) {
+ t[j + i__ * t_dim1] = 0.;
+/* L10: */
+ }
+ } else {
+
+/* general case */
+
+ vii = v[i__ + i__ * v_dim1];
+ v[i__ + i__ * v_dim1] = 1.;
+ if (lsame_(storev, "C")) {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ if (v[lastv + i__ * v_dim1] != 0.) {
+ goto L15;
+ }
+ }
+L15:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)' * V(i:j,i) */
+
+ i__2 = j - i__ + 1;
+ i__3 = i__ - 1;
+ d__1 = -tau[i__];
+ dgemv_("Transpose", &i__2, &i__3, &d__1, &v[i__ + v_dim1],
+ ldv, &v[i__ + i__ * v_dim1], &c__1, &c_b29, &t[
+ i__ * t_dim1 + 1], &c__1);
+ } else {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ if (v[i__ + lastv * v_dim1] != 0.) {
+ goto L16;
+ }
+ }
+L16:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)' */
+
+ i__2 = i__ - 1;
+ i__3 = j - i__ + 1;
+ d__1 = -tau[i__];
+ dgemv_("No transpose", &i__2, &i__3, &d__1, &v[i__ *
+ v_dim1 + 1], ldv, &v[i__ + i__ * v_dim1], ldv, &
+ c_b29, &t[i__ * t_dim1 + 1], &c__1);
+ }
+ v[i__ + i__ * v_dim1] = vii;
+
+/* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ dtrmv_("Upper", "No transpose", "Non-unit", &i__2, &t[
+ t_offset], ldt, &t[i__ * t_dim1 + 1], &c__1);
+ t[i__ + i__ * t_dim1] = tau[i__];
+ if (i__ > 1) {
+ prevlastv = max(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+/* L20: */
+ }
+ } else {
+ prevlastv = 1;
+ for (i__ = *k; i__ >= 1; --i__) {
+ if (tau[i__] == 0.) {
+
+/* H(i) = I */
+
+ i__1 = *k;
+ for (j = i__; j <= i__1; ++j) {
+ t[j + i__ * t_dim1] = 0.;
+/* L30: */
+ }
+ } else {
+
+/* general case */
+
+ if (i__ < *k) {
+ if (lsame_(storev, "C")) {
+ vii = v[*n - *k + i__ + i__ * v_dim1];
+ v[*n - *k + i__ + i__ * v_dim1] = 1.;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ if (v[lastv + i__ * v_dim1] != 0.) {
+ goto L35;
+ }
+ }
+L35:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(j:n-k+i,i+1:k)' * V(j:n-k+i,i)
+*/
+
+ i__1 = *n - *k + i__ - j + 1;
+ i__2 = *k - i__;
+ d__1 = -tau[i__];
+ dgemv_("Transpose", &i__1, &i__2, &d__1, &v[j + (i__
+ + 1) * v_dim1], ldv, &v[j + i__ * v_dim1], &
+ c__1, &c_b29, &t[i__ + 1 + i__ * t_dim1], &
+ c__1);
+ v[*n - *k + i__ + i__ * v_dim1] = vii;
+ } else {
+ vii = v[i__ + (*n - *k + i__) * v_dim1];
+ v[i__ + (*n - *k + i__) * v_dim1] = 1.;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ if (v[i__ + lastv * v_dim1] != 0.) {
+ goto L36;
+ }
+ }
+L36:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)'
+*/
+
+ i__1 = *k - i__;
+ i__2 = *n - *k + i__ - j + 1;
+ d__1 = -tau[i__];
+ dgemv_("No transpose", &i__1, &i__2, &d__1, &v[i__ +
+ 1 + j * v_dim1], ldv, &v[i__ + j * v_dim1],
+ ldv, &c_b29, &t[i__ + 1 + i__ * t_dim1], &
+ c__1);
+ v[i__ + (*n - *k + i__) * v_dim1] = vii;
+ }
+
+/* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) */
+
+ i__1 = *k - i__;
+ dtrmv_("Lower", "No transpose", "Non-unit", &i__1, &t[i__
+ + 1 + (i__ + 1) * t_dim1], ldt, &t[i__ + 1 + i__ *
+ t_dim1], &c__1)
+ ;
+ if (i__ > 1) {
+ prevlastv = min(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+ t[i__ + i__ * t_dim1] = tau[i__];
+ }
+/* L40: */
+ }
+ }
+ return 0;
+
+/* End of DLARFT */
+
+} /* dlarft_ */
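+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): the forward/columnwise recurrence that
+    DLARFT implements above with DGEMV and DTRMV, written out naively.
+    It assumes V is n-by-k, column-major with leading dimension n, and that
+    the unit diagonal and the zeros above it are stored explicitly, so no
+    trailing-zero bookkeeping (lastv/prevlastv) is needed.  T is k-by-k,
+    column-major with leading dimension k; only its upper triangle is set.
+*/
+static void naive_larft_forward_columnwise(int n, int k, const double *v,
+                                           const double *tau, double *t)
+{
+    int i, j, l;
+
+    for (i = 0; i < k; ++i) {
+        if (tau[i] == 0.) {
+            /* H(i) = I: the i-th column of T is zero. */
+            for (j = 0; j <= i; ++j)
+                t[j + i * k] = 0.;
+            continue;
+        }
+        /* T(0:i-1,i) := -tau(i) * V(:,0:i-1)' * V(:,i) */
+        for (j = 0; j < i; ++j) {
+            double s = 0.;
+            for (l = 0; l < n; ++l)
+                s += v[l + j * n] * v[l + i * n];
+            t[j + i * k] = -tau[i] * s;
+        }
+        /* T(0:i-1,i) := T(0:i-1,0:i-1) * T(0:i-1,i), T upper triangular */
+        for (j = 0; j < i; ++j) {
+            double s = 0.;
+            for (l = j; l < i; ++l)
+                s += t[j + l * k] * t[l + i * k];
+            t[j + i * k] = s;
+        }
+        t[i + i * k] = tau[i];
+    }
+}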
+
+/* Subroutine */ int dlarfx_(char *side, integer *m, integer *n, doublereal *
+ v, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, i__1;
+
+ /* Local variables */
+ static integer j;
+ static doublereal t1, t2, t3, t4, t5, t6, t7, t8, t9, v1, v2, v3, v4, v5,
+ v6, v7, v8, v9, t10, v10, sum;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *);
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLARFX applies a real elementary reflector H to a real m by n
+ matrix C, from either the left or the right. H is represented in the
+ form
+
+ H = I - tau * v * v'
+
+ where tau is a real scalar and v is a real vector.
+
+ If tau = 0, then H is taken to be the unit matrix.
+
+ This version uses inline code if H has order < 11.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': form H * C
+ = 'R': form C * H
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ V (input) DOUBLE PRECISION array, dimension (M) if SIDE = 'L'
+ or (N) if SIDE = 'R'
+ The vector v in the representation of H.
+
+ TAU (input) DOUBLE PRECISION
+ The value tau in the representation of H.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+ or C * H if SIDE = 'R'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension
+ (N) if SIDE = 'L'
+ or (M) if SIDE = 'R'
+ WORK is not referenced if H has order < 11.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --v;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ if (*tau == 0.) {
+ return 0;
+ }
+ if (lsame_(side, "L")) {
+
+/* Form H * C, where H has order m. */
+
+ switch (*m) {
+ case 1: goto L10;
+ case 2: goto L30;
+ case 3: goto L50;
+ case 4: goto L70;
+ case 5: goto L90;
+ case 6: goto L110;
+ case 7: goto L130;
+ case 8: goto L150;
+ case 9: goto L170;
+ case 10: goto L190;
+ }
+
+/* Code for general M */
+
+ dlarf_(side, m, n, &v[1], &c__1, tau, &c__[c_offset], ldc, &work[1]);
+ goto L410;
+L10:
+
+/* Special code for 1 x 1 Householder */
+
+ t1 = 1. - *tau * v[1] * v[1];
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ c__[j * c_dim1 + 1] = t1 * c__[j * c_dim1 + 1];
+/* L20: */
+ }
+ goto L410;
+L30:
+
+/* Special code for 2 x 2 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+/* L40: */
+ }
+ goto L410;
+L50:
+
+/* Special code for 3 x 3 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+/* L60: */
+ }
+ goto L410;
+L70:
+
+/* Special code for 4 x 4 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+/* L80: */
+ }
+ goto L410;
+L90:
+
+/* Special code for 5 x 5 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+/* L100: */
+ }
+ goto L410;
+L110:
+
+/* Special code for 6 x 6 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+/* L120: */
+ }
+ goto L410;
+L130:
+
+/* Special code for 7 x 7 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+/* L140: */
+ }
+ goto L410;
+L150:
+
+/* Special code for 8 x 8 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7] + v8 * c__[j * c_dim1 + 8];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+ c__[j * c_dim1 + 8] -= sum * t8;
+/* L160: */
+ }
+ goto L410;
+L170:
+
+/* Special code for 9 x 9 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7] + v8 * c__[j * c_dim1 + 8] + v9 * c__[j *
+ c_dim1 + 9];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+ c__[j * c_dim1 + 8] -= sum * t8;
+ c__[j * c_dim1 + 9] -= sum * t9;
+/* L180: */
+ }
+ goto L410;
+L190:
+
+/* Special code for 10 x 10 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ v10 = v[10];
+ t10 = *tau * v10;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7] + v8 * c__[j * c_dim1 + 8] + v9 * c__[j *
+ c_dim1 + 9] + v10 * c__[j * c_dim1 + 10];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+ c__[j * c_dim1 + 8] -= sum * t8;
+ c__[j * c_dim1 + 9] -= sum * t9;
+ c__[j * c_dim1 + 10] -= sum * t10;
+/* L200: */
+ }
+ goto L410;
+ } else {
+
+/* Form C * H, where H has order n. */
+
+ switch (*n) {
+ case 1: goto L210;
+ case 2: goto L230;
+ case 3: goto L250;
+ case 4: goto L270;
+ case 5: goto L290;
+ case 6: goto L310;
+ case 7: goto L330;
+ case 8: goto L350;
+ case 9: goto L370;
+ case 10: goto L390;
+ }
+
+/* Code for general N */
+
+ dlarf_(side, m, n, &v[1], &c__1, tau, &c__[c_offset], ldc, &work[1]);
+ goto L410;
+L210:
+
+/* Special code for 1 x 1 Householder */
+
+ t1 = 1. - *tau * v[1] * v[1];
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ c__[j + c_dim1] = t1 * c__[j + c_dim1];
+/* L220: */
+ }
+ goto L410;
+L230:
+
+/* Special code for 2 x 2 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+/* L240: */
+ }
+ goto L410;
+L250:
+
+/* Special code for 3 x 3 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+/* L260: */
+ }
+ goto L410;
+L270:
+
+/* Special code for 4 x 4 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+/* L280: */
+ }
+ goto L410;
+L290:
+
+/* Special code for 5 x 5 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+/* L300: */
+ }
+ goto L410;
+L310:
+
+/* Special code for 6 x 6 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+/* L320: */
+ }
+ goto L410;
+L330:
+
+/* Special code for 7 x 7 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+/* L340: */
+ }
+ goto L410;
+L350:
+
+/* Special code for 8 x 8 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7] + v8 * c__[j + (c_dim1 << 3)];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+ c__[j + (c_dim1 << 3)] -= sum * t8;
+/* L360: */
+ }
+ goto L410;
+L370:
+
+/* Special code for 9 x 9 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7] + v8 * c__[j + (c_dim1 << 3)] + v9 * c__[
+ j + c_dim1 * 9];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+ c__[j + (c_dim1 << 3)] -= sum * t8;
+ c__[j + c_dim1 * 9] -= sum * t9;
+/* L380: */
+ }
+ goto L410;
+L390:
+
+/* Special code for 10 x 10 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ v10 = v[10];
+ t10 = *tau * v10;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7] + v8 * c__[j + (c_dim1 << 3)] + v9 * c__[
+ j + c_dim1 * 9] + v10 * c__[j + c_dim1 * 10];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+ c__[j + (c_dim1 << 3)] -= sum * t8;
+ c__[j + c_dim1 * 9] -= sum * t9;
+ c__[j + c_dim1 * 10] -= sum * t10;
+/* L400: */
+ }
+ goto L410;
+ }
+L410:
+ return 0;
+
+/* End of DLARFX */
+
+} /* dlarfx_ */
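+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): what "form H * C" means on the
+    general-M path of DLARFX above.  With H = I - tau*v*v', the update is
+    C := C - v * (tau * (v'*C)), applied one column of C at a time, which
+    is exactly the pattern the unrolled special cases follow.  C is m-by-n,
+    column-major with leading dimension ldc; v has length m.
+*/
+static void naive_apply_reflector_left(int m, int n, const double *v,
+                                       double tau, double *c, int ldc)
+{
+    int i, j;
+
+    if (tau == 0.)                 /* H = I, nothing to do */
+        return;
+    for (j = 0; j < n; ++j) {
+        double *cj = c + j * ldc;  /* j-th column of C */
+        double sum = 0.;
+
+        for (i = 0; i < m; ++i)    /* sum = v' * C(:,j) */
+            sum += v[i] * cj[i];
+        for (i = 0; i < m; ++i)    /* C(:,j) -= tau * sum * v */
+            cj[i] -= tau * sum * v[i];
+    }
+}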
+
+/* Subroutine */ int dlartg_(doublereal *f, doublereal *g, doublereal *cs,
+ doublereal *sn, doublereal *r__)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal f1, g1, eps, scale;
+ static integer count;
+ static doublereal safmn2, safmx2;
+
+ static doublereal safmin;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLARTG generates a plane rotation so that
+
+ [ CS SN ] . [ F ] = [ R ] where CS**2 + SN**2 = 1.
+ [ -SN CS ] [ G ] [ 0 ]
+
+ This is a slower, more accurate version of the BLAS1 routine DROTG,
+ with the following other differences:
+ F and G are unchanged on return.
+ If G=0, then CS=1 and SN=0.
+ If F=0 and (G .ne. 0), then CS=0 and SN=1 without doing any
+ floating point operations (saves work in DBDSQR when
+ there are zeros on the diagonal).
+
+ If F exceeds G in magnitude, CS will be positive.
+
+ Arguments
+ =========
+
+ F (input) DOUBLE PRECISION
+ The first component of vector to be rotated.
+
+ G (input) DOUBLE PRECISION
+ The second component of vector to be rotated.
+
+ CS (output) DOUBLE PRECISION
+ The cosine of the rotation.
+
+ SN (output) DOUBLE PRECISION
+ The sine of the rotation.
+
+ R (output) DOUBLE PRECISION
+ The nonzero component of the rotated vector.
+
+ This version has a few statements commented out for thread safety
+ (machine parameters are computed on each entry). 10 feb 03, SJH.
+
+ =====================================================================
+
+ LOGICAL FIRST
+ SAVE FIRST, SAFMX2, SAFMIN, SAFMN2
+ DATA FIRST / .TRUE. /
+
+ IF( FIRST ) THEN
+*/
+ safmin = SAFEMINIMUM;
+ eps = EPSILON;
+ d__1 = BASE;
+ i__1 = (integer) (log(safmin / eps) / log(BASE) / 2.);
+ safmn2 = pow_di(&d__1, &i__1);
+ safmx2 = 1. / safmn2;
+/*
+ FIRST = .FALSE.
+ END IF
+*/
+ if (*g == 0.) {
+ *cs = 1.;
+ *sn = 0.;
+ *r__ = *f;
+ } else if (*f == 0.) {
+ *cs = 0.;
+ *sn = 1.;
+ *r__ = *g;
+ } else {
+ f1 = *f;
+ g1 = *g;
+/* Computing MAX */
+ d__1 = abs(f1), d__2 = abs(g1);
+ scale = max(d__1,d__2);
+ if (scale >= safmx2) {
+ count = 0;
+L10:
+ ++count;
+ f1 *= safmn2;
+ g1 *= safmn2;
+/* Computing MAX */
+ d__1 = abs(f1), d__2 = abs(g1);
+ scale = max(d__1,d__2);
+ if (scale >= safmx2) {
+ goto L10;
+ }
+/* Computing 2nd power */
+ d__1 = f1;
+/* Computing 2nd power */
+ d__2 = g1;
+ *r__ = sqrt(d__1 * d__1 + d__2 * d__2);
+ *cs = f1 / *r__;
+ *sn = g1 / *r__;
+ i__1 = count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ *r__ *= safmx2;
+/* L20: */
+ }
+ } else if (scale <= safmn2) {
+ count = 0;
+L30:
+ ++count;
+ f1 *= safmx2;
+ g1 *= safmx2;
+/* Computing MAX */
+ d__1 = abs(f1), d__2 = abs(g1);
+ scale = max(d__1,d__2);
+ if (scale <= safmn2) {
+ goto L30;
+ }
+/* Computing 2nd power */
+ d__1 = f1;
+/* Computing 2nd power */
+ d__2 = g1;
+ *r__ = sqrt(d__1 * d__1 + d__2 * d__2);
+ *cs = f1 / *r__;
+ *sn = g1 / *r__;
+ i__1 = count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ *r__ *= safmn2;
+/* L40: */
+ }
+ } else {
+/* Computing 2nd power */
+ d__1 = f1;
+/* Computing 2nd power */
+ d__2 = g1;
+ *r__ = sqrt(d__1 * d__1 + d__2 * d__2);
+ *cs = f1 / *r__;
+ *sn = g1 / *r__;
+ }
+ if (abs(*f) > abs(*g) && *cs < 0.) {
+ *cs = -(*cs);
+ *sn = -(*sn);
+ *r__ = -(*r__);
+ }
+ }
+ return 0;
+
+/* End of DLARTG */
+
+} /* dlartg_ */
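+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): the textbook form of the rotation
+    DLARTG returns, r = sqrt(f*f + g*g), cs = f/r, sn = g/r, without the
+    safmn2/safmx2 rescaling loops above that protect the intermediate
+    squares from overflow and underflow.  sqrt() and fabs() are assumed to
+    be available from <math.h>, as elsewhere in this translation unit.
+*/
+static void naive_plane_rotation(double f, double g,
+                                 double *cs, double *sn, double *r)
+{
+    if (g == 0.) {            /* nothing to rotate */
+        *cs = 1.;
+        *sn = 0.;
+        *r = f;
+    } else if (f == 0.) {     /* pure swap of the two components */
+        *cs = 0.;
+        *sn = 1.;
+        *r = g;
+    } else {
+        *r = sqrt(f * f + g * g);
+        *cs = f / *r;
+        *sn = g / *r;
+        if (fabs(f) > fabs(g) && *cs < 0.) {
+            /* Keep DLARTG's sign convention: CS > 0 when |F| > |G|. */
+            *cs = -(*cs);
+            *sn = -(*sn);
+            *r = -(*r);
+        }
+    }
+}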
+
+/* Subroutine */ int dlas2_(doublereal *f, doublereal *g, doublereal *h__,
+ doublereal *ssmin, doublereal *ssmax)
+{
+ /* System generated locals */
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal c__, fa, ga, ha, as, at, au, fhmn, fhmx;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAS2 computes the singular values of the 2-by-2 matrix
+ [ F G ]
+ [ 0 H ].
+ On return, SSMIN is the smaller singular value and SSMAX is the
+ larger singular value.
+
+ Arguments
+ =========
+
+ F (input) DOUBLE PRECISION
+ The (1,1) element of the 2-by-2 matrix.
+
+ G (input) DOUBLE PRECISION
+ The (1,2) element of the 2-by-2 matrix.
+
+ H (input) DOUBLE PRECISION
+ The (2,2) element of the 2-by-2 matrix.
+
+ SSMIN (output) DOUBLE PRECISION
+ The smaller singular value.
+
+ SSMAX (output) DOUBLE PRECISION
+ The larger singular value.
+
+ Further Details
+ ===============
+
+ Barring over/underflow, all output quantities are correct to within
+ a few units in the last place (ulps), even in the absence of a guard
+ digit in addition/subtraction.
+
+ In IEEE arithmetic, the code works correctly if one matrix element is
+ infinite.
+
+ Overflow will not occur unless the largest singular value itself
+ overflows, or is within a few ulps of overflow. (On machines with
+ partial overflow, like the Cray, overflow may occur if the largest
+ singular value is within a factor of 2 of overflow.)
+
+ Underflow is harmless if underflow is gradual. Otherwise, results
+ may correspond to a matrix modified by perturbations of size near
+ the underflow threshold.
+
+ ====================================================================
+*/
+
+
+ fa = abs(*f);
+ ga = abs(*g);
+ ha = abs(*h__);
+ fhmn = min(fa,ha);
+ fhmx = max(fa,ha);
+ if (fhmn == 0.) {
+ *ssmin = 0.;
+ if (fhmx == 0.) {
+ *ssmax = ga;
+ } else {
+/* Computing 2nd power */
+ d__1 = min(fhmx,ga) / max(fhmx,ga);
+ *ssmax = max(fhmx,ga) * sqrt(d__1 * d__1 + 1.);
+ }
+ } else {
+ if (ga < fhmx) {
+ as = fhmn / fhmx + 1.;
+ at = (fhmx - fhmn) / fhmx;
+/* Computing 2nd power */
+ d__1 = ga / fhmx;
+ au = d__1 * d__1;
+ c__ = 2. / (sqrt(as * as + au) + sqrt(at * at + au));
+ *ssmin = fhmn * c__;
+ *ssmax = fhmx / c__;
+ } else {
+ au = fhmx / ga;
+ if (au == 0.) {
+
+/*
+ Avoid possible harmful underflow if exponent range
+ asymmetric (true SSMIN may not underflow even if
+ AU underflows)
+*/
+
+ *ssmin = fhmn * fhmx / ga;
+ *ssmax = ga;
+ } else {
+ as = fhmn / fhmx + 1.;
+ at = (fhmx - fhmn) / fhmx;
+/* Computing 2nd power */
+ d__1 = as * au;
+/* Computing 2nd power */
+ d__2 = at * au;
+ c__ = 1. / (sqrt(d__1 * d__1 + 1.) + sqrt(d__2 * d__2 + 1.));
+ *ssmin = fhmn * c__ * au;
+ *ssmin += *ssmin;
+ *ssmax = ga / (c__ + c__);
+ }
+ }
+ }
+ return 0;
+
+/* End of DLAS2 */
+
+} /* dlas2_ */
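+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): the singular values DLAS2 returns can
+    also be read off the characteristic polynomial of B'*B for
+    B = [ f g ; 0 h ], since
+        ssmax^2 + ssmin^2 = f^2 + g^2 + h^2   and   ssmax * ssmin = |f*h|.
+    DLAS2 rearranges this to avoid forming the squares (and hence the
+    spurious over/underflow the Further Details above warn about).
+    sqrt() and fabs() are assumed to come from <math.h>.
+*/
+static void naive_2x2_singular_values(double f, double g, double h,
+                                      double *ssmin, double *ssmax)
+{
+    double trace = f * f + g * g + h * h;   /* trace(B'B) = ssmax^2 + ssmin^2 */
+    double prod = fabs(f * h);              /* sqrt(det(B'B)) = ssmax * ssmin */
+    double disc = trace * trace - 4. * prod * prod;
+    double smax2;
+
+    if (disc < 0.)                          /* guard against rounding below zero */
+        disc = 0.;
+    smax2 = (trace + sqrt(disc)) * .5;      /* larger root of x^2 - trace*x + prod^2 */
+    *ssmax = sqrt(smax2);
+    *ssmin = (*ssmax > 0.) ? prod / *ssmax : 0.;
+}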
+
+/* Subroutine */ int dlascl_(char *type__, integer *kl, integer *ku,
+ doublereal *cfrom, doublereal *cto, integer *m, integer *n,
+ doublereal *a, integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+
+ /* Local variables */
+ static integer i__, j, k1, k2, k3, k4;
+ static doublereal mul, cto1;
+ static logical done;
+ static doublereal ctoc;
+ extern logical lsame_(char *, char *);
+ static integer itype;
+ static doublereal cfrom1;
+
+ static doublereal cfromc;
+ extern logical disnan_(doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal bignum, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASCL multiplies the M by N real matrix A by the real scalar
+ CTO/CFROM. This is done without over/underflow as long as the final
+ result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+ A may be full, upper triangular, lower triangular, upper Hessenberg,
+ or banded.
+
+ Arguments
+ =========
+
+ TYPE (input) CHARACTER*1
+ TYPE indicates the storage type of the input matrix.
+ = 'G': A is a full matrix.
+ = 'L': A is a lower triangular matrix.
+ = 'U': A is an upper triangular matrix.
+ = 'H': A is an upper Hessenberg matrix.
+ = 'B': A is a symmetric band matrix with lower bandwidth KL
+ and upper bandwidth KU and with only the lower
+ half stored.
+ = 'Q': A is a symmetric band matrix with lower bandwidth KL
+ and upper bandwidth KU and with only the upper
+ half stored.
+ = 'Z': A is a band matrix with lower bandwidth KL and upper
+ bandwidth KU.
+
+ KL (input) INTEGER
+ The lower bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ KU (input) INTEGER
+ The upper bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ CFROM (input) DOUBLE PRECISION
+ CTO (input) DOUBLE PRECISION
+ The matrix A is multiplied by CTO/CFROM. A(I,J) is computed
+ without over/underflow if the final result CTO*A(I,J)/CFROM
+ can be represented without over/underflow. CFROM must be
+ nonzero.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ The matrix to be multiplied by CTO/CFROM. See TYPE for the
+ storage type.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ INFO (output) INTEGER
+ 0 - successful exit
+ <0 - if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(type__, "G")) {
+ itype = 0;
+ } else if (lsame_(type__, "L")) {
+ itype = 1;
+ } else if (lsame_(type__, "U")) {
+ itype = 2;
+ } else if (lsame_(type__, "H")) {
+ itype = 3;
+ } else if (lsame_(type__, "B")) {
+ itype = 4;
+ } else if (lsame_(type__, "Q")) {
+ itype = 5;
+ } else if (lsame_(type__, "Z")) {
+ itype = 6;
+ } else {
+ itype = -1;
+ }
+
+ if (itype == -1) {
+ *info = -1;
+ } else if (*cfrom == 0. || disnan_(cfrom)) {
+ *info = -4;
+ } else if (disnan_(cto)) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -6;
+ } else if (*n < 0 || itype == 4 && *n != *m || itype == 5 && *n != *m) {
+ *info = -7;
+ } else if (itype <= 3 && *lda < max(1,*m)) {
+ *info = -9;
+ } else if (itype >= 4) {
+/* Computing MAX */
+ i__1 = *m - 1;
+ if (*kl < 0 || *kl > max(i__1,0)) {
+ *info = -2;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *n - 1;
+ if (*ku < 0 || *ku > max(i__1,0) || (itype == 4 || itype == 5) &&
+ *kl != *ku) {
+ *info = -3;
+ } else if (itype == 4 && *lda < *kl + 1 || itype == 5 && *lda < *
+ ku + 1 || itype == 6 && *lda < (*kl << 1) + *ku + 1) {
+ *info = -9;
+ }
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASCL", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *m == 0) {
+ return 0;
+ }
+
+/* Get machine parameters */
+
+ smlnum = SAFEMINIMUM;
+ bignum = 1. / smlnum;
+
+ cfromc = *cfrom;
+ ctoc = *cto;
+
+L10:
+ cfrom1 = cfromc * smlnum;
+ if (cfrom1 == cfromc) {
+/*
+ CFROMC is an inf. Multiply by a correctly signed zero for
+ finite CTOC, or a NaN if CTOC is infinite.
+*/
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ cto1 = ctoc;
+ } else {
+ cto1 = ctoc / bignum;
+ if (cto1 == ctoc) {
+/*
+ CTOC is either 0 or an inf. In both cases, CTOC itself
+ serves as the correct multiplication factor.
+*/
+ mul = ctoc;
+ done = TRUE_;
+ cfromc = 1.;
+ } else if (abs(cfrom1) > abs(ctoc) && ctoc != 0.) {
+ mul = smlnum;
+ done = FALSE_;
+ cfromc = cfrom1;
+ } else if (abs(cto1) > abs(cfromc)) {
+ mul = bignum;
+ done = FALSE_;
+ ctoc = cto1;
+ } else {
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ }
+ }
+
+ if (itype == 0) {
+
+/* Full matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (itype == 1) {
+
+/* Lower triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L40: */
+ }
+/* L50: */
+ }
+
+ } else if (itype == 2) {
+
+/* Upper triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L60: */
+ }
+/* L70: */
+ }
+
+ } else if (itype == 3) {
+
+/* Upper Hessenberg matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j + 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (itype == 4) {
+
+/* Lower half of a symmetric band matrix */
+
+ k3 = *kl + 1;
+ k4 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = k3, i__4 = k4 - j;
+ i__2 = min(i__3,i__4);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L100: */
+ }
+/* L110: */
+ }
+
+ } else if (itype == 5) {
+
+/* Upper half of a symmetric band matrix */
+
+ k1 = *ku + 2;
+ k3 = *ku + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = k1 - j;
+ i__3 = k3;
+ for (i__ = max(i__2,1); i__ <= i__3; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L120: */
+ }
+/* L130: */
+ }
+
+ } else if (itype == 6) {
+
+/* Band matrix */
+
+ k1 = *kl + *ku + 2;
+ k2 = *kl + 1;
+ k3 = (*kl << 1) + *ku + 1;
+ k4 = *kl + *ku + 1 + *m;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__3 = k1 - j;
+/* Computing MIN */
+ i__4 = k3, i__5 = k4 - j;
+ i__2 = min(i__4,i__5);
+ for (i__ = max(i__3,k2); i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L140: */
+ }
+/* L150: */
+ }
+
+ }
+
+ if (! done) {
+ goto L10;
+ }
+
+ return 0;
+
+/* End of DLASCL */
+
+} /* dlascl_ */
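+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): the stepwise scaling DLASCL applies to
+    every stored entry, shown for a single value and without the inf/NaN
+    special cases handled above.  Instead of forming cto/cfrom directly,
+    which can over- or underflow even when x*cto/cfrom is representable,
+    the factor is applied in bounded steps of smlnum (the safe minimum)
+    or bignum (its reciprocal).  cfrom is assumed nonzero, as DLASCL
+    requires.  fabs() is assumed to come from <math.h>.
+*/
+static double naive_safe_scale(double x, double cfrom, double cto,
+                               double smlnum, double bignum)
+{
+    double cfromc = cfrom, ctoc = cto;
+    int done = 0;
+
+    while (!done) {
+        double cfrom1 = cfromc * smlnum;
+        double cto1 = ctoc / bignum;
+        double mul;
+
+        if (fabs(cfrom1) > fabs(ctoc) && ctoc != 0.) {
+            mul = smlnum;            /* shrink by a bounded step, keep going */
+            cfromc = cfrom1;
+        } else if (fabs(cto1) > fabs(cfromc)) {
+            mul = bignum;            /* grow by a bounded step, keep going   */
+            ctoc = cto1;
+        } else {
+            mul = ctoc / cfromc;     /* the remaining factor is now safe     */
+            done = 1;
+        }
+        x *= mul;
+    }
+    return x;
+}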
+
+/* Subroutine */ int dlasd0_(integer *n, integer *sqre, doublereal *d__,
+ doublereal *e, doublereal *u, integer *ldu, doublereal *vt, integer *
+ ldvt, integer *smlsiz, integer *iwork, doublereal *work, integer *
+ info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, vt_dim1, vt_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, m, i1, ic, lf, nd, ll, nl, nr, im1, ncc, nlf, nrf,
+ iwk, lvl, ndb1, nlp1, nrp1;
+ static doublereal beta;
+ static integer idxq, nlvl;
+ static doublereal alpha;
+ static integer inode, ndiml, idxqc, ndimr, itemp, sqrei;
+ extern /* Subroutine */ int dlasd1_(integer *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, integer *, integer *, doublereal *,
+ integer *), dlasdq_(char *, integer *, integer *, integer *,
+ integer *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlasdt_(integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *), xerbla_(
+ char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ Using a divide and conquer approach, DLASD0 computes the singular
+ value decomposition (SVD) of a real upper bidiagonal N-by-M
+ matrix B with diagonal D and offdiagonal E, where M = N + SQRE.
+ The algorithm computes orthogonal matrices U and VT such that
+ B = U * S * VT. The singular values S are overwritten on D.
+
+ A related subroutine, DLASDA, computes only the singular values,
+ and optionally, the singular vectors in compact form.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ On entry, the row dimension of the upper bidiagonal matrix.
+ This is also the dimension of the main diagonal array D.
+
+ SQRE (input) INTEGER
+ Specifies the column dimension of the bidiagonal matrix.
+ = 0: The bidiagonal matrix has column dimension M = N;
+ = 1: The bidiagonal matrix has column dimension M = N+1;
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry D contains the main diagonal of the bidiagonal
+ matrix.
+ On exit D, if INFO = 0, contains its singular values.
+
+ E (input) DOUBLE PRECISION array, dimension (M-1)
+ Contains the subdiagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ U (output) DOUBLE PRECISION array, dimension at least (LDQ, N)
+ On exit, U contains the left singular vectors.
+
+ LDU (input) INTEGER
+ On entry, leading dimension of U.
+
+ VT (output) DOUBLE PRECISION array, dimension at least (LDVT, M)
+ On exit, VT' contains the right singular vectors.
+
+ LDVT (input) INTEGER
+ On entry, leading dimension of VT.
+
+ SMLSIZ (input) INTEGER
+ On entry, maximum size of the subproblems at the
+ bottom of the computation tree.
+
+ IWORK (workspace) INTEGER work array.
+ Dimension must be at least (8 * N)
+
+ WORK (workspace) DOUBLE PRECISION work array.
+ Dimension must be at least (3 * M**2 + 2 * M)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --iwork;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -1;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -2;
+ }
+
+ m = *n + *sqre;
+
+ if (*ldu < *n) {
+ *info = -6;
+ } else if (*ldvt < m) {
+ *info = -8;
+ } else if (*smlsiz < 3) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD0", &i__1);
+ return 0;
+ }
+
+/* If the input matrix is too small, call DLASDQ to find the SVD. */
+
+ if (*n <= *smlsiz) {
+ dlasdq_("U", sqre, n, &m, n, &c__0, &d__[1], &e[1], &vt[vt_offset],
+ ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[1], info);
+ return 0;
+ }
+
+/* Set up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+ idxq = ndimr + *n;
+ iwk = idxq + *n;
+ dlasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ For the nodes on bottom level of the tree, solve
+ their subproblems by DLASDQ.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ ncc = 0;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nlp1 = nl + 1;
+ nr = iwork[ndimr + i1];
+ nrp1 = nr + 1;
+ nlf = ic - nl;
+ nrf = ic + 1;
+ sqrei = 1;
+ dlasdq_("U", &sqrei, &nl, &nlp1, &nl, &ncc, &d__[nlf], &e[nlf], &vt[
+ nlf + nlf * vt_dim1], ldvt, &u[nlf + nlf * u_dim1], ldu, &u[
+ nlf + nlf * u_dim1], ldu, &work[1], info);
+ if (*info != 0) {
+ return 0;
+ }
+ itemp = idxq + nlf - 2;
+ i__2 = nl;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[itemp + j] = j;
+/* L10: */
+ }
+ if (i__ == nd) {
+ sqrei = *sqre;
+ } else {
+ sqrei = 1;
+ }
+ nrp1 = nr + sqrei;
+ dlasdq_("U", &sqrei, &nr, &nrp1, &nr, &ncc, &d__[nrf], &e[nrf], &vt[
+ nrf + nrf * vt_dim1], ldvt, &u[nrf + nrf * u_dim1], ldu, &u[
+ nrf + nrf * u_dim1], ldu, &work[1], info);
+ if (*info != 0) {
+ return 0;
+ }
+ itemp = idxq + ic;
+ i__2 = nr;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[itemp + j - 1] = j;
+/* L20: */
+ }
+/* L30: */
+ }
+
+/* Now conquer each subproblem bottom-up. */
+
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+
+/*
+ Find the first node LF and last node LL on the
+ current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ if (*sqre == 0 && i__ == ll) {
+ sqrei = *sqre;
+ } else {
+ sqrei = 1;
+ }
+ idxqc = idxq + nlf - 1;
+ alpha = d__[ic];
+ beta = e[ic];
+ dlasd1_(&nl, &nr, &sqrei, &d__[nlf], &alpha, &beta, &u[nlf + nlf *
+ u_dim1], ldu, &vt[nlf + nlf * vt_dim1], ldvt, &iwork[
+ idxqc], &iwork[iwk], &work[1], info);
+ if (*info != 0) {
+ return 0;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+
+ return 0;
+
+/* End of DLASD0 */
+
+} /* dlasd0_ */
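+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): the divide step behind the computation
+    tree that DLASDT sets up for DLASD0 above.  A node covering n rows of
+    the bidiagonal matrix is split around one center row into a left
+    subproblem of nl rows and a right subproblem of nr = n - nl - 1 rows;
+    the recursion bottoms out once a node holds at most smlsiz rows, and
+    those leaves are the subproblems handed to DLASDQ.  The particular
+    split nl = n/2 used here is an assumption for illustration; DLASDT
+    performs the equivalent bookkeeping iteratively, level by level.
+*/
+static int sketch_dc_leaf_count(int n, int smlsiz)
+{
+    int nl, nr;
+
+    if (n <= smlsiz)
+        return 1;                  /* leaf: solved directly by DLASDQ */
+    nl = n / 2;                    /* rows of the left subproblem     */
+    nr = n - nl - 1;               /* rows of the right subproblem    */
+    /* The center row is removed here and re-inserted by the DLASD1 merge. */
+    return sketch_dc_leaf_count(nl, smlsiz) + sketch_dc_leaf_count(nr, smlsiz);
+}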
+
+/* Subroutine */ int dlasd1_(integer *nl, integer *nr, integer *sqre,
+ doublereal *d__, doublereal *alpha, doublereal *beta, doublereal *u,
+ integer *ldu, doublereal *vt, integer *ldvt, integer *idxq, integer *
+ iwork, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, vt_dim1, vt_offset, i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, k, m, n, n1, n2, iq, iz, iu2, ldq, idx, ldu2, ivt2,
+ idxc, idxp, ldvt2;
+ extern /* Subroutine */ int dlasd2_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *), dlasd3_(
+ integer *, integer *, integer *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, doublereal *, integer *),
+ dlascl_(char *, integer *, integer *, doublereal *, doublereal *,
+ integer *, integer *, doublereal *, integer *, integer *),
+ dlamrg_(integer *, integer *, doublereal *, integer *, integer *,
+ integer *);
+ static integer isigma;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal orgnrm;
+ static integer coltyp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLASD1 computes the SVD of an upper bidiagonal N-by-M matrix B,
+ where N = NL + NR + 1 and M = N + SQRE. DLASD1 is called from DLASD0.
+
+ A related subroutine DLASD7 handles the case in which the singular
+ values (and the singular vectors in factored form) are desired.
+
+ DLASD1 computes the SVD as follows:
+
+ ( D1(in) 0 0 0 )
+ B = U(in) * ( Z1' a Z2' b ) * VT(in)
+ ( 0 0 D2(in) 0 )
+
+ = U(out) * ( D(out) 0) * VT(out)
+
+ where Z' = (Z1' a Z2' b) = u' VT', and u is a vector of dimension M
+ with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+ elsewhere; and the entry b is empty if SQRE = 0.
+
+ The left singular vectors of the original matrix are stored in U, and
+ the transpose of the right singular vectors are stored in VT, and the
+ singular values are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple singular values or when there are zeros in
+ the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine DLASD2.
+
+ The second stage consists of calculating the updated
+ singular values. This is done by finding the square roots of the
+ roots of the secular equation via the routine DLASD4 (as called
+ by DLASD3). This routine also calculates the singular vectors of
+ the current problem.
+
+ The final stage consists of computing the updated singular vectors
+ directly using the updated singular values. The singular vectors
+ for the current problem are multiplied with the singular vectors
+ from the overall problem.
+
+ Arguments
+ =========
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ D (input/output) DOUBLE PRECISION array,
+ dimension (N = NL+NR+1).
+ On entry D(1:NL) contains the singular values of the
+ upper block; and D(NL+2:N) contains the singular values of
+ the lower block. On exit D(1:N) contains the singular values
+ of the modified matrix.
+
+ ALPHA (input/output) DOUBLE PRECISION
+ Contains the diagonal element associated with the added row.
+
+ BETA (input/output) DOUBLE PRECISION
+ Contains the off-diagonal element associated with the added
+ row.
+
+ U (input/output) DOUBLE PRECISION array, dimension(LDU,N)
+ On entry U(1:NL, 1:NL) contains the left singular vectors of
+ the upper block; U(NL+2:N, NL+2:N) contains the left singular
+ vectors of the lower block. On exit U contains the left
+ singular vectors of the bidiagonal matrix.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= max( 1, N ).
+
+ VT (input/output) DOUBLE PRECISION array, dimension(LDVT,M)
+ where M = N + SQRE.
+ On entry VT(1:NL+1, 1:NL+1)' contains the right singular
+ vectors of the upper block; VT(NL+2:M, NL+2:M)' contains
+ the right singular vectors of the lower block. On exit
+ VT' contains the right singular vectors of the
+ bidiagonal matrix.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= max( 1, M ).
+
+ IDXQ (output) INTEGER array, dimension(N)
+ This contains the permutation which will reintegrate the
+ subproblem just solved back into sorted order, i.e.
+ D( IDXQ( I = 1, N ) ) will be in ascending order.
+
+ IWORK (workspace) INTEGER array, dimension( 4 * N )
+
+ WORK (workspace) DOUBLE PRECISION array, dimension( 3*M**2 + 2*M )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --idxq;
+ --iwork;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*nl < 1) {
+ *info = -1;
+ } else if (*nr < 1) {
+ *info = -2;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -3;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD1", &i__1);
+ return 0;
+ }
+
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in DLASD2 and DLASD3.
+*/
+
+ ldu2 = n;
+ ldvt2 = m;
+
+ iz = 1;
+ isigma = iz + m;
+ iu2 = isigma + n;
+ ivt2 = iu2 + ldu2 * n;
+ iq = ivt2 + ldvt2 * m;
+
+ idx = 1;
+ idxc = idx + n;
+ coltyp = idxc + n;
+ idxp = coltyp + n;
+
+/*
+ Scale.
+
+ Computing MAX
+*/
+ d__1 = abs(*alpha), d__2 = abs(*beta);
+ orgnrm = max(d__1,d__2);
+ d__[*nl + 1] = 0.;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = d__[i__], abs(d__1)) > orgnrm) {
+ orgnrm = (d__1 = d__[i__], abs(d__1));
+ }
+/* L10: */
+ }
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &n, &c__1, &d__[1], &n, info);
+ *alpha /= orgnrm;
+ *beta /= orgnrm;
+
+/* Deflate singular values. */
+
+ dlasd2_(nl, nr, sqre, &k, &d__[1], &work[iz], alpha, beta, &u[u_offset],
+ ldu, &vt[vt_offset], ldvt, &work[isigma], &work[iu2], &ldu2, &
+ work[ivt2], &ldvt2, &iwork[idxp], &iwork[idx], &iwork[idxc], &
+ idxq[1], &iwork[coltyp], info);
+
+/* Solve Secular Equation and update singular vectors. */
+
+ ldq = k;
+ dlasd3_(nl, nr, sqre, &k, &d__[1], &work[iq], &ldq, &work[isigma], &u[
+ u_offset], ldu, &work[iu2], &ldu2, &vt[vt_offset], ldvt, &work[
+ ivt2], &ldvt2, &iwork[idxc], &iwork[coltyp], &work[iz], info);
+ if (*info != 0) {
+ return 0;
+ }
+
+/* Unscale. */
+
+ dlascl_("G", &c__0, &c__0, &c_b15, &orgnrm, &n, &c__1, &d__[1], &n, info);
+
+/* Prepare the IDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = n - k;
+ dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]);
+
+ return 0;
+
+/* End of DLASD1 */
+
+} /* dlasd1_ */
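+
+/*
+    Illustrative sketch (an assumption for exposition, not part of the
+    f2c-translated LAPACK source): the kind of permutation DLAMRG builds
+    in the last step of DLASD1 above.  Given d[0..n1-1] ascending and
+    d[n1..n1+n2-1] ascending, fill perm[] (0-based here, unlike the
+    1-based Fortran-style IDXQ) so that d[perm[0]], d[perm[1]], ... is the
+    fully sorted merge.  This covers only the simplest case; DLAMRG can
+    also walk either sublist backwards via its stride arguments.
+*/
+static void sketch_merge_permutation(const double *d, int n1, int n2, int *perm)
+{
+    int i = 0;             /* cursor in the first sorted sublist  */
+    int j = n1;            /* cursor in the second sorted sublist */
+    int k = 0;             /* next slot of the output permutation */
+
+    while (i < n1 && j < n1 + n2) {
+        if (d[i] <= d[j])
+            perm[k++] = i++;
+        else
+            perm[k++] = j++;
+    }
+    while (i < n1)
+        perm[k++] = i++;
+    while (j < n1 + n2)
+        perm[k++] = j++;
+}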
+
+/* Subroutine */ int dlasd2_(integer *nl, integer *nr, integer *sqre, integer
+ *k, doublereal *d__, doublereal *z__, doublereal *alpha, doublereal *
+ beta, doublereal *u, integer *ldu, doublereal *vt, integer *ldvt,
+ doublereal *dsigma, doublereal *u2, integer *ldu2, doublereal *vt2,
+ integer *ldvt2, integer *idxp, integer *idx, integer *idxc, integer *
+ idxq, integer *coltyp, integer *info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, u2_dim1, u2_offset, vt_dim1, vt_offset,
+ vt2_dim1, vt2_offset, i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal c__;
+ static integer i__, j, m, n;
+ static doublereal s;
+ static integer k2;
+ static doublereal z1;
+ static integer ct, jp;
+ static doublereal eps, tau, tol;
+ static integer psm[4], nlp1, nlp2, idxi, idxj;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static integer ctot[4], idxjp;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer jprev;
+
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), dlacpy_(char *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *), xerbla_(char *,
+ integer *);
+ static doublereal hlftol;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASD2 merges the two sets of singular values together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ singular values are close together or if there is a tiny entry in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ DLASD2 is called from DLASD1.
+
+ Arguments
+ =========
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ K (output) INTEGER
+ Contains the dimension of the non-deflated matrix,
+ This is the order of the related secular equation. 1 <= K <= N.
+
+ D (input/output) DOUBLE PRECISION array, dimension(N)
+ On entry D contains the singular values of the two submatrices
+ to be combined. On exit D contains the trailing (N-K) updated
+ singular values (those which were deflated) sorted into
+ increasing order.
+
+ Z (output) DOUBLE PRECISION array, dimension(N)
+ On exit Z contains the updating row vector in the secular
+ equation.
+
+ ALPHA (input) DOUBLE PRECISION
+ Contains the diagonal element associated with the added row.
+
+ BETA (input) DOUBLE PRECISION
+ Contains the off-diagonal element associated with the added
+ row.
+
+ U (input/output) DOUBLE PRECISION array, dimension(LDU,N)
+ On entry U contains the left singular vectors of two
+ submatrices in the two square blocks with corners at (1,1),
+ (NL, NL), and (NL+2, NL+2), (N,N).
+ On exit U contains the trailing (N-K) updated left singular
+ vectors (those which were deflated) in its last N-K columns.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= N.
+
+ VT (input/output) DOUBLE PRECISION array, dimension(LDVT,M)
+ On entry VT' contains the right singular vectors of two
+ submatrices in the two square blocks with corners at (1,1),
+ (NL+1, NL+1), and (NL+2, NL+2), (M,M).
+ On exit VT' contains the trailing (N-K) updated right singular
+ vectors (those which were deflated) in its last N-K columns.
+ In case SQRE =1, the last row of VT spans the right null
+ space.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= M.
+
+ DSIGMA (output) DOUBLE PRECISION array, dimension (N)
+ Contains a copy of the diagonal elements (K-1 singular values
+ and one zero) in the secular equation.
+
+ U2 (output) DOUBLE PRECISION array, dimension(LDU2,N)
+ Contains a copy of the first K-1 left singular vectors which
+ will be used by DLASD3 in a matrix multiply (DGEMM) to solve
+ for the new left singular vectors. U2 is arranged into four
+ blocks. The first block contains a column with 1 at NL+1 and
+ zero everywhere else; the second block contains non-zero
+ entries only at and above NL; the third contains non-zero
+ entries only below NL+1; and the fourth is dense.
+
+ LDU2 (input) INTEGER
+ The leading dimension of the array U2. LDU2 >= N.
+
+ VT2 (output) DOUBLE PRECISION array, dimension(LDVT2,N)
+ VT2' contains a copy of the first K right singular vectors
+ which will be used by DLASD3 in a matrix multiply (DGEMM) to
+ solve for the new right singular vectors. VT2 is arranged into
+ three blocks. The first block contains a row that corresponds
+ to the special 0 diagonal element in SIGMA; the second block
+ contains non-zeros only at and before NL +1; the third block
+ contains non-zeros only at and after NL +2.
+
+ LDVT2 (input) INTEGER
+ The leading dimension of the array VT2. LDVT2 >= M.
+
+ IDXP (workspace) INTEGER array dimension(N)
+ This will contain the permutation used to place deflated
+ values of D at the end of the array. On output IDXP(2:K)
+ points to the nondeflated D-values and IDXP(K+1:N)
+ points to the deflated singular values.
+
+ IDX (workspace) INTEGER array dimension(N)
+ This will contain the permutation used to sort the contents of
+ D into ascending order.
+
+ IDXC (output) INTEGER array dimension(N)
+ This will contain the permutation used to arrange the columns
+ of the deflated U matrix into three groups: the first group
+ contains non-zero entries only at and above NL, the second
+ contains non-zero entries only below NL+2, and the third is
+ dense.
+
+ IDXQ (input/output) INTEGER array dimension(N)
+ This contains the permutation which separately sorts the two
+ sub-problems in D into ascending order. Note that entries in
+ the first half of this permutation must first be moved one
+ position backward; and entries in the second half
+ must first have NL+1 added to their values.
+
+ COLTYP (workspace/output) INTEGER array dimension(N)
+ As workspace, this will contain a label which will indicate
+ which of the following types a column in the U2 matrix or a
+ row in the VT2 matrix is:
+ 1 : non-zero in the upper half only
+ 2 : non-zero in the lower half only
+ 3 : dense
+ 4 : deflated
+
+ On exit, it is an array of dimension 4, with COLTYP(I) being
+ the dimension of the I-th type columns.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --z__;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --dsigma;
+ u2_dim1 = *ldu2;
+ u2_offset = 1 + u2_dim1;
+ u2 -= u2_offset;
+ vt2_dim1 = *ldvt2;
+ vt2_offset = 1 + vt2_dim1;
+ vt2 -= vt2_offset;
+ --idxp;
+ --idx;
+ --idxc;
+ --idxq;
+ --coltyp;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*nl < 1) {
+ *info = -1;
+ } else if (*nr < 1) {
+ *info = -2;
+ } else if (*sqre != 1 && *sqre != 0) {
+ *info = -3;
+ }
+
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+ if (*ldu < n) {
+ *info = -10;
+ } else if (*ldvt < m) {
+ *info = -12;
+ } else if (*ldu2 < n) {
+ *info = -15;
+ } else if (*ldvt2 < m) {
+ *info = -17;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD2", &i__1);
+ return 0;
+ }
+
+ nlp1 = *nl + 1;
+ nlp2 = *nl + 2;
+
+/*
+ Generate the first part of the vector Z; and move the singular
+ values in the first part of D one position backward.
+*/
+
+ z1 = *alpha * vt[nlp1 + nlp1 * vt_dim1];
+ z__[1] = z1;
+ for (i__ = *nl; i__ >= 1; --i__) {
+ z__[i__ + 1] = *alpha * vt[i__ + nlp1 * vt_dim1];
+ d__[i__ + 1] = d__[i__];
+ idxq[i__ + 1] = idxq[i__] + 1;
+/* L10: */
+ }
+
+/* Generate the second part of the vector Z. */
+
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ z__[i__] = *beta * vt[i__ + nlp2 * vt_dim1];
+/* L20: */
+ }
+
+/* Initialize some reference arrays. */
+
+ i__1 = nlp1;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ coltyp[i__] = 1;
+/* L30: */
+ }
+ i__1 = n;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ coltyp[i__] = 2;
+/* L40: */
+ }
+
+/* Sort the singular values into increasing order */
+
+ i__1 = n;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ idxq[i__] += nlp1;
+/* L50: */
+ }
+
+/*
+ DSIGMA, IDXC, and the first column of U2
+ are used as storage space.
+*/
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ dsigma[i__] = d__[idxq[i__]];
+ u2[i__ + u2_dim1] = z__[idxq[i__]];
+ idxc[i__] = coltyp[idxq[i__]];
+/* L60: */
+ }
+
+ dlamrg_(nl, nr, &dsigma[2], &c__1, &c__1, &idx[2]);
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ idxi = idx[i__] + 1;
+ d__[i__] = dsigma[idxi];
+ z__[i__] = u2[idxi + u2_dim1];
+ coltyp[i__] = idxc[idxi];
+/* L70: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ eps = EPSILON;
+/* Computing MAX */
+ d__1 = abs(*alpha), d__2 = abs(*beta);
+ tol = max(d__1,d__2);
+/* Computing MAX */
+ d__2 = (d__1 = d__[n], abs(d__1));
+ tol = eps * 8. * max(d__2,tol);
+
+/*
+ There are 2 kinds of deflation -- first a value in the z-vector
+ is small, second two (or more) singular values are very close
+ together (their difference is small).
+
+ If the value in the z-vector is small, we simply permute the
+ array so that the corresponding singular value is moved to the
+ end.
+
+ If two values in the D-vector are close, we perform a two-sided
+ rotation designed to make one of the corresponding z-vector
+ entries zero, and then permute the array so that the deflated
+ singular value is moved to the end.
+
+ If there are multiple singular values then the problem deflates.
+ Here the number of equal singular values is found. As each equal
+ singular value is found, an elementary reflector is computed to
+ rotate the corresponding singular subspace so that the
+ corresponding components of Z are zero in this new basis.
+*/
+
+ *k = 1;
+ k2 = n + 1;
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ if ((d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ coltyp[j] = 4;
+ if (j == n) {
+ goto L120;
+ }
+ } else {
+ jprev = j;
+ goto L90;
+ }
+/* L80: */
+ }
+L90:
+ j = jprev;
+L100:
+ ++j;
+ if (j > n) {
+ goto L110;
+ }
+ if ((d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ coltyp[j] = 4;
+ } else {
+
+/* Check if singular values are close enough to allow deflation. */
+
+ if ((d__1 = d__[j] - d__[jprev], abs(d__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ s = z__[jprev];
+ c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = dlapy2_(&c__, &s);
+ c__ /= tau;
+ s = -s / tau;
+ z__[j] = tau;
+ z__[jprev] = 0.;
+
+/*
+ Apply back the Givens rotation to the left and right
+ singular vector matrices.
+*/
+
+ idxjp = idxq[idx[jprev] + 1];
+ idxj = idxq[idx[j] + 1];
+ if (idxjp <= nlp1) {
+ --idxjp;
+ }
+ if (idxj <= nlp1) {
+ --idxj;
+ }
+ drot_(&n, &u[idxjp * u_dim1 + 1], &c__1, &u[idxj * u_dim1 + 1], &
+ c__1, &c__, &s);
+ drot_(&m, &vt[idxjp + vt_dim1], ldvt, &vt[idxj + vt_dim1], ldvt, &
+ c__, &s);
+ if (coltyp[j] != coltyp[jprev]) {
+ coltyp[j] = 3;
+ }
+ coltyp[jprev] = 4;
+ --k2;
+ idxp[k2] = jprev;
+ jprev = j;
+ } else {
+ ++(*k);
+ u2[*k + u2_dim1] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+ jprev = j;
+ }
+ }
+ goto L100;
+L110:
+
+/* Record the last singular value. */
+
+ ++(*k);
+ u2[*k + u2_dim1] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+
+L120:
+
+/*
+ Count up the total number of the various types of columns, then
+ form a permutation which positions the four column types into
+ four groups of uniform structure (although one or more of these
+ groups may be empty).
+*/
+
+ for (j = 1; j <= 4; ++j) {
+ ctot[j - 1] = 0;
+/* L130: */
+ }
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ ct = coltyp[j];
+ ++ctot[ct - 1];
+/* L140: */
+ }
+
+/* PSM(*) = Position in SubMatrix (of types 1 through 4) */
+
+ psm[0] = 2;
+ psm[1] = ctot[0] + 2;
+ psm[2] = psm[1] + ctot[1];
+ psm[3] = psm[2] + ctot[2];
+
+/*
+ Fill out the IDXC array so that the permutation which it induces
+ will place all type-1 columns first, all type-2 columns next,
+ then all type-3's, and finally all type-4's, starting from the
+ second column. This applies similarly to the rows of VT.
+*/
+
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ ct = coltyp[jp];
+ idxc[psm[ct - 1]] = j;
+ ++psm[ct - 1];
+/* L150: */
+ }
+
+/*
+ Sort the singular values and corresponding singular vectors into
+ DSIGMA, U2, and VT2 respectively. The singular values/vectors
+ which were not deflated go into the first K slots of DSIGMA, U2,
+ and VT2 respectively, while those which were deflated go into the
+ last N - K slots, except that the first column/row will be treated
+ separately.
+*/
+
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ dsigma[j] = d__[jp];
+ idxj = idxq[idx[idxp[idxc[j]]] + 1];
+ if (idxj <= nlp1) {
+ --idxj;
+ }
+ dcopy_(&n, &u[idxj * u_dim1 + 1], &c__1, &u2[j * u2_dim1 + 1], &c__1);
+ dcopy_(&m, &vt[idxj + vt_dim1], ldvt, &vt2[j + vt2_dim1], ldvt2);
+/* L160: */
+ }
+
+/* Determine DSIGMA(1), DSIGMA(2) and Z(1) */
+
+ dsigma[1] = 0.;
+ hlftol = tol / 2.;
+ if (abs(dsigma[2]) <= hlftol) {
+ dsigma[2] = hlftol;
+ }
+ if (m > n) {
+ z__[1] = dlapy2_(&z1, &z__[m]);
+ if (z__[1] <= tol) {
+ c__ = 1.;
+ s = 0.;
+ z__[1] = tol;
+ } else {
+ c__ = z1 / z__[1];
+ s = z__[m] / z__[1];
+ }
+ } else {
+ if (abs(z1) <= tol) {
+ z__[1] = tol;
+ } else {
+ z__[1] = z1;
+ }
+ }
+
+/* Move the rest of the updating row to Z. */
+
+ i__1 = *k - 1;
+ dcopy_(&i__1, &u2[u2_dim1 + 2], &c__1, &z__[2], &c__1);
+
+/*
+ Determine the first column of U2, the first row of VT2 and the
+ last row of VT.
+*/
+
+ dlaset_("A", &n, &c__1, &c_b29, &c_b29, &u2[u2_offset], ldu2);
+ u2[nlp1 + u2_dim1] = 1.;
+ if (m > n) {
+ i__1 = nlp1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ vt[m + i__ * vt_dim1] = -s * vt[nlp1 + i__ * vt_dim1];
+ vt2[i__ * vt2_dim1 + 1] = c__ * vt[nlp1 + i__ * vt_dim1];
+/* L170: */
+ }
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ vt2[i__ * vt2_dim1 + 1] = s * vt[m + i__ * vt_dim1];
+ vt[m + i__ * vt_dim1] = c__ * vt[m + i__ * vt_dim1];
+/* L180: */
+ }
+ } else {
+ dcopy_(&m, &vt[nlp1 + vt_dim1], ldvt, &vt2[vt2_dim1 + 1], ldvt2);
+ }
+ if (m > n) {
+ dcopy_(&m, &vt[m + vt_dim1], ldvt, &vt2[m + vt2_dim1], ldvt2);
+ }
+
+/*
+ The deflated singular values and their corresponding vectors go
+ into the back of D, U, and V respectively.
+*/
+
+ if (n > *k) {
+ i__1 = n - *k;
+ dcopy_(&i__1, &dsigma[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ i__1 = n - *k;
+ dlacpy_("A", &n, &i__1, &u2[(*k + 1) * u2_dim1 + 1], ldu2, &u[(*k + 1)
+ * u_dim1 + 1], ldu);
+ i__1 = n - *k;
+ dlacpy_("A", &i__1, &m, &vt2[*k + 1 + vt2_dim1], ldvt2, &vt[*k + 1 +
+ vt_dim1], ldvt);
+ }
+
+/* Copy CTOT into COLTYP for referencing in DLASD3. */
+
+ for (j = 1; j <= 4; ++j) {
+ coltyp[j] = ctot[j - 1];
+/* L190: */
+ }
+
+ return 0;
+
+/* End of DLASD2 */
+
+} /* dlasd2_ */
+
+/* Subroutine */ int dlasd3_(integer *nl, integer *nr, integer *sqre, integer
+ *k, doublereal *d__, doublereal *q, integer *ldq, doublereal *dsigma,
+ doublereal *u, integer *ldu, doublereal *u2, integer *ldu2,
+ doublereal *vt, integer *ldvt, doublereal *vt2, integer *ldvt2,
+ integer *idxc, integer *ctot, doublereal *z__, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, u_dim1, u_offset, u2_dim1, u2_offset, vt_dim1,
+ vt_offset, vt2_dim1, vt2_offset, i__1, i__2;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, j, m, n, jc;
+ static doublereal rho;
+ static integer nlp1, nlp2, nrp1;
+ static doublereal temp;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer ctemp;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer ktemp;
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ extern /* Subroutine */ int dlasd4_(integer *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *), dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlacpy_(char *, integer *, integer
+ *, doublereal *, integer *, doublereal *, integer *),
+ xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLASD3 finds all the square roots of the roots of the secular
+ equation, as defined by the values in D and Z. It makes the
+ appropriate calls to DLASD4 and then updates the singular
+ vectors by matrix multiplication.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ DLASD3 is called from DLASD1.
+
+ Arguments
+ =========
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ K (input) INTEGER
+           The size of the secular equation, 1 <= K <= N.
+
+ D (output) DOUBLE PRECISION array, dimension(K)
+ On exit the square roots of the roots of the secular equation,
+ in ascending order.
+
+ Q (workspace) DOUBLE PRECISION array,
+ dimension at least (LDQ,K).
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= K.
+
+ DSIGMA (input) DOUBLE PRECISION array, dimension(K)
+ The first K elements of this array contain the old roots
+ of the deflated updating problem. These are the poles
+ of the secular equation.
+
+ U (output) DOUBLE PRECISION array, dimension (LDU, N)
+ The last N - K columns of this matrix contain the deflated
+ left singular vectors.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= N.
+
+ U2 (input/output) DOUBLE PRECISION array, dimension (LDU2, N)
+ The first K columns of this matrix contain the non-deflated
+ left singular vectors for the split problem.
+
+ LDU2 (input) INTEGER
+ The leading dimension of the array U2. LDU2 >= N.
+
+ VT (output) DOUBLE PRECISION array, dimension (LDVT, M)
+ The last M - K columns of VT' contain the deflated
+ right singular vectors.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= N.
+
+ VT2 (input/output) DOUBLE PRECISION array, dimension (LDVT2, N)
+ The first K columns of VT2' contain the non-deflated
+ right singular vectors for the split problem.
+
+ LDVT2 (input) INTEGER
+ The leading dimension of the array VT2. LDVT2 >= N.
+
+ IDXC (input) INTEGER array, dimension ( N )
+ The permutation used to arrange the columns of U (and rows of
+ VT) into three groups: the first group contains non-zero
+ entries only at and above (or before) NL +1; the second
+ contains non-zero entries only at and below (or after) NL+2;
+ and the third is dense. The first column of U and the row of
+ VT are treated separately, however.
+
+ The rows of the singular vectors found by DLASD4
+ must be likewise permuted before the matrix multiplies can
+ take place.
+
+ CTOT (input) INTEGER array, dimension ( 4 )
+ A count of the total number of the various types of columns
+ in U (or rows in VT), as described in IDXC. The fourth column
+ type is any column which has been deflated.
+
+ Z (input) DOUBLE PRECISION array, dimension (K)
+ The first K elements of this array contain the components
+ of the deflation-adjusted updating row vector.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --dsigma;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ u2_dim1 = *ldu2;
+ u2_offset = 1 + u2_dim1;
+ u2 -= u2_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ vt2_dim1 = *ldvt2;
+ vt2_offset = 1 + vt2_dim1;
+ vt2 -= vt2_offset;
+ --idxc;
+ --ctot;
+ --z__;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*nl < 1) {
+ *info = -1;
+ } else if (*nr < 1) {
+ *info = -2;
+ } else if (*sqre != 1 && *sqre != 0) {
+ *info = -3;
+ }
+
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+ nlp1 = *nl + 1;
+ nlp2 = *nl + 2;
+
+ if (*k < 1 || *k > n) {
+ *info = -4;
+ } else if (*ldq < *k) {
+ *info = -7;
+ } else if (*ldu < n) {
+ *info = -10;
+ } else if (*ldu2 < n) {
+ *info = -12;
+ } else if (*ldvt < m) {
+ *info = -14;
+ } else if (*ldvt2 < m) {
+ *info = -16;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD3", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 1) {
+ d__[1] = abs(z__[1]);
+ dcopy_(&m, &vt2[vt2_dim1 + 1], ldvt2, &vt[vt_dim1 + 1], ldvt);
+ if (z__[1] > 0.) {
+ dcopy_(&n, &u2[u2_dim1 + 1], &c__1, &u[u_dim1 + 1], &c__1);
+ } else {
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ u[i__ + u_dim1] = -u2[i__ + u2_dim1];
+/* L10: */
+ }
+ }
+ return 0;
+ }
+
+/*
+ Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DSIGMA(I) if it is 1; this makes the subsequent
+ subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DSIGMA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DSIGMA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+ 2*DSIGMA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dsigma[i__] = dlamc3_(&dsigma[i__], &dsigma[i__]) - dsigma[i__];
+/* L20: */
+ }
+
+/* Keep a copy of Z. */
+
+ dcopy_(k, &z__[1], &c__1, &q[q_offset], &c__1);
+
+/* Normalize Z. */
+
+ rho = dnrm2_(k, &z__[1], &c__1);
+ dlascl_("G", &c__0, &c__0, &rho, &c_b15, k, &c__1, &z__[1], k, info);
+ rho *= rho;
+
+/* Find the new singular values. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dlasd4_(k, &j, &dsigma[1], &z__[1], &u[j * u_dim1 + 1], &rho, &d__[j],
+ &vt[j * vt_dim1 + 1], info);
+
+/* If the zero finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ return 0;
+ }
+/* L30: */
+ }
+
+/* Compute updated Z. */
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ z__[i__] = u[i__ + *k * u_dim1] * vt[i__ + *k * vt_dim1];
+ i__2 = i__ - 1;
+ for (j = 1; j <= i__2; ++j) {
+ z__[i__] *= u[i__ + j * u_dim1] * vt[i__ + j * vt_dim1] / (dsigma[
+ i__] - dsigma[j]) / (dsigma[i__] + dsigma[j]);
+/* L40: */
+ }
+ i__2 = *k - 1;
+ for (j = i__; j <= i__2; ++j) {
+ z__[i__] *= u[i__ + j * u_dim1] * vt[i__ + j * vt_dim1] / (dsigma[
+ i__] - dsigma[j + 1]) / (dsigma[i__] + dsigma[j + 1]);
+/* L50: */
+ }
+ d__2 = sqrt((d__1 = z__[i__], abs(d__1)));
+ z__[i__] = d_sign(&d__2, &q[i__ + q_dim1]);
+/* L60: */
+ }
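+
+/*
+   The loop above rebuilds the updating vector from the computed roots:
+   on return from DLASD4, row I of column J of U holds DSIGMA(I) - D(J)
+   and row I of column J of VT holds DSIGMA(I) + D(J), so the product
+   accumulated in Z(I) amounts to
+
+      Z(I)**2 = ABS( ( DSIGMA(I)**2 - D(K)**2 )
+                * PROD( J < I )      ( DSIGMA(I)**2 - D(J)**2 )
+                                   / ( DSIGMA(I)**2 - DSIGMA(J)**2 )
+                * PROD( I <= J < K ) ( DSIGMA(I)**2 - D(J)**2 )
+                                   / ( DSIGMA(I)**2 - DSIGMA(J+1)**2 ) ),
+
+   with the sign of Z(I) restored from the copy saved in Q.  Recomputing
+   Z this way makes the singular values returned by DLASD4 exact for a
+   nearby updating vector.
+*/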
+
+/*
+ Compute left singular vectors of the modified diagonal matrix,
+ and store related information for the right singular vectors.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ vt[i__ * vt_dim1 + 1] = z__[1] / u[i__ * u_dim1 + 1] / vt[i__ *
+ vt_dim1 + 1];
+ u[i__ * u_dim1 + 1] = -1.;
+ i__2 = *k;
+ for (j = 2; j <= i__2; ++j) {
+ vt[j + i__ * vt_dim1] = z__[j] / u[j + i__ * u_dim1] / vt[j + i__
+ * vt_dim1];
+ u[j + i__ * u_dim1] = dsigma[j] * vt[j + i__ * vt_dim1];
+/* L70: */
+ }
+ temp = dnrm2_(k, &u[i__ * u_dim1 + 1], &c__1);
+ q[i__ * q_dim1 + 1] = u[i__ * u_dim1 + 1] / temp;
+ i__2 = *k;
+ for (j = 2; j <= i__2; ++j) {
+ jc = idxc[j];
+ q[j + i__ * q_dim1] = u[jc + i__ * u_dim1] / temp;
+/* L80: */
+ }
+/* L90: */
+ }
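+
+/*
+   The loop above uses the explicit form of the singular vectors of the
+   deflated problem: up to normalization, the I-th right singular vector
+   has J-th component Z(J) / ( DSIGMA(J)**2 - D(I)**2 ), while the I-th
+   left singular vector has first component equal to the constant -1 and
+   J-th component DSIGMA(J) * Z(J) / ( DSIGMA(J)**2 - D(I)**2 ) for
+   J >= 2.  Each column is normalized with DNRM2, and Q receives the
+   normalized left vectors with their rows permuted by IDXC for the
+   matrix multiplies that follow.
+*/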
+
+/* Update the left singular vector matrix. */
+
+ if (*k == 2) {
+ dgemm_("N", "N", &n, k, k, &c_b15, &u2[u2_offset], ldu2, &q[q_offset],
+ ldq, &c_b29, &u[u_offset], ldu);
+ goto L100;
+ }
+ if (ctot[1] > 0) {
+ dgemm_("N", "N", nl, k, &ctot[1], &c_b15, &u2[(u2_dim1 << 1) + 1],
+ ldu2, &q[q_dim1 + 2], ldq, &c_b29, &u[u_dim1 + 1], ldu);
+ if (ctot[3] > 0) {
+ ktemp = ctot[1] + 2 + ctot[2];
+ dgemm_("N", "N", nl, k, &ctot[3], &c_b15, &u2[ktemp * u2_dim1 + 1]
+ , ldu2, &q[ktemp + q_dim1], ldq, &c_b15, &u[u_dim1 + 1],
+ ldu);
+ }
+ } else if (ctot[3] > 0) {
+ ktemp = ctot[1] + 2 + ctot[2];
+ dgemm_("N", "N", nl, k, &ctot[3], &c_b15, &u2[ktemp * u2_dim1 + 1],
+ ldu2, &q[ktemp + q_dim1], ldq, &c_b29, &u[u_dim1 + 1], ldu);
+ } else {
+ dlacpy_("F", nl, k, &u2[u2_offset], ldu2, &u[u_offset], ldu);
+ }
+ dcopy_(k, &q[q_dim1 + 1], ldq, &u[nlp1 + u_dim1], ldu);
+ ktemp = ctot[1] + 2;
+ ctemp = ctot[2] + ctot[3];
+ dgemm_("N", "N", nr, k, &ctemp, &c_b15, &u2[nlp2 + ktemp * u2_dim1], ldu2,
+ &q[ktemp + q_dim1], ldq, &c_b29, &u[nlp2 + u_dim1], ldu);
+
+/* Generate the right singular vectors. */
+
+L100:
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = dnrm2_(k, &vt[i__ * vt_dim1 + 1], &c__1);
+ q[i__ + q_dim1] = vt[i__ * vt_dim1 + 1] / temp;
+ i__2 = *k;
+ for (j = 2; j <= i__2; ++j) {
+ jc = idxc[j];
+ q[i__ + j * q_dim1] = vt[jc + i__ * vt_dim1] / temp;
+/* L110: */
+ }
+/* L120: */
+ }
+
+/* Update the right singular vector matrix. */
+
+ if (*k == 2) {
+ dgemm_("N", "N", k, &m, k, &c_b15, &q[q_offset], ldq, &vt2[vt2_offset]
+ , ldvt2, &c_b29, &vt[vt_offset], ldvt);
+ return 0;
+ }
+ ktemp = ctot[1] + 1;
+ dgemm_("N", "N", k, &nlp1, &ktemp, &c_b15, &q[q_dim1 + 1], ldq, &vt2[
+ vt2_dim1 + 1], ldvt2, &c_b29, &vt[vt_dim1 + 1], ldvt);
+ ktemp = ctot[1] + 2 + ctot[2];
+ if (ktemp <= *ldvt2) {
+ dgemm_("N", "N", k, &nlp1, &ctot[3], &c_b15, &q[ktemp * q_dim1 + 1],
+ ldq, &vt2[ktemp + vt2_dim1], ldvt2, &c_b15, &vt[vt_dim1 + 1],
+ ldvt);
+ }
+
+ ktemp = ctot[1] + 1;
+ nrp1 = *nr + *sqre;
+ if (ktemp > 1) {
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ q[i__ + ktemp * q_dim1] = q[i__ + q_dim1];
+/* L130: */
+ }
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ vt2[ktemp + i__ * vt2_dim1] = vt2[i__ * vt2_dim1 + 1];
+/* L140: */
+ }
+ }
+ ctemp = ctot[2] + 1 + ctot[3];
+ dgemm_("N", "N", k, &nrp1, &ctemp, &c_b15, &q[ktemp * q_dim1 + 1], ldq, &
+ vt2[ktemp + nlp2 * vt2_dim1], ldvt2, &c_b29, &vt[nlp2 * vt_dim1 +
+ 1], ldvt);
+
+ return 0;
+
+/* End of DLASD3 */
+
+} /* dlasd3_ */
+
+/* Subroutine */ int dlasd4_(integer *n, integer *i__, doublereal *d__,
+ doublereal *z__, doublereal *delta, doublereal *rho, doublereal *
+ sigma, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal a, b, c__;
+ static integer j;
+ static doublereal w, dd[3];
+ static integer ii;
+ static doublereal dw, zz[3];
+ static integer ip1;
+ static doublereal eta, phi, eps, tau, psi;
+ static integer iim1, iip1;
+ static doublereal dphi, dpsi;
+ static integer iter;
+ static doublereal temp, prew, sg2lb, sg2ub, temp1, temp2, dtiim, delsq,
+ dtiip;
+ static integer niter;
+ static doublereal dtisq;
+ static logical swtch;
+ static doublereal dtnsq;
+ extern /* Subroutine */ int dlaed6_(integer *, logical *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *)
+ , dlasd5_(integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *);
+ static doublereal delsq2, dtnsq1;
+ static logical swtch3;
+
+ static logical orgati;
+ static doublereal erretm, dtipsq, rhoinv;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the square root of the I-th updated
+ eigenvalue of a positive symmetric rank-one modification to
+ a positive diagonal matrix whose entries are given as the squares
+    of the corresponding entries in the array d. It is assumed that
+
+ 0 <= D(i) < D(j) for i < j
+
+ and that RHO > 0. This is arranged by the calling routine, and is
+ no loss in generality. The rank-one modified system is thus
+
+ diag( D ) * diag( D ) + RHO * Z * Z_transpose.
+
+ where we assume the Euclidean norm of Z is 1.
+
+ The method consists of approximating the rational functions in the
+ secular equation by simpler interpolating rational functions.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The length of all arrays.
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. 1 <= I <= N.
+
+ D (input) DOUBLE PRECISION array, dimension ( N )
+ The original eigenvalues. It is assumed that they are in
+ order, 0 <= D(I) < D(J) for I < J.
+
+ Z (input) DOUBLE PRECISION array, dimension ( N )
+ The components of the updating vector.
+
+ DELTA (output) DOUBLE PRECISION array, dimension ( N )
+ If N .ne. 1, DELTA contains (D(j) - sigma_I) in its j-th
+ component. If N = 1, then DELTA(1) = 1. The vector DELTA
+ contains the information necessary to construct the
+ (singular) eigenvectors.
+
+ RHO (input) DOUBLE PRECISION
+ The scalar in the symmetric updating formula.
+
+ SIGMA (output) DOUBLE PRECISION
+ The computed sigma_I, the I-th updated eigenvalue.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension ( N )
+ If N .ne. 1, WORK contains (D(j) + sigma_I) in its j-th
+ component. If N = 1, then WORK( 1 ) = 1.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ > 0: if INFO = 1, the updating process failed.
+
+ Internal Parameters
+ ===================
+
+ Logical variable ORGATI (origin-at-i?) is used for distinguishing
+ whether D(i) or D(i+1) is treated as the origin.
+
+ ORGATI = .true. origin at i
+ ORGATI = .false. origin at i+1
+
+ Logical variable SWTCH3 (switch-for-3-poles?) is for noting
+ if we are working with THREE poles!
+
+ MAXIT is the maximum number of iterations allowed for each
+ eigenvalue.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Since this routine is called in an inner loop, we do no argument
+ checking.
+
+ Quick return for N=1 and 2.
+*/
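+
+/*
+   In the notation used below, the routine seeks the root SIGMA of the
+   secular function
+
+      F( SIGMA ) = RHOINV + SUM( J = 1..N ) Z(J)**2 /
+                            ( ( D(J) - SIGMA ) * ( D(J) + SIGMA ) ),
+
+   which is the quantity accumulated in W as RHOINV + PSI + PHI.  The
+   root lies between D(I) and D(I+1) (or to the right of D(N) when
+   I = N).  DELTA(J) = D(J) - SIGMA and WORK(J) = D(J) + SIGMA are
+   carried along so that each difference D(J)**2 - SIGMA**2 is formed
+   as a product of two accurately computed factors.
+*/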
+
+ /* Parameter adjustments */
+ --work;
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ if (*n == 1) {
+
+/* Presumably, I=1 upon entry */
+
+ *sigma = sqrt(d__[1] * d__[1] + *rho * z__[1] * z__[1]);
+ delta[1] = 1.;
+ work[1] = 1.;
+ return 0;
+ }
+ if (*n == 2) {
+ dlasd5_(i__, &d__[1], &z__[1], &delta[1], rho, sigma, &work[1]);
+ return 0;
+ }
+
+/* Compute machine epsilon */
+
+ eps = EPSILON;
+ rhoinv = 1. / *rho;
+
+/* The case I = N */
+
+ if (*i__ == *n) {
+
+/* Initialize some basic variables */
+
+ ii = *n - 1;
+ niter = 1;
+
+/* Calculate initial guess */
+
+ temp = *rho / 2.;
+
+/*
+ If ||Z||_2 is not one, then TEMP should be set to
+ RHO * ||Z||_2^2 / TWO
+*/
+
+ temp1 = temp / (d__[*n] + sqrt(d__[*n] * d__[*n] + temp));
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[*n] + temp1;
+ delta[j] = d__[j] - d__[*n] - temp1;
+/* L10: */
+ }
+
+ psi = 0.;
+ i__1 = *n - 2;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / (delta[j] * work[j]);
+/* L20: */
+ }
+
+ c__ = rhoinv + psi;
+ w = c__ + z__[ii] * z__[ii] / (delta[ii] * work[ii]) + z__[*n] * z__[*
+ n] / (delta[*n] * work[*n]);
+
+ if (w <= 0.) {
+ temp1 = sqrt(d__[*n] * d__[*n] + *rho);
+ temp = z__[*n - 1] * z__[*n - 1] / ((d__[*n - 1] + temp1) * (d__[*
+ n] - d__[*n - 1] + *rho / (d__[*n] + temp1))) + z__[*n] *
+ z__[*n] / *rho;
+
+/*
+ The following TAU is to approximate
+ SIGMA_n^2 - D( N )*D( N )
+*/
+
+ if (c__ <= temp) {
+ tau = *rho;
+ } else {
+ delsq = (d__[*n] - d__[*n - 1]) * (d__[*n] + d__[*n - 1]);
+ a = -c__ * delsq + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*
+ n];
+ b = z__[*n] * z__[*n] * delsq;
+ if (a < 0.) {
+ tau = b * 2. / (sqrt(a * a + b * 4. * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4. * c__)) / (c__ * 2.);
+ }
+ }
+
+/*
+ It can be proved that
+ D(N)^2+RHO/2 <= SIGMA_n^2 < D(N)^2+TAU <= D(N)^2+RHO
+*/
+
+ } else {
+ delsq = (d__[*n] - d__[*n - 1]) * (d__[*n] + d__[*n - 1]);
+ a = -c__ * delsq + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*n];
+ b = z__[*n] * z__[*n] * delsq;
+
+/*
+ The following TAU is to approximate
+ SIGMA_n^2 - D( N )*D( N )
+*/
+
+ if (a < 0.) {
+ tau = b * 2. / (sqrt(a * a + b * 4. * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4. * c__)) / (c__ * 2.);
+ }
+
+/*
+ It can be proved that
+ D(N)^2 < D(N)^2+TAU < SIGMA(N)^2 < D(N)^2+RHO/2
+*/
+
+ }
+
+/* The following ETA is to approximate SIGMA_n - D( N ) */
+
+ eta = tau / (d__[*n] + sqrt(d__[*n] * d__[*n] + tau));
+
+ *sigma = d__[*n] + eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - eta;
+ work[j] = d__[j] + d__[*i__] + eta;
+/* L30: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (delta[j] * work[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L40: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / (delta[*n] * work[*n]);
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8. + erretm - phi + rhoinv + abs(tau) * (dpsi
+ + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+/* Calculate the new step */
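+
+/*
+   The step below comes from a simple rational approximation of the
+   secular function with poles at the two D-values nearest the current
+   iterate: with the coefficients A, B and C formed from W, DPSI and
+   DPHI, ETA is a root of
+
+      C * ETA**2 - A * ETA + B = 0,
+
+   evaluated in whichever of the two algebraically equivalent forms
+   avoids cancellation (the degenerate case C = 0 is handled
+   separately).  The sign check on W * ETA further down replaces a step
+   of the wrong sign, caused by roundoff, with a plain Newton step.
+*/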
+
+ ++niter;
+ dtnsq1 = work[*n - 1] * delta[*n - 1];
+ dtnsq = work[*n] * delta[*n];
+ c__ = w - dtnsq1 * dpsi - dtnsq * dphi;
+ a = (dtnsq + dtnsq1) * w - dtnsq * dtnsq1 * (dpsi + dphi);
+ b = dtnsq * dtnsq1 * w;
+ if (c__ < 0.) {
+ c__ = abs(c__);
+ }
+ if (c__ == 0.) {
+ eta = *rho - *sigma * *sigma;
+ } else if (a >= 0.) {
+ eta = (a + sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (c__
+ * 2.);
+ } else {
+ eta = b * 2. / (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))
+ );
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = eta - dtnsq;
+ if (temp > *rho) {
+ eta = *rho + dtnsq;
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(eta + *sigma * *sigma);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+ work[j] += eta;
+/* L50: */
+ }
+
+ *sigma += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L60: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / (work[*n] * delta[*n]);
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8. + erretm - phi + rhoinv + abs(tau) * (dpsi
+ + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Main loop to update the values of the array DELTA */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 20; ++niter) {
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+/* Calculate the new step */
+
+ dtnsq1 = work[*n - 1] * delta[*n - 1];
+ dtnsq = work[*n] * delta[*n];
+ c__ = w - dtnsq1 * dpsi - dtnsq * dphi;
+ a = (dtnsq + dtnsq1) * w - dtnsq1 * dtnsq * (dpsi + dphi);
+ b = dtnsq1 * dtnsq * w;
+ if (a >= 0.) {
+ eta = (a + sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ } else {
+ eta = b * 2. / (a - sqrt((d__1 = a * a - b * 4. * c__, abs(
+ d__1))));
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = eta - dtnsq;
+ if (temp <= 0.) {
+ eta /= 2.;
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(eta + *sigma * *sigma);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+ work[j] += eta;
+/* L70: */
+ }
+
+ *sigma += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L80: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / (work[*n] * delta[*n]);
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8. + erretm - phi + rhoinv + abs(tau) * (
+ dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+/* L90: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+ goto L240;
+
+/* End for the case I = N */
+
+ } else {
+
+/* The case for I < N */
+
+ niter = 1;
+ ip1 = *i__ + 1;
+
+/* Calculate initial guess */
+
+ delsq = (d__[ip1] - d__[*i__]) * (d__[ip1] + d__[*i__]);
+ delsq2 = delsq / 2.;
+ temp = delsq2 / (d__[*i__] + sqrt(d__[*i__] * d__[*i__] + delsq2));
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[*i__] + temp;
+ delta[j] = d__[j] - d__[*i__] - temp;
+/* L100: */
+ }
+
+ psi = 0.;
+ i__1 = *i__ - 1;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / (work[j] * delta[j]);
+/* L110: */
+ }
+
+ phi = 0.;
+ i__1 = *i__ + 2;
+ for (j = *n; j >= i__1; --j) {
+ phi += z__[j] * z__[j] / (work[j] * delta[j]);
+/* L120: */
+ }
+ c__ = rhoinv + psi + phi;
+ w = c__ + z__[*i__] * z__[*i__] / (work[*i__] * delta[*i__]) + z__[
+ ip1] * z__[ip1] / (work[ip1] * delta[ip1]);
+
+ if (w > 0.) {
+
+/*
+ d(i)^2 < the ith sigma^2 < (d(i)^2+d(i+1)^2)/2
+
+ We choose d(i) as origin.
+*/
+
+ orgati = TRUE_;
+ sg2lb = 0.;
+ sg2ub = delsq2;
+ a = c__ * delsq + z__[*i__] * z__[*i__] + z__[ip1] * z__[ip1];
+ b = z__[*i__] * z__[*i__] * delsq;
+ if (a > 0.) {
+ tau = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__, abs(
+ d__1))));
+ } else {
+ tau = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ }
+
+/*
+ TAU now is an estimation of SIGMA^2 - D( I )^2. The
+ following, however, is the corresponding estimation of
+ SIGMA - D( I ).
+*/
+
+ eta = tau / (d__[*i__] + sqrt(d__[*i__] * d__[*i__] + tau));
+ } else {
+
+/*
+              (d(i)^2+d(i+1)^2)/2 <= the ith sigma^2 < d(i+1)^2
+
+ We choose d(i+1) as origin.
+*/
+
+ orgati = FALSE_;
+ sg2lb = -delsq2;
+ sg2ub = 0.;
+ a = c__ * delsq - z__[*i__] * z__[*i__] - z__[ip1] * z__[ip1];
+ b = z__[ip1] * z__[ip1] * delsq;
+ if (a < 0.) {
+ tau = b * 2. / (a - sqrt((d__1 = a * a + b * 4. * c__, abs(
+ d__1))));
+ } else {
+ tau = -(a + sqrt((d__1 = a * a + b * 4. * c__, abs(d__1)))) /
+ (c__ * 2.);
+ }
+
+/*
+ TAU now is an estimation of SIGMA^2 - D( IP1 )^2. The
+ following, however, is the corresponding estimation of
+ SIGMA - D( IP1 ).
+*/
+
+ eta = tau / (d__[ip1] + sqrt((d__1 = d__[ip1] * d__[ip1] + tau,
+ abs(d__1))));
+ }
+
+ if (orgati) {
+ ii = *i__;
+ *sigma = d__[*i__] + eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[*i__] + eta;
+ delta[j] = d__[j] - d__[*i__] - eta;
+/* L130: */
+ }
+ } else {
+ ii = *i__ + 1;
+ *sigma = d__[ip1] + eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[ip1] + eta;
+ delta[j] = d__[j] - d__[ip1] - eta;
+/* L140: */
+ }
+ }
+ iim1 = ii - 1;
+ iip1 = ii + 1;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L150: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.;
+ phi = 0.;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L160: */
+ }
+
+ w = rhoinv + phi + psi;
+
+/*
+ W is the value of the secular function with
+ its ii-th element removed.
+*/
+
+ swtch3 = FALSE_;
+ if (orgati) {
+ if (w < 0.) {
+ swtch3 = TRUE_;
+ }
+ } else {
+ if (w > 0.) {
+ swtch3 = TRUE_;
+ }
+ }
+ if (ii == 1 || ii == *n) {
+ swtch3 = FALSE_;
+ }
+
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w += temp;
+ erretm = (phi - psi) * 8. + erretm + rhoinv * 2. + abs(temp) * 3. +
+ abs(tau) * dw;
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+ if (w <= 0.) {
+ sg2lb = max(sg2lb,tau);
+ } else {
+ sg2ub = min(sg2ub,tau);
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ if (! swtch3) {
+ dtipsq = work[ip1] * delta[ip1];
+ dtisq = work[*i__] * delta[*i__];
+ if (orgati) {
+/* Computing 2nd power */
+ d__1 = z__[*i__] / dtisq;
+ c__ = w - dtipsq * dw + delsq * (d__1 * d__1);
+ } else {
+/* Computing 2nd power */
+ d__1 = z__[ip1] / dtipsq;
+ c__ = w - dtisq * dw - delsq * (d__1 * d__1);
+ }
+ a = (dtipsq + dtisq) * w - dtipsq * dtisq * dw;
+ b = dtipsq * dtisq * w;
+ if (c__ == 0.) {
+ if (a == 0.) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + dtipsq * dtipsq * (dpsi +
+ dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + dtisq * dtisq * (dpsi +
+ dphi);
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.) {
+ eta = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1)))) / (
+ c__ * 2.);
+ } else {
+ eta = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__, abs(
+ d__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ dtiim = work[iim1] * delta[iim1];
+ dtiip = work[iip1] * delta[iip1];
+ temp = rhoinv + psi + phi;
+ if (orgati) {
+ temp1 = z__[iim1] / dtiim;
+ temp1 *= temp1;
+ c__ = temp - dtiip * (dpsi + dphi) - (d__[iim1] - d__[iip1]) *
+ (d__[iim1] + d__[iip1]) * temp1;
+ zz[0] = z__[iim1] * z__[iim1];
+ if (dpsi < temp1) {
+ zz[2] = dtiip * dtiip * dphi;
+ } else {
+ zz[2] = dtiip * dtiip * (dpsi - temp1 + dphi);
+ }
+ } else {
+ temp1 = z__[iip1] / dtiip;
+ temp1 *= temp1;
+ c__ = temp - dtiim * (dpsi + dphi) - (d__[iip1] - d__[iim1]) *
+ (d__[iim1] + d__[iip1]) * temp1;
+ if (dphi < temp1) {
+ zz[0] = dtiim * dtiim * dpsi;
+ } else {
+ zz[0] = dtiim * dtiim * (dpsi + (dphi - temp1));
+ }
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ zz[1] = z__[ii] * z__[ii];
+ dd[0] = dtiim;
+ dd[1] = delta[ii] * work[ii];
+ dd[2] = dtiip;
+ dlaed6_(&niter, &orgati, &c__, dd, zz, &w, &eta, info);
+ if (*info != 0) {
+ goto L240;
+ }
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta >= 0.) {
+ eta = -w / dw;
+ }
+ if (orgati) {
+ temp1 = work[*i__] * delta[*i__];
+ temp = eta - temp1;
+ } else {
+ temp1 = work[ip1] * delta[ip1];
+ temp = eta - temp1;
+ }
+ if (temp > sg2ub || temp < sg2lb) {
+ if (w < 0.) {
+ eta = (sg2ub - tau) / 2.;
+ } else {
+ eta = (sg2lb - tau) / 2.;
+ }
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(*sigma * *sigma + eta);
+
+ prew = w;
+
+ *sigma += eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] += eta;
+ delta[j] -= eta;
+/* L170: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L180: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.;
+ phi = 0.;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L190: */
+ }
+
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8. + erretm + rhoinv * 2. + abs(temp) * 3. +
+ abs(tau) * dw;
+
+ if (w <= 0.) {
+ sg2lb = max(sg2lb,tau);
+ } else {
+ sg2ub = min(sg2ub,tau);
+ }
+
+ swtch = FALSE_;
+ if (orgati) {
+ if (-w > abs(prew) / 10.) {
+ swtch = TRUE_;
+ }
+ } else {
+ if (w > abs(prew) / 10.) {
+ swtch = TRUE_;
+ }
+ }
+
+/* Main loop to update the values of the array DELTA and WORK */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 20; ++niter) {
+
+/* Test for convergence */
+
+ if (abs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+/* Calculate the new step */
+
+ if (! swtch3) {
+ dtipsq = work[ip1] * delta[ip1];
+ dtisq = work[*i__] * delta[*i__];
+ if (! swtch) {
+ if (orgati) {
+/* Computing 2nd power */
+ d__1 = z__[*i__] / dtisq;
+ c__ = w - dtipsq * dw + delsq * (d__1 * d__1);
+ } else {
+/* Computing 2nd power */
+ d__1 = z__[ip1] / dtipsq;
+ c__ = w - dtisq * dw - delsq * (d__1 * d__1);
+ }
+ } else {
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ if (orgati) {
+ dpsi += temp * temp;
+ } else {
+ dphi += temp * temp;
+ }
+ c__ = w - dtisq * dpsi - dtipsq * dphi;
+ }
+ a = (dtipsq + dtisq) * w - dtipsq * dtisq * dw;
+ b = dtipsq * dtisq * w;
+ if (c__ == 0.) {
+ if (a == 0.) {
+ if (! swtch) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + dtipsq * dtipsq *
+ (dpsi + dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + dtisq * dtisq * (
+ dpsi + dphi);
+ }
+ } else {
+ a = dtisq * dtisq * dpsi + dtipsq * dtipsq * dphi;
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.) {
+ eta = (a - sqrt((d__1 = a * a - b * 4. * c__, abs(d__1))))
+ / (c__ * 2.);
+ } else {
+ eta = b * 2. / (a + sqrt((d__1 = a * a - b * 4. * c__,
+ abs(d__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ dtiim = work[iim1] * delta[iim1];
+ dtiip = work[iip1] * delta[iip1];
+ temp = rhoinv + psi + phi;
+ if (swtch) {
+ c__ = temp - dtiim * dpsi - dtiip * dphi;
+ zz[0] = dtiim * dtiim * dpsi;
+ zz[2] = dtiip * dtiip * dphi;
+ } else {
+ if (orgati) {
+ temp1 = z__[iim1] / dtiim;
+ temp1 *= temp1;
+ temp2 = (d__[iim1] - d__[iip1]) * (d__[iim1] + d__[
+ iip1]) * temp1;
+ c__ = temp - dtiip * (dpsi + dphi) - temp2;
+ zz[0] = z__[iim1] * z__[iim1];
+ if (dpsi < temp1) {
+ zz[2] = dtiip * dtiip * dphi;
+ } else {
+ zz[2] = dtiip * dtiip * (dpsi - temp1 + dphi);
+ }
+ } else {
+ temp1 = z__[iip1] / dtiip;
+ temp1 *= temp1;
+ temp2 = (d__[iip1] - d__[iim1]) * (d__[iim1] + d__[
+ iip1]) * temp1;
+ c__ = temp - dtiim * (dpsi + dphi) - temp2;
+ if (dphi < temp1) {
+ zz[0] = dtiim * dtiim * dpsi;
+ } else {
+ zz[0] = dtiim * dtiim * (dpsi + (dphi - temp1));
+ }
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ }
+ dd[0] = dtiim;
+ dd[1] = delta[ii] * work[ii];
+ dd[2] = dtiip;
+ dlaed6_(&niter, &orgati, &c__, dd, zz, &w, &eta, info);
+ if (*info != 0) {
+ goto L240;
+ }
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta >= 0.) {
+ eta = -w / dw;
+ }
+ if (orgati) {
+ temp1 = work[*i__] * delta[*i__];
+ temp = eta - temp1;
+ } else {
+ temp1 = work[ip1] * delta[ip1];
+ temp = eta - temp1;
+ }
+ if (temp > sg2ub || temp < sg2lb) {
+ if (w < 0.) {
+ eta = (sg2ub - tau) / 2.;
+ } else {
+ eta = (sg2lb - tau) / 2.;
+ }
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(*sigma * *sigma + eta);
+
+ *sigma += eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] += eta;
+ delta[j] -= eta;
+/* L200: */
+ }
+
+ prew = w;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.;
+ psi = 0.;
+ erretm = 0.;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L210: */
+ }
+ erretm = abs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.;
+ phi = 0.;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L220: */
+ }
+
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8. + erretm + rhoinv * 2. + abs(temp) * 3.
+ + abs(tau) * dw;
+ if (w * prew > 0. && abs(w) > abs(prew) / 10.) {
+ swtch = ! swtch;
+ }
+
+ if (w <= 0.) {
+ sg2lb = max(sg2lb,tau);
+ } else {
+ sg2ub = min(sg2ub,tau);
+ }
+
+/* L230: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+
+ }
+
+L240:
+ return 0;
+
+/* End of DLASD4 */
+
+} /* dlasd4_ */
+
+/* Subroutine */ int dlasd5_(integer *i__, doublereal *d__, doublereal *z__,
+ doublereal *delta, doublereal *rho, doublereal *dsigma, doublereal *
+ work)
+{
+ /* System generated locals */
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal b, c__, w, del, tau, delsq;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the square root of the I-th eigenvalue
+ of a positive symmetric rank-one modification of a 2-by-2 diagonal
+ matrix
+
+ diag( D ) * diag( D ) + RHO * Z * transpose(Z) .
+
+ The diagonal entries in the array D are assumed to satisfy
+
+ 0 <= D(i) < D(j) for i < j .
+
+ We also assume RHO > 0 and that the Euclidean norm of the vector
+ Z is one.
+
+ Arguments
+ =========
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. I = 1 or I = 2.
+
+ D (input) DOUBLE PRECISION array, dimension ( 2 )
+ The original eigenvalues. We assume 0 <= D(1) < D(2).
+
+ Z (input) DOUBLE PRECISION array, dimension ( 2 )
+ The components of the updating vector.
+
+ DELTA (output) DOUBLE PRECISION array, dimension ( 2 )
+ Contains (D(j) - sigma_I) in its j-th component.
+ The vector DELTA contains the information necessary
+ to construct the eigenvectors.
+
+ RHO (input) DOUBLE PRECISION
+ The scalar in the symmetric updating formula.
+
+ DSIGMA (output) DOUBLE PRECISION
+ The computed sigma_I, the I-th updated eigenvalue.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension ( 2 )
+ WORK contains (D(j) + sigma_I) in its j-th component.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+*/
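+
+/*
+   For this 2-by-2 problem the secular equation reduces to a quadratic
+   and is solved in closed form.  For example, in the I = 1, W > 0
+   branch below (origin at D(1), TAU = DSIGMA**2 - D(1)**2) it becomes
+
+      TAU**2 - B*TAU + C = 0,   B = DELSQ + RHO*( Z(1)**2 + Z(2)**2 ),
+                                C = RHO * Z(1)**2 * DELSQ,
+
+   with DELSQ = D(2)**2 - D(1)**2, and the smaller root is evaluated in
+   the cancellation-free form TAU = 2*C / ( B + SQRT( B*B - 4*C ) ).
+   The remaining branches differ only in the choice of origin and of
+   the root.
+*/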
+
+
+ /* Parameter adjustments */
+ --work;
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ del = d__[2] - d__[1];
+ delsq = del * (d__[2] + d__[1]);
+ if (*i__ == 1) {
+ w = *rho * 4. * (z__[2] * z__[2] / (d__[1] + d__[2] * 3.) - z__[1] *
+ z__[1] / (d__[1] * 3. + d__[2])) / del + 1.;
+ if (w > 0.) {
+ b = delsq + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[1] * z__[1] * delsq;
+
+/*
+ B > ZERO, always
+
+ The following TAU is DSIGMA * DSIGMA - D( 1 ) * D( 1 )
+*/
+
+ tau = c__ * 2. / (b + sqrt((d__1 = b * b - c__ * 4., abs(d__1))));
+
+/* The following TAU is DSIGMA - D( 1 ) */
+
+ tau /= d__[1] + sqrt(d__[1] * d__[1] + tau);
+ *dsigma = d__[1] + tau;
+ delta[1] = -tau;
+ delta[2] = del - tau;
+ work[1] = d__[1] * 2. + tau;
+ work[2] = d__[1] + tau + d__[2];
+/*
+ DELTA( 1 ) = -Z( 1 ) / TAU
+ DELTA( 2 ) = Z( 2 ) / ( DEL-TAU )
+*/
+ } else {
+ b = -delsq + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * delsq;
+
+/* The following TAU is DSIGMA * DSIGMA - D( 2 ) * D( 2 ) */
+
+ if (b > 0.) {
+ tau = c__ * -2. / (b + sqrt(b * b + c__ * 4.));
+ } else {
+ tau = (b - sqrt(b * b + c__ * 4.)) / 2.;
+ }
+
+/* The following TAU is DSIGMA - D( 2 ) */
+
+ tau /= d__[2] + sqrt((d__1 = d__[2] * d__[2] + tau, abs(d__1)));
+ *dsigma = d__[2] + tau;
+ delta[1] = -(del + tau);
+ delta[2] = -tau;
+ work[1] = d__[1] + tau + d__[2];
+ work[2] = d__[2] * 2. + tau;
+/*
+ DELTA( 1 ) = -Z( 1 ) / ( DEL+TAU )
+ DELTA( 2 ) = -Z( 2 ) / TAU
+*/
+ }
+/*
+ TEMP = SQRT( DELTA( 1 )*DELTA( 1 )+DELTA( 2 )*DELTA( 2 ) )
+ DELTA( 1 ) = DELTA( 1 ) / TEMP
+ DELTA( 2 ) = DELTA( 2 ) / TEMP
+*/
+ } else {
+
+/* Now I=2 */
+
+ b = -delsq + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * delsq;
+
+/* The following TAU is DSIGMA * DSIGMA - D( 2 ) * D( 2 ) */
+
+ if (b > 0.) {
+ tau = (b + sqrt(b * b + c__ * 4.)) / 2.;
+ } else {
+ tau = c__ * 2. / (-b + sqrt(b * b + c__ * 4.));
+ }
+
+/* The following TAU is DSIGMA - D( 2 ) */
+
+ tau /= d__[2] + sqrt(d__[2] * d__[2] + tau);
+ *dsigma = d__[2] + tau;
+ delta[1] = -(del + tau);
+ delta[2] = -tau;
+ work[1] = d__[1] + tau + d__[2];
+ work[2] = d__[2] * 2. + tau;
+/*
+ DELTA( 1 ) = -Z( 1 ) / ( DEL+TAU )
+ DELTA( 2 ) = -Z( 2 ) / TAU
+ TEMP = SQRT( DELTA( 1 )*DELTA( 1 )+DELTA( 2 )*DELTA( 2 ) )
+ DELTA( 1 ) = DELTA( 1 ) / TEMP
+ DELTA( 2 ) = DELTA( 2 ) / TEMP
+*/
+ }
+ return 0;
+
+/* End of DLASD5 */
+
+} /* dlasd5_ */
+
+/* Subroutine */ int dlasd6_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, doublereal *d__, doublereal *vf, doublereal *vl,
+ doublereal *alpha, doublereal *beta, integer *idxq, integer *perm,
+ integer *givptr, integer *givcol, integer *ldgcol, doublereal *givnum,
+ integer *ldgnum, doublereal *poles, doublereal *difl, doublereal *
+ difr, doublereal *z__, integer *k, doublereal *c__, doublereal *s,
+ doublereal *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, givnum_dim1, givnum_offset,
+ poles_dim1, poles_offset, i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, m, n, n1, n2, iw, idx, idxc, idxp, ivfw, ivlw;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *), dlasd7_(integer *, integer *, integer *,
+ integer *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *), dlasd8_(
+ integer *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *), dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlamrg_(integer *, integer *,
+ doublereal *, integer *, integer *, integer *);
+ static integer isigma;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal orgnrm;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLASD6 computes the SVD of an updated upper bidiagonal matrix B
+ obtained by merging two smaller ones by appending a row. This
+ routine is used only for the problem which requires all singular
+ values and optionally singular vector matrices in factored form.
+ B is an N-by-M matrix with N = NL + NR + 1 and M = N + SQRE.
+ A related subroutine, DLASD1, handles the case in which all singular
+ values and singular vectors of the bidiagonal matrix are desired.
+
+ DLASD6 computes the SVD as follows:
+
+ ( D1(in) 0 0 0 )
+ B = U(in) * ( Z1' a Z2' b ) * VT(in)
+ ( 0 0 D2(in) 0 )
+
+ = U(out) * ( D(out) 0) * VT(out)
+
+ where Z' = (Z1' a Z2' b) = u' VT', and u is a vector of dimension M
+ with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+ elsewhere; and the entry b is empty if SQRE = 0.
+
+ The singular values of B can be computed using D1, D2, the first
+ components of all the right singular vectors of the lower block, and
+ the last components of all the right singular vectors of the upper
+ block. These components are stored and updated in VF and VL,
+ respectively, in DLASD6. Hence U and VT are not explicitly
+ referenced.
+
+ The singular values are stored in D. The algorithm consists of two
+ stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple singular values or if there is a zero
+    in the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine DLASD7.
+
+ The second stage consists of calculating the updated
+ singular values. This is done by finding the roots of the
+ secular equation via the routine DLASD4 (as called by DLASD8).
+ This routine also updates VF and VL and computes the distances
+ between the updated singular values and the old singular
+ values.
+
+ DLASD6 is called from DLASDA.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form:
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors in factored form as well.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ D (input/output) DOUBLE PRECISION array, dimension ( NL+NR+1 ).
+            On entry D(1:NL) contains the singular values of the
+ upper block, and D(NL+2:N) contains the singular values
+ of the lower block. On exit D(1:N) contains the singular
+ values of the modified matrix.
+
+ VF (input/output) DOUBLE PRECISION array, dimension ( M )
+ On entry, VF(1:NL+1) contains the first components of all
+ right singular vectors of the upper block; and VF(NL+2:M)
+ contains the first components of all right singular vectors
+ of the lower block. On exit, VF contains the first components
+ of all right singular vectors of the bidiagonal matrix.
+
+ VL (input/output) DOUBLE PRECISION array, dimension ( M )
+ On entry, VL(1:NL+1) contains the last components of all
+ right singular vectors of the upper block; and VL(NL+2:M)
+ contains the last components of all right singular vectors of
+ the lower block. On exit, VL contains the last components of
+ all right singular vectors of the bidiagonal matrix.
+
+ ALPHA (input/output) DOUBLE PRECISION
+ Contains the diagonal element associated with the added row.
+
+ BETA (input/output) DOUBLE PRECISION
+ Contains the off-diagonal element associated with the added
+ row.
+
+ IDXQ (output) INTEGER array, dimension ( N )
+ This contains the permutation which will reintegrate the
+ subproblem just solved back into sorted order, i.e.
+ D( IDXQ( I = 1, N ) ) will be in ascending order.
+
+ PERM (output) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) to be applied
+ to each block. Not referenced if ICOMPQ = 0.
+
+ GIVPTR (output) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem. Not referenced if ICOMPQ = 0.
+
+ GIVCOL (output) INTEGER array, dimension ( LDGCOL, 2 )
+            Each pair of numbers indicates a pair of columns to take part
+ in a Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGCOL (input) INTEGER
+ leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (output) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value to be used in the
+ corresponding Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of GIVNUM and POLES, must be at least N.
+
+ POLES (output) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ On exit, POLES(1,*) is an array containing the new singular
+ values obtained from solving the secular equation, and
+ POLES(2,*) is an array containing the poles in the secular
+ equation. Not referenced if ICOMPQ = 0.
+
+ DIFL (output) DOUBLE PRECISION array, dimension ( N )
+ On exit, DIFL(I) is the distance between I-th updated
+ (undeflated) singular value and the I-th (undeflated) old
+ singular value.
+
+ DIFR (output) DOUBLE PRECISION array,
+ dimension ( LDGNUM, 2 ) if ICOMPQ = 1 and
+ dimension ( N ) if ICOMPQ = 0.
+ On exit, DIFR(I, 1) is the distance between I-th updated
+ (undeflated) singular value and the I+1-th (undeflated) old
+ singular value.
+
+ If ICOMPQ = 1, DIFR(1:K,2) is an array containing the
+ normalizing factors for the right singular vector matrix.
+
+ See DLASD8 for details on DIFL and DIFR.
+
+ Z (output) DOUBLE PRECISION array, dimension ( M )
+ The first elements of this array contain the components
+ of the deflation-adjusted updating row vector.
+
+ K (output) INTEGER
+ Contains the dimension of the non-deflated matrix,
+ This is the order of the related secular equation. 1 <= K <=N.
+
+ C (output) DOUBLE PRECISION
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (output) DOUBLE PRECISION
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension ( 4 * M )
+
+ IWORK (workspace) INTEGER array, dimension ( 3 * N )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --vf;
+ --vl;
+ --idxq;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ poles_dim1 = *ldgnum;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ --difl;
+ --difr;
+ --z__;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ } else if (*ldgcol < n) {
+ *info = -14;
+ } else if (*ldgnum < n) {
+ *info = -16;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD6", &i__1);
+ return 0;
+ }
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in DLASD7 and DLASD8.
+*/
+
+ isigma = 1;
+ iw = isigma + n;
+ ivfw = iw + m;
+ ivlw = ivfw + m;
+
+ idx = 1;
+ idxc = idx + n;
+ idxp = idxc + n;
+
+/*
+ Scale.
+
+ Computing MAX
+*/
+ d__1 = abs(*alpha), d__2 = abs(*beta);
+ orgnrm = max(d__1,d__2);
+ d__[*nl + 1] = 0.;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = d__[i__], abs(d__1)) > orgnrm) {
+ orgnrm = (d__1 = d__[i__], abs(d__1));
+ }
+/* L10: */
+ }
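+
+/*
+   ORGNRM is now the largest of ABS( ALPHA ), ABS( BETA ) and
+   ABS( D( I ) ), I = 1..N.  D, ALPHA and BETA are divided by it so
+   that the subproblem passed to DLASD7 and DLASD8 has entries of
+   magnitude at most one; D is scaled back by the same factor after
+   the secular equation has been solved.
+*/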
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &n, &c__1, &d__[1], &n, info);
+ *alpha /= orgnrm;
+ *beta /= orgnrm;
+
+/* Sort and Deflate singular values. */
+
+ dlasd7_(icompq, nl, nr, sqre, k, &d__[1], &z__[1], &work[iw], &vf[1], &
+ work[ivfw], &vl[1], &work[ivlw], alpha, beta, &work[isigma], &
+ iwork[idx], &iwork[idxp], &idxq[1], &perm[1], givptr, &givcol[
+ givcol_offset], ldgcol, &givnum[givnum_offset], ldgnum, c__, s,
+ info);
+
+/* Solve Secular Equation, compute DIFL, DIFR, and update VF, VL. */
+
+ dlasd8_(icompq, k, &d__[1], &z__[1], &vf[1], &vl[1], &difl[1], &difr[1],
+ ldgnum, &work[isigma], &work[iw], info);
+
+/* Save the poles if ICOMPQ = 1. */
+
+ if (*icompq == 1) {
+ dcopy_(k, &d__[1], &c__1, &poles[poles_dim1 + 1], &c__1);
+ dcopy_(k, &work[isigma], &c__1, &poles[(poles_dim1 << 1) + 1], &c__1);
+ }
+
+/* Unscale. */
+
+ dlascl_("G", &c__0, &c__0, &c_b15, &orgnrm, &n, &c__1, &d__[1], &n, info);
+
+/* Prepare the IDXQ sorting permutation. */
+
+ n1 = *k;
+ n2 = n - *k;
+ dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]);
+
+ return 0;
+
+/* End of DLASD6 */
+
+} /* dlasd6_ */
+
+/* Subroutine */ int dlasd7_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, integer *k, doublereal *d__, doublereal *z__,
+ doublereal *zw, doublereal *vf, doublereal *vfw, doublereal *vl,
+ doublereal *vlw, doublereal *alpha, doublereal *beta, doublereal *
+ dsigma, integer *idx, integer *idxp, integer *idxq, integer *perm,
+ integer *givptr, integer *givcol, integer *ldgcol, doublereal *givnum,
+ integer *ldgnum, doublereal *c__, doublereal *s, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, givnum_dim1, givnum_offset, i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, j, m, n, k2;
+ static doublereal z1;
+ static integer jp;
+ static doublereal eps, tau, tol;
+ static integer nlp1, nlp2, idxi, idxj;
+ extern /* Subroutine */ int drot_(integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *);
+ static integer idxjp;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer jprev;
+
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), xerbla_(char *, integer *);
+ static doublereal hlftol;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASD7 merges the two sets of singular values together into a single
+ sorted set. Then it tries to deflate the size of the problem. There
+ are two ways in which deflation can occur: when two or more singular
+ values are close together or if there is a tiny entry in the Z
+ vector. For each such occurrence the order of the related
+ secular equation problem is reduced by one.
+
+ DLASD7 is called from DLASD6.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed
+ in compact form, as follows:
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors of upper
+ bidiagonal matrix in compact form.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has
+ N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ K (output) INTEGER
+ Contains the dimension of the non-deflated matrix, this is
+           the order of the related secular equation. 1 <= K <= N.
+
+ D (input/output) DOUBLE PRECISION array, dimension ( N )
+ On entry D contains the singular values of the two submatrices
+ to be combined. On exit D contains the trailing (N-K) updated
+ singular values (those which were deflated) sorted into
+ increasing order.
+
+ Z (output) DOUBLE PRECISION array, dimension ( M )
+ On exit Z contains the updating row vector in the secular
+ equation.
+
+ ZW (workspace) DOUBLE PRECISION array, dimension ( M )
+ Workspace for Z.
+
+ VF (input/output) DOUBLE PRECISION array, dimension ( M )
+ On entry, VF(1:NL+1) contains the first components of all
+ right singular vectors of the upper block; and VF(NL+2:M)
+ contains the first components of all right singular vectors
+ of the lower block. On exit, VF contains the first components
+ of all right singular vectors of the bidiagonal matrix.
+
+ VFW (workspace) DOUBLE PRECISION array, dimension ( M )
+ Workspace for VF.
+
+ VL (input/output) DOUBLE PRECISION array, dimension ( M )
+ On entry, VL(1:NL+1) contains the last components of all
+ right singular vectors of the upper block; and VL(NL+2:M)
+ contains the last components of all right singular vectors
+ of the lower block. On exit, VL contains the last components
+ of all right singular vectors of the bidiagonal matrix.
+
+ VLW (workspace) DOUBLE PRECISION array, dimension ( M )
+ Workspace for VL.
+
+ ALPHA (input) DOUBLE PRECISION
+ Contains the diagonal element associated with the added row.
+
+ BETA (input) DOUBLE PRECISION
+ Contains the off-diagonal element associated with the added
+ row.
+
+ DSIGMA (output) DOUBLE PRECISION array, dimension ( N )
+ Contains a copy of the diagonal elements (K-1 singular values
+ and one zero) in the secular equation.
+
+ IDX (workspace) INTEGER array, dimension ( N )
+ This will contain the permutation used to sort the contents of
+ D into ascending order.
+
+ IDXP (workspace) INTEGER array, dimension ( N )
+ This will contain the permutation used to place deflated
+ values of D at the end of the array. On output IDXP(2:K)
+ points to the nondeflated D-values and IDXP(K+1:N)
+ points to the deflated singular values.
+
+ IDXQ (input) INTEGER array, dimension ( N )
+ This contains the permutation which separately sorts the two
+ sub-problems in D into ascending order. Note that entries in
+ the first half of this permutation must first be moved one
+ position backward; and entries in the second half
+ must first have NL+1 added to their values.
+
+ PERM (output) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) to be applied
+ to each singular block. Not referenced if ICOMPQ = 0.
+
+ GIVPTR (output) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem. Not referenced if ICOMPQ = 0.
+
+ GIVCOL (output) INTEGER array, dimension ( LDGCOL, 2 )
+           Each pair of numbers indicates a pair of columns involved
+           in a Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (output) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value to be used in the
+ corresponding Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of GIVNUM, must be at least N.
+
+ C (output) DOUBLE PRECISION
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (output) DOUBLE PRECISION
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --z__;
+ --zw;
+ --vf;
+ --vfw;
+ --vl;
+ --vlw;
+ --dsigma;
+ --idx;
+ --idxp;
+ --idxq;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+
+ /* Function Body */
+ *info = 0;
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ } else if (*ldgcol < n) {
+ *info = -22;
+ } else if (*ldgnum < n) {
+ *info = -24;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD7", &i__1);
+ return 0;
+ }
+
+ nlp1 = *nl + 1;
+ nlp2 = *nl + 2;
+ if (*icompq == 1) {
+ *givptr = 0;
+ }
+
+/*
+ Generate the first part of the vector Z and move the singular
+ values in the first part of D one position backward.
+*/
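+
+/*
+   Informal sketch of the code below: the updating row is assembled as
+       Z(2:NL+1) = ALPHA * VL(1:NL),   Z(NL+2:M) = BETA * VF(NL+2:M),
+   with Z(1) built later from ALPHA*VL(NL+1) (kept in Z1), while D and IDXQ
+   are shifted down one position to make room at the top (D(1) is treated
+   separately later).
+*/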
+
+ z1 = *alpha * vl[nlp1];
+ vl[nlp1] = 0.;
+ tau = vf[nlp1];
+ for (i__ = *nl; i__ >= 1; --i__) {
+ z__[i__ + 1] = *alpha * vl[i__];
+ vl[i__] = 0.;
+ vf[i__ + 1] = vf[i__];
+ d__[i__ + 1] = d__[i__];
+ idxq[i__ + 1] = idxq[i__] + 1;
+/* L10: */
+ }
+ vf[1] = tau;
+
+/* Generate the second part of the vector Z. */
+
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ z__[i__] = *beta * vf[i__];
+ vf[i__] = 0.;
+/* L20: */
+ }
+
+/* Sort the singular values into increasing order */
+
+ i__1 = n;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ idxq[i__] += nlp1;
+/* L30: */
+ }
+
+/* DSIGMA, ZW, VFW, and VLW are used as storage space. */
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ dsigma[i__] = d__[idxq[i__]];
+ zw[i__] = z__[idxq[i__]];
+ vfw[i__] = vf[idxq[i__]];
+ vlw[i__] = vl[idxq[i__]];
+/* L40: */
+ }
+
+ dlamrg_(nl, nr, &dsigma[2], &c__1, &c__1, &idx[2]);
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ idxi = idx[i__] + 1;
+ d__[i__] = dsigma[idxi];
+ z__[i__] = zw[idxi];
+ vf[i__] = vfw[idxi];
+ vl[i__] = vlw[idxi];
+/* L50: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ eps = EPSILON;
+/* Computing MAX */
+ d__1 = abs(*alpha), d__2 = abs(*beta);
+ tol = max(d__1,d__2);
+/* Computing MAX */
+ d__2 = (d__1 = d__[n], abs(d__1));
+ tol = eps * 64. * max(d__2,tol);
+
+/*
+ There are 2 kinds of deflation -- first a value in the z-vector
+ is small, second two (or more) singular values are very close
+ together (their difference is small).
+
+ If the value in the z-vector is small, we simply permute the
+ array so that the corresponding singular value is moved to the
+ end.
+
+ If two values in the D-vector are close, we perform a two-sided
+ rotation designed to make one of the corresponding z-vector
+ entries zero, and then permute the array so that the deflated
+ singular value is moved to the end.
+
+ If there are multiple singular values then the problem deflates.
+ Here the number of equal singular values are found. As each equal
+ singular value is found, an elementary reflector is computed to
+ rotate the corresponding singular subspace so that the
+ corresponding components of Z are zero in this new basis.
+*/
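+
+/*
+   Informal note on the close-singular-value branch below: with
+       TAU = sqrt( Z(JPREV)**2 + Z(J)**2 ),
+   a Givens rotation with cosine Z(J)/TAU and sine -Z(JPREV)/TAU sends the
+   pair ( Z(JPREV), Z(J) ) to ( 0, TAU ), so the JPREV-th entry can be
+   deflated; the same rotation is applied to the corresponding entries of
+   VF and VL through DROT so the singular vectors remain consistent.
+*/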
+
+ *k = 1;
+ k2 = n + 1;
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ if ((d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ if (j == n) {
+ goto L100;
+ }
+ } else {
+ jprev = j;
+ goto L70;
+ }
+/* L60: */
+ }
+L70:
+ j = jprev;
+L80:
+ ++j;
+ if (j > n) {
+ goto L90;
+ }
+ if ((d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ } else {
+
+/* Check if singular values are close enough to allow deflation. */
+
+ if ((d__1 = d__[j] - d__[jprev], abs(d__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ *s = z__[jprev];
+ *c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = dlapy2_(c__, s);
+ z__[j] = tau;
+ z__[jprev] = 0.;
+ *c__ /= tau;
+ *s = -(*s) / tau;
+
+/* Record the appropriate Givens rotation */
+
+ if (*icompq == 1) {
+ ++(*givptr);
+ idxjp = idxq[idx[jprev] + 1];
+ idxj = idxq[idx[j] + 1];
+ if (idxjp <= nlp1) {
+ --idxjp;
+ }
+ if (idxj <= nlp1) {
+ --idxj;
+ }
+ givcol[*givptr + (givcol_dim1 << 1)] = idxjp;
+ givcol[*givptr + givcol_dim1] = idxj;
+ givnum[*givptr + (givnum_dim1 << 1)] = *c__;
+ givnum[*givptr + givnum_dim1] = *s;
+ }
+ drot_(&c__1, &vf[jprev], &c__1, &vf[j], &c__1, c__, s);
+ drot_(&c__1, &vl[jprev], &c__1, &vl[j], &c__1, c__, s);
+ --k2;
+ idxp[k2] = jprev;
+ jprev = j;
+ } else {
+ ++(*k);
+ zw[*k] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+ jprev = j;
+ }
+ }
+ goto L80;
+L90:
+
+/* Record the last singular value. */
+
+ ++(*k);
+ zw[*k] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+
+L100:
+
+/*
+ Sort the singular values into DSIGMA. The singular values which
+ were not deflated go into the first K slots of DSIGMA, except
+ that DSIGMA(1) is treated separately.
+*/
+
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ dsigma[j] = d__[jp];
+ vfw[j] = vf[jp];
+ vlw[j] = vl[jp];
+/* L110: */
+ }
+ if (*icompq == 1) {
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ perm[j] = idxq[idx[jp] + 1];
+ if (perm[j] <= nlp1) {
+ --perm[j];
+ }
+/* L120: */
+ }
+ }
+
+/*
+ The deflated singular values go back into the last N - K slots of
+ D.
+*/
+
+ i__1 = n - *k;
+ dcopy_(&i__1, &dsigma[*k + 1], &c__1, &d__[*k + 1], &c__1);
+
+/*
+ Determine DSIGMA(1), DSIGMA(2), Z(1), VF(1), VL(1), VF(M), and
+ VL(M).
+*/
+
+ dsigma[1] = 0.;
+ hlftol = tol / 2.;
+ if (abs(dsigma[2]) <= hlftol) {
+ dsigma[2] = hlftol;
+ }
+ if (m > n) {
+ z__[1] = dlapy2_(&z1, &z__[m]);
+ if (z__[1] <= tol) {
+ *c__ = 1.;
+ *s = 0.;
+ z__[1] = tol;
+ } else {
+ *c__ = z1 / z__[1];
+ *s = -z__[m] / z__[1];
+ }
+ drot_(&c__1, &vf[m], &c__1, &vf[1], &c__1, c__, s);
+ drot_(&c__1, &vl[m], &c__1, &vl[1], &c__1, c__, s);
+ } else {
+ if (abs(z1) <= tol) {
+ z__[1] = tol;
+ } else {
+ z__[1] = z1;
+ }
+ }
+
+/* Restore Z, VF, and VL. */
+
+ i__1 = *k - 1;
+ dcopy_(&i__1, &zw[2], &c__1, &z__[2], &c__1);
+ i__1 = n - 1;
+ dcopy_(&i__1, &vfw[2], &c__1, &vf[2], &c__1);
+ i__1 = n - 1;
+ dcopy_(&i__1, &vlw[2], &c__1, &vl[2], &c__1);
+
+ return 0;
+
+/* End of DLASD7 */
+
+} /* dlasd7_ */
+
+/* Subroutine */ int dlasd8_(integer *icompq, integer *k, doublereal *d__,
+ doublereal *z__, doublereal *vf, doublereal *vl, doublereal *difl,
+ doublereal *difr, integer *lddifr, doublereal *dsigma, doublereal *
+ work, integer *info)
+{
+ /* System generated locals */
+ integer difr_dim1, difr_offset, i__1, i__2;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal dj, rho;
+ static integer iwk1, iwk2, iwk3;
+ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ static doublereal temp;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ static integer iwk2i, iwk3i;
+ static doublereal diflj, difrj, dsigj;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ extern /* Subroutine */ int dlasd4_(integer *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *), dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlaset_(char *, integer *, integer
+ *, doublereal *, doublereal *, doublereal *, integer *),
+ xerbla_(char *, integer *);
+ static doublereal dsigjp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLASD8 finds the square roots of the roots of the secular equation,
+ as defined by the values in DSIGMA and Z. It makes the appropriate
+ calls to DLASD4, and stores, for each element in D, the distance
+ to its two nearest poles (elements in DSIGMA). It also updates
+ the arrays VF and VL, the first and last components of all the
+ right singular vectors of the original bidiagonal matrix.
+
+ DLASD8 is called from DLASD6.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form in the calling routine:
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors in factored form as well.
+
+ K (input) INTEGER
+ The number of terms in the rational function to be solved
+ by DLASD4. K >= 1.
+
+ D (output) DOUBLE PRECISION array, dimension ( K )
+ On output, D contains the updated singular values.
+
+ Z (input/output) DOUBLE PRECISION array, dimension ( K )
+ On entry, the first K elements of this array contain the
+ components of the deflation-adjusted updating row vector.
+ On exit, Z is updated.
+
+ VF (input/output) DOUBLE PRECISION array, dimension ( K )
+           On entry, VF contains information passed through DLASD6.
+ On exit, VF contains the first K components of the first
+ components of all right singular vectors of the bidiagonal
+ matrix.
+
+ VL (input/output) DOUBLE PRECISION array, dimension ( K )
+           On entry, VL contains information passed through DLASD6.
+ On exit, VL contains the first K components of the last
+ components of all right singular vectors of the bidiagonal
+ matrix.
+
+ DIFL (output) DOUBLE PRECISION array, dimension ( K )
+ On exit, DIFL(I) = D(I) - DSIGMA(I).
+
+ DIFR (output) DOUBLE PRECISION array,
+ dimension ( LDDIFR, 2 ) if ICOMPQ = 1 and
+ dimension ( K ) if ICOMPQ = 0.
+ On exit, DIFR(I,1) = D(I) - DSIGMA(I+1), DIFR(K,1) is not
+ defined and will not be referenced.
+
+ If ICOMPQ = 1, DIFR(1:K,2) is an array containing the
+ normalizing factors for the right singular vector matrix.
+
+ LDDIFR (input) INTEGER
+ The leading dimension of DIFR, must be at least K.
+
+ DSIGMA (input/output) DOUBLE PRECISION array, dimension ( K )
+ On entry, the first K elements of this array contain the old
+ roots of the deflated updating problem. These are the poles
+ of the secular equation.
+ On exit, the elements of DSIGMA may be very slightly altered
+ in value.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension at least 3 * K
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
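+
+/*
+   Informal sketch of the main step below: Z is first scaled to unit norm and
+   RHO becomes the squared norm of the original Z; then, for each J, DLASD4
+   finds the J-th root SIGMA of the secular equation
+       f(SIGMA) = 1 + RHO * sum_i Z(i)**2 / ( DSIGMA(i)**2 - SIGMA**2 ) = 0,
+   which is stored in D(J), and the gaps DIFL(J) = D(J) - DSIGMA(J) and
+   DIFR(J,1) = D(J) - DSIGMA(J+1) are kept so later differences can be
+   formed without cancellation.
+*/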
+
+ /* Parameter adjustments */
+ --d__;
+ --z__;
+ --vf;
+ --vl;
+ --difl;
+ difr_dim1 = *lddifr;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ --dsigma;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*k < 1) {
+ *info = -2;
+ } else if (*lddifr < *k) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASD8", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 1) {
+ d__[1] = abs(z__[1]);
+ difl[1] = d__[1];
+ if (*icompq == 1) {
+ difl[2] = 1.;
+ difr[(difr_dim1 << 1) + 1] = 1.;
+ }
+ return 0;
+ }
+
+/*
+ Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DSIGMA(I) if it is 1; this makes the subsequent
+ subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DSIGMA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DSIGMA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+    2*DSIGMA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
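+
+/*
+   Informal note: DLAMC3(X, Y) simply returns X + Y through an external call
+   that the compiler cannot fold away, so the statement below effectively
+   computes ( DSIGMA(I) + DSIGMA(I) ) - DSIGMA(I) in working precision,
+   forcing DSIGMA(I) onto a representable value as described above.
+*/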
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dsigma[i__] = dlamc3_(&dsigma[i__], &dsigma[i__]) - dsigma[i__];
+/* L10: */
+ }
+
+/* Book keeping. */
+
+ iwk1 = 1;
+ iwk2 = iwk1 + *k;
+ iwk3 = iwk2 + *k;
+ iwk2i = iwk2 - 1;
+ iwk3i = iwk3 - 1;
+
+/* Normalize Z. */
+
+ rho = dnrm2_(k, &z__[1], &c__1);
+ dlascl_("G", &c__0, &c__0, &rho, &c_b15, k, &c__1, &z__[1], k, info);
+ rho *= rho;
+
+/* Initialize WORK(IWK3). */
+
+ dlaset_("A", k, &c__1, &c_b15, &c_b15, &work[iwk3], k);
+
+/*
+ Compute the updated singular values, the arrays DIFL, DIFR,
+ and the updated Z.
+*/
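+
+/*
+   Informal note on the accumulation below: for each I, WORK(IWK3) collects,
+   up to sign, the product
+       prod_J ( DSIGMA(I)**2 - D(J)**2 ) / prod_{J != I} ( DSIGMA(I)**2 - DSIGMA(J)**2 ),
+   and the updated Z(I) is later taken as the square root of its absolute
+   value with the sign of the original Z(I). This recomputes Z so that the
+   roots found by DLASD4 are exact for a nearby problem, which preserves the
+   high relative accuracy of the updated singular values.
+*/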
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dlasd4_(k, &j, &dsigma[1], &z__[1], &work[iwk1], &rho, &d__[j], &work[
+ iwk2], info);
+
+/* If the root finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ return 0;
+ }
+ work[iwk3i + j] = work[iwk3i + j] * work[j] * work[iwk2i + j];
+ difl[j] = -work[j];
+ difr[j + difr_dim1] = -work[j + 1];
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[iwk3i + i__] = work[iwk3i + i__] * work[i__] * work[iwk2i +
+ i__] / (dsigma[i__] - dsigma[j]) / (dsigma[i__] + dsigma[
+ j]);
+/* L20: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ work[iwk3i + i__] = work[iwk3i + i__] * work[i__] * work[iwk2i +
+ i__] / (dsigma[i__] - dsigma[j]) / (dsigma[i__] + dsigma[
+ j]);
+/* L30: */
+ }
+/* L40: */
+ }
+
+/* Compute updated Z. */
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__2 = sqrt((d__1 = work[iwk3i + i__], abs(d__1)));
+ z__[i__] = d_sign(&d__2, &z__[i__]);
+/* L50: */
+ }
+
+/* Update VF and VL. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ diflj = difl[j];
+ dj = d__[j];
+ dsigj = -dsigma[j];
+ if (j < *k) {
+ difrj = -difr[j + difr_dim1];
+ dsigjp = -dsigma[j + 1];
+ }
+ work[j] = -z__[j] / diflj / (dsigma[j] + dj);
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[i__] = z__[i__] / (dlamc3_(&dsigma[i__], &dsigj) - diflj) / (
+ dsigma[i__] + dj);
+/* L60: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ work[i__] = z__[i__] / (dlamc3_(&dsigma[i__], &dsigjp) + difrj) /
+ (dsigma[i__] + dj);
+/* L70: */
+ }
+ temp = dnrm2_(k, &work[1], &c__1);
+ work[iwk2i + j] = ddot_(k, &work[1], &c__1, &vf[1], &c__1) / temp;
+ work[iwk3i + j] = ddot_(k, &work[1], &c__1, &vl[1], &c__1) / temp;
+ if (*icompq == 1) {
+ difr[j + (difr_dim1 << 1)] = temp;
+ }
+/* L80: */
+ }
+
+ dcopy_(k, &work[iwk2], &c__1, &vf[1], &c__1);
+ dcopy_(k, &work[iwk3], &c__1, &vl[1], &c__1);
+
+ return 0;
+
+/* End of DLASD8 */
+
+} /* dlasd8_ */
+
+/* Subroutine */ int dlasda_(integer *icompq, integer *smlsiz, integer *n,
+ integer *sqre, doublereal *d__, doublereal *e, doublereal *u, integer
+ *ldu, doublereal *vt, integer *k, doublereal *difl, doublereal *difr,
+ doublereal *z__, doublereal *poles, integer *givptr, integer *givcol,
+ integer *ldgcol, integer *perm, doublereal *givnum, doublereal *c__,
+ doublereal *s, doublereal *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, difl_dim1,
+ difl_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset,
+ poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset,
+ z_dim1, z_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, m, i1, ic, lf, nd, ll, nl, vf, nr, vl, im1, ncc,
+ nlf, nrf, vfi, iwk, vli, lvl, nru, ndb1, nlp1, lvl2, nrp1;
+ static doublereal beta;
+ static integer idxq, nlvl;
+ static doublereal alpha;
+ static integer inode, ndiml, ndimr, idxqi, itemp;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer sqrei;
+ extern /* Subroutine */ int dlasd6_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *);
+ static integer nwork1, nwork2;
+ extern /* Subroutine */ int dlasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlasdt_(integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *), dlaset_(
+ char *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *), xerbla_(char *, integer *);
+ static integer smlszp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ Using a divide and conquer approach, DLASDA computes the singular
+ value decomposition (SVD) of a real upper bidiagonal N-by-M matrix
+ B with diagonal D and offdiagonal E, where M = N + SQRE. The
+ algorithm computes the singular values in the SVD B = U * S * VT.
+ The orthogonal matrices U and VT are optionally computed in
+ compact form.
+
+ A related subroutine, DLASD0, computes the singular values and
+ the singular vectors in explicit form.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed
+ in compact form, as follows
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors of upper bidiagonal
+ matrix in compact form.
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The row dimension of the upper bidiagonal matrix. This is
+ also the dimension of the main diagonal array D.
+
+ SQRE (input) INTEGER
+ Specifies the column dimension of the bidiagonal matrix.
+ = 0: The bidiagonal matrix has column dimension M = N;
+ = 1: The bidiagonal matrix has column dimension M = N + 1.
+
+ D (input/output) DOUBLE PRECISION array, dimension ( N )
+ On entry D contains the main diagonal of the bidiagonal
+ matrix. On exit D, if INFO = 0, contains its singular values.
+
+ E (input) DOUBLE PRECISION array, dimension ( M-1 )
+ Contains the subdiagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ U (output) DOUBLE PRECISION array,
+ dimension ( LDU, SMLSIZ ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, U contains the left
+ singular vector matrices of all subproblems at the bottom
+ level.
+
+    LDU    (input) INTEGER, LDU >= N.
+ The leading dimension of arrays U, VT, DIFL, DIFR, POLES,
+ GIVNUM, and Z.
+
+ VT (output) DOUBLE PRECISION array,
+ dimension ( LDU, SMLSIZ+1 ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, VT' contains the right
+ singular vector matrices of all subproblems at the bottom
+ level.
+
+ K (output) INTEGER array,
+ dimension ( N ) if ICOMPQ = 1 and dimension 1 if ICOMPQ = 0.
+ If ICOMPQ = 1, on exit, K(I) is the dimension of the I-th
+ secular equation on the computation tree.
+
+ DIFL (output) DOUBLE PRECISION array, dimension ( LDU, NLVL ),
+           where NLVL = floor(log_2 (N/SMLSIZ)).
+
+ DIFR (output) DOUBLE PRECISION array,
+ dimension ( LDU, 2 * NLVL ) if ICOMPQ = 1 and
+ dimension ( N ) if ICOMPQ = 0.
+ If ICOMPQ = 1, on exit, DIFL(1:N, I) and DIFR(1:N, 2 * I - 1)
+ record distances between singular values on the I-th
+ level and singular values on the (I -1)-th level, and
+ DIFR(1:N, 2 * I ) contains the normalizing factors for
+ the right singular vector matrix. See DLASD8 for details.
+
+ Z (output) DOUBLE PRECISION array,
+ dimension ( LDU, NLVL ) if ICOMPQ = 1 and
+ dimension ( N ) if ICOMPQ = 0.
+ The first K elements of Z(1, I) contain the components of
+ the deflation-adjusted updating row vector for subproblems
+ on the I-th level.
+
+ POLES (output) DOUBLE PRECISION array,
+ dimension ( LDU, 2 * NLVL ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, POLES(1, 2*I - 1) and
+ POLES(1, 2*I) contain the new and old singular values
+ involved in the secular equations on the I-th level.
+
+ GIVPTR (output) INTEGER array,
+ dimension ( N ) if ICOMPQ = 1, and not referenced if
+ ICOMPQ = 0. If ICOMPQ = 1, on exit, GIVPTR( I ) records
+ the number of Givens rotations performed on the I-th
+ problem on the computation tree.
+
+ GIVCOL (output) INTEGER array,
+ dimension ( LDGCOL, 2 * NLVL ) if ICOMPQ = 1, and not
+ referenced if ICOMPQ = 0. If ICOMPQ = 1, on exit, for each I,
+ GIVCOL(1, 2 *I - 1) and GIVCOL(1, 2 *I) record the locations
+ of Givens rotations performed on the I-th level on the
+ computation tree.
+
+    LDGCOL (input) INTEGER, LDGCOL >= N.
+ The leading dimension of arrays GIVCOL and PERM.
+
+ PERM (output) INTEGER array,
+ dimension ( LDGCOL, NLVL ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, PERM(1, I) records
+ permutations done on the I-th level of the computation tree.
+
+ GIVNUM (output) DOUBLE PRECISION array,
+ dimension ( LDU, 2 * NLVL ) if ICOMPQ = 1, and not
+ referenced if ICOMPQ = 0. If ICOMPQ = 1, on exit, for each I,
+ GIVNUM(1, 2 *I - 1) and GIVNUM(1, 2 *I) record the C- and S-
+ values of Givens rotations performed on the I-th level on
+ the computation tree.
+
+ C (output) DOUBLE PRECISION array,
+ dimension ( N ) if ICOMPQ = 1, and dimension 1 if ICOMPQ = 0.
+ If ICOMPQ = 1 and the I-th subproblem is not square, on exit,
+ C( I ) contains the C-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ S (output) DOUBLE PRECISION array, dimension ( N ) if
+ ICOMPQ = 1, and dimension 1 if ICOMPQ = 0. If ICOMPQ = 1
+ and the I-th subproblem is not square, on exit, S( I )
+ contains the S-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension
+ (6 * N + (SMLSIZ + 1)*(SMLSIZ + 1)).
+
+ IWORK (workspace) INTEGER array.
+ Dimension must be at least (7 * N).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
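+
+/*
+   Informal outline of the code below: DLASDT first builds a binary tree of
+   subproblems whose leaves have row dimension at most SMLSIZ; each leaf is
+   solved directly by DLASDQ, and the tree is then swept bottom-up, with
+   DLASD6 merging each pair of children (deflation, secular equation, and the
+   compact-form bookkeeping in POLES, DIFL, DIFR, Z, GIVCOL, GIVNUM and PERM
+   when ICOMPQ = 1).
+*/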
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ givnum_dim1 = *ldu;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ poles_dim1 = *ldu;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ z_dim1 = *ldu;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ difr_dim1 = *ldu;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ difl_dim1 = *ldu;
+ difl_offset = 1 + difl_dim1;
+ difl -= difl_offset;
+ vt_dim1 = *ldu;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ --k;
+ --givptr;
+ perm_dim1 = *ldgcol;
+ perm_offset = 1 + perm_dim1;
+ perm -= perm_offset;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ --c__;
+ --s;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*smlsiz < 3) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ } else if (*ldu < *n + *sqre) {
+ *info = -8;
+ } else if (*ldgcol < *n) {
+ *info = -17;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASDA", &i__1);
+ return 0;
+ }
+
+ m = *n + *sqre;
+
+/* If the input matrix is too small, call DLASDQ to find the SVD. */
+
+ if (*n <= *smlsiz) {
+ if (*icompq == 0) {
+ dlasdq_("U", sqre, n, &c__0, &c__0, &c__0, &d__[1], &e[1], &vt[
+ vt_offset], ldu, &u[u_offset], ldu, &u[u_offset], ldu, &
+ work[1], info);
+ } else {
+ dlasdq_("U", sqre, n, &m, n, &c__0, &d__[1], &e[1], &vt[vt_offset]
+ , ldu, &u[u_offset], ldu, &u[u_offset], ldu, &work[1],
+ info);
+ }
+ return 0;
+ }
+
+/* Book-keeping and set up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+ idxq = ndimr + *n;
+ iwk = idxq + *n;
+
+ ncc = 0;
+ nru = 0;
+
+ smlszp = *smlsiz + 1;
+ vf = 1;
+ vl = vf + m;
+ nwork1 = vl + m;
+ nwork2 = nwork1 + smlszp * smlszp;
+
+ dlasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ for the nodes on bottom level of the tree, solve
+ their subproblems by DLASDQ.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nlp1 = nl + 1;
+ nr = iwork[ndimr + i1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ idxqi = idxq + nlf - 2;
+ vfi = vf + nlf - 1;
+ vli = vl + nlf - 1;
+ sqrei = 1;
+ if (*icompq == 0) {
+ dlaset_("A", &nlp1, &nlp1, &c_b29, &c_b15, &work[nwork1], &smlszp);
+ dlasdq_("U", &sqrei, &nl, &nlp1, &nru, &ncc, &d__[nlf], &e[nlf], &
+ work[nwork1], &smlszp, &work[nwork2], &nl, &work[nwork2],
+ &nl, &work[nwork2], info);
+ itemp = nwork1 + nl * smlszp;
+ dcopy_(&nlp1, &work[nwork1], &c__1, &work[vfi], &c__1);
+ dcopy_(&nlp1, &work[itemp], &c__1, &work[vli], &c__1);
+ } else {
+ dlaset_("A", &nl, &nl, &c_b29, &c_b15, &u[nlf + u_dim1], ldu);
+ dlaset_("A", &nlp1, &nlp1, &c_b29, &c_b15, &vt[nlf + vt_dim1],
+ ldu);
+ dlasdq_("U", &sqrei, &nl, &nlp1, &nl, &ncc, &d__[nlf], &e[nlf], &
+ vt[nlf + vt_dim1], ldu, &u[nlf + u_dim1], ldu, &u[nlf +
+ u_dim1], ldu, &work[nwork1], info);
+ dcopy_(&nlp1, &vt[nlf + vt_dim1], &c__1, &work[vfi], &c__1);
+ dcopy_(&nlp1, &vt[nlf + nlp1 * vt_dim1], &c__1, &work[vli], &c__1)
+ ;
+ }
+ if (*info != 0) {
+ return 0;
+ }
+ i__2 = nl;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[idxqi + j] = j;
+/* L10: */
+ }
+ if (i__ == nd && *sqre == 0) {
+ sqrei = 0;
+ } else {
+ sqrei = 1;
+ }
+ idxqi += nlp1;
+ vfi += nlp1;
+ vli += nlp1;
+ nrp1 = nr + sqrei;
+ if (*icompq == 0) {
+ dlaset_("A", &nrp1, &nrp1, &c_b29, &c_b15, &work[nwork1], &smlszp);
+ dlasdq_("U", &sqrei, &nr, &nrp1, &nru, &ncc, &d__[nrf], &e[nrf], &
+ work[nwork1], &smlszp, &work[nwork2], &nr, &work[nwork2],
+ &nr, &work[nwork2], info);
+ itemp = nwork1 + (nrp1 - 1) * smlszp;
+ dcopy_(&nrp1, &work[nwork1], &c__1, &work[vfi], &c__1);
+ dcopy_(&nrp1, &work[itemp], &c__1, &work[vli], &c__1);
+ } else {
+ dlaset_("A", &nr, &nr, &c_b29, &c_b15, &u[nrf + u_dim1], ldu);
+ dlaset_("A", &nrp1, &nrp1, &c_b29, &c_b15, &vt[nrf + vt_dim1],
+ ldu);
+ dlasdq_("U", &sqrei, &nr, &nrp1, &nr, &ncc, &d__[nrf], &e[nrf], &
+ vt[nrf + vt_dim1], ldu, &u[nrf + u_dim1], ldu, &u[nrf +
+ u_dim1], ldu, &work[nwork1], info);
+ dcopy_(&nrp1, &vt[nrf + vt_dim1], &c__1, &work[vfi], &c__1);
+ dcopy_(&nrp1, &vt[nrf + nrp1 * vt_dim1], &c__1, &work[vli], &c__1)
+ ;
+ }
+ if (*info != 0) {
+ return 0;
+ }
+ i__2 = nr;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[idxqi + j] = j;
+/* L20: */
+ }
+/* L30: */
+ }
+
+/* Now conquer each subproblem bottom-up. */
+
+ j = pow_ii(&c__2, &nlvl);
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ Find the first node LF and last node LL on
+ the current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ if (i__ == ll) {
+ sqrei = *sqre;
+ } else {
+ sqrei = 1;
+ }
+ vfi = vf + nlf - 1;
+ vli = vl + nlf - 1;
+ idxqi = idxq + nlf - 1;
+ alpha = d__[ic];
+ beta = e[ic];
+ if (*icompq == 0) {
+ dlasd6_(icompq, &nl, &nr, &sqrei, &d__[nlf], &work[vfi], &
+ work[vli], &alpha, &beta, &iwork[idxqi], &perm[
+ perm_offset], &givptr[1], &givcol[givcol_offset],
+ ldgcol, &givnum[givnum_offset], ldu, &poles[
+ poles_offset], &difl[difl_offset], &difr[difr_offset],
+ &z__[z_offset], &k[1], &c__[1], &s[1], &work[nwork1],
+ &iwork[iwk], info);
+ } else {
+ --j;
+ dlasd6_(icompq, &nl, &nr, &sqrei, &d__[nlf], &work[vfi], &
+ work[vli], &alpha, &beta, &iwork[idxqi], &perm[nlf +
+ lvl * perm_dim1], &givptr[j], &givcol[nlf + lvl2 *
+ givcol_dim1], ldgcol, &givnum[nlf + lvl2 *
+ givnum_dim1], ldu, &poles[nlf + lvl2 * poles_dim1], &
+ difl[nlf + lvl * difl_dim1], &difr[nlf + lvl2 *
+ difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[j],
+ &s[j], &work[nwork1], &iwork[iwk], info);
+ }
+ if (*info != 0) {
+ return 0;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+
+ return 0;
+
+/* End of DLASDA */
+
+} /* dlasda_ */
+
+/* Subroutine */ int dlasdq_(char *uplo, integer *sqre, integer *n, integer *
+ ncvt, integer *nru, integer *ncc, doublereal *d__, doublereal *e,
+ doublereal *vt, integer *ldvt, doublereal *u, integer *ldu,
+ doublereal *c__, integer *ldc, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal r__, cs, sn;
+ static integer np1, isub;
+ static doublereal smin;
+ static integer sqre1;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *
+ , doublereal *, integer *);
+ static integer iuplo;
+ extern /* Subroutine */ int dlartg_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *), xerbla_(char *,
+ integer *), dbdsqr_(char *, integer *, integer *, integer
+ *, integer *, doublereal *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ static logical rotate;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASDQ computes the singular value decomposition (SVD) of a real
+ (upper or lower) bidiagonal matrix with diagonal D and offdiagonal
+ E, accumulating the transformations if desired. Letting B denote
+ the input bidiagonal matrix, the algorithm computes orthogonal
+ matrices Q and P such that B = Q * S * P' (P' denotes the transpose
+ of P). The singular values S are overwritten on D.
+
+ The input matrix U is changed to U * Q if desired.
+ The input matrix VT is changed to P' * VT if desired.
+ The input matrix C is changed to Q' * C if desired.
+
+ See "Computing Small Singular Values of Bidiagonal Matrices With
+ Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+ LAPACK Working Note #3, for a detailed description of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ On entry, UPLO specifies whether the input bidiagonal matrix
+           is upper or lower bidiagonal, and whether it is square or
+           not.
+ UPLO = 'U' or 'u' B is upper bidiagonal.
+ UPLO = 'L' or 'l' B is lower bidiagonal.
+
+ SQRE (input) INTEGER
+ = 0: then the input matrix is N-by-N.
+           = 1: then the input matrix is N-by-(N+1) if UPLO = 'U' and
+                (N+1)-by-N if UPLO = 'L'.
+
+ The bidiagonal matrix has
+ N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ N (input) INTEGER
+ On entry, N specifies the number of rows and columns
+ in the matrix. N must be at least 0.
+
+ NCVT (input) INTEGER
+ On entry, NCVT specifies the number of columns of
+ the matrix VT. NCVT must be at least 0.
+
+ NRU (input) INTEGER
+ On entry, NRU specifies the number of rows of
+ the matrix U. NRU must be at least 0.
+
+ NCC (input) INTEGER
+ On entry, NCC specifies the number of columns of
+ the matrix C. NCC must be at least 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, D contains the diagonal entries of the
+ bidiagonal matrix whose SVD is desired. On normal exit,
+ D contains the singular values in ascending order.
+
+ E (input/output) DOUBLE PRECISION array.
+ dimension is (N-1) if SQRE = 0 and N if SQRE = 1.
+ On entry, the entries of E contain the offdiagonal entries
+ of the bidiagonal matrix whose SVD is desired. On normal
+ exit, E will contain 0. If the algorithm does not converge,
+ D and E will contain the diagonal and superdiagonal entries
+ of a bidiagonal matrix orthogonally equivalent to the one
+ given as input.
+
+ VT (input/output) DOUBLE PRECISION array, dimension (LDVT, NCVT)
+ On entry, contains a matrix which on exit has been
+ premultiplied by P', dimension N-by-NCVT if SQRE = 0
+ and (N+1)-by-NCVT if SQRE = 1 (not referenced if NCVT=0).
+
+ LDVT (input) INTEGER
+ On entry, LDVT specifies the leading dimension of VT as
+ declared in the calling (sub) program. LDVT must be at
+ least 1. If NCVT is nonzero LDVT must also be at least N.
+
+ U (input/output) DOUBLE PRECISION array, dimension (LDU, N)
+ On entry, contains a matrix which on exit has been
+ postmultiplied by Q, dimension NRU-by-N if SQRE = 0
+ and NRU-by-(N+1) if SQRE = 1 (not referenced if NRU=0).
+
+ LDU (input) INTEGER
+ On entry, LDU specifies the leading dimension of U as
+ declared in the calling (sub) program. LDU must be at
+ least max( 1, NRU ) .
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC, NCC)
+ On entry, contains an N-by-NCC matrix which on exit
+ has been premultiplied by Q' dimension N-by-NCC if SQRE = 0
+ and (N+1)-by-NCC if SQRE = 1 (not referenced if NCC=0).
+
+ LDC (input) INTEGER
+ On entry, LDC specifies the leading dimension of C as
+ declared in the calling (sub) program. LDC must be at
+ least 1. If NCC is nonzero, LDC must also be at least N.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (4*N)
+ Workspace. Only referenced if one of NCVT, NRU, or NCC is
+ nonzero, and if N is at least 2.
+
+ INFO (output) INTEGER
+ On exit, a value of 0 indicates a successful exit.
+ If INFO < 0, argument number -INFO is illegal.
+ If INFO > 0, the algorithm did not converge, and INFO
+ specifies how many superdiagonals did not converge.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
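+
+/*
+   Informal outline of the steps below: a non-square upper bidiagonal input
+   is first rotated (from the right) into lower bidiagonal form; a lower
+   bidiagonal matrix is then rotated (from the left) into square upper
+   bidiagonal form, with the rotations accumulated into VT, U and C as
+   requested; DBDSQR then computes the SVD of the resulting N-by-N upper
+   bidiagonal matrix, and finally the singular values are sorted into
+   ascending order with matching swaps of the vectors.
+*/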
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ iuplo = 0;
+ if (lsame_(uplo, "U")) {
+ iuplo = 1;
+ }
+ if (lsame_(uplo, "L")) {
+ iuplo = 2;
+ }
+ if (iuplo == 0) {
+ *info = -1;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ncvt < 0) {
+ *info = -4;
+ } else if (*nru < 0) {
+ *info = -5;
+ } else if (*ncc < 0) {
+ *info = -6;
+ } else if (*ncvt == 0 && *ldvt < 1 || *ncvt > 0 && *ldvt < max(1,*n)) {
+ *info = -10;
+ } else if (*ldu < max(1,*nru)) {
+ *info = -12;
+ } else if (*ncc == 0 && *ldc < 1 || *ncc > 0 && *ldc < max(1,*n)) {
+ *info = -14;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASDQ", &i__1);
+ return 0;
+ }
+ if (*n == 0) {
+ return 0;
+ }
+
+/* ROTATE is true if any singular vectors desired, false otherwise */
+
+ rotate = *ncvt > 0 || *nru > 0 || *ncc > 0;
+ np1 = *n + 1;
+ sqre1 = *sqre;
+
+/*
+ If matrix non-square upper bidiagonal, rotate to be lower
+ bidiagonal. The rotations are on the right.
+*/
+
+ if (iuplo == 1 && sqre1 == 1) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (rotate) {
+ work[i__] = cs;
+ work[*n + i__] = sn;
+ }
+/* L10: */
+ }
+ dlartg_(&d__[*n], &e[*n], &cs, &sn, &r__);
+ d__[*n] = r__;
+ e[*n] = 0.;
+ if (rotate) {
+ work[*n] = cs;
+ work[*n + *n] = sn;
+ }
+ iuplo = 2;
+ sqre1 = 0;
+
+/* Update singular vectors if desired. */
+
+ if (*ncvt > 0) {
+ dlasr_("L", "V", "F", &np1, ncvt, &work[1], &work[np1], &vt[
+ vt_offset], ldvt);
+ }
+ }
+
+/*
+ If matrix lower bidiagonal, rotate to be upper bidiagonal
+ by applying Givens rotations on the left.
+*/
+
+ if (iuplo == 2) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (rotate) {
+ work[i__] = cs;
+ work[*n + i__] = sn;
+ }
+/* L20: */
+ }
+
+/*
+ If matrix (N+1)-by-N lower bidiagonal, one additional
+ rotation is needed.
+*/
+
+ if (sqre1 == 1) {
+ dlartg_(&d__[*n], &e[*n], &cs, &sn, &r__);
+ d__[*n] = r__;
+ if (rotate) {
+ work[*n] = cs;
+ work[*n + *n] = sn;
+ }
+ }
+
+/* Update singular vectors if desired. */
+
+ if (*nru > 0) {
+ if (sqre1 == 0) {
+ dlasr_("R", "V", "F", nru, n, &work[1], &work[np1], &u[
+ u_offset], ldu);
+ } else {
+ dlasr_("R", "V", "F", nru, &np1, &work[1], &work[np1], &u[
+ u_offset], ldu);
+ }
+ }
+ if (*ncc > 0) {
+ if (sqre1 == 0) {
+ dlasr_("L", "V", "F", n, ncc, &work[1], &work[np1], &c__[
+ c_offset], ldc);
+ } else {
+ dlasr_("L", "V", "F", &np1, ncc, &work[1], &work[np1], &c__[
+ c_offset], ldc);
+ }
+ }
+ }
+
+/*
+ Call DBDSQR to compute the SVD of the reduced real
+ N-by-N upper bidiagonal matrix.
+*/
+
+ dbdsqr_("U", n, ncvt, nru, ncc, &d__[1], &e[1], &vt[vt_offset], ldvt, &u[
+ u_offset], ldu, &c__[c_offset], ldc, &work[1], info);
+
+/*
+ Sort the singular values into ascending order (insertion sort on
+ singular values, but only one transposition per singular vector)
+*/
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Scan for smallest D(I). */
+
+ isub = i__;
+ smin = d__[i__];
+ i__2 = *n;
+ for (j = i__ + 1; j <= i__2; ++j) {
+ if (d__[j] < smin) {
+ isub = j;
+ smin = d__[j];
+ }
+/* L30: */
+ }
+ if (isub != i__) {
+
+/* Swap singular values and vectors. */
+
+ d__[isub] = d__[i__];
+ d__[i__] = smin;
+ if (*ncvt > 0) {
+ dswap_(ncvt, &vt[isub + vt_dim1], ldvt, &vt[i__ + vt_dim1],
+ ldvt);
+ }
+ if (*nru > 0) {
+ dswap_(nru, &u[isub * u_dim1 + 1], &c__1, &u[i__ * u_dim1 + 1]
+ , &c__1);
+ }
+ if (*ncc > 0) {
+ dswap_(ncc, &c__[isub + c_dim1], ldc, &c__[i__ + c_dim1], ldc)
+ ;
+ }
+ }
+/* L40: */
+ }
+
+ return 0;
+
+/* End of DLASDQ */
+
+} /* dlasdq_ */
+
+/* Subroutine */ int dlasdt_(integer *n, integer *lvl, integer *nd, integer *
+ inode, integer *ndiml, integer *ndimr, integer *msub)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+
+ /* Local variables */
+ static integer i__, il, ir, maxn;
+ static doublereal temp;
+ static integer nlvl, llst, ncrnt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ DLASDT creates a tree of subproblems for bidiagonal divide and
+ conquer.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ On entry, the number of diagonal elements of the
+ bidiagonal matrix.
+
+ LVL (output) INTEGER
+ On exit, the number of levels on the computation tree.
+
+ ND (output) INTEGER
+ On exit, the number of nodes on the tree.
+
+ INODE (output) INTEGER array, dimension ( N )
+ On exit, centers of subproblems.
+
+ NDIML (output) INTEGER array, dimension ( N )
+ On exit, row dimensions of left children.
+
+ NDIMR (output) INTEGER array, dimension ( N )
+ On exit, row dimensions of right children.
+
+ MSUB (input) INTEGER
+           On entry, the maximum row dimension that each subproblem at
+           the bottom of the tree may have.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Find the number of levels on the tree.
+*/
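+
+/*
+   Informal sketch of the construction below: the root covers rows 1..N with
+   its center at N/2 + 1; each pass splits every node of the previous level
+   into a left and a right child of roughly half the size, and the number of
+   levels is taken as LVL = INT( LOG_2( MAX(1,N) / (MSUB+1) ) ) + 1, so that
+   the leaves have row dimension at most MSUB.
+*/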
+
+ /* Parameter adjustments */
+ --ndimr;
+ --ndiml;
+ --inode;
+
+ /* Function Body */
+ maxn = max(1,*n);
+ temp = log((doublereal) maxn / (doublereal) (*msub + 1)) / log(2.);
+ *lvl = (integer) temp + 1;
+
+ i__ = *n / 2;
+ inode[1] = i__ + 1;
+ ndiml[1] = i__;
+ ndimr[1] = *n - i__ - 1;
+ il = 0;
+ ir = 1;
+ llst = 1;
+ i__1 = *lvl - 1;
+ for (nlvl = 1; nlvl <= i__1; ++nlvl) {
+
+/*
+ Constructing the tree at (NLVL+1)-st level. The number of
+ nodes created on this level is LLST * 2.
+*/
+
+ i__2 = llst - 1;
+ for (i__ = 0; i__ <= i__2; ++i__) {
+ il += 2;
+ ir += 2;
+ ncrnt = llst + i__;
+ ndiml[il] = ndiml[ncrnt] / 2;
+ ndimr[il] = ndiml[ncrnt] - ndiml[il] - 1;
+ inode[il] = inode[ncrnt] - ndimr[il] - 1;
+ ndiml[ir] = ndimr[ncrnt] / 2;
+ ndimr[ir] = ndimr[ncrnt] - ndiml[ir] - 1;
+ inode[ir] = inode[ncrnt] + ndiml[ir] + 1;
+/* L10: */
+ }
+ llst <<= 1;
+/* L20: */
+ }
+ *nd = (llst << 1) - 1;
+
+ return 0;
+
+/* End of DLASDT */
+
+} /* dlasdt_ */
+
+/* Subroutine */ int dlaset_(char *uplo, integer *m, integer *n, doublereal *
+ alpha, doublereal *beta, doublereal *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASET initializes an m-by-n matrix A to BETA on the diagonal and
+ ALPHA on the offdiagonals.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be set.
+ = 'U': Upper triangular part is set; the strictly lower
+ triangular part of A is not changed.
+ = 'L': Lower triangular part is set; the strictly upper
+ triangular part of A is not changed.
+ Otherwise: All of the matrix A is set.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ ALPHA (input) DOUBLE PRECISION
+ The constant to which the offdiagonal elements are to be set.
+
+ BETA (input) DOUBLE PRECISION
+ The constant to which the diagonal elements are to be set.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On exit, the leading m-by-n submatrix of A is set as follows:
+
+ if UPLO = 'U', A(i,j) = ALPHA, 1<=i<=j-1, 1<=j<=n,
+ if UPLO = 'L', A(i,j) = ALPHA, j+1<=i<=m, 1<=j<=n,
+ otherwise, A(i,j) = ALPHA, 1<=i<=m, 1<=j<=n, i.ne.j,
+
+ and, for all UPLO, A(i,i) = BETA, 1<=i<=min(m,n).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+*/
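+
+/*
+   Usage sketch: with the constants used in the DLASDA leaf setup above
+   (c_b29 and c_b15 playing the roles of 0.0 and 1.0 there), a call such as
+
+       dlaset_("A", &m, &n, &c_b29, &c_b15, &a[a_offset], &lda);
+
+   clears an m-by-n block and puts ones on its diagonal, i.e. initializes it
+   to an identity-like matrix before DLASDQ accumulates rotations into it.
+*/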
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+
+/*
+ Set the strictly upper triangular or trapezoidal part of the
+ array to ALPHA.
+*/
+
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j - 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = *alpha;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+
+/*
+ Set the strictly lower triangular or trapezoidal part of the
+ array to ALPHA.
+*/
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = *alpha;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ } else {
+
+/* Set the leading m-by-n submatrix to ALPHA. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = *alpha;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+/* Set the first min(M,N) diagonal elements to BETA. */
+
+ i__1 = min(*m,*n);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ a[i__ + i__ * a_dim1] = *beta;
+/* L70: */
+ }
+
+ return 0;
+
+/* End of DLASET */
+
+} /* dlaset_ */
+
+/* Subroutine */ int dlasq1_(integer *n, doublereal *d__, doublereal *e,
+ doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ doublereal d__1, d__2, d__3;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal eps;
+ extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal
+ *, doublereal *, doublereal *);
+ static doublereal scale;
+ static integer iinfo;
+ static doublereal sigmn;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static doublereal sigmx;
+ extern /* Subroutine */ int dlasq2_(integer *, doublereal *, integer *);
+
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *);
+ static doublereal safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *), dlasrt_(
+ char *, integer *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ DLASQ1 computes the singular values of a real N-by-N bidiagonal
+ matrix with diagonal D and off-diagonal E. The singular values
+ are computed to high relative accuracy, in the absence of
+ denormalization, underflow and overflow. The algorithm was first
+ presented in
+
+ "Accurate singular values and differential qd algorithms" by K. V.
+ Fernando and B. N. Parlett, Numer. Math., Vol-67, No. 2, pp. 191-230,
+ 1994,
+
+ and the present implementation is described in "An implementation of
+ the dqds Algorithm (Positive Case)", LAPACK Working Note.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of rows and columns in the matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, D contains the diagonal elements of the
+ bidiagonal matrix whose SVD is desired. On normal exit,
+ D contains the singular values in decreasing order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, elements E(1:N-1) contain the off-diagonal elements
+ of the bidiagonal matrix whose SVD is desired.
+ On exit, E is overwritten.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm failed
+ = 1, a split was marked by a positive value in E
+ = 2, current block of Z not diagonalized after 30*N
+ iterations (in inner while loop)
+ = 3, termination criterion of outer while loop not met
+ (program created more than N unreduced blocks)
+
+ =====================================================================
+*/
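+
+/*
+   Informal outline of the steps below: the diagonal is replaced by its
+   absolute values and the largest magnitude SIGMX is found; D and E are
+   then copied interleaved into WORK, scaled, and squared, giving the qd
+   array
+       WORK = ( d(1)**2, e(1)**2, d(2)**2, e(2)**2, ... )
+   up to a common scale factor; DLASQ2 computes the eigenvalues of the
+   associated matrix (the scaled squared singular values), and their square
+   roots are scaled back into D.
+*/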
+
+
+ /* Parameter adjustments */
+ --work;
+ --e;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -2;
+ i__1 = -(*info);
+ xerbla_("DLASQ1", &i__1);
+ return 0;
+ } else if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+ d__[1] = abs(d__[1]);
+ return 0;
+ } else if (*n == 2) {
+ dlas2_(&d__[1], &e[1], &d__[2], &sigmn, &sigmx);
+ d__[1] = sigmx;
+ d__[2] = sigmn;
+ return 0;
+ }
+
+/* Estimate the largest singular value. */
+
+ sigmx = 0.;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = (d__1 = d__[i__], abs(d__1));
+/* Computing MAX */
+ d__2 = sigmx, d__3 = (d__1 = e[i__], abs(d__1));
+ sigmx = max(d__2,d__3);
+/* L10: */
+ }
+ d__[*n] = (d__1 = d__[*n], abs(d__1));
+
+/* Early return if SIGMX is zero (matrix is already diagonal). */
+
+ if (sigmx == 0.) {
+ dlasrt_("D", n, &d__[1], &iinfo);
+ return 0;
+ }
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__1 = sigmx, d__2 = d__[i__];
+ sigmx = max(d__1,d__2);
+/* L20: */
+ }
+
+/*
+ Copy D and E into WORK (in the Z format) and scale (squaring the
+ input data makes scaling by a power of the radix pointless).
+*/
+
+ eps = PRECISION;
+ safmin = SAFEMINIMUM;
+ scale = sqrt(eps / safmin);
+ dcopy_(n, &d__[1], &c__1, &work[1], &c__2);
+ i__1 = *n - 1;
+ dcopy_(&i__1, &e[1], &c__1, &work[2], &c__2);
+ i__1 = (*n << 1) - 1;
+ i__2 = (*n << 1) - 1;
+ dlascl_("G", &c__0, &c__0, &sigmx, &scale, &i__1, &c__1, &work[1], &i__2,
+ &iinfo);
+
+/* Compute the q's and e's. */
+
+ i__1 = (*n << 1) - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing 2nd power */
+ d__1 = work[i__];
+ work[i__] = d__1 * d__1;
+/* L30: */
+ }
+ work[*n * 2] = 0.;
+
+ dlasq2_(n, &work[1], info);
+
+ if (*info == 0) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = sqrt(work[i__]);
+/* L40: */
+ }
+ dlascl_("G", &c__0, &c__0, &scale, &sigmx, n, &c__1, &d__[1], n, &
+ iinfo);
+ }
+
+ return 0;
+
+/* End of DLASQ1 */
+
+} /* dlasq1_ */
+
+/* Subroutine */ int dlasq2_(integer *n, doublereal *z__, integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal d__, e, g;
+ static integer k;
+ static doublereal s, t;
+ static integer i0, i4, n0;
+ static doublereal dn;
+ static integer pp;
+ static doublereal dn1, dn2, dee, eps, tau, tol;
+ static integer ipn4;
+ static doublereal tol2;
+ static logical ieee;
+ static integer nbig;
+ static doublereal dmin__, emin, emax;
+ static integer kmin, ndiv, iter;
+ static doublereal qmin, temp, qmax, zmax;
+ static integer splt;
+ static doublereal dmin1, dmin2;
+ static integer nfail;
+ static doublereal desig, trace, sigma;
+ static integer iinfo, ttype;
+ extern /* Subroutine */ int dlasq3_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ integer *, integer *, integer *, logical *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *);
+
+ static doublereal deemin;
+ static integer iwhila, iwhilb;
+ static doublereal oldemn, safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+    DLASQ2 computes all the eigenvalues of the symmetric positive
+    definite tridiagonal matrix associated with the qd array Z to high
+    relative accuracy, in the absence of denormalization, underflow and
+    overflow.
+
+ To see the relation of Z to the tridiagonal matrix, let L be a
+ unit lower bidiagonal matrix with subdiagonals Z(2,4,6,,..) and
+ let U be an upper bidiagonal matrix with 1's above and diagonal
+ Z(1,3,5,,..). The tridiagonal is L*U or, if you prefer, the
+ symmetric tridiagonal to which it is similar.
+
+ Note : DLASQ2 defines a logical variable, IEEE, which is true
+ on machines which follow ieee-754 floating-point standard in their
+ handling of infinities and NaNs, and false otherwise. This variable
+ is passed to DLASQ3.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of rows and columns in the matrix. N >= 0.
+
+ Z (input/output) DOUBLE PRECISION array, dimension ( 4*N )
+ On entry Z holds the qd array. On exit, entries 1 to N hold
+ the eigenvalues in decreasing order, Z( 2*N+1 ) holds the
+ trace, and Z( 2*N+2 ) holds the sum of the eigenvalues. If
+ N > 2, then Z( 2*N+3 ) holds the iteration count, Z( 2*N+4 )
+ holds NDIVS/NIN^2, and Z( 2*N+5 ) holds the percentage of
+ shifts that failed.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if the i-th argument is a scalar and had an illegal
+ value, then INFO = -i, if the i-th argument is an
+ array and the j-entry had an illegal value, then
+ INFO = -(i*100+j)
+ > 0: the algorithm failed
+ = 1, a split was marked by a positive value in E
+ = 2, current block of Z not diagonalized after 30*N
+ iterations (in inner while loop)
+ = 3, termination criterion of outer while loop not met
+ (program created more than N unreduced blocks)
+
+ Further Details
+ ===============
+ Local Variables: I0:N0 defines a current unreduced segment of Z.
+ The shifts are accumulated in SIGMA. Iteration count is in ITER.
+ Ping-pong is controlled by PP (alternates between 0 and 1).
+
+ =====================================================================
+
+
+ Test the input arguments.
+ (in case DLASQ2 is not called by DLASQ1)
+*/
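+
+/*
+   Editor's note -- illustrative sketch only, not part of the LAPACK sources.
+   A minimal caller fills the qd array as described above (q's in
+   Z(1,3,5,...), e's in Z(2,4,...), the remainder of the 4*N workspace
+   zeroed) and asks DLASQ2 for the eigenvalues of the associated
+   tridiagonal L*U.  In 0-based C indexing the q's go in z[0], z[2], z[4]
+   and the e's in z[1], z[3]:
+
+       integer n = 3, info, k;
+       doublereal z[4 * 3];
+       for (k = 0; k < 4 * 3; ++k) z[k] = 0.;
+       z[0] = 4.;  z[2] = 2.;  z[4] = 1.;
+       z[1] = 1.;  z[3] = 1.;
+       dlasq2_(&n, z, &info);
+
+   On success info == 0 and z[0] >= z[1] >= z[2] hold the eigenvalues in
+   decreasing order, as documented above.
+*/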
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ *info = 0;
+ eps = PRECISION;
+ safmin = SAFEMINIMUM;
+ tol = eps * 100.;
+/* Computing 2nd power */
+ d__1 = tol;
+ tol2 = d__1 * d__1;
+
+ if (*n < 0) {
+ *info = -1;
+ xerbla_("DLASQ2", &c__1);
+ return 0;
+ } else if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+
+/* 1-by-1 case. */
+
+ if (z__[1] < 0.) {
+ *info = -201;
+ xerbla_("DLASQ2", &c__2);
+ }
+ return 0;
+ } else if (*n == 2) {
+
+/* 2-by-2 case. */
+
+ if (z__[2] < 0. || z__[3] < 0.) {
+ *info = -2;
+ xerbla_("DLASQ2", &c__2);
+ return 0;
+ } else if (z__[3] > z__[1]) {
+ d__ = z__[3];
+ z__[3] = z__[1];
+ z__[1] = d__;
+ }
+ z__[5] = z__[1] + z__[2] + z__[3];
+ if (z__[2] > z__[3] * tol2) {
+ t = (z__[1] - z__[3] + z__[2]) * .5;
+ s = z__[3] * (z__[2] / t);
+ if (s <= t) {
+ s = z__[3] * (z__[2] / (t * (sqrt(s / t + 1.) + 1.)));
+ } else {
+ s = z__[3] * (z__[2] / (t + sqrt(t) * sqrt(t + s)));
+ }
+ t = z__[1] + (s + z__[2]);
+ z__[3] *= z__[1] / t;
+ z__[1] = t;
+ }
+ z__[2] = z__[3];
+ z__[6] = z__[2] + z__[1];
+ return 0;
+ }
+
+/* Check for negative data and compute sums of q's and e's. */
+
+ z__[*n * 2] = 0.;
+ emin = z__[2];
+ qmax = 0.;
+ zmax = 0.;
+ d__ = 0.;
+ e = 0.;
+
+ i__1 = *n - 1 << 1;
+ for (k = 1; k <= i__1; k += 2) {
+ if (z__[k] < 0.) {
+ *info = -(k + 200);
+ xerbla_("DLASQ2", &c__2);
+ return 0;
+ } else if (z__[k + 1] < 0.) {
+ *info = -(k + 201);
+ xerbla_("DLASQ2", &c__2);
+ return 0;
+ }
+ d__ += z__[k];
+ e += z__[k + 1];
+/* Computing MAX */
+ d__1 = qmax, d__2 = z__[k];
+ qmax = max(d__1,d__2);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[k + 1];
+ emin = min(d__1,d__2);
+/* Computing MAX */
+ d__1 = max(qmax,zmax), d__2 = z__[k + 1];
+ zmax = max(d__1,d__2);
+/* L10: */
+ }
+ if (z__[(*n << 1) - 1] < 0.) {
+ *info = -((*n << 1) + 199);
+ xerbla_("DLASQ2", &c__2);
+ return 0;
+ }
+ d__ += z__[(*n << 1) - 1];
+/* Computing MAX */
+ d__1 = qmax, d__2 = z__[(*n << 1) - 1];
+ qmax = max(d__1,d__2);
+ zmax = max(qmax,zmax);
+
+/* Check for diagonality. */
+
+ if (e == 0.) {
+ i__1 = *n;
+ for (k = 2; k <= i__1; ++k) {
+ z__[k] = z__[(k << 1) - 1];
+/* L20: */
+ }
+ dlasrt_("D", n, &z__[1], &iinfo);
+ z__[(*n << 1) - 1] = d__;
+ return 0;
+ }
+
+ trace = d__ + e;
+
+/* Check for zero data. */
+
+ if (trace == 0.) {
+ z__[(*n << 1) - 1] = 0.;
+ return 0;
+ }
+
+/* Check whether the machine is IEEE conformable. */
+
+ ieee = ilaenv_(&c__10, "DLASQ2", "N", &c__1, &c__2, &c__3, &c__4, (ftnlen)
+ 6, (ftnlen)1) == 1 && ilaenv_(&c__11, "DLASQ2", "N", &c__1, &c__2,
+ &c__3, &c__4, (ftnlen)6, (ftnlen)1) == 1;
+
+/* Rearrange data for locality: Z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...). */
+
+ for (k = *n << 1; k >= 2; k += -2) {
+ z__[k * 2] = 0.;
+ z__[(k << 1) - 1] = z__[k];
+ z__[(k << 1) - 2] = 0.;
+ z__[(k << 1) - 3] = z__[k - 1];
+/* L30: */
+ }
+
+ i0 = 1;
+ n0 = *n;
+
+/* Reverse the qd-array, if warranted. */
+
+ if (z__[(i0 << 2) - 3] * 1.5 < z__[(n0 << 2) - 3]) {
+ ipn4 = i0 + n0 << 2;
+ i__1 = i0 + n0 - 1 << 1;
+ for (i4 = i0 << 2; i4 <= i__1; i4 += 4) {
+ temp = z__[i4 - 3];
+ z__[i4 - 3] = z__[ipn4 - i4 - 3];
+ z__[ipn4 - i4 - 3] = temp;
+ temp = z__[i4 - 1];
+ z__[i4 - 1] = z__[ipn4 - i4 - 5];
+ z__[ipn4 - i4 - 5] = temp;
+/* L40: */
+ }
+ }
+
+/* Initial split checking via dqd and Li's test. */
+
+ pp = 0;
+
+ for (k = 1; k <= 2; ++k) {
+
+ d__ = z__[(n0 << 2) + pp - 3];
+ i__1 = (i0 << 2) + pp;
+ for (i4 = (n0 - 1 << 2) + pp; i4 >= i__1; i4 += -4) {
+ if (z__[i4 - 1] <= tol2 * d__) {
+ z__[i4 - 1] = -0.;
+ d__ = z__[i4 - 3];
+ } else {
+ d__ = z__[i4 - 3] * (d__ / (d__ + z__[i4 - 1]));
+ }
+/* L50: */
+ }
+
+/* dqd maps Z to ZZ plus Li's test. */
+
+ emin = z__[(i0 << 2) + pp + 1];
+ d__ = z__[(i0 << 2) + pp - 3];
+ i__1 = (n0 - 1 << 2) + pp;
+ for (i4 = (i0 << 2) + pp; i4 <= i__1; i4 += 4) {
+ z__[i4 - (pp << 1) - 2] = d__ + z__[i4 - 1];
+ if (z__[i4 - 1] <= tol2 * d__) {
+ z__[i4 - 1] = -0.;
+ z__[i4 - (pp << 1) - 2] = d__;
+ z__[i4 - (pp << 1)] = 0.;
+ d__ = z__[i4 + 1];
+ } else if (safmin * z__[i4 + 1] < z__[i4 - (pp << 1) - 2] &&
+ safmin * z__[i4 - (pp << 1) - 2] < z__[i4 + 1]) {
+ temp = z__[i4 + 1] / z__[i4 - (pp << 1) - 2];
+ z__[i4 - (pp << 1)] = z__[i4 - 1] * temp;
+ d__ *= temp;
+ } else {
+ z__[i4 - (pp << 1)] = z__[i4 + 1] * (z__[i4 - 1] / z__[i4 - (
+ pp << 1) - 2]);
+ d__ = z__[i4 + 1] * (d__ / z__[i4 - (pp << 1) - 2]);
+ }
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[i4 - (pp << 1)];
+ emin = min(d__1,d__2);
+/* L60: */
+ }
+ z__[(n0 << 2) - pp - 2] = d__;
+
+/* Now find qmax. */
+
+ qmax = z__[(i0 << 2) - pp - 2];
+ i__1 = (n0 << 2) - pp - 2;
+ for (i4 = (i0 << 2) - pp + 2; i4 <= i__1; i4 += 4) {
+/* Computing MAX */
+ d__1 = qmax, d__2 = z__[i4];
+ qmax = max(d__1,d__2);
+/* L70: */
+ }
+
+/* Prepare for the next iteration on K. */
+
+ pp = 1 - pp;
+/* L80: */
+ }
+
+/* Initialise variables to pass to DLASQ3. */
+
+ ttype = 0;
+ dmin1 = 0.;
+ dmin2 = 0.;
+ dn = 0.;
+ dn1 = 0.;
+ dn2 = 0.;
+ g = 0.;
+ tau = 0.;
+
+ iter = 2;
+ nfail = 0;
+ ndiv = n0 - i0 << 1;
+
+ i__1 = *n + 1;
+ for (iwhila = 1; iwhila <= i__1; ++iwhila) {
+ if (n0 < 1) {
+ goto L170;
+ }
+
+/*
+ While array unfinished do
+
+ E(N0) holds the value of SIGMA when submatrix in I0:N0
+ splits from the rest of the array, but is negated.
+*/
+
+ desig = 0.;
+ if (n0 == *n) {
+ sigma = 0.;
+ } else {
+ sigma = -z__[(n0 << 2) - 1];
+ }
+ if (sigma < 0.) {
+ *info = 1;
+ return 0;
+ }
+
+/*
+ Find last unreduced submatrix's top index I0, find QMAX and
+ EMIN. Find Gershgorin-type bound if Q's much greater than E's.
+*/
+
+ emax = 0.;
+ if (n0 > i0) {
+ emin = (d__1 = z__[(n0 << 2) - 5], abs(d__1));
+ } else {
+ emin = 0.;
+ }
+ qmin = z__[(n0 << 2) - 3];
+ qmax = qmin;
+ for (i4 = n0 << 2; i4 >= 8; i4 += -4) {
+ if (z__[i4 - 5] <= 0.) {
+ goto L100;
+ }
+ if (qmin >= emax * 4.) {
+/* Computing MIN */
+ d__1 = qmin, d__2 = z__[i4 - 3];
+ qmin = min(d__1,d__2);
+/* Computing MAX */
+ d__1 = emax, d__2 = z__[i4 - 5];
+ emax = max(d__1,d__2);
+ }
+/* Computing MAX */
+ d__1 = qmax, d__2 = z__[i4 - 7] + z__[i4 - 5];
+ qmax = max(d__1,d__2);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[i4 - 5];
+ emin = min(d__1,d__2);
+/* L90: */
+ }
+ i4 = 4;
+
+L100:
+ i0 = i4 / 4;
+ pp = 0;
+
+ if (n0 - i0 > 1) {
+ dee = z__[(i0 << 2) - 3];
+ deemin = dee;
+ kmin = i0;
+ i__2 = (n0 << 2) - 3;
+ for (i4 = (i0 << 2) + 1; i4 <= i__2; i4 += 4) {
+ dee = z__[i4] * (dee / (dee + z__[i4 - 2]));
+ if (dee <= deemin) {
+ deemin = dee;
+ kmin = (i4 + 3) / 4;
+ }
+/* L110: */
+ }
+ if (kmin - i0 << 1 < n0 - kmin && deemin <= z__[(n0 << 2) - 3] *
+ .5) {
+ ipn4 = i0 + n0 << 2;
+ pp = 2;
+ i__2 = i0 + n0 - 1 << 1;
+ for (i4 = i0 << 2; i4 <= i__2; i4 += 4) {
+ temp = z__[i4 - 3];
+ z__[i4 - 3] = z__[ipn4 - i4 - 3];
+ z__[ipn4 - i4 - 3] = temp;
+ temp = z__[i4 - 2];
+ z__[i4 - 2] = z__[ipn4 - i4 - 2];
+ z__[ipn4 - i4 - 2] = temp;
+ temp = z__[i4 - 1];
+ z__[i4 - 1] = z__[ipn4 - i4 - 5];
+ z__[ipn4 - i4 - 5] = temp;
+ temp = z__[i4];
+ z__[i4] = z__[ipn4 - i4 - 4];
+ z__[ipn4 - i4 - 4] = temp;
+/* L120: */
+ }
+ }
+ }
+
+/*
+ Put -(initial shift) into DMIN.
+
+ Computing MAX
+*/
+ d__1 = 0., d__2 = qmin - sqrt(qmin) * 2. * sqrt(emax);
+ dmin__ = -max(d__1,d__2);
+
+/*
+ Now I0:N0 is unreduced.
+ PP = 0 for ping, PP = 1 for pong.
+   PP = 2 indicates that flipping was applied to the Z array
+   and that the tests for deflation upon entry in DLASQ3
+ should not be performed.
+*/
+
+ nbig = (n0 - i0 + 1) * 30;
+ i__2 = nbig;
+ for (iwhilb = 1; iwhilb <= i__2; ++iwhilb) {
+ if (i0 > n0) {
+ goto L150;
+ }
+
+/* While submatrix unfinished take a good dqds step. */
+
+ dlasq3_(&i0, &n0, &z__[1], &pp, &dmin__, &sigma, &desig, &qmax, &
+ nfail, &iter, &ndiv, &ieee, &ttype, &dmin1, &dmin2, &dn, &
+ dn1, &dn2, &g, &tau);
+
+ pp = 1 - pp;
+
+/* When EMIN is very small check for splits. */
+
+ if (pp == 0 && n0 - i0 >= 3) {
+ if (z__[n0 * 4] <= tol2 * qmax || z__[(n0 << 2) - 1] <= tol2 *
+ sigma) {
+ splt = i0 - 1;
+ qmax = z__[(i0 << 2) - 3];
+ emin = z__[(i0 << 2) - 1];
+ oldemn = z__[i0 * 4];
+ i__3 = n0 - 3 << 2;
+ for (i4 = i0 << 2; i4 <= i__3; i4 += 4) {
+ if (z__[i4] <= tol2 * z__[i4 - 3] || z__[i4 - 1] <=
+ tol2 * sigma) {
+ z__[i4 - 1] = -sigma;
+ splt = i4 / 4;
+ qmax = 0.;
+ emin = z__[i4 + 3];
+ oldemn = z__[i4 + 4];
+ } else {
+/* Computing MAX */
+ d__1 = qmax, d__2 = z__[i4 + 1];
+ qmax = max(d__1,d__2);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[i4 - 1];
+ emin = min(d__1,d__2);
+/* Computing MIN */
+ d__1 = oldemn, d__2 = z__[i4];
+ oldemn = min(d__1,d__2);
+ }
+/* L130: */
+ }
+ z__[(n0 << 2) - 1] = emin;
+ z__[n0 * 4] = oldemn;
+ i0 = splt + 1;
+ }
+ }
+
+/* L140: */
+ }
+
+ *info = 2;
+ return 0;
+
+/* end IWHILB */
+
+L150:
+
+/* L160: */
+ ;
+ }
+
+ *info = 3;
+ return 0;
+
+/* end IWHILA */
+
+L170:
+
+/* Move q's to the front. */
+
+ i__1 = *n;
+ for (k = 2; k <= i__1; ++k) {
+ z__[k] = z__[(k << 2) - 3];
+/* L180: */
+ }
+
+/* Sort and compute sum of eigenvalues. */
+
+ dlasrt_("D", n, &z__[1], &iinfo);
+
+ e = 0.;
+ for (k = *n; k >= 1; --k) {
+ e += z__[k];
+/* L190: */
+ }
+
+/* Store trace, sum(eigenvalues) and information on performance. */
+
+ z__[(*n << 1) + 1] = trace;
+ z__[(*n << 1) + 2] = e;
+ z__[(*n << 1) + 3] = (doublereal) iter;
+/* Computing 2nd power */
+ i__1 = *n;
+ z__[(*n << 1) + 4] = (doublereal) ndiv / (doublereal) (i__1 * i__1);
+ z__[(*n << 1) + 5] = nfail * 100. / (doublereal) iter;
+ return 0;
+
+/* End of DLASQ2 */
+
+} /* dlasq2_ */
+
+/* Subroutine */ int dlasq3_(integer *i0, integer *n0, doublereal *z__,
+ integer *pp, doublereal *dmin__, doublereal *sigma, doublereal *desig,
+ doublereal *qmax, integer *nfail, integer *iter, integer *ndiv,
+ logical *ieee, integer *ttype, doublereal *dmin1, doublereal *dmin2,
+ doublereal *dn, doublereal *dn1, doublereal *dn2, doublereal *g,
+ doublereal *tau)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal s, t;
+ static integer j4, nn;
+ static doublereal eps, tol;
+ static integer n0in, ipn4;
+ static doublereal tol2, temp;
+ extern /* Subroutine */ int dlasq4_(integer *, integer *, doublereal *,
+ integer *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ doublereal *), dlasq5_(integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, logical *), dlasq6_(
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *);
+
+ extern logical disnan_(doublereal *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ DLASQ3 checks for deflation, computes a shift (TAU) and calls dqds.
+ In case of failure it changes shifts, and tries again until output
+ is positive.
+
+ Arguments
+ =========
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input/output) INTEGER
+ Last index.
+
+ Z (input) DOUBLE PRECISION array, dimension ( 4*N )
+ Z holds the qd array.
+
+ PP (input/output) INTEGER
+ PP=0 for ping, PP=1 for pong.
+ PP=2 indicates that flipping was applied to the Z array
+ and that the initial tests for deflation should not be
+ performed.
+
+ DMIN (output) DOUBLE PRECISION
+ Minimum value of d.
+
+ SIGMA (output) DOUBLE PRECISION
+ Sum of shifts used in current segment.
+
+ DESIG (input/output) DOUBLE PRECISION
+ Lower order part of SIGMA
+
+ QMAX (input) DOUBLE PRECISION
+ Maximum value of q.
+
+ NFAIL (output) INTEGER
+ Number of times shift was too big.
+
+ ITER (output) INTEGER
+ Number of iterations.
+
+ NDIV (output) INTEGER
+ Number of divisions.
+
+ IEEE (input) LOGICAL
+ Flag for IEEE or non IEEE arithmetic (passed to DLASQ5).
+
+ TTYPE (input/output) INTEGER
+ Shift type.
+
+ DMIN1 (input/output) DOUBLE PRECISION
+
+ DMIN2 (input/output) DOUBLE PRECISION
+
+ DN (input/output) DOUBLE PRECISION
+
+ DN1 (input/output) DOUBLE PRECISION
+
+ DN2 (input/output) DOUBLE PRECISION
+
+ G (input/output) DOUBLE PRECISION
+
+ TAU (input/output) DOUBLE PRECISION
+
+ These are passed as arguments in order to save their values
+ between calls to DLASQ3.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ n0in = *n0;
+ eps = PRECISION;
+ tol = eps * 100.;
+/* Computing 2nd power */
+ d__1 = tol;
+ tol2 = d__1 * d__1;
+
+/* Check for deflation. */
+
+L10:
+
+ if (*n0 < *i0) {
+ return 0;
+ }
+ if (*n0 == *i0) {
+ goto L20;
+ }
+ nn = (*n0 << 2) + *pp;
+ if (*n0 == *i0 + 1) {
+ goto L40;
+ }
+
+/* Check whether E(N0-1) is negligible, 1 eigenvalue. */
+
+ if (z__[nn - 5] > tol2 * (*sigma + z__[nn - 3]) && z__[nn - (*pp << 1) -
+ 4] > tol2 * z__[nn - 7]) {
+ goto L30;
+ }
+
+L20:
+
+ z__[(*n0 << 2) - 3] = z__[(*n0 << 2) + *pp - 3] + *sigma;
+ --(*n0);
+ goto L10;
+
+/* Check whether E(N0-2) is negligible, 2 eigenvalues. */
+
+L30:
+
+ if (z__[nn - 9] > tol2 * *sigma && z__[nn - (*pp << 1) - 8] > tol2 * z__[
+ nn - 11]) {
+ goto L50;
+ }
+
+L40:
+
+ if (z__[nn - 3] > z__[nn - 7]) {
+ s = z__[nn - 3];
+ z__[nn - 3] = z__[nn - 7];
+ z__[nn - 7] = s;
+ }
+ if (z__[nn - 5] > z__[nn - 3] * tol2) {
+ t = (z__[nn - 7] - z__[nn - 3] + z__[nn - 5]) * .5;
+ s = z__[nn - 3] * (z__[nn - 5] / t);
+ if (s <= t) {
+ s = z__[nn - 3] * (z__[nn - 5] / (t * (sqrt(s / t + 1.) + 1.)));
+ } else {
+ s = z__[nn - 3] * (z__[nn - 5] / (t + sqrt(t) * sqrt(t + s)));
+ }
+ t = z__[nn - 7] + (s + z__[nn - 5]);
+ z__[nn - 3] *= z__[nn - 7] / t;
+ z__[nn - 7] = t;
+ }
+ z__[(*n0 << 2) - 7] = z__[nn - 7] + *sigma;
+ z__[(*n0 << 2) - 3] = z__[nn - 3] + *sigma;
+ *n0 += -2;
+ goto L10;
+
+L50:
+ if (*pp == 2) {
+ *pp = 0;
+ }
+
+/* Reverse the qd-array, if warranted. */
+
+ if (*dmin__ <= 0. || *n0 < n0in) {
+ if (z__[(*i0 << 2) + *pp - 3] * 1.5 < z__[(*n0 << 2) + *pp - 3]) {
+ ipn4 = *i0 + *n0 << 2;
+ i__1 = *i0 + *n0 - 1 << 1;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ temp = z__[j4 - 3];
+ z__[j4 - 3] = z__[ipn4 - j4 - 3];
+ z__[ipn4 - j4 - 3] = temp;
+ temp = z__[j4 - 2];
+ z__[j4 - 2] = z__[ipn4 - j4 - 2];
+ z__[ipn4 - j4 - 2] = temp;
+ temp = z__[j4 - 1];
+ z__[j4 - 1] = z__[ipn4 - j4 - 5];
+ z__[ipn4 - j4 - 5] = temp;
+ temp = z__[j4];
+ z__[j4] = z__[ipn4 - j4 - 4];
+ z__[ipn4 - j4 - 4] = temp;
+/* L60: */
+ }
+ if (*n0 - *i0 <= 4) {
+ z__[(*n0 << 2) + *pp - 1] = z__[(*i0 << 2) + *pp - 1];
+ z__[(*n0 << 2) - *pp] = z__[(*i0 << 2) - *pp];
+ }
+/* Computing MIN */
+ d__1 = *dmin2, d__2 = z__[(*n0 << 2) + *pp - 1];
+ *dmin2 = min(d__1,d__2);
+/* Computing MIN */
+ d__1 = z__[(*n0 << 2) + *pp - 1], d__2 = z__[(*i0 << 2) + *pp - 1]
+ , d__1 = min(d__1,d__2), d__2 = z__[(*i0 << 2) + *pp + 3];
+ z__[(*n0 << 2) + *pp - 1] = min(d__1,d__2);
+/* Computing MIN */
+ d__1 = z__[(*n0 << 2) - *pp], d__2 = z__[(*i0 << 2) - *pp], d__1 =
+ min(d__1,d__2), d__2 = z__[(*i0 << 2) - *pp + 4];
+ z__[(*n0 << 2) - *pp] = min(d__1,d__2);
+/* Computing MAX */
+ d__1 = *qmax, d__2 = z__[(*i0 << 2) + *pp - 3], d__1 = max(d__1,
+ d__2), d__2 = z__[(*i0 << 2) + *pp + 1];
+ *qmax = max(d__1,d__2);
+ *dmin__ = -0.;
+ }
+ }
+
+/* Choose a shift. */
+
+ dlasq4_(i0, n0, &z__[1], pp, &n0in, dmin__, dmin1, dmin2, dn, dn1, dn2,
+ tau, ttype, g);
+
+/* Call dqds until DMIN > 0. */
+
+L70:
+
+ dlasq5_(i0, n0, &z__[1], pp, tau, dmin__, dmin1, dmin2, dn, dn1, dn2,
+ ieee);
+
+ *ndiv += *n0 - *i0 + 2;
+ ++(*iter);
+
+/* Check status. */
+
+ if (*dmin__ >= 0. && *dmin1 > 0.) {
+
+/* Success. */
+
+ goto L90;
+
+ } else if (*dmin__ < 0. && *dmin1 > 0. && z__[(*n0 - 1 << 2) - *pp] < tol
+ * (*sigma + *dn1) && abs(*dn) < tol * *sigma) {
+
+/* Convergence hidden by negative DN. */
+
+ z__[(*n0 - 1 << 2) - *pp + 2] = 0.;
+ *dmin__ = 0.;
+ goto L90;
+ } else if (*dmin__ < 0.) {
+
+/* TAU too big. Select new TAU and try again. */
+
+ ++(*nfail);
+ if (*ttype < -22) {
+
+/* Failed twice. Play it safe. */
+
+ *tau = 0.;
+ } else if (*dmin1 > 0.) {
+
+/* Late failure. Gives excellent shift. */
+
+ *tau = (*tau + *dmin__) * (1. - eps * 2.);
+ *ttype += -11;
+ } else {
+
+/* Early failure. Divide by 4. */
+
+ *tau *= .25;
+ *ttype += -12;
+ }
+ goto L70;
+ } else if (disnan_(dmin__)) {
+
+/* NaN. */
+
+ if (*tau == 0.) {
+ goto L80;
+ } else {
+ *tau = 0.;
+ goto L70;
+ }
+ } else {
+
+/* Possible underflow. Play it safe. */
+
+ goto L80;
+ }
+
+/* Risk of underflow. */
+
+L80:
+ dlasq6_(i0, n0, &z__[1], pp, dmin__, dmin1, dmin2, dn, dn1, dn2);
+ *ndiv += *n0 - *i0 + 2;
+ ++(*iter);
+ *tau = 0.;
+
+L90:
+ if (*tau < *sigma) {
+ *desig += *tau;
+ t = *sigma + *desig;
+ *desig -= t - *sigma;
+ } else {
+ t = *sigma + *tau;
+ *desig = *sigma - (t - *tau) + *desig;
+ }
+ *sigma = t;
+
+ return 0;
+
+/* End of DLASQ3 */
+
+} /* dlasq3_ */
+
+/* Subroutine */ int dlasq4_(integer *i0, integer *n0, doublereal *z__,
+ integer *pp, integer *n0in, doublereal *dmin__, doublereal *dmin1,
+ doublereal *dmin2, doublereal *dn, doublereal *dn1, doublereal *dn2,
+ doublereal *tau, integer *ttype, doublereal *g)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal s, a2, b1, b2;
+ static integer i4, nn, np;
+ static doublereal gam, gap1, gap2;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ DLASQ4 computes an approximation TAU to the smallest eigenvalue
+ using values of d from the previous transform.
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input) INTEGER
+ Last index.
+
+ Z (input) DOUBLE PRECISION array, dimension ( 4*N )
+ Z holds the qd array.
+
+ PP (input) INTEGER
+ PP=0 for ping, PP=1 for pong.
+
+    N0IN  (input) INTEGER
+ The value of N0 at start of EIGTEST.
+
+ DMIN (input) DOUBLE PRECISION
+ Minimum value of d.
+
+ DMIN1 (input) DOUBLE PRECISION
+ Minimum value of d, excluding D( N0 ).
+
+ DMIN2 (input) DOUBLE PRECISION
+ Minimum value of d, excluding D( N0 ) and D( N0-1 ).
+
+ DN (input) DOUBLE PRECISION
+ d(N)
+
+ DN1 (input) DOUBLE PRECISION
+ d(N-1)
+
+ DN2 (input) DOUBLE PRECISION
+ d(N-2)
+
+ TAU (output) DOUBLE PRECISION
+ This is the shift.
+
+ TTYPE (output) INTEGER
+ Shift type.
+
+    G     (input/output) DOUBLE PRECISION
+ G is passed as an argument in order to save its value between
+ calls to DLASQ4.
+
+ Further Details
+ ===============
+ CNST1 = 9/16
+
+ =====================================================================
+
+
+    A negative DMIN forces the shift to take that absolute value.
+    TTYPE records the type of shift.
+*/
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ if (*dmin__ <= 0.) {
+ *tau = -(*dmin__);
+ *ttype = -1;
+ return 0;
+ }
+
+ nn = (*n0 << 2) + *pp;
+ if (*n0in == *n0) {
+
+/* No eigenvalues deflated. */
+
+ if (*dmin__ == *dn || *dmin__ == *dn1) {
+
+ b1 = sqrt(z__[nn - 3]) * sqrt(z__[nn - 5]);
+ b2 = sqrt(z__[nn - 7]) * sqrt(z__[nn - 9]);
+ a2 = z__[nn - 7] + z__[nn - 5];
+
+/* Cases 2 and 3. */
+
+ if (*dmin__ == *dn && *dmin1 == *dn1) {
+ gap2 = *dmin2 - a2 - *dmin2 * .25;
+ if (gap2 > 0. && gap2 > b2) {
+ gap1 = a2 - *dn - b2 / gap2 * b2;
+ } else {
+ gap1 = a2 - *dn - (b1 + b2);
+ }
+ if (gap1 > 0. && gap1 > b1) {
+/* Computing MAX */
+ d__1 = *dn - b1 / gap1 * b1, d__2 = *dmin__ * .5;
+ s = max(d__1,d__2);
+ *ttype = -2;
+ } else {
+ s = 0.;
+ if (*dn > b1) {
+ s = *dn - b1;
+ }
+ if (a2 > b1 + b2) {
+/* Computing MIN */
+ d__1 = s, d__2 = a2 - (b1 + b2);
+ s = min(d__1,d__2);
+ }
+/* Computing MAX */
+ d__1 = s, d__2 = *dmin__ * .333;
+ s = max(d__1,d__2);
+ *ttype = -3;
+ }
+ } else {
+
+/* Case 4. */
+
+ *ttype = -4;
+ s = *dmin__ * .25;
+ if (*dmin__ == *dn) {
+ gam = *dn;
+ a2 = 0.;
+ if (z__[nn - 5] > z__[nn - 7]) {
+ return 0;
+ }
+ b2 = z__[nn - 5] / z__[nn - 7];
+ np = nn - 9;
+ } else {
+ np = nn - (*pp << 1);
+ b2 = z__[np - 2];
+ gam = *dn1;
+ if (z__[np - 4] > z__[np - 2]) {
+ return 0;
+ }
+ a2 = z__[np - 4] / z__[np - 2];
+ if (z__[nn - 9] > z__[nn - 11]) {
+ return 0;
+ }
+ b2 = z__[nn - 9] / z__[nn - 11];
+ np = nn - 13;
+ }
+
+/* Approximate contribution to norm squared from I < NN-1. */
+
+ a2 += b2;
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = np; i4 >= i__1; i4 += -4) {
+ if (b2 == 0.) {
+ goto L20;
+ }
+ b1 = b2;
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b2 *= z__[i4] / z__[i4 - 2];
+ a2 += b2;
+ if (max(b2,b1) * 100. < a2 || .563 < a2) {
+ goto L20;
+ }
+/* L10: */
+ }
+L20:
+ a2 *= 1.05;
+
+/* Rayleigh quotient residual bound. */
+
+ if (a2 < .563) {
+ s = gam * (1. - sqrt(a2)) / (a2 + 1.);
+ }
+ }
+ } else if (*dmin__ == *dn2) {
+
+/* Case 5. */
+
+ *ttype = -5;
+ s = *dmin__ * .25;
+
+/* Compute contribution to norm squared from I > NN-2. */
+
+ np = nn - (*pp << 1);
+ b1 = z__[np - 2];
+ b2 = z__[np - 6];
+ gam = *dn2;
+ if (z__[np - 8] > b2 || z__[np - 4] > b1) {
+ return 0;
+ }
+ a2 = z__[np - 8] / b2 * (z__[np - 4] / b1 + 1.);
+
+/* Approximate contribution to norm squared from I < NN-2. */
+
+ if (*n0 - *i0 > 2) {
+ b2 = z__[nn - 13] / z__[nn - 15];
+ a2 += b2;
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = nn - 17; i4 >= i__1; i4 += -4) {
+ if (b2 == 0.) {
+ goto L40;
+ }
+ b1 = b2;
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b2 *= z__[i4] / z__[i4 - 2];
+ a2 += b2;
+ if (max(b2,b1) * 100. < a2 || .563 < a2) {
+ goto L40;
+ }
+/* L30: */
+ }
+L40:
+ a2 *= 1.05;
+ }
+
+ if (a2 < .563) {
+ s = gam * (1. - sqrt(a2)) / (a2 + 1.);
+ }
+ } else {
+
+/* Case 6, no information to guide us. */
+
+ if (*ttype == -6) {
+ *g += (1. - *g) * .333;
+ } else if (*ttype == -18) {
+ *g = .083250000000000005;
+ } else {
+ *g = .25;
+ }
+ s = *g * *dmin__;
+ *ttype = -6;
+ }
+
+ } else if (*n0in == *n0 + 1) {
+
+/* One eigenvalue just deflated. Use DMIN1, DN1 for DMIN and DN. */
+
+ if (*dmin1 == *dn1 && *dmin2 == *dn2) {
+
+/* Cases 7 and 8. */
+
+ *ttype = -7;
+ s = *dmin1 * .333;
+ if (z__[nn - 5] > z__[nn - 7]) {
+ return 0;
+ }
+ b1 = z__[nn - 5] / z__[nn - 7];
+ b2 = b1;
+ if (b2 == 0.) {
+ goto L60;
+ }
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) {
+ a2 = b1;
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b1 *= z__[i4] / z__[i4 - 2];
+ b2 += b1;
+ if (max(b1,a2) * 100. < b2) {
+ goto L60;
+ }
+/* L50: */
+ }
+L60:
+ b2 = sqrt(b2 * 1.05);
+/* Computing 2nd power */
+ d__1 = b2;
+ a2 = *dmin1 / (d__1 * d__1 + 1.);
+ gap2 = *dmin2 * .5 - a2;
+ if (gap2 > 0. && gap2 > b2 * a2) {
+/* Computing MAX */
+ d__1 = s, d__2 = a2 * (1. - a2 * 1.01 * (b2 / gap2) * b2);
+ s = max(d__1,d__2);
+ } else {
+/* Computing MAX */
+ d__1 = s, d__2 = a2 * (1. - b2 * 1.01);
+ s = max(d__1,d__2);
+ *ttype = -8;
+ }
+ } else {
+
+/* Case 9. */
+
+ s = *dmin1 * .25;
+ if (*dmin1 == *dn1) {
+ s = *dmin1 * .5;
+ }
+ *ttype = -9;
+ }
+
+ } else if (*n0in == *n0 + 2) {
+
+/*
+ Two eigenvalues deflated. Use DMIN2, DN2 for DMIN and DN.
+
+ Cases 10 and 11.
+*/
+
+ if (*dmin2 == *dn2 && z__[nn - 5] * 2. < z__[nn - 7]) {
+ *ttype = -10;
+ s = *dmin2 * .333;
+ if (z__[nn - 5] > z__[nn - 7]) {
+ return 0;
+ }
+ b1 = z__[nn - 5] / z__[nn - 7];
+ b2 = b1;
+ if (b2 == 0.) {
+ goto L80;
+ }
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) {
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b1 *= z__[i4] / z__[i4 - 2];
+ b2 += b1;
+ if (b1 * 100. < b2) {
+ goto L80;
+ }
+/* L70: */
+ }
+L80:
+ b2 = sqrt(b2 * 1.05);
+/* Computing 2nd power */
+ d__1 = b2;
+ a2 = *dmin2 / (d__1 * d__1 + 1.);
+ gap2 = z__[nn - 7] + z__[nn - 9] - sqrt(z__[nn - 11]) * sqrt(z__[
+ nn - 9]) - a2;
+ if (gap2 > 0. && gap2 > b2 * a2) {
+/* Computing MAX */
+ d__1 = s, d__2 = a2 * (1. - a2 * 1.01 * (b2 / gap2) * b2);
+ s = max(d__1,d__2);
+ } else {
+/* Computing MAX */
+ d__1 = s, d__2 = a2 * (1. - b2 * 1.01);
+ s = max(d__1,d__2);
+ }
+ } else {
+ s = *dmin2 * .25;
+ *ttype = -11;
+ }
+ } else if (*n0in > *n0 + 2) {
+
+/* Case 12, more than two eigenvalues deflated. No information. */
+
+ s = 0.;
+ *ttype = -12;
+ }
+
+ *tau = s;
+ return 0;
+
+/* End of DLASQ4 */
+
+} /* dlasq4_ */
+
+/* Subroutine */ int dlasq5_(integer *i0, integer *n0, doublereal *z__,
+ integer *pp, doublereal *tau, doublereal *dmin__, doublereal *dmin1,
+ doublereal *dmin2, doublereal *dn, doublereal *dnm1, doublereal *dnm2,
+ logical *ieee)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal d__;
+ static integer j4, j4p2;
+ static doublereal emin, temp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+    DLASQ5 computes one dqds transform in ping-pong form, one
+    version for IEEE machines and another for non IEEE machines.
+
+ Arguments
+ =========
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input) INTEGER
+ Last index.
+
+ Z (input) DOUBLE PRECISION array, dimension ( 4*N )
+ Z holds the qd array. EMIN is stored in Z(4*N0) to avoid
+ an extra argument.
+
+ PP (input) INTEGER
+ PP=0 for ping, PP=1 for pong.
+
+ TAU (input) DOUBLE PRECISION
+ This is the shift.
+
+ DMIN (output) DOUBLE PRECISION
+ Minimum value of d.
+
+ DMIN1 (output) DOUBLE PRECISION
+ Minimum value of d, excluding D( N0 ).
+
+ DMIN2 (output) DOUBLE PRECISION
+ Minimum value of d, excluding D( N0 ) and D( N0-1 ).
+
+ DN (output) DOUBLE PRECISION
+ d(N0), the last value of d.
+
+ DNM1 (output) DOUBLE PRECISION
+ d(N0-1).
+
+ DNM2 (output) DOUBLE PRECISION
+ d(N0-2).
+
+ IEEE (input) LOGICAL
+ Flag for IEEE or non IEEE arithmetic.
+
+ =====================================================================
+*/
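+
+/*
+   Editor's note -- illustrative only, not part of the LAPACK sources.
+   Writing qhat(k) = d(k) + e(k), the loops below carry out the shifted
+   dqds step
+
+       ehat(k) = e(k) * ( q(k+1) / qhat(k) )
+       d(k+1)  = d(k) * ( q(k+1) / qhat(k) ) - TAU
+
+   in the ping-pong storage scheme, tracking DMIN and EMIN as they go;
+   the non-IEEE branch additionally returns early as soon as some d(k)
+   becomes negative.
+*/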
+
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ if (*n0 - *i0 - 1 <= 0) {
+ return 0;
+ }
+
+ j4 = (*i0 << 2) + *pp - 3;
+ emin = z__[j4 + 4];
+ d__ = z__[j4] - *tau;
+ *dmin__ = d__;
+ *dmin1 = -z__[j4];
+
+ if (*ieee) {
+
+/* Code for IEEE arithmetic. */
+
+ if (*pp == 0) {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 2] = d__ + z__[j4 - 1];
+ temp = z__[j4 + 1] / z__[j4 - 2];
+ d__ = d__ * temp - *tau;
+ *dmin__ = min(*dmin__,d__);
+ z__[j4] = z__[j4 - 1] * temp;
+/* Computing MIN */
+ d__1 = z__[j4];
+ emin = min(d__1,emin);
+/* L10: */
+ }
+ } else {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 3] = d__ + z__[j4];
+ temp = z__[j4 + 2] / z__[j4 - 3];
+ d__ = d__ * temp - *tau;
+ *dmin__ = min(*dmin__,d__);
+ z__[j4 - 1] = z__[j4] * temp;
+/* Computing MIN */
+ d__1 = z__[j4 - 1];
+ emin = min(d__1,emin);
+/* L20: */
+ }
+ }
+
+/* Unroll last two steps. */
+
+ *dnm2 = d__;
+ *dmin2 = *dmin__;
+ j4 = (*n0 - 2 << 2) - *pp;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm2 + z__[j4p2];
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau;
+ *dmin__ = min(*dmin__,*dnm1);
+
+ *dmin1 = *dmin__;
+ j4 += 4;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm1 + z__[j4p2];
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau;
+ *dmin__ = min(*dmin__,*dn);
+
+ } else {
+
+/* Code for non IEEE arithmetic. */
+
+ if (*pp == 0) {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 2] = d__ + z__[j4 - 1];
+ if (d__ < 0.) {
+ return 0;
+ } else {
+ z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]);
+ d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]) - *tau;
+ }
+ *dmin__ = min(*dmin__,d__);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[j4];
+ emin = min(d__1,d__2);
+/* L30: */
+ }
+ } else {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 3] = d__ + z__[j4];
+ if (d__ < 0.) {
+ return 0;
+ } else {
+ z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]);
+ d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]) - *tau;
+ }
+ *dmin__ = min(*dmin__,d__);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[j4 - 1];
+ emin = min(d__1,d__2);
+/* L40: */
+ }
+ }
+
+/* Unroll last two steps. */
+
+ *dnm2 = d__;
+ *dmin2 = *dmin__;
+ j4 = (*n0 - 2 << 2) - *pp;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm2 + z__[j4p2];
+ if (*dnm2 < 0.) {
+ return 0;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau;
+ }
+ *dmin__ = min(*dmin__,*dnm1);
+
+ *dmin1 = *dmin__;
+ j4 += 4;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm1 + z__[j4p2];
+ if (*dnm1 < 0.) {
+ return 0;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau;
+ }
+ *dmin__ = min(*dmin__,*dn);
+
+ }
+
+ z__[j4 + 2] = *dn;
+ z__[(*n0 << 2) - *pp] = emin;
+ return 0;
+
+/* End of DLASQ5 */
+
+} /* dlasq5_ */
+
+/* Subroutine */ int dlasq6_(integer *i0, integer *n0, doublereal *z__,
+ integer *pp, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2,
+ doublereal *dn, doublereal *dnm1, doublereal *dnm2)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal d__;
+ static integer j4, j4p2;
+ static doublereal emin, temp;
+
+ static doublereal safmin;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ DLASQ6 computes one dqd (shift equal to zero) transform in
+ ping-pong form, with protection against underflow and overflow.
+
+ Arguments
+ =========
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input) INTEGER
+ Last index.
+
+ Z (input) DOUBLE PRECISION array, dimension ( 4*N )
+ Z holds the qd array. EMIN is stored in Z(4*N0) to avoid
+ an extra argument.
+
+ PP (input) INTEGER
+ PP=0 for ping, PP=1 for pong.
+
+ DMIN (output) DOUBLE PRECISION
+ Minimum value of d.
+
+ DMIN1 (output) DOUBLE PRECISION
+ Minimum value of d, excluding D( N0 ).
+
+ DMIN2 (output) DOUBLE PRECISION
+ Minimum value of d, excluding D( N0 ) and D( N0-1 ).
+
+ DN (output) DOUBLE PRECISION
+ d(N0), the last value of d.
+
+ DNM1 (output) DOUBLE PRECISION
+ d(N0-1).
+
+ DNM2 (output) DOUBLE PRECISION
+ d(N0-2).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ if (*n0 - *i0 - 1 <= 0) {
+ return 0;
+ }
+
+ safmin = SAFEMINIMUM;
+ j4 = (*i0 << 2) + *pp - 3;
+ emin = z__[j4 + 4];
+ d__ = z__[j4];
+ *dmin__ = d__;
+
+ if (*pp == 0) {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 2] = d__ + z__[j4 - 1];
+ if (z__[j4 - 2] == 0.) {
+ z__[j4] = 0.;
+ d__ = z__[j4 + 1];
+ *dmin__ = d__;
+ emin = 0.;
+ } else if (safmin * z__[j4 + 1] < z__[j4 - 2] && safmin * z__[j4
+ - 2] < z__[j4 + 1]) {
+ temp = z__[j4 + 1] / z__[j4 - 2];
+ z__[j4] = z__[j4 - 1] * temp;
+ d__ *= temp;
+ } else {
+ z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]);
+ d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]);
+ }
+ *dmin__ = min(*dmin__,d__);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[j4];
+ emin = min(d__1,d__2);
+/* L10: */
+ }
+ } else {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 3] = d__ + z__[j4];
+ if (z__[j4 - 3] == 0.) {
+ z__[j4 - 1] = 0.;
+ d__ = z__[j4 + 2];
+ *dmin__ = d__;
+ emin = 0.;
+ } else if (safmin * z__[j4 + 2] < z__[j4 - 3] && safmin * z__[j4
+ - 3] < z__[j4 + 2]) {
+ temp = z__[j4 + 2] / z__[j4 - 3];
+ z__[j4 - 1] = z__[j4] * temp;
+ d__ *= temp;
+ } else {
+ z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]);
+ d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]);
+ }
+ *dmin__ = min(*dmin__,d__);
+/* Computing MIN */
+ d__1 = emin, d__2 = z__[j4 - 1];
+ emin = min(d__1,d__2);
+/* L20: */
+ }
+ }
+
+/* Unroll last two steps. */
+
+ *dnm2 = d__;
+ *dmin2 = *dmin__;
+ j4 = (*n0 - 2 << 2) - *pp;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm2 + z__[j4p2];
+ if (z__[j4 - 2] == 0.) {
+ z__[j4] = 0.;
+ *dnm1 = z__[j4p2 + 2];
+ *dmin__ = *dnm1;
+ emin = 0.;
+ } else if (safmin * z__[j4p2 + 2] < z__[j4 - 2] && safmin * z__[j4 - 2] <
+ z__[j4p2 + 2]) {
+ temp = z__[j4p2 + 2] / z__[j4 - 2];
+ z__[j4] = z__[j4p2] * temp;
+ *dnm1 = *dnm2 * temp;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]);
+ }
+ *dmin__ = min(*dmin__,*dnm1);
+
+ *dmin1 = *dmin__;
+ j4 += 4;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm1 + z__[j4p2];
+ if (z__[j4 - 2] == 0.) {
+ z__[j4] = 0.;
+ *dn = z__[j4p2 + 2];
+ *dmin__ = *dn;
+ emin = 0.;
+ } else if (safmin * z__[j4p2 + 2] < z__[j4 - 2] && safmin * z__[j4 - 2] <
+ z__[j4p2 + 2]) {
+ temp = z__[j4p2 + 2] / z__[j4 - 2];
+ z__[j4] = z__[j4p2] * temp;
+ *dn = *dnm1 * temp;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]);
+ }
+ *dmin__ = min(*dmin__,*dn);
+
+ z__[j4 + 2] = *dn;
+ z__[(*n0 << 2) - *pp] = emin;
+ return 0;
+
+/* End of DLASQ6 */
+
+} /* dlasq6_ */
+
+/* Subroutine */ int dlasr_(char *side, char *pivot, char *direct, integer *m,
+ integer *n, doublereal *c__, doublereal *s, doublereal *a, integer *
+ lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, info;
+ static doublereal temp;
+ extern logical lsame_(char *, char *);
+ static doublereal ctemp, stemp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASR applies a sequence of plane rotations to a real matrix A,
+ from either the left or the right.
+
+ When SIDE = 'L', the transformation takes the form
+
+ A := P*A
+
+ and when SIDE = 'R', the transformation takes the form
+
+ A := A*P**T
+
+ where P is an orthogonal matrix consisting of a sequence of z plane
+ rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+ and P**T is the transpose of P.
+
+ When DIRECT = 'F' (Forward sequence), then
+
+ P = P(z-1) * ... * P(2) * P(1)
+
+ and when DIRECT = 'B' (Backward sequence), then
+
+ P = P(1) * P(2) * ... * P(z-1)
+
+ where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+
+ R(k) = ( c(k) s(k) )
+ = ( -s(k) c(k) ).
+
+ When PIVOT = 'V' (Variable pivot), the rotation is performed
+ for the plane (k,k+1), i.e., P(k) has the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears as a rank-2 modification to the identity matrix in
+ rows and columns k and k+1.
+
+ When PIVOT = 'T' (Top pivot), the rotation is performed for the
+ plane (1,k+1), so P(k) has the form
+
+ P(k) = ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears in rows and columns 1 and k+1.
+
+ Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+ performed for the plane (k,z), giving P(k) the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+
+ where R(k) appears in rows and columns k and z. The rotations are
+ performed without ever forming P(k) explicitly.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ Specifies whether the plane rotation matrix P is applied to
+ A on the left or the right.
+ = 'L': Left, compute A := P*A
+ = 'R': Right, compute A:= A*P**T
+
+ PIVOT (input) CHARACTER*1
+ Specifies the plane for which P(k) is a plane rotation
+ matrix.
+ = 'V': Variable pivot, the plane (k,k+1)
+ = 'T': Top pivot, the plane (1,k+1)
+ = 'B': Bottom pivot, the plane (k,z)
+
+ DIRECT (input) CHARACTER*1
+ Specifies whether P is a forward or backward sequence of
+ plane rotations.
+ = 'F': Forward, P = P(z-1)*...*P(2)*P(1)
+ = 'B': Backward, P = P(1)*P(2)*...*P(z-1)
+
+ M (input) INTEGER
+ The number of rows of the matrix A. If m <= 1, an immediate
+ return is effected.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. If n <= 1, an
+ immediate return is effected.
+
+ C (input) DOUBLE PRECISION array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The cosines c(k) of the plane rotations.
+
+ S (input) DOUBLE PRECISION array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The sines s(k) of the plane rotations. The 2-by-2 plane
+ rotation part of the matrix P(k), R(k), has the form
+ R(k) = ( c(k) s(k) )
+ ( -s(k) c(k) ).
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+            The M-by-N matrix A.  On exit, A is overwritten by P*A  if
+            SIDE = 'L' or by A*P**T if SIDE = 'R'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
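+
+/*
+   Editor's note -- illustrative sketch only, not part of the LAPACK sources.
+   Applying a single plane rotation from the left to a 2-by-2 column-major
+   matrix (SIDE='L', PIVOT='V', DIRECT='F'), i.e. A := P(1)*A with
+   P(1) = ( c  s ; -s  c ):
+
+       integer m = 2, n = 2, lda = 2;
+       doublereal c = 0.8, s = 0.6;
+       doublereal a[4] = { 1., 0., 0., 1. };
+       dlasr_("L", "V", "F", &m, &n, &c, &s, a, &lda);
+
+   Since A starts as the identity, a ends up holding P(1) itself,
+   i.e. { 0.8, -0.6, 0.6, 0.8 } in column-major order.
+*/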
+
+ /* Parameter adjustments */
+ --c__;
+ --s;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! (lsame_(side, "L") || lsame_(side, "R"))) {
+ info = 1;
+ } else if (! (lsame_(pivot, "V") || lsame_(pivot,
+ "T") || lsame_(pivot, "B"))) {
+ info = 2;
+ } else if (! (lsame_(direct, "F") || lsame_(direct,
+ "B"))) {
+ info = 3;
+ } else if (*m < 0) {
+ info = 4;
+ } else if (*n < 0) {
+ info = 5;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("DLASR ", &info);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+ if (lsame_(side, "L")) {
+
+/* Form P * A */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[j + 1 + i__ * a_dim1];
+ a[j + 1 + i__ * a_dim1] = ctemp * temp - stemp *
+ a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * temp + ctemp * a[j
+ + i__ * a_dim1];
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[j + 1 + i__ * a_dim1];
+ a[j + 1 + i__ * a_dim1] = ctemp * temp - stemp *
+ a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * temp + ctemp * a[j
+ + i__ * a_dim1];
+/* L30: */
+ }
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = ctemp * temp - stemp * a[
+ i__ * a_dim1 + 1];
+ a[i__ * a_dim1 + 1] = stemp * temp + ctemp * a[
+ i__ * a_dim1 + 1];
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = ctemp * temp - stemp * a[
+ i__ * a_dim1 + 1];
+ a[i__ * a_dim1 + 1] = stemp * temp + ctemp * a[
+ i__ * a_dim1 + 1];
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * a[*m + i__ * a_dim1]
+ + ctemp * temp;
+ a[*m + i__ * a_dim1] = ctemp * a[*m + i__ *
+ a_dim1] - stemp * temp;
+/* L90: */
+ }
+ }
+/* L100: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * a[*m + i__ * a_dim1]
+ + ctemp * temp;
+ a[*m + i__ * a_dim1] = ctemp * a[*m + i__ *
+ a_dim1] - stemp * temp;
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+ }
+ }
+ } else if (lsame_(side, "R")) {
+
+/* Form A * P' */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[i__ + (j + 1) * a_dim1];
+ a[i__ + (j + 1) * a_dim1] = ctemp * temp - stemp *
+ a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * temp + ctemp * a[
+ i__ + j * a_dim1];
+/* L130: */
+ }
+ }
+/* L140: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[i__ + (j + 1) * a_dim1];
+ a[i__ + (j + 1) * a_dim1] = ctemp * temp - stemp *
+ a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * temp + ctemp * a[
+ i__ + j * a_dim1];
+/* L150: */
+ }
+ }
+/* L160: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = ctemp * temp - stemp * a[
+ i__ + a_dim1];
+ a[i__ + a_dim1] = stemp * temp + ctemp * a[i__ +
+ a_dim1];
+/* L170: */
+ }
+ }
+/* L180: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = ctemp * temp - stemp * a[
+ i__ + a_dim1];
+ a[i__ + a_dim1] = stemp * temp + ctemp * a[i__ +
+ a_dim1];
+/* L190: */
+ }
+ }
+/* L200: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * a[i__ + *n * a_dim1]
+ + ctemp * temp;
+ a[i__ + *n * a_dim1] = ctemp * a[i__ + *n *
+ a_dim1] - stemp * temp;
+/* L210: */
+ }
+ }
+/* L220: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * a[i__ + *n * a_dim1]
+ + ctemp * temp;
+ a[i__ + *n * a_dim1] = ctemp * a[i__ + *n *
+ a_dim1] - stemp * temp;
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DLASR */
+
+} /* dlasr_ */
+
+/* Subroutine */ int dlasrt_(char *id, integer *n, doublereal *d__, integer *
+ info)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal d1, d2, d3;
+ static integer dir;
+ static doublereal tmp;
+ static integer endd;
+ extern logical lsame_(char *, char *);
+ static integer stack[64] /* was [2][32] */;
+ static doublereal dmnmx;
+ static integer start;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static integer stkpnt;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ Sort the numbers in D in increasing order (if ID = 'I') or
+ in decreasing order (if ID = 'D' ).
+
+ Use Quick Sort, reverting to Insertion sort on arrays of
+ size <= 20. Dimension of STACK limits N to about 2**32.
+
+ Arguments
+ =========
+
+ ID (input) CHARACTER*1
+ = 'I': sort D in increasing order;
+ = 'D': sort D in decreasing order.
+
+ N (input) INTEGER
+ The length of the array D.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the array to be sorted.
+ On exit, D has been sorted into increasing order
+ (D(1) <= ... <= D(N) ) or into decreasing order
+ (D(1) >= ... >= D(N) ), depending on ID.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+   Test the input parameters.
+*/
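+
+/*
+   Editor's note -- illustrative sketch only, not part of the LAPACK sources.
+   Sorting a small array in place into decreasing order:
+
+       integer n = 4, info;
+       doublereal d[4] = { 2., 4., 1., 3. };
+       dlasrt_("D", &n, d, &info);
+
+   On return info == 0 and d holds { 4., 3., 2., 1. }.
+*/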
+
+ /* Parameter adjustments */
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ dir = -1;
+ if (lsame_(id, "D")) {
+ dir = 0;
+ } else if (lsame_(id, "I")) {
+ dir = 1;
+ }
+ if (dir == -1) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLASRT", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 1) {
+ return 0;
+ }
+
+ stkpnt = 1;
+ stack[0] = 1;
+ stack[1] = *n;
+L10:
+ start = stack[(stkpnt << 1) - 2];
+ endd = stack[(stkpnt << 1) - 1];
+ --stkpnt;
+ if (endd - start <= 20 && endd - start > 0) {
+
+/* Do Insertion sort on D( START:ENDD ) */
+
+ if (dir == 0) {
+
+/* Sort into decreasing order */
+
+ i__1 = endd;
+ for (i__ = start + 1; i__ <= i__1; ++i__) {
+ i__2 = start + 1;
+ for (j = i__; j >= i__2; --j) {
+ if (d__[j] > d__[j - 1]) {
+ dmnmx = d__[j];
+ d__[j] = d__[j - 1];
+ d__[j - 1] = dmnmx;
+ } else {
+ goto L30;
+ }
+/* L20: */
+ }
+L30:
+ ;
+ }
+
+ } else {
+
+/* Sort into increasing order */
+
+ i__1 = endd;
+ for (i__ = start + 1; i__ <= i__1; ++i__) {
+ i__2 = start + 1;
+ for (j = i__; j >= i__2; --j) {
+ if (d__[j] < d__[j - 1]) {
+ dmnmx = d__[j];
+ d__[j] = d__[j - 1];
+ d__[j - 1] = dmnmx;
+ } else {
+ goto L50;
+ }
+/* L40: */
+ }
+L50:
+ ;
+ }
+
+ }
+
+ } else if (endd - start > 20) {
+
+/*
+ Partition D( START:ENDD ) and stack parts, largest one first
+
+ Choose partition entry as median of 3
+*/
+
+ d1 = d__[start];
+ d2 = d__[endd];
+ i__ = (start + endd) / 2;
+ d3 = d__[i__];
+ if (d1 < d2) {
+ if (d3 < d1) {
+ dmnmx = d1;
+ } else if (d3 < d2) {
+ dmnmx = d3;
+ } else {
+ dmnmx = d2;
+ }
+ } else {
+ if (d3 < d2) {
+ dmnmx = d2;
+ } else if (d3 < d1) {
+ dmnmx = d3;
+ } else {
+ dmnmx = d1;
+ }
+ }
+
+ if (dir == 0) {
+
+/* Sort into decreasing order */
+
+ i__ = start - 1;
+ j = endd + 1;
+L60:
+L70:
+ --j;
+ if (d__[j] < dmnmx) {
+ goto L70;
+ }
+L80:
+ ++i__;
+ if (d__[i__] > dmnmx) {
+ goto L80;
+ }
+ if (i__ < j) {
+ tmp = d__[i__];
+ d__[i__] = d__[j];
+ d__[j] = tmp;
+ goto L60;
+ }
+ if (j - start > endd - j - 1) {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ } else {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ }
+ } else {
+
+/* Sort into increasing order */
+
+ i__ = start - 1;
+ j = endd + 1;
+L90:
+L100:
+ --j;
+ if (d__[j] > dmnmx) {
+ goto L100;
+ }
+L110:
+ ++i__;
+ if (d__[i__] < dmnmx) {
+ goto L110;
+ }
+ if (i__ < j) {
+ tmp = d__[i__];
+ d__[i__] = d__[j];
+ d__[j] = tmp;
+ goto L90;
+ }
+ if (j - start > endd - j - 1) {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ } else {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ }
+ }
+ }
+ if (stkpnt > 0) {
+ goto L10;
+ }
+ return 0;
+
+/* End of DLASRT */
+
+} /* dlasrt_ */
+
+/* Subroutine */ int dlassq_(integer *n, doublereal *x, integer *incx,
+ doublereal *scale, doublereal *sumsq)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer ix;
+ static doublereal absxi;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASSQ returns the values scl and smsq such that
+
+ ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+
+ where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
+ assumed to be non-negative and scl returns the value
+
+ scl = max( scale, abs( x( i ) ) ).
+
+ scale and sumsq must be supplied in SCALE and SUMSQ and
+ scl and smsq are overwritten on SCALE and SUMSQ respectively.
+
+ The routine makes only one pass through the vector x.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of elements to be used from the vector X.
+
+ X (input) DOUBLE PRECISION array, dimension (N)
+ The vector for which a scaled sum of squares is computed.
+ x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+
+ INCX (input) INTEGER
+ The increment between successive values of the vector X.
+ INCX > 0.
+
+ SCALE (input/output) DOUBLE PRECISION
+ On entry, the value scale in the equation above.
+ On exit, SCALE is overwritten with scl , the scaling factor
+ for the sum of squares.
+
+ SUMSQ (input/output) DOUBLE PRECISION
+ On entry, the value sumsq in the equation above.
+ On exit, SUMSQ is overwritten with smsq , the basic sum of
+ squares from which scl has been factored out.
+
+ =====================================================================
+*/
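+
+/*
+   Editor's note -- illustrative sketch only, not part of the LAPACK sources.
+   The usual one-pass 2-norm idiom built on DLASSQ: start from SCALE = 0,
+   SUMSQ = 1 and recover the norm as SCALE*sqrt(SUMSQ):
+
+       integer n = 3, incx = 1;
+       doublereal x[3] = { 3., 0., 4. };
+       doublereal scale = 0., sumsq = 1.;
+       dlassq_(&n, x, &incx, &scale, &sumsq);
+
+   Here scale becomes 4., sumsq becomes 25./16., and scale*sqrt(sumsq)
+   equals 5., the Euclidean norm of x.
+*/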
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n > 0) {
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ if (x[ix] != 0.) {
+ absxi = (d__1 = x[ix], abs(d__1));
+ if (*scale < absxi) {
+/* Computing 2nd power */
+ d__1 = *scale / absxi;
+ *sumsq = *sumsq * (d__1 * d__1) + 1;
+ *scale = absxi;
+ } else {
+/* Computing 2nd power */
+ d__1 = absxi / *scale;
+ *sumsq += d__1 * d__1;
+ }
+ }
+/* L10: */
+ }
+ }
+ return 0;
+
+/* End of DLASSQ */
+
+} /* dlassq_ */
+
+/* Subroutine */ int dlasv2_(doublereal *f, doublereal *g, doublereal *h__,
+ doublereal *ssmin, doublereal *ssmax, doublereal *snr, doublereal *
+ csr, doublereal *snl, doublereal *csl)
+{
+ /* System generated locals */
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal a, d__, l, m, r__, s, t, fa, ga, ha, ft, gt, ht, mm, tt,
+ clt, crt, slt, srt;
+ static integer pmax;
+ static doublereal temp;
+ static logical swap;
+ static doublereal tsign;
+
+ static logical gasmal;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASV2 computes the singular value decomposition of a 2-by-2
+ triangular matrix
+ [ F G ]
+ [ 0 H ].
+ On return, abs(SSMAX) is the larger singular value, abs(SSMIN) is the
+ smaller singular value, and (CSL,SNL) and (CSR,SNR) are the left and
+ right singular vectors for abs(SSMAX), giving the decomposition
+
+ [ CSL SNL ] [ F G ] [ CSR -SNR ] = [ SSMAX 0 ]
+ [-SNL CSL ] [ 0 H ] [ SNR CSR ] [ 0 SSMIN ].
+
+ Arguments
+ =========
+
+ F (input) DOUBLE PRECISION
+ The (1,1) element of the 2-by-2 matrix.
+
+ G (input) DOUBLE PRECISION
+ The (1,2) element of the 2-by-2 matrix.
+
+ H (input) DOUBLE PRECISION
+ The (2,2) element of the 2-by-2 matrix.
+
+ SSMIN (output) DOUBLE PRECISION
+ abs(SSMIN) is the smaller singular value.
+
+ SSMAX (output) DOUBLE PRECISION
+ abs(SSMAX) is the larger singular value.
+
+ SNL (output) DOUBLE PRECISION
+ CSL (output) DOUBLE PRECISION
+ The vector (CSL, SNL) is a unit left singular vector for the
+ singular value abs(SSMAX).
+
+ SNR (output) DOUBLE PRECISION
+ CSR (output) DOUBLE PRECISION
+ The vector (CSR, SNR) is a unit right singular vector for the
+ singular value abs(SSMAX).
+
+ Further Details
+ ===============
+
+ Any input parameter may be aliased with any output parameter.
+
+ Barring over/underflow and assuming a guard digit in subtraction, all
+ output quantities are correct to within a few units in the last
+ place (ulps).
+
+ In IEEE arithmetic, the code works correctly if one matrix element is
+ infinite.
+
+ Overflow will not occur unless the largest singular value itself
+ overflows or is within a few ulps of overflow. (On machines with
+ partial overflow, like the Cray, overflow may occur if the largest
+ singular value is within a factor of 2 of overflow.)
+
+ Underflow is harmless if underflow is gradual. Otherwise, results
+ may correspond to a matrix modified by perturbations of size near
+ the underflow threshold.
+
+ =====================================================================
+*/
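+
+/*
+   Editor's note -- illustrative sketch only, not part of the LAPACK sources.
+   Singular values of the upper triangular 2-by-2 matrix [ F G ; 0 H ]:
+
+       doublereal f = 3., g = 4., h = 0.;
+       doublereal ssmin, ssmax, snr, csr, snl, csl;
+       dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl);
+
+   For this rank-1 example abs(ssmax) == 5. and abs(ssmin) == 0.
+*/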
+
+
+ ft = *f;
+ fa = abs(ft);
+ ht = *h__;
+ ha = abs(*h__);
+
+/*
+ PMAX points to the maximum absolute element of matrix
+ PMAX = 1 if F largest in absolute values
+ PMAX = 2 if G largest in absolute values
+ PMAX = 3 if H largest in absolute values
+*/
+
+ pmax = 1;
+ swap = ha > fa;
+ if (swap) {
+ pmax = 3;
+ temp = ft;
+ ft = ht;
+ ht = temp;
+ temp = fa;
+ fa = ha;
+ ha = temp;
+
+/* Now FA .ge. HA */
+
+ }
+ gt = *g;
+ ga = abs(gt);
+ if (ga == 0.) {
+
+/* Diagonal matrix */
+
+ *ssmin = ha;
+ *ssmax = fa;
+ clt = 1.;
+ crt = 1.;
+ slt = 0.;
+ srt = 0.;
+ } else {
+ gasmal = TRUE_;
+ if (ga > fa) {
+ pmax = 2;
+ if (fa / ga < EPSILON) {
+
+/* Case of very large GA */
+
+ gasmal = FALSE_;
+ *ssmax = ga;
+ if (ha > 1.) {
+ *ssmin = fa / (ga / ha);
+ } else {
+ *ssmin = fa / ga * ha;
+ }
+ clt = 1.;
+ slt = ht / gt;
+ srt = 1.;
+ crt = ft / gt;
+ }
+ }
+ if (gasmal) {
+
+/* Normal case */
+
+ d__ = fa - ha;
+ if (d__ == fa) {
+
+/* Copes with infinite F or H */
+
+ l = 1.;
+ } else {
+ l = d__ / fa;
+ }
+
+/* Note that 0 .le. L .le. 1 */
+
+ m = gt / ft;
+
+/* Note that abs(M) .le. 1/macheps */
+
+ t = 2. - l;
+
+/* Note that T .ge. 1 */
+
+ mm = m * m;
+ tt = t * t;
+ s = sqrt(tt + mm);
+
+/* Note that 1 .le. S .le. 1 + 1/macheps */
+
+ if (l == 0.) {
+ r__ = abs(m);
+ } else {
+ r__ = sqrt(l * l + mm);
+ }
+
+/* Note that 0 .le. R .le. 1 + 1/macheps */
+
+ a = (s + r__) * .5;
+
+/* Note that 1 .le. A .le. 1 + abs(M) */
+
+ *ssmin = ha / a;
+ *ssmax = fa * a;
+ if (mm == 0.) {
+
+/* Note that M is very tiny */
+
+ if (l == 0.) {
+ t = d_sign(&c_b3192, &ft) * d_sign(&c_b15, &gt);
+ } else {
+ t = gt / d_sign(&d__, &ft) + m / t;
+ }
+ } else {
+ t = (m / (s + t) + m / (r__ + l)) * (a + 1.);
+ }
+ l = sqrt(t * t + 4.);
+ crt = 2. / l;
+ srt = t / l;
+ clt = (crt + srt * m) / a;
+ slt = ht / ft * srt / a;
+ }
+ }
+ if (swap) {
+ *csl = srt;
+ *snl = crt;
+ *csr = slt;
+ *snr = clt;
+ } else {
+ *csl = clt;
+ *snl = slt;
+ *csr = crt;
+ *snr = srt;
+ }
+
+/* Correct signs of SSMAX and SSMIN */
+
+ if (pmax == 1) {
+ tsign = d_sign(&c_b15, csr) * d_sign(&c_b15, csl) * d_sign(&c_b15, f);
+ }
+ if (pmax == 2) {
+ tsign = d_sign(&c_b15, snr) * d_sign(&c_b15, csl) * d_sign(&c_b15, g);
+ }
+ if (pmax == 3) {
+ tsign = d_sign(&c_b15, snr) * d_sign(&c_b15, snl) * d_sign(&c_b15,
+ h__);
+ }
+ *ssmax = d_sign(ssmax, &tsign);
+ d__1 = tsign * d_sign(&c_b15, f) * d_sign(&c_b15, h__);
+ *ssmin = d_sign(ssmin, &d__1);
+ return 0;
+
+/* End of DLASV2 */
+
+} /* dlasv2_ */
+
+/* Subroutine */ int dlaswp_(integer *n, doublereal *a, integer *lda, integer
+ *k1, integer *k2, integer *ipiv, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, k, i1, i2, n32, ip, ix, ix0, inc;
+ static doublereal temp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASWP performs a series of row interchanges on the matrix A.
+ One row interchange is initiated for each of rows K1 through K2 of A.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the matrix of column dimension N to which the row
+ interchanges will be applied.
+ On exit, the permuted matrix.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+
+ K1 (input) INTEGER
+ The first element of IPIV for which a row interchange will
+ be done.
+
+ K2 (input) INTEGER
+ The last element of IPIV for which a row interchange will
+ be done.
+
+ IPIV (input) INTEGER array, dimension (K2*abs(INCX))
+ The vector of pivot indices. Only the elements in positions
+ K1 through K2 of IPIV are accessed.
+ IPIV(K) = L implies rows K and L are to be interchanged.
+
+ INCX (input) INTEGER
+ The increment between successive values of IPIV. If INCX
+ is negative, the pivots are applied in reverse order.
+
+ Further Details
+ ===============
+
+ Modified by
+ R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA
+
+ =====================================================================
+
+
+ Interchange row I with row IPIV(I) for each of rows K1 through K2.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ if (*incx > 0) {
+ ix0 = *k1;
+ i1 = *k1;
+ i2 = *k2;
+ inc = 1;
+ } else if (*incx < 0) {
+ ix0 = (1 - *k2) * *incx + 1;
+ i1 = *k2;
+ i2 = *k1;
+ inc = -1;
+ } else {
+ return 0;
+ }
+
+ n32 = *n / 32 << 5;
+ if (n32 != 0) {
+ i__1 = n32;
+ for (j = 1; j <= i__1; j += 32) {
+ ix = ix0;
+ i__2 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__3)
+ {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__4 = j + 31;
+ for (k = j; k <= i__4; ++k) {
+ temp = a[i__ + k * a_dim1];
+ a[i__ + k * a_dim1] = a[ip + k * a_dim1];
+ a[ip + k * a_dim1] = temp;
+/* L10: */
+ }
+ }
+ ix += *incx;
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ if (n32 != *n) {
+ ++n32;
+ ix = ix0;
+ i__1 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__3) {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__2 = *n;
+ for (k = n32; k <= i__2; ++k) {
+ temp = a[i__ + k * a_dim1];
+ a[i__ + k * a_dim1] = a[ip + k * a_dim1];
+ a[ip + k * a_dim1] = temp;
+/* L40: */
+ }
+ }
+ ix += *incx;
+/* L50: */
+ }
+ }
+
+ return 0;
+
+/* End of DLASWP */
+
+} /* dlaswp_ */
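+
+/*
+ Illustrative usage sketch, not part of the LAPACK translation above:
+ applying the row interchanges recorded by an LU factorization to a
+ column-major matrix. The pivot values below are hypothetical; in
+ practice IPIV would come from a routine such as dgetrf_, and the
+ array a would already hold the factored 4-by-3 matrix.
+
+     integer n = 3, lda = 4, k1 = 1, k2 = 4, incx = 1;
+     integer ipiv[4] = { 3, 2, 4, 4 };
+     doublereal a[12];
+     dlaswp_(&n, a, &lda, &k1, &k2, ipiv, &incx);
+*/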
+
+/* Subroutine */ int dlasy2_(logical *ltranl, logical *ltranr, integer *isgn,
+ integer *n1, integer *n2, doublereal *tl, integer *ldtl, doublereal *
+ tr, integer *ldtr, doublereal *b, integer *ldb, doublereal *scale,
+ doublereal *x, integer *ldx, doublereal *xnorm, integer *info)
+{
+ /* Initialized data */
+
+ static integer locu12[4] = { 3,4,1,2 };
+ static integer locl21[4] = { 2,1,4,3 };
+ static integer locu22[4] = { 4,3,2,1 };
+ static logical xswpiv[4] = { FALSE_,FALSE_,TRUE_,TRUE_ };
+ static logical bswpiv[4] = { FALSE_,TRUE_,FALSE_,TRUE_ };
+
+ /* System generated locals */
+ integer b_dim1, b_offset, tl_dim1, tl_offset, tr_dim1, tr_offset, x_dim1,
+ x_offset;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6, d__7, d__8;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static doublereal x2[2], l21, u11, u12;
+ static integer ip, jp;
+ static doublereal u22, t16[16] /* was [4][4] */, gam, bet, eps, sgn,
+ tmp[4], tau1, btmp[4], smin;
+ static integer ipiv;
+ static doublereal temp;
+ static integer jpiv[4];
+ static doublereal xmax;
+ static integer ipsv, jpsv;
+ static logical bswap;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *), dswap_(integer *, doublereal *, integer
+ *, doublereal *, integer *);
+ static logical xswap;
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ static doublereal smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLASY2 solves for the N1 by N2 matrix X, 1 <= N1,N2 <= 2, in
+
+ op(TL)*X + ISGN*X*op(TR) = SCALE*B,
+
+ where TL is N1 by N1, TR is N2 by N2, B is N1 by N2, and ISGN = 1 or
+ -1. op(T) = T or T', where T' denotes the transpose of T.
+
+ Arguments
+ =========
+
+ LTRANL (input) LOGICAL
+ On entry, LTRANL specifies the op(TL):
+ = .FALSE., op(TL) = TL,
+ = .TRUE., op(TL) = TL'.
+
+ LTRANR (input) LOGICAL
+ On entry, LTRANR specifies the op(TR):
+ = .FALSE., op(TR) = TR,
+ = .TRUE., op(TR) = TR'.
+
+ ISGN (input) INTEGER
+ On entry, ISGN specifies the sign of the equation
+ as described before. ISGN may only be 1 or -1.
+
+ N1 (input) INTEGER
+ On entry, N1 specifies the order of matrix TL.
+ N1 may only be 0, 1 or 2.
+
+ N2 (input) INTEGER
+ On entry, N2 specifies the order of matrix TR.
+ N2 may only be 0, 1 or 2.
+
+ TL (input) DOUBLE PRECISION array, dimension (LDTL,2)
+ On entry, TL contains an N1 by N1 matrix.
+
+ LDTL (input) INTEGER
+ The leading dimension of the matrix TL. LDTL >= max(1,N1).
+
+ TR (input) DOUBLE PRECISION array, dimension (LDTR,2)
+ On entry, TR contains an N2 by N2 matrix.
+
+ LDTR (input) INTEGER
+ The leading dimension of the matrix TR. LDTR >= max(1,N2).
+
+ B (input) DOUBLE PRECISION array, dimension (LDB,2)
+ On entry, the N1 by N2 matrix B contains the right-hand
+ side of the equation.
+
+ LDB (input) INTEGER
+ The leading dimension of the matrix B. LDB >= max(1,N1).
+
+ SCALE (output) DOUBLE PRECISION
+ On exit, SCALE contains the scale factor. SCALE is chosen
+ less than or equal to 1 to prevent the solution overflowing.
+
+ X (output) DOUBLE PRECISION array, dimension (LDX,2)
+ On exit, X contains the N1 by N2 solution.
+
+ LDX (input) INTEGER
+ The leading dimension of the matrix X. LDX >= max(1,N1).
+
+ XNORM (output) DOUBLE PRECISION
+ On exit, XNORM is the infinity-norm of the solution.
+
+ INFO (output) INTEGER
+ On exit, INFO is set to
+ 0: successful exit.
+ 1: TL and TR have too close eigenvalues, so TL or
+ TR is perturbed to get a nonsingular equation.
+ NOTE: In the interests of speed, this routine does not
+ check the inputs for errors.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ tl_dim1 = *ldtl;
+ tl_offset = 1 + tl_dim1;
+ tl -= tl_offset;
+ tr_dim1 = *ldtr;
+ tr_offset = 1 + tr_dim1;
+ tr -= tr_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+
+ /* Function Body */
+
+/* Do not check the input parameters for errors */
+
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n1 == 0 || *n2 == 0) {
+ return 0;
+ }
+
+/* Set constants to control overflow */
+
+ eps = PRECISION;
+ smlnum = SAFEMINIMUM / eps;
+ sgn = (doublereal) (*isgn);
+
+ k = *n1 + *n1 + *n2 - 2;
+ switch (k) {
+ case 1: goto L10;
+ case 2: goto L20;
+ case 3: goto L30;
+ case 4: goto L50;
+ }
+
+/* 1 by 1: TL11*X + SGN*X*TR11 = B11 */
+
+L10:
+ tau1 = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ bet = abs(tau1);
+ if (bet <= smlnum) {
+ tau1 = smlnum;
+ bet = smlnum;
+ *info = 1;
+ }
+
+ *scale = 1.;
+ gam = (d__1 = b[b_dim1 + 1], abs(d__1));
+ if (smlnum * gam > bet) {
+ *scale = 1. / gam;
+ }
+
+ x[x_dim1 + 1] = b[b_dim1 + 1] * *scale / tau1;
+ *xnorm = (d__1 = x[x_dim1 + 1], abs(d__1));
+ return 0;
+
+/*
+ 1 by 2:
+ TL11*[X11 X12] + ISGN*[X11 X12]*op[TR11 TR12] = [B11 B12]
+ [TR21 TR22]
+*/
+
+L20:
+
+/*
+ Computing MAX
+ Computing MAX
+*/
+ d__7 = (d__1 = tl[tl_dim1 + 1], abs(d__1)), d__8 = (d__2 = tr[tr_dim1 + 1]
+ , abs(d__2)), d__7 = max(d__7,d__8), d__8 = (d__3 = tr[(tr_dim1 <<
+ 1) + 1], abs(d__3)), d__7 = max(d__7,d__8), d__8 = (d__4 = tr[
+ tr_dim1 + 2], abs(d__4)), d__7 = max(d__7,d__8), d__8 = (d__5 =
+ tr[(tr_dim1 << 1) + 2], abs(d__5));
+ d__6 = eps * max(d__7,d__8);
+ smin = max(d__6,smlnum);
+ tmp[0] = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ tmp[3] = tl[tl_dim1 + 1] + sgn * tr[(tr_dim1 << 1) + 2];
+ if (*ltranr) {
+ tmp[1] = sgn * tr[tr_dim1 + 2];
+ tmp[2] = sgn * tr[(tr_dim1 << 1) + 1];
+ } else {
+ tmp[1] = sgn * tr[(tr_dim1 << 1) + 1];
+ tmp[2] = sgn * tr[tr_dim1 + 2];
+ }
+ btmp[0] = b[b_dim1 + 1];
+ btmp[1] = b[(b_dim1 << 1) + 1];
+ goto L40;
+
+/*
+ 2 by 1:
+ op[TL11 TL12]*[X11] + ISGN* [X11]*TR11 = [B11]
+ [TL21 TL22] [X21] [X21] [B21]
+*/
+
+L30:
+/*
+ Computing MAX
+ Computing MAX
+*/
+ d__7 = (d__1 = tr[tr_dim1 + 1], abs(d__1)), d__8 = (d__2 = tl[tl_dim1 + 1]
+ , abs(d__2)), d__7 = max(d__7,d__8), d__8 = (d__3 = tl[(tl_dim1 <<
+ 1) + 1], abs(d__3)), d__7 = max(d__7,d__8), d__8 = (d__4 = tl[
+ tl_dim1 + 2], abs(d__4)), d__7 = max(d__7,d__8), d__8 = (d__5 =
+ tl[(tl_dim1 << 1) + 2], abs(d__5));
+ d__6 = eps * max(d__7,d__8);
+ smin = max(d__6,smlnum);
+ tmp[0] = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ tmp[3] = tl[(tl_dim1 << 1) + 2] + sgn * tr[tr_dim1 + 1];
+ if (*ltranl) {
+ tmp[1] = tl[(tl_dim1 << 1) + 1];
+ tmp[2] = tl[tl_dim1 + 2];
+ } else {
+ tmp[1] = tl[tl_dim1 + 2];
+ tmp[2] = tl[(tl_dim1 << 1) + 1];
+ }
+ btmp[0] = b[b_dim1 + 1];
+ btmp[1] = b[b_dim1 + 2];
+L40:
+
+/*
+ Solve 2 by 2 system using complete pivoting.
+ Set pivots less than SMIN to SMIN.
+*/
+
+ ipiv = idamax_(&c__4, tmp, &c__1);
+ u11 = tmp[ipiv - 1];
+ if (abs(u11) <= smin) {
+ *info = 1;
+ u11 = smin;
+ }
+ u12 = tmp[locu12[ipiv - 1] - 1];
+ l21 = tmp[locl21[ipiv - 1] - 1] / u11;
+ u22 = tmp[locu22[ipiv - 1] - 1] - u12 * l21;
+ xswap = xswpiv[ipiv - 1];
+ bswap = bswpiv[ipiv - 1];
+ if (abs(u22) <= smin) {
+ *info = 1;
+ u22 = smin;
+ }
+ if (bswap) {
+ temp = btmp[1];
+ btmp[1] = btmp[0] - l21 * temp;
+ btmp[0] = temp;
+ } else {
+ btmp[1] -= l21 * btmp[0];
+ }
+ *scale = 1.;
+ if (smlnum * 2. * abs(btmp[1]) > abs(u22) || smlnum * 2. * abs(btmp[0]) >
+ abs(u11)) {
+/* Computing MAX */
+ d__1 = abs(btmp[0]), d__2 = abs(btmp[1]);
+ *scale = .5 / max(d__1,d__2);
+ btmp[0] *= *scale;
+ btmp[1] *= *scale;
+ }
+ x2[1] = btmp[1] / u22;
+ x2[0] = btmp[0] / u11 - u12 / u11 * x2[1];
+ if (xswap) {
+ temp = x2[1];
+ x2[1] = x2[0];
+ x2[0] = temp;
+ }
+ x[x_dim1 + 1] = x2[0];
+ if (*n1 == 1) {
+ x[(x_dim1 << 1) + 1] = x2[1];
+ *xnorm = (d__1 = x[x_dim1 + 1], abs(d__1)) + (d__2 = x[(x_dim1 << 1)
+ + 1], abs(d__2));
+ } else {
+ x[x_dim1 + 2] = x2[1];
+/* Computing MAX */
+ d__3 = (d__1 = x[x_dim1 + 1], abs(d__1)), d__4 = (d__2 = x[x_dim1 + 2]
+ , abs(d__2));
+ *xnorm = max(d__3,d__4);
+ }
+ return 0;
+
+/*
+ 2 by 2:
+ op[TL11 TL12]*[X11 X12] +ISGN* [X11 X12]*op[TR11 TR12] = [B11 B12]
+ [TL21 TL22] [X21 X22] [X21 X22] [TR21 TR22] [B21 B22]
+
+ Solve equivalent 4 by 4 system using complete pivoting.
+ Set pivots less than SMIN to SMIN.
+*/
+
+L50:
+/* Computing MAX */
+ d__5 = (d__1 = tr[tr_dim1 + 1], abs(d__1)), d__6 = (d__2 = tr[(tr_dim1 <<
+ 1) + 1], abs(d__2)), d__5 = max(d__5,d__6), d__6 = (d__3 = tr[
+ tr_dim1 + 2], abs(d__3)), d__5 = max(d__5,d__6), d__6 = (d__4 =
+ tr[(tr_dim1 << 1) + 2], abs(d__4));
+ smin = max(d__5,d__6);
+/* Computing MAX */
+ d__5 = smin, d__6 = (d__1 = tl[tl_dim1 + 1], abs(d__1)), d__5 = max(d__5,
+ d__6), d__6 = (d__2 = tl[(tl_dim1 << 1) + 1], abs(d__2)), d__5 =
+ max(d__5,d__6), d__6 = (d__3 = tl[tl_dim1 + 2], abs(d__3)), d__5 =
+ max(d__5,d__6), d__6 = (d__4 = tl[(tl_dim1 << 1) + 2], abs(d__4))
+ ;
+ smin = max(d__5,d__6);
+/* Computing MAX */
+ d__1 = eps * smin;
+ smin = max(d__1,smlnum);
+ btmp[0] = 0.;
+ dcopy_(&c__16, btmp, &c__0, t16, &c__1);
+ t16[0] = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ t16[5] = tl[(tl_dim1 << 1) + 2] + sgn * tr[tr_dim1 + 1];
+ t16[10] = tl[tl_dim1 + 1] + sgn * tr[(tr_dim1 << 1) + 2];
+ t16[15] = tl[(tl_dim1 << 1) + 2] + sgn * tr[(tr_dim1 << 1) + 2];
+ if (*ltranl) {
+ t16[4] = tl[tl_dim1 + 2];
+ t16[1] = tl[(tl_dim1 << 1) + 1];
+ t16[14] = tl[tl_dim1 + 2];
+ t16[11] = tl[(tl_dim1 << 1) + 1];
+ } else {
+ t16[4] = tl[(tl_dim1 << 1) + 1];
+ t16[1] = tl[tl_dim1 + 2];
+ t16[14] = tl[(tl_dim1 << 1) + 1];
+ t16[11] = tl[tl_dim1 + 2];
+ }
+ if (*ltranr) {
+ t16[8] = sgn * tr[(tr_dim1 << 1) + 1];
+ t16[13] = sgn * tr[(tr_dim1 << 1) + 1];
+ t16[2] = sgn * tr[tr_dim1 + 2];
+ t16[7] = sgn * tr[tr_dim1 + 2];
+ } else {
+ t16[8] = sgn * tr[tr_dim1 + 2];
+ t16[13] = sgn * tr[tr_dim1 + 2];
+ t16[2] = sgn * tr[(tr_dim1 << 1) + 1];
+ t16[7] = sgn * tr[(tr_dim1 << 1) + 1];
+ }
+ btmp[0] = b[b_dim1 + 1];
+ btmp[1] = b[b_dim1 + 2];
+ btmp[2] = b[(b_dim1 << 1) + 1];
+ btmp[3] = b[(b_dim1 << 1) + 2];
+
+/* Perform elimination */
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ xmax = 0.;
+ for (ip = i__; ip <= 4; ++ip) {
+ for (jp = i__; jp <= 4; ++jp) {
+ if ((d__1 = t16[ip + (jp << 2) - 5], abs(d__1)) >= xmax) {
+ xmax = (d__1 = t16[ip + (jp << 2) - 5], abs(d__1));
+ ipsv = ip;
+ jpsv = jp;
+ }
+/* L60: */
+ }
+/* L70: */
+ }
+ if (ipsv != i__) {
+ dswap_(&c__4, &t16[ipsv - 1], &c__4, &t16[i__ - 1], &c__4);
+ temp = btmp[i__ - 1];
+ btmp[i__ - 1] = btmp[ipsv - 1];
+ btmp[ipsv - 1] = temp;
+ }
+ if (jpsv != i__) {
+ dswap_(&c__4, &t16[(jpsv << 2) - 4], &c__1, &t16[(i__ << 2) - 4],
+ &c__1);
+ }
+ jpiv[i__ - 1] = jpsv;
+ if ((d__1 = t16[i__ + (i__ << 2) - 5], abs(d__1)) < smin) {
+ *info = 1;
+ t16[i__ + (i__ << 2) - 5] = smin;
+ }
+ for (j = i__ + 1; j <= 4; ++j) {
+ t16[j + (i__ << 2) - 5] /= t16[i__ + (i__ << 2) - 5];
+ btmp[j - 1] -= t16[j + (i__ << 2) - 5] * btmp[i__ - 1];
+ for (k = i__ + 1; k <= 4; ++k) {
+ t16[j + (k << 2) - 5] -= t16[j + (i__ << 2) - 5] * t16[i__ + (
+ k << 2) - 5];
+/* L80: */
+ }
+/* L90: */
+ }
+/* L100: */
+ }
+ if (abs(t16[15]) < smin) {
+ t16[15] = smin;
+ }
+ *scale = 1.;
+ if (smlnum * 8. * abs(btmp[0]) > abs(t16[0]) || smlnum * 8. * abs(btmp[1])
+ > abs(t16[5]) || smlnum * 8. * abs(btmp[2]) > abs(t16[10]) ||
+ smlnum * 8. * abs(btmp[3]) > abs(t16[15])) {
+/* Computing MAX */
+ d__1 = abs(btmp[0]), d__2 = abs(btmp[1]), d__1 = max(d__1,d__2), d__2
+ = abs(btmp[2]), d__1 = max(d__1,d__2), d__2 = abs(btmp[3]);
+ *scale = .125 / max(d__1,d__2);
+ btmp[0] *= *scale;
+ btmp[1] *= *scale;
+ btmp[2] *= *scale;
+ btmp[3] *= *scale;
+ }
+ for (i__ = 1; i__ <= 4; ++i__) {
+ k = 5 - i__;
+ temp = 1. / t16[k + (k << 2) - 5];
+ tmp[k - 1] = btmp[k - 1] * temp;
+ for (j = k + 1; j <= 4; ++j) {
+ tmp[k - 1] -= temp * t16[k + (j << 2) - 5] * tmp[j - 1];
+/* L110: */
+ }
+/* L120: */
+ }
+ for (i__ = 1; i__ <= 3; ++i__) {
+ if (jpiv[4 - i__ - 1] != 4 - i__) {
+ temp = tmp[4 - i__ - 1];
+ tmp[4 - i__ - 1] = tmp[jpiv[4 - i__ - 1] - 1];
+ tmp[jpiv[4 - i__ - 1] - 1] = temp;
+ }
+/* L130: */
+ }
+ x[x_dim1 + 1] = tmp[0];
+ x[x_dim1 + 2] = tmp[1];
+ x[(x_dim1 << 1) + 1] = tmp[2];
+ x[(x_dim1 << 1) + 2] = tmp[3];
+/* Computing MAX */
+ d__1 = abs(tmp[0]) + abs(tmp[2]), d__2 = abs(tmp[1]) + abs(tmp[3]);
+ *xnorm = max(d__1,d__2);
+ return 0;
+
+/* End of DLASY2 */
+
+} /* dlasy2_ */
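+
+/*
+ Illustrative usage sketch, not part of the LAPACK translation above:
+ solving the 2-by-2 Sylvester equation TL*X + X*TR = SCALE*B with
+ dlasy2_. All matrices are stored column-major with leading
+ dimension 2; the numeric values are hypothetical.
+
+     logical ltranl = FALSE_, ltranr = FALSE_;
+     integer isgn = 1, n1 = 2, n2 = 2, ld = 2, info;
+     doublereal tl[4] = { 4., 0., 1., 3. };
+     doublereal tr[4] = { 2., 0., 0., 1. };
+     doublereal b[4]  = { 1., 2., 3., 4. };
+     doublereal x[4], scale, xnorm;
+     dlasy2_(&ltranl, &ltranr, &isgn, &n1, &n2, tl, &ld, tr, &ld,
+             b, &ld, &scale, x, &ld, &xnorm, &info);
+
+ On return X satisfies TL*X + X*TR = SCALE*B, with SCALE <= 1 chosen
+ to avoid overflow and XNORM equal to the infinity-norm of X.
+*/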
+
+/* Subroutine */ int dlatrd_(char *uplo, integer *n, integer *nb, doublereal *
+ a, integer *lda, doublereal *e, doublereal *tau, doublereal *w,
+ integer *ldw)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, iw;
+ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ static doublereal alpha;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *), daxpy_(integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *),
+ dsymv_(char *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *), dlarfg_(integer *, doublereal *, doublereal *, integer *,
+ doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLATRD reduces NB rows and columns of a real symmetric matrix A to
+ symmetric tridiagonal form by an orthogonal similarity
+ transformation Q' * A * Q, and returns the matrices V and W which are
+ needed to apply the transformation to the unreduced part of A.
+
+ If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
+ matrix, of which the upper triangle is supplied;
+ if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
+ matrix, of which the lower triangle is supplied.
+
+ This is an auxiliary routine called by DSYTRD.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ NB (input) INTEGER
+ The number of rows and columns to be reduced.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit:
+ if UPLO = 'U', the last NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+ the diagonal elements of A; the elements above the diagonal,
+ together with the array TAU, represent the orthogonal matrix Q as
+ a product of elementary reflectors;
+ if UPLO = 'L', the first NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+ the diagonal elements of A; the elements below the diagonal,
+ together with the array TAU, represent the orthogonal matrix Q as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ E (output) DOUBLE PRECISION array, dimension (N-1)
+ If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
+ elements of the last NB columns of the reduced matrix;
+ if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
+ the first NB columns of the reduced matrix.
+
+ TAU (output) DOUBLE PRECISION array, dimension (N-1)
+ The scalar factors of the elementary reflectors, stored in
+ TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
+ See Further Details.
+
+ W (output) DOUBLE PRECISION array, dimension (LDW,NB)
+ The n-by-nb matrix W required to update the unreduced part
+ of A.
+
+ LDW (input) INTEGER
+ The leading dimension of the array W. LDW >= max(1,N).
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n) H(n-1) . . . H(n-nb+1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
+ and tau in TAU(i-1).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
+ and tau in TAU(i).
+
+ The elements of the vectors v together form the n-by-nb matrix V
+ which is needed, with W, to apply the transformation to the unreduced
+ part of the matrix, using a symmetric rank-2k update of the form:
+ A := A - V*W' - W*V'.
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5 and nb = 2:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( a a a v4 v5 ) ( d )
+ ( a a v4 v5 ) ( 1 d )
+ ( a 1 v5 ) ( v1 1 a )
+ ( d 1 ) ( v1 v2 a a )
+ ( d ) ( v1 v2 a a a )
+
+ where d denotes a diagonal element of the reduced matrix, a denotes
+ an element of the original matrix that is unchanged, and vi denotes
+ an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --e;
+ --tau;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(uplo, "U")) {
+
+/* Reduce last NB columns of upper triangle */
+
+ i__1 = *n - *nb + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ iw = i__ - *n + *nb;
+ if (i__ < *n) {
+
+/* Update A(1:i,i) */
+
+ i__2 = *n - i__;
+ dgemv_("No transpose", &i__, &i__2, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &w[i__ + (iw + 1) * w_dim1], ldw, &
+ c_b15, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ dgemv_("No transpose", &i__, &i__2, &c_b151, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b15, &a[i__ * a_dim1 + 1], &c__1);
+ }
+ if (i__ > 1) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(1:i-2,i)
+*/
+
+ i__2 = i__ - 1;
+ dlarfg_(&i__2, &a[i__ - 1 + i__ * a_dim1], &a[i__ * a_dim1 +
+ 1], &c__1, &tau[i__ - 1]);
+ e[i__ - 1] = a[i__ - 1 + i__ * a_dim1];
+ a[i__ - 1 + i__ * a_dim1] = 1.;
+
+/* Compute W(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ dsymv_("Upper", &i__2, &c_b15, &a[a_offset], lda, &a[i__ *
+ a_dim1 + 1], &c__1, &c_b29, &w[iw * w_dim1 + 1], &
+ c__1);
+ if (i__ < *n) {
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &a[i__ * a_dim1 + 1], &c__1, &
+ c_b29, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[(i__ + 1)
+ * a_dim1 + 1], lda, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b15, &w[iw * w_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ * a_dim1 + 1], &c__1, &
+ c_b29, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &w[(iw + 1)
+ * w_dim1 + 1], ldw, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b15, &w[iw * w_dim1 + 1], &c__1);
+ }
+ i__2 = i__ - 1;
+ dscal_(&i__2, &tau[i__ - 1], &w[iw * w_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ alpha = tau[i__ - 1] * -.5 * ddot_(&i__2, &w[iw * w_dim1 + 1],
+ &c__1, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ daxpy_(&i__2, &alpha, &a[i__ * a_dim1 + 1], &c__1, &w[iw *
+ w_dim1 + 1], &c__1);
+ }
+
+/* L10: */
+ }
+ } else {
+
+/* Reduce first NB columns of lower triangle */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:n,i) */
+
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + a_dim1],
+ lda, &w[i__ + w_dim1], ldw, &c_b15, &a[i__ + i__ * a_dim1]
+ , &c__1);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &w[i__ + w_dim1],
+ ldw, &a[i__ + a_dim1], lda, &c_b15, &a[i__ + i__ * a_dim1]
+ , &c__1);
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:n,i)
+*/
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*n) +
+ i__ * a_dim1], &c__1, &tau[i__]);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.;
+
+/* Compute W(i+1:n,i) */
+
+ i__2 = *n - i__;
+ dsymv_("Lower", &i__2, &c_b15, &a[i__ + 1 + (i__ + 1) *
+ a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b29, &w[i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &w[i__ + 1 + w_dim1]
+ , ldw, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &w[
+ i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &w[i__ * w_dim1 + 1], &c__1, &c_b15, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + a_dim1]
+ , lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &w[
+ i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &w[i__ + 1 +
+ w_dim1], ldw, &w[i__ * w_dim1 + 1], &c__1, &c_b15, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ dscal_(&i__2, &tau[i__], &w[i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ alpha = tau[i__] * -.5 * ddot_(&i__2, &w[i__ + 1 + i__ *
+ w_dim1], &c__1, &a[i__ + 1 + i__ * a_dim1], &c__1);
+ i__2 = *n - i__;
+ daxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ }
+
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of DLATRD */
+
+} /* dlatrd_ */
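+
+/*
+ Illustrative sketch, not part of the LAPACK translation above: the
+ Further Details section describes how a caller such as dsytrd_ uses
+ the V (stored in A) and W returned by dlatrd_ in the symmetric
+ rank-2k update A := A - V*W' - W*V' of the trailing submatrix. With
+ UPLO = 'L' and the same 1-based, offset-adjusted pointers used in
+ this file, that update is a single BLAS call; the variables n, nb,
+ lda, ldw, a_dim1, w_dim1, a and w are hypothetical stand-ins for the
+ caller's own.
+
+     integer nr = n - nb;
+     doublereal one = 1., neg_one = -1.;
+     dsyr2k_("Lower", "No transpose", &nr, &nb, &neg_one,
+             &a[nb + 1 + a_dim1], &lda, &w[nb + 1 + w_dim1], &ldw,
+             &one, &a[nb + 1 + (nb + 1) * a_dim1], &lda);
+*/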
+
+/* Subroutine */ int dlauu2_(char *uplo, integer *n, doublereal *a, integer *
+ lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal aii;
+ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAUU2 computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAUU2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ aii = a[i__ + i__ * a_dim1];
+ if (i__ < *n) {
+ i__2 = *n - i__ + 1;
+ a[i__ + i__ * a_dim1] = ddot_(&i__2, &a[i__ + i__ * a_dim1],
+ lda, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ dgemv_("No transpose", &i__2, &i__3, &c_b15, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ aii, &a[i__ * a_dim1 + 1], &c__1);
+ } else {
+ dscal_(&i__, &aii, &a[i__ * a_dim1 + 1], &c__1);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ aii = a[i__ + i__ * a_dim1];
+ if (i__ < *n) {
+ i__2 = *n - i__ + 1;
+ a[i__ + i__ * a_dim1] = ddot_(&i__2, &a[i__ + i__ * a_dim1], &
+ c__1, &a[i__ + i__ * a_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ dgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + a_dim1]
+ , lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &aii, &a[
+ i__ + a_dim1], lda);
+ } else {
+ dscal_(&i__, &aii, &a[i__ + a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of DLAUU2 */
+
+} /* dlauu2_ */
+
+/* Subroutine */ int dlauum_(char *uplo, integer *n, doublereal *a, integer *
+ lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, ib, nb;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *), dlauu2_(char *, integer *,
+ doublereal *, integer *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DLAUUM computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DLAUUM", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "DLAUUM", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ dlauu2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ dtrmm_("Right", "Upper", "Transpose", "Non-unit", &i__3, &ib,
+ &c_b15, &a[i__ + i__ * a_dim1], lda, &a[i__ * a_dim1
+ + 1], lda)
+ ;
+ dlauu2_("Upper", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ dgemm_("No transpose", "Transpose", &i__3, &ib, &i__4, &
+ c_b15, &a[(i__ + ib) * a_dim1 + 1], lda, &a[i__ +
+ (i__ + ib) * a_dim1], lda, &c_b15, &a[i__ *
+ a_dim1 + 1], lda);
+ i__3 = *n - i__ - ib + 1;
+ dsyrk_("Upper", "No transpose", &ib, &i__3, &c_b15, &a[
+ i__ + (i__ + ib) * a_dim1], lda, &c_b15, &a[i__ +
+ i__ * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ dtrmm_("Left", "Lower", "Transpose", "Non-unit", &ib, &i__3, &
+ c_b15, &a[i__ + i__ * a_dim1], lda, &a[i__ + a_dim1],
+ lda);
+ dlauu2_("Lower", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ dgemm_("Transpose", "No transpose", &ib, &i__3, &i__4, &
+ c_b15, &a[i__ + ib + i__ * a_dim1], lda, &a[i__ +
+ ib + a_dim1], lda, &c_b15, &a[i__ + a_dim1], lda);
+ i__3 = *n - i__ - ib + 1;
+ dsyrk_("Lower", "Transpose", &ib, &i__3, &c_b15, &a[i__ +
+ ib + i__ * a_dim1], lda, &c_b15, &a[i__ + i__ *
+ a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DLAUUM */
+
+} /* dlauum_ */
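+
+/*
+ Illustrative usage sketch, not part of the LAPACK translation above:
+ overwriting the upper triangle of a column-major array with U * U',
+ where U is the 3-by-3 upper triangular factor already stored in A.
+ The values are hypothetical; INFO returns 0 on success.
+
+     integer n = 3, lda = 3, info;
+     doublereal a[9] = { 1., 0., 0.,  2., 4., 0.,  3., 5., 6. };
+     dlauum_("Upper", &n, a, &lda, &info);
+*/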
+
+/* Subroutine */ int dorg2r_(integer *m, integer *n, integer *k, doublereal *
+ a, integer *lda, doublereal *tau, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *), dlarf_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORG2R generates an m by n real matrix Q with orthonormal columns,
+ which is defined as the first n columns of a product of k elementary
+ reflectors of order m
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by DGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by DGEQRF in the first k columns of its array
+ argument A.
+ On exit, the m-by-n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEQRF.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORG2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+/* Initialise columns k+1:n to columns of the unit matrix */
+
+ i__1 = *n;
+ for (j = *k + 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = 1; l <= i__2; ++l) {
+ a[l + j * a_dim1] = 0.;
+/* L10: */
+ }
+ a[j + j * a_dim1] = 1.;
+/* L20: */
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i) to A(i:m,i:n) from the left */
+
+ if (i__ < *n) {
+ a[i__ + i__ * a_dim1] = 1.;
+ i__1 = *m - i__ + 1;
+ i__2 = *n - i__;
+ dlarf_("Left", &i__1, &i__2, &a[i__ + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ }
+ if (i__ < *m) {
+ i__1 = *m - i__;
+ d__1 = -tau[i__];
+ dscal_(&i__1, &d__1, &a[i__ + 1 + i__ * a_dim1], &c__1);
+ }
+ a[i__ + i__ * a_dim1] = 1. - tau[i__];
+
+/* Set A(1:i-1,i) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ a[l + i__ * a_dim1] = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of DORG2R */
+
+} /* dorg2r_ */
+
+/* Subroutine */ int dorgbr_(char *vect, integer *m, integer *n, integer *k,
+ doublereal *a, integer *lda, doublereal *tau, doublereal *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, nb, mn;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical wantq;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dorglq_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *), dorgqr_(integer *, integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORGBR generates one of the real orthogonal matrices Q or P**T
+ determined by DGEBRD when reducing a real matrix A to bidiagonal
+ form: A = Q * B * P**T. Q and P**T are defined as products of
+ elementary reflectors H(i) or G(i) respectively.
+
+ If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+ is of order M:
+ if m >= k, Q = H(1) H(2) . . . H(k) and DORGBR returns the first n
+ columns of Q, where m >= n >= k;
+ if m < k, Q = H(1) H(2) . . . H(m-1) and DORGBR returns Q as an
+ M-by-M matrix.
+
+ If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**T
+ is of order N:
+ if k < n, P**T = G(k) . . . G(2) G(1) and DORGBR returns the first m
+ rows of P**T, where n >= m >= k;
+ if k >= n, P**T = G(n-1) . . . G(2) G(1) and DORGBR returns P**T as
+ an N-by-N matrix.
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ Specifies whether the matrix Q or the matrix P**T is
+ required, as defined in the transformation applied by DGEBRD:
+ = 'Q': generate Q;
+ = 'P': generate P**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix Q or P**T to be returned.
+ M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q or P**T to be returned.
+ N >= 0.
+ If VECT = 'Q', M >= N >= min(M,K);
+ if VECT = 'P', N >= M >= min(N,K).
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original M-by-K
+ matrix reduced by DGEBRD.
+ If VECT = 'P', the number of rows in the original K-by-N
+ matrix reduced by DGEBRD.
+ K >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by DGEBRD.
+ On exit, the M-by-N matrix Q or P**T.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) DOUBLE PRECISION array, dimension
+ (min(M,K)) if VECT = 'Q'
+ (min(N,K)) if VECT = 'P'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i), which determines Q or P**T, as
+ returned by DGEBRD in its array argument TAUQ or TAUP.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,min(M,N)).
+ For optimum performance LWORK >= min(M,N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(vect, "Q");
+ mn = min(*m,*n);
+ lquery = *lwork == -1;
+ if (! wantq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0 || wantq && (*n > *m || *n < min(*m,*k)) || ! wantq && (
+ *m > *n || *m < min(*n,*k))) {
+ *info = -3;
+ } else if (*k < 0) {
+ *info = -4;
+ } else if (*lda < max(1,*m)) {
+ *info = -6;
+ } else if (*lwork < max(1,mn) && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+ if (wantq) {
+ nb = ilaenv_(&c__1, "DORGQR", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ } else {
+ nb = ilaenv_(&c__1, "DORGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ }
+ lwkopt = max(1,mn) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORGBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ if (wantq) {
+
+/*
+ Form Q, determined by a call to DGEBRD to reduce an m-by-k
+ matrix
+*/
+
+ if (*m >= *k) {
+
+/* If m >= k, assume m >= n >= k */
+
+ dorgqr_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If m < k, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first row and column of Q
+ to those of the unit matrix
+*/
+
+ for (j = *m; j >= 2; --j) {
+ a[j * a_dim1 + 1] = 0.;
+ i__1 = *m;
+ for (i__ = j + 1; i__ <= i__1; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + (j - 1) * a_dim1];
+/* L10: */
+ }
+/* L20: */
+ }
+ a[a_dim1 + 1] = 1.;
+ i__1 = *m;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ a[i__ + a_dim1] = 0.;
+/* L30: */
+ }
+ if (*m > 1) {
+
+/* Form Q(2:m,2:m) */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ dorgqr_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ } else {
+
+/*
+ Form P', determined by a call to DGEBRD to reduce a k-by-n
+ matrix
+*/
+
+ if (*k < *n) {
+
+/* If k < n, assume k <= m <= n */
+
+ dorglq_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If k >= n, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ row downward, and set the first row and column of P' to
+ those of the unit matrix
+*/
+
+ a[a_dim1 + 1] = 1.;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ a[i__ + a_dim1] = 0.;
+/* L40: */
+ }
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ for (i__ = j - 1; i__ >= 2; --i__) {
+ a[i__ + j * a_dim1] = a[i__ - 1 + j * a_dim1];
+/* L50: */
+ }
+ a[j * a_dim1 + 1] = 0.;
+/* L60: */
+ }
+ if (*n > 1) {
+
+/* Form P'(2:n,2:n) */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ dorglq_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORGBR */
+
+} /* dorgbr_ */
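+
+/*
+ Illustrative usage sketch, not part of the LAPACK translation above:
+ the LWORK = -1 workspace query supported by dorgbr_ (and the other
+ blocked routines in this file). A first call returns the optimal
+ workspace size in WORK(1); the real call then uses a buffer of that
+ size. The variables m, n, k, a, lda and tau are hypothetical and
+ would normally come from a preceding call to dgebrd_; stdlib
+ malloc and free are assumed available.
+
+     integer lwork = -1, info;
+     doublereal wkopt, *work;
+     dorgbr_("Q", &m, &n, &k, a, &lda, tau, &wkopt, &lwork, &info);
+     lwork = (integer) wkopt;
+     work = (doublereal *) malloc(lwork * sizeof(doublereal));
+     dorgbr_("Q", &m, &n, &k, a, &lda, tau, work, &lwork, &info);
+     free(work);
+*/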
+
+/* Subroutine */ int dorghr_(integer *n, integer *ilo, integer *ihi,
+ doublereal *a, integer *lda, doublereal *tau, doublereal *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, nb, nh, iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORGHR generates a real orthogonal matrix Q which is defined as the
+ product of IHI-ILO elementary reflectors of order N, as returned by
+ DGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix Q. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of DGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by DGEHRD.
+ On exit, the N-by-N orthogonal matrix Q.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (input) DOUBLE PRECISION array, dimension (N-1)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEHRD.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= IHI-ILO.
+ For optimum performance LWORK >= (IHI-ILO)*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,nh) && ! lquery) {
+ *info = -8;
+ }
+
+ if (*info == 0) {
+ nb = ilaenv_(&c__1, "DORGQR", " ", &nh, &nh, &nh, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ lwkopt = max(1,nh) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORGHR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+/*
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first ilo and the last n-ihi
+ rows and columns to those of the unit matrix
+*/
+
+ i__1 = *ilo + 1;
+ for (j = *ihi; j >= i__1; --j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.;
+/* L10: */
+ }
+ i__2 = *ihi;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + (j - 1) * a_dim1];
+/* L20: */
+ }
+ i__2 = *n;
+ for (i__ = *ihi + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+ i__1 = *ilo;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.;
+/* L50: */
+ }
+ a[j + j * a_dim1] = 1.;
+/* L60: */
+ }
+ i__1 = *n;
+ for (j = *ihi + 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.;
+/* L70: */
+ }
+ a[j + j * a_dim1] = 1.;
+/* L80: */
+ }
+
+ if (nh > 0) {
+
+/* Generate Q(ilo+1:ihi,ilo+1:ihi) */
+
+ dorgqr_(&nh, &nh, &nh, &a[*ilo + 1 + (*ilo + 1) * a_dim1], lda, &tau[*
+ ilo], &work[1], lwork, &iinfo);
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORGHR */
+
+} /* dorghr_ */
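+
+/*
+ Illustrative usage sketch, not part of the LAPACK translation above:
+ forming the orthogonal matrix Q of a Hessenberg reduction explicitly.
+ dgehrd_ leaves the reflectors below the Hessenberg part of A and the
+ scalar factors in TAU; dorghr_ then overwrites A with Q. The sizes
+ are hypothetical, and a is assumed to already hold the 5-by-5 matrix
+ to be reduced.
+
+     integer n = 5, ilo = 1, ihi = 5, lda = 5, lwork = 5 * 64, info;
+     doublereal a[25], tau[4], work[5 * 64];
+     dgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+     dorghr_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+*/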
+
+/* Subroutine */ int dorgl2_(integer *m, integer *n, integer *k, doublereal *
+ a, integer *lda, doublereal *tau, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *), dlarf_(char *, integer *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORGL2 generates an m by n real matrix Q with orthonormal rows,
+ which is defined as the first m rows of a product of k elementary
+ reflectors of order n
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by DGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by DGELQF in the first k rows of its array argument A.
+ On exit, the m-by-n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGELQF.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORGL2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ return 0;
+ }
+
+ if (*k < *m) {
+
+/* Initialise rows k+1:m to rows of the unit matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = *k + 1; l <= i__2; ++l) {
+ a[l + j * a_dim1] = 0.;
+/* L10: */
+ }
+ if (j > *k && j <= *m) {
+ a[j + j * a_dim1] = 1.;
+ }
+/* L20: */
+ }
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i) to A(i:m,i:n) from the right */
+
+ if (i__ < *n) {
+ if (i__ < *m) {
+ a[i__ + i__ * a_dim1] = 1.;
+ i__1 = *m - i__;
+ i__2 = *n - i__ + 1;
+ dlarf_("Right", &i__1, &i__2, &a[i__ + i__ * a_dim1], lda, &
+ tau[i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__1 = *n - i__;
+ d__1 = -tau[i__];
+ dscal_(&i__1, &d__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+ a[i__ + i__ * a_dim1] = 1. - tau[i__];
+
+/* Set A(i,1:i-1) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ a[i__ + l * a_dim1] = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of DORGL2 */
+
+} /* dorgl2_ */
+
+/* Subroutine */ int dorglq_(integer *m, integer *n, integer *k, doublereal *
+ a, integer *lda, doublereal *tau, doublereal *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int dorgl2_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *),
+ dlarfb_(char *, char *, char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORGLQ generates an M-by-N real matrix Q with orthonormal rows,
+ which is defined as the first M rows of a product of K elementary
+ reflectors of order N
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by DGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by DGELQF in the first k rows of its array argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGELQF.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "DORGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*m) * nb;
+ work[1] = (doublereal) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORGLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "DORGLQ", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DORGLQ", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk rows are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(kk+1:m,1:kk) to zero. */
+
+ i__1 = kk;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = kk + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *m) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ dorgl2_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *n - i__ + 1;
+ dlarft_("Forward", "Rowwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i+ib:m,i:n) from the right */
+
+ i__2 = *m - i__ - ib + 1;
+ i__3 = *n - i__ + 1;
+ dlarfb_("Right", "Transpose", "Forward", "Rowwise", &i__2, &
+ i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib +
+ 1], &ldwork);
+ }
+
+/* Apply H' to columns i:n of current block */
+
+ i__2 = *n - i__ + 1;
+ dorgl2_(&ib, &i__2, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set columns 1:i-1 of current block to zero */
+
+ i__2 = i__ - 1;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = i__ + ib - 1;
+ for (l = i__; l <= i__3; ++l) {
+ a[l + j * a_dim1] = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1] = (doublereal) iws;
+ return 0;
+
+/* End of DORGLQ */
+
+} /* dorglq_ */
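+/*
+   Usage sketch (added for illustration, not part of the translated LAPACK
+   source): the two-phase LWORK = -1 workspace query described in the DORGLQ
+   comment block above.  It assumes the f2c typedefs (integer, doublereal)
+   declared in this translation unit, that <stdlib.h> is available for
+   malloc/free, and that `a` and `tau` already hold the output of dgelqf_
+   for an m-by-n matrix.  Guarded by #if 0 so it is never compiled into the
+   library.
+*/
+#if 0
+static int form_q_from_lq(integer m, integer n, integer k,
+                          doublereal *a, integer lda, doublereal *tau)
+{
+    integer info, lwork = -1;
+    doublereal wkopt, *work;
+
+    /* Phase 1: query -- the optimal LWORK is returned in WORK(1). */
+    dorglq_(&m, &n, &k, a, &lda, tau, &wkopt, &lwork, &info);
+    if (info != 0) {
+        return (int) info;
+    }
+
+    /* Phase 2: allocate the reported size and form the m-by-n Q in place. */
+    lwork = (integer) wkopt;
+    work = (doublereal *) malloc((size_t) lwork * sizeof(doublereal));
+    if (work == NULL) {
+        return -1;
+    }
+    dorglq_(&m, &n, &k, a, &lda, tau, work, &lwork, &info);
+    free(work);
+    return (int) info;
+}
+#endif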
+
+/* Subroutine */ int dorgqr_(integer *m, integer *n, integer *k, doublereal *
+ a, integer *lda, doublereal *tau, doublereal *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int dorg2r_(integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *),
+ dlarfb_(char *, char *, char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORGQR generates an M-by-N real matrix Q with orthonormal columns,
+ which is defined as the first N columns of a product of K elementary
+ reflectors of order M
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by DGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by DGEQRF in the first k columns of its array
+ argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEQRF.
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "DORGQR", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*n) * nb;
+ work[1] = (doublereal) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORGQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "DORGQR", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DORGQR", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk columns are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(1:kk,kk+1:n) to zero. */
+
+ i__1 = *n;
+ for (j = kk + 1; j <= i__1; ++j) {
+ i__2 = kk;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *n) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ dorg2r_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *m - i__ + 1;
+ dlarft_("Forward", "Columnwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i:m,i+ib:n) from the left */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__ - ib + 1;
+ dlarfb_("Left", "No transpose", "Forward", "Columnwise", &
+ i__2, &i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[
+ 1], &ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &
+ work[ib + 1], &ldwork);
+ }
+
+/* Apply H to rows i:m of current block */
+
+ i__2 = *m - i__ + 1;
+ dorg2r_(&i__2, &ib, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set rows 1:i-1 of current block to zero */
+
+ i__2 = i__ + ib - 1;
+ for (j = i__; j <= i__2; ++j) {
+ i__3 = i__ - 1;
+ for (l = 1; l <= i__3; ++l) {
+ a[l + j * a_dim1] = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1] = (doublereal) iws;
+ return 0;
+
+/* End of DORGQR */
+
+} /* dorgqr_ */
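+/*
+   Usage sketch (added for illustration, not part of the translated LAPACK
+   source): forming the thin M-by-N factor Q explicitly by pairing dgeqrf_
+   with dorgqr_, as described in the DORGQR comment block above.  It assumes
+   m >= n, the f2c typedefs from this translation unit, that the caller
+   supplies LWORK >= max(1,n) (ideally n*NB) doublereals of workspace, and
+   that dgeqrf_ is available -- it is a standard LAPACK routine, though its
+   presence in this file is not shown here.  Guarded by #if 0 so it is never
+   compiled.
+*/
+#if 0
+static int thin_qr_q(integer m, integer n, doublereal *a, integer lda,
+                     doublereal *tau, doublereal *work, integer lwork,
+                     integer *info)
+{
+    /* A (m-by-n, lda >= m) is overwritten with R and the reflectors. */
+    dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
+    if (*info != 0) {
+        return (int) *info;
+    }
+    /* Overwrite A with the first n columns of Q, using k = n reflectors. */
+    dorgqr_(&m, &n, &n, a, &lda, tau, work, &lwork, info);
+    return (int) *info;
+}
+#endif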
+
+/* Subroutine */ int dorm2l_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, mi, ni, nq;
+ static doublereal aii;
+ static logical left;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORM2L overwrites the general real m by n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'T', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'T',
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by DGEQLF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'T': apply Q' (Transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ DGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEQLF.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORM2L", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) is applied to C(1:m-k+i,1:n) */
+
+ mi = *m - *k + i__;
+ } else {
+
+/* H(i) is applied to C(1:m,1:n-k+i) */
+
+ ni = *n - *k + i__;
+ }
+
+/* Apply H(i) */
+
+ aii = a[nq - *k + i__ + i__ * a_dim1];
+ a[nq - *k + i__ + i__ * a_dim1] = 1.;
+ dlarf_(side, &mi, &ni, &a[i__ * a_dim1 + 1], &c__1, &tau[i__], &c__[
+ c_offset], ldc, &work[1]);
+ a[nq - *k + i__ + i__ * a_dim1] = aii;
+/* L10: */
+ }
+ return 0;
+
+/* End of DORM2L */
+
+} /* dorm2l_ */
+
+/* Subroutine */ int dorm2r_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static doublereal aii;
+ static logical left;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORM2R overwrites the general real m by n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'T', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'T',
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by DGEQRF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'T': apply Q' (Transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ DGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEQRF.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORM2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.;
+ dlarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], &c__1, &tau[i__], &c__[
+ ic + jc * c_dim1], ldc, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+/* L10: */
+ }
+ return 0;
+
+/* End of DORM2R */
+
+} /* dorm2r_ */
+
+/* Subroutine */ int dormbr_(char *vect, char *side, char *trans, integer *m,
+ integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau,
+ doublereal *c__, integer *ldc, doublereal *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2];
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *);
+ static logical notran;
+ extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *);
+ static logical applyq;
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ If VECT = 'Q', DORMBR overwrites the general real M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ If VECT = 'P', DORMBR overwrites the general real M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': P * C C * P
+ TRANS = 'T': P**T * C C * P**T
+
+ Here Q and P**T are the orthogonal matrices determined by DGEBRD when
+ reducing a real matrix A to bidiagonal form: A = Q * B * P**T. Q and
+ P**T are defined as products of elementary reflectors H(i) and G(i)
+ respectively.
+
+ Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+ order of the orthogonal matrix Q or P**T that is applied.
+
+ If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+ if nq >= k, Q = H(1) H(2) . . . H(k);
+ if nq < k, Q = H(1) H(2) . . . H(nq-1).
+
+ If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+ if k < nq, P = G(1) G(2) . . . G(k);
+ if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ = 'Q': apply Q or Q**T;
+ = 'P': apply P or P**T.
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q, Q**T, P or P**T from the Left;
+ = 'R': apply Q, Q**T, P or P**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q or P;
+ = 'T': Transpose, apply Q**T or P**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original
+ matrix reduced by DGEBRD.
+ If VECT = 'P', the number of rows in the original
+ matrix reduced by DGEBRD.
+ K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension
+ (LDA,min(nq,K)) if VECT = 'Q'
+ (LDA,nq) if VECT = 'P'
+ The vectors which define the elementary reflectors H(i) and
+ G(i), whose products determine the matrices Q and P, as
+ returned by DGEBRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If VECT = 'Q', LDA >= max(1,nq);
+ if VECT = 'P', LDA >= max(1,min(nq,K)).
+
+ TAU (input) DOUBLE PRECISION array, dimension (min(nq,K))
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i) which determines Q or P, as returned
+ by DGEBRD in the array argument TAUQ or TAUP.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q
+ or P*C or P**T*C or C*P or C*P**T.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ applyq = lsame_(vect, "Q");
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q or P and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! applyq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (! left && ! lsame_(side, "R")) {
+ *info = -2;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*k < 0) {
+ *info = -6;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = 1, i__2 = min(nq,*k);
+ if (applyq && *lda < max(1,nq) || ! applyq && *lda < max(i__1,i__2)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+ }
+
+ if (*info == 0) {
+ if (applyq) {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "DORMQR", ch__1, &i__1, n, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "DORMQR", ch__1, m, &i__1, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "DORMLQ", ch__1, &i__1, n, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "DORMLQ", ch__1, m, &i__1, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORMBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ work[1] = 1.;
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ if (applyq) {
+
+/* Apply Q */
+
+ if (nq >= *k) {
+
+/* Q was determined by a call to DGEBRD with nq >= k */
+
+ dormqr_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* Q was determined by a call to DGEBRD with nq < k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ dormqr_(side, trans, &mi, &ni, &i__1, &a[a_dim1 + 2], lda, &tau[1]
+ , &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ } else {
+
+/* Apply P */
+
+ if (notran) {
+ *(unsigned char *)transt = 'T';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+ if (nq > *k) {
+
+/* P was determined by a call to DGEBRD with nq > k */
+
+ dormlq_(side, transt, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* P was determined by a call to DGEBRD with nq <= k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ dormlq_(side, transt, &mi, &ni, &i__1, &a[(a_dim1 << 1) + 1], lda,
+ &tau[1], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &
+ iinfo);
+ }
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORMBR */
+
+} /* dormbr_ */
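+/*
+   Usage sketch (added for illustration, not part of the translated LAPACK
+   source): applying Q**T from a bidiagonal reduction to an m-by-nrhs matrix
+   C, i.e. C := Q**T * C, via VECT = 'Q', SIDE = 'L', TRANS = 'T' as laid out
+   in the DORMBR comment block above.  `a` and `tauq` are assumed to hold the
+   output of dgebrd_ for an m-by-n matrix with m >= n, `work` must provide
+   LWORK >= max(1,nrhs) doublereals, and the f2c typedefs from this
+   translation unit are assumed.  Guarded by #if 0 so it is never compiled.
+*/
+#if 0
+static int apply_qt_from_gebrd(integer m, integer n, integer nrhs,
+                               doublereal *a, integer lda, doublereal *tauq,
+                               doublereal *c, integer ldc,
+                               doublereal *work, integer lwork)
+{
+    integer info;
+    /* VECT = 'Q' selects the Q factor of A = Q * B * P**T; k = n reflectors
+       were generated because m >= n. */
+    dormbr_("Q", "L", "T", &m, &nrhs, &n, a, &lda, tauq,
+            c, &ldc, work, &lwork, &info);
+    return (int) info;
+}
+#endif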
+
+/* Subroutine */ int dormhr_(char *side, char *trans, integer *m, integer *n,
+ integer *ilo, integer *ihi, doublereal *a, integer *lda, doublereal *
+ tau, doublereal *c__, integer *ldc, doublereal *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, nh, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORMHR overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ IHI-ILO elementary reflectors, as returned by DGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of DGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ If SIDE = 'L', then 1 <= ILO <= IHI <= M, if M > 0, and
+ ILO = 1 and IHI = 0, if M = 0;
+ if SIDE = 'R', then 1 <= ILO <= IHI <= N, if N > 0, and
+ ILO = 1 and IHI = 0, if N = 0.
+
+ A (input) DOUBLE PRECISION array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by DGEHRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) DOUBLE PRECISION array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEHRD.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ left = lsame_(side, "L");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ilo < 1 || *ilo > max(1,nq)) {
+ *info = -5;
+ } else if (*ihi < min(*ilo,nq) || *ihi > nq) {
+ *info = -6;
+ } else if (*lda < max(1,nq)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+
+ if (*info == 0) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "DORMQR", ch__1, &nh, n, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "DORMQR", ch__1, m, &nh, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("DORMHR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nh == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ if (left) {
+ mi = nh;
+ ni = *n;
+ i1 = *ilo + 1;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = nh;
+ i1 = 1;
+ i2 = *ilo + 1;
+ }
+
+ dormqr_(side, trans, &mi, &ni, &nh, &a[*ilo + 1 + *ilo * a_dim1], lda, &
+ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORMHR */
+
+} /* dormhr_ */
+
+/* Subroutine */ int dorml2_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static doublereal aii;
+ static logical left;
+ extern /* Subroutine */ int dlarf_(char *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORML2 overwrites the general real m by n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'T', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'T',
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by DGELQF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'T': apply Q' (Transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ DGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGELQF.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORML2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.;
+ dlarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], lda, &tau[i__], &c__[
+ ic + jc * c_dim1], ldc, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+/* L10: */
+ }
+ return 0;
+
+/* End of DORML2 */
+
+} /* dorml2_ */
+
+/* Subroutine */ int dormlq_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublereal t[4160] /* was [65][64] */;
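+/*
+   Editorial note (not part of the translated LAPACK source): T holds the
+   triangular block-reflector factor with a leading dimension of 65 and at
+   most NBMAX = 64 columns (65 * 64 = 4160 doublereals); the same fixed
+   buffer and the constant c__65 passed to dlarft_/dlarfb_ appear again in
+   dormql_ and dormqr_ below.
+*/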
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int dorml2_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *), dlarfb_(char
+ *, char *, char *, char *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran;
+ static integer ldwork;
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORMLQ overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by DGELQF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ DGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGELQF.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "DORMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORMLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DORMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ dorml2_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ if (notran) {
+ *(unsigned char *)transt = 'T';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ dlarft_("Forward", "Rowwise", &i__4, &ib, &a[i__ + i__ * a_dim1],
+ lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ dlarfb_(side, transt, "Forward", "Rowwise", &mi, &ni, &ib, &a[i__
+ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc * c_dim1],
+ ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORMLQ */
+
+} /* dormlq_ */
+
+/* Subroutine */ int dormql_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublereal t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int dorm2l_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *), dlarfb_(char
+ *, char *, char *, char *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran;
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORMQL overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by DGEQLF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ DGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEQLF.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = max(1,*n);
+ } else {
+ nq = *n;
+ nw = max(1,*m);
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+
+ if (*info == 0) {
+ if (*m == 0 || *n == 0) {
+ lwkopt = 1;
+ } else {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where
+ NBMAX is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "DORMQL", ch__1, m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = nw * nb;
+ }
+ work[1] = (doublereal) lwkopt;
+
+ if (*lwork < nw && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORMQL", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DORMQL", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ dorm2l_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i+ib-1) . . . H(i+1) H(i)
+*/
+
+ i__4 = nq - *k + i__ + ib - 1;
+ dlarft_("Backward", "Columnwise", &i__4, &ib, &a[i__ * a_dim1 + 1]
+ , lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
+
+ mi = *m - *k + i__ + ib - 1;
+ } else {
+
+/* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
+
+ ni = *n - *k + i__ + ib - 1;
+ }
+
+/* Apply H or H' */
+
+ dlarfb_(side, trans, "Backward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ * a_dim1 + 1], lda, t, &c__65, &c__[c_offset], ldc, &
+ work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORMQL */
+
+} /* dormql_ */
+
+/* Subroutine */ int dormqr_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublereal t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int dorm2r_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *), dlarfb_(char
+ *, char *, char *, char *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal
+ *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran;
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORMQR overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ DGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) DOUBLE PRECISION array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DGEQRF.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "DORMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DORMQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "DORMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ dorm2r_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ dlarft_("Forward", "Columnwise", &i__4, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], t, &c__65)
+ ;
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ dlarfb_(side, trans, "Forward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc *
+ c_dim1], ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORMQR */
+
+} /* dormqr_ */
+
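+/*
+   Editorial sketch, not part of the original f2c source: the LWORK = -1
+   workspace-query convention documented above, followed by the real call,
+   looks roughly like this from C.  `integer' and `doublereal' are the
+   f2c.h typedefs used throughout this file; the helper name and the
+   assumption that A and TAU already hold a DGEQRF factorization are
+   hypothetical.
+*/
+#if 0
+static int apply_qt_from_left(integer m, integer n, integer k,
+                              doublereal *a, integer lda, doublereal *tau,
+                              doublereal *c, integer ldc,
+                              doublereal *work, integer lwork)
+{
+    integer info, query = -1;
+    doublereal wkopt;
+
+    /* Workspace query only: WORK(1) returns the optimal LWORK. */
+    dormqr_("L", "T", &m, &n, &k, a, &lda, tau, c, &ldc, &wkopt, &query, &info);
+    if (info != 0 || lwork < (integer) wkopt) {
+        return -1;              /* caller-supplied WORK is below the optimum */
+    }
+    /* Real call: overwrite C with Q**T * C. */
+    dormqr_("L", "T", &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info);
+    return (int) info;
+}
+#endif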
+/* Subroutine */ int dormtr_(char *side, char *uplo, char *trans, integer *m,
+ integer *n, doublereal *a, integer *lda, doublereal *tau, doublereal *
+ c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2, i__3;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dormql_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *),
+ dormqr_(char *, char *, integer *, integer *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DORMTR overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ nq-1 elementary reflectors, as returned by DSYTRD:
+
+ if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+
+ if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A contains elementary reflectors
+ from DSYTRD;
+ = 'L': Lower triangle of A contains elementary reflectors
+ from DSYTRD.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by DSYTRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) DOUBLE PRECISION array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by DSYTRD.
+
+ C (input/output) DOUBLE PRECISION array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ *info = -2;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+ if (upper) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "DORMQL", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "DORMQL", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "DORMQR", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "DORMQR", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("DORMTR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nq == 1) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ }
+
+ if (upper) {
+
+/* Q was determined by a call to DSYTRD with UPLO = 'U' */
+
+ i__2 = nq - 1;
+ dormql_(side, trans, &mi, &ni, &i__2, &a[(a_dim1 << 1) + 1], lda, &
+ tau[1], &c__[c_offset], ldc, &work[1], lwork, &iinfo);
+ } else {
+
+/* Q was determined by a call to DSYTRD with UPLO = 'L' */
+
+ if (left) {
+ i1 = 2;
+ i2 = 1;
+ } else {
+ i1 = 1;
+ i2 = 2;
+ }
+ i__2 = nq - 1;
+ dormqr_(side, trans, &mi, &ni, &i__2, &a[a_dim1 + 2], lda, &tau[1], &
+ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DORMTR */
+
+} /* dormtr_ */
+
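+/*
+   Editorial sketch, not part of the original source: DORMTR is the DSYTRD
+   counterpart of DORMQR above.  Given the reflectors that DSYTRD (assumed
+   to have been called already with UPLO = 'L') left in A and TAU, the call
+   below overwrites C with Q * C.  The helper name is hypothetical.
+*/
+#if 0
+static int apply_q_from_sytrd(integer m, integer n,
+                              doublereal *a, integer lda, doublereal *tau,
+                              doublereal *c, integer ldc,
+                              doublereal *work, integer lwork)
+{
+    integer info;
+    dormtr_("L", "L", "N", &m, &n, a, &lda, tau, c, &ldc, work, &lwork, &info);
+    return (int) info;
+}
+#endif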
+/* Subroutine */ int dpotf2_(char *uplo, integer *n, doublereal *a, integer *
+ lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer j;
+ static doublereal ajj;
+ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *);
+ static logical upper;
+ extern logical disnan_(doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DPOTF2 computes the Cholesky factorization of a real symmetric
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U' * U , if UPLO = 'U', or
+ A = L * L', if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is stored.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ n by n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U'*U or A = L*L'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, the leading minor of order k is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DPOTF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute U(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j - 1;
+ ajj = a[j + j * a_dim1] - ddot_(&i__2, &a[j * a_dim1 + 1], &c__1,
+ &a[j * a_dim1 + 1], &c__1);
+ if (ajj <= 0. || disnan_(&ajj)) {
+ a[j + j * a_dim1] = ajj;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ a[j + j * a_dim1] = ajj;
+
+/* Compute elements J+1:N of row J. */
+
+ if (j < *n) {
+ i__2 = j - 1;
+ i__3 = *n - j;
+ dgemv_("Transpose", &i__2, &i__3, &c_b151, &a[(j + 1) *
+ a_dim1 + 1], lda, &a[j * a_dim1 + 1], &c__1, &c_b15, &
+ a[j + (j + 1) * a_dim1], lda);
+ i__2 = *n - j;
+ d__1 = 1. / ajj;
+ dscal_(&i__2, &d__1, &a[j + (j + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute L(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j - 1;
+ ajj = a[j + j * a_dim1] - ddot_(&i__2, &a[j + a_dim1], lda, &a[j
+ + a_dim1], lda);
+ if (ajj <= 0. || disnan_(&ajj)) {
+ a[j + j * a_dim1] = ajj;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ a[j + j * a_dim1] = ajj;
+
+/* Compute elements J+1:N of column J. */
+
+ if (j < *n) {
+ i__2 = *n - j;
+ i__3 = j - 1;
+ dgemv_("No transpose", &i__2, &i__3, &c_b151, &a[j + 1 +
+ a_dim1], lda, &a[j + a_dim1], lda, &c_b15, &a[j + 1 +
+ j * a_dim1], &c__1);
+ i__2 = *n - j;
+ d__1 = 1. / ajj;
+ dscal_(&i__2, &d__1, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+ goto L40;
+
+L30:
+ *info = j;
+
+L40:
+ return 0;
+
+/* End of DPOTF2 */
+
+} /* dpotf2_ */
+
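+/*
+   Editorial sketch, not part of the original source: the upper-triangular
+   branch above is the textbook recurrence
+       U(j,j) = sqrt( A(j,j) - sum_{k<j} U(k,j)^2 )
+   followed by the update of row j.  A dependency-free scalar version of
+   the same recurrence on a column-major n-by-n array (0-based indices,
+   illustrative helper name) reads:
+*/
+#if 0
+static int cholesky_upper_scalar(integer n, doublereal *a, integer lda)
+{
+    integer i, j, k;
+    for (j = 0; j < n; ++j) {
+        doublereal ajj = a[j + j * lda];
+        for (k = 0; k < j; ++k)
+            ajj -= a[k + j * lda] * a[k + j * lda];
+        if (ajj <= 0.)
+            return (int) (j + 1);     /* leading minor j+1 not positive definite */
+        ajj = sqrt(ajj);
+        a[j + j * lda] = ajj;
+        for (i = j + 1; i < n; ++i) { /* elements J+1:N of row J */
+            doublereal s = a[j + i * lda];
+            for (k = 0; k < j; ++k)
+                s -= a[k + j * lda] * a[k + i * lda];
+            a[j + i * lda] = s / ajj;
+        }
+    }
+    return 0;
+}
+#endif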
+/* Subroutine */ int dpotrf_(char *uplo, integer *n, doublereal *a, integer *
+ lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer j, jb, nb;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *), dpotf2_(char *, integer *,
+ doublereal *, integer *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DPOTRF computes the Cholesky factorization of a real symmetric
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U**T * U, if UPLO = 'U', or
+ A = L * L**T, if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the block version of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U**T*U or A = L*L**T.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the leading minor of order i is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DPOTRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "DPOTRF", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code. */
+
+ dpotf2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code. */
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ dsyrk_("Upper", "Transpose", &jb, &i__3, &c_b151, &a[j *
+ a_dim1 + 1], lda, &c_b15, &a[j + j * a_dim1], lda);
+ dpotf2_("Upper", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block row. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ dgemm_("Transpose", "No transpose", &jb, &i__3, &i__4, &
+ c_b151, &a[j * a_dim1 + 1], lda, &a[(j + jb) *
+ a_dim1 + 1], lda, &c_b15, &a[j + (j + jb) *
+ a_dim1], lda);
+ i__3 = *n - j - jb + 1;
+ dtrsm_("Left", "Upper", "Transpose", "Non-unit", &jb, &
+ i__3, &c_b15, &a[j + j * a_dim1], lda, &a[j + (j
+ + jb) * a_dim1], lda);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ dsyrk_("Lower", "No transpose", &jb, &i__3, &c_b151, &a[j +
+ a_dim1], lda, &c_b15, &a[j + j * a_dim1], lda);
+ dpotf2_("Lower", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block column. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ dgemm_("No transpose", "Transpose", &i__3, &jb, &i__4, &
+ c_b151, &a[j + jb + a_dim1], lda, &a[j + a_dim1],
+ lda, &c_b15, &a[j + jb + j * a_dim1], lda);
+ i__3 = *n - j - jb + 1;
+ dtrsm_("Right", "Lower", "Transpose", "Non-unit", &i__3, &
+ jb, &c_b15, &a[j + j * a_dim1], lda, &a[j + jb +
+ j * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+ goto L40;
+
+L30:
+ *info = *info + j - 1;
+
+L40:
+ return 0;
+
+/* End of DPOTRF */
+
+} /* dpotrf_ */
+
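+/*
+   Editorial sketch, not part of the original source: calling the blocked
+   factorization above from C.  The matrix is stored column-major (Fortran
+   order) and only the triangle named by UPLO is referenced.  The sample
+   matrix and the helper name are illustrative only.
+*/
+#if 0
+static int dpotrf_example(void)
+{
+    /* A = [4 2 2; 2 5 3; 2 3 6], symmetric positive definite, column-major. */
+    doublereal a[9] = { 4., 2., 2.,   2., 5., 3.,   2., 3., 6. };
+    integer n = 3, lda = 3, info;
+
+    dpotrf_("U", &n, a, &lda, &info);
+    /* info == 0: the upper triangle of a now holds U with A = U**T * U;
+       info == k > 0: the leading minor of order k is not positive definite. */
+    return (int) info;
+}
+#endif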
+/* Subroutine */ int dpotri_(char *uplo, integer *n, doublereal *a, integer *
+ lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), dlauum_(
+ char *, integer *, doublereal *, integer *, integer *),
+ dtrtri_(char *, char *, integer *, doublereal *, integer *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DPOTRI computes the inverse of a real symmetric positive definite
+ matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+ computed by DPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the triangular factor U or L from the Cholesky
+ factorization A = U**T*U or A = L*L**T, as computed by
+ DPOTRF.
+ On exit, the upper or lower triangle of the (symmetric)
+ inverse of A, overwriting the input factor U or L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the (i,i) element of the factor U or L is
+ zero, and the inverse could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DPOTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Invert the triangular Cholesky factor U or L. */
+
+ dtrtri_(uplo, "Non-unit", n, &a[a_offset], lda, info);
+ if (*info > 0) {
+ return 0;
+ }
+
+/* Form inv(U)*inv(U)' or inv(L)'*inv(L). */
+
+ dlauum_(uplo, n, &a[a_offset], lda, info);
+
+ return 0;
+
+/* End of DPOTRI */
+
+} /* dpotri_ */
+
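+/*
+   Editorial sketch, not part of the original source: DPOTRI expects the
+   Cholesky factor produced by DPOTRF, so the usual sequence is factor then
+   invert.  Only the triangle named by UPLO of inv(A) is written back; the
+   helper name is illustrative only.
+*/
+#if 0
+static int spd_inverse_upper(integer n, doublereal *a, integer lda)
+{
+    integer info;
+    dpotrf_("U", &n, a, &lda, &info);     /* A = U**T * U                  */
+    if (info != 0)
+        return (int) info;                /* A is not positive definite    */
+    dpotri_("U", &n, a, &lda, &info);     /* upper triangle of inv(A)      */
+    return (int) info;                    /* info = i > 0: U(i,i) was zero */
+}
+#endif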
+/* Subroutine */ int dpotrs_(char *uplo, integer *n, integer *nrhs,
+ doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DPOTRS solves a system of linear equations A*X = B with a symmetric
+ positive definite matrix A using the Cholesky factorization
+ A = U**T*U or A = L*L**T computed by DPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The triangular factor U or L from the Cholesky factorization
+ A = U**T*U or A = L*L**T, as computed by DPOTRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DPOTRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/*
+ Solve A*X = B where A = U'*U.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ dtrsm_("Left", "Upper", "Transpose", "Non-unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ dtrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b15, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A*X = B where A = L*L'.
+
+ Solve L*X = B, overwriting B with X.
+*/
+
+ dtrsm_("Left", "Lower", "No transpose", "Non-unit", n, nrhs, &c_b15, &
+ a[a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ dtrsm_("Left", "Lower", "Transpose", "Non-unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+ }
+
+ return 0;
+
+/* End of DPOTRS */
+
+} /* dpotrs_ */
+
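+/*
+   Editorial sketch, not part of the original source: solving A*X = B for a
+   symmetric positive definite A is the DPOTRF + DPOTRS pair -- one
+   factorization, then the two triangular solves described above for any
+   number of right-hand sides.  The helper name is illustrative only.
+*/
+#if 0
+static int spd_solve(integer n, integer nrhs,
+                     doublereal *a, integer lda,   /* in: A, out: factor U */
+                     doublereal *b, integer ldb)   /* in: B, out: X        */
+{
+    integer info;
+    dpotrf_("U", &n, a, &lda, &info);
+    if (info != 0)
+        return (int) info;
+    /* U**T * Y = B, then U * X = Y, overwriting B with X. */
+    dpotrs_("U", &n, &nrhs, a, &lda, b, &ldb, &info);
+    return (int) info;
+}
+#endif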
+/* Subroutine */ int dstedc_(char *compz, integer *n, doublereal *d__,
+ doublereal *e, doublereal *z__, integer *ldz, doublereal *work,
+ integer *lwork, integer *iwork, integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, j, k, m;
+ static doublereal p;
+ static integer ii, lgn;
+ static doublereal eps, tiny;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer lwmin;
+ extern /* Subroutine */ int dlaed0_(integer *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, integer *, integer *);
+ static integer start;
+
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlacpy_(char *, integer *, integer
+ *, doublereal *, integer *, doublereal *, integer *),
+ dlaset_(char *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static integer finish;
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *,
+ integer *), dlasrt_(char *, integer *, doublereal *, integer *);
+ static integer liwmin, icompz;
+ extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *);
+ static doublereal orgnrm;
+ static logical lquery;
+ static integer smlsiz, storez, strtrw;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DSTEDC computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the divide and conquer method.
+ The eigenvectors of a full or band real symmetric matrix can also be
+ found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this
+ matrix to tridiagonal form.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none. See DLAED3 for details.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'I': Compute eigenvectors of tridiagonal matrix also.
+ = 'V': Compute eigenvectors of original dense symmetric
+ matrix also. On entry, Z contains the orthogonal
+ matrix used to reduce the original matrix to
+ tridiagonal form.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the subdiagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ,N)
+ On entry, if COMPZ = 'V', then Z contains the orthogonal
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original symmetric matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1.
+ If eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace/output) DOUBLE PRECISION array,
+ dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If COMPZ = 'N' or N <= 1 then LWORK must be at least 1.
+ If COMPZ = 'V' and N > 1 then LWORK must be at least
+ ( 1 + 3*N + 2*N*lg N + 3*N**2 ),
+ where lg( N ) = smallest integer k such
+ that 2**k >= N.
+ If COMPZ = 'I' and N > 1 then LWORK must be at least
+ ( 1 + 4*N + N**2 ).
+ Note that for COMPZ = 'I' or 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LWORK need
+ only be max(1,2*(N-1)).
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If COMPZ = 'N' or N <= 1 then LIWORK must be at least 1.
+ If COMPZ = 'V' and N > 1 then LIWORK must be at least
+ ( 6 + 6*N + 5*N*lg N ).
+ If COMPZ = 'I' and N > 1 then LIWORK must be at least
+ ( 3 + 5*N ).
+ Note that for COMPZ = 'I' or 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LIWORK
+ need only be 1.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal size of the IWORK array,
+ returns this value as the first entry of the IWORK array, and
+ no error message related to LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1 || *liwork == -1;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+
+ if (*info == 0) {
+
+/* Compute the workspace requirements */
+
+ smlsiz = ilaenv_(&c__9, "DSTEDC", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+ if (*n <= 1 || icompz == 0) {
+ liwmin = 1;
+ lwmin = 1;
+ } else if (*n <= smlsiz) {
+ liwmin = 1;
+ lwmin = *n - 1 << 1;
+ } else {
+ lgn = (integer) (log((doublereal) (*n)) / log(2.));
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (icompz == 1) {
+/* Computing 2nd power */
+ i__1 = *n;
+ lwmin = *n * 3 + 1 + (*n << 1) * lgn + i__1 * i__1 * 3;
+ liwmin = *n * 6 + 6 + *n * 5 * lgn;
+ } else if (icompz == 2) {
+/* Computing 2nd power */
+ i__1 = *n;
+ lwmin = (*n << 2) + 1 + i__1 * i__1;
+ liwmin = *n * 5 + 3;
+ }
+ }
+ work[1] = (doublereal) lwmin;
+ iwork[1] = liwmin;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -10;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DSTEDC", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*n == 1) {
+ if (icompz != 0) {
+ z__[z_dim1 + 1] = 1.;
+ }
+ return 0;
+ }
+
+/*
+ If the following conditional clause is removed, then the routine
+ will use the Divide and Conquer routine to compute only the
+ eigenvalues, which requires (3N + 3N**2) real workspace and
+ (2 + 5N + 2N lg(N)) integer workspace.
+ Since on many architectures DSTERF is much faster than any other
+ algorithm for finding eigenvalues only, it is used here
+ as the default. If the conditional clause is removed, then
+ information on the size of workspace needs to be changed.
+
+ If COMPZ = 'N', use DSTERF to compute the eigenvalues.
+*/
+
+ if (icompz == 0) {
+ dsterf_(n, &d__[1], &e[1], info);
+ goto L50;
+ }
+
+/*
+ If N is smaller than the minimum divide size (SMLSIZ+1), then
+ solve the problem with another solver.
+*/
+
+ if (*n <= smlsiz) {
+
+ dsteqr_(compz, n, &d__[1], &e[1], &z__[z_offset], ldz, &work[1], info);
+
+ } else {
+
+/*
+ If COMPZ = 'V', the Z matrix must be stored elsewhere for later
+ use.
+*/
+
+ if (icompz == 1) {
+ storez = *n * *n + 1;
+ } else {
+ storez = 1;
+ }
+
+ if (icompz == 2) {
+ dlaset_("Full", n, n, &c_b29, &c_b15, &z__[z_offset], ldz);
+ }
+
+/* Scale. */
+
+ orgnrm = dlanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.) {
+ goto L50;
+ }
+
+ eps = EPSILON;
+
+ start = 1;
+
+/* while ( START <= N ) */
+
+L10:
+ if (start <= *n) {
+
+/*
+ Let FINISH be the position of the next subdiagonal entry
+ such that E( FINISH ) <= TINY or FINISH = N if no such
+ subdiagonal exists. The matrix identified by the elements
+ between START and FINISH constitutes an independent
+ sub-problem.
+*/
+
+ finish = start;
+L20:
+ if (finish < *n) {
+ tiny = eps * sqrt((d__1 = d__[finish], abs(d__1))) * sqrt((
+ d__2 = d__[finish + 1], abs(d__2)));
+ if ((d__1 = e[finish], abs(d__1)) > tiny) {
+ ++finish;
+ goto L20;
+ }
+ }
+
+/* (Sub) Problem determined. Compute its size and solve it. */
+
+ m = finish - start + 1;
+ if (m == 1) {
+ start = finish + 1;
+ goto L10;
+ }
+ if (m > smlsiz) {
+
+/* Scale. */
+
+ orgnrm = dlanst_("M", &m, &d__[start], &e[start]);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &m, &c__1, &d__[
+ start], &m, info);
+ i__1 = m - 1;
+ i__2 = m - 1;
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &i__1, &c__1, &e[
+ start], &i__2, info);
+
+ if (icompz == 1) {
+ strtrw = 1;
+ } else {
+ strtrw = start;
+ }
+ dlaed0_(&icompz, n, &m, &d__[start], &e[start], &z__[strtrw +
+ start * z_dim1], ldz, &work[1], n, &work[storez], &
+ iwork[1], info);
+ if (*info != 0) {
+ *info = (*info / (m + 1) + start - 1) * (*n + 1) + *info %
+ (m + 1) + start - 1;
+ goto L50;
+ }
+
+/* Scale back. */
+
+ dlascl_("G", &c__0, &c__0, &c_b15, &orgnrm, &m, &c__1, &d__[
+ start], &m, info);
+
+ } else {
+ if (icompz == 1) {
+
+/*
+ Since QR won't update a Z matrix which is larger than
+ the length of D, we must solve the sub-problem in a
+ workspace and then multiply back into Z.
+*/
+
+ dsteqr_("I", &m, &d__[start], &e[start], &work[1], &m, &
+ work[m * m + 1], info);
+ dlacpy_("A", n, &m, &z__[start * z_dim1 + 1], ldz, &work[
+ storez], n);
+ dgemm_("N", "N", n, &m, &m, &c_b15, &work[storez], n, &
+ work[1], &m, &c_b29, &z__[start * z_dim1 + 1],
+ ldz);
+ } else if (icompz == 2) {
+ dsteqr_("I", &m, &d__[start], &e[start], &z__[start +
+ start * z_dim1], ldz, &work[1], info);
+ } else {
+ dsterf_(&m, &d__[start], &e[start], info);
+ }
+ if (*info != 0) {
+ *info = start * (*n + 1) + finish;
+ goto L50;
+ }
+ }
+
+ start = finish + 1;
+ goto L10;
+ }
+
+/*
+ endwhile
+
+ If the problem split any number of times, then the eigenvalues
+ will not be properly ordered. Here we permute the eigenvalues
+ (and the associated eigenvectors) into ascending order.
+*/
+
+ if (m != *n) {
+ if (icompz == 0) {
+
+/* Use Quick Sort */
+
+ dlasrt_("I", n, &d__[1], info);
+
+ } else {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L30: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ dswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k *
+ z_dim1 + 1], &c__1);
+ }
+/* L40: */
+ }
+ }
+ }
+ }
+
+L50:
+ work[1] = (doublereal) lwmin;
+ iwork[1] = liwmin;
+
+ return 0;
+
+/* End of DSTEDC */
+
+} /* dstedc_ */
+
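+/*
+   Editorial sketch, not part of the original source: the WORK/IWORK sizes
+   of the divide-and-conquer driver above depend on COMPZ and N, so the
+   LWORK = LIWORK = -1 query is the convenient way to size them before the
+   real call.  The caller is assumed to supply buffers at least as large as
+   the queried sizes; the helper name is illustrative only.
+*/
+#if 0
+static int tridiag_eig_dc(integer n, doublereal *d, doublereal *e,
+                          doublereal *z, integer ldz,
+                          doublereal *work, integer lwork,
+                          integer *iwork, integer liwork)
+{
+    integer info, lq = -1, liq = -1, iwkopt;
+    doublereal wkopt;
+
+    /* Query: WORK(1) and IWORK(1) return the optimal LWORK and LIWORK. */
+    dstedc_("I", &n, d, e, z, &ldz, &wkopt, &lq, &iwkopt, &liq, &info);
+    if (info != 0 || lwork < (integer) wkopt || liwork < iwkopt)
+        return -1;
+    /* Eigenvalues into d (ascending), eigenvectors into the columns of z. */
+    dstedc_("I", &n, d, e, z, &ldz, work, &lwork, iwork, &liwork, &info);
+    return (int) info;
+}
+#endif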
+/* Subroutine */ int dsteqr_(char *compz, integer *n, doublereal *d__,
+ doublereal *e, doublereal *z__, integer *ldz, doublereal *work,
+ integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal b, c__, f, g;
+ static integer i__, j, k, l, m;
+ static doublereal p, r__, s;
+ static integer l1, ii, mm, lm1, mm1, nm1;
+ static doublereal rt1, rt2, eps;
+ static integer lsv;
+ static doublereal tst, eps2;
+ static integer lend, jtot;
+ extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal
+ *, doublereal *, doublereal *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *);
+ static doublereal anorm;
+ extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *,
+ doublereal *, integer *), dlaev2_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *);
+ static integer lendm1, lendp1;
+
+ static integer iscale;
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlaset_(char *, integer *, integer
+ *, doublereal *, doublereal *, doublereal *, integer *);
+ static doublereal safmin;
+ extern /* Subroutine */ int dlartg_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *);
+ static doublereal safmax;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *,
+ integer *);
+ static integer lendsv;
+ static doublereal ssfmin;
+ static integer nmaxit, icompz;
+ static doublereal ssfmax;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DSTEQR computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the implicit QL or QR method.
+ The eigenvectors of a full or band symmetric matrix can also be found
+ if DSYTRD or DSPTRD or DSBTRD has been used to reduce this matrix to
+ tridiagonal form.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'V': Compute eigenvalues and eigenvectors of the original
+ symmetric matrix. On entry, Z must contain the
+ orthogonal matrix used to reduce the original matrix
+ to tridiagonal form.
+ = 'I': Compute eigenvalues and eigenvectors of the
+ tridiagonal matrix. Z is initialized to the identity
+ matrix.
+
+ N (input) INTEGER
+ The order of the matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the (n-1) subdiagonal elements of the tridiagonal
+ matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) DOUBLE PRECISION array, dimension (LDZ, N)
+ On entry, if COMPZ = 'V', then Z contains the orthogonal
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original symmetric matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1, and if
+ eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (max(1,2*N-2))
+ If COMPZ = 'N', then WORK is not referenced.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm has failed to find all the eigenvalues in
+ a total of 30*N iterations; if INFO = i, then i
+ elements of E have not converged to zero; on exit, D
+ and E contain the elements of a symmetric tridiagonal
+ matrix which is orthogonally similar to the original
+ matrix.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DSTEQR", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ if (icompz == 2) {
+ z__[z_dim1 + 1] = 1.;
+ }
+ return 0;
+ }
+
+/* Determine the unit roundoff and over/underflow thresholds. */
+
+ eps = EPSILON;
+/* Computing 2nd power */
+ d__1 = eps;
+ eps2 = d__1 * d__1;
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ ssfmax = sqrt(safmax) / 3.;
+ ssfmin = sqrt(safmin) / eps2;
+
+/*
+ Compute the eigenvalues and eigenvectors of the tridiagonal
+ matrix.
+*/
+
+ if (icompz == 2) {
+ dlaset_("Full", n, n, &c_b29, &c_b15, &z__[z_offset], ldz);
+ }
+
+ nmaxit = *n * 30;
+ jtot = 0;
+
+/*
+ Determine where the matrix splits and choose QL or QR iteration
+ for each block, according to whether top or bottom diagonal
+ element is smaller.
+*/
+
+ l1 = 1;
+ nm1 = *n - 1;
+
+L10:
+ if (l1 > *n) {
+ goto L160;
+ }
+ if (l1 > 1) {
+ e[l1 - 1] = 0.;
+ }
+ if (l1 <= nm1) {
+ i__1 = nm1;
+ for (m = l1; m <= i__1; ++m) {
+ tst = (d__1 = e[m], abs(d__1));
+ if (tst == 0.) {
+ goto L30;
+ }
+ if (tst <= sqrt((d__1 = d__[m], abs(d__1))) * sqrt((d__2 = d__[m
+ + 1], abs(d__2))) * eps) {
+ e[m] = 0.;
+ goto L30;
+ }
+/* L20: */
+ }
+ }
+ m = *n;
+
+L30:
+ l = l1;
+ lsv = l;
+ lend = m;
+ lendsv = lend;
+ l1 = m + 1;
+ if (lend == l) {
+ goto L10;
+ }
+
+/* Scale submatrix in rows and columns L to LEND */
+
+ i__1 = lend - l + 1;
+ anorm = dlanst_("I", &i__1, &d__[l], &e[l]);
+ iscale = 0;
+ if (anorm == 0.) {
+ goto L10;
+ }
+ if (anorm > ssfmax) {
+ iscale = 1;
+ i__1 = lend - l + 1;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n,
+ info);
+ } else if (anorm < ssfmin) {
+ iscale = 2;
+ i__1 = lend - l + 1;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n,
+ info);
+ }
+
+/* Choose between QL and QR iteration */
+
+ if ((d__1 = d__[lend], abs(d__1)) < (d__2 = d__[l], abs(d__2))) {
+ lend = lsv;
+ l = lendsv;
+ }
+
+ if (lend > l) {
+
+/*
+ QL Iteration
+
+ Look for small subdiagonal element.
+*/
+
+L40:
+ if (l != lend) {
+ lendm1 = lend - 1;
+ i__1 = lendm1;
+ for (m = l; m <= i__1; ++m) {
+/* Computing 2nd power */
+ d__2 = (d__1 = e[m], abs(d__1));
+ tst = d__2 * d__2;
+ if (tst <= eps2 * (d__1 = d__[m], abs(d__1)) * (d__2 = d__[m
+ + 1], abs(d__2)) + safmin) {
+ goto L60;
+ }
+/* L50: */
+ }
+ }
+
+ m = lend;
+
+L60:
+ if (m < lend) {
+ e[m] = 0.;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L80;
+ }
+
+/*
+       If remaining matrix is 2-by-2, use DLAE2 or DLAEV2
+       to compute its eigensystem.
+*/
+
+ if (m == l + 1) {
+ if (icompz > 0) {
+ dlaev2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2, &c__, &s);
+ work[l] = c__;
+ work[*n - 1 + l] = s;
+ dlasr_("R", "V", "B", n, &c__2, &work[l], &work[*n - 1 + l], &
+ z__[l * z_dim1 + 1], ldz);
+ } else {
+ dlae2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2);
+ }
+ d__[l] = rt1;
+ d__[l + 1] = rt2;
+ e[l] = 0.;
+ l += 2;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l + 1] - p) / (e[l] * 2.);
+ r__ = dlapy2_(&g, &c_b15);
+ g = d__[m] - p + e[l] / (g + d_sign(&r__, &g));
+
+ s = 1.;
+ c__ = 1.;
+ p = 0.;
+
+/* Inner loop */
+
+ mm1 = m - 1;
+ i__1 = l;
+ for (i__ = mm1; i__ >= i__1; --i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ dlartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m - 1) {
+ e[i__ + 1] = r__;
+ }
+ g = d__[i__ + 1] - p;
+ r__ = (d__[i__] - g) * s + c__ * 2. * b;
+ p = s * r__;
+ d__[i__ + 1] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = -s;
+ }
+
+/* L70: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = m - l + 1;
+ dlasr_("R", "V", "B", n, &mm, &work[l], &work[*n - 1 + l], &z__[l
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[l] = g;
+ goto L40;
+
+/* Eigenvalue found. */
+
+L80:
+ d__[l] = p;
+
+ ++l;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+
+ } else {
+
+/*
+ QR Iteration
+
+ Look for small superdiagonal element.
+*/
+
+L90:
+ if (l != lend) {
+ lendp1 = lend + 1;
+ i__1 = lendp1;
+ for (m = l; m >= i__1; --m) {
+/* Computing 2nd power */
+ d__2 = (d__1 = e[m - 1], abs(d__1));
+ tst = d__2 * d__2;
+ if (tst <= eps2 * (d__1 = d__[m], abs(d__1)) * (d__2 = d__[m
+ - 1], abs(d__2)) + safmin) {
+ goto L110;
+ }
+/* L100: */
+ }
+ }
+
+ m = lend;
+
+L110:
+ if (m > lend) {
+ e[m - 1] = 0.;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L130;
+ }
+
+/*
+       If remaining matrix is 2-by-2, use DLAE2 or DLAEV2
+       to compute its eigensystem.
+*/
+
+ if (m == l - 1) {
+ if (icompz > 0) {
+ dlaev2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2, &c__, &s)
+ ;
+ work[m] = c__;
+ work[*n - 1 + m] = s;
+ dlasr_("R", "V", "F", n, &c__2, &work[m], &work[*n - 1 + m], &
+ z__[(l - 1) * z_dim1 + 1], ldz);
+ } else {
+ dlae2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2);
+ }
+ d__[l - 1] = rt1;
+ d__[l] = rt2;
+ e[l - 1] = 0.;
+ l += -2;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l - 1] - p) / (e[l - 1] * 2.);
+ r__ = dlapy2_(&g, &c_b15);
+ g = d__[m] - p + e[l - 1] / (g + d_sign(&r__, &g));
+
+ s = 1.;
+ c__ = 1.;
+ p = 0.;
+
+/* Inner loop */
+
+ lm1 = l - 1;
+ i__1 = lm1;
+ for (i__ = m; i__ <= i__1; ++i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ dlartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m) {
+ e[i__ - 1] = r__;
+ }
+ g = d__[i__] - p;
+ r__ = (d__[i__ + 1] - g) * s + c__ * 2. * b;
+ p = s * r__;
+ d__[i__] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = s;
+ }
+
+/* L120: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = l - m + 1;
+ dlasr_("R", "V", "F", n, &mm, &work[m], &work[*n - 1 + m], &z__[m
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[lm1] = g;
+ goto L90;
+
+/* Eigenvalue found. */
+
+L130:
+ d__[l] = p;
+
+ --l;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+
+ }
+
+/* Undo scaling if necessary */
+
+L140:
+ if (iscale == 1) {
+ i__1 = lendsv - lsv + 1;
+ dlascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ dlascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ } else if (iscale == 2) {
+ i__1 = lendsv - lsv + 1;
+ dlascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ dlascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ }
+
+/*
+ Check for no convergence to an eigenvalue after a total
+ of N*MAXIT iterations.
+*/
+
+ if (jtot < nmaxit) {
+ goto L10;
+ }
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.) {
+ ++(*info);
+ }
+/* L150: */
+ }
+ goto L190;
+
+/* Order eigenvalues and eigenvectors. */
+
+L160:
+ if (icompz == 0) {
+
+/* Use Quick Sort */
+
+ dlasrt_("I", n, &d__[1], info);
+
+ } else {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L170: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ dswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k * z_dim1 + 1],
+ &c__1);
+ }
+/* L180: */
+ }
+ }
+
+L190:
+ return 0;
+
+/* End of DSTEQR */
+
+} /* dsteqr_ */
+
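+/*
+   Editorial sketch, not part of the original source: with COMPZ = 'I' the
+   QL/QR routine above needs no prior reduction -- it diagonalizes a
+   symmetric tridiagonal matrix directly and initializes Z to the identity
+   itself.  WORK must hold at least max(1, 2*N-2) doubles; the helper name
+   is illustrative only.
+*/
+#if 0
+static int tridiag_eig_qr(integer n,
+                          doublereal *d,    /* in: diagonal, out: eigenvalues */
+                          doublereal *e,    /* in: subdiagonal, destroyed     */
+                          doublereal *z,    /* out: eigenvectors, n-by-n      */
+                          integer ldz,
+                          doublereal *work) /* scratch, length >= 2*n - 2     */
+{
+    integer info;
+    dsteqr_("I", &n, d, e, z, &ldz, work, &info);
+    return (int) info;  /* info = i > 0: i off-diagonals failed to converge */
+}
+#endif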
+/* Subroutine */ int dsterf_(integer *n, doublereal *d__, doublereal *e,
+ integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2, d__3;
+
+ /* Local variables */
+ static doublereal c__;
+ static integer i__, l, m;
+ static doublereal p, r__, s;
+ static integer l1;
+ static doublereal bb, rt1, rt2, eps, rte;
+ static integer lsv;
+ static doublereal eps2, oldc;
+ static integer lend, jtot;
+ extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal
+ *, doublereal *, doublereal *);
+ static doublereal gamma, alpha, sigma, anorm;
+
+ static integer iscale;
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *);
+ static doublereal oldgam, safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal safmax;
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *,
+ integer *);
+ static integer lendsv;
+ static doublereal ssfmin;
+ static integer nmaxit;
+ static doublereal ssfmax;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DSTERF computes all eigenvalues of a symmetric tridiagonal matrix
+ using the Pal-Walker-Kahan variant of the QL or QR algorithm.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the n diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the (n-1) subdiagonal elements of the tridiagonal
+ matrix.
+ On exit, E has been destroyed.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm failed to find all of the eigenvalues in
+ a total of 30*N iterations; if INFO = i, then i
+ elements of E have not converged to zero.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --e;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n < 0) {
+ *info = -1;
+ i__1 = -(*info);
+ xerbla_("DSTERF", &i__1);
+ return 0;
+ }
+ if (*n <= 1) {
+ return 0;
+ }
+
+/* Determine the unit roundoff for this environment. */
+
+ eps = EPSILON;
+/* Computing 2nd power */
+ d__1 = eps;
+ eps2 = d__1 * d__1;
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ ssfmax = sqrt(safmax) / 3.;
+ ssfmin = sqrt(safmin) / eps2;
+
+/* Compute the eigenvalues of the tridiagonal matrix. */
+
+ nmaxit = *n * 30;
+ sigma = 0.;
+ jtot = 0;
+
+/*
+ Determine where the matrix splits and choose QL or QR iteration
+ for each block, according to whether top or bottom diagonal
+ element is smaller.
+*/
+
+ l1 = 1;
+
+L10:
+ if (l1 > *n) {
+ goto L170;
+ }
+ if (l1 > 1) {
+ e[l1 - 1] = 0.;
+ }
+ i__1 = *n - 1;
+ for (m = l1; m <= i__1; ++m) {
+ if ((d__3 = e[m], abs(d__3)) <= sqrt((d__1 = d__[m], abs(d__1))) *
+ sqrt((d__2 = d__[m + 1], abs(d__2))) * eps) {
+ e[m] = 0.;
+ goto L30;
+ }
+/* L20: */
+ }
+ m = *n;
+
+L30:
+ l = l1;
+ lsv = l;
+ lend = m;
+ lendsv = lend;
+ l1 = m + 1;
+ if (lend == l) {
+ goto L10;
+ }
+
+/* Scale submatrix in rows and columns L to LEND */
+
+ i__1 = lend - l + 1;
+ anorm = dlanst_("I", &i__1, &d__[l], &e[l]);
+ iscale = 0;
+ if (anorm > ssfmax) {
+ iscale = 1;
+ i__1 = lend - l + 1;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n,
+ info);
+ } else if (anorm < ssfmin) {
+ iscale = 2;
+ i__1 = lend - l + 1;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n,
+ info);
+ }
+
+ i__1 = lend - 1;
+ for (i__ = l; i__ <= i__1; ++i__) {
+/* Computing 2nd power */
+ d__1 = e[i__];
+ e[i__] = d__1 * d__1;
+/* L40: */
+ }
+
+/* Choose between QL and QR iteration */
+
+ if ((d__1 = d__[lend], abs(d__1)) < (d__2 = d__[l], abs(d__2))) {
+ lend = lsv;
+ l = lendsv;
+ }
+
+ if (lend >= l) {
+
+/*
+ QL Iteration
+
+ Look for small subdiagonal element.
+*/
+
+L50:
+ if (l != lend) {
+ i__1 = lend - 1;
+ for (m = l; m <= i__1; ++m) {
+ if ((d__2 = e[m], abs(d__2)) <= eps2 * (d__1 = d__[m] * d__[m
+ + 1], abs(d__1))) {
+ goto L70;
+ }
+/* L60: */
+ }
+ }
+ m = lend;
+
+L70:
+ if (m < lend) {
+ e[m] = 0.;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L90;
+ }
+
+/*
+ If remaining matrix is 2 by 2, use DLAE2 to compute its
+ eigenvalues.
+*/
+
+ if (m == l + 1) {
+ rte = sqrt(e[l]);
+ dlae2_(&d__[l], &rte, &d__[l + 1], &rt1, &rt2);
+ d__[l] = rt1;
+ d__[l + 1] = rt2;
+ e[l] = 0.;
+ l += 2;
+ if (l <= lend) {
+ goto L50;
+ }
+ goto L150;
+ }
+
+ if (jtot == nmaxit) {
+ goto L150;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ rte = sqrt(e[l]);
+ sigma = (d__[l + 1] - p) / (rte * 2.);
+ r__ = dlapy2_(&sigma, &c_b15);
+ sigma = p - rte / (sigma + d_sign(&r__, &sigma));
+
+ c__ = 1.;
+ s = 0.;
+ gamma = d__[m] - sigma;
+ p = gamma * gamma;
+
+/* Inner loop */
+
+ i__1 = l;
+ for (i__ = m - 1; i__ >= i__1; --i__) {
+ bb = e[i__];
+ r__ = p + bb;
+ if (i__ != m - 1) {
+ e[i__ + 1] = s * r__;
+ }
+ oldc = c__;
+ c__ = p / r__;
+ s = bb / r__;
+ oldgam = gamma;
+ alpha = d__[i__];
+ gamma = c__ * (alpha - sigma) - s * oldgam;
+ d__[i__ + 1] = oldgam + (alpha - gamma);
+ if (c__ != 0.) {
+ p = gamma * gamma / c__;
+ } else {
+ p = oldc * bb;
+ }
+/* L80: */
+ }
+
+ e[l] = s * p;
+ d__[l] = sigma + gamma;
+ goto L50;
+
+/* Eigenvalue found. */
+
+L90:
+ d__[l] = p;
+
+ ++l;
+ if (l <= lend) {
+ goto L50;
+ }
+ goto L150;
+
+ } else {
+
+/*
+ QR Iteration
+
+ Look for small superdiagonal element.
+*/
+
+L100:
+ i__1 = lend + 1;
+ for (m = l; m >= i__1; --m) {
+ if ((d__2 = e[m - 1], abs(d__2)) <= eps2 * (d__1 = d__[m] * d__[m
+ - 1], abs(d__1))) {
+ goto L120;
+ }
+/* L110: */
+ }
+ m = lend;
+
+L120:
+ if (m > lend) {
+ e[m - 1] = 0.;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L140;
+ }
+
+/*
+ If remaining matrix is 2 by 2, use DLAE2 to compute its
+ eigenvalues.
+*/
+
+ if (m == l - 1) {
+ rte = sqrt(e[l - 1]);
+ dlae2_(&d__[l], &rte, &d__[l - 1], &rt1, &rt2);
+ d__[l] = rt1;
+ d__[l - 1] = rt2;
+ e[l - 1] = 0.;
+ l += -2;
+ if (l >= lend) {
+ goto L100;
+ }
+ goto L150;
+ }
+
+ if (jtot == nmaxit) {
+ goto L150;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ rte = sqrt(e[l - 1]);
+ sigma = (d__[l - 1] - p) / (rte * 2.);
+ r__ = dlapy2_(&sigma, &c_b15);
+ sigma = p - rte / (sigma + d_sign(&r__, &sigma));
+
+ c__ = 1.;
+ s = 0.;
+ gamma = d__[m] - sigma;
+ p = gamma * gamma;
+
+/* Inner loop */
+
+ i__1 = l - 1;
+ for (i__ = m; i__ <= i__1; ++i__) {
+ bb = e[i__];
+ r__ = p + bb;
+ if (i__ != m) {
+ e[i__ - 1] = s * r__;
+ }
+ oldc = c__;
+ c__ = p / r__;
+ s = bb / r__;
+ oldgam = gamma;
+ alpha = d__[i__ + 1];
+ gamma = c__ * (alpha - sigma) - s * oldgam;
+ d__[i__] = oldgam + (alpha - gamma);
+ if (c__ != 0.) {
+ p = gamma * gamma / c__;
+ } else {
+ p = oldc * bb;
+ }
+/* L130: */
+ }
+
+ e[l - 1] = s * p;
+ d__[l] = sigma + gamma;
+ goto L100;
+
+/* Eigenvalue found. */
+
+L140:
+ d__[l] = p;
+
+ --l;
+ if (l >= lend) {
+ goto L100;
+ }
+ goto L150;
+
+ }
+
+/* Undo scaling if necessary */
+
+L150:
+ if (iscale == 1) {
+ i__1 = lendsv - lsv + 1;
+ dlascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ }
+ if (iscale == 2) {
+ i__1 = lendsv - lsv + 1;
+ dlascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ }
+
+/*
+ Check for no convergence to an eigenvalue after a total
+ of N*MAXIT iterations.
+*/
+
+ if (jtot < nmaxit) {
+ goto L10;
+ }
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.) {
+ ++(*info);
+ }
+/* L160: */
+ }
+ goto L180;
+
+/* Sort eigenvalues in increasing order. */
+
+L170:
+ dlasrt_("I", n, &d__[1], info);
+
+L180:
+ return 0;
+
+/* End of DSTERF */
+
+} /* dsterf_ */
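+
+/*
+   Editor's note: the block below is an illustrative usage sketch added
+   during review, not part of the upstream f2c translation.  It is left
+   disabled (#if 0) so the translation unit is unchanged when compiled.
+   The helper name example_dsterf_usage is hypothetical; the integer and
+   doublereal typedefs are assumed to come from the f2c header included
+   at the top of this file.
+*/
+#if 0
+static int example_dsterf_usage(void)
+{
+    /* 3-by-3 symmetric tridiagonal matrix: diagonal d, off-diagonal e. */
+    doublereal d[3] = { 2., 2., 2. };
+    doublereal e[2] = { 1., 1. };
+    integer n = 3, info = 0;
+
+    /* On success d holds the eigenvalues in ascending order; e is destroyed. */
+    dsterf_(&n, d, e, &info);
+    return (int) info;
+}
+#endif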
+
+/* Subroutine */ int dsyevd_(char *jobz, char *uplo, integer *n, doublereal *
+ a, integer *lda, doublereal *w, doublereal *work, integer *lwork,
+ integer *iwork, integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal eps;
+ static integer inde;
+ static doublereal anrm, rmin, rmax;
+ static integer lopt;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ static doublereal sigma;
+ extern logical lsame_(char *, char *);
+ static integer iinfo, lwmin, liopt;
+ static logical lower, wantz;
+ static integer indwk2, llwrk2;
+
+ static integer iscale;
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dstedc_(char *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, integer *), dlacpy_(
+ char *, integer *, integer *, doublereal *, integer *, doublereal
+ *, integer *);
+ static doublereal safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal bignum;
+ static integer indtau;
+ extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern doublereal dlansy_(char *, char *, integer *, doublereal *,
+ integer *, doublereal *);
+ static integer indwrk, liwmin;
+ extern /* Subroutine */ int dormtr_(char *, char *, char *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, integer *), dsytrd_(char *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *);
+ static integer llwork;
+ static doublereal smlnum;
+ static logical lquery;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DSYEVD computes all eigenvalues and, optionally, eigenvectors of a
+ real symmetric matrix A. If eigenvectors are desired, it uses a
+ divide and conquer algorithm.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+    Because it makes heavy use of level 3 BLAS, DSYEVD needs N**2 more
+    workspace than DSYEVX.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only;
+ = 'V': Compute eigenvalues and eigenvectors.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA, N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of A contains the
+ upper triangular part of the matrix A. If UPLO = 'L',
+ the leading N-by-N lower triangular part of A contains
+ the lower triangular part of the matrix A.
+ On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+ orthonormal eigenvectors of the matrix A.
+ If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+ or the upper triangle (if UPLO='U') of A, including the
+ diagonal, is destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ W (output) DOUBLE PRECISION array, dimension (N)
+ If INFO = 0, the eigenvalues in ascending order.
+
+ WORK (workspace/output) DOUBLE PRECISION array,
+ dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If N <= 1, LWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LWORK must be at least 2*N+1.
+ If JOBZ = 'V' and N > 1, LWORK must be at least
+ 1 + 6*N + 2*N**2.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal sizes of the WORK and IWORK
+ arrays, returns these values as the first entries of the WORK
+ and IWORK arrays, and no error message related to LWORK or
+ LIWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If N <= 1, LIWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+ If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK and
+ IWORK arrays, returns these values as the first entries of
+ the WORK and IWORK arrays, and no error message related to
+ LWORK or LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+ to converge; i off-diagonal elements of an intermediate
+ tridiagonal form did not converge to zero;
+ if INFO = i and JOBZ = 'V', then the algorithm failed
+ to compute an eigenvalue while working on the submatrix
+ lying in rows and columns INFO/(N+1) through
+ mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ Modified description of INFO. Sven, 16 Feb 05.
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --w;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ wantz = lsame_(jobz, "V");
+ lower = lsame_(uplo, "L");
+ lquery = *lwork == -1 || *liwork == -1;
+
+ *info = 0;
+ if (! (wantz || lsame_(jobz, "N"))) {
+ *info = -1;
+ } else if (! (lower || lsame_(uplo, "U"))) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+
+ if (*info == 0) {
+ if (*n <= 1) {
+ liwmin = 1;
+ lwmin = 1;
+ lopt = lwmin;
+ liopt = liwmin;
+ } else {
+ if (wantz) {
+ liwmin = *n * 5 + 3;
+/* Computing 2nd power */
+ i__1 = *n;
+ lwmin = *n * 6 + 1 + (i__1 * i__1 << 1);
+ } else {
+ liwmin = 1;
+ lwmin = (*n << 1) + 1;
+ }
+/* Computing MAX */
+ i__1 = lwmin, i__2 = (*n << 1) + ilaenv_(&c__1, "DSYTRD", uplo, n,
+ &c_n1, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ lopt = max(i__1,i__2);
+ liopt = liwmin;
+ }
+ work[1] = (doublereal) lopt;
+ iwork[1] = liopt;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -10;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DSYEVD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ w[1] = a[a_dim1 + 1];
+ if (wantz) {
+ a[a_dim1 + 1] = 1.;
+ }
+ return 0;
+ }
+
+/* Get machine constants. */
+
+ safmin = SAFEMINIMUM;
+ eps = PRECISION;
+ smlnum = safmin / eps;
+ bignum = 1. / smlnum;
+ rmin = sqrt(smlnum);
+ rmax = sqrt(bignum);
+
+/* Scale matrix to allowable range, if necessary. */
+
+ anrm = dlansy_("M", uplo, n, &a[a_offset], lda, &work[1]);
+ iscale = 0;
+ if (anrm > 0. && anrm < rmin) {
+ iscale = 1;
+ sigma = rmin / anrm;
+ } else if (anrm > rmax) {
+ iscale = 1;
+ sigma = rmax / anrm;
+ }
+ if (iscale == 1) {
+ dlascl_(uplo, &c__0, &c__0, &c_b15, &sigma, n, n, &a[a_offset], lda,
+ info);
+ }
+
+/* Call DSYTRD to reduce symmetric matrix to tridiagonal form. */
+
+ inde = 1;
+ indtau = inde + *n;
+ indwrk = indtau + *n;
+ llwork = *lwork - indwrk + 1;
+ indwk2 = indwrk + *n * *n;
+ llwrk2 = *lwork - indwk2 + 1;
+
+ dsytrd_(uplo, n, &a[a_offset], lda, &w[1], &work[inde], &work[indtau], &
+ work[indwrk], &llwork, &iinfo);
+ lopt = (integer) ((*n << 1) + work[indwrk]);
+
+/*
+ For eigenvalues only, call DSTERF. For eigenvectors, first call
+ DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+ tridiagonal matrix, then call DORMTR to multiply it by the
+ Householder transformations stored in A.
+*/
+
+ if (! wantz) {
+ dsterf_(n, &w[1], &work[inde], info);
+ } else {
+ dstedc_("I", n, &w[1], &work[inde], &work[indwrk], n, &work[indwk2], &
+ llwrk2, &iwork[1], liwork, info);
+ dormtr_("L", uplo, "N", n, n, &a[a_offset], lda, &work[indtau], &work[
+ indwrk], n, &work[indwk2], &llwrk2, &iinfo);
+ dlacpy_("A", n, n, &work[indwrk], n, &a[a_offset], lda);
+/*
+ Computing MAX
+ Computing 2nd power
+*/
+ i__3 = *n;
+ i__1 = lopt, i__2 = *n * 6 + 1 + (i__3 * i__3 << 1);
+ lopt = max(i__1,i__2);
+ }
+
+/* If matrix was scaled, then rescale eigenvalues appropriately. */
+
+ if (iscale == 1) {
+ d__1 = 1. / sigma;
+ dscal_(n, &d__1, &w[1], &c__1);
+ }
+
+ work[1] = (doublereal) lopt;
+ iwork[1] = liopt;
+
+ return 0;
+
+/* End of DSYEVD */
+
+} /* dsyevd_ */
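+
+/*
+   Editor's note: illustrative usage sketch for dsyevd_, added during
+   review and left disabled (#if 0); it is not part of the upstream f2c
+   translation.  It shows the workspace-query convention documented in
+   the comment block above (LWORK = LIWORK = -1).  The helper name
+   example_dsyevd_usage and the fixed workspace bounds are assumptions
+   chosen for a tiny 2-by-2 example.
+*/
+#if 0
+static int example_dsyevd_usage(void)
+{
+    /* Column-major 2-by-2 symmetric matrix; upper triangle is referenced. */
+    doublereal a[4] = { 2., 1., 1., 2. };
+    doublereal w[2];
+    /* 64/32 comfortably exceed the documented minima for N = 2:
+       LWORK >= 1 + 6*N + 2*N**2 = 21, LIWORK >= 3 + 5*N = 13. */
+    doublereal work[64];
+    integer iwork[32];
+    integer n = 2, lda = 2, lwork = -1, liwork = -1, info = 0;
+
+    /* Workspace query: optimal sizes come back in work[0] / iwork[0]. */
+    dsyevd_("V", "U", &n, a, &lda, w, work, &lwork, iwork, &liwork, &info);
+    lwork = (integer) work[0];
+    liwork = iwork[0];
+    if (lwork > 64 || liwork > 32) {
+        return -1;              /* would need heap allocation in real code */
+    }
+
+    /* Eigenvalues land in w; eigenvectors overwrite a when JOBZ = "V". */
+    dsyevd_("V", "U", &n, a, &lda, w, work, &lwork, iwork, &liwork, &info);
+    return (int) info;
+}
+#endif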
+
+/* Subroutine */ int dsytd2_(char *uplo, integer *n, doublereal *a, integer *
+ lda, doublereal *d__, doublereal *e, doublereal *tau, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ static doublereal taui;
+ extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ static doublereal alpha;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *), dlarfg_(integer *, doublereal *,
+ doublereal *, integer *, doublereal *), xerbla_(char *, integer *
+ );
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DSYTD2 reduces a real symmetric matrix A to symmetric tridiagonal
+ form T by an orthogonal similarity transformation: Q' * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the orthogonal
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the orthogonal matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) DOUBLE PRECISION array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) DOUBLE PRECISION array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DSYTD2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Reduce the upper triangle of A */
+
+ for (i__ = *n - 1; i__ >= 1; --i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(1:i-1,i+1)
+*/
+
+ dlarfg_(&i__, &a[i__ + (i__ + 1) * a_dim1], &a[(i__ + 1) * a_dim1
+ + 1], &c__1, &taui);
+ e[i__] = a[i__ + (i__ + 1) * a_dim1];
+
+ if (taui != 0.) {
+
+/* Apply H(i) from both sides to A(1:i,1:i) */
+
+ a[i__ + (i__ + 1) * a_dim1] = 1.;
+
+/* Compute x := tau * A * v storing x in TAU(1:i) */
+
+ dsymv_(uplo, &i__, &taui, &a[a_offset], lda, &a[(i__ + 1) *
+ a_dim1 + 1], &c__1, &c_b29, &tau[1], &c__1)
+ ;
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ alpha = taui * -.5 * ddot_(&i__, &tau[1], &c__1, &a[(i__ + 1)
+ * a_dim1 + 1], &c__1);
+ daxpy_(&i__, &alpha, &a[(i__ + 1) * a_dim1 + 1], &c__1, &tau[
+ 1], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ dsyr2_(uplo, &i__, &c_b151, &a[(i__ + 1) * a_dim1 + 1], &c__1,
+ &tau[1], &c__1, &a[a_offset], lda);
+
+ a[i__ + (i__ + 1) * a_dim1] = e[i__];
+ }
+ d__[i__ + 1] = a[i__ + 1 + (i__ + 1) * a_dim1];
+ tau[i__] = taui;
+/* L10: */
+ }
+ d__[1] = a[a_dim1 + 1];
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(i+2:n,i)
+*/
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ dlarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &taui);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+
+ if (taui != 0.) {
+
+/* Apply H(i) from both sides to A(i+1:n,i+1:n) */
+
+ a[i__ + 1 + i__ * a_dim1] = 1.;
+
+/* Compute x := tau * A * v storing x in TAU(i:n-1) */
+
+ i__2 = *n - i__;
+ dsymv_(uplo, &i__2, &taui, &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &tau[
+ i__], &c__1);
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ i__2 = *n - i__;
+ alpha = taui * -.5 * ddot_(&i__2, &tau[i__], &c__1, &a[i__ +
+ 1 + i__ * a_dim1], &c__1);
+ i__2 = *n - i__;
+ daxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ i__2 = *n - i__;
+ dsyr2_(uplo, &i__2, &c_b151, &a[i__ + 1 + i__ * a_dim1], &
+ c__1, &tau[i__], &c__1, &a[i__ + 1 + (i__ + 1) *
+ a_dim1], lda);
+
+ a[i__ + 1 + i__ * a_dim1] = e[i__];
+ }
+ d__[i__] = a[i__ + i__ * a_dim1];
+ tau[i__] = taui;
+/* L20: */
+ }
+ d__[*n] = a[*n + *n * a_dim1];
+ }
+
+ return 0;
+
+/* End of DSYTD2 */
+
+} /* dsytd2_ */
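+
+/*
+   Editor's note: illustrative usage sketch for the unblocked reduction
+   dsytd2_, added during review and left disabled (#if 0); not part of
+   the upstream f2c translation.  The helper name is hypothetical.
+*/
+#if 0
+static int example_dsytd2_usage(void)
+{
+    /* Column-major 3-by-3 symmetric matrix; lower triangle is referenced. */
+    doublereal a[9] = { 4., 1., 2.,
+                        1., 4., 1.,
+                        2., 1., 4. };
+    doublereal d[3], e[2], tau[2];
+    integer n = 3, lda = 3, info = 0;
+
+    /* On exit d/e hold the tridiagonal T, and (a, tau) encode Q implicitly. */
+    dsytd2_("L", &n, a, &lda, d, e, tau, &info);
+    return (int) info;
+}
+#endif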
+
+/* Subroutine */ int dsytrd_(char *uplo, integer *n, doublereal *a, integer *
+ lda, doublereal *d__, doublereal *e, doublereal *tau, doublereal *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, nb, kk, nx, iws;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ static logical upper;
+ extern /* Subroutine */ int dsytd2_(char *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *), dsyr2k_(char *, char *, integer *, integer *, doublereal
+ *, doublereal *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, integer *), dlatrd_(char *,
+ integer *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *), xerbla_(char *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DSYTRD reduces a real symmetric matrix A to real symmetric
+ tridiagonal form T by an orthogonal similarity transformation:
+ Q**T * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the orthogonal
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the orthogonal matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) DOUBLE PRECISION array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) DOUBLE PRECISION array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*lwork < 1 && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+
+/* Determine the block size. */
+
+ nb = ilaenv_(&c__1, "DSYTRD", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6,
+ (ftnlen)1);
+ lwkopt = *n * nb;
+ work[1] = (doublereal) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DSYTRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1] = 1.;
+ return 0;
+ }
+
+ nx = *n;
+ iws = 1;
+ if (nb > 1 && nb < *n) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code).
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "DSYTRD", uplo, n, &c_n1, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *n) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code by setting NX = N.
+
+ Computing MAX
+*/
+ i__1 = *lwork / ldwork;
+ nb = max(i__1,1);
+ nbmin = ilaenv_(&c__2, "DSYTRD", uplo, n, &c_n1, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ if (nb < nbmin) {
+ nx = *n;
+ }
+ }
+ } else {
+ nx = *n;
+ }
+ } else {
+ nb = 1;
+ }
+
+ if (upper) {
+
+/*
+ Reduce the upper triangle of A.
+ Columns 1:kk are handled by the unblocked method.
+*/
+
+ kk = *n - (*n - nx + nb - 1) / nb * nb;
+ i__1 = kk + 1;
+ i__2 = -nb;
+ for (i__ = *n - nb + 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = i__ + nb - 1;
+ dlatrd_(uplo, &i__3, &nb, &a[a_offset], lda, &e[1], &tau[1], &
+ work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(1:i-1,1:i-1), using an
+ update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = i__ - 1;
+ dsyr2k_(uplo, "No transpose", &i__3, &nb, &c_b151, &a[i__ *
+ a_dim1 + 1], lda, &work[1], &ldwork, &c_b15, &a[a_offset],
+ lda);
+
+/*
+ Copy superdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j - 1 + j * a_dim1] = e[j - 1];
+ d__[j] = a[j + j * a_dim1];
+/* L10: */
+ }
+/* L20: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ dsytd2_(uplo, &kk, &a[a_offset], lda, &d__[1], &e[1], &tau[1], &iinfo);
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__2 = *n - nx;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = *n - i__ + 1;
+ dlatrd_(uplo, &i__3, &nb, &a[i__ + i__ * a_dim1], lda, &e[i__], &
+ tau[i__], &work[1], &ldwork);
+
+/*
+   Update the unreduced submatrix A(i+nb:n,i+nb:n), using
+ an update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = *n - i__ - nb + 1;
+ dsyr2k_(uplo, "No transpose", &i__3, &nb, &c_b151, &a[i__ + nb +
+ i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b15, &a[
+ i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/*
+ Copy subdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j + 1 + j * a_dim1] = e[j];
+ d__[j] = a[j + j * a_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ i__1 = *n - i__ + 1;
+ dsytd2_(uplo, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__],
+ &tau[i__], &iinfo);
+ }
+
+ work[1] = (doublereal) lwkopt;
+ return 0;
+
+/* End of DSYTRD */
+
+} /* dsytrd_ */
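+
+/*
+   Editor's note: illustrative usage sketch for the blocked reduction
+   dsytrd_, added during review and left disabled (#if 0); not part of
+   the upstream f2c translation.  It differs from the dsytd2_ sketch
+   above only in the LWORK workspace query.  Names are hypothetical.
+*/
+#if 0
+static int example_dsytrd_usage(void)
+{
+    doublereal a[9] = { 4., 1., 2.,
+                        1., 4., 1.,
+                        2., 1., 4. };
+    doublereal d[3], e[2], tau[2];
+    doublereal work[64];            /* any LWORK >= 1 is legal; larger enables blocking */
+    integer n = 3, lda = 3, lwork = -1, info = 0;
+
+    /* Workspace query, then the actual reduction. */
+    dsytrd_("L", &n, a, &lda, d, e, tau, work, &lwork, &info);
+    lwork = (integer) work[0];
+    if (lwork > 64) {
+        lwork = 64;                 /* cap to the static buffer; still >= minimum */
+    }
+    dsytrd_("L", &n, a, &lda, d, e, tau, work, &lwork, &info);
+    return (int) info;
+}
+#endif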
+
+/* Subroutine */ int dtrevc_(char *side, char *howmny, logical *select,
+ integer *n, doublereal *t, integer *ldt, doublereal *vl, integer *
+ ldvl, doublereal *vr, integer *ldvr, integer *mm, integer *m,
+ doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static doublereal x[4] /* was [2][2] */;
+ static integer j1, j2, n2, ii, ki, ip, is;
+ static doublereal wi, wr, rec, ulp, beta, emax;
+ static logical pair;
+ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *,
+ integer *);
+ static logical allv;
+ static integer ierr;
+ static doublereal unfl, ovfl, smin;
+ static logical over;
+ static doublereal vmax;
+ static integer jnxt;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ static doublereal scale;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *);
+ static doublereal remax;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static logical leftv, bothv;
+ extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *);
+ static doublereal vcrit;
+ static logical somev;
+ static doublereal xnorm;
+ extern /* Subroutine */ int dlaln2_(logical *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, doublereal *
+ , doublereal *, integer *, doublereal *, doublereal *, integer *),
+ dlabad_(doublereal *, doublereal *);
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal bignum;
+ static logical rightv;
+ static doublereal smlnum;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DTREVC computes some or all of the right and/or left eigenvectors of
+ a real upper quasi-triangular matrix T.
+ Matrices of this type are produced by the Schur factorization of
+ a real general matrix: A = Q*T*Q**T, as computed by DHSEQR.
+
+ The right eigenvector x and the left eigenvector y of T corresponding
+ to an eigenvalue w are defined by:
+
+ T*x = w*x, (y**H)*T = w*(y**H)
+
+ where y**H denotes the conjugate transpose of y.
+ The eigenvalues are not input to this routine, but are read directly
+ from the diagonal blocks of T.
+
+ This routine returns the matrices X and/or Y of right and left
+ eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+ input matrix. If Q is the orthogonal factor that reduces a matrix
+ A to Schur form T, then Q*X and Q*Y are the matrices of right and
+ left eigenvectors of A.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'R': compute right eigenvectors only;
+ = 'L': compute left eigenvectors only;
+ = 'B': compute both right and left eigenvectors.
+
+ HOWMNY (input) CHARACTER*1
+ = 'A': compute all right and/or left eigenvectors;
+ = 'B': compute all right and/or left eigenvectors,
+ backtransformed by the matrices in VR and/or VL;
+ = 'S': compute selected right and/or left eigenvectors,
+ as indicated by the logical array SELECT.
+
+ SELECT (input/output) LOGICAL array, dimension (N)
+ If HOWMNY = 'S', SELECT specifies the eigenvectors to be
+ computed.
+ If w(j) is a real eigenvalue, the corresponding real
+ eigenvector is computed if SELECT(j) is .TRUE..
+ If w(j) and w(j+1) are the real and imaginary parts of a
+ complex eigenvalue, the corresponding complex eigenvector is
+ computed if either SELECT(j) or SELECT(j+1) is .TRUE., and
+ on exit SELECT(j) is set to .TRUE. and SELECT(j+1) is set to
+ .FALSE..
+ Not referenced if HOWMNY = 'A' or 'B'.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input) DOUBLE PRECISION array, dimension (LDT,N)
+ The upper quasi-triangular matrix T in Schur canonical form.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ VL (input/output) DOUBLE PRECISION array, dimension (LDVL,MM)
+ On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must
+ contain an N-by-N matrix Q (usually the orthogonal matrix Q
+ of Schur vectors returned by DHSEQR).
+ On exit, if SIDE = 'L' or 'B', VL contains:
+ if HOWMNY = 'A', the matrix Y of left eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*Y;
+ if HOWMNY = 'S', the left eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VL, in the same order as their
+ eigenvalues.
+ A complex eigenvector corresponding to a complex eigenvalue
+ is stored in two consecutive columns, the first holding the
+ real part, and the second the imaginary part.
+ Not referenced if SIDE = 'R'.
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1, and if
+ SIDE = 'L' or 'B', LDVL >= N.
+
+ VR (input/output) DOUBLE PRECISION array, dimension (LDVR,MM)
+ On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
+ contain an N-by-N matrix Q (usually the orthogonal matrix Q
+ of Schur vectors returned by DHSEQR).
+ On exit, if SIDE = 'R' or 'B', VR contains:
+ if HOWMNY = 'A', the matrix X of right eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*X;
+ if HOWMNY = 'S', the right eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VR, in the same order as their
+ eigenvalues.
+ A complex eigenvector corresponding to a complex eigenvalue
+ is stored in two consecutive columns, the first holding the
+ real part and the second the imaginary part.
+ Not referenced if SIDE = 'L'.
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1, and if
+ SIDE = 'R' or 'B', LDVR >= N.
+
+ MM (input) INTEGER
+ The number of columns in the arrays VL and/or VR. MM >= M.
+
+ M (output) INTEGER
+ The number of columns in the arrays VL and/or VR actually
+ used to store the eigenvectors.
+ If HOWMNY = 'A' or 'B', M is set to N.
+ Each selected real eigenvector occupies one column and each
+ selected complex eigenvector occupies two columns.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (3*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The algorithm used in this program is basically backward (forward)
+    substitution, with scaling to make the code robust against
+ possible overflow.
+
+ Each eigenvector is normalized so that the element of largest
+ magnitude has magnitude 1; here the magnitude of a complex number
+ (x,y) is taken to be |x| + |y|.
+
+ =====================================================================
+
+
+ Decode and test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --select;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+
+ /* Function Body */
+ bothv = lsame_(side, "B");
+ rightv = lsame_(side, "R") || bothv;
+ leftv = lsame_(side, "L") || bothv;
+
+ allv = lsame_(howmny, "A");
+ over = lsame_(howmny, "B");
+ somev = lsame_(howmny, "S");
+
+ *info = 0;
+ if (! rightv && ! leftv) {
+ *info = -1;
+ } else if (! allv && ! over && ! somev) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ldt < max(1,*n)) {
+ *info = -6;
+ } else if (*ldvl < 1 || leftv && *ldvl < *n) {
+ *info = -8;
+ } else if (*ldvr < 1 || rightv && *ldvr < *n) {
+ *info = -10;
+ } else {
+
+/*
+ Set M to the number of columns required to store the selected
+ eigenvectors, standardize the array SELECT if necessary, and
+ test MM.
+*/
+
+ if (somev) {
+ *m = 0;
+ pair = FALSE_;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (pair) {
+ pair = FALSE_;
+ select[j] = FALSE_;
+ } else {
+ if (j < *n) {
+ if (t[j + 1 + j * t_dim1] == 0.) {
+ if (select[j]) {
+ ++(*m);
+ }
+ } else {
+ pair = TRUE_;
+ if (select[j] || select[j + 1]) {
+ select[j] = TRUE_;
+ *m += 2;
+ }
+ }
+ } else {
+ if (select[*n]) {
+ ++(*m);
+ }
+ }
+ }
+/* L10: */
+ }
+ } else {
+ *m = *n;
+ }
+
+ if (*mm < *m) {
+ *info = -11;
+ }
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DTREVC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Set the constants to control overflow. */
+
+ unfl = SAFEMINIMUM;
+ ovfl = 1. / unfl;
+ dlabad_(&unfl, &ovfl);
+ ulp = PRECISION;
+ smlnum = unfl * (*n / ulp);
+ bignum = (1. - ulp) / smlnum;
+
+/*
+ Compute 1-norm of each column of strictly upper triangular
+ part of T to control overflow in triangular solver.
+*/
+
+ work[1] = 0.;
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ work[j] = 0.;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[j] += (d__1 = t[i__ + j * t_dim1], abs(d__1));
+/* L20: */
+ }
+/* L30: */
+ }
+
+/*
+ Index IP is used to specify the real or complex eigenvalue:
+ IP = 0, real eigenvalue,
+ 1, first of conjugate complex pair: (wr,wi)
+ -1, second of conjugate complex pair: (wr,wi)
+*/
+
+ n2 = *n << 1;
+
+ if (rightv) {
+
+/* Compute right eigenvectors. */
+
+ ip = 0;
+ is = *m;
+ for (ki = *n; ki >= 1; --ki) {
+
+ if (ip == 1) {
+ goto L130;
+ }
+ if (ki == 1) {
+ goto L40;
+ }
+ if (t[ki + (ki - 1) * t_dim1] == 0.) {
+ goto L40;
+ }
+ ip = -1;
+
+L40:
+ if (somev) {
+ if (ip == 0) {
+ if (! select[ki]) {
+ goto L130;
+ }
+ } else {
+ if (! select[ki - 1]) {
+ goto L130;
+ }
+ }
+ }
+
+/* Compute the KI-th eigenvalue (WR,WI). */
+
+ wr = t[ki + ki * t_dim1];
+ wi = 0.;
+ if (ip != 0) {
+ wi = sqrt((d__1 = t[ki + (ki - 1) * t_dim1], abs(d__1))) *
+ sqrt((d__2 = t[ki - 1 + ki * t_dim1], abs(d__2)));
+ }
+/* Computing MAX */
+ d__1 = ulp * (abs(wr) + abs(wi));
+ smin = max(d__1,smlnum);
+
+ if (ip == 0) {
+
+/* Real right eigenvector */
+
+ work[ki + *n] = 1.;
+
+/* Form right-hand side */
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ work[k + *n] = -t[k + ki * t_dim1];
+/* L50: */
+ }
+
+/*
+ Solve the upper quasi-triangular system:
+ (T(1:KI-1,1:KI-1) - WR)*X = SCALE*WORK.
+*/
+
+ jnxt = ki - 1;
+ for (j = ki - 1; j >= 1; --j) {
+ if (j > jnxt) {
+ goto L60;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j - 1;
+ if (j > 1) {
+ if (t[j + (j - 1) * t_dim1] != 0.) {
+ j1 = j - 1;
+ jnxt = j - 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/* 1-by-1 diagonal block */
+
+ dlaln2_(&c_false, &c__1, &c__1, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &c_b29, x, &c__2, &scale, &xnorm,
+ &ierr);
+
+/*
+ Scale X(1,1) to avoid overflow when updating
+ the right-hand side.
+*/
+
+ if (xnorm > 1.) {
+ if (work[j] > bignum / xnorm) {
+ x[0] /= xnorm;
+ scale /= xnorm;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ dscal_(&ki, &scale, &work[*n + 1], &c__1);
+ }
+ work[j + *n] = x[0];
+
+/* Update right-hand side */
+
+ i__1 = j - 1;
+ d__1 = -x[0];
+ daxpy_(&i__1, &d__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+
+ } else {
+
+/* 2-by-2 diagonal block */
+
+ dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b15, &t[j -
+ 1 + (j - 1) * t_dim1], ldt, &c_b15, &c_b15, &
+ work[j - 1 + *n], n, &wr, &c_b29, x, &c__2, &
+ scale, &xnorm, &ierr);
+
+/*
+ Scale X(1,1) and X(2,1) to avoid overflow when
+ updating the right-hand side.
+*/
+
+ if (xnorm > 1.) {
+/* Computing MAX */
+ d__1 = work[j - 1], d__2 = work[j];
+ beta = max(d__1,d__2);
+ if (beta > bignum / xnorm) {
+ x[0] /= xnorm;
+ x[1] /= xnorm;
+ scale /= xnorm;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ dscal_(&ki, &scale, &work[*n + 1], &c__1);
+ }
+ work[j - 1 + *n] = x[0];
+ work[j + *n] = x[1];
+
+/* Update right-hand side */
+
+ i__1 = j - 2;
+ d__1 = -x[0];
+ daxpy_(&i__1, &d__1, &t[(j - 1) * t_dim1 + 1], &c__1,
+ &work[*n + 1], &c__1);
+ i__1 = j - 2;
+ d__1 = -x[1];
+ daxpy_(&i__1, &d__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+ }
+L60:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VR and normalize. */
+
+ if (! over) {
+ dcopy_(&ki, &work[*n + 1], &c__1, &vr[is * vr_dim1 + 1], &
+ c__1);
+
+ ii = idamax_(&ki, &vr[is * vr_dim1 + 1], &c__1);
+ remax = 1. / (d__1 = vr[ii + is * vr_dim1], abs(d__1));
+ dscal_(&ki, &remax, &vr[is * vr_dim1 + 1], &c__1);
+
+ i__1 = *n;
+ for (k = ki + 1; k <= i__1; ++k) {
+ vr[k + is * vr_dim1] = 0.;
+/* L70: */
+ }
+ } else {
+ if (ki > 1) {
+ i__1 = ki - 1;
+ dgemv_("N", n, &i__1, &c_b15, &vr[vr_offset], ldvr, &
+ work[*n + 1], &c__1, &work[ki + *n], &vr[ki *
+ vr_dim1 + 1], &c__1);
+ }
+
+ ii = idamax_(n, &vr[ki * vr_dim1 + 1], &c__1);
+ remax = 1. / (d__1 = vr[ii + ki * vr_dim1], abs(d__1));
+ dscal_(n, &remax, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+
+ } else {
+
+/*
+ Complex right eigenvector.
+
+ Initial solve
+ [ (T(KI-1,KI-1) T(KI-1,KI) ) - (WR + I* WI)]*X = 0.
+ [ (T(KI,KI-1) T(KI,KI) ) ]
+*/
+
+ if ((d__1 = t[ki - 1 + ki * t_dim1], abs(d__1)) >= (d__2 = t[
+ ki + (ki - 1) * t_dim1], abs(d__2))) {
+ work[ki - 1 + *n] = 1.;
+ work[ki + n2] = wi / t[ki - 1 + ki * t_dim1];
+ } else {
+ work[ki - 1 + *n] = -wi / t[ki + (ki - 1) * t_dim1];
+ work[ki + n2] = 1.;
+ }
+ work[ki + *n] = 0.;
+ work[ki - 1 + n2] = 0.;
+
+/* Form right-hand side */
+
+ i__1 = ki - 2;
+ for (k = 1; k <= i__1; ++k) {
+ work[k + *n] = -work[ki - 1 + *n] * t[k + (ki - 1) *
+ t_dim1];
+ work[k + n2] = -work[ki + n2] * t[k + ki * t_dim1];
+/* L80: */
+ }
+
+/*
+ Solve upper quasi-triangular system:
+ (T(1:KI-2,1:KI-2) - (WR+i*WI))*X = SCALE*(WORK+i*WORK2)
+*/
+
+ jnxt = ki - 2;
+ for (j = ki - 2; j >= 1; --j) {
+ if (j > jnxt) {
+ goto L90;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j - 1;
+ if (j > 1) {
+ if (t[j + (j - 1) * t_dim1] != 0.) {
+ j1 = j - 1;
+ jnxt = j - 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/* 1-by-1 diagonal block */
+
+ dlaln2_(&c_false, &c__1, &c__2, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &wi, x, &c__2, &scale, &xnorm, &
+ ierr);
+
+/*
+ Scale X(1,1) and X(1,2) to avoid overflow when
+ updating the right-hand side.
+*/
+
+ if (xnorm > 1.) {
+ if (work[j] > bignum / xnorm) {
+ x[0] /= xnorm;
+ x[2] /= xnorm;
+ scale /= xnorm;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ dscal_(&ki, &scale, &work[*n + 1], &c__1);
+ dscal_(&ki, &scale, &work[n2 + 1], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + n2] = x[2];
+
+/* Update the right-hand side */
+
+ i__1 = j - 1;
+ d__1 = -x[0];
+ daxpy_(&i__1, &d__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+ i__1 = j - 1;
+ d__1 = -x[2];
+ daxpy_(&i__1, &d__1, &t[j * t_dim1 + 1], &c__1, &work[
+ n2 + 1], &c__1);
+
+ } else {
+
+/* 2-by-2 diagonal block */
+
+ dlaln2_(&c_false, &c__2, &c__2, &smin, &c_b15, &t[j -
+ 1 + (j - 1) * t_dim1], ldt, &c_b15, &c_b15, &
+ work[j - 1 + *n], n, &wr, &wi, x, &c__2, &
+ scale, &xnorm, &ierr);
+
+/*
+ Scale X to avoid overflow when updating
+ the right-hand side.
+*/
+
+ if (xnorm > 1.) {
+/* Computing MAX */
+ d__1 = work[j - 1], d__2 = work[j];
+ beta = max(d__1,d__2);
+ if (beta > bignum / xnorm) {
+ rec = 1. / xnorm;
+ x[0] *= rec;
+ x[2] *= rec;
+ x[1] *= rec;
+ x[3] *= rec;
+ scale *= rec;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ dscal_(&ki, &scale, &work[*n + 1], &c__1);
+ dscal_(&ki, &scale, &work[n2 + 1], &c__1);
+ }
+ work[j - 1 + *n] = x[0];
+ work[j + *n] = x[1];
+ work[j - 1 + n2] = x[2];
+ work[j + n2] = x[3];
+
+/* Update the right-hand side */
+
+ i__1 = j - 2;
+ d__1 = -x[0];
+ daxpy_(&i__1, &d__1, &t[(j - 1) * t_dim1 + 1], &c__1,
+ &work[*n + 1], &c__1);
+ i__1 = j - 2;
+ d__1 = -x[1];
+ daxpy_(&i__1, &d__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+ i__1 = j - 2;
+ d__1 = -x[2];
+ daxpy_(&i__1, &d__1, &t[(j - 1) * t_dim1 + 1], &c__1,
+ &work[n2 + 1], &c__1);
+ i__1 = j - 2;
+ d__1 = -x[3];
+ daxpy_(&i__1, &d__1, &t[j * t_dim1 + 1], &c__1, &work[
+ n2 + 1], &c__1);
+ }
+L90:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VR and normalize. */
+
+ if (! over) {
+ dcopy_(&ki, &work[*n + 1], &c__1, &vr[(is - 1) * vr_dim1
+ + 1], &c__1);
+ dcopy_(&ki, &work[n2 + 1], &c__1, &vr[is * vr_dim1 + 1], &
+ c__1);
+
+ emax = 0.;
+ i__1 = ki;
+ for (k = 1; k <= i__1; ++k) {
+/* Computing MAX */
+ d__3 = emax, d__4 = (d__1 = vr[k + (is - 1) * vr_dim1]
+ , abs(d__1)) + (d__2 = vr[k + is * vr_dim1],
+ abs(d__2));
+ emax = max(d__3,d__4);
+/* L100: */
+ }
+
+ remax = 1. / emax;
+ dscal_(&ki, &remax, &vr[(is - 1) * vr_dim1 + 1], &c__1);
+ dscal_(&ki, &remax, &vr[is * vr_dim1 + 1], &c__1);
+
+ i__1 = *n;
+ for (k = ki + 1; k <= i__1; ++k) {
+ vr[k + (is - 1) * vr_dim1] = 0.;
+ vr[k + is * vr_dim1] = 0.;
+/* L110: */
+ }
+
+ } else {
+
+ if (ki > 2) {
+ i__1 = ki - 2;
+ dgemv_("N", n, &i__1, &c_b15, &vr[vr_offset], ldvr, &
+ work[*n + 1], &c__1, &work[ki - 1 + *n], &vr[(
+ ki - 1) * vr_dim1 + 1], &c__1);
+ i__1 = ki - 2;
+ dgemv_("N", n, &i__1, &c_b15, &vr[vr_offset], ldvr, &
+ work[n2 + 1], &c__1, &work[ki + n2], &vr[ki *
+ vr_dim1 + 1], &c__1);
+ } else {
+ dscal_(n, &work[ki - 1 + *n], &vr[(ki - 1) * vr_dim1
+ + 1], &c__1);
+ dscal_(n, &work[ki + n2], &vr[ki * vr_dim1 + 1], &
+ c__1);
+ }
+
+ emax = 0.;
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+/* Computing MAX */
+ d__3 = emax, d__4 = (d__1 = vr[k + (ki - 1) * vr_dim1]
+ , abs(d__1)) + (d__2 = vr[k + ki * vr_dim1],
+ abs(d__2));
+ emax = max(d__3,d__4);
+/* L120: */
+ }
+ remax = 1. / emax;
+ dscal_(n, &remax, &vr[(ki - 1) * vr_dim1 + 1], &c__1);
+ dscal_(n, &remax, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+ }
+
+ --is;
+ if (ip != 0) {
+ --is;
+ }
+L130:
+ if (ip == 1) {
+ ip = 0;
+ }
+ if (ip == -1) {
+ ip = 1;
+ }
+/* L140: */
+ }
+ }
+
+ if (leftv) {
+
+/* Compute left eigenvectors. */
+
+ ip = 0;
+ is = 1;
+ i__1 = *n;
+ for (ki = 1; ki <= i__1; ++ki) {
+
+ if (ip == -1) {
+ goto L250;
+ }
+ if (ki == *n) {
+ goto L150;
+ }
+ if (t[ki + 1 + ki * t_dim1] == 0.) {
+ goto L150;
+ }
+ ip = 1;
+
+L150:
+ if (somev) {
+ if (! select[ki]) {
+ goto L250;
+ }
+ }
+
+/* Compute the KI-th eigenvalue (WR,WI). */
+
+ wr = t[ki + ki * t_dim1];
+ wi = 0.;
+ if (ip != 0) {
+ wi = sqrt((d__1 = t[ki + (ki + 1) * t_dim1], abs(d__1))) *
+ sqrt((d__2 = t[ki + 1 + ki * t_dim1], abs(d__2)));
+ }
+/* Computing MAX */
+ d__1 = ulp * (abs(wr) + abs(wi));
+ smin = max(d__1,smlnum);
+
+ if (ip == 0) {
+
+/* Real left eigenvector. */
+
+ work[ki + *n] = 1.;
+
+/* Form right-hand side */
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ work[k + *n] = -t[ki + k * t_dim1];
+/* L160: */
+ }
+
+/*
+ Solve the quasi-triangular system:
+ (T(KI+1:N,KI+1:N) - WR)'*X = SCALE*WORK
+*/
+
+ vmax = 1.;
+ vcrit = bignum;
+
+ jnxt = ki + 1;
+ i__2 = *n;
+ for (j = ki + 1; j <= i__2; ++j) {
+ if (j < jnxt) {
+ goto L170;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j + 1;
+ if (j < *n) {
+ if (t[j + 1 + j * t_dim1] != 0.) {
+ j2 = j + 1;
+ jnxt = j + 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/*
+ 1-by-1 diagonal block
+
+ Scale if necessary to avoid overflow when forming
+ the right-hand side.
+*/
+
+ if (work[j] > vcrit) {
+ rec = 1. / vmax;
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ vmax = 1.;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 1;
+ work[j + *n] -= ddot_(&i__3, &t[ki + 1 + j * t_dim1],
+ &c__1, &work[ki + 1 + *n], &c__1);
+
+/* Solve (T(J,J)-WR)'*X = WORK */
+
+ dlaln2_(&c_false, &c__1, &c__1, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &c_b29, x, &c__2, &scale, &xnorm,
+ &ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ }
+ work[j + *n] = x[0];
+/* Computing MAX */
+ d__2 = (d__1 = work[j + *n], abs(d__1));
+ vmax = max(d__2,vmax);
+ vcrit = bignum / vmax;
+
+ } else {
+
+/*
+ 2-by-2 diagonal block
+
+ Scale if necessary to avoid overflow when forming
+ the right-hand side.
+
+ Computing MAX
+*/
+ d__1 = work[j], d__2 = work[j + 1];
+ beta = max(d__1,d__2);
+ if (beta > vcrit) {
+ rec = 1. / vmax;
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ vmax = 1.;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 1;
+ work[j + *n] -= ddot_(&i__3, &t[ki + 1 + j * t_dim1],
+ &c__1, &work[ki + 1 + *n], &c__1);
+
+ i__3 = j - ki - 1;
+ work[j + 1 + *n] -= ddot_(&i__3, &t[ki + 1 + (j + 1) *
+ t_dim1], &c__1, &work[ki + 1 + *n], &c__1);
+
+/*
+ Solve
+ [T(J,J)-WR T(J,J+1) ]'* X = SCALE*( WORK1 )
+ [T(J+1,J) T(J+1,J+1)-WR] ( WORK2 )
+*/
+
+ dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &c_b29, x, &c__2, &scale, &xnorm,
+ &ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + 1 + *n] = x[1];
+
+/* Computing MAX */
+ d__3 = (d__1 = work[j + *n], abs(d__1)), d__4 = (d__2
+ = work[j + 1 + *n], abs(d__2)), d__3 = max(
+ d__3,d__4);
+ vmax = max(d__3,vmax);
+ vcrit = bignum / vmax;
+
+ }
+L170:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VL and normalize. */
+
+ if (! over) {
+ i__2 = *n - ki + 1;
+ dcopy_(&i__2, &work[ki + *n], &c__1, &vl[ki + is *
+ vl_dim1], &c__1);
+
+ i__2 = *n - ki + 1;
+ ii = idamax_(&i__2, &vl[ki + is * vl_dim1], &c__1) + ki -
+ 1;
+ remax = 1. / (d__1 = vl[ii + is * vl_dim1], abs(d__1));
+ i__2 = *n - ki + 1;
+ dscal_(&i__2, &remax, &vl[ki + is * vl_dim1], &c__1);
+
+ i__2 = ki - 1;
+ for (k = 1; k <= i__2; ++k) {
+ vl[k + is * vl_dim1] = 0.;
+/* L180: */
+ }
+
+ } else {
+
+ if (ki < *n) {
+ i__2 = *n - ki;
+ dgemv_("N", n, &i__2, &c_b15, &vl[(ki + 1) * vl_dim1
+ + 1], ldvl, &work[ki + 1 + *n], &c__1, &work[
+ ki + *n], &vl[ki * vl_dim1 + 1], &c__1);
+ }
+
+ ii = idamax_(n, &vl[ki * vl_dim1 + 1], &c__1);
+ remax = 1. / (d__1 = vl[ii + ki * vl_dim1], abs(d__1));
+ dscal_(n, &remax, &vl[ki * vl_dim1 + 1], &c__1);
+
+ }
+
+ } else {
+
+/*
+ Complex left eigenvector.
+
+ Initial solve:
+ ((T(KI,KI) T(KI,KI+1) )' - (WR - I* WI))*X = 0.
+ ((T(KI+1,KI) T(KI+1,KI+1)) )
+*/
+
+ if ((d__1 = t[ki + (ki + 1) * t_dim1], abs(d__1)) >= (d__2 =
+ t[ki + 1 + ki * t_dim1], abs(d__2))) {
+ work[ki + *n] = wi / t[ki + (ki + 1) * t_dim1];
+ work[ki + 1 + n2] = 1.;
+ } else {
+ work[ki + *n] = 1.;
+ work[ki + 1 + n2] = -wi / t[ki + 1 + ki * t_dim1];
+ }
+ work[ki + 1 + *n] = 0.;
+ work[ki + n2] = 0.;
+
+/* Form right-hand side */
+
+ i__2 = *n;
+ for (k = ki + 2; k <= i__2; ++k) {
+ work[k + *n] = -work[ki + *n] * t[ki + k * t_dim1];
+ work[k + n2] = -work[ki + 1 + n2] * t[ki + 1 + k * t_dim1]
+ ;
+/* L190: */
+ }
+
+/*
+ Solve complex quasi-triangular system:
+     ( T(KI+2:N,KI+2:N) - (WR-i*WI) )*X = WORK1+i*WORK2
+*/
+
+ vmax = 1.;
+ vcrit = bignum;
+
+ jnxt = ki + 2;
+ i__2 = *n;
+ for (j = ki + 2; j <= i__2; ++j) {
+ if (j < jnxt) {
+ goto L200;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j + 1;
+ if (j < *n) {
+ if (t[j + 1 + j * t_dim1] != 0.) {
+ j2 = j + 1;
+ jnxt = j + 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/*
+ 1-by-1 diagonal block
+
+ Scale if necessary to avoid overflow when
+ forming the right-hand side elements.
+*/
+
+ if (work[j] > vcrit) {
+ rec = 1. / vmax;
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &rec, &work[ki + n2], &c__1);
+ vmax = 1.;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 2;
+ work[j + *n] -= ddot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + *n], &c__1);
+ i__3 = j - ki - 2;
+ work[j + n2] -= ddot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + n2], &c__1);
+
+/* Solve (T(J,J)-(WR-i*WI))*(X11+i*X12)= WK+I*WK2 */
+
+ d__1 = -wi;
+ dlaln2_(&c_false, &c__1, &c__2, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &d__1, x, &c__2, &scale, &xnorm, &
+ ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &scale, &work[ki + n2], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + n2] = x[2];
+/* Computing MAX */
+ d__3 = (d__1 = work[j + *n], abs(d__1)), d__4 = (d__2
+ = work[j + n2], abs(d__2)), d__3 = max(d__3,
+ d__4);
+ vmax = max(d__3,vmax);
+ vcrit = bignum / vmax;
+
+ } else {
+
+/*
+ 2-by-2 diagonal block
+
+ Scale if necessary to avoid overflow when forming
+ the right-hand side elements.
+
+ Computing MAX
+*/
+ d__1 = work[j], d__2 = work[j + 1];
+ beta = max(d__1,d__2);
+ if (beta > vcrit) {
+ rec = 1. / vmax;
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &rec, &work[ki + n2], &c__1);
+ vmax = 1.;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 2;
+ work[j + *n] -= ddot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + *n], &c__1);
+
+ i__3 = j - ki - 2;
+ work[j + n2] -= ddot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + n2], &c__1);
+
+ i__3 = j - ki - 2;
+ work[j + 1 + *n] -= ddot_(&i__3, &t[ki + 2 + (j + 1) *
+ t_dim1], &c__1, &work[ki + 2 + *n], &c__1);
+
+ i__3 = j - ki - 2;
+ work[j + 1 + n2] -= ddot_(&i__3, &t[ki + 2 + (j + 1) *
+ t_dim1], &c__1, &work[ki + 2 + n2], &c__1);
+
+/*
+ Solve 2-by-2 complex linear equation
+ ([T(j,j) T(j,j+1) ]'-(wr-i*wi)*I)*X = SCALE*B
+ ([T(j+1,j) T(j+1,j+1)] )
+*/
+
+ d__1 = -wi;
+ dlaln2_(&c_true, &c__2, &c__2, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &d__1, x, &c__2, &scale, &xnorm, &
+ ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.) {
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ dscal_(&i__3, &scale, &work[ki + n2], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + n2] = x[2];
+ work[j + 1 + *n] = x[1];
+ work[j + 1 + n2] = x[3];
+/* Computing MAX */
+ d__1 = abs(x[0]), d__2 = abs(x[2]), d__1 = max(d__1,
+ d__2), d__2 = abs(x[1]), d__1 = max(d__1,d__2)
+ , d__2 = abs(x[3]), d__1 = max(d__1,d__2);
+ vmax = max(d__1,vmax);
+ vcrit = bignum / vmax;
+
+ }
+L200:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VL and normalize. */
+
+ if (! over) {
+ i__2 = *n - ki + 1;
+ dcopy_(&i__2, &work[ki + *n], &c__1, &vl[ki + is *
+ vl_dim1], &c__1);
+ i__2 = *n - ki + 1;
+ dcopy_(&i__2, &work[ki + n2], &c__1, &vl[ki + (is + 1) *
+ vl_dim1], &c__1);
+
+ emax = 0.;
+ i__2 = *n;
+ for (k = ki; k <= i__2; ++k) {
+/* Computing MAX */
+ d__3 = emax, d__4 = (d__1 = vl[k + is * vl_dim1], abs(
+ d__1)) + (d__2 = vl[k + (is + 1) * vl_dim1],
+ abs(d__2));
+ emax = max(d__3,d__4);
+/* L220: */
+ }
+ remax = 1. / emax;
+ i__2 = *n - ki + 1;
+ dscal_(&i__2, &remax, &vl[ki + is * vl_dim1], &c__1);
+ i__2 = *n - ki + 1;
+ dscal_(&i__2, &remax, &vl[ki + (is + 1) * vl_dim1], &c__1)
+ ;
+
+ i__2 = ki - 1;
+ for (k = 1; k <= i__2; ++k) {
+ vl[k + is * vl_dim1] = 0.;
+ vl[k + (is + 1) * vl_dim1] = 0.;
+/* L230: */
+ }
+ } else {
+ if (ki < *n - 1) {
+ i__2 = *n - ki - 1;
+ dgemv_("N", n, &i__2, &c_b15, &vl[(ki + 2) * vl_dim1
+ + 1], ldvl, &work[ki + 2 + *n], &c__1, &work[
+ ki + *n], &vl[ki * vl_dim1 + 1], &c__1);
+ i__2 = *n - ki - 1;
+ dgemv_("N", n, &i__2, &c_b15, &vl[(ki + 2) * vl_dim1
+ + 1], ldvl, &work[ki + 2 + n2], &c__1, &work[
+ ki + 1 + n2], &vl[(ki + 1) * vl_dim1 + 1], &
+ c__1);
+ } else {
+ dscal_(n, &work[ki + *n], &vl[ki * vl_dim1 + 1], &
+ c__1);
+ dscal_(n, &work[ki + 1 + n2], &vl[(ki + 1) * vl_dim1
+ + 1], &c__1);
+ }
+
+ emax = 0.;
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+/* Computing MAX */
+ d__3 = emax, d__4 = (d__1 = vl[k + ki * vl_dim1], abs(
+ d__1)) + (d__2 = vl[k + (ki + 1) * vl_dim1],
+ abs(d__2));
+ emax = max(d__3,d__4);
+/* L240: */
+ }
+ remax = 1. / emax;
+ dscal_(n, &remax, &vl[ki * vl_dim1 + 1], &c__1);
+ dscal_(n, &remax, &vl[(ki + 1) * vl_dim1 + 1], &c__1);
+
+ }
+
+ }
+
+ ++is;
+ if (ip != 0) {
+ ++is;
+ }
+L250:
+ if (ip == -1) {
+ ip = 0;
+ }
+ if (ip == 1) {
+ ip = -1;
+ }
+
+/* L260: */
+ }
+
+ }
+
+ return 0;
+
+/* End of DTREVC */
+
+} /* dtrevc_ */
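+
+/*
+   Editor's note: illustrative usage sketch for dtrevc_, added during
+   review and left disabled (#if 0); not part of the upstream f2c
+   translation.  It computes all right eigenvectors of a small upper
+   triangular matrix already in real Schur form.  Names are
+   hypothetical.
+*/
+#if 0
+static int example_dtrevc_usage(void)
+{
+    /* Column-major 2-by-2 upper triangular T with eigenvalues 1 and 3. */
+    doublereal t[4] = { 1., 0., 5., 3. };
+    doublereal vr[4];
+    doublereal vl[1];               /* not referenced when SIDE = "R" */
+    doublereal work[6];             /* 3*N */
+    logical select[2];              /* not referenced when HOWMNY = "A" */
+    integer n = 2, ldt = 2, ldvl = 1, ldvr = 2, mm = 2, m = 0, info = 0;
+
+    /* Right eigenvectors of T are returned in the columns of vr. */
+    dtrevc_("R", "A", select, &n, t, &ldt, vl, &ldvl, vr, &ldvr, &mm, &m,
+            work, &info);
+    return (int) info;
+}
+#endif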
+
+/* Subroutine */ int dtrexc_(char *compq, integer *n, doublereal *t, integer *
+ ldt, doublereal *q, integer *ldq, integer *ifst, integer *ilst,
+ doublereal *work, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, t_dim1, t_offset, i__1;
+
+ /* Local variables */
+ static integer nbf, nbl, here;
+ extern logical lsame_(char *, char *);
+ static logical wantq;
+ extern /* Subroutine */ int dlaexc_(logical *, integer *, doublereal *,
+ integer *, doublereal *, integer *, integer *, integer *, integer
+ *, doublereal *, integer *), xerbla_(char *, integer *);
+ static integer nbnext;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DTREXC reorders the real Schur factorization of a real matrix
+ A = Q*T*Q**T, so that the diagonal block of T with row index IFST is
+ moved to row ILST.
+
+ The real Schur form T is reordered by an orthogonal similarity
+ transformation Z**T*T*Z, and optionally the matrix Q of Schur vectors
+ is updated by postmultiplying it with Z.
+
+ T must be in Schur canonical form (as returned by DHSEQR), that is,
+ block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+ 2-by-2 diagonal block has its diagonal elements equal and its
+ off-diagonal elements of opposite sign.
+
+ Arguments
+ =========
+
+ COMPQ (input) CHARACTER*1
+ = 'V': update the matrix Q of Schur vectors;
+ = 'N': do not update Q.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) DOUBLE PRECISION array, dimension (LDT,N)
+ On entry, the upper quasi-triangular matrix T, in Schur
+          canonical form.
+ On exit, the reordered upper quasi-triangular matrix, again
+ in Schur canonical form.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ Q (input/output) DOUBLE PRECISION array, dimension (LDQ,N)
+ On entry, if COMPQ = 'V', the matrix Q of Schur vectors.
+ On exit, if COMPQ = 'V', Q has been postmultiplied by the
+ orthogonal transformation matrix Z which reorders T.
+ If COMPQ = 'N', Q is not referenced.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ IFST (input/output) INTEGER
+ ILST (input/output) INTEGER
+ Specify the reordering of the diagonal blocks of T.
+ The block with row index IFST is moved to row ILST, by a
+ sequence of transpositions between adjacent blocks.
+ On exit, if IFST pointed on entry to the second row of a
+ 2-by-2 block, it is changed to point to the first row; ILST
+ always points to the first row of the block in its final
+ position (which may differ from its input value by +1 or -1).
+ 1 <= IFST <= N; 1 <= ILST <= N.
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ = 1: two adjacent blocks were too close to swap (the problem
+ is very ill-conditioned); T may have been partially
+ reordered, and ILST points to the first row of the
+ current position of the block being moved.
+
+ =====================================================================
+
+
+ Decode and test the input arguments.
+*/
+
+ /* Parameter adjustments */
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(compq, "V");
+ if (! wantq && ! lsame_(compq, "N")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldt < max(1,*n)) {
+ *info = -4;
+ } else if (*ldq < 1 || wantq && *ldq < max(1,*n)) {
+ *info = -6;
+ } else if (*ifst < 1 || *ifst > *n) {
+ *info = -7;
+ } else if (*ilst < 1 || *ilst > *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DTREXC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 1) {
+ return 0;
+ }
+
+/*
+    Determine the first row of the specified block
+    and find out if it is 1 by 1 or 2 by 2.
+*/
+
+ if (*ifst > 1) {
+ if (t[*ifst + (*ifst - 1) * t_dim1] != 0.) {
+ --(*ifst);
+ }
+ }
+ nbf = 1;
+ if (*ifst < *n) {
+ if (t[*ifst + 1 + *ifst * t_dim1] != 0.) {
+ nbf = 2;
+ }
+ }
+
+/*
+ Determine the first row of the final block
+    and find out if it is 1 by 1 or 2 by 2.
+*/
+
+ if (*ilst > 1) {
+ if (t[*ilst + (*ilst - 1) * t_dim1] != 0.) {
+ --(*ilst);
+ }
+ }
+ nbl = 1;
+ if (*ilst < *n) {
+ if (t[*ilst + 1 + *ilst * t_dim1] != 0.) {
+ nbl = 2;
+ }
+ }
+
+ if (*ifst == *ilst) {
+ return 0;
+ }
+
+ if (*ifst < *ilst) {
+
+/* Update ILST */
+
+ if (nbf == 2 && nbl == 1) {
+ --(*ilst);
+ }
+ if (nbf == 1 && nbl == 2) {
+ ++(*ilst);
+ }
+
+ here = *ifst;
+
+L10:
+
+/* Swap block with next one below */
+
+ if (nbf == 1 || nbf == 2) {
+
+/* Current block either 1 by 1 or 2 by 2 */
+
+ nbnext = 1;
+ if (here + nbf + 1 <= *n) {
+ if (t[here + nbf + 1 + (here + nbf) * t_dim1] != 0.) {
+ nbnext = 2;
+ }
+ }
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &here, &
+ nbf, &nbnext, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here += nbnext;
+
+/* Test if 2 by 2 block breaks into two 1 by 1 blocks */
+
+ if (nbf == 2) {
+ if (t[here + 1 + here * t_dim1] == 0.) {
+ nbf = 3;
+ }
+ }
+
+ } else {
+
+/*
+ Current block consists of two 1 by 1 blocks each of which
+ must be swapped individually
+*/
+
+ nbnext = 1;
+ if (here + 3 <= *n) {
+ if (t[here + 3 + (here + 2) * t_dim1] != 0.) {
+ nbnext = 2;
+ }
+ }
+ i__1 = here + 1;
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &i__1, &
+ c__1, &nbnext, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ if (nbnext == 1) {
+
+/* Swap two 1 by 1 blocks, no problems possible */
+
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &nbnext, &work[1], info);
+ ++here;
+ } else {
+
+/* Recompute NBNEXT in case 2 by 2 split */
+
+ if (t[here + 2 + (here + 1) * t_dim1] == 0.) {
+ nbnext = 1;
+ }
+ if (nbnext == 2) {
+
+/* 2 by 2 Block did not split */
+
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &nbnext, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here += 2;
+ } else {
+
+/* 2 by 2 Block did split */
+
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &c__1, &work[1], info);
+ i__1 = here + 1;
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ i__1, &c__1, &c__1, &work[1], info);
+ here += 2;
+ }
+ }
+ }
+ if (here < *ilst) {
+ goto L10;
+ }
+
+ } else {
+
+ here = *ifst;
+L20:
+
+/* Swap block with next one above */
+
+ if (nbf == 1 || nbf == 2) {
+
+/* Current block either 1 by 1 or 2 by 2 */
+
+ nbnext = 1;
+ if (here >= 3) {
+ if (t[here - 1 + (here - 2) * t_dim1] != 0.) {
+ nbnext = 2;
+ }
+ }
+ i__1 = here - nbnext;
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &i__1, &
+ nbnext, &nbf, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here -= nbnext;
+
+/* Test if 2 by 2 block breaks into two 1 by 1 blocks */
+
+ if (nbf == 2) {
+ if (t[here + 1 + here * t_dim1] == 0.) {
+ nbf = 3;
+ }
+ }
+
+ } else {
+
+/*
+ Current block consists of two 1 by 1 blocks each of which
+ must be swapped individually
+*/
+
+ nbnext = 1;
+ if (here >= 3) {
+ if (t[here - 1 + (here - 2) * t_dim1] != 0.) {
+ nbnext = 2;
+ }
+ }
+ i__1 = here - nbnext;
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &i__1, &
+ nbnext, &c__1, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ if (nbnext == 1) {
+
+/* Swap two 1 by 1 blocks, no problems possible */
+
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &nbnext, &c__1, &work[1], info);
+ --here;
+ } else {
+
+/* Recompute NBNEXT in case 2 by 2 split */
+
+ if (t[here + (here - 1) * t_dim1] == 0.) {
+ nbnext = 1;
+ }
+ if (nbnext == 2) {
+
+/* 2 by 2 Block did not split */
+
+ i__1 = here - 1;
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ i__1, &c__2, &c__1, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here += -2;
+ } else {
+
+/* 2 by 2 Block did split */
+
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &c__1, &work[1], info);
+ i__1 = here - 1;
+ dlaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ i__1, &c__1, &c__1, &work[1], info);
+ here += -2;
+ }
+ }
+ }
+ if (here > *ilst) {
+ goto L20;
+ }
+ }
+ *ilst = here;
+
+ return 0;
+
+/* End of DTREXC */
+
+} /* dtrexc_ */
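+
+/*
+   Illustrative usage sketch (not part of the generated LAPACK source;
+   the matrix values and the helper name are made up for the example):
+   move the leading 1-by-1 block of a small real Schur form to the last
+   position with dtrexc_, updating the Schur vectors in Q.  Arrays are
+   column-major and block indices are 1-based, as documented above.
+*/
+static void example_dtrexc(void)
+{
+    /* 3x3 upper triangular T (already in Schur canonical form), Q = I,
+       both stored column by column. */
+    doublereal t[9] = { 1., 0., 0.,   4., 2., 0.,   5., 6., 3. };
+    doublereal q[9] = { 1., 0., 0.,   0., 1., 0.,   0., 0., 1. };
+    doublereal work[3];
+    integer n = 3, ldt = 3, ldq = 3;
+    integer ifst = 1, ilst = 3, info;
+
+    /* "V": postmultiply Q by the orthogonal transformation Z. */
+    dtrexc_("V", &n, t, &ldt, q, &ldq, &ifst, &ilst, work, &info);
+    /* On success info == 0 and the eigenvalue 1. ends up at T(3,3). */
+}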
+
+/* Subroutine */ int dtrti2_(char *uplo, char *diag, integer *n, doublereal *
+ a, integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer j;
+ static doublereal ajj;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *,
+ doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DTRTI2 computes the inverse of a real upper or lower triangular
+ matrix.
+
+ This is the Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the matrix A is upper or lower triangular.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ DIAG (input) CHARACTER*1
+ Specifies whether or not the matrix A is unit triangular.
+ = 'N': Non-unit triangular
+ = 'U': Unit triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading n by n upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DTRTI2", &i__1);
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (nounit) {
+ a[j + j * a_dim1] = 1. / a[j + j * a_dim1];
+ ajj = -a[j + j * a_dim1];
+ } else {
+ ajj = -1.;
+ }
+
+/* Compute elements 1:j-1 of j-th column. */
+
+ i__2 = j - 1;
+ dtrmv_("Upper", "No transpose", diag, &i__2, &a[a_offset], lda, &
+ a[j * a_dim1 + 1], &c__1);
+ i__2 = j - 1;
+ dscal_(&i__2, &ajj, &a[j * a_dim1 + 1], &c__1);
+/* L10: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix. */
+
+ for (j = *n; j >= 1; --j) {
+ if (nounit) {
+ a[j + j * a_dim1] = 1. / a[j + j * a_dim1];
+ ajj = -a[j + j * a_dim1];
+ } else {
+ ajj = -1.;
+ }
+ if (j < *n) {
+
+/* Compute elements j+1:n of j-th column. */
+
+ i__1 = *n - j;
+ dtrmv_("Lower", "No transpose", diag, &i__1, &a[j + 1 + (j +
+ 1) * a_dim1], lda, &a[j + 1 + j * a_dim1], &c__1);
+ i__1 = *n - j;
+ dscal_(&i__1, &ajj, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of DTRTI2 */
+
+} /* dtrti2_ */
+
+/* Subroutine */ int dtrtri_(char *uplo, char *diag, integer *n, doublereal *
+ a, integer *lda, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, i__1, i__2[2], i__3, i__4, i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer j, jb, nb, nn;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *), dtrsm_(
+ char *, char *, char *, char *, integer *, integer *, doublereal *
+ , doublereal *, integer *, doublereal *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int dtrti2_(char *, char *, integer *, doublereal
+ *, integer *, integer *), xerbla_(char *, integer
+ *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ DTRTRI computes the inverse of a real upper or lower triangular
+ matrix A.
+
+ This is the Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': A is upper triangular;
+ = 'L': A is lower triangular.
+
+ DIAG (input) CHARACTER*1
+ = 'N': A is non-unit triangular;
+ = 'U': A is unit triangular.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, A(i,i) is exactly zero. The triangular
+ matrix is singular and its inverse can not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("DTRTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Check for singularity if non-unit. */
+
+ if (nounit) {
+ i__1 = *n;
+ for (*info = 1; *info <= i__1; ++(*info)) {
+ if (a[*info + *info * a_dim1] == 0.) {
+ return 0;
+ }
+/* L10: */
+ }
+ *info = 0;
+ }
+
+/*
+ Determine the block size for this environment.
+
+ Writing concatenation
+*/
+ i__2[0] = 1, a__1[0] = uplo;
+ i__2[1] = 1, a__1[1] = diag;
+ s_cat(ch__1, a__1, i__2, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "DTRTRI", ch__1, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)2);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ dtrti2_(uplo, diag, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix */
+
+ i__1 = *n;
+ i__3 = nb;
+ for (j = 1; i__3 < 0 ? j >= i__1 : j <= i__1; j += i__3) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *n - j + 1;
+ jb = min(i__4,i__5);
+
+/* Compute rows 1:j-1 of current block column */
+
+ i__4 = j - 1;
+ dtrmm_("Left", "Upper", "No transpose", diag, &i__4, &jb, &
+ c_b15, &a[a_offset], lda, &a[j * a_dim1 + 1], lda);
+ i__4 = j - 1;
+ dtrsm_("Right", "Upper", "No transpose", diag, &i__4, &jb, &
+ c_b151, &a[j + j * a_dim1], lda, &a[j * a_dim1 + 1],
+ lda);
+
+/* Compute inverse of current diagonal block */
+
+ dtrti2_("Upper", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L20: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix */
+
+ nn = (*n - 1) / nb * nb + 1;
+ i__3 = -nb;
+ for (j = nn; i__3 < 0 ? j >= 1 : j <= 1; j += i__3) {
+/* Computing MIN */
+ i__1 = nb, i__4 = *n - j + 1;
+ jb = min(i__1,i__4);
+ if (j + jb <= *n) {
+
+/* Compute rows j+jb:n of current block column */
+
+ i__1 = *n - j - jb + 1;
+ dtrmm_("Left", "Lower", "No transpose", diag, &i__1, &jb,
+ &c_b15, &a[j + jb + (j + jb) * a_dim1], lda, &a[j
+ + jb + j * a_dim1], lda);
+ i__1 = *n - j - jb + 1;
+ dtrsm_("Right", "Lower", "No transpose", diag, &i__1, &jb,
+ &c_b151, &a[j + j * a_dim1], lda, &a[j + jb + j *
+ a_dim1], lda);
+ }
+
+/* Compute inverse of current diagonal block */
+
+ dtrti2_("Lower", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L30: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of DTRTRI */
+
+} /* dtrtri_ */
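+
+/*
+   Illustrative usage sketch (values and helper name are hypothetical):
+   invert a non-unit upper triangular matrix in place with dtrtri_.
+   dtrti2_ above takes the same argument list; it is the unblocked
+   kernel that dtrtri_ uses for small orders and for the diagonal
+   blocks of the blocked sweep.
+*/
+static void example_dtrtri(void)
+{
+    /* A = [ 2  3 ]   stored by columns.
+           [ 0  4 ]                      */
+    doublereal a[4] = { 2., 0., 3., 4. };
+    integer n = 2, lda = 2, info;
+
+    dtrtri_("U", "N", &n, a, &lda, &info);
+    /* info == 0; a[] now holds [ 0.5  -0.375 ; 0  0.25 ]. */
+}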
+
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.f.patch b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.f.patch
new file mode 100644
index 00000000000..cd750cec096
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_d_lapack.f.patch
@@ -0,0 +1,32 @@
+@@ -19075,5 +19075,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 15
+ END DO
++ 15 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -19087,5 +19088,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 16
+ END DO
++ 16 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -19131,5 +19133,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = 1, I-1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 35
+ END DO
++ 35 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
+@@ -19147,5 +19150,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 36
+ END DO
++ 36 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.c
new file mode 100644
index 00000000000..d956ddbbb74
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.c
@@ -0,0 +1,1651 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static real c_b172 = 0.f;
+static real c_b173 = 1.f;
+static integer c__0 = 0;
+
+integer ieeeck_(integer *ispec, real *zero, real *one)
+{
+ /* System generated locals */
+ integer ret_val;
+
+ /* Local variables */
+ static real nan1, nan2, nan3, nan4, nan5, nan6, neginf, posinf, negzro,
+ newzro;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+    IEEECK is called from ILAENV to verify that Infinity and
+ possibly NaN arithmetic is safe (i.e. will not trap).
+
+ Arguments
+ =========
+
+ ISPEC (input) INTEGER
+           Specifies whether to test just for infinity arithmetic
+ or whether to test for infinity and NaN arithmetic.
+ = 0: Verify infinity arithmetic only.
+ = 1: Verify infinity and NaN arithmetic.
+
+ ZERO (input) REAL
+ Must contain the value 0.0
+ This is passed to prevent the compiler from optimizing
+ away this code.
+
+ ONE (input) REAL
+ Must contain the value 1.0
+ This is passed to prevent the compiler from optimizing
+ away this code.
+
+ RETURN VALUE: INTEGER
+ = 0: Arithmetic failed to produce the correct answers
+ = 1: Arithmetic produced the correct answers
+*/
+
+ ret_val = 1;
+
+ posinf = *one / *zero;
+ if (posinf <= *one) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ neginf = -(*one) / *zero;
+ if (neginf >= *zero) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ negzro = *one / (neginf + *one);
+ if (negzro != *zero) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ neginf = *one / negzro;
+ if (neginf >= *zero) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ newzro = negzro + *zero;
+ if (newzro != *zero) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ posinf = *one / newzro;
+ if (posinf <= *one) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ neginf *= posinf;
+ if (neginf >= *zero) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ posinf *= posinf;
+ if (posinf <= *one) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+
+/* Return if we were only asked to check infinity arithmetic */
+
+ if (*ispec == 0) {
+ return ret_val;
+ }
+
+ nan1 = posinf + neginf;
+
+ nan2 = posinf / neginf;
+
+ nan3 = posinf / posinf;
+
+ nan4 = posinf * *zero;
+
+ nan5 = neginf * negzro;
+
+ nan6 = nan5 * *zero;
+
+ if (nan1 == nan1) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ if (nan2 == nan2) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ if (nan3 == nan3) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ if (nan4 == nan4) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ if (nan5 == nan5) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ if (nan6 == nan6) {
+ ret_val = 0;
+ return ret_val;
+ }
+
+ return ret_val;
+} /* ieeeck_ */
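+
+/*
+   Illustrative usage sketch (helper name is hypothetical): ieeeck_ is
+   normally reached through ilaenv_ (ISPEC = 10 or 11), but it can be
+   called directly.  ZERO and ONE are passed as variables, as the
+   documentation above requires, so the divisions are performed at run
+   time.
+*/
+static integer example_ieeeck(void)
+{
+    real zero = 0.f, one = 1.f;
+    integer ispec = 1;                   /* 1: check infinity and NaN */
+    return ieeeck_(&ispec, &zero, &one); /* 1 on IEEE-conforming hosts */
+}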
+
+integer ilaclc_(integer *m, integer *n, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1, i__2;
+
+ /* Local variables */
+ static integer i__;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILACLC scans A for its last non-zero column.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ ret_val = *n;
+ } else /* if(complicated condition) */ {
+ i__1 = *n * a_dim1 + 1;
+ i__2 = *m + *n * a_dim1;
+ if (a[i__1].r != 0.f || a[i__1].i != 0.f || (a[i__2].r != 0.f || a[
+ i__2].i != 0.f)) {
+ ret_val = *n;
+ } else {
+/* Now scan each column from the end, returning with the first non-zero. */
+ for (ret_val = *n; ret_val >= 1; --ret_val) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + ret_val * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+ return ret_val;
+ }
+ }
+ }
+ }
+ }
+ return ret_val;
+} /* ilaclc_ */
+
+integer ilaclr_(integer *m, integer *n, complex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILACLR scans A for its last non-zero row.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) COMPLEX array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*m == 0) {
+ ret_val = *m;
+ } else /* if(complicated condition) */ {
+ i__1 = *m + a_dim1;
+ i__2 = *m + *n * a_dim1;
+ if (a[i__1].r != 0.f || a[i__1].i != 0.f || (a[i__2].r != 0.f || a[
+ i__2].i != 0.f)) {
+ ret_val = *m;
+ } else {
+/* Scan up each column tracking the last zero row seen. */
+ ret_val = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ i__2 = i__ + j * a_dim1;
+ if (a[i__2].r != 0.f || a[i__2].i != 0.f) {
+ goto L10;
+ }
+ }
+L10:
+ ret_val = max(ret_val,i__);
+ }
+ }
+ }
+ return ret_val;
+} /* ilaclr_ */
+
+integer iladlc_(integer *m, integer *n, doublereal *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1;
+
+ /* Local variables */
+ static integer i__;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILADLC scans A for its last non-zero column.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ ret_val = *n;
+ } else if (a[*n * a_dim1 + 1] != 0. || a[*m + *n * a_dim1] != 0.) {
+ ret_val = *n;
+ } else {
+/* Now scan each column from the end, returning with the first non-zero. */
+ for (ret_val = *n; ret_val >= 1; --ret_val) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (a[i__ + ret_val * a_dim1] != 0.) {
+ return ret_val;
+ }
+ }
+ }
+ }
+ return ret_val;
+} /* iladlc_ */
+
+integer iladlr_(integer *m, integer *n, doublereal *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1;
+
+ /* Local variables */
+ static integer i__, j;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILADLR scans A for its last non-zero row.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*m == 0) {
+ ret_val = *m;
+ } else if (a[*m + a_dim1] != 0. || a[*m + *n * a_dim1] != 0.) {
+ ret_val = *m;
+ } else {
+/* Scan up each column tracking the last zero row seen. */
+ ret_val = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ if (a[i__ + j * a_dim1] != 0.) {
+ goto L10;
+ }
+ }
+L10:
+ ret_val = max(ret_val,i__);
+ }
+ }
+ return ret_val;
+} /* iladlr_ */
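+
+/*
+   Illustrative usage sketch (data and helper name are hypothetical):
+   find the last non-zero row and column of a small double precision
+   matrix with iladlr_/iladlc_.  Storage is column-major and the
+   returned indices are 1-based, as in the Fortran original.
+*/
+static void example_iladl(void)
+{
+    /* A = [ 1 0 0 ; 0 2 0 ; 0 0 0 ] stored by columns. */
+    doublereal a[9] = { 1., 0., 0.,   0., 2., 0.,   0., 0., 0. };
+    integer m = 3, n = 3, lda = 3;
+    integer last_row = iladlr_(&m, &n, a, &lda);   /* == 2 */
+    integer last_col = iladlc_(&m, &n, a, &lda);   /* == 2 */
+
+    (void)last_row;
+    (void)last_col;
+}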
+
+integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
+ integer *n2, integer *n3, integer *n4, ftnlen name_len, ftnlen
+ opts_len)
+{
+ /* System generated locals */
+ integer ret_val;
+
+ /* Local variables */
+ static integer i__;
+ static char c1[1], c2[2], c3[3], c4[2];
+ static integer ic, nb, iz, nx;
+ static logical cname;
+ static integer nbmin;
+ static logical sname;
+ extern integer ieeeck_(integer *, real *, real *);
+ static char subnam[6];
+ extern integer iparmq_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+
+ -- April 2009 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILAENV is called from the LAPACK routines to choose problem-dependent
+ parameters for the local environment. See ISPEC for a description of
+ the parameters.
+
+ ILAENV returns an INTEGER
+ if ILAENV >= 0: ILAENV returns the value of the parameter specified by ISPEC
+ if ILAENV < 0: if ILAENV = -k, the k-th argument had an illegal value.
+
+ This version provides a set of parameters which should give good,
+ but not optimal, performance on many of the currently available
+ computers. Users are encouraged to modify this subroutine to set
+ the tuning parameters for their particular machine using the option
+ and problem size information in the arguments.
+
+ This routine will not function correctly if it is converted to all
+ lower case. Converting it to all upper case is allowed.
+
+ Arguments
+ =========
+
+ ISPEC (input) INTEGER
+ Specifies the parameter to be returned as the value of
+ ILAENV.
+ = 1: the optimal blocksize; if this value is 1, an unblocked
+ algorithm will give the best performance.
+ = 2: the minimum block size for which the block routine
+ should be used; if the usable block size is less than
+ this value, an unblocked routine should be used.
+ = 3: the crossover point (in a block routine, for N less
+ than this value, an unblocked routine should be used)
+ = 4: the number of shifts, used in the nonsymmetric
+ eigenvalue routines (DEPRECATED)
+ = 5: the minimum column dimension for blocking to be used;
+ rectangular blocks must have dimension at least k by m,
+ where k is given by ILAENV(2,...) and m by ILAENV(5,...)
+ = 6: the crossover point for the SVD (when reducing an m by n
+ matrix to bidiagonal form, if max(m,n)/min(m,n) exceeds
+ this value, a QR factorization is used first to reduce
+ the matrix to a triangular form.)
+ = 7: the number of processors
+ = 8: the crossover point for the multishift QR method
+ for nonsymmetric eigenvalue problems (DEPRECATED)
+ = 9: maximum size of the subproblems at the bottom of the
+ computation tree in the divide-and-conquer algorithm
+ (used by xGELSD and xGESDD)
+ =10: ieee NaN arithmetic can be trusted not to trap
+ =11: infinity arithmetic can be trusted not to trap
+ 12 <= ISPEC <= 16:
+ xHSEQR or one of its subroutines,
+ see IPARMQ for detailed explanation
+
+ NAME (input) CHARACTER*(*)
+ The name of the calling subroutine, in either upper case or
+ lower case.
+
+ OPTS (input) CHARACTER*(*)
+ The character options to the subroutine NAME, concatenated
+ into a single character string. For example, UPLO = 'U',
+ TRANS = 'T', and DIAG = 'N' for a triangular routine would
+ be specified as OPTS = 'UTN'.
+
+ N1 (input) INTEGER
+ N2 (input) INTEGER
+ N3 (input) INTEGER
+ N4 (input) INTEGER
+ Problem dimensions for the subroutine NAME; these may not all
+ be required.
+
+ Further Details
+ ===============
+
+ The following conventions have been used when calling ILAENV from the
+ LAPACK routines:
+ 1) OPTS is a concatenation of all of the character options to
+ subroutine NAME, in the same order that they appear in the
+ argument list for NAME, even if they are not used in determining
+ the value of the parameter specified by ISPEC.
+ 2) The problem dimensions N1, N2, N3, N4 are specified in the order
+ that they appear in the argument list for NAME. N1 is used
+ first, N2 second, and so on, and unused problem dimensions are
+ passed a value of -1.
+ 3) The parameter value returned by ILAENV is checked for validity in
+ the calling subroutine. For example, ILAENV is used to retrieve
+ the optimal blocksize for STRTRI as follows:
+
+ NB = ILAENV( 1, 'STRTRI', UPLO // DIAG, N, -1, -1, -1 )
+ IF( NB.LE.1 ) NB = MAX( 1, N )
+
+ =====================================================================
+*/
+
+
+ switch (*ispec) {
+ case 1: goto L10;
+ case 2: goto L10;
+ case 3: goto L10;
+ case 4: goto L80;
+ case 5: goto L90;
+ case 6: goto L100;
+ case 7: goto L110;
+ case 8: goto L120;
+ case 9: goto L130;
+ case 10: goto L140;
+ case 11: goto L150;
+ case 12: goto L160;
+ case 13: goto L160;
+ case 14: goto L160;
+ case 15: goto L160;
+ case 16: goto L160;
+ }
+
+/* Invalid value for ISPEC */
+
+ ret_val = -1;
+ return ret_val;
+
+L10:
+
+/* Convert NAME to upper case if the first character is lower case. */
+
+ ret_val = 1;
+ s_copy(subnam, name__, (ftnlen)6, name_len);
+ ic = *(unsigned char *)subnam;
+ iz = 'Z';
+ if (iz == 90 || iz == 122) {
+
+/* ASCII character set */
+
+ if (ic >= 97 && ic <= 122) {
+ *(unsigned char *)subnam = (char) (ic - 32);
+ for (i__ = 2; i__ <= 6; ++i__) {
+ ic = *(unsigned char *)&subnam[i__ - 1];
+ if (ic >= 97 && ic <= 122) {
+ *(unsigned char *)&subnam[i__ - 1] = (char) (ic - 32);
+ }
+/* L20: */
+ }
+ }
+
+ } else if (iz == 233 || iz == 169) {
+
+/* EBCDIC character set */
+
+ if (ic >= 129 && ic <= 137 || ic >= 145 && ic <= 153 || ic >= 162 &&
+ ic <= 169) {
+ *(unsigned char *)subnam = (char) (ic + 64);
+ for (i__ = 2; i__ <= 6; ++i__) {
+ ic = *(unsigned char *)&subnam[i__ - 1];
+ if (ic >= 129 && ic <= 137 || ic >= 145 && ic <= 153 || ic >=
+ 162 && ic <= 169) {
+ *(unsigned char *)&subnam[i__ - 1] = (char) (ic + 64);
+ }
+/* L30: */
+ }
+ }
+
+ } else if (iz == 218 || iz == 250) {
+
+/* Prime machines: ASCII+128 */
+
+ if (ic >= 225 && ic <= 250) {
+ *(unsigned char *)subnam = (char) (ic - 32);
+ for (i__ = 2; i__ <= 6; ++i__) {
+ ic = *(unsigned char *)&subnam[i__ - 1];
+ if (ic >= 225 && ic <= 250) {
+ *(unsigned char *)&subnam[i__ - 1] = (char) (ic - 32);
+ }
+/* L40: */
+ }
+ }
+ }
+
+ *(unsigned char *)c1 = *(unsigned char *)subnam;
+ sname = *(unsigned char *)c1 == 'S' || *(unsigned char *)c1 == 'D';
+ cname = *(unsigned char *)c1 == 'C' || *(unsigned char *)c1 == 'Z';
+ if (! (cname || sname)) {
+ return ret_val;
+ }
+ s_copy(c2, subnam + 1, (ftnlen)2, (ftnlen)2);
+ s_copy(c3, subnam + 3, (ftnlen)3, (ftnlen)3);
+ s_copy(c4, c3 + 1, (ftnlen)2, (ftnlen)2);
+
+ switch (*ispec) {
+ case 1: goto L50;
+ case 2: goto L60;
+ case 3: goto L70;
+ }
+
+L50:
+
+/*
+ ISPEC = 1: block size
+
+ In these examples, separate code is provided for setting NB for
+ real and complex. We assume that NB will take the same value in
+ single or double precision.
+*/
+
+ nb = 1;
+
+ if (s_cmp(c2, "GE", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 64;
+ } else {
+ nb = 64;
+ }
+ } else if (s_cmp(c3, "QRF", (ftnlen)3, (ftnlen)3) == 0 || s_cmp(c3,
+ "RQF", (ftnlen)3, (ftnlen)3) == 0 || s_cmp(c3, "LQF", (ftnlen)
+ 3, (ftnlen)3) == 0 || s_cmp(c3, "QLF", (ftnlen)3, (ftnlen)3)
+ == 0) {
+ if (sname) {
+ nb = 32;
+ } else {
+ nb = 32;
+ }
+ } else if (s_cmp(c3, "HRD", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 32;
+ } else {
+ nb = 32;
+ }
+ } else if (s_cmp(c3, "BRD", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 32;
+ } else {
+ nb = 32;
+ }
+ } else if (s_cmp(c3, "TRI", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 64;
+ } else {
+ nb = 64;
+ }
+ }
+ } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 64;
+ } else {
+ nb = 64;
+ }
+ }
+ } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 64;
+ } else {
+ nb = 64;
+ }
+ } else if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
+ nb = 32;
+ } else if (sname && s_cmp(c3, "GST", (ftnlen)3, (ftnlen)3) == 0) {
+ nb = 64;
+ }
+ } else if (cname && s_cmp(c2, "HE", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ nb = 64;
+ } else if (s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
+ nb = 32;
+ } else if (s_cmp(c3, "GST", (ftnlen)3, (ftnlen)3) == 0) {
+ nb = 64;
+ }
+ } else if (sname && s_cmp(c2, "OR", (ftnlen)2, (ftnlen)2) == 0) {
+ if (*(unsigned char *)c3 == 'G') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nb = 32;
+ }
+ } else if (*(unsigned char *)c3 == 'M') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nb = 32;
+ }
+ }
+ } else if (cname && s_cmp(c2, "UN", (ftnlen)2, (ftnlen)2) == 0) {
+ if (*(unsigned char *)c3 == 'G') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nb = 32;
+ }
+ } else if (*(unsigned char *)c3 == 'M') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nb = 32;
+ }
+ }
+ } else if (s_cmp(c2, "GB", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ if (*n4 <= 64) {
+ nb = 1;
+ } else {
+ nb = 32;
+ }
+ } else {
+ if (*n4 <= 64) {
+ nb = 1;
+ } else {
+ nb = 32;
+ }
+ }
+ }
+ } else if (s_cmp(c2, "PB", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ if (*n2 <= 64) {
+ nb = 1;
+ } else {
+ nb = 32;
+ }
+ } else {
+ if (*n2 <= 64) {
+ nb = 1;
+ } else {
+ nb = 32;
+ }
+ }
+ }
+ } else if (s_cmp(c2, "TR", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRI", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 64;
+ } else {
+ nb = 64;
+ }
+ }
+ } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nb = 64;
+ } else {
+ nb = 64;
+ }
+ }
+ } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) {
+ nb = 1;
+ }
+ }
+ ret_val = nb;
+ return ret_val;
+
+L60:
+
+/* ISPEC = 2: minimum block size */
+
+ nbmin = 2;
+ if (s_cmp(c2, "GE", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "QRF", (ftnlen)3, (ftnlen)3) == 0 || s_cmp(c3, "RQF", (
+ ftnlen)3, (ftnlen)3) == 0 || s_cmp(c3, "LQF", (ftnlen)3, (
+ ftnlen)3) == 0 || s_cmp(c3, "QLF", (ftnlen)3, (ftnlen)3) == 0)
+ {
+ if (sname) {
+ nbmin = 2;
+ } else {
+ nbmin = 2;
+ }
+ } else if (s_cmp(c3, "HRD", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nbmin = 2;
+ } else {
+ nbmin = 2;
+ }
+ } else if (s_cmp(c3, "BRD", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nbmin = 2;
+ } else {
+ nbmin = 2;
+ }
+ } else if (s_cmp(c3, "TRI", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nbmin = 2;
+ } else {
+ nbmin = 2;
+ }
+ }
+ } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nbmin = 8;
+ } else {
+ nbmin = 8;
+ }
+ } else if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
+ nbmin = 2;
+ }
+ } else if (cname && s_cmp(c2, "HE", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
+ nbmin = 2;
+ }
+ } else if (sname && s_cmp(c2, "OR", (ftnlen)2, (ftnlen)2) == 0) {
+ if (*(unsigned char *)c3 == 'G') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nbmin = 2;
+ }
+ } else if (*(unsigned char *)c3 == 'M') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nbmin = 2;
+ }
+ }
+ } else if (cname && s_cmp(c2, "UN", (ftnlen)2, (ftnlen)2) == 0) {
+ if (*(unsigned char *)c3 == 'G') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nbmin = 2;
+ }
+ } else if (*(unsigned char *)c3 == 'M') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nbmin = 2;
+ }
+ }
+ }
+ ret_val = nbmin;
+ return ret_val;
+
+L70:
+
+/* ISPEC = 3: crossover point */
+
+ nx = 0;
+ if (s_cmp(c2, "GE", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "QRF", (ftnlen)3, (ftnlen)3) == 0 || s_cmp(c3, "RQF", (
+ ftnlen)3, (ftnlen)3) == 0 || s_cmp(c3, "LQF", (ftnlen)3, (
+ ftnlen)3) == 0 || s_cmp(c3, "QLF", (ftnlen)3, (ftnlen)3) == 0)
+ {
+ if (sname) {
+ nx = 128;
+ } else {
+ nx = 128;
+ }
+ } else if (s_cmp(c3, "HRD", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nx = 128;
+ } else {
+ nx = 128;
+ }
+ } else if (s_cmp(c3, "BRD", (ftnlen)3, (ftnlen)3) == 0) {
+ if (sname) {
+ nx = 128;
+ } else {
+ nx = 128;
+ }
+ }
+ } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
+ if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
+ nx = 32;
+ }
+ } else if (cname && s_cmp(c2, "HE", (ftnlen)2, (ftnlen)2) == 0) {
+ if (s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
+ nx = 32;
+ }
+ } else if (sname && s_cmp(c2, "OR", (ftnlen)2, (ftnlen)2) == 0) {
+ if (*(unsigned char *)c3 == 'G') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nx = 128;
+ }
+ }
+ } else if (cname && s_cmp(c2, "UN", (ftnlen)2, (ftnlen)2) == 0) {
+ if (*(unsigned char *)c3 == 'G') {
+ if (s_cmp(c4, "QR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "RQ",
+ (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)2, (
+ ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)2, (ftnlen)2) ==
+ 0 || s_cmp(c4, "HR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(
+ c4, "TR", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+ ftnlen)2, (ftnlen)2) == 0) {
+ nx = 128;
+ }
+ }
+ }
+ ret_val = nx;
+ return ret_val;
+
+L80:
+
+/* ISPEC = 4: number of shifts (used by xHSEQR) */
+
+ ret_val = 6;
+ return ret_val;
+
+L90:
+
+/* ISPEC = 5: minimum column dimension (not used) */
+
+ ret_val = 2;
+ return ret_val;
+
+L100:
+
+/* ISPEC = 6: crossover point for SVD (used by xGELSS and xGESVD) */
+
+ ret_val = (integer) ((real) min(*n1,*n2) * 1.6f);
+ return ret_val;
+
+L110:
+
+/* ISPEC = 7: number of processors (not used) */
+
+ ret_val = 1;
+ return ret_val;
+
+L120:
+
+/* ISPEC = 8: crossover point for multishift (used by xHSEQR) */
+
+ ret_val = 50;
+ return ret_val;
+
+L130:
+
+/*
+ ISPEC = 9: maximum size of the subproblems at the bottom of the
+ computation tree in the divide-and-conquer algorithm
+ (used by xGELSD and xGESDD)
+*/
+
+ ret_val = 25;
+ return ret_val;
+
+L140:
+
+/*
+ ISPEC = 10: ieee NaN arithmetic can be trusted not to trap
+
+ ILAENV = 0
+*/
+ ret_val = 1;
+ if (ret_val == 1) {
+ ret_val = ieeeck_(&c__1, &c_b172, &c_b173);
+ }
+ return ret_val;
+
+L150:
+
+/*
+ ISPEC = 11: infinity arithmetic can be trusted not to trap
+
+ ILAENV = 0
+*/
+ ret_val = 1;
+ if (ret_val == 1) {
+ ret_val = ieeeck_(&c__0, &c_b172, &c_b173);
+ }
+ return ret_val;
+
+L160:
+
+/* 12 <= ISPEC <= 16: xHSEQR or one of its subroutines. */
+
+ ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4, name_len, opts_len)
+ ;
+ return ret_val;
+
+/* End of ILAENV */
+
+} /* ilaenv_ */
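+
+/*
+   Illustrative usage sketch (helper name is hypothetical), mirroring
+   the documented convention NB = ILAENV( 1, 'STRTRI', UPLO // DIAG,
+   N, -1, -1, -1 ): query the optimal block size for DGETRF.  The two
+   trailing ftnlen arguments carry the string lengths, as in the calls
+   made by the generated code above.
+*/
+static integer example_ilaenv(void)
+{
+    integer ispec = 1;               /* 1: optimal block size */
+    integer n = 1000, unused = -1;   /* unused dimensions are passed -1 */
+    return ilaenv_(&ispec, "DGETRF", " ", &n, &n, &unused, &unused,
+                   (ftnlen)6, (ftnlen)1);   /* 64 with these defaults */
+}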
+
+integer ilaslc_(integer *m, integer *n, real *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1;
+
+ /* Local variables */
+ static integer i__;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILASLC scans A for its last non-zero column.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) REAL array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ ret_val = *n;
+ } else if (a[*n * a_dim1 + 1] != 0.f || a[*m + *n * a_dim1] != 0.f) {
+ ret_val = *n;
+ } else {
+/* Now scan each column from the end, returning with the first non-zero. */
+ for (ret_val = *n; ret_val >= 1; --ret_val) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (a[i__ + ret_val * a_dim1] != 0.f) {
+ return ret_val;
+ }
+ }
+ }
+ }
+ return ret_val;
+} /* ilaslc_ */
+
+integer ilaslr_(integer *m, integer *n, real *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1;
+
+ /* Local variables */
+ static integer i__, j;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILASLR scans A for its last non-zero row.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) REAL array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*m == 0) {
+ ret_val = *m;
+ } else if (a[*m + a_dim1] != 0.f || a[*m + *n * a_dim1] != 0.f) {
+ ret_val = *m;
+ } else {
+/* Scan up each column tracking the last zero row seen. */
+ ret_val = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ if (a[i__ + j * a_dim1] != 0.f) {
+ goto L10;
+ }
+ }
+L10:
+ ret_val = max(ret_val,i__);
+ }
+ }
+ return ret_val;
+} /* ilaslr_ */
+
+integer ilazlc_(integer *m, integer *n, doublecomplex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1, i__2;
+
+ /* Local variables */
+ static integer i__;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILAZLC scans A for its last non-zero column.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ ret_val = *n;
+ } else /* if(complicated condition) */ {
+ i__1 = *n * a_dim1 + 1;
+ i__2 = *m + *n * a_dim1;
+ if (a[i__1].r != 0. || a[i__1].i != 0. || (a[i__2].r != 0. || a[i__2]
+ .i != 0.)) {
+ ret_val = *n;
+ } else {
+/* Now scan each column from the end, returning with the first non-zero. */
+ for (ret_val = *n; ret_val >= 1; --ret_val) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + ret_val * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+ return ret_val;
+ }
+ }
+ }
+ }
+ }
+ return ret_val;
+} /* ilazlc_ */
+
+integer ilazlr_(integer *m, integer *n, doublecomplex *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, ret_val, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ ILAZLR scans A for its last non-zero row.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A.
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Quick test for the common case where one corner is non-zero.
+*/
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (*m == 0) {
+ ret_val = *m;
+ } else /* if(complicated condition) */ {
+ i__1 = *m + a_dim1;
+ i__2 = *m + *n * a_dim1;
+ if (a[i__1].r != 0. || a[i__1].i != 0. || (a[i__2].r != 0. || a[i__2]
+ .i != 0.)) {
+ ret_val = *m;
+ } else {
+/* Scan up each column tracking the last zero row seen. */
+ ret_val = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ for (i__ = *m; i__ >= 1; --i__) {
+ i__2 = i__ + j * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+ goto L10;
+ }
+ }
+L10:
+ ret_val = max(ret_val,i__);
+ }
+ }
+ }
+ return ret_val;
+} /* ilazlr_ */
+
+integer iparmq_(integer *ispec, char *name__, char *opts, integer *n, integer
+ *ilo, integer *ihi, integer *lwork, ftnlen name_len, ftnlen opts_len)
+{
+ /* System generated locals */
+ integer ret_val, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer nh, ns;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This program sets problem and machine dependent parameters
+ useful for xHSEQR and its subroutines. It is called whenever
+ ILAENV is called with 12 <= ISPEC <= 16
+
+ Arguments
+ =========
+
+ ISPEC (input) integer scalar
+ ISPEC specifies which tunable parameter IPARMQ should
+ return.
+
+ ISPEC=12: (INMIN) Matrices of order nmin or less
+ are sent directly to xLAHQR, the implicit
+ double shift QR algorithm. NMIN must be
+ at least 11.
+
+ ISPEC=13: (INWIN) Size of the deflation window.
+ This is best set greater than or equal to
+ the number of simultaneous shifts NS.
+ Larger matrices benefit from larger deflation
+ windows.
+
+ ISPEC=14: (INIBL) Determines when to stop nibbling and
+ invest in an (expensive) multi-shift QR sweep.
+ If the aggressive early deflation subroutine
+ finds LD converged eigenvalues from an order
+ NW deflation window and LD.GT.(NW*NIBBLE)/100,
+ then the next QR sweep is skipped and early
+ deflation is applied immediately to the
+ remaining active diagonal block. Setting
+ IPARMQ(ISPEC=14) = 0 causes TTQRE to skip a
+ multi-shift QR sweep whenever early deflation
+ finds a converged eigenvalue. Setting
+ IPARMQ(ISPEC=14) greater than or equal to 100
+ prevents TTQRE from skipping a multi-shift
+ QR sweep.
+
+ ISPEC=15: (NSHFTS) The number of simultaneous shifts in
+ a multi-shift QR iteration.
+
+ ISPEC=16: (IACC22) IPARMQ is set to 0, 1 or 2 with the
+ following meanings.
+ 0: During the multi-shift QR sweep,
+ xLAQR5 does not accumulate reflections and
+ does not use matrix-matrix multiply to
+ update the far-from-diagonal matrix
+ entries.
+ 1: During the multi-shift QR sweep,
+                         xLAQR5 accumulates reflections and uses
+ matrix-matrix multiply to update the
+ far-from-diagonal matrix entries.
+                      2: During the multi-shift QR sweep,
+ xLAQR5 accumulates reflections and takes
+ advantage of 2-by-2 block structure during
+ matrix-matrix multiplies.
+ (If xTRMM is slower than xGEMM, then
+ IPARMQ(ISPEC=16)=1 may be more efficient than
+ IPARMQ(ISPEC=16)=2 despite the greater level of
+ arithmetic work implied by the latter choice.)
+
+ NAME (input) character string
+ Name of the calling subroutine
+
+ OPTS (input) character string
+ This is a concatenation of the string arguments to
+ TTQRE.
+
+ N (input) integer scalar
+ N is the order of the Hessenberg matrix H.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular
+ in rows and columns 1:ILO-1 and IHI+1:N.
+
+ LWORK (input) integer scalar
+ The amount of workspace available.
+
+ Further Details
+ ===============
+
+ Little is known about how best to choose these parameters.
+ It is possible to use different values of the parameters
+ for each of CHSEQR, DHSEQR, SHSEQR and ZHSEQR.
+
+ It is probably best to choose different parameters for
+ different matrices and different parameters at different
+ times during the iteration, but this has not been
+ implemented --- yet.
+
+
+ The best choices of most of the parameters depend
+ in an ill-understood way on the relative execution
+ rate of xLAQR3 and xLAQR5 and on the nature of each
+ particular eigenvalue problem. Experiment may be the
+ only practical way to determine which choices are most
+ effective.
+
+ Following is a list of default values supplied by IPARMQ.
+ These defaults may be adjusted in order to attain better
+ performance in any particular computational environment.
+
+ IPARMQ(ISPEC=12) The xLAHQR vs xLAQR0 crossover point.
+ Default: 75. (Must be at least 11.)
+
+ IPARMQ(ISPEC=13) Recommended deflation window size.
+ This depends on ILO, IHI and NS, the
+ number of simultaneous shifts returned
+ by IPARMQ(ISPEC=15). The default for
+ (IHI-ILO+1).LE.500 is NS. The default
+ for (IHI-ILO+1).GT.500 is 3*NS/2.
+
+ IPARMQ(ISPEC=14) Nibble crossover point. Default: 14.
+
+    IPARMQ(ISPEC=15) Number of simultaneous shifts, NS, used in
+ a multi-shift QR iteration.
+
+ If IHI-ILO+1 is ...
+
+ greater than ...but less ... the
+ or equal to ... than default is
+
+ 0 30 NS = 2+
+ 30 60 NS = 4+
+ 60 150 NS = 10
+ 150 590 NS = **
+ 590 3000 NS = 64
+ 3000 6000 NS = 128
+ 6000 infinity NS = 256
+
+ (+) By default matrices of this order are
+ passed to the implicit double shift routine
+ xLAHQR. See IPARMQ(ISPEC=12) above. These
+ values of NS are used only in case of a rare
+ xLAHQR failure.
+
+ (**) The asterisks (**) indicate an ad-hoc
+ function increasing from 10 to 64.
+
+ IPARMQ(ISPEC=16) Select structured matrix multiply.
+ (See ISPEC=16 above for details.)
+ Default: 3.
+
+ ================================================================
+*/
+ if (*ispec == 15 || *ispec == 13 || *ispec == 16) {
+
+/* ==== Set the number of simultaneous shifts ==== */
+
+ nh = *ihi - *ilo + 1;
+ ns = 2;
+ if (nh >= 30) {
+ ns = 4;
+ }
+ if (nh >= 60) {
+ ns = 10;
+ }
+ if (nh >= 150) {
+/* Computing MAX */
+ r__1 = log((real) nh) / log(2.f);
+ i__1 = 10, i__2 = nh / i_nint(&r__1);
+ ns = max(i__1,i__2);
+ }
+ if (nh >= 590) {
+ ns = 64;
+ }
+ if (nh >= 3000) {
+ ns = 128;
+ }
+ if (nh >= 6000) {
+ ns = 256;
+ }
+/* Computing MAX */
+ i__1 = 2, i__2 = ns - ns % 2;
+ ns = max(i__1,i__2);
+ }
+
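+/*
+   Editorial worked example (not part of the translated routine): for a
+   submatrix of order NH = IHI - ILO + 1 = 200, the block above computes
+   log(200)/log(2) ~= 7.6, which i_nint rounds to 8, so
+   NS = max(10, 200/8) = 25; the final adjustment NS = max(2, NS - NS%2)
+   rounds this down to the even value 24, consistent with the "ad-hoc
+   function increasing from 10 to 64" described for the 150..590 range
+   in the table above.
+*/
+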
+ if (*ispec == 12) {
+
+
+/*
+ ===== Matrices of order smaller than NMIN get sent
+ . to xLAHQR, the classic double shift algorithm.
+ . This must be at least 11. ====
+*/
+
+ ret_val = 75;
+
+ } else if (*ispec == 14) {
+
+/*
+      ==== INIBL: skip a multi-shift qr iteration
+ . whenever aggressive early deflation finds
+ . at least (NIBBLE*(window size)/100) deflations. ====
+*/
+
+ ret_val = 14;
+
+ } else if (*ispec == 15) {
+
+/* ==== NSHFTS: The number of simultaneous shifts ===== */
+
+ ret_val = ns;
+
+ } else if (*ispec == 13) {
+
+/* ==== NW: deflation window size. ==== */
+
+ if (nh <= 500) {
+ ret_val = ns;
+ } else {
+ ret_val = ns * 3 / 2;
+ }
+
+ } else if (*ispec == 16) {
+
+/*
+ ==== IACC22: Whether to accumulate reflections
+ . before updating the far-from-diagonal elements
+ . and whether to use 2-by-2 block structure while
+ . doing it. A small amount of work could be saved
+ . by making this choice dependent also upon the
+ . NH=IHI-ILO+1.
+*/
+
+ ret_val = 0;
+ if (ns >= 14) {
+ ret_val = 1;
+ }
+ if (ns >= 14) {
+ ret_val = 2;
+ }
+
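+/*
+          Editorial note: both thresholds above are the constant 14
+          (the KACMIN and K22MIN parameters of the reference Fortran
+          IPARMQ), so the net effect of this translation is that
+          IACC22 = 2 is returned whenever NS >= 14 and 0 otherwise;
+          the intermediate value 1 is never the final result.
+*/
+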
+ } else {
+/* ===== invalid value of ispec ===== */
+ ret_val = -1;
+
+ }
+
+/* ==== End of IPARMQ ==== */
+
+ return ret_val;
+} /* iparmq_ */
+
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.f.patch b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.f.patch
new file mode 100644
index 00000000000..c743c1f627c
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_lapack.f.patch
@@ -0,0 +1,48 @@
+@@ -267,9 +267,10 @@
+ Scan up each column tracking the last zero row seen.
+ ILACLR = 0
+ DO J = 1, N
+ DO I = M, 1, -1
+- IF( A(I, J).NE.ZERO ) EXIT
++ IF( A(I, J).NE.ZERO ) GO TO 10
+ END DO
++ 10 CONTINUE
+ ILACLR = MAX( ILACLR, I )
+ END DO
+ END IF
+@@ -395,9 +396,10 @@
+ Scan up each column tracking the last zero row seen.
+ ILADLR = 0
+ DO J = 1, N
+ DO I = M, 1, -1
+- IF( A(I, J).NE.ZERO ) EXIT
++ IF( A(I, J).NE.ZERO ) GO TO 10
+ END DO
++ 10 CONTINUE
+ ILADLR = MAX( ILADLR, I )
+ END DO
+ END IF
+@@ -1078,9 +1080,10 @@
+ Scan up each column tracking the last zero row seen.
+ ILASLR = 0
+ DO J = 1, N
+ DO I = M, 1, -1
+- IF( A(I, J).NE.ZERO ) EXIT
++ IF( A(I, J).NE.ZERO ) GO TO 10
+ END DO
++ 10 CONTINUE
+ ILASLR = MAX( ILASLR, I )
+ END DO
+ END IF
+@@ -1206,9 +1209,10 @@
+ Scan up each column tracking the last zero row seen.
+ ILAZLR = 0
+ DO J = 1, N
+ DO I = M, 1, -1
+- IF( A(I, J).NE.ZERO ) EXIT
++ IF( A(I, J).NE.ZERO ) GO TO 10
+ END DO
++ 10 CONTINUE
+ ILAZLR = MAX( ILAZLR, I )
+ END DO
+ END IF
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.c
new file mode 100644
index 00000000000..fccb1f58b55
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.c
@@ -0,0 +1,41691 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static integer c__9 = 9;
+static integer c__0 = 0;
+static real c_b15 = 1.f;
+static integer c__1 = 1;
+static real c_b29 = 0.f;
+static doublereal c_b94 = -.125;
+static real c_b151 = -1.f;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+static integer c__65 = 65;
+static integer c__6 = 6;
+static integer c__12 = 12;
+static integer c__49 = 49;
+static integer c__4 = 4;
+static logical c_false = FALSE_;
+static integer c__13 = 13;
+static integer c__15 = 15;
+static integer c__14 = 14;
+static integer c__16 = 16;
+static logical c_true = TRUE_;
+static real c_b3178 = 2.f;
+
+/* Subroutine */ int sbdsdc_(char *uplo, char *compq, integer *n, real *d__,
+ real *e, real *u, integer *ldu, real *vt, integer *ldvt, real *q,
+ integer *iq, real *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, vt_dim1, vt_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static real p, r__;
+ static integer z__, ic, ii, kk;
+ static real cs;
+ static integer is, iu;
+ static real sn;
+ static integer nm1;
+ static real eps;
+ static integer ivt, difl, difr, ierr, perm, mlvl, sqre;
+ extern logical lsame_(char *, char *);
+ static integer poles;
+ extern /* Subroutine */ int slasr_(char *, char *, char *, integer *,
+ integer *, real *, real *, real *, integer *);
+ static integer iuplo, nsize, start;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), sswap_(integer *, real *, integer *, real *, integer *
+ ), slasd0_(integer *, integer *, real *, real *, real *, integer *
+ , real *, integer *, integer *, integer *, real *, integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int slasda_(integer *, integer *, integer *,
+ integer *, real *, real *, real *, integer *, real *, integer *,
+ real *, real *, real *, real *, integer *, integer *, integer *,
+ integer *, real *, real *, real *, real *, integer *, integer *),
+ xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *);
+ static integer givcol;
+ extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, real *, real *, real *, integer *, real *
+ , integer *, real *, integer *, real *, integer *);
+ static integer icompq;
+ extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *,
+ real *, real *, integer *), slartg_(real *, real *, real *
+ , real *, real *);
+ static real orgnrm;
+ static integer givnum;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ static integer givptr, qstart, smlsiz, wstart, smlszp;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SBDSDC computes the singular value decomposition (SVD) of a real
+ N-by-N (upper or lower) bidiagonal matrix B: B = U * S * VT,
+ using a divide and conquer method, where S is a diagonal matrix
+ with non-negative diagonal elements (the singular values of B), and
+ U and VT are orthogonal matrices of left and right singular vectors,
+ respectively. SBDSDC can be used to compute all singular values,
+ and optionally, singular vectors or singular vectors in compact form.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none. See SLASD3 for details.
+
+ The code currently calls SLASDQ if singular values only are desired.
+ However, it can be slightly modified to compute singular values
+ using the divide and conquer method.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': B is upper bidiagonal.
+ = 'L': B is lower bidiagonal.
+
+ COMPQ (input) CHARACTER*1
+ Specifies whether singular vectors are to be computed
+ as follows:
+ = 'N': Compute singular values only;
+ = 'P': Compute singular values and compute singular
+ vectors in compact form;
+ = 'I': Compute singular values and singular vectors.
+
+ N (input) INTEGER
+ The order of the matrix B. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the n diagonal elements of the bidiagonal matrix B.
+ On exit, if INFO=0, the singular values of B.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the elements of E contain the offdiagonal
+ elements of the bidiagonal matrix whose SVD is desired.
+ On exit, E has been destroyed.
+
+ U (output) REAL array, dimension (LDU,N)
+ If COMPQ = 'I', then:
+ On exit, if INFO = 0, U contains the left singular vectors
+ of the bidiagonal matrix.
+ For other values of COMPQ, U is not referenced.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= 1.
+ If singular vectors are desired, then LDU >= max( 1, N ).
+
+ VT (output) REAL array, dimension (LDVT,N)
+ If COMPQ = 'I', then:
+ On exit, if INFO = 0, VT' contains the right singular
+ vectors of the bidiagonal matrix.
+ For other values of COMPQ, VT is not referenced.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= 1.
+ If singular vectors are desired, then LDVT >= max( 1, N ).
+
+ Q (output) REAL array, dimension (LDQ)
+ If COMPQ = 'P', then:
+ On exit, if INFO = 0, Q and IQ contain the left
+ and right singular vectors in a compact form,
+ requiring O(N log N) space instead of 2*N**2.
+ In particular, Q contains all the REAL data in
+ LDQ >= N*(11 + 2*SMLSIZ + 8*INT(LOG_2(N/(SMLSIZ+1))))
+ words of memory, where SMLSIZ is returned by ILAENV and
+ is equal to the maximum size of the subproblems at the
+ bottom of the computation tree (usually about 25).
+ For other values of COMPQ, Q is not referenced.
+
+ IQ (output) INTEGER array, dimension (LDIQ)
+ If COMPQ = 'P', then:
+ On exit, if INFO = 0, Q and IQ contain the left
+ and right singular vectors in a compact form,
+ requiring O(N log N) space instead of 2*N**2.
+ In particular, IQ contains all INTEGER data in
+ LDIQ >= N*(3 + 3*INT(LOG_2(N/(SMLSIZ+1))))
+ words of memory, where SMLSIZ is returned by ILAENV and
+ is equal to the maximum size of the subproblems at the
+ bottom of the computation tree (usually about 25).
+ For other values of COMPQ, IQ is not referenced.
+
+ WORK (workspace) REAL array, dimension (MAX(1,LWORK))
+ If COMPQ = 'N' then LWORK >= (4 * N).
+ If COMPQ = 'P' then LWORK >= (6 * N).
+ If COMPQ = 'I' then LWORK >= (3 * N**2 + 4 * N).
+
+ IWORK (workspace) INTEGER array, dimension (8*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute a singular value.
+ The update process of divide and conquer failed.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+ =====================================================================
+ Changed dimension statement in comment describing E from (N) to
+ (N-1). Sven, 17 Feb 05.
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --q;
+ --iq;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ iuplo = 0;
+ if (lsame_(uplo, "U")) {
+ iuplo = 1;
+ }
+ if (lsame_(uplo, "L")) {
+ iuplo = 2;
+ }
+ if (lsame_(compq, "N")) {
+ icompq = 0;
+ } else if (lsame_(compq, "P")) {
+ icompq = 1;
+ } else if (lsame_(compq, "I")) {
+ icompq = 2;
+ } else {
+ icompq = -1;
+ }
+ if (iuplo == 0) {
+ *info = -1;
+ } else if (icompq < 0) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ldu < 1 || icompq == 2 && *ldu < *n) {
+ *info = -7;
+ } else if (*ldvt < 1 || icompq == 2 && *ldvt < *n) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SBDSDC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ smlsiz = ilaenv_(&c__9, "SBDSDC", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+ if (*n == 1) {
+ if (icompq == 1) {
+ q[1] = r_sign(&c_b15, &d__[1]);
+ q[smlsiz * *n + 1] = 1.f;
+ } else if (icompq == 2) {
+ u[u_dim1 + 1] = r_sign(&c_b15, &d__[1]);
+ vt[vt_dim1 + 1] = 1.f;
+ }
+ d__[1] = dabs(d__[1]);
+ return 0;
+ }
+ nm1 = *n - 1;
+
+/*
+ If matrix lower bidiagonal, rotate to be upper bidiagonal
+ by applying Givens rotations on the left
+*/
+
+ wstart = 1;
+ qstart = 3;
+ if (icompq == 1) {
+ scopy_(n, &d__[1], &c__1, &q[1], &c__1);
+ i__1 = *n - 1;
+ scopy_(&i__1, &e[1], &c__1, &q[*n + 1], &c__1);
+ }
+ if (iuplo == 2) {
+ qstart = 5;
+ wstart = (*n << 1) - 1;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (icompq == 1) {
+ q[i__ + (*n << 1)] = cs;
+ q[i__ + *n * 3] = sn;
+ } else if (icompq == 2) {
+ work[i__] = cs;
+ work[nm1 + i__] = -sn;
+ }
+/* L10: */
+ }
+ }
+
+/* If ICOMPQ = 0, use SLASDQ to compute the singular values. */
+
+ if (icompq == 0) {
+ slasdq_("U", &c__0, n, &c__0, &c__0, &c__0, &d__[1], &e[1], &vt[
+ vt_offset], ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[
+ wstart], info);
+ goto L40;
+ }
+
+/*
+ If N is smaller than the minimum divide size SMLSIZ, then solve
+ the problem with another solver.
+*/
+
+ if (*n <= smlsiz) {
+ if (icompq == 2) {
+ slaset_("A", n, n, &c_b29, &c_b15, &u[u_offset], ldu);
+ slaset_("A", n, n, &c_b29, &c_b15, &vt[vt_offset], ldvt);
+ slasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &vt[vt_offset]
+ , ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[
+ wstart], info);
+ } else if (icompq == 1) {
+ iu = 1;
+ ivt = iu + *n;
+ slaset_("A", n, n, &c_b29, &c_b15, &q[iu + (qstart - 1) * *n], n);
+ slaset_("A", n, n, &c_b29, &c_b15, &q[ivt + (qstart - 1) * *n], n);
+ slasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &q[ivt + (
+ qstart - 1) * *n], n, &q[iu + (qstart - 1) * *n], n, &q[
+ iu + (qstart - 1) * *n], n, &work[wstart], info);
+ }
+ goto L40;
+ }
+
+ if (icompq == 2) {
+ slaset_("A", n, n, &c_b29, &c_b15, &u[u_offset], ldu);
+ slaset_("A", n, n, &c_b29, &c_b15, &vt[vt_offset], ldvt);
+ }
+
+/* Scale. */
+
+ orgnrm = slanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.f) {
+ return 0;
+ }
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, &ierr);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1, &
+ ierr);
+
+ eps = slamch_("Epsilon");
+
+ mlvl = (integer) (log((real) (*n) / (real) (smlsiz + 1)) / log(2.f)) + 1;
+ smlszp = smlsiz + 1;
+
+ if (icompq == 1) {
+ iu = 1;
+ ivt = smlsiz + 1;
+ difl = ivt + smlszp;
+ difr = difl + mlvl;
+ z__ = difr + (mlvl << 1);
+ ic = z__ + mlvl;
+ is = ic + 1;
+ poles = is + 1;
+ givnum = poles + (mlvl << 1);
+
+ k = 1;
+ givptr = 2;
+ perm = 3;
+ givcol = perm + mlvl;
+ }
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = d__[i__], dabs(r__1)) < eps) {
+ d__[i__] = r_sign(&eps, &d__[i__]);
+ }
+/* L20: */
+ }
+
+ start = 1;
+ sqre = 0;
+
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) {
+
+/*
+ Subproblem found. First determine its size and then
+ apply divide and conquer on it.
+*/
+
+ if (i__ < nm1) {
+
+/* A subproblem with E(I) small for I < NM1. */
+
+ nsize = i__ - start + 1;
+ } else if ((r__1 = e[i__], dabs(r__1)) >= eps) {
+
+/* A subproblem with E(NM1) not too small but I = NM1. */
+
+ nsize = *n - start + 1;
+ } else {
+
+/*
+ A subproblem with E(NM1) small. This implies an
+ 1-by-1 subproblem at D(N). Solve this 1-by-1 problem
+ first.
+*/
+
+ nsize = i__ - start + 1;
+ if (icompq == 2) {
+ u[*n + *n * u_dim1] = r_sign(&c_b15, &d__[*n]);
+ vt[*n + *n * vt_dim1] = 1.f;
+ } else if (icompq == 1) {
+ q[*n + (qstart - 1) * *n] = r_sign(&c_b15, &d__[*n]);
+ q[*n + (smlsiz + qstart - 1) * *n] = 1.f;
+ }
+ d__[*n] = (r__1 = d__[*n], dabs(r__1));
+ }
+ if (icompq == 2) {
+ slasd0_(&nsize, &sqre, &d__[start], &e[start], &u[start +
+ start * u_dim1], ldu, &vt[start + start * vt_dim1],
+ ldvt, &smlsiz, &iwork[1], &work[wstart], info);
+ } else {
+ slasda_(&icompq, &smlsiz, &nsize, &sqre, &d__[start], &e[
+ start], &q[start + (iu + qstart - 2) * *n], n, &q[
+ start + (ivt + qstart - 2) * *n], &iq[start + k * *n],
+ &q[start + (difl + qstart - 2) * *n], &q[start + (
+ difr + qstart - 2) * *n], &q[start + (z__ + qstart -
+ 2) * *n], &q[start + (poles + qstart - 2) * *n], &iq[
+ start + givptr * *n], &iq[start + givcol * *n], n, &
+ iq[start + perm * *n], &q[start + (givnum + qstart -
+ 2) * *n], &q[start + (ic + qstart - 2) * *n], &q[
+ start + (is + qstart - 2) * *n], &work[wstart], &
+ iwork[1], info);
+ }
+ if (*info != 0) {
+ return 0;
+ }
+ start = i__ + 1;
+ }
+/* L30: */
+ }
+
+/* Unscale */
+
+ slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, &ierr);
+L40:
+
+/* Use Selection Sort to minimize swaps of singular vectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ kk = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] > p) {
+ kk = j;
+ p = d__[j];
+ }
+/* L50: */
+ }
+ if (kk != i__) {
+ d__[kk] = d__[i__];
+ d__[i__] = p;
+ if (icompq == 1) {
+ iq[i__] = kk;
+ } else if (icompq == 2) {
+ sswap_(n, &u[i__ * u_dim1 + 1], &c__1, &u[kk * u_dim1 + 1], &
+ c__1);
+ sswap_(n, &vt[i__ + vt_dim1], ldvt, &vt[kk + vt_dim1], ldvt);
+ }
+ } else if (icompq == 1) {
+ iq[i__] = i__;
+ }
+/* L60: */
+ }
+
+/* If ICOMPQ = 1, use IQ(N,1) as the indicator for UPLO */
+
+ if (icompq == 1) {
+ if (iuplo == 1) {
+ iq[*n] = 1;
+ } else {
+ iq[*n] = 0;
+ }
+ }
+
+/*
+ If B is lower bidiagonal, update U by those Givens rotations
+ which rotated B to be upper bidiagonal
+*/
+
+ if (iuplo == 2 && icompq == 2) {
+ slasr_("L", "V", "B", n, n, &work[1], &work[*n], &u[u_offset], ldu);
+ }
+
+ return 0;
+
+/* End of SBDSDC */
+
+} /* sbdsdc_ */
+
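+/*
+   A minimal usage sketch (editorial illustration only, not part of the
+   generated translation): full SVD of a small 3-by-3 upper bidiagonal
+   matrix via SBDSDC with COMPQ = 'I'.  Workspace sizes follow the rules
+   documented above (LWORK >= 3*N**2 + 4*N real words, IWORK of size
+   8*N); Q and IQ are only small dummies here because they are not
+   referenced when COMPQ = 'I'.
+
+       integer n = 3, ldu = 3, ldvt = 3, info = 0;
+       real d[3] = {3.f, 2.f, 1.f};            diagonal of B
+       real e[2] = {0.5f, 0.25f};              superdiagonal of B
+       real u[9], vt[9], q[1], work[3*9 + 4*3];
+       integer iq[1], iwork[8*3];
+
+       sbdsdc_("U", "I", &n, d, e, u, &ldu, vt, &ldvt,
+               q, iq, work, iwork, &info);
+
+   On return with info == 0, d holds the singular values in decreasing
+   order, u the left singular vectors, and vt holds V**T.
+*/
+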
+/* Subroutine */ int sbdsqr_(char *uplo, integer *n, integer *ncvt, integer *
+ nru, integer *ncc, real *d__, real *e, real *vt, integer *ldvt, real *
+ u, integer *ldu, real *c__, integer *ldc, real *work, integer *info)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2;
+ real r__1, r__2, r__3, r__4;
+ doublereal d__1;
+
+ /* Local variables */
+ static real f, g, h__;
+ static integer i__, j, m;
+ static real r__, cs;
+ static integer ll;
+ static real sn, mu;
+ static integer nm1, nm12, nm13, lll;
+ static real eps, sll, tol, abse;
+ static integer idir;
+ static real abss;
+ static integer oldm;
+ static real cosl;
+ static integer isub, iter;
+ static real unfl, sinl, cosr, smin, smax, sinr;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *), slas2_(real *, real *, real *, real *,
+ real *);
+ extern logical lsame_(char *, char *);
+ static real oldcs;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static integer oldll;
+ static real shift, sigmn, oldsn;
+ static integer maxit;
+ static real sminl;
+ extern /* Subroutine */ int slasr_(char *, char *, char *, integer *,
+ integer *, real *, real *, real *, integer *);
+ static real sigmx;
+ static logical lower;
+ extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *,
+ integer *), slasq1_(integer *, real *, real *, real *, integer *),
+ slasv2_(real *, real *, real *, real *, real *, real *, real *,
+ real *, real *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real sminoa;
+ extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real *
+ );
+ static real thresh;
+ static logical rotate;
+ static real tolmul;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ January 2007
+
+
+ Purpose
+ =======
+
+ SBDSQR computes the singular values and, optionally, the right and/or
+ left singular vectors from the singular value decomposition (SVD) of
+ a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+ zero-shift QR algorithm. The SVD of B has the form
+
+ B = Q * S * P**T
+
+ where S is the diagonal matrix of singular values, Q is an orthogonal
+ matrix of left singular vectors, and P is an orthogonal matrix of
+ right singular vectors. If left singular vectors are requested, this
+ subroutine actually returns U*Q instead of Q, and, if right singular
+ vectors are requested, this subroutine returns P**T*VT instead of
+ P**T, for given real input matrices U and VT. When U and VT are the
+ orthogonal matrices that reduce a general matrix A to bidiagonal
+ form: A = U*B*VT, as computed by SGEBRD, then
+
+ A = (U*Q) * S * (P**T*VT)
+
+ is the SVD of A. Optionally, the subroutine may also compute Q**T*C
+ for a given real input matrix C.
+
+ See "Computing Small Singular Values of Bidiagonal Matrices With
+ Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+ LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+ no. 5, pp. 873-912, Sept 1990) and
+ "Accurate singular values and differential qd algorithms," by
+ B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+ Department, University of California at Berkeley, July 1992
+ for a detailed description of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': B is upper bidiagonal;
+ = 'L': B is lower bidiagonal.
+
+ N (input) INTEGER
+ The order of the matrix B. N >= 0.
+
+ NCVT (input) INTEGER
+ The number of columns of the matrix VT. NCVT >= 0.
+
+ NRU (input) INTEGER
+ The number of rows of the matrix U. NRU >= 0.
+
+ NCC (input) INTEGER
+ The number of columns of the matrix C. NCC >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the n diagonal elements of the bidiagonal matrix B.
+ On exit, if INFO=0, the singular values of B in decreasing
+ order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the N-1 offdiagonal elements of the bidiagonal
+ matrix B.
+ On exit, if INFO = 0, E is destroyed; if INFO > 0, D and E
+ will contain the diagonal and superdiagonal elements of a
+ bidiagonal matrix orthogonally equivalent to the one given
+ as input.
+
+ VT (input/output) REAL array, dimension (LDVT, NCVT)
+ On entry, an N-by-NCVT matrix VT.
+ On exit, VT is overwritten by P**T * VT.
+ Not referenced if NCVT = 0.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT.
+ LDVT >= max(1,N) if NCVT > 0; LDVT >= 1 if NCVT = 0.
+
+ U (input/output) REAL array, dimension (LDU, N)
+ On entry, an NRU-by-N matrix U.
+ On exit, U is overwritten by U * Q.
+ Not referenced if NRU = 0.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= max(1,NRU).
+
+ C (input/output) REAL array, dimension (LDC, NCC)
+ On entry, an N-by-NCC matrix C.
+ On exit, C is overwritten by Q**T * C.
+ Not referenced if NCC = 0.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C.
+ LDC >= max(1,N) if NCC > 0; LDC >=1 if NCC = 0.
+
+ WORK (workspace) REAL array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: If INFO = -i, the i-th argument had an illegal value
+ > 0:
+ if NCVT = NRU = NCC = 0,
+ = 1, a split was marked by a positive value in E
+ = 2, current block of Z not diagonalized after 30*N
+ iterations (in inner while loop)
+ = 3, termination criterion of outer while loop not met
+ (program created more than N unreduced blocks)
+             else, if any of NCVT, NRU, NCC is nonzero,
+ the algorithm did not converge; D and E contain the
+ elements of a bidiagonal matrix which is orthogonally
+ similar to the input matrix B; if INFO = i, i
+ elements of E have not converged to zero.
+
+ Internal Parameters
+ ===================
+
+ TOLMUL REAL, default = max(10,min(100,EPS**(-1/8)))
+ TOLMUL controls the convergence criterion of the QR loop.
+ If it is positive, TOLMUL*EPS is the desired relative
+ precision in the computed singular values.
+ If it is negative, abs(TOLMUL*EPS*sigma_max) is the
+ desired absolute accuracy in the computed singular
+ values (corresponds to relative accuracy
+            abs(TOLMUL*EPS) in the largest singular value.)
+ abs(TOLMUL) should be between 1 and 1/EPS, and preferably
+ between 10 (for fast convergence) and .1/EPS
+ (for there to be some accuracy in the results).
+            Default is to lose either one eighth or 2 of the
+ available decimal digits in each computed singular value
+ (whichever is smaller).
+
+ MAXITR INTEGER, default = 6
+ MAXITR controls the maximum number of passes of the
+            algorithm through its inner loop. The algorithm stops
+ (and so fails to converge) if the number of passes
+ through the inner loop exceeds MAXITR*N**2.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ lower = lsame_(uplo, "L");
+ if (! lsame_(uplo, "U") && ! lower) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ncvt < 0) {
+ *info = -3;
+ } else if (*nru < 0) {
+ *info = -4;
+ } else if (*ncc < 0) {
+ *info = -5;
+ } else if (*ncvt == 0 && *ldvt < 1 || *ncvt > 0 && *ldvt < max(1,*n)) {
+ *info = -9;
+ } else if (*ldu < max(1,*nru)) {
+ *info = -11;
+ } else if (*ncc == 0 && *ldc < 1 || *ncc > 0 && *ldc < max(1,*n)) {
+ *info = -13;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SBDSQR", &i__1);
+ return 0;
+ }
+ if (*n == 0) {
+ return 0;
+ }
+ if (*n == 1) {
+ goto L160;
+ }
+
+/* ROTATE is true if any singular vectors desired, false otherwise */
+
+ rotate = *ncvt > 0 || *nru > 0 || *ncc > 0;
+
+/* If no singular vectors desired, use qd algorithm */
+
+ if (! rotate) {
+ slasq1_(n, &d__[1], &e[1], &work[1], info);
+ return 0;
+ }
+
+ nm1 = *n - 1;
+ nm12 = nm1 + nm1;
+ nm13 = nm12 + nm1;
+ idir = 0;
+
+/* Get machine constants */
+
+ eps = slamch_("Epsilon");
+ unfl = slamch_("Safe minimum");
+
+/*
+ If matrix lower bidiagonal, rotate to be upper bidiagonal
+ by applying Givens rotations on the left
+*/
+
+ if (lower) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ work[i__] = cs;
+ work[nm1 + i__] = sn;
+/* L10: */
+ }
+
+/* Update singular vectors if desired */
+
+ if (*nru > 0) {
+ slasr_("R", "V", "F", nru, n, &work[1], &work[*n], &u[u_offset],
+ ldu);
+ }
+ if (*ncc > 0) {
+ slasr_("L", "V", "F", n, ncc, &work[1], &work[*n], &c__[c_offset],
+ ldc);
+ }
+ }
+
+/*
+ Compute singular values to relative accuracy TOL
+ (By setting TOL to be negative, algorithm will compute
+ singular values to absolute accuracy ABS(TOL)*norm(input matrix))
+
+ Computing MAX
+ Computing MIN
+*/
+ d__1 = (doublereal) eps;
+ r__3 = 100.f, r__4 = pow_dd(&d__1, &c_b94);
+ r__1 = 10.f, r__2 = dmin(r__3,r__4);
+ tolmul = dmax(r__1,r__2);
+ tol = tolmul * eps;
+
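+/*
+   Editorial worked example: in IEEE single precision EPS ~= 1.19e-7
+   (2**-23), so EPS**(-1/8) = 2**(23/8) ~= 7.3; min(100, 7.3) = 7.3 and
+   max(10, 7.3) = 10, hence TOLMUL defaults to 10 and TOL ~= 1.2e-6,
+   i.e. roughly one eighth of the seven or so significant decimal digits
+   available is given up, as described in the Internal Parameters
+   section above.
+*/
+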
+/* Compute approximate maximum, minimum singular values */
+
+ smax = 0.f;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__2 = smax, r__3 = (r__1 = d__[i__], dabs(r__1));
+ smax = dmax(r__2,r__3);
+/* L20: */
+ }
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__2 = smax, r__3 = (r__1 = e[i__], dabs(r__1));
+ smax = dmax(r__2,r__3);
+/* L30: */
+ }
+ sminl = 0.f;
+ if (tol >= 0.f) {
+
+/* Relative accuracy desired */
+
+ sminoa = dabs(d__[1]);
+ if (sminoa == 0.f) {
+ goto L50;
+ }
+ mu = sminoa;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ mu = (r__2 = d__[i__], dabs(r__2)) * (mu / (mu + (r__1 = e[i__ -
+ 1], dabs(r__1))));
+ sminoa = dmin(sminoa,mu);
+ if (sminoa == 0.f) {
+ goto L50;
+ }
+/* L40: */
+ }
+L50:
+ sminoa /= sqrt((real) (*n));
+/* Computing MAX */
+ r__1 = tol * sminoa, r__2 = *n * 6 * *n * unfl;
+ thresh = dmax(r__1,r__2);
+ } else {
+
+/*
+ Absolute accuracy desired
+
+ Computing MAX
+*/
+ r__1 = dabs(tol) * smax, r__2 = *n * 6 * *n * unfl;
+ thresh = dmax(r__1,r__2);
+ }
+
+/*
+ Prepare for main iteration loop for the singular values
+ (MAXIT is the maximum number of passes through the inner
+ loop permitted before nonconvergence signalled.)
+*/
+
+ maxit = *n * 6 * *n;
+ iter = 0;
+ oldll = -1;
+ oldm = -1;
+
+/* M points to last element of unconverged part of matrix */
+
+ m = *n;
+
+/* Begin main iteration loop */
+
+L60:
+
+/* Check for convergence or exceeding iteration count */
+
+ if (m <= 1) {
+ goto L160;
+ }
+ if (iter > maxit) {
+ goto L200;
+ }
+
+/* Find diagonal block of matrix to work on */
+
+ if (tol < 0.f && (r__1 = d__[m], dabs(r__1)) <= thresh) {
+ d__[m] = 0.f;
+ }
+ smax = (r__1 = d__[m], dabs(r__1));
+ smin = smax;
+ i__1 = m - 1;
+ for (lll = 1; lll <= i__1; ++lll) {
+ ll = m - lll;
+ abss = (r__1 = d__[ll], dabs(r__1));
+ abse = (r__1 = e[ll], dabs(r__1));
+ if (tol < 0.f && abss <= thresh) {
+ d__[ll] = 0.f;
+ }
+ if (abse <= thresh) {
+ goto L80;
+ }
+ smin = dmin(smin,abss);
+/* Computing MAX */
+ r__1 = max(smax,abss);
+ smax = dmax(r__1,abse);
+/* L70: */
+ }
+ ll = 0;
+ goto L90;
+L80:
+ e[ll] = 0.f;
+
+/* Matrix splits since E(LL) = 0 */
+
+ if (ll == m - 1) {
+
+/* Convergence of bottom singular value, return to top of loop */
+
+ --m;
+ goto L60;
+ }
+L90:
+ ++ll;
+
+/* E(LL) through E(M-1) are nonzero, E(LL-1) is zero */
+
+ if (ll == m - 1) {
+
+/* 2 by 2 block, handle separately */
+
+ slasv2_(&d__[m - 1], &e[m - 1], &d__[m], &sigmn, &sigmx, &sinr, &cosr,
+ &sinl, &cosl);
+ d__[m - 1] = sigmx;
+ e[m - 1] = 0.f;
+ d__[m] = sigmn;
+
+/* Compute singular vectors, if desired */
+
+ if (*ncvt > 0) {
+ srot_(ncvt, &vt[m - 1 + vt_dim1], ldvt, &vt[m + vt_dim1], ldvt, &
+ cosr, &sinr);
+ }
+ if (*nru > 0) {
+ srot_(nru, &u[(m - 1) * u_dim1 + 1], &c__1, &u[m * u_dim1 + 1], &
+ c__1, &cosl, &sinl);
+ }
+ if (*ncc > 0) {
+ srot_(ncc, &c__[m - 1 + c_dim1], ldc, &c__[m + c_dim1], ldc, &
+ cosl, &sinl);
+ }
+ m += -2;
+ goto L60;
+ }
+
+/*
+ If working on new submatrix, choose shift direction
+ (from larger end diagonal element towards smaller)
+*/
+
+ if (ll > oldm || m < oldll) {
+ if ((r__1 = d__[ll], dabs(r__1)) >= (r__2 = d__[m], dabs(r__2))) {
+
+/* Chase bulge from top (big end) to bottom (small end) */
+
+ idir = 1;
+ } else {
+
+/* Chase bulge from bottom (big end) to top (small end) */
+
+ idir = 2;
+ }
+ }
+
+/* Apply convergence tests */
+
+ if (idir == 1) {
+
+/*
+ Run convergence test in forward direction
+ First apply standard test to bottom of matrix
+*/
+
+ if ((r__2 = e[m - 1], dabs(r__2)) <= dabs(tol) * (r__1 = d__[m], dabs(
+ r__1)) || tol < 0.f && (r__3 = e[m - 1], dabs(r__3)) <=
+ thresh) {
+ e[m - 1] = 0.f;
+ goto L60;
+ }
+
+ if (tol >= 0.f) {
+
+/*
+ If relative accuracy desired,
+ apply convergence criterion forward
+*/
+
+ mu = (r__1 = d__[ll], dabs(r__1));
+ sminl = mu;
+ i__1 = m - 1;
+ for (lll = ll; lll <= i__1; ++lll) {
+ if ((r__1 = e[lll], dabs(r__1)) <= tol * mu) {
+ e[lll] = 0.f;
+ goto L60;
+ }
+ mu = (r__2 = d__[lll + 1], dabs(r__2)) * (mu / (mu + (r__1 =
+ e[lll], dabs(r__1))));
+ sminl = dmin(sminl,mu);
+/* L100: */
+ }
+ }
+
+ } else {
+
+/*
+ Run convergence test in backward direction
+ First apply standard test to top of matrix
+*/
+
+ if ((r__2 = e[ll], dabs(r__2)) <= dabs(tol) * (r__1 = d__[ll], dabs(
+ r__1)) || tol < 0.f && (r__3 = e[ll], dabs(r__3)) <= thresh) {
+ e[ll] = 0.f;
+ goto L60;
+ }
+
+ if (tol >= 0.f) {
+
+/*
+ If relative accuracy desired,
+ apply convergence criterion backward
+*/
+
+ mu = (r__1 = d__[m], dabs(r__1));
+ sminl = mu;
+ i__1 = ll;
+ for (lll = m - 1; lll >= i__1; --lll) {
+ if ((r__1 = e[lll], dabs(r__1)) <= tol * mu) {
+ e[lll] = 0.f;
+ goto L60;
+ }
+ mu = (r__2 = d__[lll], dabs(r__2)) * (mu / (mu + (r__1 = e[
+ lll], dabs(r__1))));
+ sminl = dmin(sminl,mu);
+/* L110: */
+ }
+ }
+ }
+ oldll = ll;
+ oldm = m;
+
+/*
+ Compute shift. First, test if shifting would ruin relative
+ accuracy, and if so set the shift to zero.
+
+ Computing MAX
+*/
+ r__1 = eps, r__2 = tol * .01f;
+ if (tol >= 0.f && *n * tol * (sminl / smax) <= dmax(r__1,r__2)) {
+
+/* Use a zero shift to avoid loss of relative accuracy */
+
+ shift = 0.f;
+ } else {
+
+/* Compute the shift from 2-by-2 block at end of matrix */
+
+ if (idir == 1) {
+ sll = (r__1 = d__[ll], dabs(r__1));
+ slas2_(&d__[m - 1], &e[m - 1], &d__[m], &shift, &r__);
+ } else {
+ sll = (r__1 = d__[m], dabs(r__1));
+ slas2_(&d__[ll], &e[ll], &d__[ll + 1], &shift, &r__);
+ }
+
+/* Test if shift negligible, and if so set to zero */
+
+ if (sll > 0.f) {
+/* Computing 2nd power */
+ r__1 = shift / sll;
+ if (r__1 * r__1 < eps) {
+ shift = 0.f;
+ }
+ }
+ }
+
+/* Increment iteration count */
+
+ iter = iter + m - ll;
+
+/* If SHIFT = 0, do simplified QR iteration */
+
+ if (shift == 0.f) {
+ if (idir == 1) {
+
+/*
+ Chase bulge from top to bottom
+ Save cosines and sines for later singular vector updates
+*/
+
+ cs = 1.f;
+ oldcs = 1.f;
+ i__1 = m - 1;
+ for (i__ = ll; i__ <= i__1; ++i__) {
+ r__1 = d__[i__] * cs;
+ slartg_(&r__1, &e[i__], &cs, &sn, &r__);
+ if (i__ > ll) {
+ e[i__ - 1] = oldsn * r__;
+ }
+ r__1 = oldcs * r__;
+ r__2 = d__[i__ + 1] * sn;
+ slartg_(&r__1, &r__2, &oldcs, &oldsn, &d__[i__]);
+ work[i__ - ll + 1] = cs;
+ work[i__ - ll + 1 + nm1] = sn;
+ work[i__ - ll + 1 + nm12] = oldcs;
+ work[i__ - ll + 1 + nm13] = oldsn;
+/* L120: */
+ }
+ h__ = d__[m] * cs;
+ d__[m] = h__ * oldcs;
+ e[m - 1] = h__ * oldsn;
+
+/* Update singular vectors */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "F", &i__1, ncvt, &work[1], &work[*n], &vt[
+ ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ slasr_("R", "V", "F", nru, &i__1, &work[nm12 + 1], &work[nm13
+ + 1], &u[ll * u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "F", &i__1, ncc, &work[nm12 + 1], &work[nm13
+ + 1], &c__[ll + c_dim1], ldc);
+ }
+
+/* Test convergence */
+
+ if ((r__1 = e[m - 1], dabs(r__1)) <= thresh) {
+ e[m - 1] = 0.f;
+ }
+
+ } else {
+
+/*
+ Chase bulge from bottom to top
+ Save cosines and sines for later singular vector updates
+*/
+
+ cs = 1.f;
+ oldcs = 1.f;
+ i__1 = ll + 1;
+ for (i__ = m; i__ >= i__1; --i__) {
+ r__1 = d__[i__] * cs;
+ slartg_(&r__1, &e[i__ - 1], &cs, &sn, &r__);
+ if (i__ < m) {
+ e[i__] = oldsn * r__;
+ }
+ r__1 = oldcs * r__;
+ r__2 = d__[i__ - 1] * sn;
+ slartg_(&r__1, &r__2, &oldcs, &oldsn, &d__[i__]);
+ work[i__ - ll] = cs;
+ work[i__ - ll + nm1] = -sn;
+ work[i__ - ll + nm12] = oldcs;
+ work[i__ - ll + nm13] = -oldsn;
+/* L130: */
+ }
+ h__ = d__[ll] * cs;
+ d__[ll] = h__ * oldcs;
+ e[ll] = h__ * oldsn;
+
+/* Update singular vectors */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "B", &i__1, ncvt, &work[nm12 + 1], &work[
+ nm13 + 1], &vt[ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ slasr_("R", "V", "B", nru, &i__1, &work[1], &work[*n], &u[ll *
+ u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "B", &i__1, ncc, &work[1], &work[*n], &c__[
+ ll + c_dim1], ldc);
+ }
+
+/* Test convergence */
+
+ if ((r__1 = e[ll], dabs(r__1)) <= thresh) {
+ e[ll] = 0.f;
+ }
+ }
+ } else {
+
+/* Use nonzero shift */
+
+ if (idir == 1) {
+
+/*
+ Chase bulge from top to bottom
+ Save cosines and sines for later singular vector updates
+*/
+
+ f = ((r__1 = d__[ll], dabs(r__1)) - shift) * (r_sign(&c_b15, &d__[
+ ll]) + shift / d__[ll]);
+ g = e[ll];
+ i__1 = m - 1;
+ for (i__ = ll; i__ <= i__1; ++i__) {
+ slartg_(&f, &g, &cosr, &sinr, &r__);
+ if (i__ > ll) {
+ e[i__ - 1] = r__;
+ }
+ f = cosr * d__[i__] + sinr * e[i__];
+ e[i__] = cosr * e[i__] - sinr * d__[i__];
+ g = sinr * d__[i__ + 1];
+ d__[i__ + 1] = cosr * d__[i__ + 1];
+ slartg_(&f, &g, &cosl, &sinl, &r__);
+ d__[i__] = r__;
+ f = cosl * e[i__] + sinl * d__[i__ + 1];
+ d__[i__ + 1] = cosl * d__[i__ + 1] - sinl * e[i__];
+ if (i__ < m - 1) {
+ g = sinl * e[i__ + 1];
+ e[i__ + 1] = cosl * e[i__ + 1];
+ }
+ work[i__ - ll + 1] = cosr;
+ work[i__ - ll + 1 + nm1] = sinr;
+ work[i__ - ll + 1 + nm12] = cosl;
+ work[i__ - ll + 1 + nm13] = sinl;
+/* L140: */
+ }
+ e[m - 1] = f;
+
+/* Update singular vectors */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "F", &i__1, ncvt, &work[1], &work[*n], &vt[
+ ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ slasr_("R", "V", "F", nru, &i__1, &work[nm12 + 1], &work[nm13
+ + 1], &u[ll * u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "F", &i__1, ncc, &work[nm12 + 1], &work[nm13
+ + 1], &c__[ll + c_dim1], ldc);
+ }
+
+/* Test convergence */
+
+ if ((r__1 = e[m - 1], dabs(r__1)) <= thresh) {
+ e[m - 1] = 0.f;
+ }
+
+ } else {
+
+/*
+ Chase bulge from bottom to top
+ Save cosines and sines for later singular vector updates
+*/
+
+ f = ((r__1 = d__[m], dabs(r__1)) - shift) * (r_sign(&c_b15, &d__[
+ m]) + shift / d__[m]);
+ g = e[m - 1];
+ i__1 = ll + 1;
+ for (i__ = m; i__ >= i__1; --i__) {
+ slartg_(&f, &g, &cosr, &sinr, &r__);
+ if (i__ < m) {
+ e[i__] = r__;
+ }
+ f = cosr * d__[i__] + sinr * e[i__ - 1];
+ e[i__ - 1] = cosr * e[i__ - 1] - sinr * d__[i__];
+ g = sinr * d__[i__ - 1];
+ d__[i__ - 1] = cosr * d__[i__ - 1];
+ slartg_(&f, &g, &cosl, &sinl, &r__);
+ d__[i__] = r__;
+ f = cosl * e[i__ - 1] + sinl * d__[i__ - 1];
+ d__[i__ - 1] = cosl * d__[i__ - 1] - sinl * e[i__ - 1];
+ if (i__ > ll + 1) {
+ g = sinl * e[i__ - 2];
+ e[i__ - 2] = cosl * e[i__ - 2];
+ }
+ work[i__ - ll] = cosr;
+ work[i__ - ll + nm1] = -sinr;
+ work[i__ - ll + nm12] = cosl;
+ work[i__ - ll + nm13] = -sinl;
+/* L150: */
+ }
+ e[ll] = f;
+
+/* Test convergence */
+
+ if ((r__1 = e[ll], dabs(r__1)) <= thresh) {
+ e[ll] = 0.f;
+ }
+
+/* Update singular vectors if desired */
+
+ if (*ncvt > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "B", &i__1, ncvt, &work[nm12 + 1], &work[
+ nm13 + 1], &vt[ll + vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ i__1 = m - ll + 1;
+ slasr_("R", "V", "B", nru, &i__1, &work[1], &work[*n], &u[ll *
+ u_dim1 + 1], ldu);
+ }
+ if (*ncc > 0) {
+ i__1 = m - ll + 1;
+ slasr_("L", "V", "B", &i__1, ncc, &work[1], &work[*n], &c__[
+ ll + c_dim1], ldc);
+ }
+ }
+ }
+
+/* QR iteration finished, go back and check convergence */
+
+ goto L60;
+
+/* All singular values converged, so make them positive */
+
+L160:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (d__[i__] < 0.f) {
+ d__[i__] = -d__[i__];
+
+/* Change sign of singular vectors, if desired */
+
+ if (*ncvt > 0) {
+ sscal_(ncvt, &c_b151, &vt[i__ + vt_dim1], ldvt);
+ }
+ }
+/* L170: */
+ }
+
+/*
+ Sort the singular values into decreasing order (insertion sort on
+ singular values, but only one transposition per singular vector)
+*/
+
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Scan for smallest D(I) */
+
+ isub = 1;
+ smin = d__[1];
+ i__2 = *n + 1 - i__;
+ for (j = 2; j <= i__2; ++j) {
+ if (d__[j] <= smin) {
+ isub = j;
+ smin = d__[j];
+ }
+/* L180: */
+ }
+ if (isub != *n + 1 - i__) {
+
+/* Swap singular values and vectors */
+
+ d__[isub] = d__[*n + 1 - i__];
+ d__[*n + 1 - i__] = smin;
+ if (*ncvt > 0) {
+ sswap_(ncvt, &vt[isub + vt_dim1], ldvt, &vt[*n + 1 - i__ +
+ vt_dim1], ldvt);
+ }
+ if (*nru > 0) {
+ sswap_(nru, &u[isub * u_dim1 + 1], &c__1, &u[(*n + 1 - i__) *
+ u_dim1 + 1], &c__1);
+ }
+ if (*ncc > 0) {
+ sswap_(ncc, &c__[isub + c_dim1], ldc, &c__[*n + 1 - i__ +
+ c_dim1], ldc);
+ }
+ }
+/* L190: */
+ }
+ goto L220;
+
+/* Maximum number of iterations exceeded, failure to converge */
+
+L200:
+ *info = 0;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.f) {
+ ++(*info);
+ }
+/* L210: */
+ }
+L220:
+ return 0;
+
+/* End of SBDSQR */
+
+} /* sbdsqr_ */
+
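+/*
+   A minimal usage sketch (editorial illustration only): singular values
+   of a small upper bidiagonal matrix via SBDSQR with no singular
+   vectors requested (NCVT = NRU = NCC = 0), in which case the routine
+   dispatches to the qd algorithm SLASQ1 as noted in the code above.
+   VT, U and C are dummies because they are not referenced; WORK has
+   the documented size 4*N.
+
+       integer n = 3, ncvt = 0, nru = 0, ncc = 0;
+       integer ldvt = 1, ldu = 1, ldc = 1, info = 0;
+       real d[3] = {3.f, 2.f, 1.f}, e[2] = {0.5f, 0.25f};
+       real vt[1], u[1], c[1], work[12];
+
+       sbdsqr_("U", &n, &ncvt, &nru, &ncc, d, e, vt, &ldvt,
+               u, &ldu, c, &ldc, work, &info);
+
+   On return with info == 0, d holds the singular values of B in
+   decreasing order.
+*/
+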
+/* Subroutine */ int sgebak_(char *job, char *side, integer *n, integer *ilo,
+ integer *ihi, real *scale, integer *m, real *v, integer *ldv, integer
+ *info)
+{
+ /* System generated locals */
+ integer v_dim1, v_offset, i__1;
+
+ /* Local variables */
+ static integer i__, k;
+ static real s;
+ static integer ii;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static logical leftv;
+ extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *,
+ integer *), xerbla_(char *, integer *);
+ static logical rightv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGEBAK forms the right or left eigenvectors of a real general matrix
+ by backward transformation on the computed eigenvectors of the
+ balanced matrix output by SGEBAL.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the type of backward transformation required:
+ = 'N', do nothing, return immediately;
+ = 'P', do backward transformation for permutation only;
+ = 'S', do backward transformation for scaling only;
+ = 'B', do backward transformations for both permutation and
+ scaling.
+ JOB must be the same as the argument JOB supplied to SGEBAL.
+
+ SIDE (input) CHARACTER*1
+ = 'R': V contains right eigenvectors;
+ = 'L': V contains left eigenvectors.
+
+ N (input) INTEGER
+ The number of rows of the matrix V. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ The integers ILO and IHI determined by SGEBAL.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ SCALE (input) REAL array, dimension (N)
+ Details of the permutation and scaling factors, as returned
+ by SGEBAL.
+
+ M (input) INTEGER
+ The number of columns of the matrix V. M >= 0.
+
+ V (input/output) REAL array, dimension (LDV,M)
+ On entry, the matrix of right or left eigenvectors to be
+ transformed, as returned by SHSEIN or STREVC.
+ On exit, V is overwritten by the transformed eigenvectors.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V. LDV >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Decode and Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --scale;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+
+ /* Function Body */
+ rightv = lsame_(side, "R");
+ leftv = lsame_(side, "L");
+
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (! rightv && ! leftv) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -7;
+ } else if (*ldv < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEBAK", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*m == 0) {
+ return 0;
+ }
+ if (lsame_(job, "N")) {
+ return 0;
+ }
+
+ if (*ilo == *ihi) {
+ goto L30;
+ }
+
+/* Backward balance */
+
+ if (lsame_(job, "S") || lsame_(job, "B")) {
+
+ if (rightv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = scale[i__];
+ sscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L10: */
+ }
+ }
+
+ if (leftv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = 1.f / scale[i__];
+ sscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L20: */
+ }
+ }
+
+ }
+
+/*
+ Backward permutation
+
+ For I = ILO-1 step -1 until 1,
+ IHI+1 step 1 until N do --
+*/
+
+L30:
+ if (lsame_(job, "P") || lsame_(job, "B")) {
+ if (rightv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L40;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = scale[i__];
+ if (k == i__) {
+ goto L40;
+ }
+ sswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L40:
+ ;
+ }
+ }
+
+ if (leftv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L50;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = scale[i__];
+ if (k == i__) {
+ goto L50;
+ }
+ sswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L50:
+ ;
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SGEBAK */
+
+} /* sgebak_ */
+
+/* Subroutine */ int sgebal_(char *job, integer *n, real *a, integer *lda,
+ integer *ilo, integer *ihi, real *scale, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real c__, f, g;
+ static integer i__, j, k, l, m;
+ static real r__, s, ca, ra;
+ static integer ica, ira, iexc;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sswap_(integer *, real *, integer *, real *, integer *);
+ static real sfmin1, sfmin2, sfmax1, sfmax2;
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer isamax_(integer *, real *, integer *);
+ extern logical sisnan_(real *);
+ static logical noconv;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SGEBAL balances a general real matrix A. This involves, first,
+ permuting A by a similarity transformation to isolate eigenvalues
+ in the first 1 to ILO-1 and last IHI+1 to N elements on the
+ diagonal; and second, applying a diagonal similarity transformation
+ to rows and columns ILO to IHI to make the rows and columns as
+ close in norm as possible. Both steps are optional.
+
+ Balancing may reduce the 1-norm of the matrix, and improve the
+ accuracy of the computed eigenvalues and/or eigenvectors.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the operations to be performed on A:
+ = 'N': none: simply set ILO = 1, IHI = N, SCALE(I) = 1.0
+ for i = 1,...,N;
+ = 'P': permute only;
+ = 'S': scale only;
+ = 'B': both permute and scale.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the input matrix A.
+ On exit, A is overwritten by the balanced matrix.
+ If JOB = 'N', A is not referenced.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ ILO (output) INTEGER
+ IHI (output) INTEGER
+ ILO and IHI are set to integers such that on exit
+ A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N.
+ If JOB = 'N' or 'S', ILO = 1 and IHI = N.
+
+ SCALE (output) REAL array, dimension (N)
+ Details of the permutations and scaling factors applied to
+ A. If P(j) is the index of the row and column interchanged
+ with row and column j and D(j) is the scaling factor
+ applied to row and column j, then
+ SCALE(j) = P(j) for j = 1,...,ILO-1
+ = D(j) for j = ILO,...,IHI
+ = P(j) for j = IHI+1,...,N.
+ The order in which the interchanges are made is N to IHI+1,
+ then 1 to ILO-1.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The permutations consist of row and column interchanges which put
+ the matrix in the form
+
+ ( T1 X Y )
+ P A P = ( 0 B Z )
+ ( 0 0 T2 )
+
+ where T1 and T2 are upper triangular matrices whose eigenvalues lie
+ along the diagonal. The column indices ILO and IHI mark the starting
+ and ending columns of the submatrix B. Balancing consists of applying
+ a diagonal similarity transformation inv(D) * B * D to make the
+ 1-norms of each row of B and its corresponding column nearly equal.
+ The output matrix is
+
+ ( T1 X*D Y )
+ ( 0 inv(D)*B*D inv(D)*Z ).
+ ( 0 0 T2 )
+
+ Information about the permutations P and the diagonal matrix D is
+ returned in the vector SCALE.
+
+ This subroutine is based on the EISPACK routine BALANC.
+
+ Modified by Tzu-Yi Chen, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --scale;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEBAL", &i__1);
+ return 0;
+ }
+
+ k = 1;
+ l = *n;
+
+ if (*n == 0) {
+ goto L210;
+ }
+
+ if (lsame_(job, "N")) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scale[i__] = 1.f;
+/* L10: */
+ }
+ goto L210;
+ }
+
+ if (lsame_(job, "S")) {
+ goto L120;
+ }
+
+/* Permutation to isolate eigenvalues if possible */
+
+ goto L50;
+
+/* Row and column exchange. */
+
+L20:
+ scale[m] = (real) j;
+ if (j == m) {
+ goto L30;
+ }
+
+ sswap_(&l, &a[j * a_dim1 + 1], &c__1, &a[m * a_dim1 + 1], &c__1);
+ i__1 = *n - k + 1;
+ sswap_(&i__1, &a[j + k * a_dim1], lda, &a[m + k * a_dim1], lda);
+
+L30:
+ switch (iexc) {
+ case 1: goto L40;
+ case 2: goto L80;
+ }
+
+/* Search for rows isolating an eigenvalue and push them down. */
+
+L40:
+ if (l == 1) {
+ goto L210;
+ }
+ --l;
+
+L50:
+ for (j = l; j >= 1; --j) {
+
+ i__1 = l;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ == j) {
+ goto L60;
+ }
+ if (a[j + i__ * a_dim1] != 0.f) {
+ goto L70;
+ }
+L60:
+ ;
+ }
+
+ m = l;
+ iexc = 1;
+ goto L20;
+L70:
+ ;
+ }
+
+ goto L90;
+
+/* Search for columns isolating an eigenvalue and push them left. */
+
+L80:
+ ++k;
+
+L90:
+ i__1 = l;
+ for (j = k; j <= i__1; ++j) {
+
+ i__2 = l;
+ for (i__ = k; i__ <= i__2; ++i__) {
+ if (i__ == j) {
+ goto L100;
+ }
+ if (a[i__ + j * a_dim1] != 0.f) {
+ goto L110;
+ }
+L100:
+ ;
+ }
+
+ m = k;
+ iexc = 2;
+ goto L20;
+L110:
+ ;
+ }
+
+L120:
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ scale[i__] = 1.f;
+/* L130: */
+ }
+
+ if (lsame_(job, "P")) {
+ goto L210;
+ }
+
+/*
+ Balance the submatrix in rows K to L.
+
+ Iterative loop for norm reduction
+*/
+
+ sfmin1 = slamch_("S") / slamch_("P");
+ sfmax1 = 1.f / sfmin1;
+ sfmin2 = sfmin1 * 2.f;
+ sfmax2 = 1.f / sfmin2;
+L140:
+ noconv = FALSE_;
+
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ c__ = 0.f;
+ r__ = 0.f;
+
+ i__2 = l;
+ for (j = k; j <= i__2; ++j) {
+ if (j == i__) {
+ goto L150;
+ }
+ c__ += (r__1 = a[j + i__ * a_dim1], dabs(r__1));
+ r__ += (r__1 = a[i__ + j * a_dim1], dabs(r__1));
+L150:
+ ;
+ }
+ ica = isamax_(&l, &a[i__ * a_dim1 + 1], &c__1);
+ ca = (r__1 = a[ica + i__ * a_dim1], dabs(r__1));
+ i__2 = *n - k + 1;
+ ira = isamax_(&i__2, &a[i__ + k * a_dim1], lda);
+ ra = (r__1 = a[i__ + (ira + k - 1) * a_dim1], dabs(r__1));
+
+/* Guard against zero C or R due to underflow. */
+
+ if (c__ == 0.f || r__ == 0.f) {
+ goto L200;
+ }
+ g = r__ / 2.f;
+ f = 1.f;
+ s = c__ + r__;
+L160:
+/* Computing MAX */
+ r__1 = max(f,c__);
+/* Computing MIN */
+ r__2 = min(r__,g);
+ if (c__ >= g || dmax(r__1,ca) >= sfmax2 || dmin(r__2,ra) <= sfmin2) {
+ goto L170;
+ }
+ f *= 2.f;
+ c__ *= 2.f;
+ ca *= 2.f;
+ r__ /= 2.f;
+ g /= 2.f;
+ ra /= 2.f;
+ goto L160;
+
+L170:
+ g = c__ / 2.f;
+L180:
+/* Computing MIN */
+ r__1 = min(f,c__), r__1 = min(r__1,g);
+ if (g < r__ || dmax(r__,ra) >= sfmax2 || dmin(r__1,ca) <= sfmin2) {
+ goto L190;
+ }
+ r__1 = c__ + f + ca + r__ + g + ra;
+ if (sisnan_(&r__1)) {
+
+/* Exit if NaN to avoid infinite loop */
+
+ *info = -3;
+ i__2 = -(*info);
+ xerbla_("SGEBAL", &i__2);
+ return 0;
+ }
+ f /= 2.f;
+ c__ /= 2.f;
+ g /= 2.f;
+ ca /= 2.f;
+ r__ *= 2.f;
+ ra *= 2.f;
+ goto L180;
+
+/* Now balance. */
+
+L190:
+ if (c__ + r__ >= s * .95f) {
+ goto L200;
+ }
+ if (f < 1.f && scale[i__] < 1.f) {
+ if (f * scale[i__] <= sfmin1) {
+ goto L200;
+ }
+ }
+ if (f > 1.f && scale[i__] > 1.f) {
+ if (scale[i__] >= sfmax1 / f) {
+ goto L200;
+ }
+ }
+ g = 1.f / f;
+ scale[i__] *= f;
+ noconv = TRUE_;
+
+ i__2 = *n - k + 1;
+ sscal_(&i__2, &g, &a[i__ + k * a_dim1], lda);
+ sscal_(&l, &f, &a[i__ * a_dim1 + 1], &c__1);
+
+L200:
+ ;
+ }
+
+ if (noconv) {
+ goto L140;
+ }
+
+L210:
+ *ilo = k;
+ *ihi = l;
+
+ return 0;
+
+/* End of SGEBAL */
+
+} /* sgebal_ */
+
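+/*
+   A minimal usage sketch (editorial illustration only): the usual
+   pairing of SGEBAL and SGEBAK around a nonsymmetric eigenvalue
+   computation.  SGEBAL balances A in place and returns ILO, IHI and
+   the SCALE vector; after eigenvectors of the balanced matrix have
+   been computed (by routines not shown here, e.g. SHSEIN or STREVC as
+   mentioned in the SGEBAK documentation above), SGEBAK applies the
+   inverse transformation so that the columns of V become eigenvectors
+   of the original A.  The contents of A and V below are hypothetical
+   placeholders.
+
+       integer n = 4, lda = 4, ldv = 4, m = 4, ilo, ihi, info;
+       real a[16];        n-by-n matrix, column major
+       real scale[4];     permutation/scaling data from SGEBAL
+       real v[16];        right eigenvectors of the balanced matrix
+
+       sgebal_("B", &n, a, &lda, &ilo, &ihi, scale, &info);
+       ... eigenvalue computation filling v ...
+       sgebak_("B", "R", &n, &ilo, &ihi, scale, &m, v, &ldv, &info);
+*/
+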
+/* Subroutine */ int sgebd2_(integer *m, integer *n, real *a, integer *lda,
+ real *d__, real *e, real *tauq, real *taup, real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *), slarfg_(integer *, real *, real *,
+ integer *, real *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGEBD2 reduces a real general m by n matrix A to upper or lower
+ bidiagonal form B by an orthogonal transformation: Q' * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the orthogonal matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the orthogonal matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) REAL array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+ TAUQ (output) REAL array dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ WORK (workspace) REAL array, dimension (max(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
+ u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("SGEBD2", &i__1);
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ *
+ a_dim1], &c__1, &tauq[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.f;
+
+/* Apply H(i) to A(i:m,i+1:n) from the left */
+
+ if (i__ < *n) {
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ slarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &
+ tauq[i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]
+ );
+ }
+ a[i__ + i__ * a_dim1] = d__[i__];
+
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector G(i) to annihilate
+ A(i,i+2:n)
+*/
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + (i__ + 1) * a_dim1], &a[i__ + min(
+ i__3,*n) * a_dim1], lda, &taup[i__]);
+ e[i__] = a[i__ + (i__ + 1) * a_dim1];
+ a[i__ + (i__ + 1) * a_dim1] = 1.f;
+
+/* Apply G(i) to A(i+1:m,i+1:n) from the right */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ slarf_("Right", &i__2, &i__3, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &taup[i__], &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &work[1]);
+ a[i__ + (i__ + 1) * a_dim1] = e[i__];
+ } else {
+ taup[i__] = 0.f;
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector G(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3,*n) *
+ a_dim1], lda, &taup[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.f;
+
+/* Apply G(i) to A(i+1:m,i:n) from the right */
+
+ if (i__ < *m) {
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ slarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &
+ taup[i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ a[i__ + i__ * a_dim1] = d__[i__];
+
+ if (i__ < *m) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:m,i)
+*/
+
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*m) +
+ i__ * a_dim1], &c__1, &tauq[i__]);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.f;
+
+/* Apply H(i) to A(i+1:m,i+1:n) from the left */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ slarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &
+ c__1, &tauq[i__], &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &work[1]);
+ a[i__ + 1 + i__ * a_dim1] = e[i__];
+ } else {
+ tauq[i__] = 0.f;
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of SGEBD2 */
+
+} /* sgebd2_ */
+
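+/*
+   Editorial sketch, not part of the f2c-translated LAPACK source in this
+   file.  The Further Details of SGEBD2 describe each elementary reflector
+   as H(i) = I - tauq * v * v'.  The fragment kept inside this comment is a
+   minimal plain-C illustration of applying one such reflector to a vector:
+   the first loop forms dot = v'x, the second forms y = x - tau * dot * v,
+   which equals H * x.  The helper name is hypothetical and nothing in this
+   comment is compiled.
+
+       static void apply_reflector_sketch(int n, float tau, const float *v,
+                                          const float *x, float *y)
+       {
+           float dot = 0.f;
+           int i;
+           for (i = 0; i < n; ++i)
+               dot += v[i] * x[i];
+           for (i = 0; i < n; ++i)
+               y[i] = x[i] - tau * dot * v[i];
+       }
+*/
+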
+/* Subroutine */ int sgebrd_(integer *m, integer *n, real *a, integer *lda,
+ real *d__, real *e, real *tauq, real *taup, real *work, integer *
+ lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, nb, nx;
+ static real ws;
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer minmn;
+ extern /* Subroutine */ int sgebd2_(integer *, integer *, real *, integer
+ *, real *, real *, real *, real *, real *, integer *), slabrd_(
+ integer *, integer *, integer *, real *, integer *, real *, real *
+ , real *, real *, real *, integer *, real *, integer *), xerbla_(
+ char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwrkx, ldwrky, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGEBRD reduces a general real M-by-N matrix A to upper or lower
+ bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the M-by-N general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the orthogonal matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the orthogonal matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) REAL array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+    TAUQ   (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,M,N).
+ For optimum performance LWORK >= (M+N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
+ u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MAX */
+ i__1 = 1, i__2 = ilaenv_(&c__1, "SGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = max(i__1,i__2);
+ lwkopt = (*m + *n) * nb;
+ work[1] = (real) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = max(1,*m);
+ if (*lwork < max(i__1,*n) && ! lquery) {
+ *info = -10;
+ }
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("SGEBRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ minmn = min(*m,*n);
+ if (minmn == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ ws = (real) max(*m,*n);
+ ldwrkx = *m;
+ ldwrky = *n;
+
+ if (nb > 1 && nb < minmn) {
+
+/*
+ Set the crossover point NX.
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "SGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+
+/* Determine when to switch from blocked to unblocked code. */
+
+ if (nx < minmn) {
+ ws = (real) ((*m + *n) * nb);
+ if ((real) (*lwork) < ws) {
+
+/*
+ Not enough work space for the optimal NB, consider using
+ a smaller block size.
+*/
+
+ nbmin = ilaenv_(&c__2, "SGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ if (*lwork >= (*m + *n) * nbmin) {
+ nb = *lwork / (*m + *n);
+ } else {
+ nb = 1;
+ nx = minmn;
+ }
+ }
+ }
+ } else {
+ nx = minmn;
+ }
+
+ i__1 = minmn - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+
+/*
+ Reduce rows and columns i:i+nb-1 to bidiagonal form and return
+ the matrices X and Y which are needed to update the unreduced
+ part of the matrix
+*/
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ + 1;
+ slabrd_(&i__3, &i__4, &nb, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[
+ i__], &tauq[i__], &taup[i__], &work[1], &ldwrkx, &work[ldwrkx
+ * nb + 1], &ldwrky);
+
+/*
+ Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
+ of the form A := A - V*Y' - X*U'
+*/
+
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ sgemm_("No transpose", "Transpose", &i__3, &i__4, &nb, &c_b151, &a[
+ i__ + nb + i__ * a_dim1], lda, &work[ldwrkx * nb + nb + 1], &
+ ldwrky, &c_b15, &a[i__ + nb + (i__ + nb) * a_dim1], lda);
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ sgemm_("No transpose", "No transpose", &i__3, &i__4, &nb, &c_b151, &
+ work[nb + 1], &ldwrkx, &a[i__ + (i__ + nb) * a_dim1], lda, &
+ c_b15, &a[i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/* Copy diagonal and off-diagonal elements of B back into A */
+
+ if (*m >= *n) {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j + j * a_dim1] = d__[j];
+ a[j + (j + 1) * a_dim1] = e[j];
+/* L10: */
+ }
+ } else {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j + j * a_dim1] = d__[j];
+ a[j + 1 + j * a_dim1] = e[j];
+/* L20: */
+ }
+ }
+/* L30: */
+ }
+
+/* Use unblocked code to reduce the remainder of the matrix */
+
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ sgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], &
+ tauq[i__], &taup[i__], &work[1], &iinfo);
+ work[1] = ws;
+ return 0;
+
+/* End of SGEBRD */
+
+} /* sgebrd_ */
+
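+/*
+   Editorial sketch, not part of the translated source.  SGEBRD documents
+   the LWORK = -1 workspace-query convention used throughout this file: a
+   first call only reports the optimal workspace size in WORK(1), and a
+   second call does the actual reduction.  A hypothetical caller-side
+   fragment, kept as a comment only; m, n, a, lda, d, e, tauq and taup are
+   assumed to be set up column-major by the caller, and malloc/free come
+   from <stdlib.h>:
+
+       integer lwork = -1, info;
+       real wkopt, *work;
+       sgebrd_(&m, &n, a, &lda, d, e, tauq, taup, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;
+       work = (real *) malloc((size_t) lwork * sizeof(real));
+       sgebrd_(&m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info);
+       free(work);
+*/
+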
+/* Subroutine */ int sgeev_(char *jobvl, char *jobvr, integer *n, real *a,
+ integer *lda, real *wr, real *wi, real *vl, integer *ldvl, real *vr,
+ integer *ldvr, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, k;
+ static real r__, cs, sn;
+ static integer ihi;
+ static real scl;
+ static integer ilo;
+ static real dum[1], eps;
+ static integer ibal;
+ static char side[1];
+ static real anrm;
+ static integer ierr, itau, iwrk, nout;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *);
+ extern doublereal snrm2_(integer *, real *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ extern doublereal slapy2_(real *, real *);
+ extern /* Subroutine */ int slabad_(real *, real *);
+ static logical scalea;
+ static real cscale;
+ extern /* Subroutine */ int sgebak_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, integer *, integer *), sgebal_(char *, integer *, real *, integer *,
+ integer *, integer *, real *, integer *);
+ extern doublereal slamch_(char *), slange_(char *, integer *,
+ integer *, real *, integer *, real *);
+ extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *), xerbla_(char
+ *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical select[1];
+ static real bignum;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *);
+ extern integer isamax_(integer *, real *, integer *);
+ extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *,
+ integer *, real *, integer *), slartg_(real *, real *,
+ real *, real *, real *), sorghr_(integer *, integer *, integer *,
+ real *, integer *, real *, real *, integer *, integer *), shseqr_(
+ char *, char *, integer *, integer *, integer *, real *, integer *
+ , real *, real *, real *, integer *, real *, integer *, integer *), strevc_(char *, char *, logical *, integer *,
+ real *, integer *, real *, integer *, real *, integer *, integer *
+ , integer *, real *, integer *);
+ static integer minwrk, maxwrk;
+ static logical wantvl;
+ static real smlnum;
+ static integer hswork;
+ static logical lquery, wantvr;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGEEV computes for an N-by-N real nonsymmetric matrix A, the
+ eigenvalues and, optionally, the left and/or right eigenvectors.
+
+ The right eigenvector v(j) of A satisfies
+ A * v(j) = lambda(j) * v(j)
+ where lambda(j) is its eigenvalue.
+ The left eigenvector u(j) of A satisfies
+ u(j)**H * A = lambda(j) * u(j)**H
+ where u(j)**H denotes the conjugate transpose of u(j).
+
+ The computed eigenvectors are normalized to have Euclidean norm
+ equal to 1 and largest component real.
+
+ Arguments
+ =========
+
+ JOBVL (input) CHARACTER*1
+ = 'N': left eigenvectors of A are not computed;
+ = 'V': left eigenvectors of A are computed.
+
+ JOBVR (input) CHARACTER*1
+ = 'N': right eigenvectors of A are not computed;
+ = 'V': right eigenvectors of A are computed.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the N-by-N matrix A.
+ On exit, A has been overwritten.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ WR (output) REAL array, dimension (N)
+ WI (output) REAL array, dimension (N)
+ WR and WI contain the real and imaginary parts,
+ respectively, of the computed eigenvalues. Complex
+ conjugate pairs of eigenvalues appear consecutively
+ with the eigenvalue having the positive imaginary part
+ first.
+
+ VL (output) REAL array, dimension (LDVL,N)
+ If JOBVL = 'V', the left eigenvectors u(j) are stored one
+ after another in the columns of VL, in the same order
+ as their eigenvalues.
+ If JOBVL = 'N', VL is not referenced.
+ If the j-th eigenvalue is real, then u(j) = VL(:,j),
+ the j-th column of VL.
+ If the j-th and (j+1)-st eigenvalues form a complex
+ conjugate pair, then u(j) = VL(:,j) + i*VL(:,j+1) and
+ u(j+1) = VL(:,j) - i*VL(:,j+1).
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1; if
+ JOBVL = 'V', LDVL >= N.
+
+ VR (output) REAL array, dimension (LDVR,N)
+ If JOBVR = 'V', the right eigenvectors v(j) are stored one
+ after another in the columns of VR, in the same order
+ as their eigenvalues.
+ If JOBVR = 'N', VR is not referenced.
+ If the j-th eigenvalue is real, then v(j) = VR(:,j),
+ the j-th column of VR.
+ If the j-th and (j+1)-st eigenvalues form a complex
+ conjugate pair, then v(j) = VR(:,j) + i*VR(:,j+1) and
+ v(j+1) = VR(:,j) - i*VR(:,j+1).
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1; if
+ JOBVR = 'V', LDVR >= N.
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,3*N), and
+ if JOBVL = 'V' or JOBVR = 'V', LWORK >= 4*N. For good
+ performance, LWORK must generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = i, the QR algorithm failed to compute all the
+ eigenvalues, and no eigenvectors have been computed;
+ elements i+1:N of WR and WI contain eigenvalues which
+ have converged.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --wr;
+ --wi;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1;
+ wantvl = lsame_(jobvl, "V");
+ wantvr = lsame_(jobvr, "V");
+ if (! wantvl && ! lsame_(jobvl, "N")) {
+ *info = -1;
+ } else if (! wantvr && ! lsame_(jobvr, "N")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldvl < 1 || wantvl && *ldvl < *n) {
+ *info = -9;
+ } else if (*ldvr < 1 || wantvr && *ldvr < *n) {
+ *info = -11;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.
+ HSWORK refers to the workspace preferred by SHSEQR, as
+ calculated below. HSWORK is computed assuming ILO=1 and IHI=N,
+ the worst case.)
+*/
+
+ if (*info == 0) {
+ if (*n == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ } else {
+ maxwrk = (*n << 1) + *n * ilaenv_(&c__1, "SGEHRD", " ", n, &c__1,
+ n, &c__0, (ftnlen)6, (ftnlen)1);
+ if (wantvl) {
+ minwrk = *n << 2;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+ "SORGHR", " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ shseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &wr[1], &wi[
+ 1], &vl[vl_offset], ldvl, &work[1], &c_n1, info);
+ hswork = work[1];
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + 1, i__1 = max(i__1,i__2), i__2 = *
+ n + hswork;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n << 2;
+ maxwrk = max(i__1,i__2);
+ } else if (wantvr) {
+ minwrk = *n << 2;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+ "SORGHR", " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ shseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &wr[1], &wi[
+ 1], &vr[vr_offset], ldvr, &work[1], &c_n1, info);
+ hswork = work[1];
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + 1, i__1 = max(i__1,i__2), i__2 = *
+ n + hswork;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n << 2;
+ maxwrk = max(i__1,i__2);
+ } else {
+ minwrk = *n * 3;
+ shseqr_("E", "N", n, &c__1, n, &a[a_offset], lda, &wr[1], &wi[
+ 1], &vr[vr_offset], ldvr, &work[1], &c_n1, info);
+ hswork = work[1];
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + 1, i__1 = max(i__1,i__2), i__2 = *
+ n + hswork;
+ maxwrk = max(i__1,i__2);
+ }
+ maxwrk = max(maxwrk,minwrk);
+ }
+ work[1] = (real) maxwrk;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -13;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEEV ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = slamch_("P");
+ smlnum = slamch_("S");
+ bignum = 1.f / smlnum;
+ slabad_(&smlnum, &bignum);
+ smlnum = sqrt(smlnum) / eps;
+ bignum = 1.f / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = slange_("M", n, n, &a[a_offset], lda, dum);
+ scalea = FALSE_;
+ if (anrm > 0.f && anrm < smlnum) {
+ scalea = TRUE_;
+ cscale = smlnum;
+ } else if (anrm > bignum) {
+ scalea = TRUE_;
+ cscale = bignum;
+ }
+ if (scalea) {
+ slascl_("G", &c__0, &c__0, &anrm, &cscale, n, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+/*
+ Balance the matrix
+ (Workspace: need N)
+*/
+
+ ibal = 1;
+ sgebal_("B", n, &a[a_offset], lda, &ilo, &ihi, &work[ibal], &ierr);
+
+/*
+ Reduce to upper Hessenberg form
+ (Workspace: need 3*N, prefer 2*N+N*NB)
+*/
+
+ itau = ibal + *n;
+ iwrk = itau + *n;
+ i__1 = *lwork - iwrk + 1;
+ sgehrd_(n, &ilo, &ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1,
+ &ierr);
+
+ if (wantvl) {
+
+/*
+ Want left eigenvectors
+ Copy Householder vectors to VL
+*/
+
+ *(unsigned char *)side = 'L';
+ slacpy_("L", n, n, &a[a_offset], lda, &vl[vl_offset], ldvl)
+ ;
+
+/*
+ Generate orthogonal matrix in VL
+ (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ sorghr_(n, &ilo, &ihi, &vl[vl_offset], ldvl, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VL
+ (Workspace: need N+1, prefer N+HSWORK (see comments) )
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ shseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &wr[1], &wi[1], &
+ vl[vl_offset], ldvl, &work[iwrk], &i__1, info);
+
+ if (wantvr) {
+
+/*
+ Want left and right eigenvectors
+ Copy Schur vectors to VR
+*/
+
+ *(unsigned char *)side = 'B';
+ slacpy_("F", n, n, &vl[vl_offset], ldvl, &vr[vr_offset], ldvr);
+ }
+
+ } else if (wantvr) {
+
+/*
+ Want right eigenvectors
+ Copy Householder vectors to VR
+*/
+
+ *(unsigned char *)side = 'R';
+ slacpy_("L", n, n, &a[a_offset], lda, &vr[vr_offset], ldvr)
+ ;
+
+/*
+ Generate orthogonal matrix in VR
+ (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ sorghr_(n, &ilo, &ihi, &vr[vr_offset], ldvr, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VR
+ (Workspace: need N+1, prefer N+HSWORK (see comments) )
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ shseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &wr[1], &wi[1], &
+ vr[vr_offset], ldvr, &work[iwrk], &i__1, info);
+
+ } else {
+
+/*
+ Compute eigenvalues only
+ (Workspace: need N+1, prefer N+HSWORK (see comments) )
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ shseqr_("E", "N", n, &ilo, &ihi, &a[a_offset], lda, &wr[1], &wi[1], &
+ vr[vr_offset], ldvr, &work[iwrk], &i__1, info);
+ }
+
+/* If INFO > 0 from SHSEQR, then quit */
+
+ if (*info > 0) {
+ goto L50;
+ }
+
+ if (wantvl || wantvr) {
+
+/*
+ Compute left and/or right eigenvectors
+ (Workspace: need 4*N)
+*/
+
+ strevc_(side, "B", select, n, &a[a_offset], lda, &vl[vl_offset], ldvl,
+ &vr[vr_offset], ldvr, n, &nout, &work[iwrk], &ierr);
+ }
+
+ if (wantvl) {
+
+/*
+ Undo balancing of left eigenvectors
+ (Workspace: need N)
+*/
+
+ sgebak_("B", "L", n, &ilo, &ihi, &work[ibal], n, &vl[vl_offset], ldvl,
+ &ierr);
+
+/* Normalize left eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (wi[i__] == 0.f) {
+ scl = 1.f / snrm2_(n, &vl[i__ * vl_dim1 + 1], &c__1);
+ sscal_(n, &scl, &vl[i__ * vl_dim1 + 1], &c__1);
+ } else if (wi[i__] > 0.f) {
+ r__1 = snrm2_(n, &vl[i__ * vl_dim1 + 1], &c__1);
+ r__2 = snrm2_(n, &vl[(i__ + 1) * vl_dim1 + 1], &c__1);
+ scl = 1.f / slapy2_(&r__1, &r__2);
+ sscal_(n, &scl, &vl[i__ * vl_dim1 + 1], &c__1);
+ sscal_(n, &scl, &vl[(i__ + 1) * vl_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+/* Computing 2nd power */
+ r__1 = vl[k + i__ * vl_dim1];
+/* Computing 2nd power */
+ r__2 = vl[k + (i__ + 1) * vl_dim1];
+ work[iwrk + k - 1] = r__1 * r__1 + r__2 * r__2;
+/* L10: */
+ }
+ k = isamax_(n, &work[iwrk], &c__1);
+ slartg_(&vl[k + i__ * vl_dim1], &vl[k + (i__ + 1) * vl_dim1],
+ &cs, &sn, &r__);
+ srot_(n, &vl[i__ * vl_dim1 + 1], &c__1, &vl[(i__ + 1) *
+ vl_dim1 + 1], &c__1, &cs, &sn);
+ vl[k + (i__ + 1) * vl_dim1] = 0.f;
+ }
+/* L20: */
+ }
+ }
+
+ if (wantvr) {
+
+/*
+ Undo balancing of right eigenvectors
+ (Workspace: need N)
+*/
+
+ sgebak_("B", "R", n, &ilo, &ihi, &work[ibal], n, &vr[vr_offset], ldvr,
+ &ierr);
+
+/* Normalize right eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (wi[i__] == 0.f) {
+ scl = 1.f / snrm2_(n, &vr[i__ * vr_dim1 + 1], &c__1);
+ sscal_(n, &scl, &vr[i__ * vr_dim1 + 1], &c__1);
+ } else if (wi[i__] > 0.f) {
+ r__1 = snrm2_(n, &vr[i__ * vr_dim1 + 1], &c__1);
+ r__2 = snrm2_(n, &vr[(i__ + 1) * vr_dim1 + 1], &c__1);
+ scl = 1.f / slapy2_(&r__1, &r__2);
+ sscal_(n, &scl, &vr[i__ * vr_dim1 + 1], &c__1);
+ sscal_(n, &scl, &vr[(i__ + 1) * vr_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+/* Computing 2nd power */
+ r__1 = vr[k + i__ * vr_dim1];
+/* Computing 2nd power */
+ r__2 = vr[k + (i__ + 1) * vr_dim1];
+ work[iwrk + k - 1] = r__1 * r__1 + r__2 * r__2;
+/* L30: */
+ }
+ k = isamax_(n, &work[iwrk], &c__1);
+ slartg_(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1],
+ &cs, &sn, &r__);
+ srot_(n, &vr[i__ * vr_dim1 + 1], &c__1, &vr[(i__ + 1) *
+ vr_dim1 + 1], &c__1, &cs, &sn);
+ vr[k + (i__ + 1) * vr_dim1] = 0.f;
+ }
+/* L40: */
+ }
+ }
+
+/* Undo scaling if necessary */
+
+L50:
+ if (scalea) {
+ i__1 = *n - *info;
+/* Computing MAX */
+ i__3 = *n - *info;
+ i__2 = max(i__3,1);
+ slascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wr[*info +
+ 1], &i__2, &ierr);
+ i__1 = *n - *info;
+/* Computing MAX */
+ i__3 = *n - *info;
+ i__2 = max(i__3,1);
+ slascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wi[*info +
+ 1], &i__2, &ierr);
+ if (*info > 0) {
+ i__1 = ilo - 1;
+ slascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wr[1],
+ n, &ierr);
+ i__1 = ilo - 1;
+ slascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &wi[1],
+ n, &ierr);
+ }
+ }
+
+ work[1] = (real) maxwrk;
+ return 0;
+
+/* End of SGEEV */
+
+} /* sgeev_ */
+
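+/*
+   Editorial sketch, not part of the translated source.  As documented above,
+   SGEEV returns a complex conjugate pair of eigenvalues in consecutive
+   entries of WR/WI, and the corresponding eigenvectors split across two
+   adjacent columns of VR.  The fragment kept in this comment (hypothetical
+   caller-side code, 0-based C indexing into the column-major arrays filled
+   by a successful sgeev_ call, printf from <stdio.h>) walks the spectrum:
+   the first branch handles a real eigenvalue, the second reassembles the
+   pair lambda = wr[j] +/- i*wi[j] with eigenvectors VR(:,j) +/- i*VR(:,j+1).
+
+       integer j, k;
+       for (j = 0; j < n; ) {
+           if (wi[j] == 0.f) {
+               for (k = 0; k < n; ++k)
+                   printf("v[%d] = %g\n", (int) k, vr[k + j * ldvr]);
+               j += 1;
+           } else {
+               for (k = 0; k < n; ++k)
+                   printf("v[%d] = %g + %gi\n", (int) k,
+                          vr[k + j * ldvr], vr[k + (j + 1) * ldvr]);
+               j += 2;
+           }
+       }
+*/
+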
+/* Subroutine */ int sgehd2_(integer *n, integer *ilo, integer *ihi, real *a,
+ integer *lda, real *tau, real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ static real aii;
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *), slarfg_(integer *, real *, real *,
+ integer *, real *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGEHD2 reduces a real general matrix A to upper Hessenberg form H by
+ an orthogonal similarity transformation: Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to SGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= max(1,N).
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the n by n general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the orthogonal matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) REAL array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEHD2", &i__1);
+ return 0;
+ }
+
+ i__1 = *ihi - 1;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+
+/* Compute elementary reflector H(i) to annihilate A(i+2:ihi,i) */
+
+ i__2 = *ihi - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &tau[i__]);
+ aii = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.f;
+
+/* Apply H(i) to A(1:ihi,i+1:ihi) from the right */
+
+ i__2 = *ihi - i__;
+ slarf_("Right", ihi, &i__2, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &a[(i__ + 1) * a_dim1 + 1], lda, &work[1]);
+
+/* Apply H(i) to A(i+1:ihi,i+1:n) from the left */
+
+ i__2 = *ihi - i__;
+ i__3 = *n - i__;
+ slarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + 1 + (i__ + 1) * a_dim1], lda, &work[1]);
+
+ a[i__ + 1 + i__ * a_dim1] = aii;
+/* L10: */
+ }
+
+ return 0;
+
+/* End of SGEHD2 */
+
+} /* sgehd2_ */
+
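+/*
+   Editorial sketch, not part of the translated source.  SGEHD2's Further
+   Details state that for reflector H(i) the vector v has v(1:i) = 0,
+   v(i+1) = 1 implicitly, v(i+2:ihi) stored in A(i+2:ihi,i), and
+   v(ihi+1:n) = 0.  The fragment below (comment only, hypothetical caller
+   code) rebuilds that vector in a 0-based C array v of length n, where i is
+   the 1-based reflector index and a is the column-major output of sgehd2_
+   with leading dimension lda:
+
+       for (k = 0; k < i; ++k)
+           v[k] = 0.f;
+       v[i] = 1.f;
+       for (k = i + 1; k < ihi; ++k)
+           v[k] = a[k + (i - 1) * lda];
+       for (k = ihi; k < n; ++k)
+           v[k] = 0.f;
+*/
+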
+/* Subroutine */ int sgehrd_(integer *n, integer *ilo, integer *ihi, real *a,
+ integer *lda, real *tau, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j;
+ static real t[4160] /* was [65][64] */;
+ static integer ib;
+ static real ei;
+ static integer nb, nh, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *), strmm_(char *, char *, char *,
+ char *, integer *, integer *, real *, real *, integer *, real *,
+ integer *), saxpy_(integer *,
+ real *, real *, integer *, real *, integer *), sgehd2_(integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ ), slahr2_(integer *, integer *, integer *, real *, integer *,
+ real *, real *, integer *, real *, integer *), slarfb_(char *,
+ char *, char *, char *, integer *, integer *, integer *, real *,
+ integer *, real *, integer *, real *, integer *, real *, integer *
+ ), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+ SGEHRD reduces a real general matrix A to upper Hessenberg form H by
+ an orthogonal similarity transformation: Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to SGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the N-by-N general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the orthogonal matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) REAL array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
+ zero.
+
+ WORK (workspace/output) REAL array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This file is a slight modification of LAPACK-3.0's DGEHRD
+ subroutine incorporating improvements proposed by Quintana-Orti and
+ Van de Geijn (2006). (See DLAHR2.)
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MIN */
+ i__1 = 64, i__2 = ilaenv_(&c__1, "SGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ lwkopt = *n * nb;
+ work[1] = (real) lwkopt;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEHRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
+
+ i__1 = *ilo - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ tau[i__] = 0.f;
+/* L10: */
+ }
+ i__1 = *n - 1;
+ for (i__ = max(1,*ihi); i__ <= i__1; ++i__) {
+ tau[i__] = 0.f;
+/* L20: */
+ }
+
+/* Quick return if possible */
+
+ nh = *ihi - *ilo + 1;
+ if (nh <= 1) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+/*
+ Determine the block size
+
+ Computing MIN
+*/
+ i__1 = 64, i__2 = ilaenv_(&c__1, "SGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ nbmin = 2;
+ iws = 1;
+ if (nb > 1 && nb < nh) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code)
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "SGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < nh) {
+
+/* Determine if workspace is large enough for blocked code */
+
+ iws = *n * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code
+
+ Computing MAX
+*/
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SGEHRD", " ", n, ilo, ihi, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ if (*lwork >= *n * nbmin) {
+ nb = *lwork / *n;
+ } else {
+ nb = 1;
+ }
+ }
+ }
+ }
+ ldwork = *n;
+
+ if (nb < nbmin || nb >= nh) {
+
+/* Use unblocked code below */
+
+ i__ = *ilo;
+
+ } else {
+
+/* Use blocked code */
+
+ i__1 = *ihi - 1 - nx;
+ i__2 = nb;
+ for (i__ = *ilo; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *ihi - i__;
+ ib = min(i__3,i__4);
+
+/*
+ Reduce columns i:i+ib-1 to Hessenberg form, returning the
+ matrices V and T of the block reflector H = I - V*T*V'
+ which performs the reduction, and also the matrix Y = A*V*T
+*/
+
+ slahr2_(ihi, &i__, &ib, &a[i__ * a_dim1 + 1], lda, &tau[i__], t, &
+ c__65, &work[1], &ldwork);
+
+/*
+ Apply the block reflector H to A(1:ihi,i+ib:ihi) from the
+ right, computing A := A - Y * V'. V(i+ib,ib-1) must be set
+ to 1
+*/
+
+ ei = a[i__ + ib + (i__ + ib - 1) * a_dim1];
+ a[i__ + ib + (i__ + ib - 1) * a_dim1] = 1.f;
+ i__3 = *ihi - i__ - ib + 1;
+ sgemm_("No transpose", "Transpose", ihi, &i__3, &ib, &c_b151, &
+ work[1], &ldwork, &a[i__ + ib + i__ * a_dim1], lda, &
+ c_b15, &a[(i__ + ib) * a_dim1 + 1], lda);
+ a[i__ + ib + (i__ + ib - 1) * a_dim1] = ei;
+
+/*
+ Apply the block reflector H to A(1:i,i+1:i+ib-1) from the
+ right
+*/
+
+ i__3 = ib - 1;
+ strmm_("Right", "Lower", "Transpose", "Unit", &i__, &i__3, &c_b15,
+ &a[i__ + 1 + i__ * a_dim1], lda, &work[1], &ldwork);
+ i__3 = ib - 2;
+ for (j = 0; j <= i__3; ++j) {
+ saxpy_(&i__, &c_b151, &work[ldwork * j + 1], &c__1, &a[(i__ +
+ j + 1) * a_dim1 + 1], &c__1);
+/* L30: */
+ }
+
+/*
+ Apply the block reflector H to A(i+1:ihi,i+ib:n) from the
+ left
+*/
+
+ i__3 = *ihi - i__;
+ i__4 = *n - i__ - ib + 1;
+ slarfb_("Left", "Transpose", "Forward", "Columnwise", &i__3, &
+ i__4, &ib, &a[i__ + 1 + i__ * a_dim1], lda, t, &c__65, &a[
+ i__ + 1 + (i__ + ib) * a_dim1], lda, &work[1], &ldwork);
+/* L40: */
+ }
+ }
+
+/* Use unblocked code to reduce the rest of the matrix */
+
+ sgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo);
+ work[1] = (real) iws;
+
+ return 0;
+
+/* End of SGEHRD */
+
+} /* sgehrd_ */
+
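+/*
+   Editorial sketch, not part of the translated source.  After SGEHRD the
+   orthogonal factor Q exists only implicitly as the stored reflectors; a
+   caller that needs Q as an explicit matrix typically follows up with
+   SORGHR (see the extern declaration inside sgeev_ above).  Hypothetical
+   caller-side fragment, kept as a comment; workspace sizing is abbreviated,
+   q is a caller-allocated lda-by-n buffer, and memcpy comes from <string.h>:
+
+       sgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+       memcpy(q, a, (size_t) lda * (size_t) n * sizeof(real));
+       sorghr_(&n, &ilo, &ihi, q, &lda, tau, work, &lwork, &info);
+*/
+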
+/* Subroutine */ int sgelq2_(integer *m, integer *n, real *a, integer *lda,
+ real *tau, real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k;
+ static real aii;
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *), slarfg_(integer *, real *, real *,
+ integer *, real *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SGELQ2 computes an LQ factorization of a real m by n matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m by min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) REAL array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k) . . . H(2) H(1), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGELQ2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3,*n) * a_dim1]
+ , lda, &tau[i__]);
+ if (i__ < *m) {
+
+/* Apply H(i) to A(i+1:m,i:n) from the right */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.f;
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ slarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[
+ i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of SGELQ2 */
+
+} /* sgelq2_ */
+
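+/*
+   Editorial sketch, not part of the translated source.  After SGELQ2 (or
+   SGELQF below), the factor L sits on and below the diagonal of A while the
+   reflectors defining Q sit above it.  Hypothetical caller-side fragment,
+   kept as a comment, copying the m-by-min(m,n) lower trapezoid out of the
+   0-based column-major array a (leading dimension lda) into l (leading
+   dimension ldl):
+
+       for (j = 0; j < m && j < n; ++j)
+           for (i = 0; i < m; ++i)
+               l[i + j * ldl] = (i >= j) ? a[i + j * lda] : 0.f;
+*/
+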
+/* Subroutine */ int sgelqf_(integer *m, integer *n, real *a, integer *lda,
+ real *tau, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int sgelq2_(integer *, integer *, real *, integer
+ *, real *, real *, integer *), slarfb_(char *, char *, char *,
+ char *, integer *, integer *, integer *, real *, integer *, real *
+ , integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGELQF computes an LQ factorization of a real M-by-N matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m-by-min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k) . . . H(2) H(1), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "SGELQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *m * nb;
+ work[1] = (real) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGELQF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "SGELQF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SGELQF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the LQ factorization of the current block
+ A(i:i+ib-1,i:n)
+*/
+
+ i__3 = *n - i__ + 1;
+ sgelq2_(&ib, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *n - i__ + 1;
+ slarft_("Forward", "Rowwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i+ib:m,i:n) from the right */
+
+ i__3 = *m - i__ - ib + 1;
+ i__4 = *n - i__ + 1;
+ slarfb_("Right", "No transpose", "Forward", "Rowwise", &i__3,
+ &i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib +
+ 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ sgelq2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1] = (real) iws;
+ return 0;
+
+/* End of SGELQF */
+
+} /* sgelqf_ */
+
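+/*
+   Editorial sketch, not part of the translated source.  All routines in
+   this file keep LAPACK's Fortran calling conventions: every argument is
+   passed by pointer and matrices are column-major with an explicit leading
+   dimension.  A hypothetical caller-side fragment for sgelqf_, kept as a
+   comment; a holds the 2-by-3 matrix with rows (1 3 5) and (2 4 6) stored
+   column-major with lda = 2, the numeric values are arbitrary, and the
+   integer/real typedefs are the ones used by this file:
+
+       real a[6] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f };
+       real tau[2], work[64];
+       integer m = 2, n = 3, lda = 2, lwork = 64, info;
+       sgelqf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+*/
+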
+/* Subroutine */ int sgelsd_(integer *m, integer *n, integer *nrhs, real *a,
+ integer *lda, real *b, integer *ldb, real *s, real *rcond, integer *
+ rank, real *work, integer *lwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer ie, il, mm;
+ static real eps, anrm, bnrm;
+ static integer itau, nlvl, iascl, ibscl;
+ static real sfmin;
+ static integer minmn, maxmn, itaup, itauq, mnthr, nwork;
+ extern /* Subroutine */ int slabad_(real *, real *), sgebrd_(integer *,
+ integer *, real *, integer *, real *, real *, real *, real *,
+ real *, integer *, integer *);
+ extern doublereal slamch_(char *), slange_(char *, integer *,
+ integer *, real *, integer *, real *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static real bignum;
+ extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer
+ *, real *, real *, integer *, integer *), slalsd_(char *, integer
+ *, integer *, integer *, real *, real *, real *, integer *, real *
+ , integer *, real *, integer *, integer *), slascl_(char *
+ , integer *, integer *, real *, real *, integer *, integer *,
+ real *, integer *, integer *);
+ static integer wlalsd;
+ extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer
+ *, real *, real *, integer *, integer *), slacpy_(char *, integer
+ *, integer *, real *, integer *, real *, integer *),
+ slaset_(char *, integer *, integer *, real *, real *, real *,
+ integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , real *, integer *, integer *);
+ static integer liwork, minwrk, maxwrk;
+ static real smlnum;
+ extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+ static logical lquery;
+ static integer smlsiz;
+ extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGELSD computes the minimum-norm solution to a real linear least
+ squares problem:
+ minimize 2-norm(| b - A*x |)
+ using the singular value decomposition (SVD) of A. A is an M-by-N
+ matrix which may be rank-deficient.
+
+ Several right hand side vectors b and solution vectors x can be
+ handled in a single call; they are stored as the columns of the
+ M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+ matrix X.
+
+ The problem is solved in three steps:
+ (1) Reduce the coefficient matrix A to bidiagonal form with
+ Householder transformations, reducing the original problem
+ into a "bidiagonal least squares problem" (BLS)
+ (2) Solve the BLS using a divide and conquer approach.
+      (3) Apply back all the Householder transformations to solve
+ the original least squares problem.
+
+ The effective rank of A is determined by treating as zero those
+ singular values which are less than RCOND times the largest singular
+ value.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrices B and X. NRHS >= 0.
+
+ A (input) REAL array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, A has been destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (input/output) REAL array, dimension (LDB,NRHS)
+ On entry, the M-by-NRHS right hand side matrix B.
+ On exit, B is overwritten by the N-by-NRHS solution
+ matrix X. If m >= n and RANK = n, the residual
+ sum-of-squares for the solution in the i-th column is given
+ by the sum of squares of elements n+1:m in that column.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,max(M,N)).
+
+ S (output) REAL array, dimension (min(M,N))
+ The singular values of A in decreasing order.
+ The condition number of A in the 2-norm = S(1)/S(min(m,n)).
+
+ RCOND (input) REAL
+ RCOND is used to determine the effective rank of A.
+ Singular values S(i) <= RCOND*S(1) are treated as zero.
+ If RCOND < 0, machine precision is used instead.
+
+ RANK (output) INTEGER
+ The effective rank of A, i.e., the number of singular values
+ which are greater than RCOND*S(1).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK must be at least 1.
+ The exact minimum amount of workspace needed depends on M,
+ N and NRHS. As long as LWORK is at least
+ 12*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2,
+ if M is greater than or equal to N or
+ 12*M + 2*M*SMLSIZ + 8*M*NLVL + M*NRHS + (SMLSIZ+1)**2,
+ if M is less than N, the code will execute correctly.
+ SMLSIZ is returned by ILAENV and is equal to the maximum
+ size of the subproblems at the bottom of the computation
+ tree (usually about 25), and
+ NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+ For good performance, LWORK should generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the array WORK and the
+ minimum size of the array IWORK, and returns these values as
+ the first entries of the WORK and IWORK arrays, and no error
+ message related to LWORK is issued by XERBLA.
+
+ IWORK (workspace) INTEGER array, dimension (MAX(1,LIWORK))
+ LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN),
+ where MINMN = MIN( M,N ).
+ On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: the algorithm for computing the SVD failed to converge;
+ if INFO = i, i off-diagonal elements of an intermediate
+ bidiagonal form did not converge to zero.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input arguments.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --s;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ maxmn = max(*m,*n);
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldb < max(1,maxmn)) {
+ *info = -7;
+ }
+
+/*
+ Compute workspace.
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ liwork = 1;
+ if (minmn > 0) {
+ smlsiz = ilaenv_(&c__9, "SGELSD", " ", &c__0, &c__0, &c__0, &c__0,
+ (ftnlen)6, (ftnlen)1);
+ mnthr = ilaenv_(&c__6, "SGELSD", " ", m, n, nrhs, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = (integer) (log((real) minmn / (real) (smlsiz + 1)) / log(
+ 2.f)) + 1;
+ nlvl = max(i__1,0);
+ liwork = minmn * 3 * nlvl + minmn * 11;
+ mm = *m;
+ if (*m >= *n && *m >= mnthr) {
+
+/*
+ Path 1a - overdetermined, with many more rows than
+ columns.
+*/
+
+ mm = *n;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + *n * ilaenv_(&c__1, "SGEQRF",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + *nrhs * ilaenv_(&c__1, "SORMQR",
+ "LT", m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+ }
+ if (*m >= *n) {
+
+/*
+ Path 1 - overdetermined or exactly determined.
+
+ Computing MAX
+*/
+ i__1 = maxwrk, i__2 = *n * 3 + (mm + *n) * ilaenv_(&c__1,
+ "SGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * 3 + *nrhs * ilaenv_(&c__1, "SORMBR"
+ , "QLT", &mm, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * 3 + (*n - 1) * ilaenv_(&c__1,
+ "SORMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing 2nd power */
+ i__1 = smlsiz + 1;
+ wlalsd = *n * 9 + (*n << 1) * smlsiz + (*n << 3) * nlvl + *n *
+ *nrhs + i__1 * i__1;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * 3 + wlalsd;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = *n * 3 + mm, i__2 = *n * 3 + *nrhs, i__1 = max(i__1,
+ i__2), i__2 = *n * 3 + wlalsd;
+ minwrk = max(i__1,i__2);
+ }
+ if (*n > *m) {
+/* Computing 2nd power */
+ i__1 = smlsiz + 1;
+ wlalsd = *m * 9 + (*m << 1) * smlsiz + (*m << 3) * nlvl + *m *
+ *nrhs + i__1 * i__1;
+ if (*n >= mnthr) {
+
+/*
+ Path 2a - underdetermined, with many more columns
+ than rows.
+*/
+
+ maxwrk = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) *
+ ilaenv_(&c__1, "SGEBRD", " ", m, m, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs *
+ ilaenv_(&c__1, "SORMBR", "QLT", m, nrhs, m, &c_n1,
+ (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) *
+ ilaenv_(&c__1, "SORMBR", "PLN", m, nrhs, m, &c_n1,
+ (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ if (*nrhs > 1) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+ } else {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 1);
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m + *nrhs * ilaenv_(&c__1, "SORMLQ"
+ , "LT", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + wlalsd;
+ maxwrk = max(i__1,i__2);
+/*
+ XXX: Ensure the Path 2a case below is triggered. The workspace
+ calculation should use queries for all routines eventually.
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4),
+ i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4)
+ ;
+ maxwrk = max(i__1,i__2);
+ } else {
+
+/* Path 2 - remaining underdetermined cases. */
+
+ maxwrk = *m * 3 + (*n + *m) * ilaenv_(&c__1, "SGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * 3 + *nrhs * ilaenv_(&c__1,
+ "SORMBR", "QLT", m, nrhs, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORM"
+ "BR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)
+ 3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * 3 + wlalsd;
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = *m * 3 + *nrhs, i__2 = *m * 3 + *m, i__1 = max(i__1,
+ i__2), i__2 = *m * 3 + wlalsd;
+ minwrk = max(i__1,i__2);
+ }
+ }
+ minwrk = min(minwrk,maxwrk);
+ work[1] = (real) maxwrk;
+ iwork[1] = liwork;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGELSD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ *rank = 0;
+ return 0;
+ }
+
+/* Get machine parameters. */
+
+ eps = slamch_("P");
+ sfmin = slamch_("S");
+ smlnum = sfmin / eps;
+ bignum = 1.f / smlnum;
+ slabad_(&smlnum, &bignum);
+
+/* Scale A if max entry outside range [SMLNUM,BIGNUM]. */
+
+ anrm = slange_("M", m, n, &a[a_offset], lda, &work[1]);
+ iascl = 0;
+ if (anrm > 0.f && anrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM. */
+
+ slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 1;
+ } else if (anrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 2;
+ } else if (anrm == 0.f) {
+
+/* Matrix all zero. Return zero solution. */
+
+ i__1 = max(*m,*n);
+ slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+ slaset_("F", &minmn, &c__1, &c_b29, &c_b29, &s[1], &c__1);
+ *rank = 0;
+ goto L10;
+ }
+
+/* Scale B if max entry outside range [SMLNUM,BIGNUM]. */
+
+ bnrm = slange_("M", m, nrhs, &b[b_offset], ldb, &work[1]);
+ ibscl = 0;
+ if (bnrm > 0.f && bnrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM. */
+
+ slascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 1;
+ } else if (bnrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ slascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 2;
+ }
+
+/* If M < N make sure certain entries of B are zero. */
+
+ if (*m < *n) {
+ i__1 = *n - *m;
+ slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1], ldb);
+ }
+
+/* Overdetermined case. */
+
+ if (*m >= *n) {
+
+/* Path 1 - overdetermined or exactly determined. */
+
+ mm = *m;
+ if (*m >= mnthr) {
+
+/* Path 1a - overdetermined, with many more rows than columns. */
+
+ mm = *n;
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R.
+ (Workspace: need 2*N, prefer N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+
+/*
+ Multiply B by transpose(Q).
+ (Workspace: need N+NRHS, prefer N+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormqr_("L", "T", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below R. */
+
+ if (*n > 1) {
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &a[a_dim1 + 2],
+ lda);
+ }
+ }
+
+ ie = 1;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A.
+ (Workspace: need 3*N+MM, prefer 3*N+(MM+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(&mm, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of R.
+ (Workspace: need 3*N+NRHS, prefer 3*N+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "T", &mm, nrhs, n, &a[a_offset], lda, &work[itauq],
+ &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ slalsd_("U", &smlsiz, n, nrhs, &s[1], &work[ie], &b[b_offset], ldb,
+ rcond, rank, &work[nwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of R. */
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], &
+ b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max(
+ i__1,*nrhs), i__2 = *n - *m * 3, i__1 = max(i__1,i__2);
+ if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,wlalsd)) {
+
+/*
+ Path 2a - underdetermined, with many more columns than rows
+ and sufficient workspace for an efficient algorithm.
+*/
+
+ ldwork = *m;
+/*
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+ max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda +
+ *m + *m * *nrhs, i__1 = max(i__1,i__2), i__2 = (*m << 2)
+ + *m * *lda + wlalsd;
+ if (*lwork >= max(i__1,i__2)) {
+ ldwork = *lda;
+ }
+ itau = 1;
+ nwork = *m + 1;
+
+/*
+ Compute A=L*Q.
+ (Workspace: need 2*M, prefer M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+ il = nwork;
+
+/* Copy L to WORK(IL), zeroing out above its diagonal. */
+
+ slacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ slaset_("U", &i__1, &i__2, &c_b29, &c_b29, &work[il + ldwork], &
+ ldwork);
+ ie = il + ldwork * *m;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL).
+ (Workspace: need M*M+5*M, prefer M*M+4*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(m, m, &work[il], &ldwork, &s[1], &work[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of L.
+ (Workspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "T", m, nrhs, m, &work[il], &ldwork, &work[
+ itauq], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ slalsd_("U", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of L. */
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[
+ itaup], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below first M rows of B. */
+
+ i__1 = *n - *m;
+ slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1],
+ ldb);
+ nwork = itau + *m;
+
+/*
+ Multiply transpose(Q) by B.
+ (Workspace: need M+NRHS, prefer M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormlq_("L", "T", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else {
+
+/* Path 2 - remaining underdetermined cases. */
+
+ ie = 1;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A.
+ (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors.
+ (Workspace: need 3*M+NRHS, prefer 3*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "T", m, nrhs, n, &a[a_offset], lda, &work[itauq]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ slalsd_("L", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of A. */
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ }
+ }
+
+/* Undo scaling. */
+
+ if (iascl == 1) {
+ slascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb,
+ info);
+ slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ } else if (iascl == 2) {
+ slascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb,
+ info);
+ slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ }
+ if (ibscl == 1) {
+ slascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ } else if (ibscl == 2) {
+ slascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ }
+
+L10:
+ work[1] = (real) maxwrk;
+ iwork[1] = liwork;
+ return 0;
+
+/* End of SGELSD */
+
+} /* sgelsd_ */
+
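+/*
+    Illustrative sketch, not part of the reference LAPACK sources: one way a
+    caller could use the LWORK = -1 workspace query documented for SGELSD
+    above, first asking for the optimal WORK/IWORK sizes and then solving a
+    small 3-by-2 least squares problem.  The helper name, the fixed problem
+    size and the static buffer sizes are hypothetical; real callers allocate
+    the WORK(1)/IWORK(1) elements returned by the query.
+*/
+
+static int example_sgelsd_workspace_query(void)
+{
+    /* Column-major 3x2 matrix A and right hand side b. */
+    integer m = 3, n = 2, nrhs = 1, lda = 3, ldb = 3, rank, info;
+    integer lwork = -1, iwork[64];
+    real a[6] = {1.f, 1.f, 1.f, 1.f, 2.f, 3.f};
+    real b[3] = {6.f, 0.f, 0.f};
+    real s[2], rcond = -1.f, work[2048];
+
+    /* Workspace query: only WORK(1) and IWORK(1) are computed. */
+    sgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, &rank,
+            work, &lwork, iwork, &info);
+
+    /* Solve, using the optimal size returned by the query (assumed here to
+       fit the static buffers above). */
+    lwork = (integer) work[0];
+    sgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, &rank,
+            work, &lwork, iwork, &info);
+    return (int) info;
+}
+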
+/* Subroutine */ int sgeqr2_(integer *m, integer *n, real *a, integer *lda,
+ real *tau, real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k;
+ static real aii;
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *), slarfg_(integer *, real *, real *,
+ integer *, real *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SGEQR2 computes a QR factorization of a real m by n matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(m,n) by n upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEQR2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
+ , &c__1, &tau[i__]);
+ if (i__ < *n) {
+
+/* Apply H(i) to A(i:m,i+1:n) from the left */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.f;
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ slarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of SGEQR2 */
+
+} /* sgeqr2_ */
+
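+/*
+    Illustrative sketch, not part of the reference LAPACK sources: applying a
+    single elementary reflector H = I - tau*v*v' to a vector x, i.e. the
+    representation described in the Further Details of SGEQR2 above.  The
+    helper name is hypothetical; SGEQR2 itself applies its reflectors through
+    SLARF rather than through an explicit loop like this.
+*/
+
+static void example_apply_reflector(integer m, real tau, real *v, real *x)
+{
+    /* H*x = x - tau * (v'*x) * v */
+    integer i;
+    real dot = 0.f;
+    for (i = 0; i < m; ++i) {
+        dot += v[i] * x[i];
+    }
+    for (i = 0; i < m; ++i) {
+        x[i] -= tau * dot * v[i];
+    }
+}
+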
+/* Subroutine */ int sgeqrf_(integer *m, integer *n, real *a, integer *lda,
+ real *tau, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer
+ *, real *, real *, integer *), slarfb_(char *, char *, char *,
+ char *, integer *, integer *, integer *, real *, integer *, real *
+ , integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGEQRF computes a QR factorization of a real M-by-N matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of min(m,n) elementary reflectors (see Further
+ Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) REAL array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *n * nb;
+ work[1] = (real) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGEQRF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQRF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQRF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the QR factorization of the current block
+ A(i:m,i:i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ sgeqr2_(&i__3, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ slarft_("Forward", "Columnwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i:m,i+ib:n) from the left */
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ - ib + 1;
+ slarfb_("Left", "Transpose", "Forward", "Columnwise", &i__3, &
+ i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &work[ib
+ + 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ sgeqr2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1] = (real) iws;
+ return 0;
+
+/* End of SGEQRF */
+
+} /* sgeqrf_ */
+
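+/*
+    Illustrative sketch, not part of the reference LAPACK sources: the
+    LWORK = -1 query documented for SGEQRF above, followed by the blocked
+    factorization itself on a fixed 4-by-3 matrix.  The helper name, the
+    matrix contents and the static WORK buffer are hypothetical; the query
+    result is clamped to the buffer size, which stays legal because SGEQRF
+    only requires LWORK >= max(1,N).
+*/
+
+static int example_sgeqrf_workspace_query(void)
+{
+    integer m = 4, n = 3, lda = 4, lwork, info, i;
+    real a[12], tau[3], work[256];
+
+    for (i = 0; i < 12; ++i) {
+        a[i] = (real) (i + 1);      /* any column-major 4x3 data */
+    }
+
+    /* Query: the optimal LWORK is returned in WORK(1). */
+    lwork = -1;
+    sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+
+    /* Factor, using at most the static buffer size. */
+    lwork = (integer) work[0];
+    if (lwork > 256) {
+        lwork = 256;
+    }
+    sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+    return (int) info;
+}
+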
+/* Subroutine */ int sgesdd_(char *jobz, integer *m, integer *n, real *a,
+ integer *lda, real *s, real *u, integer *ldu, real *vt, integer *ldvt,
+ real *work, integer *lwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ie, il, ir, iu, blk;
+ static real dum[1], eps;
+ static integer ivt, iscl;
+ static real anrm;
+ static integer idum[1], ierr, itau;
+ extern logical lsame_(char *, char *);
+ static integer chunk;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer minmn, wrkbl, itaup, itauq, mnthr;
+ static logical wntqa;
+ static integer nwork;
+ static logical wntqn, wntqo, wntqs;
+ static integer bdspac;
+ extern /* Subroutine */ int sbdsdc_(char *, char *, integer *, real *,
+ real *, real *, integer *, real *, integer *, real *, integer *,
+ real *, integer *, integer *), sgebrd_(integer *,
+ integer *, real *, integer *, real *, real *, real *, real *,
+ real *, integer *, integer *);
+ extern doublereal slamch_(char *), slange_(char *, integer *,
+ integer *, real *, integer *, real *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static real bignum;
+ extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer
+ *, real *, real *, integer *, integer *), slascl_(char *, integer
+ *, integer *, real *, real *, integer *, integer *, real *,
+ integer *, integer *), sgeqrf_(integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *), slacpy_(char
+ *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *,
+ real *, integer *), sorgbr_(char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, integer *
+ );
+ static integer ldwrkl;
+ extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , real *, integer *, integer *);
+ static integer ldwrkr, minwrk, ldwrku, maxwrk;
+ extern /* Subroutine */ int sorglq_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *);
+ static integer ldwkvt;
+ static real smlnum;
+ static logical wntqas;
+ extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *);
+ static logical lquery;
+
+
+/*
+ -- LAPACK driver routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ March 2009
+
+
+ Purpose
+ =======
+
+ SGESDD computes the singular value decomposition (SVD) of a real
+ M-by-N matrix A, optionally computing the left and right singular
+ vectors. If singular vectors are desired, it uses a
+ divide-and-conquer algorithm.
+
+ The SVD is written
+
+ A = U * SIGMA * transpose(V)
+
+ where SIGMA is an M-by-N matrix which is zero except for its
+ min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+ V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA
+ are the singular values of A; they are real and non-negative, and
+ are returned in descending order. The first min(m,n) columns of
+ U and V are the left and right singular vectors of A.
+
+ Note that the routine returns VT = V**T, not V.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ Specifies options for computing all or part of the matrix U:
+ = 'A': all M columns of U and all N rows of V**T are
+ returned in the arrays U and VT;
+ = 'S': the first min(M,N) columns of U and the first
+ min(M,N) rows of V**T are returned in the arrays U
+ and VT;
+ = 'O': If M >= N, the first N columns of U are overwritten
+ on the array A and all rows of V**T are returned in
+ the array VT;
+ otherwise, all columns of U are returned in the
+ array U and the first M rows of V**T are overwritten
+ in the array A;
+ = 'N': no columns of U or rows of V**T are computed.
+
+ M (input) INTEGER
+ The number of rows of the input matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the input matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit,
+ if JOBZ = 'O', A is overwritten with the first N columns
+ of U (the left singular vectors, stored
+ columnwise) if M >= N;
+ A is overwritten with the first M rows
+ of V**T (the right singular vectors, stored
+ rowwise) otherwise.
+ if JOBZ .ne. 'O', the contents of A are destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ S (output) REAL array, dimension (min(M,N))
+ The singular values of A, sorted so that S(i) >= S(i+1).
+
+ U (output) REAL array, dimension (LDU,UCOL)
+ UCOL = M if JOBZ = 'A' or JOBZ = 'O' and M < N;
+ UCOL = min(M,N) if JOBZ = 'S'.
+ If JOBZ = 'A' or JOBZ = 'O' and M < N, U contains the M-by-M
+ orthogonal matrix U;
+ if JOBZ = 'S', U contains the first min(M,N) columns of U
+ (the left singular vectors, stored columnwise);
+ if JOBZ = 'O' and M >= N, or JOBZ = 'N', U is not referenced.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= 1; if
+ JOBZ = 'S' or 'A' or JOBZ = 'O' and M < N, LDU >= M.
+
+ VT (output) REAL array, dimension (LDVT,N)
+ If JOBZ = 'A' or JOBZ = 'O' and M >= N, VT contains the
+ N-by-N orthogonal matrix V**T;
+ if JOBZ = 'S', VT contains the first min(M,N) rows of
+ V**T (the right singular vectors, stored rowwise);
+ if JOBZ = 'O' and M < N, or JOBZ = 'N', VT is not referenced.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= 1; if
+ JOBZ = 'A' or JOBZ = 'O' and M >= N, LDVT >= N;
+ if JOBZ = 'S', LDVT >= min(M,N).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ If JOBZ = 'N',
+ LWORK >= 3*min(M,N) + max(max(M,N),6*min(M,N)).
+ If JOBZ = 'O',
+ LWORK >= 3*min(M,N) +
+ max(max(M,N),5*min(M,N)*min(M,N)+4*min(M,N)).
+ If JOBZ = 'S' or 'A'
+ LWORK >= 3*min(M,N) +
+ max(max(M,N),4*min(M,N)*min(M,N)+4*min(M,N)).
+ For good performance, LWORK should generally be larger.
+ If LWORK = -1 but other input arguments are legal, WORK(1)
+ returns the optimal LWORK.
+
+ IWORK (workspace) INTEGER array, dimension (8*min(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: SBDSDC did not converge, updating process failed.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --s;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ wntqa = lsame_(jobz, "A");
+ wntqs = lsame_(jobz, "S");
+ wntqas = wntqa || wntqs;
+ wntqo = lsame_(jobz, "O");
+ wntqn = lsame_(jobz, "N");
+ lquery = *lwork == -1;
+
+ if (! (wntqa || wntqs || wntqo || wntqn)) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldu < 1 || wntqas && *ldu < *m || wntqo && *m < *n && *ldu < *
+ m) {
+ *info = -8;
+ } else if (*ldvt < 1 || wntqa && *ldvt < *n || wntqs && *ldvt < minmn ||
+ wntqo && *m >= *n && *ldvt < *n) {
+ *info = -10;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ if (*m >= *n && minmn > 0) {
+
+/* Compute space needed for SBDSDC */
+
+ mnthr = (integer) (minmn * 11.f / 6.f);
+ if (wntqn) {
+ bdspac = *n * 7;
+ } else {
+ bdspac = *n * 3 * *n + (*n << 2);
+ }
+ if (*m >= mnthr) {
+ if (wntqn) {
+
+/* Path 1 (M much larger than N, JOBZ='N') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "SGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n;
+ maxwrk = max(i__1,i__2);
+ minwrk = bdspac + *n;
+ } else if (wntqo) {
+
+/* Path 2 (M much larger than N, JOBZ='O') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "SGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "SORGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "QLN", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + (*n << 1) * *n;
+ minwrk = bdspac + (*n << 1) * *n + *n * 3;
+ } else if (wntqs) {
+
+/* Path 3 (M much larger than N, JOBZ='S') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "SGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "SORGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "QLN", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *n * *n;
+ minwrk = bdspac + *n * *n + *n * 3;
+ } else if (wntqa) {
+
+/* Path 4 (M much larger than N, JOBZ='A') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "SGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *m * ilaenv_(&c__1, "SORGQR",
+ " ", m, m, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + (*n << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "QLN", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *n * *n;
+ minwrk = bdspac + *n * *n + *n * 3;
+ }
+ } else {
+
+/* Path 5 (M at least N, but not much larger) */
+
+ wrkbl = *n * 3 + (*m + *n) * ilaenv_(&c__1, "SGEBRD", " ", m,
+ n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ if (wntqn) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3 + max(*m,bdspac);
+ } else if (wntqo) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *n;
+/* Computing MAX */
+ i__1 = *m, i__2 = *n * *n + bdspac;
+ minwrk = *n * 3 + max(i__1,i__2);
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *n * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3 + max(*m,bdspac);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n * 3 + *n * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = bdspac + *n * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3 + max(*m,bdspac);
+ }
+ }
+ } else if (minmn > 0) {
+
+/* Compute space needed for SBDSDC */
+
+ mnthr = (integer) (minmn * 11.f / 6.f);
+ if (wntqn) {
+ bdspac = *m * 7;
+ } else {
+ bdspac = *m * 3 * *m + (*m << 2);
+ }
+ if (*n >= mnthr) {
+ if (wntqn) {
+
+/* Path 1t (N much larger than M, JOBZ='N') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m;
+ maxwrk = max(i__1,i__2);
+ minwrk = bdspac + *m;
+ } else if (wntqo) {
+
+/* Path 2t (N much larger than M, JOBZ='O') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "SORGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "PRT", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + (*m << 1) * *m;
+ minwrk = bdspac + (*m << 1) * *m + *m * 3;
+ } else if (wntqs) {
+
+/* Path 3t (N much larger than M, JOBZ='S') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "SORGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "PRT", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *m;
+ minwrk = bdspac + *m * *m + *m * 3;
+ } else if (wntqa) {
+
+/* Path 4t (N much larger than M, JOBZ='A') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *n * ilaenv_(&c__1, "SORGLQ",
+ " ", n, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + (*m << 1) * ilaenv_(&c__1,
+ "SGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "PRT", m, m, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *m;
+ minwrk = bdspac + *m * *m + *m * 3;
+ }
+ } else {
+
+/* Path 5t (N greater than M, but not much larger) */
+
+ wrkbl = *m * 3 + (*m + *n) * ilaenv_(&c__1, "SGEBRD", " ", m,
+ n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ if (wntqn) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3 + max(*n,bdspac);
+ } else if (wntqo) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "PRT", m, n, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ wrkbl = max(i__1,i__2);
+ maxwrk = wrkbl + *m * *n;
+/* Computing MAX */
+ i__1 = *n, i__2 = *m * *m + bdspac;
+ minwrk = *m * 3 + max(i__1,i__2);
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "PRT", m, n, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3 + max(*n,bdspac);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "QLN", m, m, n, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORMBR"
+ , "PRT", n, n, m, &c_n1, (ftnlen)6, (ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = bdspac + *m * 3;
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3 + max(*n,bdspac);
+ }
+ }
+ }
+ maxwrk = max(maxwrk,minwrk);
+ work[1] = (real) maxwrk;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGESDD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = slamch_("P");
+ smlnum = sqrt(slamch_("S")) / eps;
+ bignum = 1.f / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = slange_("M", m, n, &a[a_offset], lda, dum);
+ iscl = 0;
+ if (anrm > 0.f && anrm < smlnum) {
+ iscl = 1;
+ slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, &
+ ierr);
+ } else if (anrm > bignum) {
+ iscl = 1;
+ slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+ if (*m >= *n) {
+
+/*
+ A has at least as many rows as columns. If A has sufficiently
+ more rows than columns, first reduce using the QR
+ decomposition (if sufficient workspace available)
+*/
+
+ if (*m >= mnthr) {
+
+ if (wntqn) {
+
+/*
+ Path 1 (M much larger than N, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (Workspace: need 2*N, prefer N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Zero out below R */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &a[a_dim1 + 2],
+ lda);
+ ie = 1;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (Workspace: need 4*N, prefer 3*N+2*N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(n, n, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+ nwork = ie + *n;
+
+/*
+ Perform bidiagonal SVD, computing singular values only
+ (Workspace: need N+BDSPAC)
+*/
+
+ sbdsdc_("U", "N", n, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2 (M much larger than N, JOBZ = 'O')
+ N left singular vectors to be overwritten on A and
+ N right singular vectors to be computed in VT
+*/
+
+ ir = 1;
+
+/* WORK(IR) is LDWRKR by N */
+
+ if (*lwork >= *lda * *n + *n * *n + *n * 3 + bdspac) {
+ ldwrkr = *lda;
+ } else {
+ ldwrkr = (*lwork - *n * *n - *n * 3 - bdspac) / *n;
+ }
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy R to WORK(IR), zeroing out below it */
+
+ slacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sorgqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = itau;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in VT, copying result to WORK(IR)
+ (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/* WORK(IU) is N by N */
+
+ iu = nwork;
+ nwork = iu + *n * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in WORK(IU) and computing right
+ singular vectors of bidiagonal matrix in VT
+ (Workspace: need N+N*N+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &work[ie], &work[iu], n, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite WORK(IU) by left singular vectors of R
+ and VT by right singular vectors of R
+ (Workspace: need 2*N*N+3*N, prefer 2*N*N+2*N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &work[iu], n, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IU), storing result in WORK(IR) and copying to A
+ (Workspace: need 2*N*N, prefer N*N+M*N)
+*/
+
+ i__1 = *m;
+ i__2 = ldwrkr;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrkr);
+ sgemm_("N", "N", &chunk, n, n, &c_b15, &a[i__ + a_dim1],
+ lda, &work[iu], n, &c_b29, &work[ir], &ldwrkr);
+ slacpy_("F", &chunk, n, &work[ir], &ldwrkr, &a[i__ +
+ a_dim1], lda);
+/* L10: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3 (M much larger than N, JOBZ='S')
+ N left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ ir = 1;
+
+/* WORK(IR) is N by N */
+
+ ldwrkr = *n;
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy R to WORK(IR), zeroing out below it */
+
+ slacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ slaset_("L", &i__2, &i__1, &c_b29, &c_b29, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sorgqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = itau;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in WORK(IR)
+ (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need N+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of R and VT
+ by right singular vectors of R
+ (Workspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IR), storing result in U
+ (Workspace: need N*N)
+*/
+
+ slacpy_("F", n, n, &u[u_offset], ldu, &work[ir], &ldwrkr);
+ sgemm_("N", "N", m, n, n, &c_b15, &a[a_offset], lda, &work[ir]
+ , &ldwrkr, &c_b29, &u[u_offset], ldu);
+
+ } else if (wntqa) {
+
+/*
+ Path 4 (M much larger than N, JOBZ='A')
+ M left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ iu = 1;
+
+/* WORK(IU) is N by N */
+
+ ldwrku = *n;
+ itau = iu + ldwrku * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R, copying result to U
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+ slacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Generate Q in U
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+ i__2 = *lwork - nwork + 1;
+ sorgqr_(m, m, n, &u[u_offset], ldu, &work[itau], &work[nwork],
+ &i__2, &ierr);
+
+/* Produce R in A, zeroing out other entries */
+
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ slaset_("L", &i__2, &i__1, &c_b29, &c_b29, &a[a_dim1 + 2],
+ lda);
+ ie = itau;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgebrd_(n, n, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in WORK(IU) and computing right
+ singular vectors of bidiagonal matrix in VT
+ (Workspace: need N+N*N+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &work[ie], &work[iu], n, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite WORK(IU) by left singular vectors of R and VT
+ by right singular vectors of R
+ (Workspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", n, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__2, &
+ ierr);
+ i__2 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in U by left singular vectors of R in
+ WORK(IU), storing result in A
+ (Workspace: need N*N)
+*/
+
+ sgemm_("N", "N", m, n, n, &c_b15, &u[u_offset], ldu, &work[iu]
+ , &ldwrku, &c_b29, &a[a_offset], lda);
+
+/* Copy left singular vectors of A from A to U */
+
+ slacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+ }
+
+ } else {
+
+/*
+ M .LT. MNTHR
+
+ Path 5 (M at least N, but not much larger)
+ Reduce to bidiagonal form without QR decomposition
+*/
+
+ ie = 1;
+ itauq = ie + *n;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize A
+ (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Perform bidiagonal SVD, only computing singular values
+ (Workspace: need N+BDSPAC)
+*/
+
+ sbdsdc_("U", "N", n, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+ } else if (wntqo) {
+ iu = nwork;
+ if (*lwork >= *m * *n + *n * 3 + bdspac) {
+
+/* WORK( IU ) is M by N */
+
+ ldwrku = *m;
+ nwork = iu + ldwrku * *n;
+ slaset_("F", m, n, &c_b29, &c_b29, &work[iu], &ldwrku);
+ } else {
+
+/* WORK( IU ) is N by N */
+
+ ldwrku = *n;
+ nwork = iu + ldwrku * *n;
+
+/* WORK(IR) is LDWRKR by N */
+
+ ir = nwork;
+ ldwrkr = (*lwork - *n * *n - *n * 3) / *n;
+ }
+ nwork = iu + ldwrku * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in WORK(IU) and computing right
+ singular vectors of bidiagonal matrix in VT
+ (Workspace: need N+N*N+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", n, &s[1], &work[ie], &work[iu], &ldwrku, &
+ vt[vt_offset], ldvt, dum, idum, &work[nwork], &iwork[
+ 1], info);
+
+/*
+ Overwrite VT by right singular vectors of A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+ if (*lwork >= *m * *n + *n * 3 + bdspac) {
+
+/*
+ Overwrite WORK(IU) by left singular vectors of A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__2, &
+ ierr);
+
+/* Copy left singular vectors of A from WORK(IU) to A */
+
+ slacpy_("F", m, n, &work[iu], &ldwrku, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate Q in A
+ (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sorgbr_("Q", m, n, n, &a[a_offset], lda, &work[itauq], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Multiply Q in A by left singular vectors of
+ bidiagonal matrix in WORK(IU), storing result in
+ WORK(IR) and copying to A
+ (Workspace: need 2*N*N, prefer N*N+M*N)
+*/
+
+ i__2 = *m;
+ i__1 = ldwrkr;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrkr);
+ sgemm_("N", "N", &chunk, n, n, &c_b15, &a[i__ +
+ a_dim1], lda, &work[iu], &ldwrku, &c_b29, &
+ work[ir], &ldwrkr);
+ slacpy_("F", &chunk, n, &work[ir], &ldwrkr, &a[i__ +
+ a_dim1], lda);
+/* L20: */
+ }
+ }
+
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need N+BDSPAC)
+*/
+
+ slaset_("F", m, n, &c_b29, &c_b29, &u[u_offset], ldu);
+ sbdsdc_("U", "I", n, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need 3*N, prefer 2*N+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ } else if (wntqa) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need N+BDSPAC)
+*/
+
+ slaset_("F", m, m, &c_b29, &c_b29, &u[u_offset], ldu);
+ sbdsdc_("U", "I", n, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/* Set the right corner of U to the identity matrix */
+
+ if (*m > *n) {
+ i__1 = *m - *n;
+ i__2 = *m - *n;
+ slaset_("F", &i__1, &i__2, &c_b29, &c_b15, &u[*n + 1 + (*
+ n + 1) * u_dim1], ldu);
+ }
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need N*N+2*N+M, prefer N*N+2*N+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ }
+
+ }
+
+ } else {
+
+/*
+ A has more columns than rows. If A has sufficiently more
+ columns than rows, first reduce using the LQ decomposition (if
+ sufficient workspace available)
+*/
+
+ if (*n >= mnthr) {
+
+ if (wntqn) {
+
+/*
+ Path 1t (N much larger than M, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (Workspace: need 2*M, prefer M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Zero out above L */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ slaset_("U", &i__1, &i__2, &c_b29, &c_b29, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = 1;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (Workspace: need 4*M, prefer 3*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(m, m, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+ nwork = ie + *m;
+
+/*
+ Perform bidiagonal SVD, computing singular values only
+ (Workspace: need M+BDSPAC)
+*/
+
+ sbdsdc_("U", "N", m, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2t (N much larger than M, JOBZ='O')
+ M right singular vectors to be overwritten on A and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+
+/* IVT is M by M */
+
+ il = ivt + *m * *m;
+ if (*lwork >= *m * *n + *m * *m + *m * 3 + bdspac) {
+
+/* WORK(IL) is M by N */
+
+ ldwrkl = *m;
+ chunk = *n;
+ } else {
+ ldwrkl = *m;
+ chunk = (*lwork - *m * *m) / *m;
+ }
+ itau = il + ldwrkl * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ slacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ slaset_("U", &i__1, &i__2, &c_b29, &c_b29, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sorglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = itau;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL)
+ (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sgebrd_(m, m, &work[il], &ldwrkl, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U, and computing right singular
+ vectors of bidiagonal matrix in WORK(IVT)
+ (Workspace: need M+M*M+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &
+ work[ivt], m, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of L and WORK(IVT)
+ by right singular vectors of L
+ (Workspace: need 2*M*M+3*M, prefer 2*M*M+2*M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &work[ivt], m, &work[nwork], &i__1, &ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IVT) by Q
+ in A, storing result in WORK(IL) and copying to A
+ (Workspace: need 2*M*M, prefer M*M+M*N)
+*/
+
+ i__1 = *n;
+ i__2 = chunk;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ sgemm_("N", "N", m, &blk, m, &c_b15, &work[ivt], m, &a[
+ i__ * a_dim1 + 1], lda, &c_b29, &work[il], &
+ ldwrkl);
+ slacpy_("F", m, &blk, &work[il], &ldwrkl, &a[i__ * a_dim1
+ + 1], lda);
+/* L30: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3t (N much larger than M, JOBZ='S')
+ M right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ il = 1;
+
+/* WORK(IL) is M by M */
+
+ ldwrkl = *m;
+ itau = il + ldwrkl * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ slacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ slaset_("U", &i__2, &i__1, &c_b29, &c_b29, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sorglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = itau;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+                Bidiagonalize L in WORK(IL)
+ (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgebrd_(m, m, &work[il], &ldwrkl, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need M+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of L and VT
+ by right singular vectors of L
+ (Workspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+ i__2 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IL) by
+ Q in A, storing result in VT
+ (Workspace: need M*M)
+*/
+
+ slacpy_("F", m, m, &vt[vt_offset], ldvt, &work[il], &ldwrkl);
+ sgemm_("N", "N", m, n, m, &c_b15, &work[il], &ldwrkl, &a[
+ a_offset], lda, &c_b29, &vt[vt_offset], ldvt);
+
+ } else if (wntqa) {
+
+/*
+ Path 4t (N much larger than M, JOBZ='A')
+ N right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+
+/* WORK(IVT) is M by M */
+
+ ldwkvt = *m;
+ itau = ivt + ldwkvt * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q, copying result to VT
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+ slacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Generate Q in VT
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sorglq_(n, n, m, &vt[vt_offset], ldvt, &work[itau], &work[
+ nwork], &i__2, &ierr);
+
+/* Produce L in A, zeroing out other entries */
+
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ slaset_("U", &i__2, &i__1, &c_b29, &c_b29, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = itau;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgebrd_(m, m, &a[a_offset], lda, &s[1], &work[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in WORK(IVT)
+ (Workspace: need M+M*M+BDSPAC)
+*/
+
+ sbdsdc_("U", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &
+ work[ivt], &ldwkvt, dum, idum, &work[nwork], &iwork[1]
+ , info);
+
+/*
+ Overwrite U by left singular vectors of L and WORK(IVT)
+ by right singular vectors of L
+ (Workspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, m, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+ i__2 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", m, m, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IVT) by
+ Q in VT, storing result in A
+ (Workspace: need M*M)
+*/
+
+ sgemm_("N", "N", m, n, m, &c_b15, &work[ivt], &ldwkvt, &vt[
+ vt_offset], ldvt, &c_b29, &a[a_offset], lda);
+
+/* Copy right singular vectors of A from A to VT */
+
+ slacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+ }
+
+ } else {
+
+/*
+ N .LT. MNTHR
+
+ Path 5t (N greater than M, but not much larger)
+ Reduce to bidiagonal form without LQ decomposition
+*/
+
+ ie = 1;
+ itauq = ie + *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A
+ (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Perform bidiagonal SVD, only computing singular values
+ (Workspace: need M+BDSPAC)
+*/
+
+ sbdsdc_("L", "N", m, &s[1], &work[ie], dum, &c__1, dum, &c__1,
+ dum, idum, &work[nwork], &iwork[1], info);
+ } else if (wntqo) {
+ ldwkvt = *m;
+ ivt = nwork;
+ if (*lwork >= *m * *n + *m * 3 + bdspac) {
+
+/* WORK( IVT ) is M by N */
+
+ slaset_("F", m, n, &c_b29, &c_b29, &work[ivt], &ldwkvt);
+ nwork = ivt + ldwkvt * *n;
+ } else {
+
+/* WORK( IVT ) is M by M */
+
+ nwork = ivt + ldwkvt * *m;
+ il = nwork;
+
+/* WORK(IL) is M by CHUNK */
+
+ chunk = (*lwork - *m * *m - *m * 3) / *m;
+ }
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in WORK(IVT)
+ (Workspace: need M*M+BDSPAC)
+*/
+
+ sbdsdc_("L", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &
+ work[ivt], &ldwkvt, dum, idum, &work[nwork], &iwork[1]
+ , info);
+
+/*
+ Overwrite U by left singular vectors of A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+ if (*lwork >= *m * *n + *m * 3 + bdspac) {
+
+/*
+                  Overwrite WORK(IVT) by right singular vectors of A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", m, n, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2,
+ &ierr);
+
+/* Copy right singular vectors of A from WORK(IVT) to A */
+
+ slacpy_("F", m, n, &work[ivt], &ldwkvt, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate P**T in A
+ (Workspace: need M*M+2*M, prefer M*M+M+M*NB)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ sorgbr_("P", m, n, m, &a[a_offset], lda, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Multiply Q in A by right singular vectors of
+ bidiagonal matrix in WORK(IVT), storing result in
+ WORK(IL) and copying to A
+ (Workspace: need 2*M*M, prefer M*M+M*N)
+*/
+
+ i__2 = *n;
+ i__1 = chunk;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ sgemm_("N", "N", m, &blk, m, &c_b15, &work[ivt], &
+ ldwkvt, &a[i__ * a_dim1 + 1], lda, &c_b29, &
+ work[il], m);
+ slacpy_("F", m, &blk, &work[il], m, &a[i__ * a_dim1 +
+ 1], lda);
+/* L40: */
+ }
+ }
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need M+BDSPAC)
+*/
+
+ slaset_("F", m, n, &c_b29, &c_b29, &vt[vt_offset], ldvt);
+ sbdsdc_("L", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need 3*M, prefer 2*M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", m, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ } else if (wntqa) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in U and computing right singular
+ vectors of bidiagonal matrix in VT
+ (Workspace: need M+BDSPAC)
+*/
+
+ slaset_("F", n, n, &c_b29, &c_b29, &vt[vt_offset], ldvt);
+ sbdsdc_("L", "I", m, &s[1], &work[ie], &u[u_offset], ldu, &vt[
+ vt_offset], ldvt, dum, idum, &work[nwork], &iwork[1],
+ info);
+
+/* Set the right corner of VT to identity matrix */
+
+ if (*n > *m) {
+ i__1 = *n - *m;
+ i__2 = *n - *m;
+ slaset_("F", &i__1, &i__2, &c_b29, &c_b15, &vt[*m + 1 + (*
+ m + 1) * vt_dim1], ldvt);
+ }
+
+/*
+ Overwrite U by left singular vectors of A and VT
+ by right singular vectors of A
+ (Workspace: need 2*M+N, prefer 2*M+N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ sormbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+ i__1 = *lwork - nwork + 1;
+ sormbr_("P", "R", "T", n, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ }
+
+ }
+
+ }
+
+/* Undo scaling if necessary */
+
+ if (iscl == 1) {
+ if (anrm > bignum) {
+ slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ if (anrm < smlnum) {
+ slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ }
+
+/* Return optimal workspace in WORK(1) */
+
+ work[1] = (real) maxwrk;
+
+ return 0;
+
+/* End of SGESDD */
+
+} /* sgesdd_ */
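+
+/*
+    Editor's usage sketch, not part of the LAPACK translation above: a
+    hedged example of computing only the singular values of a small
+    matrix with SGESDD (JOBZ='N').  It assumes the f2c typedefs
+    `integer' and `real' declared at the top of this file, column-major
+    storage, and a deliberately over-sized workspace instead of the
+    exact formulas discussed in the comments above.  The helper name is
+    illustrative only.
+*/
+static void example_sgesdd_values_only(void)
+{
+    integer m = 3, n = 2, lda = 3, ldu = 1, ldvt = 1;
+    integer lwork = 128, info = 0;
+    integer iwork[16];                    /* >= 8*min(M,N) */
+    real a[6] = {1.f, 0.f, 0.f,           /* column 1 */
+                 0.f, 2.f, 0.f};          /* column 2 */
+    real s[2], u[1], vt[1], work[128];
+
+    sgesdd_("N", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork,
+            iwork, &info);
+    /* On success, s[0] == 2 and s[1] == 1 (descending order). */
+    (void) info;
+}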
+
+/* Subroutine */ int sgesv_(integer *n, integer *nrhs, real *a, integer *lda,
+ integer *ipiv, real *b, integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern /* Subroutine */ int xerbla_(char *, integer *), sgetrf_(
+ integer *, integer *, real *, integer *, integer *, integer *),
+ sgetrs_(char *, integer *, integer *, real *, integer *, integer *
+ , real *, integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGESV computes the solution to a real system of linear equations
+ A * X = B,
+ where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+
+ The LU decomposition with partial pivoting and row interchanges is
+ used to factor A as
+ A = P * L * U,
+ where P is a permutation matrix, L is unit lower triangular, and U is
+ upper triangular. The factored form of A is then used to solve the
+ system of equations A * X = B.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of linear equations, i.e., the order of the
+ matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the N-by-N coefficient matrix A.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (output) INTEGER array, dimension (N)
+ The pivot indices that define the permutation matrix P;
+ row i of the matrix was interchanged with row IPIV(i).
+
+ B (input/output) REAL array, dimension (LDB,NRHS)
+          On entry, the N-by-NRHS right hand side matrix B.
+ On exit, if INFO = 0, the N-by-NRHS solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, so the solution could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*nrhs < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGESV ", &i__1);
+ return 0;
+ }
+
+/* Compute the LU factorization of A. */
+
+ sgetrf_(n, n, &a[a_offset], lda, &ipiv[1], info);
+ if (*info == 0) {
+
+/* Solve the system A*X = B, overwriting B with X. */
+
+ sgetrs_("No transpose", n, nrhs, &a[a_offset], lda, &ipiv[1], &b[
+ b_offset], ldb, info);
+ }
+ return 0;
+
+/* End of SGESV */
+
+} /* sgesv_ */
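+
+/*
+    Editor's usage sketch (not part of the translation): solving a 2x2
+    system A*x = b with SGESV.  It assumes the f2c typedefs `integer'
+    and `real' from the top of this file; A is stored column-major, so
+    a[0] and a[1] form its first column.  The helper name is illustrative.
+*/
+static void example_sgesv(void)
+{
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info = 0;
+    integer ipiv[2];
+    real a[4] = {3.f, 1.f, 1.f, 2.f};   /* A = [3 1; 1 2] */
+    real b[2] = {9.f, 8.f};             /* b = [9; 8]     */
+
+    sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
+    /* On success (info == 0), b now holds x = [2; 3]. */
+    (void) info;
+}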
+
+/* Subroutine */ int sgetf2_(integer *m, integer *n, real *a, integer *lda,
+ integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, jp;
+ extern /* Subroutine */ int sger_(integer *, integer *, real *, real *,
+ integer *, real *, integer *, real *, integer *), sscal_(integer *
+ , real *, real *, integer *);
+ static real sfmin;
+ extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *,
+ integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer isamax_(integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGETF2 computes an LU factorization of a general m-by-n matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the m by n matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, U(k,k) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGETF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Compute machine safe minimum */
+
+ sfmin = slamch_("S");
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+
+/* Find pivot and test for singularity. */
+
+ i__2 = *m - j + 1;
+ jp = j - 1 + isamax_(&i__2, &a[j + j * a_dim1], &c__1);
+ ipiv[j] = jp;
+ if (a[jp + j * a_dim1] != 0.f) {
+
+/* Apply the interchange to columns 1:N. */
+
+ if (jp != j) {
+ sswap_(n, &a[j + a_dim1], lda, &a[jp + a_dim1], lda);
+ }
+
+/* Compute elements J+1:M of J-th column. */
+
+ if (j < *m) {
+ if ((r__1 = a[j + j * a_dim1], dabs(r__1)) >= sfmin) {
+ i__2 = *m - j;
+ r__1 = 1.f / a[j + j * a_dim1];
+ sscal_(&i__2, &r__1, &a[j + 1 + j * a_dim1], &c__1);
+ } else {
+ i__2 = *m - j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[j + i__ + j * a_dim1] /= a[j + j * a_dim1];
+/* L20: */
+ }
+ }
+ }
+
+ } else if (*info == 0) {
+
+ *info = j;
+ }
+
+ if (j < min(*m,*n)) {
+
+/* Update trailing submatrix. */
+
+ i__2 = *m - j;
+ i__3 = *n - j;
+ sger_(&i__2, &i__3, &c_b151, &a[j + 1 + j * a_dim1], &c__1, &a[j
+ + (j + 1) * a_dim1], lda, &a[j + 1 + (j + 1) * a_dim1],
+ lda);
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of SGETF2 */
+
+} /* sgetf2_ */
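+
+/*
+    Editor's sketch (assumption-labelled, illustrative helper name):
+    calling the unblocked factorization SGETF2 directly on a 2x2 matrix
+    and reading back the pivot it chose.  Types are the f2c typedefs
+    used throughout this file.
+*/
+static void example_sgetf2(void)
+{
+    integer m = 2, n = 2, lda = 2, info = 0;
+    integer ipiv[2];
+    real a[4] = {1.f, 4.f, 2.f, 3.f};   /* A = [1 2; 4 3], column-major */
+
+    sgetf2_(&m, &n, a, &lda, ipiv, &info);
+    /* Expected: ipiv[0] == 2, i.e. row 1 was swapped with row 2
+       because |4| > |1| in the first column. */
+    (void) info;
+}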
+
+/* Subroutine */ int sgetrf_(integer *m, integer *n, real *a, integer *lda,
+ integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+
+ /* Local variables */
+ static integer i__, j, jb, nb, iinfo;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *), strsm_(char *, char *, char *,
+ char *, integer *, integer *, real *, real *, integer *, real *,
+ integer *), sgetf2_(integer *,
+ integer *, real *, integer *, integer *, integer *), xerbla_(char
+ *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slaswp_(integer *, real *, integer *, integer
+ *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGETRF computes an LU factorization of a general M-by-N matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the M-by-N matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGETRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "SGETRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ if (nb <= 1 || nb >= min(*m,*n)) {
+
+/* Use unblocked code. */
+
+ sgetf2_(m, n, &a[a_offset], lda, &ipiv[1], info);
+ } else {
+
+/* Use blocked code. */
+
+ i__1 = min(*m,*n);
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+/* Computing MIN */
+ i__3 = min(*m,*n) - j + 1;
+ jb = min(i__3,nb);
+
+/*
+ Factor diagonal and subdiagonal blocks and test for exact
+ singularity.
+*/
+
+ i__3 = *m - j + 1;
+ sgetf2_(&i__3, &jb, &a[j + j * a_dim1], lda, &ipiv[j], &iinfo);
+
+/* Adjust INFO and the pivot indices. */
+
+ if (*info == 0 && iinfo > 0) {
+ *info = iinfo + j - 1;
+ }
+/* Computing MIN */
+ i__4 = *m, i__5 = j + jb - 1;
+ i__3 = min(i__4,i__5);
+ for (i__ = j; i__ <= i__3; ++i__) {
+ ipiv[i__] = j - 1 + ipiv[i__];
+/* L10: */
+ }
+
+/* Apply interchanges to columns 1:J-1. */
+
+ i__3 = j - 1;
+ i__4 = j + jb - 1;
+ slaswp_(&i__3, &a[a_offset], lda, &j, &i__4, &ipiv[1], &c__1);
+
+ if (j + jb <= *n) {
+
+/* Apply interchanges to columns J+JB:N. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j + jb - 1;
+ slaswp_(&i__3, &a[(j + jb) * a_dim1 + 1], lda, &j, &i__4, &
+ ipiv[1], &c__1);
+
+/* Compute block row of U. */
+
+ i__3 = *n - j - jb + 1;
+ strsm_("Left", "Lower", "No transpose", "Unit", &jb, &i__3, &
+ c_b15, &a[j + j * a_dim1], lda, &a[j + (j + jb) *
+ a_dim1], lda);
+ if (j + jb <= *m) {
+
+/* Update trailing submatrix. */
+
+ i__3 = *m - j - jb + 1;
+ i__4 = *n - j - jb + 1;
+ sgemm_("No transpose", "No transpose", &i__3, &i__4, &jb,
+ &c_b151, &a[j + jb + j * a_dim1], lda, &a[j + (j
+ + jb) * a_dim1], lda, &c_b15, &a[j + jb + (j + jb)
+ * a_dim1], lda);
+ }
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of SGETRF */
+
+} /* sgetrf_ */
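+
+/*
+    Editor's sketch (assumption-labelled): SGETRF completes the
+    factorization of an exactly singular matrix and reports the first
+    zero diagonal entry of U through INFO > 0, as documented above.
+*/
+static void example_sgetrf_singular(void)
+{
+    integer n = 2, lda = 2, info = 0;
+    integer ipiv[2];
+    real a[4] = {1.f, 2.f, 2.f, 4.f};   /* rank-1 matrix [1 2; 2 4] */
+
+    sgetrf_(&n, &n, a, &lda, ipiv, &info);
+    /* Expected: info == 2, meaning U(2,2) is exactly zero. */
+    (void) info;
+}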
+
+/* Subroutine */ int sgetrs_(char *trans, integer *n, integer *nrhs, real *a,
+ integer *lda, integer *ipiv, real *b, integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int strsm_(char *, char *, char *, char *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ ), xerbla_(char *, integer *);
+ static logical notran;
+ extern /* Subroutine */ int slaswp_(integer *, real *, integer *, integer
+ *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SGETRS solves a system of linear equations
+ A * X = B or A' * X = B
+ with a general N-by-N matrix A using the LU factorization computed
+ by SGETRF.
+
+ Arguments
+ =========
+
+ TRANS (input) CHARACTER*1
+ Specifies the form of the system of equations:
+ = 'N': A * X = B (No transpose)
+ = 'T': A'* X = B (Transpose)
+ = 'C': A'* X = B (Conjugate transpose = Transpose)
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) REAL array, dimension (LDA,N)
+ The factors L and U from the factorization A = P*L*U
+ as computed by SGETRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (input) INTEGER array, dimension (N)
+ The pivot indices from SGETRF; for 1<=i<=N, row i of the
+ matrix was interchanged with row IPIV(i).
+
+ B (input/output) REAL array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ notran = lsame_(trans, "N");
+ if (! notran && ! lsame_(trans, "T") && ! lsame_(
+ trans, "C")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SGETRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (notran) {
+
+/*
+ Solve A * X = B.
+
+ Apply row interchanges to the right hand sides.
+*/
+
+ slaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c__1);
+
+/* Solve L*X = B, overwriting B with X. */
+
+ strsm_("Left", "Lower", "No transpose", "Unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ strsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b15, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A' * X = B.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ strsm_("Left", "Upper", "Transpose", "Non-unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ strsm_("Left", "Lower", "Transpose", "Unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Apply row interchanges to the solution vectors. */
+
+ slaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1);
+ }
+
+ return 0;
+
+/* End of SGETRS */
+
+} /* sgetrs_ */
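+
+/*
+    Editor's sketch (not from the original source): the two-step
+    factor/solve that SGESV wraps, done explicitly with SGETRF followed
+    by SGETRS.  Types are the f2c typedefs used throughout this file;
+    the helper name is illustrative only.
+*/
+static void example_sgetrf_sgetrs(void)
+{
+    integer n = 3, nrhs = 1, lda = 3, ldb = 3, info = 0;
+    integer ipiv[3];
+    real a[9] = {4.f, 2.f, 1.f,          /* column-major 3x3 matrix */
+                 2.f, 5.f, 3.f,
+                 1.f, 3.f, 6.f};
+    real b[3] = {7.f, 10.f, 10.f};       /* equals A * [1; 1; 1]    */
+
+    sgetrf_(&n, &n, a, &lda, ipiv, &info);          /* A = P*L*U    */
+    if (info == 0) {
+        sgetrs_("N", &n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
+    }
+    /* On success b now holds the solution, here [1; 1; 1]. */
+}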
+
+/* Subroutine */ int shseqr_(char *job, char *compz, integer *n, integer *ilo,
+ integer *ihi, real *h__, integer *ldh, real *wr, real *wi, real *z__,
+ integer *ldz, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2[2], i__3;
+ real r__1;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static real hl[2401] /* was [49][49] */;
+ static integer kbot, nmin;
+ extern logical lsame_(char *, char *);
+ static logical initz;
+ static real workl[49];
+ static logical wantt, wantz;
+ extern /* Subroutine */ int slaqr0_(logical *, logical *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , integer *, real *, integer *, real *, integer *, integer *),
+ xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slahqr_(logical *, logical *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , integer *, real *, integer *, integer *), slacpy_(char *,
+ integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *,
+ real *, integer *);
+ static logical lquery;
+
+
+/*
+ -- LAPACK computational routine (version 3.2.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ June 2010
+
+ Purpose
+ =======
+
+ SHSEQR computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+ Schur form), and Z is the orthogonal matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input orthogonal
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ = 'E': compute eigenvalues only;
+ = 'S': compute eigenvalues and the Schur form T.
+
+ COMPZ (input) CHARACTER*1
+ = 'N': no Schur vectors are computed;
+ = 'I': Z is initialized to the unit matrix and the matrix Z
+ of Schur vectors of H is returned;
+ = 'V': Z must contain an orthogonal matrix Q on entry, and
+ the product Q*Z is returned.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to SGEBAL, and then passed to SGEHRD
+ when the matrix output by SGEBAL is reduced to Hessenberg
+ form. Otherwise ILO and IHI should be set to 1 and N
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) REAL array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and JOB = 'S', then H contains the
+ upper quasi-triangular matrix T from the Schur decomposition
+ (the Schur form); 2-by-2 diagonal blocks (corresponding to
+ complex conjugate pairs of eigenvalues) are returned in
+ standard form, with H(i,i) = H(i+1,i+1) and
+ H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the
+ contents of H are unspecified on exit. (The output value of
+ H when INFO.GT.0 is given under the description of INFO
+ below.)
+
+ Unlike earlier versions of SHSEQR, this subroutine may
+          explicitly set H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+ or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ WR (output) REAL array, dimension (N)
+ WI (output) REAL array, dimension (N)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues. If two eigenvalues are computed as a complex
+ conjugate pair, they are stored in consecutive elements of
+ WR and WI, say the i-th and (i+1)th, with WI(i) .GT. 0 and
+ WI(i+1) .LT. 0. If JOB = 'S', the eigenvalues are stored in
+ the same order as on the diagonal of the Schur form returned
+ in H, with WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2
+ diagonal block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
+ WI(i+1) = -WI(i).
+
+ Z (input/output) REAL array, dimension (LDZ,N)
+ If COMPZ = 'N', Z is not referenced.
+ If COMPZ = 'I', on entry Z need not be set and on exit,
+ if INFO = 0, Z contains the orthogonal matrix Z of the Schur
+ vectors of H. If COMPZ = 'V', on entry Z must contain an
+ N-by-N matrix Q, which is assumed to be equal to the unit
+ matrix except for the submatrix Z(ILO:IHI,ILO:IHI). On exit,
+ if INFO = 0, Z contains Q*Z.
+ Normally Q is the orthogonal matrix generated by SORGHR
+ after the call to SGEHRD which formed the Hessenberg matrix
+ H. (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if COMPZ = 'I' or
+          COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) REAL array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient and delivers very good and sometimes
+ optimal performance. However, LWORK as large as 11*N
+ may be required for optimal performance. A workspace
+ query is recommended to determine the optimal workspace
+ size.
+
+ If LWORK = -1, then SHSEQR does a workspace query.
+ In this case, SHSEQR checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+          issued by XERBLA. Neither H nor Z is accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .LT. 0: if INFO = -i, the i-th argument had an illegal
+ value
+ .GT. 0: if INFO = i, SHSEQR failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
+ and WI contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and JOB = 'E', then on exit, the
+ remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and JOB = 'S', then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and quasi-triangular
+ in rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and COMPZ = 'V', then on exit
+
+ (final value of Z) = (initial value of Z)*U
+
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'I', then on exit
+ (final value of Z) = U
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'N', then Z is not
+ accessed.
+
+ ================================================================
+ Default values supplied by
+ ILAENV(ISPEC,'SHSEQR',JOB(:1)//COMPZ(:1),N,ILO,IHI,LWORK).
+ It is suggested that these defaults be adjusted in order
+ to attain best performance in each particular
+ computational environment.
+
+ ISPEC=12: The SLAHQR vs SLAQR0 crossover point.
+ Default: 75. (Must be at least 11.)
+
+ ISPEC=13: Recommended deflation window size.
+ This depends on ILO, IHI and NS. NS is the
+ number of simultaneous shifts returned
+ by ILAENV(ISPEC=15). (See ISPEC=15 below.)
+ The default for (IHI-ILO+1).LE.500 is NS.
+ The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+
+ ISPEC=14: Nibble crossover point. (See IPARMQ for
+ details.) Default: 14% of deflation window
+ size.
+
+ ISPEC=15: Number of simultaneous shifts in a multishift
+ QR iteration.
+
+ If IHI-ILO+1 is ...
+
+ greater than ...but less ... the
+ or equal to ... than default is
+
+ 1 30 NS = 2(+)
+ 30 60 NS = 4(+)
+ 60 150 NS = 10(+)
+ 150 590 NS = **
+ 590 3000 NS = 64
+ 3000 6000 NS = 128
+ 6000 infinity NS = 256
+
+ (+) By default some or all matrices of this order
+ are passed to the implicit double shift routine
+ SLAHQR and this parameter is ignored. See
+ ISPEC=12 above and comments in IPARMQ for
+ details.
+
+ (**) The asterisks (**) indicate an ad-hoc
+ function of N increasing from 10 to 64.
+
+ ISPEC=16: Select structured matrix multiply.
+ If the number of simultaneous shifts (specified
+ by ISPEC=15) is less than 14, then the default
+ for ISPEC=16 is 0. Otherwise the default for
+ ISPEC=16 is 2.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . SLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== NL allocates some local workspace to help small matrices
+ . through a rare SLAHQR failure. NL .GT. NTINY = 11 is
+ . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+ . mended. (The default value of NMIN is 75.) Using NL = 49
+ . allows up to six simultaneous shifts and a 16-by-16
+ . deflation window. ====
+
+ ==== Decode and check the input parameters. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ wantt = lsame_(job, "S");
+ initz = lsame_(compz, "I");
+ wantz = initz || lsame_(compz, "V");
+ work[1] = (real) max(1,*n);
+ lquery = *lwork == -1;
+
+ *info = 0;
+ if (! lsame_(job, "E") && ! wantt) {
+ *info = -1;
+ } else if (! lsame_(compz, "N") && ! wantz) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*ldh < max(1,*n)) {
+ *info = -7;
+ } else if (*ldz < 1 || wantz && *ldz < max(1,*n)) {
+ *info = -11;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -13;
+ }
+
+ if (*info != 0) {
+
+/* ==== Quick return in case of invalid argument. ==== */
+
+ i__1 = -(*info);
+ xerbla_("SHSEQR", &i__1);
+ return 0;
+
+ } else if (*n == 0) {
+
+/* ==== Quick return in case N = 0; nothing to do. ==== */
+
+ return 0;
+
+ } else if (lquery) {
+
+/* ==== Quick return in case of a workspace query ==== */
+
+ slaqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1], &wi[
+ 1], ilo, ihi, &z__[z_offset], ldz, &work[1], lwork, info);
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+ Computing MAX
+*/
+ r__1 = (real) max(1,*n);
+ work[1] = dmax(r__1,work[1]);
+ return 0;
+
+ } else {
+
+/* ==== copy eigenvalues isolated by SGEBAL ==== */
+
+ i__1 = *ilo - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ wr[i__] = h__[i__ + i__ * h_dim1];
+ wi[i__] = 0.f;
+/* L10: */
+ }
+ i__1 = *n;
+ for (i__ = *ihi + 1; i__ <= i__1; ++i__) {
+ wr[i__] = h__[i__ + i__ * h_dim1];
+ wi[i__] = 0.f;
+/* L20: */
+ }
+
+/* ==== Initialize Z, if requested ==== */
+
+ if (initz) {
+ slaset_("A", n, n, &c_b29, &c_b15, &z__[z_offset], ldz)
+ ;
+ }
+
+/* ==== Quick return if possible ==== */
+
+ if (*ilo == *ihi) {
+ wr[*ilo] = h__[*ilo + *ilo * h_dim1];
+ wi[*ilo] = 0.f;
+ return 0;
+ }
+
+/*
+ ==== SLAHQR/SLAQR0 crossover point ====
+
+ Writing concatenation
+*/
+ i__2[0] = 1, a__1[0] = job;
+ i__2[1] = 1, a__1[1] = compz;
+ s_cat(ch__1, a__1, i__2, &c__2, (ftnlen)2);
+ nmin = ilaenv_(&c__12, "SHSEQR", ch__1, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== SLAQR0 for big matrices; SLAHQR for small ones ==== */
+
+ if (*n > nmin) {
+ slaqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1],
+ &wi[1], ilo, ihi, &z__[z_offset], ldz, &work[1], lwork,
+ info);
+ } else {
+
+/* ==== Small matrix ==== */
+
+ slahqr_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1],
+ &wi[1], ilo, ihi, &z__[z_offset], ldz, info);
+
+ if (*info > 0) {
+
+/*
+ ==== A rare SLAHQR failure! SLAQR0 sometimes succeeds
+ . when SLAHQR fails. ====
+*/
+
+ kbot = *info;
+
+ if (*n >= 49) {
+
+/*
+ ==== Larger matrices have enough subdiagonal scratch
+ . space to call SLAQR0 directly. ====
+*/
+
+ slaqr0_(&wantt, &wantz, n, ilo, &kbot, &h__[h_offset],
+ ldh, &wr[1], &wi[1], ilo, ihi, &z__[z_offset],
+ ldz, &work[1], lwork, info);
+
+ } else {
+
+/*
+ ==== Tiny matrices don't have enough subdiagonal
+ . scratch space to benefit from SLAQR0. Hence,
+ . tiny matrices must be copied into a larger
+ . array before calling SLAQR0. ====
+*/
+
+ slacpy_("A", n, n, &h__[h_offset], ldh, hl, &c__49);
+ hl[*n + 1 + *n * 49 - 50] = 0.f;
+ i__1 = 49 - *n;
+ slaset_("A", &c__49, &i__1, &c_b29, &c_b29, &hl[(*n + 1) *
+ 49 - 49], &c__49);
+ slaqr0_(&wantt, &wantz, &c__49, ilo, &kbot, hl, &c__49, &
+ wr[1], &wi[1], ilo, ihi, &z__[z_offset], ldz,
+ workl, &c__49, info);
+ if (wantt || *info != 0) {
+ slacpy_("A", n, n, hl, &c__49, &h__[h_offset], ldh);
+ }
+ }
+ }
+ }
+
+/* ==== Clear out the trash, if necessary. ==== */
+
+ if ((wantt || *info != 0) && *n > 2) {
+ i__1 = *n - 2;
+ i__3 = *n - 2;
+ slaset_("L", &i__1, &i__3, &c_b29, &c_b29, &h__[h_dim1 + 3], ldh);
+ }
+
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+
+ Computing MAX
+*/
+ r__1 = (real) max(1,*n);
+ work[1] = dmax(r__1,work[1]);
+ }
+
+/* ==== End of SHSEQR ==== */
+
+ return 0;
+} /* shseqr_ */
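+
+/*
+    Editor's sketch (hedged, illustrative helper name): computing the
+    eigenvalues of a small upper Hessenberg matrix with SHSEQR, JOB='E'
+    (eigenvalues only) and COMPZ='N' (no Schur vectors).  The workspace
+    uses LWORK >= N, which the comments above state is sufficient.
+*/
+static void example_shseqr(void)
+{
+    integer n = 2, ilo = 1, ihi = 2, ldh = 2, ldz = 1;
+    integer lwork = 8, info = 0;
+    real h[4] = {0.f, 1.f, -1.f, 0.f};   /* H = [0 -1; 1 0], column-major */
+    real wr[2], wi[2], z[1], work[8];
+
+    shseqr_("E", "N", &n, &ilo, &ihi, h, &ldh, wr, wi, z, &ldz,
+            work, &lwork, &info);
+    /* Expected eigenvalues: +i and -i, i.e. wr = {0, 0}, wi = {1, -1}. */
+    (void) info;
+}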
+
+logical sisnan_(real *sin__)
+{
+ /* System generated locals */
+ logical ret_val;
+
+ /* Local variables */
+ extern logical slaisnan_(real *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SISNAN returns .TRUE. if its argument is NaN, and .FALSE.
+ otherwise. To be replaced by the Fortran 2003 intrinsic in the
+ future.
+
+ Arguments
+ =========
+
+ SIN (input) REAL
+ Input to test for NaN.
+
+ =====================================================================
+*/
+
+ ret_val = slaisnan_(sin__, sin__);
+ return ret_val;
+} /* sisnan_ */
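+
+/*
+    Editor's sketch (assumption: IEEE arithmetic, so 0.f/0.f yields a
+    quiet NaN): exercising the NaN test.  `logical' and `real' are the
+    f2c typedefs used in this file.
+*/
+static void example_sisnan(void)
+{
+    real zero = 0.f;
+    real not_a_number = zero / zero;
+    logical flag = sisnan_(&not_a_number);
+    /* flag is nonzero (.TRUE.) exactly when the argument is NaN. */
+    (void) flag;
+}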
+
+/* Subroutine */ int slabad_(real *small, real *large)
+{
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLABAD takes as input the values computed by SLAMCH for underflow and
+ overflow, and returns the square root of each of these values if the
+ log of LARGE is sufficiently large. This subroutine is intended to
+ identify machines with a large exponent range, such as the Crays, and
+ redefine the underflow and overflow limits to be the square roots of
+ the values computed by SLAMCH. This subroutine is needed because
+ SLAMCH does not compensate for poor arithmetic in the upper half of
+ the exponent range, as is found on a Cray.
+
+ Arguments
+ =========
+
+ SMALL (input/output) REAL
+ On entry, the underflow threshold as computed by SLAMCH.
+ On exit, if LOG10(LARGE) is sufficiently large, the square
+ root of SMALL, otherwise unchanged.
+
+ LARGE (input/output) REAL
+ On entry, the overflow threshold as computed by SLAMCH.
+ On exit, if LOG10(LARGE) is sufficiently large, the square
+ root of LARGE, otherwise unchanged.
+
+ =====================================================================
+
+
+ If it looks like we're on a Cray, take the square root of
+ SMALL and LARGE to avoid overflow and underflow problems.
+*/
+
+ if (r_lg10(large) > 2e3f) {
+ *small = sqrt(*small);
+ *large = sqrt(*large);
+ }
+
+ return 0;
+
+/* End of SLABAD */
+
+} /* slabad_ */
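+
+/*
+    Editor's sketch (hedged): the usual pairing of SLAMCH and SLABAD that
+    LAPACK drivers use to obtain safe underflow/overflow thresholds.
+    SLAMCH is declared extern here in the same style used elsewhere in
+    this file; the helper name is illustrative only.
+*/
+static void example_slabad(void)
+{
+    extern doublereal slamch_(char *);
+    real smlnum, bignum;
+
+    smlnum = (real) slamch_("S");        /* safe minimum        */
+    bignum = 1.f / smlnum;               /* and its reciprocal  */
+    slabad_(&smlnum, &bignum);           /* square-root both on
+                                            wide-exponent machines */
+    (void) bignum;
+}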
+
+/* Subroutine */ int slabrd_(integer *m, integer *n, integer *nb, real *a,
+ integer *lda, real *d__, real *e, real *tauq, real *taup, real *x,
+ integer *ldx, real *y, integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+
+ /* Local variables */
+ static integer i__;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sgemv_(char *, integer *, integer *, real *, real *, integer *,
+ real *, integer *, real *, real *, integer *), slarfg_(
+ integer *, real *, real *, integer *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLABRD reduces the first NB rows and columns of a real general
+ m by n matrix A to upper or lower bidiagonal form by an orthogonal
+ transformation Q' * A * P, and returns the matrices X and Y which
+ are needed to apply the transformation to the unreduced part of A.
+
+ If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+ bidiagonal form.
+
+ This is an auxiliary routine called by SGEBRD
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A.
+
+ N (input) INTEGER
+ The number of columns in the matrix A.
+
+ NB (input) INTEGER
+ The number of leading rows and columns of A to be reduced.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit, the first NB rows and columns of the matrix are
+ overwritten; the rest of the array is unchanged.
+ If m >= n, elements on and below the diagonal in the first NB
+ columns, with the array TAUQ, represent the orthogonal
+ matrix Q as a product of elementary reflectors; and
+ elements above the diagonal in the first NB rows, with the
+ array TAUP, represent the orthogonal matrix P as a product
+ of elementary reflectors.
+ If m < n, elements below the diagonal in the first NB
+ columns, with the array TAUQ, represent the orthogonal
+ matrix Q as a product of elementary reflectors, and
+ elements on and above the diagonal in the first NB rows,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) REAL array, dimension (NB)
+ The diagonal elements of the first NB rows and columns of
+ the reduced matrix. D(i) = A(i,i).
+
+ E (output) REAL array, dimension (NB)
+ The off-diagonal elements of the first NB rows and columns of
+ the reduced matrix.
+
+ TAUQ (output) REAL array dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) REAL array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ X (output) REAL array, dimension (LDX,NB)
+ The m-by-nb matrix X required to update the unreduced part
+ of A.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. LDX >= M.
+
+ Y (output) REAL array, dimension (LDY,NB)
+ The n-by-nb matrix Y required to update the unreduced part
+ of A.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are real scalars, and v and u are real vectors.
+
+ If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
+ A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The elements of the vectors v and u together form the m-by-nb matrix
+ V and the nb-by-n matrix U' which are needed, with X and Y, to apply
+ the transformation to the unreduced part of the matrix, using a block
+ update of the form: A := A - V*Y' - X*U'.
+
+ The contents of A on exit are illustrated by the following examples
+ with nb = 2:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 )
+ ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 )
+ ( v1 v2 a a a ) ( v1 1 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix which is unchanged,
+ vi denotes an element of the vector defining H(i), and ui an element
+ of the vector defining G(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:m,i) */
+
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + a_dim1],
+ lda, &y[i__ + y_dim1], ldy, &c_b15, &a[i__ + i__ * a_dim1]
+ , &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &x[i__ + x_dim1],
+ ldx, &a[i__ * a_dim1 + 1], &c__1, &c_b15, &a[i__ + i__ *
+ a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ *
+ a_dim1], &c__1, &tauq[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ if (i__ < *n) {
+ a[i__ + i__ * a_dim1] = 1.f;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + (i__ + 1) *
+ a_dim1], lda, &a[i__ + i__ * a_dim1], &c__1, &c_b29,
+ &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + a_dim1],
+ lda, &a[i__ + i__ * a_dim1], &c__1, &c_b29, &y[i__ *
+ y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b15, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &x[i__ + x_dim1],
+ ldx, &a[i__ + i__ * a_dim1], &c__1, &c_b29, &y[i__ *
+ y_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__3, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &c_b15,
+ &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ sscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+
+/* Update A(i,i+1:n) */
+
+ i__2 = *n - i__;
+ sgemv_("No transpose", &i__2, &i__, &c_b151, &y[i__ + 1 +
+ y_dim1], ldy, &a[i__ + a_dim1], lda, &c_b15, &a[i__ +
+ (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__3, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, &c_b15, &a[
+ i__ + (i__ + 1) * a_dim1], lda);
+
+/* Generate reflection P(i) to annihilate A(i,i+2:n) */
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + (i__ + 1) * a_dim1], &a[i__ + min(
+ i__3,*n) * a_dim1], lda, &taup[i__]);
+ e[i__] = a[i__ + (i__ + 1) * a_dim1];
+ a[i__ + (i__ + 1) * a_dim1] = 1.f;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ sgemv_("No transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + (
+ i__ + 1) * a_dim1], lda, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &c_b29, &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__, &c_b15, &y[i__ + 1 + y_dim1],
+ ldy, &a[i__ + (i__ + 1) * a_dim1], lda, &c_b29, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ sgemv_("No transpose", &i__2, &i__, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("No transpose", &i__2, &i__3, &c_b15, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b29, &x[i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ sscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i,i:n) */
+
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &y[i__ + y_dim1],
+ ldy, &a[i__ + a_dim1], lda, &c_b15, &a[i__ + i__ * a_dim1]
+ , lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b151, &a[i__ * a_dim1 + 1],
+ lda, &x[i__ + x_dim1], ldx, &c_b15, &a[i__ + i__ * a_dim1]
+ , lda);
+
+/* Generate reflection P(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3,*n) *
+ a_dim1], lda, &taup[i__]);
+ d__[i__] = a[i__ + i__ * a_dim1];
+ if (i__ < *m) {
+ a[i__ + i__ * a_dim1] = 1.f;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + i__
+ * a_dim1], lda, &a[i__ + i__ * a_dim1], lda, &c_b29, &
+ x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &y[i__ + y_dim1],
+ ldy, &a[i__ + i__ * a_dim1], lda, &c_b29, &x[i__ *
+ x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b15, &a[i__ * a_dim1
+ + 1], lda, &a[i__ + i__ * a_dim1], lda, &c_b29, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b15, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ sscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+
+/* Update A(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &y[i__ + y_dim1], ldy, &c_b15, &a[i__ +
+ 1 + i__ * a_dim1], &c__1);
+ i__2 = *m - i__;
+ sgemv_("No transpose", &i__2, &i__, &c_b151, &x[i__ + 1 +
+ x_dim1], ldx, &a[i__ * a_dim1 + 1], &c__1, &c_b15, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+2:m,i) */
+
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*m) +
+ i__ * a_dim1], &c__1, &tauq[i__]);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.f;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + (i__ +
+ 1) * a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1,
+ &c_b29, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + a_dim1]
+ , lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &y[
+ i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b15, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__;
+ sgemv_("Transpose", &i__2, &i__, &c_b15, &x[i__ + 1 + x_dim1],
+ ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &y[
+ i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ sgemv_("Transpose", &i__, &i__2, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &c_b15,
+ &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ sscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of SLABRD */
+
+} /* slabrd_ */
+
+/* Subroutine */ int slacpy_(char *uplo, integer *m, integer *n, real *a,
+ integer *lda, real *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLACPY copies all or part of a two-dimensional matrix A to another
+ matrix B.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be copied to B.
+ = 'U': Upper triangular part
+ = 'L': Lower triangular part
+ Otherwise: All of the matrix A
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input) REAL array, dimension (LDA,N)
+ The m by n matrix A. If UPLO = 'U', only the upper triangle
+ or trapezoid is accessed; if UPLO = 'L', only the lower
+ triangle or trapezoid is accessed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (output) REAL array, dimension (LDB,N)
+ On exit, B = A in the locations specified by UPLO.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M).
+
+ =====================================================================
+*/
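+
+/*
+    Editor's note (illustrative sketch, not part of the f2c-generated
+    source): a typical call copying the upper triangle of a 3-by-3
+    column-major matrix A into B would look like
+
+        integer m = 3, n = 3, lda = 3, ldb = 3;
+        real a[9] = { 1.f, 4.f, 7.f,  2.f, 5.f, 8.f,  3.f, 6.f, 9.f };
+        real b[9];
+        slacpy_("U", &m, &n, a, &lda, b, &ldb);
+
+    Only entries b[i + j*ldb] with i <= j are written; the strictly
+    lower triangle of B is left untouched.
+*/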
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
+/* L10: */
+ }
+/* L20: */
+ }
+ } else if (lsame_(uplo, "L")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+ return 0;
+
+/* End of SLACPY */
+
+} /* slacpy_ */
+
+/* Subroutine */ int sladiv_(real *a, real *b, real *c__, real *d__, real *p,
+ real *q)
+{
+ static real e, f;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLADIV performs complex division in real arithmetic
+
+ a + i*b
+ p + i*q = ---------
+ c + i*d
+
+ The algorithm is due to Robert L. Smith and can be found
+ in D. Knuth, The Art of Computer Programming, Vol. 2, p. 195.
+
+ Arguments
+ =========
+
+ A (input) REAL
+ B (input) REAL
+ C (input) REAL
+ D (input) REAL
+ The scalars a, b, c, and d in the above expression.
+
+ P (output) REAL
+ Q (output) REAL
+ The scalars p and q in the above expression.
+
+ =====================================================================
+*/
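+
+/*
+    Editor's note (worked example, not part of the f2c-generated source):
+    dividing (1 + 2i) by (3 + 4i) takes the |d| >= |c| branch below:
+    e = c/d = 0.75, f = d + c*e = 6.25, p = (b + a*e)/f = 0.44,
+    q = (-a + b*e)/f = 0.08, i.e. (1 + 2i)/(3 + 4i) = 0.44 + 0.08i.
+    A call sketch:
+
+        real a = 1.f, b = 2.f, c = 3.f, d = 4.f, p, q;
+        sladiv_(&a, &b, &c, &d, &p, &q);
+
+    after which p == 0.44f and q == 0.08f up to rounding.
+*/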
+
+
+ if (dabs(*d__) < dabs(*c__)) {
+ e = *d__ / *c__;
+ f = *c__ + *d__ * e;
+ *p = (*a + *b * e) / f;
+ *q = (*b - *a * e) / f;
+ } else {
+ e = *c__ / *d__;
+ f = *d__ + *c__ * e;
+ *p = (*b + *a * e) / f;
+ *q = (-(*a) + *b * e) / f;
+ }
+
+ return 0;
+
+/* End of SLADIV */
+
+} /* sladiv_ */
+
+/* Subroutine */ int slae2_(real *a, real *b, real *c__, real *rt1, real *rt2)
+{
+ /* System generated locals */
+ real r__1;
+
+ /* Local variables */
+ static real ab, df, tb, sm, rt, adf, acmn, acmx;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAE2 computes the eigenvalues of a 2-by-2 symmetric matrix
+ [ A B ]
+ [ B C ].
+ On return, RT1 is the eigenvalue of larger absolute value, and RT2
+ is the eigenvalue of smaller absolute value.
+
+ Arguments
+ =========
+
+ A (input) REAL
+ The (1,1) element of the 2-by-2 matrix.
+
+ B (input) REAL
+ The (1,2) and (2,1) elements of the 2-by-2 matrix.
+
+ C (input) REAL
+ The (2,2) element of the 2-by-2 matrix.
+
+ RT1 (output) REAL
+ The eigenvalue of larger absolute value.
+
+ RT2 (output) REAL
+ The eigenvalue of smaller absolute value.
+
+ Further Details
+ ===============
+
+ RT1 is accurate to a few ulps barring over/underflow.
+
+ RT2 may be inaccurate if there is massive cancellation in the
+ determinant A*C-B*B; higher precision or correctly rounded or
+ correctly truncated arithmetic would be needed to compute RT2
+ accurately in all cases.
+
+ Overflow is possible only if RT1 is within a factor of 5 of overflow.
+ Underflow is harmless if the input data is 0 or exceeds
+ underflow_threshold / macheps.
+
+ =====================================================================
+
+
+ Compute the eigenvalues
+*/
+
+ sm = *a + *c__;
+ df = *a - *c__;
+ adf = dabs(df);
+ tb = *b + *b;
+ ab = dabs(tb);
+ if (dabs(*a) > dabs(*c__)) {
+ acmx = *a;
+ acmn = *c__;
+ } else {
+ acmx = *c__;
+ acmn = *a;
+ }
+ if (adf > ab) {
+/* Computing 2nd power */
+ r__1 = ab / adf;
+ rt = adf * sqrt(r__1 * r__1 + 1.f);
+ } else if (adf < ab) {
+/* Computing 2nd power */
+ r__1 = adf / ab;
+ rt = ab * sqrt(r__1 * r__1 + 1.f);
+ } else {
+
+/* Includes case AB=ADF=0 */
+
+ rt = ab * sqrt(2.f);
+ }
+ if (sm < 0.f) {
+ *rt1 = (sm - rt) * .5f;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else if (sm > 0.f) {
+ *rt1 = (sm + rt) * .5f;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else {
+
+/* Includes case RT1 = RT2 = 0 */
+
+ *rt1 = rt * .5f;
+ *rt2 = rt * -.5f;
+ }
+ return 0;
+
+/* End of SLAE2 */
+
+} /* slae2_ */
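+
+/*
+    Editor's note (worked example for SLAE2 above, not part of the
+    f2c-generated source): for the matrix [ 2 1 ; 1 2 ], i.e. A = 2,
+    B = 1, C = 2, the code computes SM = 4, ADF = 0, AB = 2, RT = 2
+    and returns RT1 = 3 and RT2 = 1, the exact eigenvalues.
+
+        real a = 2.f, b = 1.f, c = 2.f, rt1, rt2;
+        slae2_(&a, &b, &c, &rt1, &rt2);
+*/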
+
+/* Subroutine */ int slaed0_(integer *icompq, integer *qsiz, integer *n, real
+ *d__, real *e, real *q, integer *ldq, real *qstore, integer *ldqs,
+ real *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, qstore_dim1, qstore_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, k, iq, lgn, msd2, smm1, spm1, spm2;
+ static real temp;
+ static integer curr;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer iperm, indxq, iwrem;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ static integer iqptr, tlvls;
+ extern /* Subroutine */ int slaed1_(integer *, real *, real *, integer *,
+ integer *, real *, integer *, real *, integer *, integer *),
+ slaed7_(integer *, integer *, integer *, integer *, integer *,
+ integer *, real *, real *, integer *, integer *, real *, integer *
+ , real *, integer *, integer *, integer *, integer *, integer *,
+ real *, real *, integer *, integer *);
+ static integer igivcl;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer igivnm, submat;
+ extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *,
+ integer *, real *, integer *);
+ static integer curprb, subpbs, igivpt, curlvl, matsiz, iprmpt, smlsiz;
+ extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *,
+ real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAED0 computes all eigenvalues and corresponding eigenvectors of a
+ symmetric tridiagonal matrix using the divide and conquer method.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ = 0: Compute eigenvalues only.
+ = 1: Compute eigenvectors of original dense symmetric matrix
+ also. On entry, Q contains the orthogonal matrix used
+ to reduce the original matrix to tridiagonal form.
+ = 2: Compute eigenvalues and eigenvectors of tridiagonal
+ matrix.
+
+ QSIZ (input) INTEGER
+ The dimension of the orthogonal matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the main diagonal of the tridiagonal matrix.
+ On exit, its eigenvalues.
+
+ E (input) REAL array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Q (input/output) REAL array, dimension (LDQ, N)
+ On entry, Q must contain an N-by-N orthogonal matrix.
+ If ICOMPQ = 0 Q is not referenced.
+ If ICOMPQ = 1 On entry, Q is a subset of the columns of the
+ orthogonal matrix used to reduce the full
+ matrix to tridiagonal form corresponding to
+ the subset of the full matrix which is being
+ decomposed at this time.
+ If ICOMPQ = 2 On entry, Q will be the identity matrix.
+ On exit, Q contains the eigenvectors of the
+ tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. If eigenvectors are
+ desired, then LDQ >= max(1,N). In any case, LDQ >= 1.
+
+ QSTORE (workspace) REAL array, dimension (LDQS, N)
+ Referenced only when ICOMPQ = 1. Used to store parts of
+ the eigenvector matrix when the updating matrix multiplies
+ take place.
+
+ LDQS (input) INTEGER
+ The leading dimension of the array QSTORE. If ICOMPQ = 1,
+ then LDQS >= max(1,N). In any case, LDQS >= 1.
+
+ WORK (workspace) REAL array,
+ If ICOMPQ = 0 or 1, the dimension of WORK must be at least
+ 1 + 3*N + 2*N*lg N + 2*N**2
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+ If ICOMPQ = 2, the dimension of WORK must be at least
+ 4*N + N**2.
+
+ IWORK (workspace) INTEGER array,
+ If ICOMPQ = 0 or 1, the dimension of IWORK must be at least
+ 6 + 6*N + 5*N*lg N.
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+ If ICOMPQ = 2, the dimension of IWORK must be at least
+ 3 + 5*N.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ qstore_dim1 = *ldqs;
+ qstore_offset = 1 + qstore_dim1;
+ qstore -= qstore_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 2) {
+ *info = -1;
+ } else if (*icompq == 1 && *qsiz < max(0,*n)) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ldq < max(1,*n)) {
+ *info = -7;
+ } else if (*ldqs < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED0", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ smlsiz = ilaenv_(&c__9, "SLAED0", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+
+/*
+ Determine the size and placement of the submatrices, and save in
+ the leading elements of IWORK.
+*/
+
+ iwork[1] = *n;
+ subpbs = 1;
+ tlvls = 0;
+L10:
+ if (iwork[subpbs] > smlsiz) {
+ for (j = subpbs; j >= 1; --j) {
+ iwork[j * 2] = (iwork[j] + 1) / 2;
+ iwork[(j << 1) - 1] = iwork[j] / 2;
+/* L20: */
+ }
+ ++tlvls;
+ subpbs <<= 1;
+ goto L10;
+ }
+ i__1 = subpbs;
+ for (j = 2; j <= i__1; ++j) {
+ iwork[j] += iwork[j - 1];
+/* L30: */
+ }
+
+/*
+ Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1
+ using rank-1 modifications (cuts).
+*/
+
+ spm1 = subpbs - 1;
+ i__1 = spm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ submat = iwork[i__] + 1;
+ smm1 = submat - 1;
+ d__[smm1] -= (r__1 = e[smm1], dabs(r__1));
+ d__[submat] -= (r__1 = e[smm1], dabs(r__1));
+/* L40: */
+ }
+
+ indxq = (*n << 2) + 3;
+ if (*icompq != 2) {
+
+/*
+ Set up workspaces for eigenvalues only/accumulate new vectors
+ routine
+*/
+
+ temp = log((real) (*n)) / log(2.f);
+ lgn = (integer) temp;
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
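+
+/*
+    Editor's note: the guard above appears twice on purpose (the
+    duplication is present in the reference Fortran DLAED0/SLAED0 as
+    well); it compensates for rounding in the single-precision logarithm
+    so that 2**LGN >= N holds on exit.
+*/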
+ iprmpt = indxq + *n + 1;
+ iperm = iprmpt + *n * lgn;
+ iqptr = iperm + *n * lgn;
+ igivpt = iqptr + *n + 2;
+ igivcl = igivpt + *n * lgn;
+
+ igivnm = 1;
+ iq = igivnm + (*n << 1) * lgn;
+/* Computing 2nd power */
+ i__1 = *n;
+ iwrem = iq + i__1 * i__1 + 1;
+
+/* Initialize pointers */
+
+ i__1 = subpbs;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ iwork[iprmpt + i__] = 1;
+ iwork[igivpt + i__] = 1;
+/* L50: */
+ }
+ iwork[iqptr] = 1;
+ }
+
+/*
+ Solve each submatrix eigenproblem at the bottom of the divide and
+ conquer tree.
+*/
+
+ curr = 0;
+ i__1 = spm1;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[1];
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 1] - iwork[i__];
+ }
+ if (*icompq == 2) {
+ ssteqr_("I", &matsiz, &d__[submat], &e[submat], &q[submat +
+ submat * q_dim1], ldq, &work[1], info);
+ if (*info != 0) {
+ goto L130;
+ }
+ } else {
+ ssteqr_("I", &matsiz, &d__[submat], &e[submat], &work[iq - 1 +
+ iwork[iqptr + curr]], &matsiz, &work[1], info);
+ if (*info != 0) {
+ goto L130;
+ }
+ if (*icompq == 1) {
+ sgemm_("N", "N", qsiz, &matsiz, &matsiz, &c_b15, &q[submat *
+ q_dim1 + 1], ldq, &work[iq - 1 + iwork[iqptr + curr]],
+ &matsiz, &c_b29, &qstore[submat * qstore_dim1 + 1],
+ ldqs);
+ }
+/* Computing 2nd power */
+ i__2 = matsiz;
+ iwork[iqptr + curr + 1] = iwork[iqptr + curr] + i__2 * i__2;
+ ++curr;
+ }
+ k = 1;
+ i__2 = iwork[i__ + 1];
+ for (j = submat; j <= i__2; ++j) {
+ iwork[indxq + j] = k;
+ ++k;
+/* L60: */
+ }
+/* L70: */
+ }
+
+/*
+ Successively merge eigensystems of adjacent submatrices
+ into eigensystem for the corresponding larger matrix.
+
+ while ( SUBPBS > 1 )
+*/
+
+ curlvl = 1;
+L80:
+ if (subpbs > 1) {
+ spm2 = subpbs - 2;
+ i__1 = spm2;
+ for (i__ = 0; i__ <= i__1; i__ += 2) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[2];
+ msd2 = iwork[1];
+ curprb = 0;
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 2] - iwork[i__];
+ msd2 = matsiz / 2;
+ ++curprb;
+ }
+
+/*
+ Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2)
+ into an eigensystem of size MATSIZ.
+ SLAED1 is used only for the full eigensystem of a tridiagonal
+ matrix.
+ SLAED7 handles the cases in which eigenvalues only or eigenvalues
+ and eigenvectors of a full symmetric matrix (which was reduced to
+ tridiagonal form) are desired.
+*/
+
+ if (*icompq == 2) {
+ slaed1_(&matsiz, &d__[submat], &q[submat + submat * q_dim1],
+ ldq, &iwork[indxq + submat], &e[submat + msd2 - 1], &
+ msd2, &work[1], &iwork[subpbs + 1], info);
+ } else {
+ slaed7_(icompq, &matsiz, qsiz, &tlvls, &curlvl, &curprb, &d__[
+ submat], &qstore[submat * qstore_dim1 + 1], ldqs, &
+ iwork[indxq + submat], &e[submat + msd2 - 1], &msd2, &
+ work[iq], &iwork[iqptr], &iwork[iprmpt], &iwork[iperm]
+ , &iwork[igivpt], &iwork[igivcl], &work[igivnm], &
+ work[iwrem], &iwork[subpbs + 1], info);
+ }
+ if (*info != 0) {
+ goto L130;
+ }
+ iwork[i__ / 2 + 1] = iwork[i__ + 2];
+/* L90: */
+ }
+ subpbs /= 2;
+ ++curlvl;
+ goto L80;
+ }
+
+/*
+ end while
+
+ Re-merge the eigenvalues/vectors which were deflated at the final
+ merge step.
+*/
+
+ if (*icompq == 1) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ work[i__] = d__[j];
+ scopy_(qsiz, &qstore[j * qstore_dim1 + 1], &c__1, &q[i__ * q_dim1
+ + 1], &c__1);
+/* L100: */
+ }
+ scopy_(n, &work[1], &c__1, &d__[1], &c__1);
+ } else if (*icompq == 2) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ work[i__] = d__[j];
+ scopy_(n, &q[j * q_dim1 + 1], &c__1, &work[*n * i__ + 1], &c__1);
+/* L110: */
+ }
+ scopy_(n, &work[1], &c__1, &d__[1], &c__1);
+ slacpy_("A", n, n, &work[*n + 1], n, &q[q_offset], ldq);
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ work[i__] = d__[j];
+/* L120: */
+ }
+ scopy_(n, &work[1], &c__1, &d__[1], &c__1);
+ }
+ goto L140;
+
+L130:
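+/*
+    Editor's note: encode the failing block as INFO = SUBMAT*(N+1) +
+    (SUBMAT+MATSIZ-1), so that INFO/(N+1) and mod(INFO,N+1) recover the
+    first and last row/column of the offending submatrix, matching the
+    description of INFO in the header above.
+*/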
+ *info = submat * (*n + 1) + submat + matsiz - 1;
+
+L140:
+ return 0;
+
+/* End of SLAED0 */
+
+} /* slaed0_ */
+
+/* Subroutine */ int slaed1_(integer *n, real *d__, real *q, integer *ldq,
+ integer *indxq, real *rho, integer *cutpnt, real *work, integer *
+ iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, k, n1, n2, is, iw, iz, iq2, cpp1, indx, indxc, indxp;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), slaed2_(integer *, integer *, integer *, real *, real
+ *, integer *, integer *, real *, real *, real *, real *, real *,
+ integer *, integer *, integer *, integer *, integer *), slaed3_(
+ integer *, integer *, integer *, real *, real *, integer *, real *
+ , real *, real *, integer *, integer *, real *, real *, integer *)
+ ;
+ static integer idlmda;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slamrg_(
+ integer *, integer *, real *, integer *, integer *, integer *);
+ static integer coltyp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAED1 computes the updated eigensystem of a diagonal
+ matrix after modification by a rank-one symmetric matrix. This
+ routine is used only for the eigenproblem which requires all
+ eigenvalues and eigenvectors of a tridiagonal matrix. SLAED7 handles
+ the case in which eigenvalues only or eigenvalues and eigenvectors
+ of a full symmetric matrix (which was reduced to tridiagonal form)
+ are desired.
+
+ T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out)
+
+ where Z = Q'u, u is a vector of length N with ones in the
+ CUTPNT-th and (CUTPNT+1)-th elements and zeros elsewhere.
+
+ The eigenvectors of the original matrix are stored in Q, and the
+ eigenvalues are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple eigenvalues or if there is a zero in
+ the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine SLAED2.
+
+ The second stage consists of calculating the updated
+ eigenvalues. This is done by finding the roots of the secular
+ equation via the routine SLAED4 (as called by SLAED3).
+ This routine also calculates the eigenvectors of the current
+ problem.
+
+ The final stage consists of computing the updated eigenvectors
+ directly using the updated eigenvalues. The eigenvectors for
+ the current problem are multiplied with the eigenvectors from
+ the overall problem.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the eigenvalues of the rank-1-perturbed matrix.
+ On exit, the eigenvalues of the repaired matrix.
+
+ Q (input/output) REAL array, dimension (LDQ,N)
+ On entry, the eigenvectors of the rank-1-perturbed matrix.
+ On exit, the eigenvectors of the repaired tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (input/output) INTEGER array, dimension (N)
+ On entry, the permutation which separately sorts the two
+ subproblems in D into ascending order.
+ On exit, the permutation which will reintegrate the
+ subproblems back into sorted order,
+ i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.
+
+ RHO (input) REAL
+ The subdiagonal entry used to create the rank-1 modification.
+
+ CUTPNT (input) INTEGER
+ The location of the last eigenvalue in the leading sub-matrix.
+ min(1,N) <= CUTPNT <= N/2.
+
+ WORK (workspace) REAL array, dimension (4*N + N**2)
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ldq < max(1,*n)) {
+ *info = -4;
+ } else /* if(complicated condition) */ {
+/* Computing MIN */
+ i__1 = 1, i__2 = *n / 2;
+ if (min(i__1,i__2) > *cutpnt || *n / 2 < *cutpnt) {
+ *info = -7;
+ }
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED1", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/*
+ The following values are integer pointers which indicate
+ the portion of the workspace
+ used by a particular array in SLAED2 and SLAED3.
+*/
+
+ iz = 1;
+ idlmda = iz + *n;
+ iw = idlmda + *n;
+ iq2 = iw + *n;
+
+ indx = 1;
+ indxc = indx + *n;
+ coltyp = indxc + *n;
+ indxp = coltyp + *n;
+
+
+/*
+ Form the z-vector which consists of the last row of Q_1 and the
+ first row of Q_2.
+*/
+
+ scopy_(cutpnt, &q[*cutpnt + q_dim1], ldq, &work[iz], &c__1);
+ cpp1 = *cutpnt + 1;
+ i__1 = *n - *cutpnt;
+ scopy_(&i__1, &q[cpp1 + cpp1 * q_dim1], ldq, &work[iz + *cutpnt], &c__1);
+
+/* Deflate eigenvalues. */
+
+ slaed2_(&k, n, cutpnt, &d__[1], &q[q_offset], ldq, &indxq[1], rho, &work[
+ iz], &work[idlmda], &work[iw], &work[iq2], &iwork[indx], &iwork[
+ indxc], &iwork[indxp], &iwork[coltyp], info);
+
+ if (*info != 0) {
+ goto L20;
+ }
+
+/* Solve Secular Equation. */
+
+ if (k != 0) {
+ is = (iwork[coltyp] + iwork[coltyp + 1]) * *cutpnt + (iwork[coltyp +
+ 1] + iwork[coltyp + 2]) * (*n - *cutpnt) + iq2;
+ slaed3_(&k, n, cutpnt, &d__[1], &q[q_offset], ldq, rho, &work[idlmda],
+ &work[iq2], &iwork[indxc], &iwork[coltyp], &work[iw], &work[
+ is], info);
+ if (*info != 0) {
+ goto L20;
+ }
+
+/* Prepare the INDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = *n - k;
+ slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &indxq[1]);
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indxq[i__] = i__;
+/* L10: */
+ }
+ }
+
+L20:
+ return 0;
+
+/* End of SLAED1 */
+
+} /* slaed1_ */
+
+/* Subroutine */ int slaed2_(integer *k, integer *n, integer *n1, real *d__,
+ real *q, integer *ldq, integer *indxq, real *rho, real *z__, real *
+ dlamda, real *w, real *q2, integer *indx, integer *indxc, integer *
+ indxp, integer *coltyp, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+ real r__1, r__2, r__3, r__4;
+
+ /* Local variables */
+ static real c__;
+ static integer i__, j;
+ static real s, t;
+ static integer k2, n2, ct, nj, pj, js, iq1, iq2, n1p1;
+ static real eps, tau, tol;
+ static integer psm[4], imax, jmax, ctot[4];
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *), sscal_(integer *, real *, real *,
+ integer *), scopy_(integer *, real *, integer *, real *, integer *
+ );
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer isamax_(integer *, real *, integer *);
+ extern /* Subroutine */ int slamrg_(integer *, integer *, real *, integer
+ *, integer *, integer *), slacpy_(char *, integer *, integer *,
+ real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAED2 merges the two sets of eigenvalues together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ eigenvalues are close together or if there is a tiny entry in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ Arguments
+ =========
+
+ K (output) INTEGER
+ The number of non-deflated eigenvalues, and the order of the
+ related secular equation. 0 <= K <= N.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ N1 (input) INTEGER
+ The location of the last eigenvalue in the leading sub-matrix.
+ min(1,N) <= N1 <= N/2.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, D contains the eigenvalues of the two submatrices to
+ be combined.
+ On exit, D contains the trailing (N-K) updated eigenvalues
+ (those which were deflated) sorted into increasing order.
+
+ Q (input/output) REAL array, dimension (LDQ, N)
+ On entry, Q contains the eigenvectors of two submatrices in
+ the two square blocks with corners at (1,1), (N1,N1)
+ and (N1+1, N1+1), (N,N).
+ On exit, Q contains the trailing (N-K) updated eigenvectors
+ (those which were deflated) in its last N-K columns.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (input/output) INTEGER array, dimension (N)
+ The permutation which separately sorts the two sub-problems
+ in D into ascending order. Note that elements in the second
+ half of this permutation must first have N1 added to their
+ values. Destroyed on exit.
+
+ RHO (input/output) REAL
+ On entry, the off-diagonal element associated with the rank-1
+ cut which originally split the two submatrices which are now
+ being recombined.
+ On exit, RHO has been modified to the value required by
+ SLAED3.
+
+ Z (input) REAL array, dimension (N)
+ On entry, Z contains the updating vector (the last
+ row of the first sub-eigenvector matrix and the first row of
+ the second sub-eigenvector matrix).
+ On exit, the contents of Z have been destroyed by the updating
+ process.
+
+ DLAMDA (output) REAL array, dimension (N)
+ A copy of the first K eigenvalues which will be used by
+ SLAED3 to form the secular equation.
+
+ W (output) REAL array, dimension (N)
+ The first k values of the final deflation-altered z-vector
+ which will be passed to SLAED3.
+
+ Q2 (output) REAL array, dimension (N1**2+(N-N1)**2)
+ A copy of the first K eigenvectors which will be used by
+ SLAED3 in a matrix multiply (SGEMM) to solve for the new
+ eigenvectors.
+
+ INDX (workspace) INTEGER array, dimension (N)
+ The permutation used to sort the contents of DLAMDA into
+ ascending order.
+
+ INDXC (output) INTEGER array, dimension (N)
+ The permutation used to arrange the columns of the deflated
+ Q matrix into three groups: the first group contains non-zero
+ elements only at and above N1, the second contains
+ non-zero elements only below N1, and the third is dense.
+
+ INDXP (workspace) INTEGER array, dimension (N)
+ The permutation used to place deflated values of D at the end
+ of the array. INDXP(1:K) points to the nondeflated D-values
+ and INDXP(K+1:N) points to the deflated eigenvalues.
+
+ COLTYP (workspace/output) INTEGER array, dimension (N)
+ During execution, a label which will indicate which of the
+ following types a column in the Q2 matrix is:
+ 1 : non-zero in the upper half only;
+ 2 : dense;
+ 3 : non-zero in the lower half only;
+ 4 : deflated.
+ On exit, COLTYP(i) is the number of columns of type i,
+ for i=1 to 4 only.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --z__;
+ --dlamda;
+ --w;
+ --q2;
+ --indx;
+ --indxc;
+ --indxp;
+ --coltyp;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -2;
+ } else if (*ldq < max(1,*n)) {
+ *info = -6;
+ } else /* if(complicated condition) */ {
+/* Computing MIN */
+ i__1 = 1, i__2 = *n / 2;
+ if (min(i__1,i__2) > *n1 || *n / 2 < *n1) {
+ *info = -3;
+ }
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ n2 = *n - *n1;
+ n1p1 = *n1 + 1;
+
+ if (*rho < 0.f) {
+ sscal_(&n2, &c_b151, &z__[n1p1], &c__1);
+ }
+
+/*
+ Normalize z so that norm(z) = 1. Since z is the concatenation of
+ two normalized vectors, norm2(z) = sqrt(2).
+*/
+
+ t = 1.f / sqrt(2.f);
+ sscal_(n, &t, &z__[1], &c__1);
+
+/* RHO = ABS( norm(z)**2 * RHO ) */
+
+ *rho = (r__1 = *rho * 2.f, dabs(r__1));
+
+/* Sort the eigenvalues into increasing order */
+
+ i__1 = *n;
+ for (i__ = n1p1; i__ <= i__1; ++i__) {
+ indxq[i__] += *n1;
+/* L10: */
+ }
+
+/* re-integrate the deflated parts from the last pass */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = d__[indxq[i__]];
+/* L20: */
+ }
+ slamrg_(n1, &n2, &dlamda[1], &c__1, &c__1, &indxc[1]);
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indx[i__] = indxq[indxc[i__]];
+/* L30: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ imax = isamax_(n, &z__[1], &c__1);
+ jmax = isamax_(n, &d__[1], &c__1);
+ eps = slamch_("Epsilon");
+/* Computing MAX */
+ r__3 = (r__1 = d__[jmax], dabs(r__1)), r__4 = (r__2 = z__[imax], dabs(
+ r__2));
+ tol = eps * 8.f * dmax(r__3,r__4);
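+
+/*
+    Editor's note: the tolerance is 8*EPS*max(|D(JMAX)|, |Z(IMAX)|);
+    below, component J is deflated whenever RHO*|Z(J)| <= TOL.
+*/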
+
+/*
+ If the rank-1 modifier is small enough, no more needs to be done
+ except to reorganize Q so that its columns correspond with the
+ elements in D.
+*/
+
+ if (*rho * (r__1 = z__[imax], dabs(r__1)) <= tol) {
+ *k = 0;
+ iq2 = 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__ = indx[j];
+ scopy_(n, &q[i__ * q_dim1 + 1], &c__1, &q2[iq2], &c__1);
+ dlamda[j] = d__[i__];
+ iq2 += *n;
+/* L40: */
+ }
+ slacpy_("A", n, n, &q2[1], n, &q[q_offset], ldq);
+ scopy_(n, &dlamda[1], &c__1, &d__[1], &c__1);
+ goto L190;
+ }
+
+/*
+ If there are multiple eigenvalues then the problem deflates. Here
+ the number of equal eigenvalues are found. As each equal
+ eigenvalue is found, an elementary reflector is computed to rotate
+ the corresponding eigensubspace so that the corresponding
+ components of Z are zero in this new basis.
+*/
+
+ i__1 = *n1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ coltyp[i__] = 1;
+/* L50: */
+ }
+ i__1 = *n;
+ for (i__ = n1p1; i__ <= i__1; ++i__) {
+ coltyp[i__] = 3;
+/* L60: */
+ }
+
+
+ *k = 0;
+ k2 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ nj = indx[j];
+ if (*rho * (r__1 = z__[nj], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ coltyp[nj] = 4;
+ indxp[k2] = nj;
+ if (j == *n) {
+ goto L100;
+ }
+ } else {
+ pj = nj;
+ goto L80;
+ }
+/* L70: */
+ }
+L80:
+ ++j;
+ nj = indx[j];
+ if (j > *n) {
+ goto L100;
+ }
+ if (*rho * (r__1 = z__[nj], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ coltyp[nj] = 4;
+ indxp[k2] = nj;
+ } else {
+
+/* Check if eigenvalues are close enough to allow deflation. */
+
+ s = z__[pj];
+ c__ = z__[nj];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = slapy2_(&c__, &s);
+ t = d__[nj] - d__[pj];
+ c__ /= tau;
+ s = -s / tau;
+ if ((r__1 = t * c__ * s, dabs(r__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ z__[nj] = tau;
+ z__[pj] = 0.f;
+ if (coltyp[nj] != coltyp[pj]) {
+ coltyp[nj] = 2;
+ }
+ coltyp[pj] = 4;
+ srot_(n, &q[pj * q_dim1 + 1], &c__1, &q[nj * q_dim1 + 1], &c__1, &
+ c__, &s);
+/* Computing 2nd power */
+ r__1 = c__;
+/* Computing 2nd power */
+ r__2 = s;
+ t = d__[pj] * (r__1 * r__1) + d__[nj] * (r__2 * r__2);
+/* Computing 2nd power */
+ r__1 = s;
+/* Computing 2nd power */
+ r__2 = c__;
+ d__[nj] = d__[pj] * (r__1 * r__1) + d__[nj] * (r__2 * r__2);
+ d__[pj] = t;
+ --k2;
+ i__ = 1;
+L90:
+ if (k2 + i__ <= *n) {
+ if (d__[pj] < d__[indxp[k2 + i__]]) {
+ indxp[k2 + i__ - 1] = indxp[k2 + i__];
+ indxp[k2 + i__] = pj;
+ ++i__;
+ goto L90;
+ } else {
+ indxp[k2 + i__ - 1] = pj;
+ }
+ } else {
+ indxp[k2 + i__ - 1] = pj;
+ }
+ pj = nj;
+ } else {
+ ++(*k);
+ dlamda[*k] = d__[pj];
+ w[*k] = z__[pj];
+ indxp[*k] = pj;
+ pj = nj;
+ }
+ }
+ goto L80;
+L100:
+
+/* Record the last eigenvalue. */
+
+ ++(*k);
+ dlamda[*k] = d__[pj];
+ w[*k] = z__[pj];
+ indxp[*k] = pj;
+
+/*
+ Count up the total number of the various types of columns, then
+ form a permutation which positions the four column types into
+ four uniform groups (although one or more of these groups may be
+ empty).
+*/
+
+ for (j = 1; j <= 4; ++j) {
+ ctot[j - 1] = 0;
+/* L110: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ ct = coltyp[j];
+ ++ctot[ct - 1];
+/* L120: */
+ }
+
+/* PSM(*) = Position in SubMatrix (of types 1 through 4) */
+
+ psm[0] = 1;
+ psm[1] = ctot[0] + 1;
+ psm[2] = psm[1] + ctot[1];
+ psm[3] = psm[2] + ctot[2];
+ *k = *n - ctot[3];
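+
+/*
+    Editor's note: CTOT(4) counts the deflated (type-4) columns, so
+    K = N - CTOT(4) eigenvalues remain for the secular equation that
+    SLAED3 solves next.
+*/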
+
+/*
+ Fill out the INDXC array so that the permutation which it induces
+ will place all type-1 columns first, all type-2 columns next,
+ then all type-3's, and finally all type-4's.
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ js = indxp[j];
+ ct = coltyp[js];
+ indx[psm[ct - 1]] = js;
+ indxc[psm[ct - 1]] = j;
+ ++psm[ct - 1];
+/* L130: */
+ }
+
+/*
+ Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+ and Q2 respectively. The eigenvalues/vectors which were not
+ deflated go into the first K slots of DLAMDA and Q2 respectively,
+ while those which were deflated go into the last N - K slots.
+*/
+
+ i__ = 1;
+ iq1 = 1;
+ iq2 = (ctot[0] + ctot[1]) * *n1 + 1;
+ i__1 = ctot[0];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ scopy_(n1, &q[js * q_dim1 + 1], &c__1, &q2[iq1], &c__1);
+ z__[i__] = d__[js];
+ ++i__;
+ iq1 += *n1;
+/* L140: */
+ }
+
+ i__1 = ctot[1];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ scopy_(n1, &q[js * q_dim1 + 1], &c__1, &q2[iq1], &c__1);
+ scopy_(&n2, &q[*n1 + 1 + js * q_dim1], &c__1, &q2[iq2], &c__1);
+ z__[i__] = d__[js];
+ ++i__;
+ iq1 += *n1;
+ iq2 += n2;
+/* L150: */
+ }
+
+ i__1 = ctot[2];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ scopy_(&n2, &q[*n1 + 1 + js * q_dim1], &c__1, &q2[iq2], &c__1);
+ z__[i__] = d__[js];
+ ++i__;
+ iq2 += n2;
+/* L160: */
+ }
+
+ iq1 = iq2;
+ i__1 = ctot[3];
+ for (j = 1; j <= i__1; ++j) {
+ js = indx[i__];
+ scopy_(n, &q[js * q_dim1 + 1], &c__1, &q2[iq2], &c__1);
+ iq2 += *n;
+ z__[i__] = d__[js];
+ ++i__;
+/* L170: */
+ }
+
+/*
+ The deflated eigenvalues and their corresponding vectors go back
+ into the last N - K slots of D and Q respectively.
+*/
+
+ slacpy_("A", n, &ctot[3], &q2[iq1], n, &q[(*k + 1) * q_dim1 + 1], ldq);
+ i__1 = *n - *k;
+ scopy_(&i__1, &z__[*k + 1], &c__1, &d__[*k + 1], &c__1);
+
+/* Copy CTOT into COLTYP for referencing in SLAED3. */
+
+ for (j = 1; j <= 4; ++j) {
+ coltyp[j] = ctot[j - 1];
+/* L180: */
+ }
+
+L190:
+ return 0;
+
+/* End of SLAED2 */
+
+} /* slaed2_ */
+
+/* Subroutine */ int slaed3_(integer *k, integer *n, integer *n1, real *d__,
+ real *q, integer *ldq, real *rho, real *dlamda, real *q2, integer *
+ indx, integer *ctot, real *w, real *s, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, n2, n12, ii, n23, iq2;
+ static real temp;
+ extern doublereal snrm2_(integer *, real *, integer *);
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *), scopy_(integer *, real *,
+ integer *, real *, integer *), slaed4_(integer *, integer *, real
+ *, real *, real *, real *, real *, integer *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), slacpy_(
+ char *, integer *, integer *, real *, integer *, real *, integer *
+ ), slaset_(char *, integer *, integer *, real *, real *,
+ real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAED3 finds the roots of the secular equation, as defined by the
+ values in D, W, and RHO, between 1 and K. It makes the
+ appropriate calls to SLAED4 and then updates the eigenvectors by
+ multiplying the matrix of eigenvectors of the pair of eigensystems
+ being combined by the matrix of eigenvectors of the K-by-K system
+ which is solved here.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ K (input) INTEGER
+ The number of terms in the rational function to be solved by
+ SLAED4. K >= 0.
+
+ N (input) INTEGER
+ The number of rows and columns in the Q matrix.
+ N >= K (deflation may result in N>K).
+
+ N1 (input) INTEGER
+ The location of the last eigenvalue in the leading submatrix.
+ min(1,N) <= N1 <= N/2.
+
+ D (output) REAL array, dimension (N)
+ D(I) contains the updated eigenvalues for
+ 1 <= I <= K.
+
+ Q (output) REAL array, dimension (LDQ,N)
+ Initially the first K columns are used as workspace.
+ On output the columns 1 to K contain
+ the updated eigenvectors.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ RHO (input) REAL
+ The value of the parameter in the rank one update equation.
+ RHO >= 0 required.
+
+ DLAMDA (input/output) REAL array, dimension (K)
+ The first K elements of this array contain the old roots
+ of the deflated updating problem. These are the poles
+ of the secular equation. May be changed on output by
+ having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
+ Cray-2, or Cray C-90, as described above.
+
+ Q2 (input) REAL array, dimension (LDQ2, N)
+ The first K columns of this matrix contain the non-deflated
+ eigenvectors for the split problem.
+
+ INDX (input) INTEGER array, dimension (N)
+ The permutation used to arrange the columns of the deflated
+ Q matrix into three groups (see SLAED2).
+ The rows of the eigenvectors found by SLAED4 must be likewise
+ permuted before the matrix multiply can take place.
+
+ CTOT (input) INTEGER array, dimension (4)
+ A count of the total number of the various types of columns
+ in Q, as described in INDX. The fourth column type is any
+ column which has been deflated.
+
+ W (input/output) REAL array, dimension (K)
+ The first K elements of this array contain the components
+ of the deflation-adjusted updating vector. Destroyed on
+ output.
+
+ S (workspace) REAL array, dimension (N1 + 1)*K
+ Will contain the eigenvectors of the repaired matrix which
+ will be multiplied by the previously accumulated eigenvectors
+ to update the system.
+
+ LDS (input) INTEGER
+ The leading dimension of S. LDS >= max(1,K).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --dlamda;
+ --q2;
+ --indx;
+ --ctot;
+ --w;
+ --s;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*k < 0) {
+ *info = -1;
+ } else if (*n < *k) {
+ *info = -2;
+ } else if (*ldq < max(1,*n)) {
+ *info = -6;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED3", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 0) {
+ return 0;
+ }
+
+/*
+ Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DLAMDA(I) if it is 1; this makes the subsequent
+ subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DLAMDA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DLAMDA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+ 2*DLAMDA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = slamc3_(&dlamda[i__], &dlamda[i__]) - dlamda[i__];
+/* L10: */
+ }
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ slaed4_(k, &j, &dlamda[1], &w[1], &q[j * q_dim1 + 1], rho, &d__[j],
+ info);
+
+/* If the zero finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ goto L120;
+ }
+/* L20: */
+ }
+
+ if (*k == 1) {
+ goto L110;
+ }
+ if (*k == 2) {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ w[1] = q[j * q_dim1 + 1];
+ w[2] = q[j * q_dim1 + 2];
+ ii = indx[1];
+ q[j * q_dim1 + 1] = w[ii];
+ ii = indx[2];
+ q[j * q_dim1 + 2] = w[ii];
+/* L30: */
+ }
+ goto L110;
+ }
+
+/* Compute updated W. */
+
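+/*
+    Editor's note: after the SLAED4 solves, column J of Q holds the gaps
+    DLAMDA(I) - LAMBDA(J). The loops below recompute the updating vector
+    from these gaps (the Gu/Eisenstat formula), keeping only the signs of
+    the original W, which is first saved into S.
+*/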
+ scopy_(k, &w[1], &c__1, &s[1], &c__1);
+
+/* Initialize W(I) = Q(I,I) */
+
+ i__1 = *ldq + 1;
+ scopy_(k, &q[q_offset], &i__1, &w[1], &c__1);
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L40: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L50: */
+ }
+/* L60: */
+ }
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r__1 = sqrt(-w[i__]);
+ w[i__] = r_sign(&r__1, &s[i__]);
+/* L70: */
+ }
+
+/* Compute eigenvectors of the modified rank-1 modification. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ s[i__] = w[i__] / q[i__ + j * q_dim1];
+/* L80: */
+ }
+ temp = snrm2_(k, &s[1], &c__1);
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ ii = indx[i__];
+ q[i__ + j * q_dim1] = s[ii] / temp;
+/* L90: */
+ }
+/* L100: */
+ }
+
+/* Compute the updated eigenvectors. */
+
+L110:
+
+ n2 = *n - *n1;
+ n12 = ctot[1] + ctot[2];
+ n23 = ctot[2] + ctot[3];
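+
+/*
+    Editor's note: the back-transformation is done in two GEMMs. The
+    lower N2 rows of the updated Q come from the N2-by-N23 block of Q2
+    (type-2 and type-3 columns) times rows CTOT(1)+1 .. CTOT(1)+N23 of
+    the K-by-K secular eigenvector matrix; the upper N1 rows come from
+    the N1-by-N12 block of Q2 (type-1 and type-2 columns) times its
+    first N12 rows.
+*/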
+
+ slacpy_("A", &n23, k, &q[ctot[1] + 1 + q_dim1], ldq, &s[1], &n23);
+ iq2 = *n1 * n12 + 1;
+ if (n23 != 0) {
+ sgemm_("N", "N", &n2, k, &n23, &c_b15, &q2[iq2], &n2, &s[1], &n23, &
+ c_b29, &q[*n1 + 1 + q_dim1], ldq);
+ } else {
+ slaset_("A", &n2, k, &c_b29, &c_b29, &q[*n1 + 1 + q_dim1], ldq);
+ }
+
+ slacpy_("A", &n12, k, &q[q_offset], ldq, &s[1], &n12);
+ if (n12 != 0) {
+ sgemm_("N", "N", n1, k, &n12, &c_b15, &q2[1], n1, &s[1], &n12, &c_b29,
+ &q[q_offset], ldq);
+ } else {
+ slaset_("A", n1, k, &c_b29, &c_b29, &q[q_dim1 + 1], ldq);
+ }
+
+
+L120:
+ return 0;
+
+/* End of SLAED3 */
+
+} /* slaed3_ */
+
+/* Subroutine */ int slaed4_(integer *n, integer *i__, real *d__, real *z__,
+ real *delta, real *rho, real *dlam, integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1;
+
+ /* Local variables */
+ static real a, b, c__;
+ static integer j;
+ static real w;
+ static integer ii;
+ static real dw, zz[3];
+ static integer ip1;
+ static real del, eta, phi, eps, tau, psi;
+ static integer iim1, iip1;
+ static real dphi, dpsi;
+ static integer iter;
+ static real temp, prew, temp1, dltlb, dltub, midpt;
+ static integer niter;
+ static logical swtch;
+ extern /* Subroutine */ int slaed5_(integer *, real *, real *, real *,
+ real *, real *), slaed6_(integer *, logical *, real *, real *,
+ real *, real *, real *, integer *);
+ static logical swtch3;
+ extern doublereal slamch_(char *);
+ static logical orgati;
+ static real erretm, rhoinv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the I-th updated eigenvalue of a symmetric
+ rank-one modification to a diagonal matrix whose elements are
+ given in the array d, assuming that
+
+ D(i) < D(j) for i < j
+
+ and that RHO > 0. This is arranged by the calling routine, and is
+ no loss in generality. The rank-one modified system is thus
+
+ diag( D ) + RHO * Z * Z_transpose.
+
+ where we assume the Euclidean norm of Z is 1.
+
+ The method consists of approximating the rational functions in the
+ secular equation by simpler interpolating rational functions.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The length of all arrays.
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. 1 <= I <= N.
+
+ D (input) REAL array, dimension (N)
+ The original eigenvalues. It is assumed that they are in
+ order, D(I) < D(J) for I < J.
+
+ Z (input) REAL array, dimension (N)
+ The components of the updating vector.
+
+ DELTA (output) REAL array, dimension (N)
+ If N > 2, DELTA contains (D(j) - lambda_I) in its j-th
+ component. If N = 1, then DELTA(1) = 1. If N = 2, see SLAED5
+ for detail. The vector DELTA contains the information necessary
+ to construct the eigenvectors by SLAED3 and SLAED9.
+
+ RHO (input) REAL
+ The scalar in the symmetric updating formula.
+
+ DLAM (output) REAL
+ The computed lambda_I, the I-th updated eigenvalue.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ > 0: if INFO = 1, the updating process failed.
+
+ Internal Parameters
+ ===================
+
+ Logical variable ORGATI (origin-at-i?) is used for distinguishing
+ whether D(i) or D(i+1) is treated as the origin.
+
+ ORGATI = .true. origin at i
+ ORGATI = .false. origin at i+1
+
+ Logical variable SWTCH3 (switch-for-3-poles?) is for noting
+ if we are working with THREE poles!
+
+ MAXIT is the maximum number of iterations allowed for each
+ eigenvalue.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Since this routine is called in an inner loop, we do no argument
+ checking.
+
+ Quick return for N=1 and 2.
+*/
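+
+/*
+    Editor's note: the quantity W evaluated below is the secular function
+    w(tau) = 1/RHO + sum_j Z(j)**2 / DELTA(j), with DELTA(j) = D(j) -
+    (origin + tau); its zero is the updated eigenvalue. PSI and PHI
+    accumulate the pole terms from the lower and upper ends of the
+    spectrum respectively, and their derivatives DPSI and DPHI drive the
+    interpolation-based zero finder.
+*/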
+
+ /* Parameter adjustments */
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ if (*n == 1) {
+
+/* Presumably, I=1 upon entry */
+
+ *dlam = d__[1] + *rho * z__[1] * z__[1];
+ delta[1] = 1.f;
+ return 0;
+ }
+ if (*n == 2) {
+ slaed5_(i__, &d__[1], &z__[1], &delta[1], rho, dlam);
+ return 0;
+ }
+
+/* Compute machine epsilon */
+
+ eps = slamch_("Epsilon");
+ rhoinv = 1.f / *rho;
+
+/* The case I = N */
+
+ if (*i__ == *n) {
+
+/* Initialize some basic variables */
+
+ ii = *n - 1;
+ niter = 1;
+
+/* Calculate initial guess */
+
+ midpt = *rho / 2.f;
+
+/*
+ If ||Z||_2 is not one, then TEMP should be set to
+ RHO * ||Z||_2^2 / TWO
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - midpt;
+/* L10: */
+ }
+
+ psi = 0.f;
+ i__1 = *n - 2;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / delta[j];
+/* L20: */
+ }
+
+ c__ = rhoinv + psi;
+ w = c__ + z__[ii] * z__[ii] / delta[ii] + z__[*n] * z__[*n] / delta[*
+ n];
+
+ if (w <= 0.f) {
+ temp = z__[*n - 1] * z__[*n - 1] / (d__[*n] - d__[*n - 1] + *rho)
+ + z__[*n] * z__[*n] / *rho;
+ if (c__ <= temp) {
+ tau = *rho;
+ } else {
+ del = d__[*n] - d__[*n - 1];
+ a = -c__ * del + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*n]
+ ;
+ b = z__[*n] * z__[*n] * del;
+ if (a < 0.f) {
+ tau = b * 2.f / (sqrt(a * a + b * 4.f * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4.f * c__)) / (c__ * 2.f);
+ }
+ }
+
+/*
+ It can be proved that
+ D(N)+RHO/2 <= LAMBDA(N) < D(N)+TAU <= D(N)+RHO
+*/
+
+ dltlb = midpt;
+ dltub = *rho;
+ } else {
+ del = d__[*n] - d__[*n - 1];
+ a = -c__ * del + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*n];
+ b = z__[*n] * z__[*n] * del;
+ if (a < 0.f) {
+ tau = b * 2.f / (sqrt(a * a + b * 4.f * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4.f * c__)) / (c__ * 2.f);
+ }
+
+/*
+ It can be proved that
+ D(N) < D(N)+TAU < LAMBDA(N) < D(N)+RHO/2
+*/
+
+ dltlb = 0.f;
+ dltub = midpt;
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - tau;
+/* L30: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L40: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / delta[*n];
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8.f + erretm - phi + rhoinv + dabs(tau) * (
+ dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ *dlam = d__[*i__] + tau;
+ goto L250;
+ }
+
+ if (w <= 0.f) {
+ dltlb = dmax(dltlb,tau);
+ } else {
+ dltub = dmin(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ c__ = w - delta[*n - 1] * dpsi - delta[*n] * dphi;
+ a = (delta[*n - 1] + delta[*n]) * w - delta[*n - 1] * delta[*n] * (
+ dpsi + dphi);
+ b = delta[*n - 1] * delta[*n] * w;
+ if (c__ < 0.f) {
+ c__ = dabs(c__);
+ }
+ if (c__ == 0.f) {
+/*
+ ETA = B/A
+ ETA = RHO - TAU
+*/
+ eta = dltub - tau;
+ } else if (a >= 0.f) {
+ eta = (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) / (
+ c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.f) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.f) {
+ eta = (dltub - tau) / 2.f;
+ } else {
+ eta = (dltlb - tau) / 2.f;
+ }
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L50: */
+ }
+
+ tau += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L60: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / delta[*n];
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8.f + erretm - phi + rhoinv + dabs(tau) * (
+ dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Main loop to update the values of the array DELTA */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 30; ++niter) {
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ *dlam = d__[*i__] + tau;
+ goto L250;
+ }
+
+ if (w <= 0.f) {
+ dltlb = dmax(dltlb,tau);
+ } else {
+ dltub = dmin(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ c__ = w - delta[*n - 1] * dpsi - delta[*n] * dphi;
+ a = (delta[*n - 1] + delta[*n]) * w - delta[*n - 1] * delta[*n] *
+ (dpsi + dphi);
+ b = delta[*n - 1] * delta[*n] * w;
+ if (a >= 0.f) {
+ eta = (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) /
+ (c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.f) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.f) {
+ eta = (dltub - tau) / 2.f;
+ } else {
+ eta = (dltlb - tau) / 2.f;
+ }
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L70: */
+ }
+
+ tau += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L80: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / delta[*n];
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8.f + erretm - phi + rhoinv + dabs(tau) *
+ (dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+/* L90: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+ *dlam = d__[*i__] + tau;
+ goto L250;
+
+/* End for the case I = N */
+
+ } else {
+
+/* The case for I < N */
+
+ niter = 1;
+ ip1 = *i__ + 1;
+
+/* Calculate initial guess */
+
+ del = d__[ip1] - d__[*i__];
+ midpt = del / 2.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - midpt;
+/* L100: */
+ }
+
+ psi = 0.f;
+ i__1 = *i__ - 1;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / delta[j];
+/* L110: */
+ }
+
+ phi = 0.f;
+ i__1 = *i__ + 2;
+ for (j = *n; j >= i__1; --j) {
+ phi += z__[j] * z__[j] / delta[j];
+/* L120: */
+ }
+ c__ = rhoinv + psi + phi;
+ w = c__ + z__[*i__] * z__[*i__] / delta[*i__] + z__[ip1] * z__[ip1] /
+ delta[ip1];
+
+ if (w > 0.f) {
+
+/*
+ d(i)< the ith eigenvalue < (d(i)+d(i+1))/2
+
+ We choose d(i) as origin.
+*/
+
+ orgati = TRUE_;
+ a = c__ * del + z__[*i__] * z__[*i__] + z__[ip1] * z__[ip1];
+ b = z__[*i__] * z__[*i__] * del;
+ if (a > 0.f) {
+ tau = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ } else {
+ tau = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) /
+ (c__ * 2.f);
+ }
+ dltlb = 0.f;
+ dltub = midpt;
+ } else {
+
+/*
+ (d(i)+d(i+1))/2 <= the ith eigenvalue < d(i+1)
+
+ We choose d(i+1) as origin.
+*/
+
+ orgati = FALSE_;
+ a = c__ * del - z__[*i__] * z__[*i__] - z__[ip1] * z__[ip1];
+ b = z__[ip1] * z__[ip1] * del;
+ if (a < 0.f) {
+ tau = b * 2.f / (a - sqrt((r__1 = a * a + b * 4.f * c__, dabs(
+ r__1))));
+ } else {
+ tau = -(a + sqrt((r__1 = a * a + b * 4.f * c__, dabs(r__1))))
+ / (c__ * 2.f);
+ }
+ dltlb = -midpt;
+ dltub = 0.f;
+ }
+
+ if (orgati) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - tau;
+/* L130: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[ip1] - tau;
+/* L140: */
+ }
+ }
+ if (orgati) {
+ ii = *i__;
+ } else {
+ ii = *i__ + 1;
+ }
+ iim1 = ii - 1;
+ iip1 = ii + 1;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L150: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.f;
+ phi = 0.f;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / delta[j];
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L160: */
+ }
+
+ w = rhoinv + phi + psi;
+
+/*
+ W is the value of the secular function with
+ its ii-th element removed.
+*/
+
+ swtch3 = FALSE_;
+ if (orgati) {
+ if (w < 0.f) {
+ swtch3 = TRUE_;
+ }
+ } else {
+ if (w > 0.f) {
+ swtch3 = TRUE_;
+ }
+ }
+ if (ii == 1 || ii == *n) {
+ swtch3 = FALSE_;
+ }
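+/*
+   SWTCH3 selects the three-pole interpolation (computed by SLAED6)
+   instead of the standard two-pole step.  The choice is based on the
+   sign of the secular function with its II-th term removed, and the
+   three-pole scheme is never used for the extreme indices II = 1 or
+   II = N, where a third neighbouring pole is not available.
+*/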
+
+ temp = z__[ii] / delta[ii];
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w += temp;
+ erretm = (phi - psi) * 8.f + erretm + rhoinv * 2.f + dabs(temp) * 3.f
+ + dabs(tau) * dw;
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ if (orgati) {
+ *dlam = d__[*i__] + tau;
+ } else {
+ *dlam = d__[ip1] + tau;
+ }
+ goto L250;
+ }
+
+ if (w <= 0.f) {
+ dltlb = dmax(dltlb,tau);
+ } else {
+ dltub = dmin(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ if (! swtch3) {
+ if (orgati) {
+/* Computing 2nd power */
+ r__1 = z__[*i__] / delta[*i__];
+ c__ = w - delta[ip1] * dw - (d__[*i__] - d__[ip1]) * (r__1 *
+ r__1);
+ } else {
+/* Computing 2nd power */
+ r__1 = z__[ip1] / delta[ip1];
+ c__ = w - delta[*i__] * dw - (d__[ip1] - d__[*i__]) * (r__1 *
+ r__1);
+ }
+ a = (delta[*i__] + delta[ip1]) * w - delta[*i__] * delta[ip1] *
+ dw;
+ b = delta[*i__] * delta[ip1] * w;
+ if (c__ == 0.f) {
+ if (a == 0.f) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + delta[ip1] * delta[ip1] *
+ (dpsi + dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + delta[*i__] * delta[*i__] *
+ (dpsi + dphi);
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.f) {
+ eta = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) /
+ (c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ temp = rhoinv + psi + phi;
+ if (orgati) {
+ temp1 = z__[iim1] / delta[iim1];
+ temp1 *= temp1;
+ c__ = temp - delta[iip1] * (dpsi + dphi) - (d__[iim1] - d__[
+ iip1]) * temp1;
+ zz[0] = z__[iim1] * z__[iim1];
+ zz[2] = delta[iip1] * delta[iip1] * (dpsi - temp1 + dphi);
+ } else {
+ temp1 = z__[iip1] / delta[iip1];
+ temp1 *= temp1;
+ c__ = temp - delta[iim1] * (dpsi + dphi) - (d__[iip1] - d__[
+ iim1]) * temp1;
+ zz[0] = delta[iim1] * delta[iim1] * (dpsi + (dphi - temp1));
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ zz[1] = z__[ii] * z__[ii];
+ slaed6_(&niter, &orgati, &c__, &delta[iim1], zz, &w, &eta, info);
+ if (*info != 0) {
+ goto L250;
+ }
+ }
+
+/*
+       Note, eta should be positive if w is negative, and
+       negative otherwise. If roundoff nevertheless produces
+       eta*w > 0, we simply use one Newton step instead, which
+       guarantees eta*w < 0.
+*/
+
+ if (w * eta >= 0.f) {
+ eta = -w / dw;
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.f) {
+ eta = (dltub - tau) / 2.f;
+ } else {
+ eta = (dltlb - tau) / 2.f;
+ }
+ }
+
+ prew = w;
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L180: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L190: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.f;
+ phi = 0.f;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / delta[j];
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L200: */
+ }
+
+ temp = z__[ii] / delta[ii];
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8.f + erretm + rhoinv * 2.f + dabs(temp) * 3.f
+ + (r__1 = tau + eta, dabs(r__1)) * dw;
+
+ swtch = FALSE_;
+ if (orgati) {
+ if (-w > dabs(prew) / 10.f) {
+ swtch = TRUE_;
+ }
+ } else {
+ if (w > dabs(prew) / 10.f) {
+ swtch = TRUE_;
+ }
+ }
+
+ tau += eta;
+
+/* Main loop to update the values of the array DELTA */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 30; ++niter) {
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ if (orgati) {
+ *dlam = d__[*i__] + tau;
+ } else {
+ *dlam = d__[ip1] + tau;
+ }
+ goto L250;
+ }
+
+ if (w <= 0.f) {
+ dltlb = dmax(dltlb,tau);
+ } else {
+ dltub = dmin(dltub,tau);
+ }
+
+/* Calculate the new step */
+
+ if (! swtch3) {
+ if (! swtch) {
+ if (orgati) {
+/* Computing 2nd power */
+ r__1 = z__[*i__] / delta[*i__];
+ c__ = w - delta[ip1] * dw - (d__[*i__] - d__[ip1]) * (
+ r__1 * r__1);
+ } else {
+/* Computing 2nd power */
+ r__1 = z__[ip1] / delta[ip1];
+ c__ = w - delta[*i__] * dw - (d__[ip1] - d__[*i__]) *
+ (r__1 * r__1);
+ }
+ } else {
+ temp = z__[ii] / delta[ii];
+ if (orgati) {
+ dpsi += temp * temp;
+ } else {
+ dphi += temp * temp;
+ }
+ c__ = w - delta[*i__] * dpsi - delta[ip1] * dphi;
+ }
+ a = (delta[*i__] + delta[ip1]) * w - delta[*i__] * delta[ip1]
+ * dw;
+ b = delta[*i__] * delta[ip1] * w;
+ if (c__ == 0.f) {
+ if (a == 0.f) {
+ if (! swtch) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + delta[ip1] *
+ delta[ip1] * (dpsi + dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + delta[*i__] * delta[
+ *i__] * (dpsi + dphi);
+ }
+ } else {
+ a = delta[*i__] * delta[*i__] * dpsi + delta[ip1]
+ * delta[ip1] * dphi;
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.f) {
+ eta = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1))
+ )) / (c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__,
+ dabs(r__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ temp = rhoinv + psi + phi;
+ if (swtch) {
+ c__ = temp - delta[iim1] * dpsi - delta[iip1] * dphi;
+ zz[0] = delta[iim1] * delta[iim1] * dpsi;
+ zz[2] = delta[iip1] * delta[iip1] * dphi;
+ } else {
+ if (orgati) {
+ temp1 = z__[iim1] / delta[iim1];
+ temp1 *= temp1;
+ c__ = temp - delta[iip1] * (dpsi + dphi) - (d__[iim1]
+ - d__[iip1]) * temp1;
+ zz[0] = z__[iim1] * z__[iim1];
+ zz[2] = delta[iip1] * delta[iip1] * (dpsi - temp1 +
+ dphi);
+ } else {
+ temp1 = z__[iip1] / delta[iip1];
+ temp1 *= temp1;
+ c__ = temp - delta[iim1] * (dpsi + dphi) - (d__[iip1]
+ - d__[iim1]) * temp1;
+ zz[0] = delta[iim1] * delta[iim1] * (dpsi + (dphi -
+ temp1));
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ }
+ slaed6_(&niter, &orgati, &c__, &delta[iim1], zz, &w, &eta,
+ info);
+ if (*info != 0) {
+ goto L250;
+ }
+ }
+
+/*
+       Note, eta should be positive if w is negative, and
+       negative otherwise. If roundoff nevertheless produces
+       eta*w > 0, we simply use one Newton step instead, which
+       guarantees eta*w < 0.
+*/
+
+ if (w * eta >= 0.f) {
+ eta = -w / dw;
+ }
+ temp = tau + eta;
+ if (temp > dltub || temp < dltlb) {
+ if (w < 0.f) {
+ eta = (dltub - tau) / 2.f;
+ } else {
+ eta = (dltlb - tau) / 2.f;
+ }
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+/* L210: */
+ }
+
+ tau += eta;
+ prew = w;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / delta[j];
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L220: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.f;
+ phi = 0.f;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / delta[j];
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L230: */
+ }
+
+ temp = z__[ii] / delta[ii];
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8.f + erretm + rhoinv * 2.f + dabs(temp) *
+ 3.f + dabs(tau) * dw;
+ if (w * prew > 0.f && dabs(w) > dabs(prew) / 10.f) {
+ swtch = ! swtch;
+ }
+
+/* L240: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+ if (orgati) {
+ *dlam = d__[*i__] + tau;
+ } else {
+ *dlam = d__[ip1] + tau;
+ }
+
+ }
+
+L250:
+
+ return 0;
+
+/* End of SLAED4 */
+
+} /* slaed4_ */
+
+/* Subroutine */ int slaed5_(integer *i__, real *d__, real *z__, real *delta,
+ real *rho, real *dlam)
+{
+ /* System generated locals */
+ real r__1;
+
+ /* Local variables */
+ static real b, c__, w, del, tau, temp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the I-th eigenvalue of a symmetric rank-one
+ modification of a 2-by-2 diagonal matrix
+
+ diag( D ) + RHO * Z * transpose(Z) .
+
+ The diagonal elements in the array D are assumed to satisfy
+
+ D(i) < D(j) for i < j .
+
+ We also assume RHO > 0 and that the Euclidean norm of the vector
+ Z is one.
+
+ Arguments
+ =========
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. I = 1 or I = 2.
+
+ D (input) REAL array, dimension (2)
+ The original eigenvalues. We assume D(1) < D(2).
+
+ Z (input) REAL array, dimension (2)
+ The components of the updating vector.
+
+ DELTA (output) REAL array, dimension (2)
+ The vector DELTA contains the information necessary
+ to construct the eigenvectors.
+
+ RHO (input) REAL
+ The scalar in the symmetric updating formula.
+
+ DLAM (output) REAL
+ The computed lambda_I, the I-th updated eigenvalue.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ del = d__[2] - d__[1];
+ if (*i__ == 1) {
+ w = *rho * 2.f * (z__[2] * z__[2] - z__[1] * z__[1]) / del + 1.f;
+ if (w > 0.f) {
+ b = del + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[1] * z__[1] * del;
+
+/* B > ZERO, always */
+
+ tau = c__ * 2.f / (b + sqrt((r__1 = b * b - c__ * 4.f, dabs(r__1))
+ ));
+ *dlam = d__[1] + tau;
+ delta[1] = -z__[1] / tau;
+ delta[2] = z__[2] / (del - tau);
+ } else {
+ b = -del + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * del;
+ if (b > 0.f) {
+ tau = c__ * -2.f / (b + sqrt(b * b + c__ * 4.f));
+ } else {
+ tau = (b - sqrt(b * b + c__ * 4.f)) / 2.f;
+ }
+ *dlam = d__[2] + tau;
+ delta[1] = -z__[1] / (del + tau);
+ delta[2] = -z__[2] / tau;
+ }
+ temp = sqrt(delta[1] * delta[1] + delta[2] * delta[2]);
+ delta[1] /= temp;
+ delta[2] /= temp;
+ } else {
+
+/* Now I=2 */
+
+ b = -del + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * del;
+ if (b > 0.f) {
+ tau = (b + sqrt(b * b + c__ * 4.f)) / 2.f;
+ } else {
+ tau = c__ * 2.f / (-b + sqrt(b * b + c__ * 4.f));
+ }
+ *dlam = d__[2] + tau;
+ delta[1] = -z__[1] / (del + tau);
+ delta[2] = -z__[2] / tau;
+ temp = sqrt(delta[1] * delta[1] + delta[2] * delta[2]);
+ delta[1] /= temp;
+ delta[2] /= temp;
+ }
+ return 0;
+
+/* End of SLAED5 */
+
+} /* slaed5_ */
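+/*
+   Illustrative sketch (not part of the LAPACK source): computing both
+   updated eigenvalues of diag(D) + RHO * Z * Z' for a 2-by-2 example
+   with slaed5_.  Assumes the f2c typedefs integer and real used in this
+   file; D must be increasing, RHO > 0 and Z must have unit norm.
+
+       real d[2] = {1.f, 2.f};
+       real z[2] = {.6f, .8f};
+       real rho = .5f, delta[2], lam1, lam2;
+       integer i1 = 1, i2 = 2;
+       slaed5_(&i1, d, z, delta, &rho, &lam1);
+       slaed5_(&i2, d, z, delta, &rho, &lam2);
+
+   The computed values interlace with D:  d[0] < lam1 < d[1] < lam2.
+*/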
+
+/* Subroutine */ int slaed6_(integer *kniter, logical *orgati, real *rho,
+ real *d__, real *z__, real *finit, real *tau, integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2, r__3, r__4;
+
+ /* Local variables */
+ static real a, b, c__, f;
+ static integer i__;
+ static real fc, df, ddf, lbd, eta, ubd, eps, base;
+ static integer iter;
+ static real temp, temp1, temp2, temp3, temp4;
+ static logical scale;
+ static integer niter;
+ static real small1, small2, sminv1, sminv2, dscale[3], sclfac;
+ extern doublereal slamch_(char *);
+ static real zscale[3], erretm, sclinv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ February 2007
+
+
+ Purpose
+ =======
+
+ SLAED6 computes the positive or negative root (closest to the origin)
+ of
+ z(1) z(2) z(3)
+ f(x) = rho + --------- + ---------- + ---------
+ d(1)-x d(2)-x d(3)-x
+
+ It is assumed that
+
+ if ORGATI = .true. the root is between d(2) and d(3);
+ otherwise it is between d(1) and d(2)
+
+ This routine will be called by SLAED4 when necessary. In most cases,
+ the root sought is the smallest in magnitude, though it might not be
+ in some extremely rare situations.
+
+ Arguments
+ =========
+
+ KNITER (input) INTEGER
+ Refer to SLAED4 for its significance.
+
+ ORGATI (input) LOGICAL
+ If ORGATI is true, the needed root is between d(2) and
+ d(3); otherwise it is between d(1) and d(2). See
+ SLAED4 for further details.
+
+ RHO (input) REAL
+ Refer to the equation f(x) above.
+
+ D (input) REAL array, dimension (3)
+ D satisfies d(1) < d(2) < d(3).
+
+ Z (input) REAL array, dimension (3)
+ Each of the elements in z must be positive.
+
+ FINIT (input) REAL
+ The value of f at 0. It is more accurate than the one
+ evaluated inside this routine (if someone wants to do
+ so).
+
+ TAU (output) REAL
+ The root of the equation f(x).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ > 0: if INFO = 1, failure to converge
+
+ Further Details
+ ===============
+
+ 30/06/99: Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ 10/02/03: This version has a few statements commented out for thread safety
+ (machine parameters are computed on each entry). SJH.
+
+ 05/10/06: Modified from a new version of Ren-Cang Li, use
+ Gragg-Thornton-Warner cubic convergent scheme for better stability.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*orgati) {
+ lbd = d__[2];
+ ubd = d__[3];
+ } else {
+ lbd = d__[1];
+ ubd = d__[2];
+ }
+ if (*finit < 0.f) {
+ lbd = 0.f;
+ } else {
+ ubd = 0.f;
+ }
+
+ niter = 1;
+ *tau = 0.f;
+ if (*kniter == 2) {
+ if (*orgati) {
+ temp = (d__[3] - d__[2]) / 2.f;
+ c__ = *rho + z__[1] / (d__[1] - d__[2] - temp);
+ a = c__ * (d__[2] + d__[3]) + z__[2] + z__[3];
+ b = c__ * d__[2] * d__[3] + z__[2] * d__[3] + z__[3] * d__[2];
+ } else {
+ temp = (d__[1] - d__[2]) / 2.f;
+ c__ = *rho + z__[3] / (d__[3] - d__[2] - temp);
+ a = c__ * (d__[1] + d__[2]) + z__[1] + z__[2];
+ b = c__ * d__[1] * d__[2] + z__[1] * d__[2] + z__[2] * d__[1];
+ }
+/* Computing MAX */
+ r__1 = dabs(a), r__2 = dabs(b), r__1 = max(r__1,r__2), r__2 = dabs(
+ c__);
+ temp = dmax(r__1,r__2);
+ a /= temp;
+ b /= temp;
+ c__ /= temp;
+ if (c__ == 0.f) {
+ *tau = b / a;
+ } else if (a <= 0.f) {
+ *tau = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) / (
+ c__ * 2.f);
+ } else {
+ *tau = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+ if (*tau < lbd || *tau > ubd) {
+ *tau = (lbd + ubd) / 2.f;
+ }
+ if (d__[1] == *tau || d__[2] == *tau || d__[3] == *tau) {
+ *tau = 0.f;
+ } else {
+ temp = *finit + *tau * z__[1] / (d__[1] * (d__[1] - *tau)) + *tau
+ * z__[2] / (d__[2] * (d__[2] - *tau)) + *tau * z__[3] / (
+ d__[3] * (d__[3] - *tau));
+ if (temp <= 0.f) {
+ lbd = *tau;
+ } else {
+ ubd = *tau;
+ }
+ if (dabs(*finit) <= dabs(temp)) {
+ *tau = 0.f;
+ }
+ }
+ }
+
+/*
+ get machine parameters for possible scaling to avoid overflow
+
+ modified by Sven: parameters SMALL1, SMINV1, SMALL2,
+   SMINV2, EPS are no longer SAVEd between calls but are
+   recomputed on each entry
+*/
+
+ eps = slamch_("Epsilon");
+ base = slamch_("Base");
+ i__1 = (integer) (log(slamch_("SafMin")) / log(base) / 3.f);
+ small1 = pow_ri(&base, &i__1);
+ sminv1 = 1.f / small1;
+ small2 = small1 * small1;
+ sminv2 = sminv1 * sminv1;
+
+/*
+ Determine if scaling of inputs necessary to avoid overflow
+ when computing 1/TEMP**3
+*/
+
+ if (*orgati) {
+/* Computing MIN */
+ r__3 = (r__1 = d__[2] - *tau, dabs(r__1)), r__4 = (r__2 = d__[3] - *
+ tau, dabs(r__2));
+ temp = dmin(r__3,r__4);
+ } else {
+/* Computing MIN */
+ r__3 = (r__1 = d__[1] - *tau, dabs(r__1)), r__4 = (r__2 = d__[2] - *
+ tau, dabs(r__2));
+ temp = dmin(r__3,r__4);
+ }
+ scale = FALSE_;
+ if (temp <= small1) {
+ scale = TRUE_;
+ if (temp <= small2) {
+
+/* Scale up by power of radix nearest 1/SAFMIN**(2/3) */
+
+ sclfac = sminv2;
+ sclinv = small2;
+ } else {
+
+/* Scale up by power of radix nearest 1/SAFMIN**(1/3) */
+
+ sclfac = sminv1;
+ sclinv = small1;
+ }
+
+/* Scaling up safe because D, Z, TAU scaled elsewhere to be O(1) */
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ dscale[i__ - 1] = d__[i__] * sclfac;
+ zscale[i__ - 1] = z__[i__] * sclfac;
+/* L10: */
+ }
+ *tau *= sclfac;
+ lbd *= sclfac;
+ ubd *= sclfac;
+ } else {
+
+/* Copy D and Z to DSCALE and ZSCALE */
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ dscale[i__ - 1] = d__[i__];
+ zscale[i__ - 1] = z__[i__];
+/* L20: */
+ }
+ }
+
+ fc = 0.f;
+ df = 0.f;
+ ddf = 0.f;
+ for (i__ = 1; i__ <= 3; ++i__) {
+ temp = 1.f / (dscale[i__ - 1] - *tau);
+ temp1 = zscale[i__ - 1] * temp;
+ temp2 = temp1 * temp;
+ temp3 = temp2 * temp;
+ fc += temp1 / dscale[i__ - 1];
+ df += temp2;
+ ddf += temp3;
+/* L30: */
+ }
+ f = *finit + *tau * fc;
+
+ if (dabs(f) <= 0.f) {
+ goto L60;
+ }
+ if (f <= 0.f) {
+ lbd = *tau;
+ } else {
+ ubd = *tau;
+ }
+
+/*
+ Iteration begins -- Use Gragg-Thornton-Warner cubic convergent
+ scheme
+
+ It is not hard to see that
+
+ 1) Iterations will go up monotonically
+ if FINIT < 0;
+
+ 2) Iterations will go down monotonically
+ if FINIT > 0.
+*/
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 40; ++niter) {
+
+ if (*orgati) {
+ temp1 = dscale[1] - *tau;
+ temp2 = dscale[2] - *tau;
+ } else {
+ temp1 = dscale[0] - *tau;
+ temp2 = dscale[1] - *tau;
+ }
+ a = (temp1 + temp2) * f - temp1 * temp2 * df;
+ b = temp1 * temp2 * f;
+ c__ = f - (temp1 + temp2) * df + temp1 * temp2 * ddf;
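+/*
+   A, B and C are the coefficients of the quadratic C*ETA**2 - A*ETA + B = 0
+   obtained from an osculatory rational model (a constant plus two poles at
+   the D values nearest the root) fitted to F, DF and DDF at the current
+   TAU; they are then scaled by their largest magnitude so that the
+   cancellation-free root formula below cannot overflow.
+*/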
+/* Computing MAX */
+ r__1 = dabs(a), r__2 = dabs(b), r__1 = max(r__1,r__2), r__2 = dabs(
+ c__);
+ temp = dmax(r__1,r__2);
+ a /= temp;
+ b /= temp;
+ c__ /= temp;
+ if (c__ == 0.f) {
+ eta = b / a;
+ } else if (a <= 0.f) {
+ eta = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) / (
+ c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+ if (f * eta >= 0.f) {
+ eta = -f / df;
+ }
+
+ *tau += eta;
+ if (*tau < lbd || *tau > ubd) {
+ *tau = (lbd + ubd) / 2.f;
+ }
+
+ fc = 0.f;
+ erretm = 0.f;
+ df = 0.f;
+ ddf = 0.f;
+ for (i__ = 1; i__ <= 3; ++i__) {
+ temp = 1.f / (dscale[i__ - 1] - *tau);
+ temp1 = zscale[i__ - 1] * temp;
+ temp2 = temp1 * temp;
+ temp3 = temp2 * temp;
+ temp4 = temp1 / dscale[i__ - 1];
+ fc += temp4;
+ erretm += dabs(temp4);
+ df += temp2;
+ ddf += temp3;
+/* L40: */
+ }
+ f = *finit + *tau * fc;
+ erretm = (dabs(*finit) + dabs(*tau) * erretm) * 8.f + dabs(*tau) * df;
+ if (dabs(f) <= eps * erretm) {
+ goto L60;
+ }
+ if (f <= 0.f) {
+ lbd = *tau;
+ } else {
+ ubd = *tau;
+ }
+/* L50: */
+ }
+ *info = 1;
+L60:
+
+/* Undo scaling */
+
+ if (scale) {
+ *tau *= sclinv;
+ }
+ return 0;
+
+/* End of SLAED6 */
+
+} /* slaed6_ */
+
+/* Subroutine */ int slaed7_(integer *icompq, integer *n, integer *qsiz,
+ integer *tlvls, integer *curlvl, integer *curpbm, real *d__, real *q,
+ integer *ldq, integer *indxq, real *rho, integer *cutpnt, real *
+ qstore, integer *qptr, integer *prmptr, integer *perm, integer *
+ givptr, integer *givcol, real *givnum, real *work, integer *iwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, k, n1, n2, is, iw, iz, iq2, ptr, ldq2, indx, curr,
+ indxc;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer indxp;
+ extern /* Subroutine */ int slaed8_(integer *, integer *, integer *,
+ integer *, real *, real *, integer *, integer *, real *, integer *
+ , real *, real *, real *, integer *, real *, integer *, integer *,
+ integer *, real *, integer *, integer *, integer *), slaed9_(
+ integer *, integer *, integer *, integer *, real *, real *,
+ integer *, real *, real *, real *, real *, integer *, integer *),
+ slaeda_(integer *, integer *, integer *, integer *, integer *,
+ integer *, integer *, integer *, real *, real *, integer *, real *
+ , real *, integer *);
+ static integer idlmda;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slamrg_(
+ integer *, integer *, real *, integer *, integer *, integer *);
+ static integer coltyp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAED7 computes the updated eigensystem of a diagonal
+ matrix after modification by a rank-one symmetric matrix. This
+ routine is used only for the eigenproblem which requires all
+ eigenvalues and optionally eigenvectors of a dense symmetric matrix
+ that has been reduced to tridiagonal form. SLAED1 handles
+ the case in which all eigenvalues and eigenvectors of a symmetric
+ tridiagonal matrix are desired.
+
+ T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out)
+
+ where Z = Q'u, u is a vector of length N with ones in the
+    CUTPNT-th and (CUTPNT+1)-th elements and zeros elsewhere.
+
+ The eigenvectors of the original matrix are stored in Q, and the
+ eigenvalues are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple eigenvalues or if there is a zero in
+    the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine SLAED8.
+
+ The second stage consists of calculating the updated
+ eigenvalues. This is done by finding the roots of the secular
+ equation via the routine SLAED4 (as called by SLAED9).
+ This routine also calculates the eigenvectors of the current
+ problem.
+
+ The final stage consists of computing the updated eigenvectors
+ directly using the updated eigenvalues. The eigenvectors for
+ the current problem are multiplied with the eigenvectors from
+ the overall problem.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ = 0: Compute eigenvalues only.
+ = 1: Compute eigenvectors of original dense symmetric matrix
+ also. On entry, Q contains the orthogonal matrix used
+ to reduce the original matrix to tridiagonal form.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ QSIZ (input) INTEGER
+ The dimension of the orthogonal matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ TLVLS (input) INTEGER
+ The total number of merging levels in the overall divide and
+ conquer tree.
+
+ CURLVL (input) INTEGER
+ The current level in the overall merge routine,
+ 0 <= CURLVL <= TLVLS.
+
+ CURPBM (input) INTEGER
+ The current problem in the current level in the overall
+ merge routine (counting from upper left to lower right).
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the eigenvalues of the rank-1-perturbed matrix.
+ On exit, the eigenvalues of the repaired matrix.
+
+ Q (input/output) REAL array, dimension (LDQ, N)
+ On entry, the eigenvectors of the rank-1-perturbed matrix.
+ On exit, the eigenvectors of the repaired tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (output) INTEGER array, dimension (N)
+ The permutation which will reintegrate the subproblem just
+ solved back into sorted order, i.e., D( INDXQ( I = 1, N ) )
+ will be in ascending order.
+
+ RHO (input) REAL
+ The subdiagonal element used to create the rank-1
+ modification.
+
+ CUTPNT (input) INTEGER
+ Contains the location of the last eigenvalue in the leading
+ sub-matrix. min(1,N) <= CUTPNT <= N.
+
+ QSTORE (input/output) REAL array, dimension (N**2+1)
+ Stores eigenvectors of submatrices encountered during
+ divide and conquer, packed together. QPTR points to
+ beginning of the submatrices.
+
+ QPTR (input/output) INTEGER array, dimension (N+2)
+ List of indices pointing to beginning of submatrices stored
+ in QSTORE. The submatrices are numbered starting at the
+ bottom left of the divide and conquer tree, from left to
+ right and bottom to top.
+
+ PRMPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in PERM a
+ level's permutation is stored. PRMPTR(i+1) - PRMPTR(i)
+ indicates the size of the permutation and also the size of
+ the full, non-deflated problem.
+
+ PERM (input) INTEGER array, dimension (N lg N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in GIVCOL a
+ level's Givens rotations are stored. GIVPTR(i+1) - GIVPTR(i)
+ indicates the number of Givens rotations.
+
+ GIVCOL (input) INTEGER array, dimension (2, N lg N)
+           Each pair of numbers indicates a pair of columns to take part
+ in a Givens rotation.
+
+ GIVNUM (input) REAL array, dimension (2, N lg N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ WORK (workspace) REAL array, dimension (3*N+QSIZ*N)
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --qstore;
+ --qptr;
+ --prmptr;
+ --perm;
+ --givptr;
+ givcol -= 3;
+ givnum -= 3;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*icompq == 1 && *qsiz < *n) {
+ *info = -4;
+ } else if (*ldq < max(1,*n)) {
+ *info = -9;
+ } else if (min(1,*n) > *cutpnt || *n < *cutpnt) {
+ *info = -12;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED7", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in SLAED8 and SLAED9.
+*/
+
+ if (*icompq == 1) {
+ ldq2 = *qsiz;
+ } else {
+ ldq2 = *n;
+ }
+
+ iz = 1;
+ idlmda = iz + *n;
+ iw = idlmda + *n;
+ iq2 = iw + *n;
+ is = iq2 + *n * ldq2;
+
+ indx = 1;
+ indxc = indx + *n;
+ coltyp = indxc + *n;
+ indxp = coltyp + *n;
+
+/*
+ Form the z-vector which consists of the last row of Q_1 and the
+ first row of Q_2.
+*/
+
+ ptr = pow_ii(&c__2, tlvls) + 1;
+ i__1 = *curlvl - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *tlvls - i__;
+ ptr += pow_ii(&c__2, &i__2);
+/* L10: */
+ }
+ curr = ptr + *curpbm;
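+/*
+   CURR is the index of the current node in the divide and conquer tree;
+   QPTR(CURR), PRMPTR(CURR) and GIVPTR(CURR) locate the eigenblock,
+   permutation and Givens rotations stored for that node.
+*/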
+ slaeda_(n, tlvls, curlvl, curpbm, &prmptr[1], &perm[1], &givptr[1], &
+ givcol[3], &givnum[3], &qstore[1], &qptr[1], &work[iz], &work[iz
+ + *n], info);
+
+/*
+ When solving the final problem, we no longer need the stored data,
+ so we will overwrite the data from this level onto the previously
+ used storage space.
+*/
+
+ if (*curlvl == *tlvls) {
+ qptr[curr] = 1;
+ prmptr[curr] = 1;
+ givptr[curr] = 1;
+ }
+
+/* Sort and Deflate eigenvalues. */
+
+ slaed8_(icompq, &k, n, qsiz, &d__[1], &q[q_offset], ldq, &indxq[1], rho,
+ cutpnt, &work[iz], &work[idlmda], &work[iq2], &ldq2, &work[iw], &
+ perm[prmptr[curr]], &givptr[curr + 1], &givcol[(givptr[curr] << 1)
+ + 1], &givnum[(givptr[curr] << 1) + 1], &iwork[indxp], &iwork[
+ indx], info);
+ prmptr[curr + 1] = prmptr[curr] + *n;
+ givptr[curr + 1] += givptr[curr];
+
+/* Solve Secular Equation. */
+
+ if (k != 0) {
+ slaed9_(&k, &c__1, &k, n, &d__[1], &work[is], &k, rho, &work[idlmda],
+ &work[iw], &qstore[qptr[curr]], &k, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (*icompq == 1) {
+ sgemm_("N", "N", qsiz, &k, &k, &c_b15, &work[iq2], &ldq2, &qstore[
+ qptr[curr]], &k, &c_b29, &q[q_offset], ldq);
+ }
+/* Computing 2nd power */
+ i__1 = k;
+ qptr[curr + 1] = qptr[curr] + i__1 * i__1;
+
+/* Prepare the INDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = *n - k;
+ slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &indxq[1]);
+ } else {
+ qptr[curr + 1] = qptr[curr];
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indxq[i__] = i__;
+/* L20: */
+ }
+ }
+
+L30:
+ return 0;
+
+/* End of SLAED7 */
+
+} /* slaed7_ */
+
+/* Subroutine */ int slaed8_(integer *icompq, integer *k, integer *n, integer
+ *qsiz, real *d__, real *q, integer *ldq, integer *indxq, real *rho,
+ integer *cutpnt, real *z__, real *dlamda, real *q2, integer *ldq2,
+ real *w, integer *perm, integer *givptr, integer *givcol, real *
+ givnum, integer *indxp, integer *indx, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, q2_dim1, q2_offset, i__1;
+ real r__1;
+
+ /* Local variables */
+ static real c__;
+ static integer i__, j;
+ static real s, t;
+ static integer k2, n1, n2, jp, n1p1;
+ static real eps, tau, tol;
+ static integer jlam, imax, jmax;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *), sscal_(integer *, real *, real *,
+ integer *), scopy_(integer *, real *, integer *, real *, integer *
+ );
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer isamax_(integer *, real *, integer *);
+ extern /* Subroutine */ int slamrg_(integer *, integer *, real *, integer
+ *, integer *, integer *), slacpy_(char *, integer *, integer *,
+ real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLAED8 merges the two sets of eigenvalues together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ eigenvalues are close together or if there is a tiny element in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ = 0: Compute eigenvalues only.
+ = 1: Compute eigenvectors of original dense symmetric matrix
+ also. On entry, Q contains the orthogonal matrix used
+ to reduce the original matrix to tridiagonal form.
+
+ K (output) INTEGER
+ The number of non-deflated eigenvalues, and the order of the
+ related secular equation.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ QSIZ (input) INTEGER
+ The dimension of the orthogonal matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the eigenvalues of the two submatrices to be
+ combined. On exit, the trailing (N-K) updated eigenvalues
+ (those which were deflated) sorted into increasing order.
+
+ Q (input/output) REAL array, dimension (LDQ,N)
+ If ICOMPQ = 0, Q is not referenced. Otherwise,
+ on entry, Q contains the eigenvectors of the partially solved
+ system which has been previously updated in matrix
+ multiplies with other partially solved eigensystems.
+ On exit, Q contains the trailing (N-K) updated eigenvectors
+ (those which were deflated) in its last N-K columns.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ INDXQ (input) INTEGER array, dimension (N)
+ The permutation which separately sorts the two sub-problems
+ in D into ascending order. Note that elements in the second
+ half of this permutation must first have CUTPNT added to
+ their values in order to be accurate.
+
+ RHO (input/output) REAL
+ On entry, the off-diagonal element associated with the rank-1
+ cut which originally split the two submatrices which are now
+ being recombined.
+ On exit, RHO has been modified to the value required by
+ SLAED3.
+
+ CUTPNT (input) INTEGER
+ The location of the last eigenvalue in the leading
+ sub-matrix. min(1,N) <= CUTPNT <= N.
+
+ Z (input) REAL array, dimension (N)
+ On entry, Z contains the updating vector (the last row of
+ the first sub-eigenvector matrix and the first row of the
+ second sub-eigenvector matrix).
+ On exit, the contents of Z are destroyed by the updating
+ process.
+
+ DLAMDA (output) REAL array, dimension (N)
+ A copy of the first K eigenvalues which will be used by
+ SLAED3 to form the secular equation.
+
+ Q2 (output) REAL array, dimension (LDQ2,N)
+ If ICOMPQ = 0, Q2 is not referenced. Otherwise,
+ a copy of the first K eigenvectors which will be used by
+ SLAED7 in a matrix multiply (SGEMM) to update the new
+ eigenvectors.
+
+ LDQ2 (input) INTEGER
+ The leading dimension of the array Q2. LDQ2 >= max(1,N).
+
+ W (output) REAL array, dimension (N)
+           The first K values of the final deflation-altered z-vector, which
+ will be passed to SLAED3.
+
+ PERM (output) INTEGER array, dimension (N)
+ The permutations (from deflation and sorting) to be applied
+ to each eigenblock.
+
+ GIVPTR (output) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem.
+
+ GIVCOL (output) INTEGER array, dimension (2, N)
+           Each pair of numbers indicates a pair of columns to take part
+ in a Givens rotation.
+
+ GIVNUM (output) REAL array, dimension (2, N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ INDXP (workspace) INTEGER array, dimension (N)
+ The permutation used to place deflated values of D at the end
+ of the array. INDXP(1:K) points to the nondeflated D-values
+ and INDXP(K+1:N) points to the deflated eigenvalues.
+
+ INDX (workspace) INTEGER array, dimension (N)
+ The permutation used to sort the contents of D into ascending
+ order.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --z__;
+ --dlamda;
+ q2_dim1 = *ldq2;
+ q2_offset = 1 + q2_dim1;
+ q2 -= q2_offset;
+ --w;
+ --perm;
+ givcol -= 3;
+ givnum -= 3;
+ --indxp;
+ --indx;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*icompq == 1 && *qsiz < *n) {
+ *info = -4;
+ } else if (*ldq < max(1,*n)) {
+ *info = -7;
+ } else if (*cutpnt < min(1,*n) || *cutpnt > *n) {
+ *info = -10;
+ } else if (*ldq2 < max(1,*n)) {
+ *info = -14;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED8", &i__1);
+ return 0;
+ }
+
+/*
+    Need to initialize GIVPTR to 0 here in case of quick exit
+    to prevent unspecified behavior (usually a segfault)
+    when the IWORK array on entry to *stedc is not zeroed
+    (or at least those IWORK entries which are used in *laed7 for GIVPTR).
+*/
+
+ *givptr = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ n1 = *cutpnt;
+ n2 = *n - n1;
+ n1p1 = n1 + 1;
+
+ if (*rho < 0.f) {
+ sscal_(&n2, &c_b151, &z__[n1p1], &c__1);
+ }
+
+/* Normalize z so that norm(z) = 1 */
+
+ t = 1.f / sqrt(2.f);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ indx[j] = j;
+/* L10: */
+ }
+ sscal_(n, &t, &z__[1], &c__1);
+ *rho = (r__1 = *rho * 2.f, dabs(r__1));
+
+/* Sort the eigenvalues into increasing order */
+
+ i__1 = *n;
+ for (i__ = *cutpnt + 1; i__ <= i__1; ++i__) {
+ indxq[i__] += *cutpnt;
+/* L20: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = d__[indxq[i__]];
+ w[i__] = z__[indxq[i__]];
+/* L30: */
+ }
+ i__ = 1;
+ j = *cutpnt + 1;
+ slamrg_(&n1, &n2, &dlamda[1], &c__1, &c__1, &indx[1]);
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = dlamda[indx[i__]];
+ z__[i__] = w[indx[i__]];
+/* L40: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ imax = isamax_(n, &z__[1], &c__1);
+ jmax = isamax_(n, &d__[1], &c__1);
+ eps = slamch_("Epsilon");
+ tol = eps * 8.f * (r__1 = d__[jmax], dabs(r__1));
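+/*
+   A component Z(J) with RHO*|Z(J)| <= TOL = 8*EPS*max|D| is treated as
+   zero, so the corresponding eigenvalue D(J) is accepted unchanged
+   (deflated) rather than passed to the secular equation solver.
+*/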
+
+/*
+ If the rank-1 modifier is small enough, no more needs to be done
+ except to reorganize Q so that its columns correspond with the
+ elements in D.
+*/
+
+ if (*rho * (r__1 = z__[imax], dabs(r__1)) <= tol) {
+ *k = 0;
+ if (*icompq == 0) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ perm[j] = indxq[indx[j]];
+/* L50: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ perm[j] = indxq[indx[j]];
+ scopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1
+ + 1], &c__1);
+/* L60: */
+ }
+ slacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq);
+ }
+ return 0;
+ }
+
+/*
+ If there are multiple eigenvalues then the problem deflates. Here
+    the number of equal eigenvalues is found. As each equal
+ eigenvalue is found, an elementary reflector is computed to rotate
+ the corresponding eigensubspace so that the corresponding
+ components of Z are zero in this new basis.
+*/
+
+ *k = 0;
+ k2 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*rho * (r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ if (j == *n) {
+ goto L110;
+ }
+ } else {
+ jlam = j;
+ goto L80;
+ }
+/* L70: */
+ }
+L80:
+ ++j;
+ if (j > *n) {
+ goto L100;
+ }
+ if (*rho * (r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ } else {
+
+/* Check if eigenvalues are close enough to allow deflation. */
+
+ s = z__[jlam];
+ c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = slapy2_(&c__, &s);
+ t = d__[j] - d__[jlam];
+ c__ /= tau;
+ s = -s / tau;
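+/*
+   Here TAU = sqrt(z(jlam)**2 + z(j)**2) and (C,S) is the Givens rotation
+   mapping (z(jlam), z(j)) to (0, TAU).  The quantity T*C*S tested below
+   is the off-diagonal element this rotation introduces between the two
+   diagonal entries, so deflation is allowed only when it is negligible.
+*/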
+ if ((r__1 = t * c__ * s, dabs(r__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ z__[j] = tau;
+ z__[jlam] = 0.f;
+
+/* Record the appropriate Givens rotation */
+
+ ++(*givptr);
+ givcol[(*givptr << 1) + 1] = indxq[indx[jlam]];
+ givcol[(*givptr << 1) + 2] = indxq[indx[j]];
+ givnum[(*givptr << 1) + 1] = c__;
+ givnum[(*givptr << 1) + 2] = s;
+ if (*icompq == 1) {
+ srot_(qsiz, &q[indxq[indx[jlam]] * q_dim1 + 1], &c__1, &q[
+ indxq[indx[j]] * q_dim1 + 1], &c__1, &c__, &s);
+ }
+ t = d__[jlam] * c__ * c__ + d__[j] * s * s;
+ d__[j] = d__[jlam] * s * s + d__[j] * c__ * c__;
+ d__[jlam] = t;
+ --k2;
+ i__ = 1;
+L90:
+ if (k2 + i__ <= *n) {
+ if (d__[jlam] < d__[indxp[k2 + i__]]) {
+ indxp[k2 + i__ - 1] = indxp[k2 + i__];
+ indxp[k2 + i__] = jlam;
+ ++i__;
+ goto L90;
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ jlam = j;
+ } else {
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+ jlam = j;
+ }
+ }
+ goto L80;
+L100:
+
+/* Record the last eigenvalue. */
+
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+
+L110:
+
+/*
+ Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+ and Q2 respectively. The eigenvalues/vectors which were not
+ deflated go into the first K slots of DLAMDA and Q2 respectively,
+ while those which were deflated go into the last N - K slots.
+*/
+
+ if (*icompq == 0) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ jp = indxp[j];
+ dlamda[j] = d__[jp];
+ perm[j] = indxq[indx[jp]];
+/* L120: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ jp = indxp[j];
+ dlamda[j] = d__[jp];
+ perm[j] = indxq[indx[jp]];
+ scopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1 + 1]
+ , &c__1);
+/* L130: */
+ }
+ }
+
+/*
+ The deflated eigenvalues and their corresponding vectors go back
+ into the last N - K slots of D and Q respectively.
+*/
+
+ if (*k < *n) {
+ if (*icompq == 0) {
+ i__1 = *n - *k;
+ scopy_(&i__1, &dlamda[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ } else {
+ i__1 = *n - *k;
+ scopy_(&i__1, &dlamda[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ i__1 = *n - *k;
+ slacpy_("A", qsiz, &i__1, &q2[(*k + 1) * q2_dim1 + 1], ldq2, &q[(*
+ k + 1) * q_dim1 + 1], ldq);
+ }
+ }
+
+ return 0;
+
+/* End of SLAED8 */
+
+} /* slaed8_ */
+
+/* Subroutine */ int slaed9_(integer *k, integer *kstart, integer *kstop,
+ integer *n, real *d__, real *q, integer *ldq, real *rho, real *dlamda,
+ real *w, real *s, integer *lds, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, s_dim1, s_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j;
+ static real temp;
+ extern doublereal snrm2_(integer *, real *, integer *);
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), slaed4_(integer *, integer *, real *, real *, real *,
+ real *, real *, integer *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAED9 finds the roots of the secular equation, as defined by the
+ values in D, Z, and RHO, between KSTART and KSTOP. It makes the
+ appropriate calls to SLAED4 and then stores the new matrix of
+ eigenvectors for use in calculating the next level of Z vectors.
+
+ Arguments
+ =========
+
+ K (input) INTEGER
+ The number of terms in the rational function to be solved by
+ SLAED4. K >= 0.
+
+ KSTART (input) INTEGER
+ KSTOP (input) INTEGER
+ The updated eigenvalues Lambda(I), KSTART <= I <= KSTOP
+ are to be computed. 1 <= KSTART <= KSTOP <= K.
+
+ N (input) INTEGER
+ The number of rows and columns in the Q matrix.
+           N >= K (deflation may result in N > K).
+
+ D (output) REAL array, dimension (N)
+ D(I) contains the updated eigenvalues
+ for KSTART <= I <= KSTOP.
+
+ Q (workspace) REAL array, dimension (LDQ,N)
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max( 1, N ).
+
+ RHO (input) REAL
+ The value of the parameter in the rank one update equation.
+ RHO >= 0 required.
+
+ DLAMDA (input) REAL array, dimension (K)
+ The first K elements of this array contain the old roots
+ of the deflated updating problem. These are the poles
+ of the secular equation.
+
+ W (input) REAL array, dimension (K)
+ The first K elements of this array contain the components
+ of the deflation-adjusted updating vector.
+
+ S (output) REAL array, dimension (LDS, K)
+ Will contain the eigenvectors of the repaired matrix which
+ will be stored for subsequent Z vector calculation and
+ multiplied by the previously accumulated eigenvectors
+ to update the system.
+
+ LDS (input) INTEGER
+ The leading dimension of S. LDS >= max( 1, K ).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --dlamda;
+ --w;
+ s_dim1 = *lds;
+ s_offset = 1 + s_dim1;
+ s -= s_offset;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*k < 0) {
+ *info = -1;
+ } else if (*kstart < 1 || *kstart > max(1,*k)) {
+ *info = -2;
+ } else if (max(1,*kstop) < *kstart || *kstop > max(1,*k)) {
+ *info = -3;
+ } else if (*n < *k) {
+ *info = -4;
+ } else if (*ldq < max(1,*k)) {
+ *info = -7;
+ } else if (*lds < max(1,*k)) {
+ *info = -12;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAED9", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 0) {
+ return 0;
+ }
+
+/*
+ Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DLAMDA(I) if it is 1; this makes the subsequent
+ subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DLAMDA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DLAMDA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+     2*DLAMDA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = slamc3_(&dlamda[i__], &dlamda[i__]) - dlamda[i__];
+/* L10: */
+ }
+
+ i__1 = *kstop;
+ for (j = *kstart; j <= i__1; ++j) {
+ slaed4_(k, &j, &dlamda[1], &w[1], &q[j * q_dim1 + 1], rho, &d__[j],
+ info);
+
+/* If the zero finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ goto L120;
+ }
+/* L20: */
+ }
+
+ if (*k == 1 || *k == 2) {
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *k;
+ for (j = 1; j <= i__2; ++j) {
+ s[j + i__ * s_dim1] = q[j + i__ * q_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+ goto L120;
+ }
+
+/* Compute updated W. */
+
+ scopy_(k, &w[1], &c__1, &s[s_offset], &c__1);
+
+/* Initialize W(I) = Q(I,I) */
+
+ i__1 = *ldq + 1;
+ scopy_(k, &q[q_offset], &i__1, &w[1], &c__1);
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L50: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ w[i__] *= q[i__ + j * q_dim1] / (dlamda[i__] - dlamda[j]);
+/* L60: */
+ }
+/* L70: */
+ }
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r__1 = sqrt(-w[i__]);
+ w[i__] = r_sign(&r__1, &s[i__ + s_dim1]);
+/* L80: */
+ }
+
+/* Compute eigenvectors of the modified rank-1 modification. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ q[i__ + j * q_dim1] = w[i__] / q[i__ + j * q_dim1];
+/* L90: */
+ }
+ temp = snrm2_(k, &q[j * q_dim1 + 1], &c__1);
+ i__2 = *k;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ s[i__ + j * s_dim1] = q[i__ + j * q_dim1] / temp;
+/* L100: */
+ }
+/* L110: */
+ }
+
+L120:
+ return 0;
+
+/* End of SLAED9 */
+
+} /* slaed9_ */
+
+/* Subroutine */ int slaeda_(integer *n, integer *tlvls, integer *curlvl,
+ integer *curpbm, integer *prmptr, integer *perm, integer *givptr,
+ integer *givcol, real *givnum, real *q, integer *qptr, real *z__,
+ real *ztemp, integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k, mid, ptr, curr;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *);
+ static integer bsiz1, bsiz2, psiz1, psiz2, zptr1;
+ extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *,
+ real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *),
+ xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLAEDA computes the Z vector corresponding to the merge step in the
+ CURLVLth step of the merge process with TLVLS steps for the CURPBMth
+ problem.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ TLVLS (input) INTEGER
+ The total number of merging levels in the overall divide and
+ conquer tree.
+
+ CURLVL (input) INTEGER
+ The current level in the overall merge routine,
+ 0 <= curlvl <= tlvls.
+
+ CURPBM (input) INTEGER
+ The current problem in the current level in the overall
+ merge routine (counting from upper left to lower right).
+
+ PRMPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in PERM a
+ level's permutation is stored. PRMPTR(i+1) - PRMPTR(i)
+ indicates the size of the permutation and incidentally the
+ size of the full, non-deflated problem.
+
+ PERM (input) INTEGER array, dimension (N lg N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in GIVCOL a
+ level's Givens rotations are stored. GIVPTR(i+1) - GIVPTR(i)
+ indicates the number of Givens rotations.
+
+ GIVCOL (input) INTEGER array, dimension (2, N lg N)
+           Each pair of numbers indicates a pair of columns to take part
+ in a Givens rotation.
+
+ GIVNUM (input) REAL array, dimension (2, N lg N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ Q (input) REAL array, dimension (N**2)
+ Contains the square eigenblocks from previous levels, the
+ starting positions for blocks are given by QPTR.
+
+ QPTR (input) INTEGER array, dimension (N+2)
+ Contains a list of pointers which indicate where in Q an
+ eigenblock is stored. SQRT( QPTR(i+1) - QPTR(i) ) indicates
+ the size of the block.
+
+ Z (output) REAL array, dimension (N)
+ On output this vector contains the updating vector (the last
+ row of the first sub-eigenvector matrix and the first row of
+ the second sub-eigenvector matrix).
+
+ ZTEMP (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --ztemp;
+ --z__;
+ --qptr;
+ --q;
+ givnum -= 3;
+ givcol -= 3;
+ --givptr;
+ --perm;
+ --prmptr;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -1;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAEDA", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine location of first number in second half. */
+
+ mid = *n / 2 + 1;
+
+/* Gather last/first rows of appropriate eigenblocks into center of Z */
+
+ ptr = 1;
+
+/*
+ Determine location of lowest level subproblem in the full storage
+ scheme
+*/
+
+ i__1 = *curlvl - 1;
+ curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1;
+
+/*
+ Determine size of these matrices. We add HALF to the value of
+ the SQRT in case the machine underestimates one of these square
+ roots.
+*/
+
+ bsiz1 = (integer) (sqrt((real) (qptr[curr + 1] - qptr[curr])) + .5f);
+ bsiz2 = (integer) (sqrt((real) (qptr[curr + 2] - qptr[curr + 1])) + .5f);
+ i__1 = mid - bsiz1 - 1;
+ for (k = 1; k <= i__1; ++k) {
+ z__[k] = 0.f;
+/* L10: */
+ }
+ scopy_(&bsiz1, &q[qptr[curr] + bsiz1 - 1], &bsiz1, &z__[mid - bsiz1], &
+ c__1);
+ scopy_(&bsiz2, &q[qptr[curr + 1]], &bsiz2, &z__[mid], &c__1);
+ i__1 = *n;
+ for (k = mid + bsiz2; k <= i__1; ++k) {
+ z__[k] = 0.f;
+/* L20: */
+ }
+
+/*
+ Loop through remaining levels 1 -> CURLVL applying the Givens
+ rotations and permutation and then multiplying the center matrices
+ against the current Z.
+*/
+
+ ptr = pow_ii(&c__2, tlvls) + 1;
+ i__1 = *curlvl - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = *curlvl - k;
+ i__3 = *curlvl - k - 1;
+ curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) -
+ 1;
+ psiz1 = prmptr[curr + 1] - prmptr[curr];
+ psiz2 = prmptr[curr + 2] - prmptr[curr + 1];
+ zptr1 = mid - psiz1;
+
+/* Apply Givens at CURR and CURR+1 */
+
+ i__2 = givptr[curr + 1] - 1;
+ for (i__ = givptr[curr]; i__ <= i__2; ++i__) {
+ srot_(&c__1, &z__[zptr1 + givcol[(i__ << 1) + 1] - 1], &c__1, &
+ z__[zptr1 + givcol[(i__ << 1) + 2] - 1], &c__1, &givnum[(
+ i__ << 1) + 1], &givnum[(i__ << 1) + 2]);
+/* L30: */
+ }
+ i__2 = givptr[curr + 2] - 1;
+ for (i__ = givptr[curr + 1]; i__ <= i__2; ++i__) {
+ srot_(&c__1, &z__[mid - 1 + givcol[(i__ << 1) + 1]], &c__1, &z__[
+ mid - 1 + givcol[(i__ << 1) + 2]], &c__1, &givnum[(i__ <<
+ 1) + 1], &givnum[(i__ << 1) + 2]);
+/* L40: */
+ }
+ psiz1 = prmptr[curr + 1] - prmptr[curr];
+ psiz2 = prmptr[curr + 2] - prmptr[curr + 1];
+ i__2 = psiz1 - 1;
+ for (i__ = 0; i__ <= i__2; ++i__) {
+ ztemp[i__ + 1] = z__[zptr1 + perm[prmptr[curr] + i__] - 1];
+/* L50: */
+ }
+ i__2 = psiz2 - 1;
+ for (i__ = 0; i__ <= i__2; ++i__) {
+ ztemp[psiz1 + i__ + 1] = z__[mid + perm[prmptr[curr + 1] + i__] -
+ 1];
+/* L60: */
+ }
+
+/*
+ Multiply Blocks at CURR and CURR+1
+
+ Determine size of these matrices. We add HALF to the value of
+ the SQRT in case the machine underestimates one of these
+ square roots.
+*/
+
+ bsiz1 = (integer) (sqrt((real) (qptr[curr + 1] - qptr[curr])) + .5f);
+ bsiz2 = (integer) (sqrt((real) (qptr[curr + 2] - qptr[curr + 1])) +
+ .5f);
+ if (bsiz1 > 0) {
+ sgemv_("T", &bsiz1, &bsiz1, &c_b15, &q[qptr[curr]], &bsiz1, &
+ ztemp[1], &c__1, &c_b29, &z__[zptr1], &c__1);
+ }
+ i__2 = psiz1 - bsiz1;
+ scopy_(&i__2, &ztemp[bsiz1 + 1], &c__1, &z__[zptr1 + bsiz1], &c__1);
+ if (bsiz2 > 0) {
+ sgemv_("T", &bsiz2, &bsiz2, &c_b15, &q[qptr[curr + 1]], &bsiz2, &
+ ztemp[psiz1 + 1], &c__1, &c_b29, &z__[mid], &c__1);
+ }
+ i__2 = psiz2 - bsiz2;
+ scopy_(&i__2, &ztemp[psiz1 + bsiz2 + 1], &c__1, &z__[mid + bsiz2], &
+ c__1);
+
+ i__2 = *tlvls - k;
+ ptr += pow_ii(&c__2, &i__2);
+/* L70: */
+ }
+
+ return 0;
+
+/* End of SLAEDA */
+
+} /* slaeda_ */
+
+/* Subroutine */ int slaev2_(real *a, real *b, real *c__, real *rt1, real *
+ rt2, real *cs1, real *sn1)
+{
+ /* System generated locals */
+ real r__1;
+
+ /* Local variables */
+ static real ab, df, cs, ct, tb, sm, tn, rt, adf, acs;
+ static integer sgn1, sgn2;
+ static real acmn, acmx;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAEV2 computes the eigendecomposition of a 2-by-2 symmetric matrix
+ [ A B ]
+ [ B C ].
+ On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+ eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+ eigenvector for RT1, giving the decomposition
+
+ [ CS1 SN1 ] [ A B ] [ CS1 -SN1 ] = [ RT1 0 ]
+ [-SN1 CS1 ] [ B C ] [ SN1 CS1 ] [ 0 RT2 ].
+
+ Arguments
+ =========
+
+ A (input) REAL
+ The (1,1) element of the 2-by-2 matrix.
+
+ B (input) REAL
+ The (1,2) element and the conjugate of the (2,1) element of
+ the 2-by-2 matrix.
+
+ C (input) REAL
+ The (2,2) element of the 2-by-2 matrix.
+
+ RT1 (output) REAL
+ The eigenvalue of larger absolute value.
+
+ RT2 (output) REAL
+ The eigenvalue of smaller absolute value.
+
+ CS1 (output) REAL
+ SN1 (output) REAL
+ The vector (CS1, SN1) is a unit right eigenvector for RT1.
+
+ Further Details
+ ===============
+
+ RT1 is accurate to a few ulps barring over/underflow.
+
+ RT2 may be inaccurate if there is massive cancellation in the
+ determinant A*C-B*B; higher precision or correctly rounded or
+ correctly truncated arithmetic would be needed to compute RT2
+ accurately in all cases.
+
+ CS1 and SN1 are accurate to a few ulps barring over/underflow.
+
+ Overflow is possible only if RT1 is within a factor of 5 of overflow.
+ Underflow is harmless if the input data is 0 or exceeds
+ underflow_threshold / macheps.
+
+ =====================================================================
+
+
+ Compute the eigenvalues
+*/
+
+ sm = *a + *c__;
+ df = *a - *c__;
+ adf = dabs(df);
+ tb = *b + *b;
+ ab = dabs(tb);
+ if (dabs(*a) > dabs(*c__)) {
+ acmx = *a;
+ acmn = *c__;
+ } else {
+ acmx = *c__;
+ acmn = *a;
+ }
+ if (adf > ab) {
+/* Computing 2nd power */
+ r__1 = ab / adf;
+ rt = adf * sqrt(r__1 * r__1 + 1.f);
+ } else if (adf < ab) {
+/* Computing 2nd power */
+ r__1 = adf / ab;
+ rt = ab * sqrt(r__1 * r__1 + 1.f);
+ } else {
+
+/* Includes case AB=ADF=0 */
+
+ rt = ab * sqrt(2.f);
+ }
+ if (sm < 0.f) {
+ *rt1 = (sm - rt) * .5f;
+ sgn1 = -1;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else if (sm > 0.f) {
+ *rt1 = (sm + rt) * .5f;
+ sgn1 = 1;
+
+/*
+ Order of execution important.
+ To get fully accurate smaller eigenvalue,
+ next line needs to be executed in higher precision.
+*/
+
+ *rt2 = acmx / *rt1 * acmn - *b / *rt1 * *b;
+ } else {
+
+/* Includes case RT1 = RT2 = 0 */
+
+ *rt1 = rt * .5f;
+ *rt2 = rt * -.5f;
+ sgn1 = 1;
+ }
+
+/* Compute the eigenvector */
+
+ if (df >= 0.f) {
+ cs = df + rt;
+ sgn2 = 1;
+ } else {
+ cs = df - rt;
+ sgn2 = -1;
+ }
+ acs = dabs(cs);
+ if (acs > ab) {
+ ct = -tb / cs;
+ *sn1 = 1.f / sqrt(ct * ct + 1.f);
+ *cs1 = ct * *sn1;
+ } else {
+ if (ab == 0.f) {
+ *cs1 = 1.f;
+ *sn1 = 0.f;
+ } else {
+ tn = -cs / tb;
+ *cs1 = 1.f / sqrt(tn * tn + 1.f);
+ *sn1 = tn * *cs1;
+ }
+ }
+ if (sgn1 == sgn2) {
+ tn = *cs1;
+ *cs1 = -(*sn1);
+ *sn1 = tn;
+ }
+ return 0;
+
+/* End of SLAEV2 */
+
+} /* slaev2_ */
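+
+/*
+   A minimal usage sketch for slaev2_ (hypothetical caller, not part of
+   LAPACK): the symmetric matrix [ 3 1; 1 3 ] has eigenvalues 4 and 2,
+   and (1,1)/sqrt(2) is a unit eigenvector for the larger one, so the
+   call below is expected to return RT1 = 4, RT2 = 2 and
+   |CS1| = |SN1| = 1/sqrt(2):
+
+       real a = 3.f, b = 1.f, c = 3.f, rt1, rt2, cs1, sn1;
+       slaev2_(&a, &b, &c, &rt1, &rt2, &cs1, &sn1);
+*/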
+
+/* Subroutine */ int slaexc_(logical *wantq, integer *n, real *t, integer *
+ ldt, real *q, integer *ldq, integer *j1, integer *n1, integer *n2,
+ real *work, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, t_dim1, t_offset, i__1;
+ real r__1, r__2, r__3;
+
+ /* Local variables */
+ static real d__[16] /* was [4][4] */;
+ static integer k;
+ static real u[3], x[4] /* was [2][2] */;
+ static integer j2, j3, j4;
+ static real u1[3], u2[3];
+ static integer nd;
+ static real cs, t11, t22, t33, sn, wi1, wi2, wr1, wr2, eps, tau, tau1,
+ tau2;
+ static integer ierr;
+ static real temp;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *);
+ static real scale, dnorm, xnorm;
+ extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real *
+ , real *, real *, real *, real *, real *), slasy2_(logical *,
+ logical *, integer *, integer *, integer *, real *, integer *,
+ real *, integer *, real *, integer *, real *, real *, integer *,
+ real *, integer *);
+ extern doublereal slamch_(char *), slange_(char *, integer *,
+ integer *, real *, integer *, real *);
+ extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *,
+ real *), slacpy_(char *, integer *, integer *, real *, integer *,
+ real *, integer *), slartg_(real *, real *, real *, real *
+ , real *);
+ static real thresh;
+ extern /* Subroutine */ int slarfx_(char *, integer *, integer *, real *,
+ real *, real *, integer *, real *);
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAEXC swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in
+ an upper quasi-triangular matrix T by an orthogonal similarity
+ transformation.
+
+ T must be in Schur canonical form, that is, block upper triangular
+ with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block
+    has its diagonal elements equal and its off-diagonal elements of
+ opposite sign.
+
+ Arguments
+ =========
+
+ WANTQ (input) LOGICAL
+ = .TRUE. : accumulate the transformation in the matrix Q;
+ = .FALSE.: do not accumulate the transformation.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) REAL array, dimension (LDT,N)
+ On entry, the upper quasi-triangular matrix T, in Schur
+ canonical form.
+ On exit, the updated matrix T, again in Schur canonical form.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ Q (input/output) REAL array, dimension (LDQ,N)
+ On entry, if WANTQ is .TRUE., the orthogonal matrix Q.
+ On exit, if WANTQ is .TRUE., the updated matrix Q.
+ If WANTQ is .FALSE., Q is not referenced.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q.
+ LDQ >= 1; and if WANTQ is .TRUE., LDQ >= N.
+
+ J1 (input) INTEGER
+ The index of the first row of the first block T11.
+
+ N1 (input) INTEGER
+ The order of the first block T11. N1 = 0, 1 or 2.
+
+ N2 (input) INTEGER
+ The order of the second block T22. N2 = 0, 1 or 2.
+
+ WORK (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ = 1: the transformed matrix T would be too far from Schur
+ form; the blocks are not swapped and T and Q are
+ unchanged.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0 || *n1 == 0 || *n2 == 0) {
+ return 0;
+ }
+ if (*j1 + *n1 > *n) {
+ return 0;
+ }
+
+ j2 = *j1 + 1;
+ j3 = *j1 + 2;
+ j4 = *j1 + 3;
+
+ if (*n1 == 1 && *n2 == 1) {
+
+/* Swap two 1-by-1 blocks. */
+
+ t11 = t[*j1 + *j1 * t_dim1];
+ t22 = t[j2 + j2 * t_dim1];
+
+/* Determine the transformation to perform the interchange. */
+
+ r__1 = t22 - t11;
+ slartg_(&t[*j1 + j2 * t_dim1], &r__1, &cs, &sn, &temp);
+
+/* Apply transformation to the matrix T. */
+
+ if (j3 <= *n) {
+ i__1 = *n - *j1 - 1;
+ srot_(&i__1, &t[*j1 + j3 * t_dim1], ldt, &t[j2 + j3 * t_dim1],
+ ldt, &cs, &sn);
+ }
+ i__1 = *j1 - 1;
+ srot_(&i__1, &t[*j1 * t_dim1 + 1], &c__1, &t[j2 * t_dim1 + 1], &c__1,
+ &cs, &sn);
+
+ t[*j1 + *j1 * t_dim1] = t22;
+ t[j2 + j2 * t_dim1] = t11;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ srot_(n, &q[*j1 * q_dim1 + 1], &c__1, &q[j2 * q_dim1 + 1], &c__1,
+ &cs, &sn);
+ }
+
+ } else {
+
+/*
+ Swapping involves at least one 2-by-2 block.
+
+ Copy the diagonal block of order N1+N2 to the local array D
+ and compute its norm.
+*/
+
+ nd = *n1 + *n2;
+ slacpy_("Full", &nd, &nd, &t[*j1 + *j1 * t_dim1], ldt, d__, &c__4);
+ dnorm = slange_("Max", &nd, &nd, d__, &c__4, &work[1]);
+
+/*
+ Compute machine-dependent threshold for test for accepting
+ swap.
+*/
+
+ eps = slamch_("P");
+ smlnum = slamch_("S") / eps;
+/* Computing MAX */
+ r__1 = eps * 10.f * dnorm;
+ thresh = dmax(r__1,smlnum);
+
+/* Solve T11*X - X*T22 = scale*T12 for X. */
+
+ slasy2_(&c_false, &c_false, &c_n1, n1, n2, d__, &c__4, &d__[*n1 + 1 +
+ (*n1 + 1 << 2) - 5], &c__4, &d__[(*n1 + 1 << 2) - 4], &c__4, &
+ scale, x, &c__2, &xnorm, &ierr);
+
+/* Swap the adjacent diagonal blocks. */
+
+ k = *n1 + *n1 + *n2 - 3;
+ switch (k) {
+ case 1: goto L10;
+ case 2: goto L20;
+ case 3: goto L30;
+ }
+
+L10:
+
+/*
+ N1 = 1, N2 = 2: generate elementary reflector H so that:
+
+ ( scale, X11, X12 ) H = ( 0, 0, * )
+*/
+
+ u[0] = scale;
+ u[1] = x[0];
+ u[2] = x[2];
+ slarfg_(&c__3, &u[2], u, &c__1, &tau);
+ u[2] = 1.f;
+ t11 = t[*j1 + *j1 * t_dim1];
+
+/* Perform swap provisionally on diagonal block in D. */
+
+ slarfx_("L", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+ slarfx_("R", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+
+/*
+ Test whether to reject swap.
+
+ Computing MAX
+*/
+ r__2 = dabs(d__[2]), r__3 = dabs(d__[6]), r__2 = max(r__2,r__3), r__3
+ = (r__1 = d__[10] - t11, dabs(r__1));
+ if (dmax(r__2,r__3) > thresh) {
+ goto L50;
+ }
+
+/* Accept swap: apply transformation to the entire matrix T. */
+
+ i__1 = *n - *j1 + 1;
+ slarfx_("L", &c__3, &i__1, u, &tau, &t[*j1 + *j1 * t_dim1], ldt, &
+ work[1]);
+ slarfx_("R", &j2, &c__3, u, &tau, &t[*j1 * t_dim1 + 1], ldt, &work[1]);
+
+ t[j3 + *j1 * t_dim1] = 0.f;
+ t[j3 + j2 * t_dim1] = 0.f;
+ t[j3 + j3 * t_dim1] = t11;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ slarfx_("R", n, &c__3, u, &tau, &q[*j1 * q_dim1 + 1], ldq, &work[
+ 1]);
+ }
+ goto L40;
+
+L20:
+
+/*
+ N1 = 2, N2 = 1: generate elementary reflector H so that:
+
+ H ( -X11 ) = ( * )
+ ( -X21 ) = ( 0 )
+ ( scale ) = ( 0 )
+*/
+
+ u[0] = -x[0];
+ u[1] = -x[1];
+ u[2] = scale;
+ slarfg_(&c__3, u, &u[1], &c__1, &tau);
+ u[0] = 1.f;
+ t33 = t[j3 + j3 * t_dim1];
+
+/* Perform swap provisionally on diagonal block in D. */
+
+ slarfx_("L", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+ slarfx_("R", &c__3, &c__3, u, &tau, d__, &c__4, &work[1]);
+
+/*
+ Test whether to reject swap.
+
+ Computing MAX
+*/
+ r__2 = dabs(d__[1]), r__3 = dabs(d__[2]), r__2 = max(r__2,r__3), r__3
+ = (r__1 = d__[0] - t33, dabs(r__1));
+ if (dmax(r__2,r__3) > thresh) {
+ goto L50;
+ }
+
+/* Accept swap: apply transformation to the entire matrix T. */
+
+ slarfx_("R", &j3, &c__3, u, &tau, &t[*j1 * t_dim1 + 1], ldt, &work[1]);
+ i__1 = *n - *j1;
+ slarfx_("L", &c__3, &i__1, u, &tau, &t[*j1 + j2 * t_dim1], ldt, &work[
+ 1]);
+
+ t[*j1 + *j1 * t_dim1] = t33;
+ t[j2 + *j1 * t_dim1] = 0.f;
+ t[j3 + *j1 * t_dim1] = 0.f;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ slarfx_("R", n, &c__3, u, &tau, &q[*j1 * q_dim1 + 1], ldq, &work[
+ 1]);
+ }
+ goto L40;
+
+L30:
+
+/*
+ N1 = 2, N2 = 2: generate elementary reflectors H(1) and H(2) so
+ that:
+
+ H(2) H(1) ( -X11 -X12 ) = ( * * )
+ ( -X21 -X22 ) ( 0 * )
+ ( scale 0 ) ( 0 0 )
+ ( 0 scale ) ( 0 0 )
+*/
+
+ u1[0] = -x[0];
+ u1[1] = -x[1];
+ u1[2] = scale;
+ slarfg_(&c__3, u1, &u1[1], &c__1, &tau1);
+ u1[0] = 1.f;
+
+ temp = -tau1 * (x[2] + u1[1] * x[3]);
+ u2[0] = -temp * u1[1] - x[3];
+ u2[1] = -temp * u1[2];
+ u2[2] = scale;
+ slarfg_(&c__3, u2, &u2[1], &c__1, &tau2);
+ u2[0] = 1.f;
+
+/* Perform swap provisionally on diagonal block in D. */
+
+ slarfx_("L", &c__3, &c__4, u1, &tau1, d__, &c__4, &work[1])
+ ;
+ slarfx_("R", &c__4, &c__3, u1, &tau1, d__, &c__4, &work[1])
+ ;
+ slarfx_("L", &c__3, &c__4, u2, &tau2, &d__[1], &c__4, &work[1]);
+ slarfx_("R", &c__4, &c__3, u2, &tau2, &d__[4], &c__4, &work[1]);
+
+/*
+ Test whether to reject swap.
+
+ Computing MAX
+*/
+ r__1 = dabs(d__[2]), r__2 = dabs(d__[6]), r__1 = max(r__1,r__2), r__2
+ = dabs(d__[3]), r__1 = max(r__1,r__2), r__2 = dabs(d__[7]);
+ if (dmax(r__1,r__2) > thresh) {
+ goto L50;
+ }
+
+/* Accept swap: apply transformation to the entire matrix T. */
+
+ i__1 = *n - *j1 + 1;
+ slarfx_("L", &c__3, &i__1, u1, &tau1, &t[*j1 + *j1 * t_dim1], ldt, &
+ work[1]);
+ slarfx_("R", &j4, &c__3, u1, &tau1, &t[*j1 * t_dim1 + 1], ldt, &work[
+ 1]);
+ i__1 = *n - *j1 + 1;
+ slarfx_("L", &c__3, &i__1, u2, &tau2, &t[j2 + *j1 * t_dim1], ldt, &
+ work[1]);
+ slarfx_("R", &j4, &c__3, u2, &tau2, &t[j2 * t_dim1 + 1], ldt, &work[1]
+ );
+
+ t[j3 + *j1 * t_dim1] = 0.f;
+ t[j3 + j2 * t_dim1] = 0.f;
+ t[j4 + *j1 * t_dim1] = 0.f;
+ t[j4 + j2 * t_dim1] = 0.f;
+
+ if (*wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ slarfx_("R", n, &c__3, u1, &tau1, &q[*j1 * q_dim1 + 1], ldq, &
+ work[1]);
+ slarfx_("R", n, &c__3, u2, &tau2, &q[j2 * q_dim1 + 1], ldq, &work[
+ 1]);
+ }
+
+L40:
+
+ if (*n2 == 2) {
+
+/* Standardize new 2-by-2 block T11 */
+
+ slanv2_(&t[*j1 + *j1 * t_dim1], &t[*j1 + j2 * t_dim1], &t[j2 + *
+ j1 * t_dim1], &t[j2 + j2 * t_dim1], &wr1, &wi1, &wr2, &
+ wi2, &cs, &sn);
+ i__1 = *n - *j1 - 1;
+ srot_(&i__1, &t[*j1 + (*j1 + 2) * t_dim1], ldt, &t[j2 + (*j1 + 2)
+ * t_dim1], ldt, &cs, &sn);
+ i__1 = *j1 - 1;
+ srot_(&i__1, &t[*j1 * t_dim1 + 1], &c__1, &t[j2 * t_dim1 + 1], &
+ c__1, &cs, &sn);
+ if (*wantq) {
+ srot_(n, &q[*j1 * q_dim1 + 1], &c__1, &q[j2 * q_dim1 + 1], &
+ c__1, &cs, &sn);
+ }
+ }
+
+ if (*n1 == 2) {
+
+/* Standardize new 2-by-2 block T22 */
+
+ j3 = *j1 + *n2;
+ j4 = j3 + 1;
+ slanv2_(&t[j3 + j3 * t_dim1], &t[j3 + j4 * t_dim1], &t[j4 + j3 *
+ t_dim1], &t[j4 + j4 * t_dim1], &wr1, &wi1, &wr2, &wi2, &
+ cs, &sn);
+ if (j3 + 2 <= *n) {
+ i__1 = *n - j3 - 1;
+ srot_(&i__1, &t[j3 + (j3 + 2) * t_dim1], ldt, &t[j4 + (j3 + 2)
+ * t_dim1], ldt, &cs, &sn);
+ }
+ i__1 = j3 - 1;
+ srot_(&i__1, &t[j3 * t_dim1 + 1], &c__1, &t[j4 * t_dim1 + 1], &
+ c__1, &cs, &sn);
+ if (*wantq) {
+ srot_(n, &q[j3 * q_dim1 + 1], &c__1, &q[j4 * q_dim1 + 1], &
+ c__1, &cs, &sn);
+ }
+ }
+
+ }
+ return 0;
+
+/* Exit with INFO = 1 if swap was rejected. */
+
+L50:
+ *info = 1;
+ return 0;
+
+/* End of SLAEXC */
+
+} /* slaexc_ */
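+
+/*
+   A minimal usage sketch for slaexc_ (hypothetical caller, not part of
+   LAPACK): swap the two 1-by-1 diagonal blocks of the upper triangular
+   matrix T = [ 1 3; 0 2 ], stored column-major as { 1, 0, 3, 2 }.  On
+   return INFO = 0, T is still upper triangular with its diagonal
+   reordered to (2, 1), and Q holds the accumulated rotation:
+
+       logical wantq = TRUE_;
+       integer n = 2, ldt = 2, ldq = 2, j1 = 1, n1 = 1, n2 = 1, info;
+       real t[4] = { 1.f, 0.f, 3.f, 2.f };
+       real q[4] = { 1.f, 0.f, 0.f, 1.f };
+       real work[2];
+       slaexc_(&wantq, &n, t, &ldt, q, &ldq, &j1, &n1, &n2, work, &info);
+*/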
+
+/* Subroutine */ int slahqr_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real *
+ wi, integer *iloz, integer *ihiz, real *z__, integer *ldz, integer *
+ info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3;
+ real r__1, r__2, r__3, r__4;
+
+ /* Local variables */
+ static integer i__, j, k, l, m;
+ static real s, v[3];
+ static integer i1, i2;
+ static real t1, t2, t3, v2, v3, aa, ab, ba, bb, h11, h12, h21, h22, cs;
+ static integer nh;
+ static real sn;
+ static integer nr;
+ static real tr;
+ static integer nz;
+ static real det, h21s;
+ static integer its;
+ static real ulp, sum, tst, rt1i, rt2i, rt1r, rt2r;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *), scopy_(integer *, real *, integer *,
+ real *, integer *), slanv2_(real *, real *, real *, real *, real *
+ , real *, real *, real *, real *, real *), slabad_(real *, real *)
+ ;
+ extern doublereal slamch_(char *);
+ static real safmin;
+ extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *,
+ real *);
+ static real safmax, rtdisc, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAHQR is an auxiliary routine called by SHSEQR to update the
+ eigenvalues and Schur decomposition already computed by SHSEQR, by
+ dealing with the Hessenberg submatrix in rows and columns ILO to
+ IHI.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper quasi-triangular in
+ rows and columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless
+ ILO = 1). SLAHQR works primarily with the Hessenberg
+ submatrix in rows and columns ILO to IHI, but applies
+ transformations to all of H if WANTT is .TRUE..
+ 1 <= ILO <= max(1,IHI); IHI <= N.
+
+ H (input/output) REAL array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO is zero and if WANTT is .TRUE., H is upper
+ quasi-triangular in rows and columns ILO:IHI, with any
+ 2-by-2 diagonal blocks in standard form. If INFO is zero
+ and WANTT is .FALSE., the contents of H are unspecified on
+ exit. The output state of H if INFO is nonzero is given
+ below under the description of INFO.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH >= max(1,N).
+
+ WR (output) REAL array, dimension (N)
+ WI (output) REAL array, dimension (N)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues ILO to IHI are stored in the corresponding
+ elements of WR and WI. If two eigenvalues are computed as a
+ complex conjugate pair, they are stored in consecutive
+ elements of WR and WI, say the i-th and (i+1)th, with
+ WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the
+ eigenvalues are stored in the same order as on the diagonal
+ of the Schur form returned in H, with WR(i) = H(i,i), and, if
+ H(i:i+1,i:i+1) is a 2-by-2 diagonal block,
+ WI(i) = sqrt(H(i+1,i)*H(i,i+1)) and WI(i+1) = -WI(i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
+
+ Z (input/output) REAL array, dimension (LDZ,N)
+ If WANTZ is .TRUE., on entry Z must contain the current
+ matrix Z of transformations accumulated by SHSEQR, and on
+ exit Z has been updated; transformations are applied only to
+ the submatrix Z(ILOZ:IHIZ,ILO:IHI).
+ If WANTZ is .FALSE., Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: If INFO = i, SLAHQR failed to compute all the
+ eigenvalues ILO to IHI in a total of 30 iterations
+ per eigenvalue; elements i+1:ihi of WR and WI
+ contain those eigenvalues which have been
+ successfully computed.
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the
+ eigenvalues of the upper Hessenberg matrix rows
+                     and columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+ (*) (initial value of H)*U = U*(final value of H)
+                     where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+ (final value of Z) = (initial value of Z)*U
+ where U is the orthogonal matrix in (*)
+ (regardless of the value of WANTT.)
+
+ Further Details
+ ===============
+
+ 02-96 Based on modifications by
+ David Day, Sandia National Laboratory, USA
+
+ 12-04 Further modifications by
+ Ralph Byers, University of Kansas, USA
+ This is a modified version of SLAHQR from LAPACK version 3.0.
+ It is (1) more robust against overflow and underflow and
+ (2) adopts the more conservative Ahues & Tisseur stopping
+ criterion (LAWN 122, 1997).
+
+ =========================================================
+*/
+
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*ilo == *ihi) {
+ wr[*ilo] = h__[*ilo + *ilo * h_dim1];
+ wi[*ilo] = 0.f;
+ return 0;
+ }
+
+/* ==== clear out the trash ==== */
+ i__1 = *ihi - 3;
+ for (j = *ilo; j <= i__1; ++j) {
+ h__[j + 2 + j * h_dim1] = 0.f;
+ h__[j + 3 + j * h_dim1] = 0.f;
+/* L10: */
+ }
+ if (*ilo <= *ihi - 2) {
+ h__[*ihi + (*ihi - 2) * h_dim1] = 0.f;
+ }
+
+ nh = *ihi - *ilo + 1;
+ nz = *ihiz - *iloz + 1;
+
+/* Set machine-dependent constants for the stopping criterion. */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) nh / ulp);
+
+/*
+ I1 and I2 are the indices of the first row and last column of H
+ to which transformations must be applied. If eigenvalues only are
+ being computed, I1 and I2 are set inside the main loop.
+*/
+
+ if (*wantt) {
+ i1 = 1;
+ i2 = *n;
+ }
+
+/*
+ The main loop begins here. I is the loop index and decreases from
+ IHI to ILO in steps of 1 or 2. Each iteration of the loop works
+ with the active submatrix in rows and columns L to I.
+ Eigenvalues I+1 to IHI have already converged. Either L = ILO or
+ H(L,L-1) is negligible so that the matrix splits.
+*/
+
+ i__ = *ihi;
+L20:
+ l = *ilo;
+ if (i__ < *ilo) {
+ goto L160;
+ }
+
+/*
+ Perform QR iterations on rows and columns ILO to I until a
+ submatrix of order 1 or 2 splits off at the bottom because a
+ subdiagonal element has become negligible.
+*/
+
+ for (its = 0; its <= 30; ++its) {
+
+/* Look for a single small subdiagonal element. */
+
+ i__1 = l + 1;
+ for (k = i__; k >= i__1; --k) {
+ if ((r__1 = h__[k + (k - 1) * h_dim1], dabs(r__1)) <= smlnum) {
+ goto L40;
+ }
+ tst = (r__1 = h__[k - 1 + (k - 1) * h_dim1], dabs(r__1)) + (r__2 =
+ h__[k + k * h_dim1], dabs(r__2));
+ if (tst == 0.f) {
+ if (k - 2 >= *ilo) {
+ tst += (r__1 = h__[k - 1 + (k - 2) * h_dim1], dabs(r__1));
+ }
+ if (k + 1 <= *ihi) {
+ tst += (r__1 = h__[k + 1 + k * h_dim1], dabs(r__1));
+ }
+ }
+/*
+ ==== The following is a conservative small subdiagonal
+ . deflation criterion due to Ahues & Tisseur (LAWN 122,
+ . 1997). It has better mathematical foundation and
+ . improves accuracy in some cases. ====
+*/
+ if ((r__1 = h__[k + (k - 1) * h_dim1], dabs(r__1)) <= ulp * tst) {
+/* Computing MAX */
+ r__3 = (r__1 = h__[k + (k - 1) * h_dim1], dabs(r__1)), r__4 =
+ (r__2 = h__[k - 1 + k * h_dim1], dabs(r__2));
+ ab = dmax(r__3,r__4);
+/* Computing MIN */
+ r__3 = (r__1 = h__[k + (k - 1) * h_dim1], dabs(r__1)), r__4 =
+ (r__2 = h__[k - 1 + k * h_dim1], dabs(r__2));
+ ba = dmin(r__3,r__4);
+/* Computing MAX */
+ r__3 = (r__1 = h__[k + k * h_dim1], dabs(r__1)), r__4 = (r__2
+ = h__[k - 1 + (k - 1) * h_dim1] - h__[k + k * h_dim1],
+ dabs(r__2));
+ aa = dmax(r__3,r__4);
+/* Computing MIN */
+ r__3 = (r__1 = h__[k + k * h_dim1], dabs(r__1)), r__4 = (r__2
+ = h__[k - 1 + (k - 1) * h_dim1] - h__[k + k * h_dim1],
+ dabs(r__2));
+ bb = dmin(r__3,r__4);
+ s = aa + ab;
+/* Computing MAX */
+ r__1 = smlnum, r__2 = ulp * (bb * (aa / s));
+ if (ba * (ab / s) <= dmax(r__1,r__2)) {
+ goto L40;
+ }
+ }
+/* L30: */
+ }
+L40:
+ l = k;
+ if (l > *ilo) {
+
+/* H(L,L-1) is negligible */
+
+ h__[l + (l - 1) * h_dim1] = 0.f;
+ }
+
+/* Exit from loop if a submatrix of order 1 or 2 has split off. */
+
+ if (l >= i__ - 1) {
+ goto L150;
+ }
+
+/*
+ Now the active submatrix is in rows and columns L to I. If
+ eigenvalues only are being computed, only the active submatrix
+ need be transformed.
+*/
+
+ if (! (*wantt)) {
+ i1 = l;
+ i2 = i__;
+ }
+
+ if (its == 10) {
+
+/* Exceptional shift. */
+
+ s = (r__1 = h__[l + 1 + l * h_dim1], dabs(r__1)) + (r__2 = h__[l
+ + 2 + (l + 1) * h_dim1], dabs(r__2));
+ h11 = s * .75f + h__[l + l * h_dim1];
+ h12 = s * -.4375f;
+ h21 = s;
+ h22 = h11;
+ } else if (its == 20) {
+
+/* Exceptional shift. */
+
+ s = (r__1 = h__[i__ + (i__ - 1) * h_dim1], dabs(r__1)) + (r__2 =
+ h__[i__ - 1 + (i__ - 2) * h_dim1], dabs(r__2));
+ h11 = s * .75f + h__[i__ + i__ * h_dim1];
+ h12 = s * -.4375f;
+ h21 = s;
+ h22 = h11;
+ } else {
+
+/*
+ Prepare to use Francis' double shift
+ (i.e. 2nd degree generalized Rayleigh quotient)
+*/
+
+ h11 = h__[i__ - 1 + (i__ - 1) * h_dim1];
+ h21 = h__[i__ + (i__ - 1) * h_dim1];
+ h12 = h__[i__ - 1 + i__ * h_dim1];
+ h22 = h__[i__ + i__ * h_dim1];
+ }
+ s = dabs(h11) + dabs(h12) + dabs(h21) + dabs(h22);
+ if (s == 0.f) {
+ rt1r = 0.f;
+ rt1i = 0.f;
+ rt2r = 0.f;
+ rt2i = 0.f;
+ } else {
+ h11 /= s;
+ h21 /= s;
+ h12 /= s;
+ h22 /= s;
+ tr = (h11 + h22) / 2.f;
+ det = (h11 - tr) * (h22 - tr) - h12 * h21;
+ rtdisc = sqrt((dabs(det)));
+ if (det >= 0.f) {
+
+/* ==== complex conjugate shifts ==== */
+
+ rt1r = tr * s;
+ rt2r = rt1r;
+ rt1i = rtdisc * s;
+ rt2i = -rt1i;
+ } else {
+
+/* ==== real shifts (use only one of them) ==== */
+
+ rt1r = tr + rtdisc;
+ rt2r = tr - rtdisc;
+ if ((r__1 = rt1r - h22, dabs(r__1)) <= (r__2 = rt2r - h22,
+ dabs(r__2))) {
+ rt1r *= s;
+ rt2r = rt1r;
+ } else {
+ rt2r *= s;
+ rt1r = rt2r;
+ }
+ rt1i = 0.f;
+ rt2i = 0.f;
+ }
+ }
+
+/* Look for two consecutive small subdiagonal elements. */
+
+ i__1 = l;
+ for (m = i__ - 2; m >= i__1; --m) {
+/*
+ Determine the effect of starting the double-shift QR
+ iteration at row M, and see if this would make H(M,M-1)
+ negligible. (The following uses scaling to avoid
+ overflows and most underflows.)
+*/
+
+ h21s = h__[m + 1 + m * h_dim1];
+ s = (r__1 = h__[m + m * h_dim1] - rt2r, dabs(r__1)) + dabs(rt2i)
+ + dabs(h21s);
+ h21s = h__[m + 1 + m * h_dim1] / s;
+ v[0] = h21s * h__[m + (m + 1) * h_dim1] + (h__[m + m * h_dim1] -
+ rt1r) * ((h__[m + m * h_dim1] - rt2r) / s) - rt1i * (rt2i
+ / s);
+ v[1] = h21s * (h__[m + m * h_dim1] + h__[m + 1 + (m + 1) * h_dim1]
+ - rt1r - rt2r);
+ v[2] = h21s * h__[m + 2 + (m + 1) * h_dim1];
+ s = dabs(v[0]) + dabs(v[1]) + dabs(v[2]);
+ v[0] /= s;
+ v[1] /= s;
+ v[2] /= s;
+ if (m == l) {
+ goto L60;
+ }
+ if ((r__1 = h__[m + (m - 1) * h_dim1], dabs(r__1)) * (dabs(v[1])
+ + dabs(v[2])) <= ulp * dabs(v[0]) * ((r__2 = h__[m - 1 + (
+ m - 1) * h_dim1], dabs(r__2)) + (r__3 = h__[m + m *
+ h_dim1], dabs(r__3)) + (r__4 = h__[m + 1 + (m + 1) *
+ h_dim1], dabs(r__4)))) {
+ goto L60;
+ }
+/* L50: */
+ }
+L60:
+
+/* Double-shift QR step */
+
+ i__1 = i__ - 1;
+ for (k = m; k <= i__1; ++k) {
+
+/*
+ The first iteration of this loop determines a reflection G
+ from the vector V and applies it from left and right to H,
+ thus creating a nonzero bulge below the subdiagonal.
+
+ Each subsequent iteration determines a reflection G to
+ restore the Hessenberg form in the (K-1)th column, and thus
+ chases the bulge one step toward the bottom of the active
+ submatrix. NR is the order of G.
+
+ Computing MIN
+*/
+ i__2 = 3, i__3 = i__ - k + 1;
+ nr = min(i__2,i__3);
+ if (k > m) {
+ scopy_(&nr, &h__[k + (k - 1) * h_dim1], &c__1, v, &c__1);
+ }
+ slarfg_(&nr, v, &v[1], &c__1, &t1);
+ if (k > m) {
+ h__[k + (k - 1) * h_dim1] = v[0];
+ h__[k + 1 + (k - 1) * h_dim1] = 0.f;
+ if (k < i__ - 1) {
+ h__[k + 2 + (k - 1) * h_dim1] = 0.f;
+ }
+ } else if (m > l) {
+/*
+ ==== Use the following instead of
+ . H( K, K-1 ) = -H( K, K-1 ) to
+ . avoid a bug when v(2) and v(3)
+ . underflow. ====
+*/
+ h__[k + (k - 1) * h_dim1] *= 1.f - t1;
+ }
+ v2 = v[1];
+ t2 = t1 * v2;
+ if (nr == 3) {
+ v3 = v[2];
+ t3 = t1 * v3;
+
+/*
+ Apply G from the left to transform the rows of the matrix
+ in columns K to I2.
+*/
+
+ i__2 = i2;
+ for (j = k; j <= i__2; ++j) {
+ sum = h__[k + j * h_dim1] + v2 * h__[k + 1 + j * h_dim1]
+ + v3 * h__[k + 2 + j * h_dim1];
+ h__[k + j * h_dim1] -= sum * t1;
+ h__[k + 1 + j * h_dim1] -= sum * t2;
+ h__[k + 2 + j * h_dim1] -= sum * t3;
+/* L70: */
+ }
+
+/*
+ Apply G from the right to transform the columns of the
+ matrix in rows I1 to min(K+3,I).
+
+ Computing MIN
+*/
+ i__3 = k + 3;
+ i__2 = min(i__3,i__);
+ for (j = i1; j <= i__2; ++j) {
+ sum = h__[j + k * h_dim1] + v2 * h__[j + (k + 1) * h_dim1]
+ + v3 * h__[j + (k + 2) * h_dim1];
+ h__[j + k * h_dim1] -= sum * t1;
+ h__[j + (k + 1) * h_dim1] -= sum * t2;
+ h__[j + (k + 2) * h_dim1] -= sum * t3;
+/* L80: */
+ }
+
+ if (*wantz) {
+
+/* Accumulate transformations in the matrix Z */
+
+ i__2 = *ihiz;
+ for (j = *iloz; j <= i__2; ++j) {
+ sum = z__[j + k * z_dim1] + v2 * z__[j + (k + 1) *
+ z_dim1] + v3 * z__[j + (k + 2) * z_dim1];
+ z__[j + k * z_dim1] -= sum * t1;
+ z__[j + (k + 1) * z_dim1] -= sum * t2;
+ z__[j + (k + 2) * z_dim1] -= sum * t3;
+/* L90: */
+ }
+ }
+ } else if (nr == 2) {
+
+/*
+ Apply G from the left to transform the rows of the matrix
+ in columns K to I2.
+*/
+
+ i__2 = i2;
+ for (j = k; j <= i__2; ++j) {
+ sum = h__[k + j * h_dim1] + v2 * h__[k + 1 + j * h_dim1];
+ h__[k + j * h_dim1] -= sum * t1;
+ h__[k + 1 + j * h_dim1] -= sum * t2;
+/* L100: */
+ }
+
+/*
+ Apply G from the right to transform the columns of the
+ matrix in rows I1 to min(K+3,I).
+*/
+
+ i__2 = i__;
+ for (j = i1; j <= i__2; ++j) {
+ sum = h__[j + k * h_dim1] + v2 * h__[j + (k + 1) * h_dim1]
+ ;
+ h__[j + k * h_dim1] -= sum * t1;
+ h__[j + (k + 1) * h_dim1] -= sum * t2;
+/* L110: */
+ }
+
+ if (*wantz) {
+
+/* Accumulate transformations in the matrix Z */
+
+ i__2 = *ihiz;
+ for (j = *iloz; j <= i__2; ++j) {
+ sum = z__[j + k * z_dim1] + v2 * z__[j + (k + 1) *
+ z_dim1];
+ z__[j + k * z_dim1] -= sum * t1;
+ z__[j + (k + 1) * z_dim1] -= sum * t2;
+/* L120: */
+ }
+ }
+ }
+/* L130: */
+ }
+
+/* L140: */
+ }
+
+/* Failure to converge in remaining number of iterations */
+
+ *info = i__;
+ return 0;
+
+L150:
+
+ if (l == i__) {
+
+/* H(I,I-1) is negligible: one eigenvalue has converged. */
+
+ wr[i__] = h__[i__ + i__ * h_dim1];
+ wi[i__] = 0.f;
+ } else if (l == i__ - 1) {
+
+/*
+ H(I-1,I-2) is negligible: a pair of eigenvalues have converged.
+
+ Transform the 2-by-2 submatrix to standard Schur form,
+ and compute and store the eigenvalues.
+*/
+
+ slanv2_(&h__[i__ - 1 + (i__ - 1) * h_dim1], &h__[i__ - 1 + i__ *
+ h_dim1], &h__[i__ + (i__ - 1) * h_dim1], &h__[i__ + i__ *
+ h_dim1], &wr[i__ - 1], &wi[i__ - 1], &wr[i__], &wi[i__], &cs,
+ &sn);
+
+ if (*wantt) {
+
+/* Apply the transformation to the rest of H. */
+
+ if (i2 > i__) {
+ i__1 = i2 - i__;
+ srot_(&i__1, &h__[i__ - 1 + (i__ + 1) * h_dim1], ldh, &h__[
+ i__ + (i__ + 1) * h_dim1], ldh, &cs, &sn);
+ }
+ i__1 = i__ - i1 - 1;
+ srot_(&i__1, &h__[i1 + (i__ - 1) * h_dim1], &c__1, &h__[i1 + i__ *
+ h_dim1], &c__1, &cs, &sn);
+ }
+ if (*wantz) {
+
+/* Apply the transformation to Z. */
+
+ srot_(&nz, &z__[*iloz + (i__ - 1) * z_dim1], &c__1, &z__[*iloz +
+ i__ * z_dim1], &c__1, &cs, &sn);
+ }
+ }
+
+/* return to start of the main loop with new value of I. */
+
+ i__ = l - 1;
+ goto L20;
+
+L160:
+ return 0;
+
+/* End of SLAHQR */
+
+} /* slahqr_ */
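+
+/*
+   A worked example of the shift selection used above (a sketch, not part
+   of the LAPACK text): for a trailing 2-by-2 block H2 = [ 0 1; 1 0 ] the
+   scale is s = 2, the scaled block has tr = 0 and det = -1/4 < 0, so two
+   real shifts +/-1 arise and only the one closer to H(I,I) is kept.  For
+   H2 = [ 0 -1; 1 0 ], det = +1/4 > 0 and the complex conjugate pair of
+   shifts +/-i is used instead, matching the eigenvalues of the block.
+*/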
+
+/* Subroutine */ int slahr2_(integer *n, integer *k, integer *nb, real *a,
+ integer *lda, real *tau, real *t, integer *ldt, real *y, integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+ real r__1;
+
+ /* Local variables */
+ static integer i__;
+ static real ei;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sgemm_(char *, char *, integer *, integer *, integer *, real *,
+ real *, integer *, real *, integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *,
+ real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *),
+ strmm_(char *, char *, char *, char *, integer *, integer *, real
+ *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *,
+ integer *), strmv_(char *, char *, char *, integer *, real *,
+ integer *, real *, integer *), slarfg_(
+ integer *, real *, real *, integer *, real *), slacpy_(char *,
+ integer *, integer *, real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
+ matrix A so that elements below the k-th subdiagonal are zero. The
+ reduction is performed by an orthogonal similarity transformation
+ Q' * A * Q. The routine returns the matrices V and T which determine
+ Q as a block reflector I - V*T*V', and also the matrix Y = A * V * T.
+
+ This is an auxiliary routine called by SGEHRD.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ K (input) INTEGER
+ The offset for the reduction. Elements below the k-th
+ subdiagonal in the first NB columns are reduced to zero.
+ K < N.
+
+ NB (input) INTEGER
+ The number of columns to be reduced.
+
+ A (input/output) REAL array, dimension (LDA,N-K+1)
+ On entry, the n-by-(n-k+1) general matrix A.
+ On exit, the elements on and above the k-th subdiagonal in
+ the first NB columns are overwritten with the corresponding
+ elements of the reduced matrix; the elements below the k-th
+ subdiagonal, with the array TAU, represent the matrix Q as a
+ product of elementary reflectors. The other columns of A are
+ unchanged. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) REAL array, dimension (NB)
+ The scalar factors of the elementary reflectors. See Further
+ Details.
+
+ T (output) REAL array, dimension (LDT,NB)
+ The upper triangular matrix T.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= NB.
+
+ Y (output) REAL array, dimension (LDY,NB)
+ The n-by-nb matrix Y.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of nb elementary reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
+ A(i+k+1:n,i), and tau in TAU(i).
+
+ The elements of the vectors v together form the (n-k+1)-by-nb matrix
+ V which is needed, with T and Y, to apply the transformation to the
+ unreduced part of the matrix, using an update of the form:
+ A := (I - V*T*V') * (A - Y*V').
+
+ The contents of A on exit are illustrated by the following example
+ with n = 7, k = 3 and nb = 2:
+
+ ( a a a a a )
+ ( a a a a a )
+ ( a a a a a )
+ ( h h a a a )
+ ( v1 h a a a )
+ ( v1 v2 a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This subroutine is a slight modification of LAPACK-3.0's DLAHRD
+ incorporating improvements proposed by Quintana-Orti and Van de
+    Geijn. Note that the entries of A(1:K,2:NB) differ from those
+ returned by the original LAPACK-3.0's DLAHRD routine. (This
+ subroutine is not backward compatible with LAPACK-3.0's DLAHRD.)
+
+ References
+ ==========
+
+ Gregorio Quintana-Orti and Robert van de Geijn, "Improving the
+ performance of reduction to Hessenberg form," ACM Transactions on
+ Mathematical Software, 32(2):180-194, June 2006.
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ --tau;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*n <= 1) {
+ return 0;
+ }
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ > 1) {
+
+/*
+ Update A(K+1:N,I)
+
+ Update I-th column of A - Y * V'
+*/
+
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ sgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b151, &y[*k + 1 + y_dim1],
+ ldy, &a[*k + i__ - 1 + a_dim1], lda, &c_b15, &a[*k + 1 +
+ i__ * a_dim1], &c__1);
+
+/*
+ Apply I - V * T' * V' to this column (call it b) from the
+ left, using the last column of T as workspace
+
+ Let V = ( V1 ) and b = ( b1 ) (first I-1 rows)
+ ( V2 ) ( b2 )
+
+ where V1 is unit lower triangular
+
+ w := V1' * b1
+*/
+
+ i__2 = i__ - 1;
+ scopy_(&i__2, &a[*k + 1 + i__ * a_dim1], &c__1, &t[*nb * t_dim1 +
+ 1], &c__1);
+ i__2 = i__ - 1;
+ strmv_("Lower", "Transpose", "UNIT", &i__2, &a[*k + 1 + a_dim1],
+ lda, &t[*nb * t_dim1 + 1], &c__1);
+
+/* w := w + V2'*b2 */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[*k + i__ + a_dim1],
+ lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b15, &t[*nb *
+ t_dim1 + 1], &c__1);
+
+/* w := T'*w */
+
+ i__2 = i__ - 1;
+ strmv_("Upper", "Transpose", "NON-UNIT", &i__2, &t[t_offset], ldt,
+ &t[*nb * t_dim1 + 1], &c__1);
+
+/* b2 := b2 - V2*w */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b151, &a[*k + i__ +
+ a_dim1], lda, &t[*nb * t_dim1 + 1], &c__1, &c_b15, &a[*k
+ + i__ + i__ * a_dim1], &c__1);
+
+/* b1 := b1 - V1*w */
+
+ i__2 = i__ - 1;
+ strmv_("Lower", "NO TRANSPOSE", "UNIT", &i__2, &a[*k + 1 + a_dim1]
+ , lda, &t[*nb * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ saxpy_(&i__2, &c_b151, &t[*nb * t_dim1 + 1], &c__1, &a[*k + 1 +
+ i__ * a_dim1], &c__1);
+
+ a[*k + i__ - 1 + (i__ - 1) * a_dim1] = ei;
+ }
+
+/*
+ Generate the elementary reflector H(I) to annihilate
+ A(K+I+1:N,I)
+*/
+
+ i__2 = *n - *k - i__ + 1;
+/* Computing MIN */
+ i__3 = *k + i__ + 1;
+ slarfg_(&i__2, &a[*k + i__ + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &tau[i__]);
+ ei = a[*k + i__ + i__ * a_dim1];
+ a[*k + i__ + i__ * a_dim1] = 1.f;
+
+/* Compute Y(K+1:N,I) */
+
+ i__2 = *n - *k;
+ i__3 = *n - *k - i__ + 1;
+ sgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b15, &a[*k + 1 + (i__ + 1) *
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b29, &y[*
+ k + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[*k + i__ + a_dim1], lda,
+ &a[*k + i__ + i__ * a_dim1], &c__1, &c_b29, &t[i__ * t_dim1 +
+ 1], &c__1);
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ sgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b151, &y[*k + 1 + y_dim1],
+ ldy, &t[i__ * t_dim1 + 1], &c__1, &c_b15, &y[*k + 1 + i__ *
+ y_dim1], &c__1);
+ i__2 = *n - *k;
+ sscal_(&i__2, &tau[i__], &y[*k + 1 + i__ * y_dim1], &c__1);
+
+/* Compute T(1:I,I) */
+
+ i__2 = i__ - 1;
+ r__1 = -tau[i__];
+ sscal_(&i__2, &r__1, &t[i__ * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ strmv_("Upper", "No Transpose", "NON-UNIT", &i__2, &t[t_offset], ldt,
+ &t[i__ * t_dim1 + 1], &c__1)
+ ;
+ t[i__ + i__ * t_dim1] = tau[i__];
+
+/* L10: */
+ }
+ a[*k + *nb + *nb * a_dim1] = ei;
+
+/* Compute Y(1:K,1:NB) */
+
+ slacpy_("ALL", k, nb, &a[(a_dim1 << 1) + 1], lda, &y[y_offset], ldy);
+ strmm_("RIGHT", "Lower", "NO TRANSPOSE", "UNIT", k, nb, &c_b15, &a[*k + 1
+ + a_dim1], lda, &y[y_offset], ldy);
+ if (*n > *k + *nb) {
+ i__1 = *n - *k - *nb;
+ sgemm_("NO TRANSPOSE", "NO TRANSPOSE", k, nb, &i__1, &c_b15, &a[(*nb
+ + 2) * a_dim1 + 1], lda, &a[*k + 1 + *nb + a_dim1], lda, &
+ c_b15, &y[y_offset], ldy);
+ }
+ strmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b15, &t[
+ t_offset], ldt, &y[y_offset], ldy);
+
+ return 0;
+
+/* End of SLAHR2 */
+
+} /* slahr2_ */
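+
+/*
+   A small sanity check on the factorization documented above (a sketch,
+   not LAPACK text): for NB = 1 the triangular factor is just
+   T = [ TAU(1) ], V is the single column holding v, and the block
+   reflector I - V*T*V' reduces to the elementary reflector
+   H(1) = I - tau*v*v'.  Likewise Y = A*V*T is then simply tau*A*v,
+   which matches the sgemv_ followed by sscal_ above for the first
+   column of Y.
+*/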
+
+logical slaisnan_(real *sin1, real *sin2)
+{
+ /* System generated locals */
+ logical ret_val;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ This routine is not for general use. It exists solely to avoid
+ over-optimization in SISNAN.
+
+ SLAISNAN checks for NaNs by comparing its two arguments for
+ inequality. NaN is the only floating-point value where NaN != NaN
+ returns .TRUE. To check for NaNs, pass the same variable as both
+ arguments.
+
+ A compiler must assume that the two arguments are
+ not the same variable, and the test will not be optimized away.
+ Interprocedural or whole-program optimization may delete this
+ test. The ISNAN functions will be replaced by the correct
+    Fortran 2003 intrinsic once the intrinsic is widely available.
+
+ Arguments
+ =========
+
+ SIN1 (input) REAL
+
+ SIN2 (input) REAL
+ Two numbers to compare for inequality.
+
+ =====================================================================
+*/
+
+ ret_val = *sin1 != *sin2;
+ return ret_val;
+} /* slaisnan_ */
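+
+/*
+   A minimal sketch of the intended use (SISNAN is documented to pass the
+   same variable for both arguments, which is what turns the inequality
+   test into a NaN test):
+
+       real x;
+       ... compute x ...
+       if (slaisnan_(&x, &x)) {
+           ... x is NaN, handle it ...
+       }
+*/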
+
+/* Subroutine */ int slaln2_(logical *ltrans, integer *na, integer *nw, real *
+ smin, real *ca, real *a, integer *lda, real *d1, real *d2, real *b,
+ integer *ldb, real *wr, real *wi, real *x, integer *ldx, real *scale,
+ real *xnorm, integer *info)
+{
+ /* Initialized data */
+
+ static logical cswap[4] = { FALSE_,FALSE_,TRUE_,TRUE_ };
+ static logical rswap[4] = { FALSE_,TRUE_,FALSE_,TRUE_ };
+ static integer ipivot[16] /* was [4][4] */ = { 1,2,3,4,2,1,4,3,3,4,1,2,
+ 4,3,2,1 };
+
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+ static real equiv_0[4], equiv_1[4];
+
+ /* Local variables */
+ static integer j;
+#define ci (equiv_0)
+#define cr (equiv_1)
+ static real bi1, bi2, br1, br2, xi1, xi2, xr1, xr2, ci21, ci22, cr21,
+ cr22, li21, csi, ui11, lr21, ui12, ui22;
+#define civ (equiv_0)
+ static real csr, ur11, ur12, ur22;
+#define crv (equiv_1)
+ static real bbnd, cmax, ui11r, ui12s, temp, ur11r, ur12s, u22abs;
+ static integer icmax;
+ static real bnorm, cnorm, smini;
+ extern doublereal slamch_(char *);
+ static real bignum;
+ extern /* Subroutine */ int sladiv_(real *, real *, real *, real *, real *
+ , real *);
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLALN2 solves a system of the form (ca A - w D ) X = s B
+ or (ca A' - w D) X = s B with possible scaling ("s") and
+ perturbation of A. (A' means A-transpose.)
+
+ A is an NA x NA real matrix, ca is a real scalar, D is an NA x NA
+ real diagonal matrix, w is a real or complex value, and X and B are
+ NA x 1 matrices -- real if w is real, complex if w is complex. NA
+ may be 1 or 2.
+
+ If w is complex, X and B are represented as NA x 2 matrices,
+ the first column of each being the real part and the second
+ being the imaginary part.
+
+ "s" is a scaling factor (.LE. 1), computed by SLALN2, which is
+ so chosen that X can be computed without overflow. X is further
+ scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
+ than overflow.
+
+ If both singular values of (ca A - w D) are less than SMIN,
+ SMIN*identity will be used instead of (ca A - w D). If only one
+ singular value is less than SMIN, one element of (ca A - w D) will be
+ perturbed enough to make the smallest singular value roughly SMIN.
+ If both singular values are at least SMIN, (ca A - w D) will not be
+ perturbed. In any case, the perturbation will be at most some small
+ multiple of max( SMIN, ulp*norm(ca A - w D) ). The singular values
+ are computed by infinity-norm approximations, and thus will only be
+ correct to a factor of 2 or so.
+
+ Note: all input quantities are assumed to be smaller than overflow
+ by a reasonable factor. (See BIGNUM.)
+
+ Arguments
+ ==========
+
+ LTRANS (input) LOGICAL
+ =.TRUE.: A-transpose will be used.
+ =.FALSE.: A will be used (not transposed.)
+
+ NA (input) INTEGER
+ The size of the matrix A. It may (only) be 1 or 2.
+
+ NW (input) INTEGER
+ 1 if "w" is real, 2 if "w" is complex. It may only be 1
+ or 2.
+
+ SMIN (input) REAL
+ The desired lower bound on the singular values of A. This
+ should be a safe distance away from underflow or overflow,
+ say, between (underflow/machine precision) and (machine
+ precision * overflow ). (See BIGNUM and ULP.)
+
+ CA (input) REAL
+ The coefficient c, which A is multiplied by.
+
+ A (input) REAL array, dimension (LDA,NA)
+ The NA x NA matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of A. It must be at least NA.
+
+ D1 (input) REAL
+ The 1,1 element in the diagonal matrix D.
+
+ D2 (input) REAL
+ The 2,2 element in the diagonal matrix D. Not used if NW=1.
+
+ B (input) REAL array, dimension (LDB,NW)
+ The NA x NW matrix B (right-hand side). If NW=2 ("w" is
+ complex), column 1 contains the real part of B and column 2
+ contains the imaginary part.
+
+ LDB (input) INTEGER
+ The leading dimension of B. It must be at least NA.
+
+ WR (input) REAL
+ The real part of the scalar "w".
+
+ WI (input) REAL
+ The imaginary part of the scalar "w". Not used if NW=1.
+
+ X (output) REAL array, dimension (LDX,NW)
+ The NA x NW matrix X (unknowns), as computed by SLALN2.
+ If NW=2 ("w" is complex), on exit, column 1 will contain
+ the real part of X and column 2 will contain the imaginary
+ part.
+
+ LDX (input) INTEGER
+ The leading dimension of X. It must be at least NA.
+
+ SCALE (output) REAL
+          The scale factor that B must be multiplied by to ensure
+ that overflow does not occur when computing X. Thus,
+ (ca A - w D) X will be SCALE*B, not B (ignoring
+ perturbations of A.) It will be at most 1.
+
+ XNORM (output) REAL
+ The infinity-norm of X, when X is regarded as an NA x NW
+ real matrix.
+
+ INFO (output) INTEGER
+ An error flag. It will be set to zero if no error occurs,
+ a negative number if an argument is in error, or a positive
+ number if ca A - w D had to be perturbed.
+ The possible values are:
+ = 0: No error occurred, and (ca A - w D) did not have to be
+ perturbed.
+ = 1: (ca A - w D) had to be perturbed to make its smallest
+ (or only) singular value greater than SMIN.
+ NOTE: In the interests of speed, this routine does not
+ check the inputs for errors.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+
+ /* Function Body */
+
+/* Compute BIGNUM */
+
+ smlnum = 2.f * slamch_("Safe minimum");
+ bignum = 1.f / smlnum;
+ smini = dmax(*smin,smlnum);
+
+/* Don't check for input errors */
+
+ *info = 0;
+
+/* Standard Initializations */
+
+ *scale = 1.f;
+
+ if (*na == 1) {
+
+/* 1 x 1 (i.e., scalar) system C X = B */
+
+ if (*nw == 1) {
+
+/*
+ Real 1x1 system.
+
+ C = ca A - w D
+*/
+
+ csr = *ca * a[a_dim1 + 1] - *wr * *d1;
+ cnorm = dabs(csr);
+
+/* If | C | < SMINI, use C = SMINI */
+
+ if (cnorm < smini) {
+ csr = smini;
+ cnorm = smini;
+ *info = 1;
+ }
+
+/* Check scaling for X = B / C */
+
+ bnorm = (r__1 = b[b_dim1 + 1], dabs(r__1));
+ if (cnorm < 1.f && bnorm > 1.f) {
+ if (bnorm > bignum * cnorm) {
+ *scale = 1.f / bnorm;
+ }
+ }
+
+/* Compute X */
+
+ x[x_dim1 + 1] = b[b_dim1 + 1] * *scale / csr;
+ *xnorm = (r__1 = x[x_dim1 + 1], dabs(r__1));
+ } else {
+
+/*
+ Complex 1x1 system (w is complex)
+
+ C = ca A - w D
+*/
+
+ csr = *ca * a[a_dim1 + 1] - *wr * *d1;
+ csi = -(*wi) * *d1;
+ cnorm = dabs(csr) + dabs(csi);
+
+/* If | C | < SMINI, use C = SMINI */
+
+ if (cnorm < smini) {
+ csr = smini;
+ csi = 0.f;
+ cnorm = smini;
+ *info = 1;
+ }
+
+/* Check scaling for X = B / C */
+
+ bnorm = (r__1 = b[b_dim1 + 1], dabs(r__1)) + (r__2 = b[(b_dim1 <<
+ 1) + 1], dabs(r__2));
+ if (cnorm < 1.f && bnorm > 1.f) {
+ if (bnorm > bignum * cnorm) {
+ *scale = 1.f / bnorm;
+ }
+ }
+
+/* Compute X */
+
+ r__1 = *scale * b[b_dim1 + 1];
+ r__2 = *scale * b[(b_dim1 << 1) + 1];
+ sladiv_(&r__1, &r__2, &csr, &csi, &x[x_dim1 + 1], &x[(x_dim1 << 1)
+ + 1]);
+ *xnorm = (r__1 = x[x_dim1 + 1], dabs(r__1)) + (r__2 = x[(x_dim1 <<
+ 1) + 1], dabs(r__2));
+ }
+
+ } else {
+
+/*
+ 2x2 System
+
+ Compute the real part of C = ca A - w D (or ca A' - w D )
+*/
+
+ cr[0] = *ca * a[a_dim1 + 1] - *wr * *d1;
+ cr[3] = *ca * a[(a_dim1 << 1) + 2] - *wr * *d2;
+ if (*ltrans) {
+ cr[2] = *ca * a[a_dim1 + 2];
+ cr[1] = *ca * a[(a_dim1 << 1) + 1];
+ } else {
+ cr[1] = *ca * a[a_dim1 + 2];
+ cr[2] = *ca * a[(a_dim1 << 1) + 1];
+ }
+
+ if (*nw == 1) {
+
+/*
+ Real 2x2 system (w is real)
+
+ Find the largest element in C
+*/
+
+ cmax = 0.f;
+ icmax = 0;
+
+ for (j = 1; j <= 4; ++j) {
+ if ((r__1 = crv[j - 1], dabs(r__1)) > cmax) {
+ cmax = (r__1 = crv[j - 1], dabs(r__1));
+ icmax = j;
+ }
+/* L10: */
+ }
+
+/* If norm(C) < SMINI, use SMINI*identity. */
+
+ if (cmax < smini) {
+/* Computing MAX */
+ r__3 = (r__1 = b[b_dim1 + 1], dabs(r__1)), r__4 = (r__2 = b[
+ b_dim1 + 2], dabs(r__2));
+ bnorm = dmax(r__3,r__4);
+ if (smini < 1.f && bnorm > 1.f) {
+ if (bnorm > bignum * smini) {
+ *scale = 1.f / bnorm;
+ }
+ }
+ temp = *scale / smini;
+ x[x_dim1 + 1] = temp * b[b_dim1 + 1];
+ x[x_dim1 + 2] = temp * b[b_dim1 + 2];
+ *xnorm = temp * bnorm;
+ *info = 1;
+ return 0;
+ }
+
+/* Gaussian elimination with complete pivoting. */
+
+ ur11 = crv[icmax - 1];
+ cr21 = crv[ipivot[(icmax << 2) - 3] - 1];
+ ur12 = crv[ipivot[(icmax << 2) - 2] - 1];
+ cr22 = crv[ipivot[(icmax << 2) - 1] - 1];
+ ur11r = 1.f / ur11;
+ lr21 = ur11r * cr21;
+ ur22 = cr22 - ur12 * lr21;
+
+/* If smaller pivot < SMINI, use SMINI */
+
+ if (dabs(ur22) < smini) {
+ ur22 = smini;
+ *info = 1;
+ }
+ if (rswap[icmax - 1]) {
+ br1 = b[b_dim1 + 2];
+ br2 = b[b_dim1 + 1];
+ } else {
+ br1 = b[b_dim1 + 1];
+ br2 = b[b_dim1 + 2];
+ }
+ br2 -= lr21 * br1;
+/* Computing MAX */
+ r__2 = (r__1 = br1 * (ur22 * ur11r), dabs(r__1)), r__3 = dabs(br2)
+ ;
+ bbnd = dmax(r__2,r__3);
+ if (bbnd > 1.f && dabs(ur22) < 1.f) {
+ if (bbnd >= bignum * dabs(ur22)) {
+ *scale = 1.f / bbnd;
+ }
+ }
+
+ xr2 = br2 * *scale / ur22;
+ xr1 = *scale * br1 * ur11r - xr2 * (ur11r * ur12);
+ if (cswap[icmax - 1]) {
+ x[x_dim1 + 1] = xr2;
+ x[x_dim1 + 2] = xr1;
+ } else {
+ x[x_dim1 + 1] = xr1;
+ x[x_dim1 + 2] = xr2;
+ }
+/* Computing MAX */
+ r__1 = dabs(xr1), r__2 = dabs(xr2);
+ *xnorm = dmax(r__1,r__2);
+
+/* Further scaling if norm(A) norm(X) > overflow */
+
+ if (*xnorm > 1.f && cmax > 1.f) {
+ if (*xnorm > bignum / cmax) {
+ temp = cmax / bignum;
+ x[x_dim1 + 1] = temp * x[x_dim1 + 1];
+ x[x_dim1 + 2] = temp * x[x_dim1 + 2];
+ *xnorm = temp * *xnorm;
+ *scale = temp * *scale;
+ }
+ }
+ } else {
+
+/*
+ Complex 2x2 system (w is complex)
+
+ Find the largest element in C
+*/
+
+ ci[0] = -(*wi) * *d1;
+ ci[1] = 0.f;
+ ci[2] = 0.f;
+ ci[3] = -(*wi) * *d2;
+ cmax = 0.f;
+ icmax = 0;
+
+ for (j = 1; j <= 4; ++j) {
+ if ((r__1 = crv[j - 1], dabs(r__1)) + (r__2 = civ[j - 1],
+ dabs(r__2)) > cmax) {
+ cmax = (r__1 = crv[j - 1], dabs(r__1)) + (r__2 = civ[j -
+ 1], dabs(r__2));
+ icmax = j;
+ }
+/* L20: */
+ }
+
+/* If norm(C) < SMINI, use SMINI*identity. */
+
+ if (cmax < smini) {
+/* Computing MAX */
+ r__5 = (r__1 = b[b_dim1 + 1], dabs(r__1)) + (r__2 = b[(b_dim1
+ << 1) + 1], dabs(r__2)), r__6 = (r__3 = b[b_dim1 + 2],
+ dabs(r__3)) + (r__4 = b[(b_dim1 << 1) + 2], dabs(
+ r__4));
+ bnorm = dmax(r__5,r__6);
+ if (smini < 1.f && bnorm > 1.f) {
+ if (bnorm > bignum * smini) {
+ *scale = 1.f / bnorm;
+ }
+ }
+ temp = *scale / smini;
+ x[x_dim1 + 1] = temp * b[b_dim1 + 1];
+ x[x_dim1 + 2] = temp * b[b_dim1 + 2];
+ x[(x_dim1 << 1) + 1] = temp * b[(b_dim1 << 1) + 1];
+ x[(x_dim1 << 1) + 2] = temp * b[(b_dim1 << 1) + 2];
+ *xnorm = temp * bnorm;
+ *info = 1;
+ return 0;
+ }
+
+/* Gaussian elimination with complete pivoting. */
+
+ ur11 = crv[icmax - 1];
+ ui11 = civ[icmax - 1];
+ cr21 = crv[ipivot[(icmax << 2) - 3] - 1];
+ ci21 = civ[ipivot[(icmax << 2) - 3] - 1];
+ ur12 = crv[ipivot[(icmax << 2) - 2] - 1];
+ ui12 = civ[ipivot[(icmax << 2) - 2] - 1];
+ cr22 = crv[ipivot[(icmax << 2) - 1] - 1];
+ ci22 = civ[ipivot[(icmax << 2) - 1] - 1];
+ if (icmax == 1 || icmax == 4) {
+
+/* Code when off-diagonals of pivoted C are real */
+
+ if (dabs(ur11) > dabs(ui11)) {
+ temp = ui11 / ur11;
+/* Computing 2nd power */
+ r__1 = temp;
+ ur11r = 1.f / (ur11 * (r__1 * r__1 + 1.f));
+ ui11r = -temp * ur11r;
+ } else {
+ temp = ur11 / ui11;
+/* Computing 2nd power */
+ r__1 = temp;
+ ui11r = -1.f / (ui11 * (r__1 * r__1 + 1.f));
+ ur11r = -temp * ui11r;
+ }
+ lr21 = cr21 * ur11r;
+ li21 = cr21 * ui11r;
+ ur12s = ur12 * ur11r;
+ ui12s = ur12 * ui11r;
+ ur22 = cr22 - ur12 * lr21;
+ ui22 = ci22 - ur12 * li21;
+ } else {
+
+/* Code when diagonals of pivoted C are real */
+
+ ur11r = 1.f / ur11;
+ ui11r = 0.f;
+ lr21 = cr21 * ur11r;
+ li21 = ci21 * ur11r;
+ ur12s = ur12 * ur11r;
+ ui12s = ui12 * ur11r;
+ ur22 = cr22 - ur12 * lr21 + ui12 * li21;
+ ui22 = -ur12 * li21 - ui12 * lr21;
+ }
+ u22abs = dabs(ur22) + dabs(ui22);
+
+/* If smaller pivot < SMINI, use SMINI */
+
+ if (u22abs < smini) {
+ ur22 = smini;
+ ui22 = 0.f;
+ *info = 1;
+ }
+ if (rswap[icmax - 1]) {
+ br2 = b[b_dim1 + 1];
+ br1 = b[b_dim1 + 2];
+ bi2 = b[(b_dim1 << 1) + 1];
+ bi1 = b[(b_dim1 << 1) + 2];
+ } else {
+ br1 = b[b_dim1 + 1];
+ br2 = b[b_dim1 + 2];
+ bi1 = b[(b_dim1 << 1) + 1];
+ bi2 = b[(b_dim1 << 1) + 2];
+ }
+ br2 = br2 - lr21 * br1 + li21 * bi1;
+ bi2 = bi2 - li21 * br1 - lr21 * bi1;
+/* Computing MAX */
+ r__1 = (dabs(br1) + dabs(bi1)) * (u22abs * (dabs(ur11r) + dabs(
+ ui11r))), r__2 = dabs(br2) + dabs(bi2);
+ bbnd = dmax(r__1,r__2);
+ if (bbnd > 1.f && u22abs < 1.f) {
+ if (bbnd >= bignum * u22abs) {
+ *scale = 1.f / bbnd;
+ br1 = *scale * br1;
+ bi1 = *scale * bi1;
+ br2 = *scale * br2;
+ bi2 = *scale * bi2;
+ }
+ }
+
+ sladiv_(&br2, &bi2, &ur22, &ui22, &xr2, &xi2);
+ xr1 = ur11r * br1 - ui11r * bi1 - ur12s * xr2 + ui12s * xi2;
+ xi1 = ui11r * br1 + ur11r * bi1 - ui12s * xr2 - ur12s * xi2;
+ if (cswap[icmax - 1]) {
+ x[x_dim1 + 1] = xr2;
+ x[x_dim1 + 2] = xr1;
+ x[(x_dim1 << 1) + 1] = xi2;
+ x[(x_dim1 << 1) + 2] = xi1;
+ } else {
+ x[x_dim1 + 1] = xr1;
+ x[x_dim1 + 2] = xr2;
+ x[(x_dim1 << 1) + 1] = xi1;
+ x[(x_dim1 << 1) + 2] = xi2;
+ }
+/* Computing MAX */
+ r__1 = dabs(xr1) + dabs(xi1), r__2 = dabs(xr2) + dabs(xi2);
+ *xnorm = dmax(r__1,r__2);
+
+/* Further scaling if norm(A) norm(X) > overflow */
+
+ if (*xnorm > 1.f && cmax > 1.f) {
+ if (*xnorm > bignum / cmax) {
+ temp = cmax / bignum;
+ x[x_dim1 + 1] = temp * x[x_dim1 + 1];
+ x[x_dim1 + 2] = temp * x[x_dim1 + 2];
+ x[(x_dim1 << 1) + 1] = temp * x[(x_dim1 << 1) + 1];
+ x[(x_dim1 << 1) + 2] = temp * x[(x_dim1 << 1) + 2];
+ *xnorm = temp * *xnorm;
+ *scale = temp * *scale;
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SLALN2 */
+
+} /* slaln2_ */
+
+#undef crv
+#undef civ
+#undef cr
+#undef ci
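+
+/*
+   A worked 1-by-1 real instance of the system solved above (a sketch,
+   not LAPACK text): with NA = NW = 1, CA = 1, A = [ 2 ], D1 = 1 and
+   w = WR = 3, the routine forms C = ca*A(1,1) - wr*D1 = -1, so
+   X = B / C = -B with SCALE = 1; no perturbation is applied because
+   |C| = 1 >= SMIN for any reasonable SMIN.
+*/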
+
+
+/* Subroutine */ int slals0_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, integer *nrhs, real *b, integer *ldb, real *bx,
+ integer *ldbx, integer *perm, integer *givptr, integer *givcol,
+ integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real *
+ difl, real *difr, real *z__, integer *k, real *c__, real *s, real *
+ work, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, b_dim1, b_offset, bx_dim1, bx_offset,
+ difr_dim1, difr_offset, givnum_dim1, givnum_offset, poles_dim1,
+ poles_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, m, n;
+ static real dj;
+ static integer nlp1;
+ static real temp;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *);
+ extern doublereal snrm2_(integer *, real *, integer *);
+ static real diflj, difrj, dsigj;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sgemv_(char *, integer *, integer *, real *, real *, integer *,
+ real *, integer *, real *, real *, integer *), scopy_(
+ integer *, real *, integer *, real *, integer *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real dsigjp;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *,
+ real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLALS0 applies back the multiplying factors of either the left or the
+ right singular vector matrix of a diagonal matrix appended by a row
+ to the right hand side matrix B in solving the least squares problem
+ using the divide-and-conquer SVD approach.
+
+ For the left singular vector matrix, three types of orthogonal
+ matrices are involved:
+
+ (1L) Givens rotations: the number of such rotations is GIVPTR; the
+ pairs of columns/rows they were applied to are stored in GIVCOL;
+ and the C- and S-values of these rotations are stored in GIVNUM.
+
+ (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+ row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+ J-th row.
+
+ (3L) The left singular vector matrix of the remaining matrix.
+
+ For the right singular vector matrix, four types of orthogonal
+ matrices are involved:
+
+ (1R) The right singular vector matrix of the remaining matrix.
+
+ (2R) If SQRE = 1, one extra Givens rotation to generate the right
+ null space.
+
+ (3R) The inverse transformation of (2L).
+
+ (4R) The inverse transformation of (1L).
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form:
+ = 0: Left singular vector matrix.
+ = 1: Right singular vector matrix.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) REAL array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M. On output, B contains
+ the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B. LDB must be at least
+ max(1,MAX( M, N ) ).
+
+ BX (workspace) REAL array, dimension ( LDBX, NRHS )
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ PERM (input) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) applied
+ to the two blocks.
+
+ GIVPTR (input) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 )
+ Each pair of numbers indicates a pair of rows/columns
+ involved in a Givens rotation.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (input) REAL array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value used in the
+ corresponding Givens rotation.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of arrays DIFR, POLES and
+ GIVNUM, must be at least K.
+
+ POLES (input) REAL array, dimension ( LDGNUM, 2 )
+ On entry, POLES(1:K, 1) contains the new singular
+ values obtained from solving the secular equation, and
+ POLES(1:K, 2) is an array containing the poles in the secular
+ equation.
+
+ DIFL (input) REAL array, dimension ( K ).
+ On entry, DIFL(I) is the distance between I-th updated
+ (undeflated) singular value and the I-th (undeflated) old
+ singular value.
+
+ DIFR (input) REAL array, dimension ( LDGNUM, 2 ).
+          On entry, DIFR(I, 1) contains the distance between the I-th
+          updated (undeflated) singular value and the (I+1)-th
+          (undeflated) old singular value, and DIFR(I, 2) is the
+          normalizing factor for the I-th right singular vector.
+
+ Z (input) REAL array, dimension ( K )
+ Contain the components of the deflation-adjusted updating row
+ vector.
+
+ K (input) INTEGER
+            Contains the dimension of the non-deflated matrix. This is
+            the order of the related secular equation. 1 <= K <= N.
+
+ C (input) REAL
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (input) REAL
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ WORK (workspace) REAL array, dimension ( K )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ difr_dim1 = *ldgnum;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ poles_dim1 = *ldgnum;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ --difl;
+ --z__;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ }
+
+ n = *nl + *nr + 1;
+
+ if (*nrhs < 1) {
+ *info = -5;
+ } else if (*ldb < n) {
+ *info = -7;
+ } else if (*ldbx < n) {
+ *info = -9;
+ } else if (*givptr < 0) {
+ *info = -11;
+ } else if (*ldgcol < n) {
+ *info = -13;
+ } else if (*ldgnum < n) {
+ *info = -15;
+ } else if (*k < 1) {
+ *info = -20;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLALS0", &i__1);
+ return 0;
+ }
+
+ m = n + *sqre;
+ nlp1 = *nl + 1;
+
+ if (*icompq == 0) {
+
+/*
+ Apply back orthogonal transformations from the left.
+
+ Step (1L): apply back the Givens rotations performed.
+*/
+
+ i__1 = *givptr;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ srot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]);
+/* L10: */
+ }
+
+/* Step (2L): permute rows of B. */
+
+ scopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx);
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ scopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1],
+ ldbx);
+/* L20: */
+ }
+
+/*
+ Step (3L): apply the inverse of the left singular vector
+ matrix to BX.
+*/
+
+ if (*k == 1) {
+ scopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb);
+ if (z__[1] < 0.f) {
+ sscal_(nrhs, &c_b151, &b[b_offset], ldb);
+ }
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ diflj = difl[j];
+ dj = poles[j + poles_dim1];
+ dsigj = -poles[j + (poles_dim1 << 1)];
+ if (j < *k) {
+ difrj = -difr[j + difr_dim1];
+ dsigjp = -poles[j + 1 + (poles_dim1 << 1)];
+ }
+ if (z__[j] == 0.f || poles[j + (poles_dim1 << 1)] == 0.f) {
+ work[j] = 0.f;
+ } else {
+ work[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj /
+ (poles[j + (poles_dim1 << 1)] + dj);
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+ 0.f) {
+ work[i__] = 0.f;
+ } else {
+ work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigj) - diflj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L30: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+ 0.f) {
+ work[i__] = 0.f;
+ } else {
+ work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigjp) + difrj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L40: */
+ }
+ work[1] = -1.f;
+ temp = snrm2_(k, &work[1], &c__1);
+ sgemv_("T", k, nrhs, &c_b15, &bx[bx_offset], ldbx, &work[1], &
+ c__1, &c_b29, &b[j + b_dim1], ldb);
+ slascl_("G", &c__0, &c__0, &temp, &c_b15, &c__1, nrhs, &b[j +
+ b_dim1], ldb, info);
+/* L50: */
+ }
+ }
+
+/* Move the deflated rows of BX to B also. */
+
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ slacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1
+ + b_dim1], ldb);
+ }
+ } else {
+
+/*
+ Apply back the right orthogonal transformations.
+
+ Step (1R): apply back the new right singular vector matrix
+ to B.
+*/
+
+ if (*k == 1) {
+ scopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx);
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dsigj = poles[j + (poles_dim1 << 1)];
+ if (z__[j] == 0.f) {
+ work[j] = 0.f;
+ } else {
+ work[j] = -z__[j] / difl[j] / (dsigj + poles[j +
+ poles_dim1]) / difr[j + (difr_dim1 << 1)];
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.f) {
+ work[i__] = 0.f;
+ } else {
+ r__1 = -poles[i__ + 1 + (poles_dim1 << 1)];
+ work[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difr[
+ i__ + difr_dim1]) / (dsigj + poles[i__ +
+ poles_dim1]) / difr[i__ + (difr_dim1 << 1)];
+ }
+/* L60: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.f) {
+ work[i__] = 0.f;
+ } else {
+ r__1 = -poles[i__ + (poles_dim1 << 1)];
+ work[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difl[
+ i__]) / (dsigj + poles[i__ + poles_dim1]) /
+ difr[i__ + (difr_dim1 << 1)];
+ }
+/* L70: */
+ }
+ sgemv_("T", k, nrhs, &c_b15, &b[b_offset], ldb, &work[1], &
+ c__1, &c_b29, &bx[j + bx_dim1], ldbx);
+/* L80: */
+ }
+ }
+
+/*
+ Step (2R): if SQRE = 1, apply back the rotation that is
+ related to the right null space of the subproblem.
+*/
+
+ if (*sqre == 1) {
+ scopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx);
+ srot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__,
+ s);
+ }
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ slacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 +
+ bx_dim1], ldbx);
+ }
+
+/* Step (3R): permute rows of B. */
+
+ scopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb);
+ if (*sqre == 1) {
+ scopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb);
+ }
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ scopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1],
+ ldb);
+/* L90: */
+ }
+
+/* Step (4R): apply back the Givens rotations performed. */
+
+ for (i__ = *givptr; i__ >= 1; --i__) {
+ r__1 = -givnum[i__ + givnum_dim1];
+ srot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &r__1);
+/* L100: */
+ }
+ }
+
+ return 0;
+
+/* End of SLALS0 */
+
+} /* slals0_ */
+
+/* Subroutine */ int slalsa_(integer *icompq, integer *smlsiz, integer *n,
+ integer *nrhs, real *b, integer *ldb, real *bx, integer *ldbx, real *
+ u, integer *ldu, real *vt, integer *k, real *difl, real *difr, real *
+ z__, real *poles, integer *givptr, integer *givcol, integer *ldgcol,
+ integer *perm, real *givnum, real *c__, real *s, real *work, integer *
+ iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, b_dim1,
+ b_offset, bx_dim1, bx_offset, difl_dim1, difl_offset, difr_dim1,
+ difr_offset, givnum_dim1, givnum_offset, poles_dim1, poles_offset,
+ u_dim1, u_offset, vt_dim1, vt_offset, z_dim1, z_offset, i__1,
+ i__2;
+
+ /* Local variables */
+ static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl,
+ ndb1, nlp1, lvl2, nrp1, nlvl, sqre, inode, ndiml;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer ndimr;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), slals0_(integer *, integer *, integer *, integer *,
+ integer *, real *, integer *, real *, integer *, integer *,
+ integer *, integer *, integer *, real *, integer *, real *, real *
+ , real *, real *, integer *, real *, real *, real *, integer *),
+ xerbla_(char *, integer *), slasdt_(integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+   SLALSA is an intermediate step in solving the least squares problem
+   by computing the SVD of the coefficient matrix in compact form (the
+   singular vectors are computed as products of simple orthogonal
+   matrices).
+
+ If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector
+ matrix of an upper bidiagonal matrix to the right hand side; and if
+ ICOMPQ = 1, SLALSA applies the right singular vector matrix to the
+ right hand side. The singular vector matrices were generated in
+ compact form by SLALSA.
+
+ Arguments
+ =========
+
+
+ ICOMPQ (input) INTEGER
+ Specifies whether the left or the right singular vector
+ matrix is involved.
+ = 0: Left singular vector matrix
+ = 1: Right singular vector matrix
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The row and column dimensions of the upper bidiagonal matrix.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) REAL array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M.
+ On output, B contains the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,MAX( M, N ) ).
+
+ BX (output) REAL array, dimension ( LDBX, NRHS )
+ On exit, the result of applying the left or right singular
+ vector matrix to B.
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ U (input) REAL array, dimension ( LDU, SMLSIZ ).
+ On entry, U contains the left singular vector matrices of all
+ subproblems at the bottom level.
+
+   LDU    (input) INTEGER, LDU >= N.
+ The leading dimension of arrays U, VT, DIFL, DIFR,
+ POLES, GIVNUM, and Z.
+
+ VT (input) REAL array, dimension ( LDU, SMLSIZ+1 ).
+ On entry, VT' contains the right singular vector matrices of
+ all subproblems at the bottom level.
+
+ K (input) INTEGER array, dimension ( N ).
+
+ DIFL (input) REAL array, dimension ( LDU, NLVL ).
+ where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1.
+
+ DIFR (input) REAL array, dimension ( LDU, 2 * NLVL ).
+          On entry, DIFL(*, I) and DIFR(*, 2 * I - 1) record the
+          distances between singular values on the I-th level and
+          singular values on the (I - 1)-th level, and DIFR(*, 2 * I)
+          records the normalizing factors of the right singular vector
+          matrices of subproblems on the I-th level.
+
+ Z (input) REAL array, dimension ( LDU, NLVL ).
+ On entry, Z(1, I) contains the components of the deflation-
+ adjusted updating row vector for subproblems on the I-th
+ level.
+
+ POLES (input) REAL array, dimension ( LDU, 2 * NLVL ).
+ On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old
+ singular values involved in the secular equations on the I-th
+ level.
+
+ GIVPTR (input) INTEGER array, dimension ( N ).
+ On entry, GIVPTR( I ) records the number of Givens
+ rotations performed on the I-th problem on the computation
+ tree.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ).
+ On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the
+ locations of Givens rotations performed on the I-th level on
+ the computation tree.
+
+   LDGCOL (input) INTEGER, LDGCOL >= N.
+ The leading dimension of arrays GIVCOL and PERM.
+
+ PERM (input) INTEGER array, dimension ( LDGCOL, NLVL ).
+ On entry, PERM(*, I) records permutations done on the I-th
+ level of the computation tree.
+
+ GIVNUM (input) REAL array, dimension ( LDU, 2 * NLVL ).
+ On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S-
+ values of Givens rotations performed on the I-th level on the
+ computation tree.
+
+ C (input) REAL array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ C( I ) contains the C-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ S (input) REAL array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ S( I ) contains the S-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ WORK (workspace) REAL array.
+ The dimension must be at least N.
+
+ IWORK (workspace) INTEGER array.
+ The dimension must be at least 3 * N
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ givnum_dim1 = *ldu;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ poles_dim1 = *ldu;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ z_dim1 = *ldu;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ difr_dim1 = *ldu;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ difl_dim1 = *ldu;
+ difl_offset = 1 + difl_dim1;
+ difl -= difl_offset;
+ vt_dim1 = *ldu;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ --k;
+ --givptr;
+ perm_dim1 = *ldgcol;
+ perm_offset = 1 + perm_dim1;
+ perm -= perm_offset;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ --c__;
+ --s;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*smlsiz < 3) {
+ *info = -2;
+ } else if (*n < *smlsiz) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < *n) {
+ *info = -6;
+ } else if (*ldbx < *n) {
+ *info = -8;
+ } else if (*ldu < *n) {
+ *info = -10;
+ } else if (*ldgcol < *n) {
+ *info = -19;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLALSA", &i__1);
+ return 0;
+ }
+
+/* Book-keeping and setting up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+
+ slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ The following code applies back the left singular vector factors.
+ For applying back the right singular vector factors, go to 50.
+*/
+
+ if (*icompq == 1) {
+ goto L50;
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by SLASDQ. The corresponding left and right singular vector
+ matrices are in explicit form. First apply back the left
+ singular vector matrices.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ sgemm_("T", "N", &nl, nrhs, &nl, &c_b15, &u[nlf + u_dim1], ldu, &b[
+ nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx);
+ sgemm_("T", "N", &nr, nrhs, &nr, &c_b15, &u[nrf + u_dim1], ldu, &b[
+ nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx);
+/* L10: */
+ }
+
+/*
+ Next copy the rows of B that correspond to unchanged rows
+ in the bidiagonal matrix to BX.
+*/
+
+ i__1 = nd;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ic = iwork[inode + i__ - 1];
+ scopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx);
+/* L20: */
+ }
+
+/*
+ Finally go through the left singular vector matrices of all
+ the other subproblems bottom-up on the tree.
+*/
+
+ j = pow_ii(&c__2, &nlvl);
+ sqre = 0;
+
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ find the first node LF and last node LL on
+ the current level LVL
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ --j;
+ slals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, &
+ b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &work[1], info);
+/* L30: */
+ }
+/* L40: */
+ }
+ goto L90;
+
+/* ICOMPQ = 1: applying back the right singular vector factors. */
+
+L50:
+
+/*
+ First now go through the right singular vector matrices of all
+ the tree nodes top-down.
+*/
+
+ j = 0;
+ i__1 = nlvl;
+ for (lvl = 1; lvl <= i__1; ++lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ Find the first node LF and last node LL on
+ the current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__2 = lvl - 1;
+ lf = pow_ii(&c__2, &i__2);
+ ll = (lf << 1) - 1;
+ }
+ i__2 = lf;
+ for (i__ = ll; i__ >= i__2; --i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ if (i__ == ll) {
+ sqre = 0;
+ } else {
+ sqre = 1;
+ }
+ ++j;
+ slals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[
+ nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &work[1], info);
+/* L60: */
+ }
+/* L70: */
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by SLASDQ. The corresponding right singular vector
+ matrices are in explicit form. Apply them back.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlp1 = nl + 1;
+ if (i__ == nd) {
+ nrp1 = nr;
+ } else {
+ nrp1 = nr + 1;
+ }
+ nlf = ic - nl;
+ nrf = ic + 1;
+ sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b15, &vt[nlf + vt_dim1], ldu,
+ &b[nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx);
+ sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b15, &vt[nrf + vt_dim1], ldu,
+ &b[nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx);
+/* L80: */
+ }
+
+L90:
+
+ return 0;
+
+/* End of SLALSA */
+
+} /* slalsa_ */
+
+/* Subroutine */ int slalsd_(char *uplo, integer *smlsiz, integer *n, integer
+ *nrhs, real *d__, real *e, real *b, integer *ldb, real *rcond,
+ integer *rank, real *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer b_dim1, b_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer c__, i__, j, k;
+ static real r__;
+ static integer s, u, z__;
+ static real cs;
+ static integer bx;
+ static real sn;
+ static integer st, vt, nm1, st1;
+ static real eps;
+ static integer iwk;
+ static real tol;
+ static integer difl, difr;
+ static real rcnd;
+ static integer perm, nsub, nlvl, sqre, bxst;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *), sgemm_(char *, char *, integer *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ , real *, real *, integer *);
+ static integer poles, sizei, nsize;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ static integer nwork, icmpq1, icmpq2;
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int slasda_(integer *, integer *, integer *,
+ integer *, real *, real *, real *, integer *, real *, integer *,
+ real *, real *, real *, real *, integer *, integer *, integer *,
+ integer *, real *, real *, real *, real *, integer *, integer *),
+ xerbla_(char *, integer *), slalsa_(integer *, integer *,
+ integer *, integer *, real *, integer *, real *, integer *, real *
+ , integer *, real *, integer *, real *, real *, real *, real *,
+ integer *, integer *, integer *, integer *, real *, real *, real *
+ , real *, integer *, integer *), slascl_(char *, integer *,
+ integer *, real *, real *, integer *, integer *, real *, integer *
+ , integer *);
+ static integer givcol;
+ extern integer isamax_(integer *, real *, integer *);
+ extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, real *, real *, real *, integer *, real *
+ , integer *, real *, integer *, real *, integer *),
+ slacpy_(char *, integer *, integer *, real *, integer *, real *,
+ integer *), slartg_(real *, real *, real *, real *, real *
+ ), slaset_(char *, integer *, integer *, real *, real *, real *,
+ integer *);
+ static real orgnrm;
+ static integer givnum;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+ static integer givptr, smlszp;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLALSD uses the singular value decomposition of A to solve the least
+ squares problem of finding X to minimize the Euclidean norm of each
+ column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+ are N-by-NRHS. The solution X overwrites B.
+
+ The singular values of A smaller than RCOND times the largest
+ singular value are treated as zero in solving the least squares
+ problem; in this case a minimum norm solution is returned.
+ The actual singular values are returned in D in ascending order.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': D and E define an upper bidiagonal matrix.
+ = 'L': D and E define a lower bidiagonal matrix.
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The dimension of the bidiagonal matrix. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of columns of B. NRHS must be at least 1.
+
+ D (input/output) REAL array, dimension (N)
+ On entry D contains the main diagonal of the bidiagonal
+ matrix. On exit, if INFO = 0, D contains its singular values.
+
+ E (input/output) REAL array, dimension (N-1)
+ Contains the super-diagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ B (input/output) REAL array, dimension (LDB,NRHS)
+ On input, B contains the right hand sides of the least
+ squares problem. On output, B contains the solution X.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,N).
+
+ RCOND (input) REAL
+ The singular values of A less than or equal to RCOND times
+ the largest singular value are treated as zero in solving
+ the least squares problem. If RCOND is negative,
+ machine precision is used instead.
+ For example, if diag(S)*X=B were the least squares problem,
+ where diag(S) is a diagonal matrix of singular values, the
+ solution would be X(i) = B(i) / S(i) if S(i) is greater than
+ RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to
+ RCOND*max(S).
+
+ RANK (output) INTEGER
+ The number of singular values of A greater than RCOND times
+ the largest singular value.
+
+ WORK (workspace) REAL array, dimension at least
+ (9*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2),
+ where NLVL = max(0, INT(log_2 (N/(SMLSIZ+1))) + 1).
+
+ IWORK (workspace) INTEGER array, dimension at least
+ (3*N*NLVL + 11*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute a singular value while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through MOD(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < 1 || *ldb < *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLALSD", &i__1);
+ return 0;
+ }
+
+ eps = slamch_("Epsilon");
+
+/* Set up the tolerance. */
+
+ if (*rcond <= 0.f || *rcond >= 1.f) {
+ rcnd = eps;
+ } else {
+ rcnd = *rcond;
+ }
+
+ *rank = 0;
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+ if (d__[1] == 0.f) {
+ slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+ } else {
+ *rank = 1;
+ slascl_("G", &c__0, &c__0, &d__[1], &c_b15, &c__1, nrhs, &b[
+ b_offset], ldb, info);
+ d__[1] = dabs(d__[1]);
+ }
+ return 0;
+ }
+
+/* Rotate the matrix if it is lower bidiagonal. */
+
+ if (*(unsigned char *)uplo == 'L') {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (*nrhs == 1) {
+ srot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], &
+ c__1, &cs, &sn);
+ } else {
+ work[(i__ << 1) - 1] = cs;
+ work[i__ * 2] = sn;
+ }
+/* L10: */
+ }
+ if (*nrhs > 1) {
+ i__1 = *nrhs;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *n - 1;
+ for (j = 1; j <= i__2; ++j) {
+ cs = work[(j << 1) - 1];
+ sn = work[j * 2];
+ srot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__ *
+ b_dim1], &c__1, &cs, &sn);
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ }
+
+/* Scale. */
+
+ nm1 = *n - 1;
+ orgnrm = slanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.f) {
+ slaset_("A", n, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+ return 0;
+ }
+
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, info);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1,
+ info);
+
+/*
+ If N is smaller than the minimum divide size SMLSIZ, then solve
+ the problem with another solver.
+*/
+
+ if (*n <= *smlsiz) {
+ nwork = *n * *n + 1;
+ slaset_("A", n, n, &c_b29, &c_b15, &work[1], n);
+ slasdq_("U", &c__0, n, n, &c__0, nrhs, &d__[1], &e[1], &work[1], n, &
+ work[1], n, &b[b_offset], ldb, &work[nwork], info);
+ if (*info != 0) {
+ return 0;
+ }
+ tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (d__[i__] <= tol) {
+ slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[i__ + b_dim1],
+ ldb);
+ } else {
+ slascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &b[
+ i__ + b_dim1], ldb, info);
+ ++(*rank);
+ }
+/* L40: */
+ }
+ sgemm_("T", "N", n, nrhs, n, &c_b15, &work[1], n, &b[b_offset], ldb, &
+ c_b29, &work[nwork], n);
+ slacpy_("A", n, nrhs, &work[nwork], n, &b[b_offset], ldb);
+
+/* Unscale. */
+
+ slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n,
+ info);
+ slasrt_("D", n, &d__[1], info);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset],
+ ldb, info);
+
+ return 0;
+ }
+
+/* Book-keeping and setting up some constants. */
+
+ nlvl = (integer) (log((real) (*n) / (real) (*smlsiz + 1)) / log(2.f)) + 1;
+
+ smlszp = *smlsiz + 1;
+
+ u = 1;
+ vt = *smlsiz * *n + 1;
+ difl = vt + smlszp * *n;
+ difr = difl + nlvl * *n;
+ z__ = difr + (nlvl * *n << 1);
+ c__ = z__ + nlvl * *n;
+ s = c__ + *n;
+ poles = s + *n;
+ givnum = poles + (nlvl << 1) * *n;
+ bx = givnum + (nlvl << 1) * *n;
+ nwork = bx + *n * *nrhs;
+
+ sizei = *n + 1;
+ k = sizei + *n;
+ givptr = k + *n;
+ perm = givptr + *n;
+ givcol = perm + nlvl * *n;
+ iwk = givcol + (nlvl * *n << 1);
+
+ st = 1;
+ sqre = 0;
+ icmpq1 = 1;
+ icmpq2 = 0;
+ nsub = 0;
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = d__[i__], dabs(r__1)) < eps) {
+ d__[i__] = r_sign(&eps, &d__[i__]);
+ }
+/* L50: */
+ }
+
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) {
+ ++nsub;
+ iwork[nsub] = st;
+
+/*
+ Subproblem found. First determine its size and then
+ apply divide and conquer on it.
+*/
+
+ if (i__ < nm1) {
+
+/* A subproblem with E(I) small for I < NM1. */
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else if ((r__1 = e[i__], dabs(r__1)) >= eps) {
+
+/* A subproblem with E(NM1) not too small but I = NM1. */
+
+ nsize = *n - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else {
+
+/*
+ A subproblem with E(NM1) small. This implies an
+ 1-by-1 subproblem at D(N), which is not solved
+ explicitly.
+*/
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ ++nsub;
+ iwork[nsub] = *n;
+ iwork[sizei + nsub - 1] = 1;
+ scopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n);
+ }
+ st1 = st - 1;
+ if (nsize == 1) {
+
+/*
+ This is a 1-by-1 subproblem and is not solved
+ explicitly.
+*/
+
+ scopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n);
+ } else if (nsize <= *smlsiz) {
+
+/* This is a small subproblem and is solved by SLASDQ. */
+
+ slaset_("A", &nsize, &nsize, &c_b29, &c_b15, &work[vt + st1],
+ n);
+ slasdq_("U", &c__0, &nsize, &nsize, &c__0, nrhs, &d__[st], &e[
+ st], &work[vt + st1], n, &work[nwork], n, &b[st +
+ b_dim1], ldb, &work[nwork], info);
+ if (*info != 0) {
+ return 0;
+ }
+ slacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx +
+ st1], n);
+ } else {
+
+/* A large problem. Solve it using divide and conquer. */
+
+ slasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], &
+ work[u + st1], n, &work[vt + st1], &iwork[k + st1], &
+ work[difl + st1], &work[difr + st1], &work[z__ + st1],
+ &work[poles + st1], &iwork[givptr + st1], &iwork[
+ givcol + st1], n, &iwork[perm + st1], &work[givnum +
+ st1], &work[c__ + st1], &work[s + st1], &work[nwork],
+ &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ bxst = bx + st1;
+ slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, &
+ work[bxst], n, &work[u + st1], n, &work[vt + st1], &
+ iwork[k + st1], &work[difl + st1], &work[difr + st1],
+ &work[z__ + st1], &work[poles + st1], &iwork[givptr +
+ st1], &iwork[givcol + st1], n, &iwork[perm + st1], &
+ work[givnum + st1], &work[c__ + st1], &work[s + st1],
+ &work[nwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+ st = i__ + 1;
+ }
+/* L60: */
+ }
+
+/* Apply the singular values and treat the tiny ones as zero. */
+
+ tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Some of the elements in D can be negative because 1-by-1
+ subproblems were not solved explicitly.
+*/
+
+ if ((r__1 = d__[i__], dabs(r__1)) <= tol) {
+ slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &work[bx + i__ - 1], n);
+ } else {
+ ++(*rank);
+ slascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &work[
+ bx + i__ - 1], n, info);
+ }
+ d__[i__] = (r__1 = d__[i__], dabs(r__1));
+/* L70: */
+ }
+
+/* Now apply back the right singular vectors. */
+
+ icmpq2 = 1;
+ i__1 = nsub;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ st = iwork[i__];
+ st1 = st - 1;
+ nsize = iwork[sizei + i__ - 1];
+ bxst = bx + st1;
+ if (nsize == 1) {
+ scopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb);
+ } else if (nsize <= *smlsiz) {
+ sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b15, &work[vt + st1], n,
+ &work[bxst], n, &c_b29, &b[st + b_dim1], ldb);
+ } else {
+ slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st +
+ b_dim1], ldb, &work[u + st1], n, &work[vt + st1], &iwork[
+ k + st1], &work[difl + st1], &work[difr + st1], &work[z__
+ + st1], &work[poles + st1], &iwork[givptr + st1], &iwork[
+ givcol + st1], n, &iwork[perm + st1], &work[givnum + st1],
+ &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[
+ iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+/* L80: */
+ }
+
+/* Unscale and sort the singular values. */
+
+ slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, info);
+ slasrt_("D", n, &d__[1], info);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset], ldb,
+ info);
+
+ return 0;
+
+/* End of SLALSD */
+
+} /* slalsd_ */
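+
+/*
+   Illustrative usage sketch for SLALSD (compiled out; not part of the
+   original LAPACK translation). A minimal call on a hypothetical 2-by-2
+   upper bidiagonal system, assuming a typical blocking size SMLSIZ = 25;
+   the WORK/IWORK sizes follow the formulas documented above (796 and 22
+   for these dimensions), rounded up.
+*/
+#if 0
+static void slalsd_example(void)
+{
+    integer smlsiz = 25, n = 2, nrhs = 1, ldb = 2, rank, info;
+    integer iwork[32];            /* >= 3*N*NLVL + 11*N = 22 here       */
+    static real work[800];        /* >= 796 by the WORK formula above   */
+    real d[2] = { 3.f, 4.f };     /* main diagonal                      */
+    real e[1] = { .5f };          /* super-diagonal                     */
+    real b[2] = { 1.f, 2.f };     /* right hand side                    */
+    real rcond = -1.f;            /* negative => use machine precision  */
+
+    slalsd_("U", &smlsiz, &n, &nrhs, d, e, b, &ldb, &rcond, &rank,
+            work, iwork, &info);
+    /* On success (info == 0): b holds the minimum-norm least squares
+       solution, d holds the singular values in ascending order, and
+       rank is the number of singular values above the threshold. */
+}
+#endif
+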
+
+/* Subroutine */ int slamrg_(integer *n1, integer *n2, real *a, integer *
+ strd1, integer *strd2, integer *index)
+{
+ /* System generated locals */
+ integer i__1;
+
+ /* Local variables */
+ static integer i__, ind1, ind2, n1sv, n2sv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAMRG will create a permutation list which will merge the elements
+ of A (which is composed of two independently sorted sets) into a
+ single set which is sorted in ascending order.
+
+ Arguments
+ =========
+
+ N1 (input) INTEGER
+ N2 (input) INTEGER
+          These arguments contain the respective lengths of the two
+ sorted lists to be merged.
+
+ A (input) REAL array, dimension (N1+N2)
+ The first N1 elements of A contain a list of numbers which
+ are sorted in either ascending or descending order. Likewise
+ for the final N2 elements.
+
+ STRD1 (input) INTEGER
+ STRD2 (input) INTEGER
+ These are the strides to be taken through the array A.
+ Allowable strides are 1 and -1. They indicate whether a
+ subset of A is sorted in ascending (STRDx = 1) or descending
+ (STRDx = -1) order.
+
+ INDEX (output) INTEGER array, dimension (N1+N2)
+ On exit this array will contain a permutation such that
+ if B( I ) = A( INDEX( I ) ) for I=1,N1+N2, then B will be
+ sorted in ascending order.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --index;
+ --a;
+
+ /* Function Body */
+ n1sv = *n1;
+ n2sv = *n2;
+ if (*strd1 > 0) {
+ ind1 = 1;
+ } else {
+ ind1 = *n1;
+ }
+ if (*strd2 > 0) {
+ ind2 = *n1 + 1;
+ } else {
+ ind2 = *n1 + *n2;
+ }
+ i__ = 1;
+/* while ( (N1SV > 0) & (N2SV > 0) ) */
+L10:
+ if (n1sv > 0 && n2sv > 0) {
+ if (a[ind1] <= a[ind2]) {
+ index[i__] = ind1;
+ ++i__;
+ ind1 += *strd1;
+ --n1sv;
+ } else {
+ index[i__] = ind2;
+ ++i__;
+ ind2 += *strd2;
+ --n2sv;
+ }
+ goto L10;
+ }
+/* end while */
+ if (n1sv == 0) {
+ i__1 = n2sv;
+ for (n1sv = 1; n1sv <= i__1; ++n1sv) {
+ index[i__] = ind2;
+ ++i__;
+ ind2 += *strd2;
+/* L20: */
+ }
+ } else {
+/* N2SV .EQ. 0 */
+ i__1 = n1sv;
+ for (n2sv = 1; n2sv <= i__1; ++n2sv) {
+ index[i__] = ind1;
+ ++i__;
+ ind1 += *strd1;
+/* L30: */
+ }
+ }
+
+ return 0;
+
+/* End of SLAMRG */
+
+} /* slamrg_ */
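+
+/*
+   Illustrative usage sketch for SLAMRG (compiled out; not part of the
+   original LAPACK translation). A minimal call on hypothetical data: two
+   ascending runs of length 3 are merged into a single 1-based permutation,
+   with every argument passed by pointer as in the f2c calling convention.
+*/
+#if 0
+static void slamrg_example(void)
+{
+    integer n1 = 3, n2 = 3, strd1 = 1, strd2 = 1, idx[6];
+    real a[6] = { 1.f, 4.f, 9.f,      /* first sorted run  */
+                  2.f, 3.f, 8.f };    /* second sorted run */
+
+    slamrg_(&n1, &n2, a, &strd1, &strd2, idx);
+    /* idx is now {1, 4, 5, 2, 6, 3}: reading a[idx[i] - 1] for i = 0..5
+       visits the six values in ascending order 1, 2, 3, 4, 8, 9. */
+}
+#endif
+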
+
+doublereal slange_(char *norm, integer *m, integer *n, real *a, integer *lda,
+ real *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real ret_val, r__1, r__2, r__3;
+
+ /* Local variables */
+ static integer i__, j;
+ static real sum, scale;
+ extern logical lsame_(char *, char *);
+ static real value;
+ extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *,
+ real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLANGE returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ real matrix A.
+
+ Description
+ ===========
+
+ SLANGE returns the value
+
+ SLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in SLANGE as described
+ above.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0. When M = 0,
+ SLANGE is set to zero.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0. When N = 0,
+ SLANGE is set to zero.
+
+ A (input) REAL array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(M,1).
+
+ WORK (workspace) REAL array, dimension (MAX(1,LWORK)),
+ where LWORK >= M when NORM = 'I'; otherwise, WORK is not
+ referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (min(*m,*n) == 0) {
+ value = 0.f;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ r__2 = value, r__3 = (r__1 = a[i__ + j * a_dim1], dabs(r__1));
+ value = dmax(r__2,r__3);
+/* L10: */
+ }
+/* L20: */
+ }
+ } else if (lsame_(norm, "O") || *(unsigned char *)
+ norm == '1') {
+
+/* Find norm1(A). */
+
+ value = 0.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.f;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ sum += (r__1 = a[i__ + j * a_dim1], dabs(r__1));
+/* L30: */
+ }
+ value = dmax(value,sum);
+/* L40: */
+ }
+ } else if (lsame_(norm, "I")) {
+
+/* Find normI(A). */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.f;
+/* L50: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[i__] += (r__1 = a[i__ + j * a_dim1], dabs(r__1));
+/* L60: */
+ }
+/* L70: */
+ }
+ value = 0.f;
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = work[i__];
+ value = dmax(r__1,r__2);
+/* L80: */
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.f;
+ sum = 1.f;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ slassq_(m, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L90: */
+ }
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of SLANGE */
+
+} /* slange_ */
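+
+/*
+   Illustrative usage sketch for SLANGE (compiled out; not part of the
+   original LAPACK translation). A minimal call on a hypothetical 2-by-2
+   column-major matrix; SLANST and SLANSY below follow the same calling
+   pattern. WORK is only referenced for the infinity norm.
+*/
+#if 0
+static void slange_example(void)
+{
+    integer m = 2, n = 2, lda = 2;
+    real a[4] = { 1.f, 2.f,           /* first column:  A(1,1)=1, A(2,1)=2 */
+                  3.f, 4.f };         /* second column: A(1,2)=3, A(2,2)=4 */
+    real work[2];                     /* used only when NORM = 'I'         */
+
+    doublereal maxabs = slange_("M", &m, &n, a, &lda, work); /* 4          */
+    doublereal frob   = slange_("F", &m, &n, a, &lda, work); /* sqrt(30)   */
+    doublereal infnrm = slange_("I", &m, &n, a, &lda, work); /* 2 + 4 = 6  */
+}
+#endif
+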
+
+doublereal slanst_(char *norm, integer *n, real *d__, real *e)
+{
+ /* System generated locals */
+ integer i__1;
+ real ret_val, r__1, r__2, r__3, r__4, r__5;
+
+ /* Local variables */
+ static integer i__;
+ static real sum, scale;
+ extern logical lsame_(char *, char *);
+ static real anorm;
+ extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *,
+ real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLANST returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ real symmetric tridiagonal matrix A.
+
+ Description
+ ===========
+
+ SLANST returns the value
+
+ SLANST = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in SLANST as described
+ above.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0. When N = 0, SLANST is
+ set to zero.
+
+ D (input) REAL array, dimension (N)
+ The diagonal elements of A.
+
+ E (input) REAL array, dimension (N-1)
+ The (n-1) sub-diagonal or super-diagonal elements of A.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --e;
+ --d__;
+
+ /* Function Body */
+ if (*n <= 0) {
+ anorm = 0.f;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ anorm = (r__1 = d__[*n], dabs(r__1));
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__2 = anorm, r__3 = (r__1 = d__[i__], dabs(r__1));
+ anorm = dmax(r__2,r__3);
+/* Computing MAX */
+ r__2 = anorm, r__3 = (r__1 = e[i__], dabs(r__1));
+ anorm = dmax(r__2,r__3);
+/* L10: */
+ }
+ } else if (lsame_(norm, "O") || *(unsigned char *)
+ norm == '1' || lsame_(norm, "I")) {
+
+/* Find norm1(A). */
+
+ if (*n == 1) {
+ anorm = dabs(d__[1]);
+ } else {
+/* Computing MAX */
+ r__3 = dabs(d__[1]) + dabs(e[1]), r__4 = (r__1 = e[*n - 1], dabs(
+ r__1)) + (r__2 = d__[*n], dabs(r__2));
+ anorm = dmax(r__3,r__4);
+ i__1 = *n - 1;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__4 = anorm, r__5 = (r__1 = d__[i__], dabs(r__1)) + (r__2 =
+ e[i__], dabs(r__2)) + (r__3 = e[i__ - 1], dabs(r__3));
+ anorm = dmax(r__4,r__5);
+/* L20: */
+ }
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.f;
+ sum = 1.f;
+ if (*n > 1) {
+ i__1 = *n - 1;
+ slassq_(&i__1, &e[1], &c__1, &scale, &sum);
+ sum *= 2;
+ }
+ slassq_(n, &d__[1], &c__1, &scale, &sum);
+ anorm = scale * sqrt(sum);
+ }
+
+ ret_val = anorm;
+ return ret_val;
+
+/* End of SLANST */
+
+} /* slanst_ */
+
+doublereal slansy_(char *norm, char *uplo, integer *n, real *a, integer *lda,
+ real *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real ret_val, r__1, r__2, r__3;
+
+ /* Local variables */
+ static integer i__, j;
+ static real sum, absa, scale;
+ extern logical lsame_(char *, char *);
+ static real value;
+ extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *,
+ real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLANSY returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ real symmetric matrix A.
+
+ Description
+ ===========
+
+ SLANSY returns the value
+
+ SLANSY = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in SLANSY as described
+ above.
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is to be referenced.
+ = 'U': Upper triangular part of A is referenced
+ = 'L': Lower triangular part of A is referenced
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0. When N = 0, SLANSY is
+ set to zero.
+
+ A (input) REAL array, dimension (LDA,N)
+ The symmetric matrix A. If UPLO = 'U', the leading n by n
+ upper triangular part of A contains the upper triangular part
+ of the matrix A, and the strictly lower triangular part of A
+ is not referenced. If UPLO = 'L', the leading n by n lower
+ triangular part of A contains the lower triangular part of
+ the matrix A, and the strictly upper triangular part of A is
+ not referenced.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(N,1).
+
+ WORK (workspace) REAL array, dimension (MAX(1,LWORK)),
+ where LWORK >= N when NORM = 'I' or '1' or 'O'; otherwise,
+ WORK is not referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (*n == 0) {
+ value = 0.f;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.f;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ r__2 = value, r__3 = (r__1 = a[i__ + j * a_dim1], dabs(
+ r__1));
+ value = dmax(r__2,r__3);
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = j; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ r__2 = value, r__3 = (r__1 = a[i__ + j * a_dim1], dabs(
+ r__1));
+ value = dmax(r__2,r__3);
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(norm, "I") || lsame_(norm, "O") || *(unsigned char *)norm == '1') {
+
+/* Find normI(A) ( = norm1(A), since A is symmetric). */
+
+ value = 0.f;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.f;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ absa = (r__1 = a[i__ + j * a_dim1], dabs(r__1));
+ sum += absa;
+ work[i__] += absa;
+/* L50: */
+ }
+ work[j] = sum + (r__1 = a[j + j * a_dim1], dabs(r__1));
+/* L60: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__1 = value, r__2 = work[i__];
+ value = dmax(r__1,r__2);
+/* L70: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.f;
+/* L80: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = work[j] + (r__1 = a[j + j * a_dim1], dabs(r__1));
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ absa = (r__1 = a[i__ + j * a_dim1], dabs(r__1));
+ sum += absa;
+ work[i__] += absa;
+/* L90: */
+ }
+ value = dmax(value,sum);
+/* L100: */
+ }
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.f;
+ sum = 1.f;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ i__2 = j - 1;
+ slassq_(&i__2, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L110: */
+ }
+ } else {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n - j;
+ slassq_(&i__2, &a[j + 1 + j * a_dim1], &c__1, &scale, &sum);
+/* L120: */
+ }
+ }
+ sum *= 2;
+ i__1 = *lda + 1;
+ slassq_(n, &a[a_offset], &i__1, &scale, &sum);
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of SLANSY */
+
+} /* slansy_ */
+
+/* Subroutine */ int slanv2_(real *a, real *b, real *c__, real *d__, real *
+ rt1r, real *rt1i, real *rt2r, real *rt2i, real *cs, real *sn)
+{
+ /* System generated locals */
+ real r__1, r__2;
+
+ /* Local variables */
+ static real p, z__, aa, bb, cc, dd, cs1, sn1, sab, sac, eps, tau, temp,
+ scale, bcmax, bcmis, sigma;
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLANV2 computes the Schur factorization of a real 2-by-2 nonsymmetric
+ matrix in standard form:
+
+ [ A B ] = [ CS -SN ] [ AA BB ] [ CS SN ]
+ [ C D ] [ SN CS ] [ CC DD ] [-SN CS ]
+
+ where either
+ 1) CC = 0 so that AA and DD are real eigenvalues of the matrix, or
+ 2) AA = DD and BB*CC < 0, so that AA + or - sqrt(BB*CC) are complex
+ conjugate eigenvalues.
+
+ Arguments
+ =========
+
+ A (input/output) REAL
+ B (input/output) REAL
+ C (input/output) REAL
+ D (input/output) REAL
+ On entry, the elements of the input matrix.
+ On exit, they are overwritten by the elements of the
+ standardised Schur form.
+
+ RT1R (output) REAL
+ RT1I (output) REAL
+ RT2R (output) REAL
+ RT2I (output) REAL
+ The real and imaginary parts of the eigenvalues. If the
+ eigenvalues are a complex conjugate pair, RT1I > 0.
+
+ CS (output) REAL
+ SN (output) REAL
+ Parameters of the rotation matrix.
+
+ Further Details
+ ===============
+
+ Modified by V. Sima, Research Institute for Informatics, Bucharest,
+ Romania, to reduce the risk of cancellation errors,
+ when computing real eigenvalues, and to ensure, if possible, that
+ abs(RT1R) >= abs(RT2R).
+
+ =====================================================================
+*/
+
+
+ eps = slamch_("P");
+ if (*c__ == 0.f) {
+ *cs = 1.f;
+ *sn = 0.f;
+ goto L10;
+
+ } else if (*b == 0.f) {
+
+/* Swap rows and columns */
+
+ *cs = 0.f;
+ *sn = 1.f;
+ temp = *d__;
+ *d__ = *a;
+ *a = temp;
+ *b = -(*c__);
+ *c__ = 0.f;
+ goto L10;
+ } else if (*a - *d__ == 0.f && r_sign(&c_b15, b) != r_sign(&c_b15, c__)) {
+ *cs = 1.f;
+ *sn = 0.f;
+ goto L10;
+ } else {
+
+ temp = *a - *d__;
+ p = temp * .5f;
+/* Computing MAX */
+ r__1 = dabs(*b), r__2 = dabs(*c__);
+ bcmax = dmax(r__1,r__2);
+/* Computing MIN */
+ r__1 = dabs(*b), r__2 = dabs(*c__);
+ bcmis = dmin(r__1,r__2) * r_sign(&c_b15, b) * r_sign(&c_b15, c__);
+/* Computing MAX */
+ r__1 = dabs(p);
+ scale = dmax(r__1,bcmax);
+ z__ = p / scale * p + bcmax / scale * bcmis;
+
+/*
+ If Z is of the order of the machine accuracy, postpone the
+ decision on the nature of eigenvalues
+*/
+
+ if (z__ >= eps * 4.f) {
+
+/* Real eigenvalues. Compute A and D. */
+
+ r__1 = sqrt(scale) * sqrt(z__);
+ z__ = p + r_sign(&r__1, &p);
+ *a = *d__ + z__;
+ *d__ -= bcmax / z__ * bcmis;
+
+/* Compute B and the rotation matrix */
+
+ tau = slapy2_(c__, &z__);
+ *cs = z__ / tau;
+ *sn = *c__ / tau;
+ *b -= *c__;
+ *c__ = 0.f;
+ } else {
+
+/*
+ Complex eigenvalues, or real (almost) equal eigenvalues.
+ Make diagonal elements equal.
+*/
+
+ sigma = *b + *c__;
+ tau = slapy2_(&sigma, &temp);
+ *cs = sqrt((dabs(sigma) / tau + 1.f) * .5f);
+ *sn = -(p / (tau * *cs)) * r_sign(&c_b15, &sigma);
+
+/*
+ Compute [ AA BB ] = [ A B ] [ CS -SN ]
+ [ CC DD ] [ C D ] [ SN CS ]
+*/
+
+ aa = *a * *cs + *b * *sn;
+ bb = -(*a) * *sn + *b * *cs;
+ cc = *c__ * *cs + *d__ * *sn;
+ dd = -(*c__) * *sn + *d__ * *cs;
+
+/*
+ Compute [ A B ] = [ CS SN ] [ AA BB ]
+ [ C D ] [-SN CS ] [ CC DD ]
+*/
+
+ *a = aa * *cs + cc * *sn;
+ *b = bb * *cs + dd * *sn;
+ *c__ = -aa * *sn + cc * *cs;
+ *d__ = -bb * *sn + dd * *cs;
+
+ temp = (*a + *d__) * .5f;
+ *a = temp;
+ *d__ = temp;
+
+ if (*c__ != 0.f) {
+ if (*b != 0.f) {
+ if (r_sign(&c_b15, b) == r_sign(&c_b15, c__)) {
+
+/* Real eigenvalues: reduce to upper triangular form */
+
+ sab = sqrt((dabs(*b)));
+ sac = sqrt((dabs(*c__)));
+ r__1 = sab * sac;
+ p = r_sign(&r__1, c__);
+ tau = 1.f / sqrt((r__1 = *b + *c__, dabs(r__1)));
+ *a = temp + p;
+ *d__ = temp - p;
+ *b -= *c__;
+ *c__ = 0.f;
+ cs1 = sab * tau;
+ sn1 = sac * tau;
+ temp = *cs * cs1 - *sn * sn1;
+ *sn = *cs * sn1 + *sn * cs1;
+ *cs = temp;
+ }
+ } else {
+ *b = -(*c__);
+ *c__ = 0.f;
+ temp = *cs;
+ *cs = -(*sn);
+ *sn = temp;
+ }
+ }
+ }
+
+ }
+
+L10:
+
+/* Store eigenvalues in (RT1R,RT1I) and (RT2R,RT2I). */
+
+ *rt1r = *a;
+ *rt2r = *d__;
+ if (*c__ == 0.f) {
+ *rt1i = 0.f;
+ *rt2i = 0.f;
+ } else {
+ *rt1i = sqrt((dabs(*b))) * sqrt((dabs(*c__)));
+ *rt2i = -(*rt1i);
+ }
+ return 0;
+
+/* End of SLANV2 */
+
+} /* slanv2_ */
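+
+/*
+   Illustrative usage sketch for SLANV2 (compiled out; not part of the
+   original LAPACK translation). A minimal call on the hypothetical 2-by-2
+   matrix [0 1; -1 0], whose eigenvalues are the complex conjugate pair
+   +i and -i.
+*/
+#if 0
+static void slanv2_example(void)
+{
+    real a = 0.f, b = 1.f, c = -1.f, d = 0.f;
+    real rt1r, rt1i, rt2r, rt2i, cs, sn;
+
+    slanv2_(&a, &b, &c, &d, &rt1r, &rt1i, &rt2r, &rt2i, &cs, &sn);
+    /* The input is already in standard form, so a, b, c, d are unchanged
+       and (cs, sn) = (1, 0); the eigenvalues come back as
+       (rt1r, rt1i) = (0, 1) and (rt2r, rt2i) = (0, -1). */
+}
+#endif
+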
+
+doublereal slapy2_(real *x, real *y)
+{
+ /* System generated locals */
+ real ret_val, r__1;
+
+ /* Local variables */
+ static real w, z__, xabs, yabs;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAPY2 returns sqrt(x**2+y**2), taking care not to cause unnecessary
+ overflow.
+
+ Arguments
+ =========
+
+ X (input) REAL
+ Y (input) REAL
+ X and Y specify the values x and y.
+
+ =====================================================================
+*/
+
+
+ xabs = dabs(*x);
+ yabs = dabs(*y);
+ w = dmax(xabs,yabs);
+ z__ = dmin(xabs,yabs);
+ if (z__ == 0.f) {
+ ret_val = w;
+ } else {
+/* Computing 2nd power */
+ r__1 = z__ / w;
+ ret_val = w * sqrt(r__1 * r__1 + 1.f);
+ }
+ return ret_val;
+
+/* End of SLAPY2 */
+
+} /* slapy2_ */
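+
+/*
+   Illustrative usage sketch for SLAPY2 (compiled out; not part of the
+   original LAPACK translation). The hypothetical inputs below would
+   overflow single precision if x*x + y*y were formed directly
+   (9e60 + 1.6e61 far exceeds FLT_MAX); the scaled form w*sqrt((z/w)^2 + 1)
+   returns 5e30 without any overflowing intermediate. SLAPY3 extends the
+   same idea to three arguments.
+*/
+#if 0
+static void slapy2_example(void)
+{
+    real x = 3e30f, y = 4e30f;
+    doublereal r = slapy2_(&x, &y);   /* 5e30, no intermediate overflow */
+}
+#endif
+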
+
+doublereal slapy3_(real *x, real *y, real *z__)
+{
+ /* System generated locals */
+ real ret_val, r__1, r__2, r__3;
+
+ /* Local variables */
+ static real w, xabs, yabs, zabs;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAPY3 returns sqrt(x**2+y**2+z**2), taking care not to cause
+ unnecessary overflow.
+
+ Arguments
+ =========
+
+ X (input) REAL
+ Y (input) REAL
+ Z (input) REAL
+ X, Y and Z specify the values x, y and z.
+
+ =====================================================================
+*/
+
+
+ xabs = dabs(*x);
+ yabs = dabs(*y);
+ zabs = dabs(*z__);
+/* Computing MAX */
+ r__1 = max(xabs,yabs);
+ w = dmax(r__1,zabs);
+ if (w == 0.f) {
+/*
+   W can be zero for max(0,nan,0);
+   adding all three entries together makes sure
+   that NaN will not disappear.
+*/
+ ret_val = xabs + yabs + zabs;
+ } else {
+/* Computing 2nd power */
+ r__1 = xabs / w;
+/* Computing 2nd power */
+ r__2 = yabs / w;
+/* Computing 2nd power */
+ r__3 = zabs / w;
+ ret_val = w * sqrt(r__1 * r__1 + r__2 * r__2 + r__3 * r__3);
+ }
+ return ret_val;
+
+/* End of SLAPY3 */
+
+} /* slapy3_ */
+
+/* Subroutine */ int slaqr0_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real *
+ wi, integer *iloz, integer *ihiz, real *z__, integer *ldz, real *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1, r__2, r__3, r__4;
+
+ /* Local variables */
+ static integer i__, k;
+ static real aa, bb, cc, dd;
+ static integer ld;
+ static real cs;
+ static integer nh, it, ks, kt;
+ static real sn;
+ static integer ku, kv, ls, ns;
+ static real ss;
+ static integer nw, inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl,
+ kbot, nmin;
+ static real swap;
+ static integer ktop;
+ static real zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real *
+ , real *, real *, real *, real *, real *), slaqr3_(logical *,
+ logical *, integer *, integer *, integer *, integer *, real *,
+ integer *, integer *, integer *, real *, integer *, integer *,
+ integer *, real *, real *, real *, integer *, integer *, real *,
+ integer *, integer *, real *, integer *, real *, integer *),
+ slaqr4_(logical *, logical *, integer *, integer *, integer *,
+ real *, integer *, real *, real *, integer *, integer *, real *,
+ integer *, real *, integer *, integer *), slaqr5_(logical *,
+ logical *, integer *, integer *, integer *, integer *, integer *,
+ real *, real *, real *, integer *, integer *, integer *, real *,
+ integer *, real *, integer *, real *, integer *, integer *, real *
+ , integer *, integer *, real *, integer *);
+ static integer nibble;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ extern /* Subroutine */ int slahqr_(logical *, logical *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , integer *, real *, integer *, integer *), slacpy_(char *,
+ integer *, integer *, real *, integer *, real *, integer *);
+ static integer nwupbd;
+ static logical sorted;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAQR0 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+ Schur form), and Z is the orthogonal matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input orthogonal
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to SGEBAL, and then passed to SGEHRD when the
+ matrix output by SGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) REAL array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H contains
+ the upper quasi-triangular matrix T from the Schur
+ decomposition (the Schur form); 2-by-2 diagonal blocks
+ (corresponding to complex conjugate pairs of eigenvalues)
+ are returned in standard form, with H(i,i) = H(i+1,i+1)
+ and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ WR (output) REAL array, dimension (IHI)
+ WI (output) REAL array, dimension (IHI)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues of H(ILO:IHI,ILO:IHI) are stored in WR(ILO:IHI)
+ and WI(ILO:IHI). If two eigenvalues are computed as a
+ complex conjugate pair, they are stored in consecutive
+ elements of WR and WI, say the i-th and (i+1)th, with
+ WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+ the eigenvalues are stored in the same order as on the
+ diagonal of the Schur form returned in H, with
+ WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
+ block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
+ WI(i+1) = -WI(i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+
+ Z (input/output) REAL array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if WANTZ is .TRUE.
+ then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) REAL array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient, but for optimal performance LWORK may
+ need to be as large as 6*N. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then SLAQR0 does a workspace query.
+ In this case, SLAQR0 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, SLAQR0 failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
+ and WI contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and quasi-triangular
+ in rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+ (final value of Z(ILO:IHI,ILOZ:IHIZ))
+ = (initial value of Z(ILO:IHI,ILOZ:IHIZ))*U
+
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . SLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constants WILK1 and WILK2 are used to form the
+ . exceptional shifts. ====
+*/
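+
+/*
+   Editorial note (not part of the LAPACK/f2c source): the LWORK = -1
+   convention documented above is normally used as a two-call sequence.
+   A minimal sketch from a hypothetical caller (names are illustrative):
+
+       integer lwork = -1, info;
+       real wkopt;
+       slaqr0_(&wantt, &wantz, &n, &ilo, &ihi, h, &ldh, wr, wi,
+               &iloz, &ihiz, z, &ldz, &wkopt, &lwork, &info);
+       lwork = (integer) wkopt;            (optimum returned in WORK(1))
+       work = malloc(lwork * sizeof(real));
+       slaqr0_(&wantt, &wantz, &n, &ilo, &ihi, h, &ldh, wr, wi,
+               &iloz, &ihiz, z, &ldz, work, &lwork, &info);
+*/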
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use SLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ slahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1], &
+ wi[1], iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "SLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+ . At this point N .GT. NTINY = 11, so there is at
+ . least enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "SLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to SLAQR3 ====
+*/
+
+ i__1 = nwr + 1;
+ slaqr3_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1], &h__[
+ h_offset], ldh, n, &h__[h_offset], ldh, n, &h__[h_offset],
+ ldh, &work[1], &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(SLAQR5, SLAQR3) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1];
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (real) lwkopt;
+ return 0;
+ }
+
+/* ==== SLAHQR/SLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "SLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "SLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+ ==== Accumulate reflections during the QR sweep? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "SLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the Largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L90;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (h__[k + (k - 1) * h_dim1] == 0.f) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+ . size. At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ if ((r__1 = h__[kwtop + (kwtop - 1) * h_dim1], dabs(r__1))
+ > (r__2 = h__[kwtop - 1 + (kwtop - 2) * h_dim1],
+ dabs(r__2))) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+ . - an at-least-NW-but-more-is-better (NHV-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ slaqr3_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1],
+ &h__[kv + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
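+
+/*
+   Editorial note (not part of the LAPACK/f2c source): in the test below the
+   QR sweep is performed when LD == 0 (nothing deflated) or when at most
+   NIBBLE percent of the window deflated (100*LD <= NIBBLE*NW) and the active
+   block is still larger than min(NMIN,NWMAX); otherwise the sweep is skipped
+   for this iteration.
+*/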
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if SLAQR3
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . SLAQR3 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+/* Computing MAX */
+ i__3 = ks + 1, i__4 = ktop + 2;
+ i__2 = max(i__3,i__4);
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ ss = (r__1 = h__[i__ + (i__ - 1) * h_dim1], dabs(r__1)
+ ) + (r__2 = h__[i__ - 1 + (i__ - 2) * h_dim1],
+ dabs(r__2));
+ aa = ss * .75f + h__[i__ + i__ * h_dim1];
+ bb = ss;
+ cc = ss * -.4375f;
+ dd = aa;
+ slanv2_(&aa, &bb, &cc, &dd, &wr[i__ - 1], &wi[i__ - 1]
+ , &wr[i__], &wi[i__], &cs, &sn);
+/* L30: */
+ }
+ if (ks == ktop) {
+ wr[ks + 1] = h__[ks + 1 + (ks + 1) * h_dim1];
+ wi[ks + 1] = 0.f;
+ wr[ks] = wr[ks + 1];
+ wi[ks] = wi[ks + 1];
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use SLAQR4 or
+ . SLAHQR on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ slacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ if (ns > nmin) {
+ slaqr4_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &wr[ks], &wi[ks], &
+ c__1, &c__1, zdum, &c__1, &work[1], lwork,
+ &inf);
+ } else {
+ slahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &wr[ks], &wi[ks], &
+ c__1, &c__1, zdum, &c__1, &inf);
+ }
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. ====
+*/
+
+ if (ks >= kbot) {
+ aa = h__[kbot - 1 + (kbot - 1) * h_dim1];
+ cc = h__[kbot + (kbot - 1) * h_dim1];
+ bb = h__[kbot - 1 + kbot * h_dim1];
+ dd = h__[kbot + kbot * h_dim1];
+ slanv2_(&aa, &bb, &cc, &dd, &wr[kbot - 1], &wi[
+ kbot - 1], &wr[kbot], &wi[kbot], &cs, &sn)
+ ;
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/*
+ ==== Sort the shifts (Helps a little)
+ . Bubble sort keeps complex conjugate
+ . pairs together. ====
+*/
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ if ((r__1 = wr[i__], dabs(r__1)) + (r__2 = wi[
+ i__], dabs(r__2)) < (r__3 = wr[i__ +
+ 1], dabs(r__3)) + (r__4 = wi[i__ + 1],
+ dabs(r__4))) {
+ sorted = FALSE_;
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ + 1];
+ wr[i__ + 1] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ + 1];
+ wi[i__ + 1] = swap;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+
+/*
+ ==== Shuffle shifts into pairs of real shifts
+ . and pairs of complex conjugate shifts
+ . assuming complex conjugate shifts are
+ . already adjacent to one another. (Yes,
+ . they are.) ====
+*/
+
+ i__2 = ks + 2;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ if (wi[i__] != -wi[i__ - 1]) {
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ - 1];
+ wr[i__ - 1] = wr[i__ - 2];
+ wr[i__ - 2] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ - 1];
+ wi[i__ - 1] = wi[i__ - 2];
+ wi[i__ - 2] = swap;
+ }
+/* L70: */
+ }
+ }
+
+/*
+ ==== If there are only two shifts and both are
+ . real, then use only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ if (wi[kbot] == 0.f) {
+ if ((r__1 = wr[kbot] - h__[kbot + kbot * h_dim1],
+ dabs(r__1)) < (r__2 = wr[kbot - 1] - h__[kbot
+ + kbot * h_dim1], dabs(r__2))) {
+ wr[kbot - 1] = wr[kbot];
+ } else {
+ wr[kbot] = wr[kbot - 1];
+ }
+ }
+ }
+
+/*
+ ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+ . (KDU-by-NHO) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+ . (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ slaqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &wr[ks],
+ &wi[ks], &h__[h_offset], ldh, iloz, ihiz, &z__[
+ z_offset], ldz, &work[1], &c__3, &h__[ku + h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &nho, &h__[ku +
+ kwh * h_dim1], ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L80:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L90:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ work[1] = (real) lwkopt;
+
+/* ==== End of SLAQR0 ==== */
+
+ return 0;
+} /* slaqr0_ */
+
+/* Subroutine */ int slaqr1_(integer *n, real *h__, integer *ldh, real *sr1,
+ real *si1, real *sr2, real *si2, real *v)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset;
+ real r__1, r__2, r__3;
+
+ /* Local variables */
+ static real s, h21s, h31s;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Given a 2-by-2 or 3-by-3 matrix H, SLAQR1 sets v to a
+ scalar multiple of the first column of the product
+
+ (*) K = (H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I)
+
+ scaling to avoid overflows and most underflows. It
+ is assumed that either
+
+ 1) sr1 = sr2 and si1 = -si2
+ or
+ 2) si1 = si2 = 0.
+
+ This is useful for starting double implicit shift bulges
+ in the QR algorithm.
+
+
+ N (input) integer
+ Order of the matrix H. N must be either 2 or 3.
+
+ H (input) REAL array of dimension (LDH,N)
+ The 2-by-2 or 3-by-3 matrix H in (*).
+
+ LDH (input) integer
+ The leading dimension of H as declared in
+ the calling procedure. LDH.GE.N
+
+ SR1 (input) REAL
+ SI1 The shifts in (*).
+ SR2
+ SI2
+
+ V (output) REAL array of dimension N
+ A scalar multiple of the first column of the
+ matrix K in (*).
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+*/
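+
+/*
+   Editorial note (not part of the LAPACK/f2c source): for the N = 2 branch
+   below, and under the stated assumption sr1 = sr2 and si1 = -si2 (or
+   si1 = si2 = 0), the first column of K = (H - s1*I)*(H - s2*I) is real:
+
+       K(1,1) = (h11 - sr1)*(h11 - sr2) - si1*si2 + h12*h21
+       K(2,1) = h21*(h11 + h22 - sr1 - sr2)
+
+   The code computes these two entries scaled by 1/s, with
+   s = |h11 - sr2| + |si2| + |h21|, to avoid overflow.
+*/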
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --v;
+
+ /* Function Body */
+ if (*n == 2) {
+ s = (r__1 = h__[h_dim1 + 1] - *sr2, dabs(r__1)) + dabs(*si2) + (r__2 =
+ h__[h_dim1 + 2], dabs(r__2));
+ if (s == 0.f) {
+ v[1] = 0.f;
+ v[2] = 0.f;
+ } else {
+ h21s = h__[h_dim1 + 2] / s;
+ v[1] = h21s * h__[(h_dim1 << 1) + 1] + (h__[h_dim1 + 1] - *sr1) *
+ ((h__[h_dim1 + 1] - *sr2) / s) - *si1 * (*si2 / s);
+ v[2] = h21s * (h__[h_dim1 + 1] + h__[(h_dim1 << 1) + 2] - *sr1 - *
+ sr2);
+ }
+ } else {
+ s = (r__1 = h__[h_dim1 + 1] - *sr2, dabs(r__1)) + dabs(*si2) + (r__2 =
+ h__[h_dim1 + 2], dabs(r__2)) + (r__3 = h__[h_dim1 + 3], dabs(
+ r__3));
+ if (s == 0.f) {
+ v[1] = 0.f;
+ v[2] = 0.f;
+ v[3] = 0.f;
+ } else {
+ h21s = h__[h_dim1 + 2] / s;
+ h31s = h__[h_dim1 + 3] / s;
+ v[1] = (h__[h_dim1 + 1] - *sr1) * ((h__[h_dim1 + 1] - *sr2) / s)
+ - *si1 * (*si2 / s) + h__[(h_dim1 << 1) + 1] * h21s + h__[
+ h_dim1 * 3 + 1] * h31s;
+ v[2] = h21s * (h__[h_dim1 + 1] + h__[(h_dim1 << 1) + 2] - *sr1 - *
+ sr2) + h__[h_dim1 * 3 + 2] * h31s;
+ v[3] = h31s * (h__[h_dim1 + 1] + h__[h_dim1 * 3 + 3] - *sr1 - *
+ sr2) + h21s * h__[(h_dim1 << 1) + 3];
+ }
+ }
+ return 0;
+} /* slaqr1_ */
+
+/* Subroutine */ int slaqr2_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, real *h__, integer *ldh,
+ integer *iloz, integer *ihiz, real *z__, integer *ldz, integer *ns,
+ integer *nd, real *sr, real *si, real *v, integer *ldv, integer *nh,
+ real *t, integer *ldt, integer *nv, real *wv, integer *ldwv, real *
+ work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static real s, aa, bb, cc, dd, cs, sn;
+ static integer jw;
+ static real evi, evk, foo;
+ static integer kln;
+ static real tau, ulp;
+ static integer lwk1, lwk2;
+ static real beta;
+ static integer kend, kcol, info, ifst, ilst, ltop, krow;
+ static logical bulge;
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), sgemm_(
+ char *, char *, integer *, integer *, integer *, real *, real *,
+ integer *, real *, integer *, real *, real *, integer *);
+ static integer infqr;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ static integer kwtop;
+ extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real *
+ , real *, real *, real *, real *, real *), slabad_(real *, real *)
+ ;
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *);
+ static real safmin;
+ extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *,
+ real *);
+ static real safmax;
+ extern /* Subroutine */ int slahqr_(logical *, logical *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , integer *, real *, integer *, integer *), slacpy_(char *,
+ integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *,
+ real *, integer *);
+ static logical sorted;
+ extern /* Subroutine */ int strexc_(char *, integer *, real *, integer *,
+ real *, integer *, integer *, integer *, real *, integer *), sormhr_(char *, char *, integer *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+ static real smlnum;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- April 2009 --
+
+
+ This subroutine is identical to SLAQR3 except that it avoids
+ recursion by calling SLAHQR instead of SLAQR4.
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+ H and performs an orthogonal similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+ an orthogonal similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the quasi-triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the orthogonal matrix Z is updated so
+ that the orthogonal Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the orthogonal matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) REAL array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by an orthogonal
+ similarity transformation, perturbed, and the returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) REAL array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the orthogonal
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+ The number of unconverged (ie approximate) eigenvalues
+ returned in SR and SI that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SR (output) REAL array, dimension KBOT
+ SI (output) REAL array, dimension KBOT
+ On output, the real and imaginary parts of approximate
+ eigenvalues that may be used for shifts are stored in
+ SR(KBOT-ND-NS+1) through SR(KBOT-ND) and
+ SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively.
+ The real and imaginary parts of converged eigenvalues
+ are stored in SR(KBOT-ND+1) through SR(KBOT) and
+ SI(KBOT-ND+1) through SI(KBOT), respectively.
+
+ V (workspace) REAL array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) REAL array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) REAL array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+ The leading dimension of WV just as declared in the
+ calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) REAL array, dimension LWORK.
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; SLAQR2
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z are accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
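+
+/*
+   Editorial note (not part of the LAPACK/f2c source): a concrete reading of
+   the SR/SI layout documented above, using hypothetical values KBOT = 10,
+   ND = 3 and NS = 4: the usable shifts are returned in SR(4:7)/SI(4:7) and
+   the converged eigenvalues in SR(8:10)/SI(8:10).
+*/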
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sr;
+ --si;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to SGEHRD ==== */
+
+ i__1 = jw - 1;
+ sgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1];
+
+/* ==== Workspace query call to SORMHR ==== */
+
+ i__1 = jw - 1;
+ sormhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1];
+
+/* ==== Optimal workspace ==== */
+
+ lwkopt = jw + max(lwk1,lwk2);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (real) lwkopt;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1] = 1.f;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s = 0.f;
+ } else {
+ s = h__[kwtop + (kwtop - 1) * h_dim1];
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ sr[kwtop] = h__[kwtop + kwtop * h_dim1];
+ si[kwtop] = 0.f;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ r__2 = smlnum, r__3 = ulp * (r__1 = h__[kwtop + kwtop * h_dim1], dabs(
+ r__1));
+ if (dabs(s) <= dmax(r__2,r__3)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = 0.f;
+ }
+ }
+ work[1] = 1.f;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ slacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ scopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ slaset_("A", &jw, &jw, &c_b29, &c_b15, &v[v_offset], ldv);
+ slahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[kwtop],
+ &si[kwtop], &c__1, &jw, &v[v_offset], ldv, &infqr);
+
+/* ==== STREXC needs a clean margin near the diagonal ==== */
+
+ i__1 = jw - 3;
+ for (j = 1; j <= i__1; ++j) {
+ t[j + 2 + j * t_dim1] = 0.f;
+ t[j + 3 + j * t_dim1] = 0.f;
+/* L10: */
+ }
+ if (jw > 2) {
+ t[jw + (jw - 2) * t_dim1] = 0.f;
+ }
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+L20:
+ if (ilst <= *ns) {
+ if (*ns == 1) {
+ bulge = FALSE_;
+ } else {
+ bulge = t[*ns + (*ns - 1) * t_dim1] != 0.f;
+ }
+
+/* ==== Small spike tip test for deflation ==== */
+
+ if (! bulge) {
+
+/* ==== Real eigenvalue ==== */
+
+ foo = (r__1 = t[*ns + *ns * t_dim1], dabs(r__1));
+ if (foo == 0.f) {
+ foo = dabs(s);
+ }
+/* Computing MAX */
+ r__2 = smlnum, r__3 = ulp * foo;
+ if ((r__1 = s * v[*ns * v_dim1 + 1], dabs(r__1)) <= dmax(r__2,
+ r__3)) {
+
+/* ==== Deflatable ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== Undeflatable. Move it up out of the way.
+ . (STREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ strexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ++ilst;
+ }
+ } else {
+
+/* ==== Complex conjugate pair ==== */
+
+ foo = (r__3 = t[*ns + *ns * t_dim1], dabs(r__3)) + sqrt((r__1 = t[
+ *ns + (*ns - 1) * t_dim1], dabs(r__1))) * sqrt((r__2 = t[*
+ ns - 1 + *ns * t_dim1], dabs(r__2)));
+ if (foo == 0.f) {
+ foo = dabs(s);
+ }
+/* Computing MAX */
+ r__3 = (r__1 = s * v[*ns * v_dim1 + 1], dabs(r__1)), r__4 = (r__2
+ = s * v[(*ns - 1) * v_dim1 + 1], dabs(r__2));
+/* Computing MAX */
+ r__5 = smlnum, r__6 = ulp * foo;
+ if (dmax(r__3,r__4) <= dmax(r__5,r__6)) {
+
+/* ==== Deflatable ==== */
+
+ *ns += -2;
+ } else {
+
+/*
+ ==== Undeflatable. Move them up out of the way.
+ . Fortunately, STREXC does the right thing with
+ . ILST in case of a rare exchange failure. ====
+*/
+
+ ifst = *ns;
+ strexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ilst += 2;
+ }
+ }
+
+/* ==== End deflation detection loop ==== */
+
+ goto L20;
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s = 0.f;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting diagonal blocks of T improves accuracy for
+ . graded matrices. Bubble sort deals well with
+ . exchange failures. ====
+*/
+
+ sorted = FALSE_;
+ i__ = *ns + 1;
+L30:
+ if (sorted) {
+ goto L50;
+ }
+ sorted = TRUE_;
+
+ kend = i__ - 1;
+ i__ = infqr + 1;
+ if (i__ == *ns) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.f) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+L40:
+ if (k <= kend) {
+ if (k == i__ + 1) {
+ evi = (r__1 = t[i__ + i__ * t_dim1], dabs(r__1));
+ } else {
+ evi = (r__3 = t[i__ + i__ * t_dim1], dabs(r__3)) + sqrt((r__1
+ = t[i__ + 1 + i__ * t_dim1], dabs(r__1))) * sqrt((
+ r__2 = t[i__ + (i__ + 1) * t_dim1], dabs(r__2)));
+ }
+
+ if (k == kend) {
+ evk = (r__1 = t[k + k * t_dim1], dabs(r__1));
+ } else if (t[k + 1 + k * t_dim1] == 0.f) {
+ evk = (r__1 = t[k + k * t_dim1], dabs(r__1));
+ } else {
+ evk = (r__3 = t[k + k * t_dim1], dabs(r__3)) + sqrt((r__1 = t[
+ k + 1 + k * t_dim1], dabs(r__1))) * sqrt((r__2 = t[k
+ + (k + 1) * t_dim1], dabs(r__2)));
+ }
+
+ if (evi >= evk) {
+ i__ = k;
+ } else {
+ sorted = FALSE_;
+ ifst = i__;
+ ilst = k;
+ strexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ if (info == 0) {
+ i__ = ilst;
+ } else {
+ i__ = k;
+ }
+ }
+ if (i__ == kend) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.f) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+ goto L40;
+ }
+ goto L30;
+L50:
+ ;
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__ = jw;
+L60:
+ if (i__ >= infqr + 1) {
+ if (i__ == infqr + 1) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.f;
+ --i__;
+ } else if (t[i__ + (i__ - 1) * t_dim1] == 0.f) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.f;
+ --i__;
+ } else {
+ aa = t[i__ - 1 + (i__ - 1) * t_dim1];
+ cc = t[i__ + (i__ - 1) * t_dim1];
+ bb = t[i__ - 1 + i__ * t_dim1];
+ dd = t[i__ + i__ * t_dim1];
+ slanv2_(&aa, &bb, &cc, &dd, &sr[kwtop + i__ - 2], &si[kwtop + i__
+ - 2], &sr[kwtop + i__ - 1], &si[kwtop + i__ - 1], &cs, &
+ sn);
+ i__ += -2;
+ }
+ goto L60;
+ }
+
+ if (*ns < jw || s == 0.f) {
+ if (*ns > 1 && s != 0.f) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ scopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ beta = work[1];
+ slarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1] = 1.f;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &t[t_dim1 + 3], ldt);
+
+ slarf_("L", ns, &jw, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ slarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ slarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ sgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = s * v[v_dim1 + 1];
+ }
+ slacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ scopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+ ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && s != 0.f) {
+ i__1 = *lwork - jw;
+ sormhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ sgemm_("N", "N", &kln, &jw, &jw, &c_b15, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b29, &wv[wv_offset],
+ ldwv);
+ slacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L70: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ sgemm_("C", "N", &jw, &kln, &jw, &c_b15, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b29, &t[t_offset],
+ ldt);
+ slacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L80: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ sgemm_("N", "N", &kln, &jw, &jw, &c_b15, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b29, &wv[
+ wv_offset], ldwv);
+ slacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L90: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ work[1] = (real) lwkopt;
+
+/* ==== End of SLAQR2 ==== */
+
+ return 0;
+} /* slaqr2_ */
+
+/* Subroutine */ int slaqr3_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, real *h__, integer *ldh,
+ integer *iloz, integer *ihiz, real *z__, integer *ldz, integer *ns,
+ integer *nd, real *sr, real *si, real *v, integer *ldv, integer *nh,
+ real *t, integer *ldt, integer *nv, real *wv, integer *ldwv, real *
+ work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ real r__1, r__2, r__3, r__4, r__5, r__6;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static real s, aa, bb, cc, dd, cs, sn;
+ static integer jw;
+ static real evi, evk, foo;
+ static integer kln;
+ static real tau, ulp;
+ static integer lwk1, lwk2, lwk3;
+ static real beta;
+ static integer kend, kcol, info, nmin, ifst, ilst, ltop, krow;
+ static logical bulge;
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), sgemm_(
+ char *, char *, integer *, integer *, integer *, real *, real *,
+ integer *, real *, integer *, real *, real *, integer *);
+ static integer infqr;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ static integer kwtop;
+ extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real *
+ , real *, real *, real *, real *, real *), slaqr4_(logical *,
+ logical *, integer *, integer *, integer *, real *, integer *,
+ real *, real *, integer *, integer *, real *, integer *, real *,
+ integer *, integer *), slabad_(real *, real *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *);
+ static real safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static real safmax;
+ extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *,
+ real *), slahqr_(logical *, logical *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, integer *
+ , real *, integer *, integer *), slacpy_(char *, integer *,
+ integer *, real *, integer *, real *, integer *), slaset_(
+ char *, integer *, integer *, real *, real *, real *, integer *);
+ static logical sorted;
+ extern /* Subroutine */ int strexc_(char *, integer *, real *, integer *,
+ real *, integer *, integer *, integer *, real *, integer *), sormhr_(char *, char *, integer *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+ static real smlnum;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- April 2009 --
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+ H and performs an orthogonal similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+ an orthogonal similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the quasi-triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the orthogonal matrix Z is updated so
+ that the orthogonal Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the orthogonal matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) REAL array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by an orthogonal
+ similarity transformation, perturbed, and then returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) REAL array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the orthogonal
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+ The number of unconverged (ie approximate) eigenvalues
+ returned in SR and SI that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SR (output) REAL array, dimension KBOT
+ SI (output) REAL array, dimension KBOT
+ On output, the real and imaginary parts of approximate
+ eigenvalues that may be used for shifts are stored in
+ SR(KBOT-ND-NS+1) through SR(KBOT-ND) and
+ SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively.
+ The real and imaginary parts of converged eigenvalues
+ are stored in SR(KBOT-ND+1) through SR(KBOT) and
+ SI(KBOT-ND+1) through SI(KBOT), respectively.
+
+ V (workspace) REAL array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) REAL array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) REAL array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+ The leading dimension of WV just as declared in the
+ calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) REAL array, dimension LWORK.
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; SLAQR3
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z are accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
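+
+/*
+   Editorial note (not part of the LAPACK/f2c source): unlike SLAQR2, the
+   workspace estimate below also issues a query call to SLAQR4, so the
+   returned optimum is max(JW + max(LWK1,LWK2), LWK3), where LWK1, LWK2 and
+   LWK3 come from the SGEHRD, SORMHR and SLAQR4 query calls respectively.
+*/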
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sr;
+ --si;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to SGEHRD ==== */
+
+ i__1 = jw - 1;
+ sgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1];
+
+/* ==== Workspace query call to SORMHR ==== */
+
+ i__1 = jw - 1;
+ sormhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1];
+
+/* ==== Workspace query call to SLAQR4 ==== */
+
+ slaqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[1],
+ &si[1], &c__1, &jw, &v[v_offset], ldv, &work[1], &c_n1, &
+ infqr);
+ lwk3 = (integer) work[1];
+
+/*
+ ==== Optimal workspace ====
+
+ Computing MAX
+*/
+ i__1 = jw + max(lwk1,lwk2);
+ lwkopt = max(i__1,lwk3);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (real) lwkopt;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1] = 1.f;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s = 0.f;
+ } else {
+ s = h__[kwtop + (kwtop - 1) * h_dim1];
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ sr[kwtop] = h__[kwtop + kwtop * h_dim1];
+ si[kwtop] = 0.f;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ r__2 = smlnum, r__3 = ulp * (r__1 = h__[kwtop + kwtop * h_dim1], dabs(
+ r__1));
+ if (dabs(s) <= dmax(r__2,r__3)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = 0.f;
+ }
+ }
+ work[1] = 1.f;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ slacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ scopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ slaset_("A", &jw, &jw, &c_b29, &c_b15, &v[v_offset], ldv);
+ nmin = ilaenv_(&c__12, "SLAQR3", "SV", &jw, &c__1, &jw, lwork, (ftnlen)6,
+ (ftnlen)2);
+ if (jw > nmin) {
+ slaqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[
+ kwtop], &si[kwtop], &c__1, &jw, &v[v_offset], ldv, &work[1],
+ lwork, &infqr);
+ } else {
+ slahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sr[
+ kwtop], &si[kwtop], &c__1, &jw, &v[v_offset], ldv, &infqr);
+ }
+
+/* ==== STREXC needs a clean margin near the diagonal ==== */
+
+ i__1 = jw - 3;
+ for (j = 1; j <= i__1; ++j) {
+ t[j + 2 + j * t_dim1] = 0.f;
+ t[j + 3 + j * t_dim1] = 0.f;
+/* L10: */
+ }
+ if (jw > 2) {
+ t[jw + (jw - 2) * t_dim1] = 0.f;
+ }
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+L20:
+ if (ilst <= *ns) {
+ if (*ns == 1) {
+ bulge = FALSE_;
+ } else {
+ bulge = t[*ns + (*ns - 1) * t_dim1] != 0.f;
+ }
+
+/* ==== Small spike tip test for deflation ==== */
+
+ if (! bulge) {
+
+/* ==== Real eigenvalue ==== */
+
+ foo = (r__1 = t[*ns + *ns * t_dim1], dabs(r__1));
+ if (foo == 0.f) {
+ foo = dabs(s);
+ }
+/* Computing MAX */
+ r__2 = smlnum, r__3 = ulp * foo;
+ if ((r__1 = s * v[*ns * v_dim1 + 1], dabs(r__1)) <= dmax(r__2,
+ r__3)) {
+
+/* ==== Deflatable ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== Undeflatable. Move it up out of the way.
+ . (STREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ strexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ++ilst;
+ }
+ } else {
+
+/* ==== Complex conjugate pair ==== */
+
+ foo = (r__3 = t[*ns + *ns * t_dim1], dabs(r__3)) + sqrt((r__1 = t[
+ *ns + (*ns - 1) * t_dim1], dabs(r__1))) * sqrt((r__2 = t[*
+ ns - 1 + *ns * t_dim1], dabs(r__2)));
+ if (foo == 0.f) {
+ foo = dabs(s);
+ }
+/* Computing MAX */
+ r__3 = (r__1 = s * v[*ns * v_dim1 + 1], dabs(r__1)), r__4 = (r__2
+ = s * v[(*ns - 1) * v_dim1 + 1], dabs(r__2));
+/* Computing MAX */
+ r__5 = smlnum, r__6 = ulp * foo;
+ if (dmax(r__3,r__4) <= dmax(r__5,r__6)) {
+
+/* ==== Deflatable ==== */
+
+ *ns += -2;
+ } else {
+
+/*
+ ==== Undeflatable. Move them up out of the way.
+ . Fortunately, STREXC does the right thing with
+ . ILST in case of a rare exchange failure. ====
+*/
+
+ ifst = *ns;
+ strexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ ilst += 2;
+ }
+ }
+
+/* ==== End deflation detection loop ==== */
+
+ goto L20;
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s = 0.f;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting diagonal blocks of T improves accuracy for
+ . graded matrices. Bubble sort deals well with
+ . exchange failures. ====
+*/
+
+ sorted = FALSE_;
+ i__ = *ns + 1;
+L30:
+ if (sorted) {
+ goto L50;
+ }
+ sorted = TRUE_;
+
+ kend = i__ - 1;
+ i__ = infqr + 1;
+ if (i__ == *ns) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.f) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+L40:
+ if (k <= kend) {
+ if (k == i__ + 1) {
+ evi = (r__1 = t[i__ + i__ * t_dim1], dabs(r__1));
+ } else {
+ evi = (r__3 = t[i__ + i__ * t_dim1], dabs(r__3)) + sqrt((r__1
+ = t[i__ + 1 + i__ * t_dim1], dabs(r__1))) * sqrt((
+ r__2 = t[i__ + (i__ + 1) * t_dim1], dabs(r__2)));
+ }
+
+ if (k == kend) {
+ evk = (r__1 = t[k + k * t_dim1], dabs(r__1));
+ } else if (t[k + 1 + k * t_dim1] == 0.f) {
+ evk = (r__1 = t[k + k * t_dim1], dabs(r__1));
+ } else {
+ evk = (r__3 = t[k + k * t_dim1], dabs(r__3)) + sqrt((r__1 = t[
+ k + 1 + k * t_dim1], dabs(r__1))) * sqrt((r__2 = t[k
+ + (k + 1) * t_dim1], dabs(r__2)));
+ }
+
+ if (evi >= evk) {
+ i__ = k;
+ } else {
+ sorted = FALSE_;
+ ifst = i__;
+ ilst = k;
+ strexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &work[1], &info);
+ if (info == 0) {
+ i__ = ilst;
+ } else {
+ i__ = k;
+ }
+ }
+ if (i__ == kend) {
+ k = i__ + 1;
+ } else if (t[i__ + 1 + i__ * t_dim1] == 0.f) {
+ k = i__ + 1;
+ } else {
+ k = i__ + 2;
+ }
+ goto L40;
+ }
+ goto L30;
+L50:
+ ;
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__ = jw;
+L60:
+ if (i__ >= infqr + 1) {
+ if (i__ == infqr + 1) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.f;
+ --i__;
+ } else if (t[i__ + (i__ - 1) * t_dim1] == 0.f) {
+ sr[kwtop + i__ - 1] = t[i__ + i__ * t_dim1];
+ si[kwtop + i__ - 1] = 0.f;
+ --i__;
+ } else {
+ aa = t[i__ - 1 + (i__ - 1) * t_dim1];
+ cc = t[i__ + (i__ - 1) * t_dim1];
+ bb = t[i__ - 1 + i__ * t_dim1];
+ dd = t[i__ + i__ * t_dim1];
+ slanv2_(&aa, &bb, &cc, &dd, &sr[kwtop + i__ - 2], &si[kwtop + i__
+ - 2], &sr[kwtop + i__ - 1], &si[kwtop + i__ - 1], &cs, &
+ sn);
+ i__ += -2;
+ }
+ goto L60;
+ }
+
+ if (*ns < jw || s == 0.f) {
+ if (*ns > 1 && s != 0.f) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ scopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ beta = work[1];
+ slarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1] = 1.f;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &t[t_dim1 + 3], ldt);
+
+ slarf_("L", ns, &jw, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ slarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ slarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ sgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ h__[kwtop + (kwtop - 1) * h_dim1] = s * v[v_dim1 + 1];
+ }
+ slacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ scopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+ ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && s != 0.f) {
+ i__1 = *lwork - jw;
+ sormhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
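+/*
+   ==== The rows above the deflation window, H(LTOP:KWTOP-1,
+   .    KWTOP:KWTOP+JW-1), are postmultiplied by V in panels of
+   .    at most NV rows, using WV as scratch. ====
+*/
+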
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ sgemm_("N", "N", &kln, &jw, &jw, &c_b15, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b29, &wv[wv_offset],
+ ldwv);
+ slacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L70: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ sgemm_("C", "N", &jw, &kln, &jw, &c_b15, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b29, &t[t_offset],
+ ldt);
+ slacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L80: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ sgemm_("N", "N", &kln, &jw, &jw, &c_b15, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b29, &wv[
+ wv_offset], ldwv);
+ slacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L90: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ work[1] = (real) lwkopt;
+
+/* ==== End of SLAQR3 ==== */
+
+ return 0;
+} /* slaqr3_ */
+
+/* Subroutine */ int slaqr4_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real *
+ wi, integer *iloz, integer *ihiz, real *z__, integer *ldz, real *work,
+ integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ real r__1, r__2, r__3, r__4;
+
+ /* Local variables */
+ static integer i__, k;
+ static real aa, bb, cc, dd;
+ static integer ld;
+ static real cs;
+ static integer nh, it, ks, kt;
+ static real sn;
+ static integer ku, kv, ls, ns;
+ static real ss;
+ static integer nw, inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl,
+ kbot, nmin;
+ static real swap;
+ static integer ktop;
+ static real zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int slaqr2_(logical *, logical *, integer *,
+ integer *, integer *, integer *, real *, integer *, integer *,
+ integer *, real *, integer *, integer *, integer *, real *, real *
+ , real *, integer *, integer *, real *, integer *, integer *,
+ real *, integer *, real *, integer *), slanv2_(real *, real *,
+ real *, real *, real *, real *, real *, real *, real *, real *),
+ slaqr5_(logical *, logical *, integer *, integer *, integer *,
+ integer *, integer *, real *, real *, real *, integer *, integer *
+ , integer *, real *, integer *, real *, integer *, real *,
+ integer *, integer *, real *, integer *, integer *, real *,
+ integer *);
+ static integer nibble;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ extern /* Subroutine */ int slahqr_(logical *, logical *, integer *,
+ integer *, integer *, real *, integer *, real *, real *, integer *
+ , integer *, real *, integer *, integer *), slacpy_(char *,
+ integer *, integer *, real *, integer *, real *, integer *);
+ static integer nwupbd;
+ static logical sorted;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ This subroutine implements one level of recursion for SLAQR0.
+ It is a complete implementation of the small bulge multi-shift
+ QR algorithm. It may be called by SLAQR0 and, for large enough
+ deflation window size, it may be called by SLAQR3. This
+ subroutine is identical to SLAQR0 except that it calls SLAQR2
+ instead of SLAQR3.
+
+ Purpose
+ =======
+
+ SLAQR4 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+ Schur form), and Z is the orthogonal matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input orthogonal
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to SGEBAL, and then passed to SGEHRD when the
+ matrix output by SGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) REAL array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H contains
+ the upper quasi-triangular matrix T from the Schur
+ decomposition (the Schur form); 2-by-2 diagonal blocks
+ (corresponding to complex conjugate pairs of eigenvalues)
+ are returned in standard form, with H(i,i) = H(i+1,i+1)
+ and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ WR (output) REAL array, dimension (IHI)
+ WI (output) REAL array, dimension (IHI)
+ The real and imaginary parts, respectively, of the computed
+ eigenvalues of H(ILO:IHI,ILO:IHI) are stored in WR(ILO:IHI)
+ and WI(ILO:IHI). If two eigenvalues are computed as a
+ complex conjugate pair, they are stored in consecutive
+ elements of WR and WI, say the i-th and (i+1)th, with
+ WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+ the eigenvalues are stored in the same order as on the
+ diagonal of the Schur form returned in H, with
+ WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
+ block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
+ WI(i+1) = -WI(i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+
+ Z (input/output) REAL array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if WANTZ is .TRUE.
+           then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) REAL array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+           is sufficient, but LWORK as large as 6*N may
+ be required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then SLAQR4 does a workspace query.
+ In this case, SLAQR4 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, SLAQR4 failed to compute all of
+ the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
+ and WI contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+           If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and quasi-triangular
+ in rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+             (final value of Z(ILO:IHI,ILOZ:IHIZ))
+              = (initial value of Z(ILO:IHI,ILOZ:IHIZ))*U
+
+ where U is the orthogonal matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . SLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constants WILK1 and WILK2 are used to form the
+ . exceptional shifts. ====
+*/
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --wr;
+ --wi;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
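+
+/*
+   ==== The pointer offsets above emulate Fortran's 1-based,
+   .    column-major indexing, so that h__[i + j * h_dim1]
+   .    addresses the matrix entry H(i,j). ====
+*/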
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use SLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ slahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &wr[1], &
+ wi[1], iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "SLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+      .    At this point N .GT. NTINY = 11, so there is
+      .    enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "SLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to SLAQR2 ====
+*/
+
+ i__1 = nwr + 1;
+ slaqr2_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1], &h__[
+ h_offset], ldh, n, &h__[h_offset], ldh, n, &h__[h_offset],
+ ldh, &work[1], &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(SLAQR5, SLAQR2) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1];
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ work[1] = (real) lwkopt;
+ return 0;
+ }
+
+/* ==== SLAHQR/SLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "SLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "SLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+ ==== Accumulate reflections during ttswp? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "SLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the Largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L90;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (h__[k + (k - 1) * h_dim1] == 0.f) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
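+
+/*
+   ==== KTOP is the largest index K with ILO .LT. K .LE. KBOT and
+   .    H(K,K-1) = 0, or ILO if no such K exists; rows and columns
+   .    KTOP through KBOT form the active block. ====
+*/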
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+      .       size.   At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ if ((r__1 = h__[kwtop + (kwtop - 1) * h_dim1], dabs(r__1))
+ > (r__2 = h__[kwtop - 1 + (kwtop - 2) * h_dim1],
+ dabs(r__2))) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+      .       - an NW-by-NW work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+      .       - an at-least-NW-but-more-is-better (NVE-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ slaqr2_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &wr[1], &wi[1],
+ &h__[kv + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+      .    or if the remaining active block is small. ====
+*/
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if SLAQR2
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . SLAQR2 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+/* Computing MAX */
+ i__3 = ks + 1, i__4 = ktop + 2;
+ i__2 = max(i__3,i__4);
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ ss = (r__1 = h__[i__ + (i__ - 1) * h_dim1], dabs(r__1)
+ ) + (r__2 = h__[i__ - 1 + (i__ - 2) * h_dim1],
+ dabs(r__2));
+ aa = ss * .75f + h__[i__ + i__ * h_dim1];
+ bb = ss;
+ cc = ss * -.4375f;
+ dd = aa;
+ slanv2_(&aa, &bb, &cc, &dd, &wr[i__ - 1], &wi[i__ - 1]
+ , &wr[i__], &wi[i__], &cs, &sn);
+/* L30: */
+ }
+ if (ks == ktop) {
+ wr[ks + 1] = h__[ks + 1 + (ks + 1) * h_dim1];
+ wi[ks + 1] = 0.f;
+ wr[ks] = wr[ks + 1];
+ wi[ks] = wi[ks + 1];
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use SLAHQR
+ . on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ slacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ slahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[kt
+ + h_dim1], ldh, &wr[ks], &wi[ks], &c__1, &
+ c__1, zdum, &c__1, &inf);
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. ====
+*/
+
+ if (ks >= kbot) {
+ aa = h__[kbot - 1 + (kbot - 1) * h_dim1];
+ cc = h__[kbot + (kbot - 1) * h_dim1];
+ bb = h__[kbot - 1 + kbot * h_dim1];
+ dd = h__[kbot + kbot * h_dim1];
+ slanv2_(&aa, &bb, &cc, &dd, &wr[kbot - 1], &wi[
+ kbot - 1], &wr[kbot], &wi[kbot], &cs, &sn)
+ ;
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/*
+ ==== Sort the shifts (Helps a little)
+ . Bubble sort keeps complex conjugate
+ . pairs together. ====
+*/
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ if ((r__1 = wr[i__], dabs(r__1)) + (r__2 = wi[
+ i__], dabs(r__2)) < (r__3 = wr[i__ +
+ 1], dabs(r__3)) + (r__4 = wi[i__ + 1],
+ dabs(r__4))) {
+ sorted = FALSE_;
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ + 1];
+ wr[i__ + 1] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ + 1];
+ wi[i__ + 1] = swap;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+
+/*
+ ==== Shuffle shifts into pairs of real shifts
+ . and pairs of complex conjugate shifts
+ . assuming complex conjugate shifts are
+ . already adjacent to one another. (Yes,
+ . they are.) ====
+*/
+
+ i__2 = ks + 2;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ if (wi[i__] != -wi[i__ - 1]) {
+
+ swap = wr[i__];
+ wr[i__] = wr[i__ - 1];
+ wr[i__ - 1] = wr[i__ - 2];
+ wr[i__ - 2] = swap;
+
+ swap = wi[i__];
+ wi[i__] = wi[i__ - 1];
+ wi[i__ - 1] = wi[i__ - 2];
+ wi[i__ - 2] = swap;
+ }
+/* L70: */
+ }
+ }
+
+/*
+ ==== If there are only two shifts and both are
+ . real, then use only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ if (wi[kbot] == 0.f) {
+ if ((r__1 = wr[kbot] - h__[kbot + kbot * h_dim1],
+ dabs(r__1)) < (r__2 = wr[kbot - 1] - h__[kbot
+ + kbot * h_dim1], dabs(r__2))) {
+ wr[kbot - 1] = wr[kbot];
+ } else {
+ wr[kbot] = wr[kbot - 1];
+ }
+ }
+ }
+
+/*
+      ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+      .        (KDU-by-NHO) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+      .        (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ slaqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &wr[ks],
+ &wi[ks], &h__[h_offset], ldh, iloz, ihiz, &z__[
+ z_offset], ldz, &work[1], &c__3, &h__[ku + h_dim1],
+ ldh, &nve, &h__[kwv + h_dim1], ldh, &nho, &h__[ku +
+ kwh * h_dim1], ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L80:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L90:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ work[1] = (real) lwkopt;
+
+/* ==== End of SLAQR4 ==== */
+
+ return 0;
+} /* slaqr4_ */
+
+/* Subroutine */ int slaqr5_(logical *wantt, logical *wantz, integer *kacc22,
+ integer *n, integer *ktop, integer *kbot, integer *nshfts, real *sr,
+ real *si, real *h__, integer *ldh, integer *iloz, integer *ihiz, real
+ *z__, integer *ldz, real *v, integer *ldv, real *u, integer *ldu,
+ integer *nv, real *wv, integer *ldwv, integer *nh, real *wh, integer *
+ ldwh)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, u_dim1, u_offset, v_dim1, v_offset, wh_dim1,
+ wh_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3,
+ i__4, i__5, i__6, i__7;
+ real r__1, r__2, r__3, r__4, r__5;
+
+ /* Local variables */
+ static integer i__, j, k, m, i2, j2, i4, j4, k1;
+ static real h11, h12, h21, h22;
+ static integer m22, ns, nu;
+ static real vt[3], scl;
+ static integer kdu, kms;
+ static real ulp;
+ static integer knz, kzs;
+ static real tst1, tst2, beta;
+ static logical blk22, bmp22;
+ static integer mend, jcol, jlen, jbot, mbot;
+ static real swap;
+ static integer jtop, jrow, mtop;
+ static real alpha;
+ static logical accum;
+ static integer ndcol, incol;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer krcol, nbmps;
+ extern /* Subroutine */ int strmm_(char *, char *, char *, char *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ ), slaqr1_(integer *, real *,
+ integer *, real *, real *, real *, real *, real *), slabad_(real *
+ , real *);
+ extern doublereal slamch_(char *);
+ static real safmin;
+ extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *,
+ real *);
+ static real safmax;
+ extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *,
+ integer *, real *, integer *), slaset_(char *, integer *,
+ integer *, real *, real *, real *, integer *);
+ static real refsum;
+ static integer mstart;
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ This auxiliary subroutine called by SLAQR0 performs a
+ single small-bulge multi-shift QR sweep.
+
+ WANTT (input) logical scalar
+ WANTT = .true. if the quasi-triangular Schur factor
+ is being computed. WANTT is set to .false. otherwise.
+
+ WANTZ (input) logical scalar
+ WANTZ = .true. if the orthogonal Schur factor is being
+ computed. WANTZ is set to .false. otherwise.
+
+ KACC22 (input) integer with value 0, 1, or 2.
+ Specifies the computation mode of far-from-diagonal
+ orthogonal updates.
+ = 0: SLAQR5 does not accumulate reflections and does not
+ use matrix-matrix multiply to update far-from-diagonal
+ matrix entries.
+ = 1: SLAQR5 accumulates reflections and uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries.
+ = 2: SLAQR5 accumulates reflections, uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries,
+ and takes advantage of 2-by-2 block structure during
+ matrix multiplies.
+
+ N (input) integer scalar
+ N is the order of the Hessenberg matrix H upon which this
+ subroutine operates.
+
+ KTOP (input) integer scalar
+ KBOT (input) integer scalar
+ These are the first and last rows and columns of an
+ isolated diagonal block upon which the QR sweep is to be
+ applied. It is assumed without a check that
+ either KTOP = 1 or H(KTOP,KTOP-1) = 0
+ and
+ either KBOT = N or H(KBOT+1,KBOT) = 0.
+
+ NSHFTS (input) integer scalar
+ NSHFTS gives the number of simultaneous shifts. NSHFTS
+ must be positive and even.
+
+ SR (input/output) REAL array of size (NSHFTS)
+ SI (input/output) REAL array of size (NSHFTS)
+ SR contains the real parts and SI contains the imaginary
+ parts of the NSHFTS shifts of origin that define the
+ multi-shift QR sweep. On output SR and SI may be
+ reordered.
+
+ H (input/output) REAL array of size (LDH,N)
+ On input H contains a Hessenberg matrix. On output a
+ multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied
+ to the isolated diagonal block in rows and columns KTOP
+ through KBOT.
+
+ LDH (input) integer scalar
+ LDH is the leading dimension of H just as declared in the
+ calling procedure. LDH.GE.MAX(1,N).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+
+ Z (input/output) REAL array of size (LDZ,IHI)
+ If WANTZ = .TRUE., then the QR Sweep orthogonal
+ similarity transformation is accumulated into
+ Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ = .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer scalar
+          LDZ is the leading dimension of Z just as declared in
+ the calling procedure. LDZ.GE.N.
+
+ V (workspace) REAL array of size (LDV,NSHFTS/2)
+
+ LDV (input) integer scalar
+ LDV is the leading dimension of V as declared in the
+ calling procedure. LDV.GE.3.
+
+ U (workspace) REAL array of size
+ (LDU,3*NSHFTS-3)
+
+ LDU (input) integer scalar
+ LDU is the leading dimension of U just as declared in the
+          calling subroutine.  LDU.GE.3*NSHFTS-3.
+
+ NH (input) integer scalar
+ NH is the number of columns in array WH available for
+ workspace. NH.GE.1.
+
+ WH (workspace) REAL array of size (LDWH,NH)
+
+ LDWH (input) integer scalar
+ Leading dimension of WH just as declared in the
+ calling procedure. LDWH.GE.3*NSHFTS-3.
+
+ NV (input) integer scalar
+          NV is the number of rows in WV available for workspace.
+ NV.GE.1.
+
+ WV (workspace) REAL array of size
+ (LDWV,3*NSHFTS-3)
+
+ LDWV (input) integer scalar
+ LDWV is the leading dimension of WV as declared in the
+          calling subroutine.  LDWV.GE.NV.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ Reference:
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and
+ Level 3 Performance, SIAM Journal of Matrix Analysis,
+ volume 23, pages 929--947, 2002.
+
+ ================================================================
+
+
+ ==== If there are no shifts, then there is nothing to do. ====
+*/
+
+ /* Parameter adjustments */
+ --sr;
+ --si;
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ wh_dim1 = *ldwh;
+ wh_offset = 1 + wh_dim1;
+ wh -= wh_offset;
+
+ /* Function Body */
+ if (*nshfts < 2) {
+ return 0;
+ }
+
+/*
+ ==== If the active block is empty or 1-by-1, then there
+ . is nothing to do. ====
+*/
+
+ if (*ktop >= *kbot) {
+ return 0;
+ }
+
+/*
+ ==== Shuffle shifts into pairs of real shifts and pairs
+ . of complex conjugate shifts assuming complex
+ . conjugate shifts are already adjacent to one
+ . another. ====
+*/
+
+ i__1 = *nshfts - 2;
+ for (i__ = 1; i__ <= i__1; i__ += 2) {
+ if (si[i__] != -si[i__ + 1]) {
+
+ swap = sr[i__];
+ sr[i__] = sr[i__ + 1];
+ sr[i__ + 1] = sr[i__ + 2];
+ sr[i__ + 2] = swap;
+
+ swap = si[i__];
+ si[i__] = si[i__ + 1];
+ si[i__ + 1] = si[i__ + 2];
+ si[i__ + 2] = swap;
+ }
+/* L10: */
+ }
+
+/*
+ ==== NSHFTS is supposed to be even, but if it is odd,
+ . then simply reduce it by one. The shuffle above
+ . ensures that the dropped shift is real and that
+ . the remaining shifts are paired. ====
+*/
+
+ ns = *nshfts - *nshfts % 2;
+
+/* ==== Machine constants for deflation ==== */
+
+ safmin = slamch_("SAFE MINIMUM");
+ safmax = 1.f / safmin;
+ slabad_(&safmin, &safmax);
+ ulp = slamch_("PRECISION");
+ smlnum = safmin * ((real) (*n) / ulp);
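+
+/*
+   ==== SMLNUM and ULP feed the vigilant deflation check below: a
+   .    subdiagonal entry is treated as negligible only if it is at
+   .    most max(SMLNUM, ULP*TST1), where TST1 sums nearby entries
+   .    of H. ====
+*/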
+
+/*
+ ==== Use accumulated reflections to update far-from-diagonal
+ . entries ? ====
+*/
+
+ accum = *kacc22 == 1 || *kacc22 == 2;
+
+/* ==== If so, exploit the 2-by-2 block structure? ==== */
+
+ blk22 = ns > 2 && *kacc22 == 2;
+
+/* ==== clear trash ==== */
+
+ if (*ktop + 2 <= *kbot) {
+ h__[*ktop + 2 + *ktop * h_dim1] = 0.f;
+ }
+
+/* ==== NBMPS = number of 2-shift bulges in the chain ==== */
+
+ nbmps = ns / 2;
+
+/* ==== KDU = width of slab ==== */
+
+ kdu = nbmps * 6 - 3;
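+
+/*
+   ==== When reflections are accumulated (ACCUM), the orthogonal
+   .    transformations of each slab are collected in the KDU-by-KDU
+   .    matrix U and applied to the far-from-diagonal parts of H
+   .    (and to Z) with matrix-matrix multiplies after the
+   .    near-the-diagonal chase. ====
+*/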
+
+/* ==== Create and chase chains of NBMPS bulges ==== */
+
+ i__1 = *kbot - 2;
+ i__2 = nbmps * 3 - 2;
+ for (incol = (1 - nbmps) * 3 + *ktop - 1; i__2 < 0 ? incol >= i__1 :
+ incol <= i__1; incol += i__2) {
+ ndcol = incol + kdu;
+ if (accum) {
+ slaset_("ALL", &kdu, &kdu, &c_b29, &c_b15, &u[u_offset], ldu);
+ }
+
+/*
+ ==== Near-the-diagonal bulge chase. The following loop
+ . performs the near-the-diagonal part of a small bulge
+ . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal
+ . chunk extends from column INCOL to column NDCOL
+ . (including both column INCOL and column NDCOL). The
+ . following loop chases a 3*NBMPS column long chain of
+ . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL
+      .    may be less than KTOP and NDCOL may be greater than
+ . KBOT indicating phantom columns from which to chase
+ . bulges before they are actually introduced or to which
+ . to chase bulges beyond column KBOT.) ====
+
+ Computing MIN
+*/
+ i__4 = incol + nbmps * 3 - 3, i__5 = *kbot - 2;
+ i__3 = min(i__4,i__5);
+ for (krcol = incol; krcol <= i__3; ++krcol) {
+
+/*
+      ==== Bulges numbered MTOP to MBOT are active double implicit
+      .    shift bulges.  There may or may not also be a small
+ . 2-by-2 bulge, if there is room. The inactive bulges
+ . (if any) must wait until the active bulges have moved
+ . down the diagonal to make room. The phantom matrix
+ . paradigm described above helps keep track. ====
+
+ Computing MAX
+*/
+ i__4 = 1, i__5 = (*ktop - 1 - krcol + 2) / 3 + 1;
+ mtop = max(i__4,i__5);
+/* Computing MIN */
+ i__4 = nbmps, i__5 = (*kbot - krcol) / 3;
+ mbot = min(i__4,i__5);
+ m22 = mbot + 1;
+ bmp22 = mbot < nbmps && krcol + (m22 - 1) * 3 == *kbot - 2;
+
+/*
+ ==== Generate reflections to chase the chain right
+ . one column. (The minimum value of K is KTOP-1.) ====
+*/
+
+ i__4 = mbot;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ if (k == *ktop - 1) {
+ slaqr1_(&c__3, &h__[*ktop + *ktop * h_dim1], ldh, &sr[(m
+ << 1) - 1], &si[(m << 1) - 1], &sr[m * 2], &si[m *
+ 2], &v[m * v_dim1 + 1]);
+ alpha = v[m * v_dim1 + 1];
+ slarfg_(&c__3, &alpha, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+ } else {
+ beta = h__[k + 1 + k * h_dim1];
+ v[m * v_dim1 + 2] = h__[k + 2 + k * h_dim1];
+ v[m * v_dim1 + 3] = h__[k + 3 + k * h_dim1];
+ slarfg_(&c__3, &beta, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+
+/*
+ ==== A Bulge may collapse because of vigilant
+ . deflation or destructive underflow. In the
+ . underflow case, try the two-small-subdiagonals
+ . trick to try to reinflate the bulge. ====
+*/
+
+ if (h__[k + 3 + k * h_dim1] != 0.f || h__[k + 3 + (k + 1)
+ * h_dim1] != 0.f || h__[k + 3 + (k + 2) * h_dim1]
+ == 0.f) {
+
+/* ==== Typical case: not collapsed (yet). ==== */
+
+ h__[k + 1 + k * h_dim1] = beta;
+ h__[k + 2 + k * h_dim1] = 0.f;
+ h__[k + 3 + k * h_dim1] = 0.f;
+ } else {
+
+/*
+ ==== Atypical case: collapsed. Attempt to
+ . reintroduce ignoring H(K+1,K) and H(K+2,K).
+ . If the fill resulting from the new
+ . reflector is too large, then abandon it.
+ . Otherwise, use the new one. ====
+*/
+
+ slaqr1_(&c__3, &h__[k + 1 + (k + 1) * h_dim1], ldh, &
+ sr[(m << 1) - 1], &si[(m << 1) - 1], &sr[m *
+ 2], &si[m * 2], vt);
+ alpha = vt[0];
+ slarfg_(&c__3, &alpha, &vt[1], &c__1, vt);
+ refsum = vt[0] * (h__[k + 1 + k * h_dim1] + vt[1] *
+ h__[k + 2 + k * h_dim1]);
+
+ if ((r__1 = h__[k + 2 + k * h_dim1] - refsum * vt[1],
+ dabs(r__1)) + (r__2 = refsum * vt[2], dabs(
+ r__2)) > ulp * ((r__3 = h__[k + k * h_dim1],
+ dabs(r__3)) + (r__4 = h__[k + 1 + (k + 1) *
+ h_dim1], dabs(r__4)) + (r__5 = h__[k + 2 + (k
+ + 2) * h_dim1], dabs(r__5)))) {
+
+/*
+ ==== Starting a new bulge here would
+ . create non-negligible fill. Use
+ . the old one with trepidation. ====
+*/
+
+ h__[k + 1 + k * h_dim1] = beta;
+ h__[k + 2 + k * h_dim1] = 0.f;
+ h__[k + 3 + k * h_dim1] = 0.f;
+ } else {
+
+/*
+                           ==== Starting a new bulge here would
+ . create only negligible fill.
+ . Replace the old reflector with
+ . the new one. ====
+*/
+
+ h__[k + 1 + k * h_dim1] -= refsum;
+ h__[k + 2 + k * h_dim1] = 0.f;
+ h__[k + 3 + k * h_dim1] = 0.f;
+ v[m * v_dim1 + 1] = vt[0];
+ v[m * v_dim1 + 2] = vt[1];
+ v[m * v_dim1 + 3] = vt[2];
+ }
+ }
+ }
+/* L20: */
+ }
+
+/* ==== Generate a 2-by-2 reflection, if needed. ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ if (bmp22) {
+ if (k == *ktop - 1) {
+ slaqr1_(&c__2, &h__[k + 1 + (k + 1) * h_dim1], ldh, &sr[(
+ m22 << 1) - 1], &si[(m22 << 1) - 1], &sr[m22 * 2],
+ &si[m22 * 2], &v[m22 * v_dim1 + 1]);
+ beta = v[m22 * v_dim1 + 1];
+ slarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ } else {
+ beta = h__[k + 1 + k * h_dim1];
+ v[m22 * v_dim1 + 2] = h__[k + 2 + k * h_dim1];
+ slarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ h__[k + 1 + k * h_dim1] = beta;
+ h__[k + 2 + k * h_dim1] = 0.f;
+ }
+ }
+
+/* ==== Multiply H by reflections from the left ==== */
+
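+/*
+   ==== Each active column of V stores a reflector in compact form:
+   .    V(1,M) holds TAU and V(2:3,M) the trailing entries of the
+   .    Householder vector, so each REFSUM update below applies
+   .    I - tau*v*v' to rows K+1:K+3 of a column of H. ====
+*/
+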
+ if (accum) {
+ jbot = min(ndcol,*kbot);
+ } else if (*wantt) {
+ jbot = *n;
+ } else {
+ jbot = *kbot;
+ }
+ i__4 = jbot;
+ for (j = max(*ktop,krcol); j <= i__4; ++j) {
+/* Computing MIN */
+ i__5 = mbot, i__6 = (j - krcol + 2) / 3;
+ mend = min(i__5,i__6);
+ i__5 = mend;
+ for (m = mtop; m <= i__5; ++m) {
+ k = krcol + (m - 1) * 3;
+ refsum = v[m * v_dim1 + 1] * (h__[k + 1 + j * h_dim1] + v[
+ m * v_dim1 + 2] * h__[k + 2 + j * h_dim1] + v[m *
+ v_dim1 + 3] * h__[k + 3 + j * h_dim1]);
+ h__[k + 1 + j * h_dim1] -= refsum;
+ h__[k + 2 + j * h_dim1] -= refsum * v[m * v_dim1 + 2];
+ h__[k + 3 + j * h_dim1] -= refsum * v[m * v_dim1 + 3];
+/* L30: */
+ }
+/* L40: */
+ }
+ if (bmp22) {
+ k = krcol + (m22 - 1) * 3;
+/* Computing MAX */
+ i__4 = k + 1;
+ i__5 = jbot;
+ for (j = max(i__4,*ktop); j <= i__5; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (h__[k + 1 + j * h_dim1] +
+ v[m22 * v_dim1 + 2] * h__[k + 2 + j * h_dim1]);
+ h__[k + 1 + j * h_dim1] -= refsum;
+ h__[k + 2 + j * h_dim1] -= refsum * v[m22 * v_dim1 + 2];
+/* L50: */
+ }
+ }
+
+/*
+ ==== Multiply H by reflections from the right.
+ . Delay filling in the last row until the
+ . vigilant deflation check is complete. ====
+*/
+
+ if (accum) {
+ jtop = max(*ktop,incol);
+ } else if (*wantt) {
+ jtop = 1;
+ } else {
+ jtop = *ktop;
+ }
+ i__5 = mbot;
+ for (m = mtop; m <= i__5; ++m) {
+ if (v[m * v_dim1 + 1] != 0.f) {
+ k = krcol + (m - 1) * 3;
+/* Computing MIN */
+ i__6 = *kbot, i__7 = k + 3;
+ i__4 = min(i__6,i__7);
+ for (j = jtop; j <= i__4; ++j) {
+ refsum = v[m * v_dim1 + 1] * (h__[j + (k + 1) *
+ h_dim1] + v[m * v_dim1 + 2] * h__[j + (k + 2)
+ * h_dim1] + v[m * v_dim1 + 3] * h__[j + (k +
+ 3) * h_dim1]);
+ h__[j + (k + 1) * h_dim1] -= refsum;
+ h__[j + (k + 2) * h_dim1] -= refsum * v[m * v_dim1 +
+ 2];
+ h__[j + (k + 3) * h_dim1] -= refsum * v[m * v_dim1 +
+ 3];
+/* L60: */
+ }
+
+ if (accum) {
+
+/*
+ ==== Accumulate U. (If necessary, update Z later
+                       .    with an efficient matrix-matrix
+ . multiply.) ====
+*/
+
+ kms = k - incol;
+/* Computing MAX */
+ i__4 = 1, i__6 = *ktop - incol;
+ i__7 = kdu;
+ for (j = max(i__4,i__6); j <= i__7; ++j) {
+ refsum = v[m * v_dim1 + 1] * (u[j + (kms + 1) *
+ u_dim1] + v[m * v_dim1 + 2] * u[j + (kms
+ + 2) * u_dim1] + v[m * v_dim1 + 3] * u[j
+ + (kms + 3) * u_dim1]);
+ u[j + (kms + 1) * u_dim1] -= refsum;
+ u[j + (kms + 2) * u_dim1] -= refsum * v[m *
+ v_dim1 + 2];
+ u[j + (kms + 3) * u_dim1] -= refsum * v[m *
+ v_dim1 + 3];
+/* L70: */
+ }
+ } else if (*wantz) {
+
+/*
+ ==== U is not accumulated, so update Z
+ . now by multiplying by reflections
+ . from the right. ====
+*/
+
+ i__7 = *ihiz;
+ for (j = *iloz; j <= i__7; ++j) {
+ refsum = v[m * v_dim1 + 1] * (z__[j + (k + 1) *
+ z_dim1] + v[m * v_dim1 + 2] * z__[j + (k
+ + 2) * z_dim1] + v[m * v_dim1 + 3] * z__[
+ j + (k + 3) * z_dim1]);
+ z__[j + (k + 1) * z_dim1] -= refsum;
+ z__[j + (k + 2) * z_dim1] -= refsum * v[m *
+ v_dim1 + 2];
+ z__[j + (k + 3) * z_dim1] -= refsum * v[m *
+ v_dim1 + 3];
+/* L80: */
+ }
+ }
+ }
+/* L90: */
+ }
+
+/* ==== Special case: 2-by-2 reflection (if needed) ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ if (bmp22 && v[m22 * v_dim1 + 1] != 0.f) {
+/* Computing MIN */
+ i__7 = *kbot, i__4 = k + 3;
+ i__5 = min(i__7,i__4);
+ for (j = jtop; j <= i__5; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (h__[j + (k + 1) * h_dim1]
+ + v[m22 * v_dim1 + 2] * h__[j + (k + 2) * h_dim1])
+ ;
+ h__[j + (k + 1) * h_dim1] -= refsum;
+ h__[j + (k + 2) * h_dim1] -= refsum * v[m22 * v_dim1 + 2];
+/* L100: */
+ }
+
+ if (accum) {
+ kms = k - incol;
+/* Computing MAX */
+ i__5 = 1, i__7 = *ktop - incol;
+ i__4 = kdu;
+ for (j = max(i__5,i__7); j <= i__4; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (u[j + (kms + 1) *
+ u_dim1] + v[m22 * v_dim1 + 2] * u[j + (kms +
+ 2) * u_dim1]);
+ u[j + (kms + 1) * u_dim1] -= refsum;
+ u[j + (kms + 2) * u_dim1] -= refsum * v[m22 * v_dim1
+ + 2];
+/* L110: */
+ }
+ } else if (*wantz) {
+ i__4 = *ihiz;
+ for (j = *iloz; j <= i__4; ++j) {
+ refsum = v[m22 * v_dim1 + 1] * (z__[j + (k + 1) *
+ z_dim1] + v[m22 * v_dim1 + 2] * z__[j + (k +
+ 2) * z_dim1]);
+ z__[j + (k + 1) * z_dim1] -= refsum;
+ z__[j + (k + 2) * z_dim1] -= refsum * v[m22 * v_dim1
+ + 2];
+/* L120: */
+ }
+ }
+ }
+
+/* ==== Vigilant deflation check ==== */
+
+ mstart = mtop;
+ if (krcol + (mstart - 1) * 3 < *ktop) {
+ ++mstart;
+ }
+ mend = mbot;
+ if (bmp22) {
+ ++mend;
+ }
+ if (krcol == *kbot - 2) {
+ ++mend;
+ }
+ i__4 = mend;
+ for (m = mstart; m <= i__4; ++m) {
+/* Computing MIN */
+ i__5 = *kbot - 1, i__7 = krcol + (m - 1) * 3;
+ k = min(i__5,i__7);
+
+/*
+               .    the traditional small-compared-to-nearby-diagonals
+ . the tradition small-compared-to-nearby-diagonals
+ . criterion and the Ahues & Tisseur (LAWN 122, 1997)
+ . criteria both be satisfied. The latter improves
+ . accuracy in some examples. Falling back on an
+ . alternate convergence criterion when TST1 or TST2
+ . is zero (as done here) is traditional but probably
+ . unnecessary. ====
+*/
+
+ if (h__[k + 1 + k * h_dim1] != 0.f) {
+ tst1 = (r__1 = h__[k + k * h_dim1], dabs(r__1)) + (r__2 =
+ h__[k + 1 + (k + 1) * h_dim1], dabs(r__2));
+ if (tst1 == 0.f) {
+ if (k >= *ktop + 1) {
+ tst1 += (r__1 = h__[k + (k - 1) * h_dim1], dabs(
+ r__1));
+ }
+ if (k >= *ktop + 2) {
+ tst1 += (r__1 = h__[k + (k - 2) * h_dim1], dabs(
+ r__1));
+ }
+ if (k >= *ktop + 3) {
+ tst1 += (r__1 = h__[k + (k - 3) * h_dim1], dabs(
+ r__1));
+ }
+ if (k <= *kbot - 2) {
+ tst1 += (r__1 = h__[k + 2 + (k + 1) * h_dim1],
+ dabs(r__1));
+ }
+ if (k <= *kbot - 3) {
+ tst1 += (r__1 = h__[k + 3 + (k + 1) * h_dim1],
+ dabs(r__1));
+ }
+ if (k <= *kbot - 4) {
+ tst1 += (r__1 = h__[k + 4 + (k + 1) * h_dim1],
+ dabs(r__1));
+ }
+ }
+/* Computing MAX */
+ r__2 = smlnum, r__3 = ulp * tst1;
+ if ((r__1 = h__[k + 1 + k * h_dim1], dabs(r__1)) <= dmax(
+ r__2,r__3)) {
+/* Computing MAX */
+ r__3 = (r__1 = h__[k + 1 + k * h_dim1], dabs(r__1)),
+ r__4 = (r__2 = h__[k + (k + 1) * h_dim1],
+ dabs(r__2));
+ h12 = dmax(r__3,r__4);
+/* Computing MIN */
+ r__3 = (r__1 = h__[k + 1 + k * h_dim1], dabs(r__1)),
+ r__4 = (r__2 = h__[k + (k + 1) * h_dim1],
+ dabs(r__2));
+ h21 = dmin(r__3,r__4);
+/* Computing MAX */
+ r__3 = (r__1 = h__[k + 1 + (k + 1) * h_dim1], dabs(
+ r__1)), r__4 = (r__2 = h__[k + k * h_dim1] -
+ h__[k + 1 + (k + 1) * h_dim1], dabs(r__2));
+ h11 = dmax(r__3,r__4);
+/* Computing MIN */
+ r__3 = (r__1 = h__[k + 1 + (k + 1) * h_dim1], dabs(
+ r__1)), r__4 = (r__2 = h__[k + k * h_dim1] -
+ h__[k + 1 + (k + 1) * h_dim1], dabs(r__2));
+ h22 = dmin(r__3,r__4);
+ scl = h11 + h12;
+ tst2 = h22 * (h11 / scl);
+
+/* Computing MAX */
+ r__1 = smlnum, r__2 = ulp * tst2;
+ if (tst2 == 0.f || h21 * (h12 / scl) <= dmax(r__1,
+ r__2)) {
+ h__[k + 1 + k * h_dim1] = 0.f;
+ }
+ }
+ }
+/* L130: */
+ }
+
+/*
+ ==== Fill in the last row of each bulge. ====
+
+ Computing MIN
+*/
+ i__4 = nbmps, i__5 = (*kbot - krcol - 1) / 3;
+ mend = min(i__4,i__5);
+ i__4 = mend;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ refsum = v[m * v_dim1 + 1] * v[m * v_dim1 + 3] * h__[k + 4 + (
+ k + 3) * h_dim1];
+ h__[k + 4 + (k + 1) * h_dim1] = -refsum;
+ h__[k + 4 + (k + 2) * h_dim1] = -refsum * v[m * v_dim1 + 2];
+ h__[k + 4 + (k + 3) * h_dim1] -= refsum * v[m * v_dim1 + 3];
+/* L140: */
+ }
+
+/*
+ ==== End of near-the-diagonal bulge chase. ====
+
+ L150:
+*/
+ }
+
+/*
+ ==== Use U (if accumulated) to update far-from-diagonal
+ . entries in H. If required, use U to update Z as
+ . well. ====
+*/
+
+ if (accum) {
+ if (*wantt) {
+ jtop = 1;
+ jbot = *n;
+ } else {
+ jtop = *ktop;
+ jbot = *kbot;
+ }
+ if (! blk22 || incol < *ktop || ndcol > *kbot || ns <= 2) {
+
+/*
+ ==== Updates not exploiting the 2-by-2 block
+ . structure of U. K1 and NU keep track of
+ . the location and size of U in the special
+ . cases of introducing bulges and chasing
+ . bulges off the bottom. In these special
+ . cases and in case the number of shifts
+ . is NS = 2, there is no 2-by-2 block
+ . structure to exploit. ====
+
+ Computing MAX
+*/
+ i__3 = 1, i__4 = *ktop - incol;
+ k1 = max(i__3,i__4);
+/* Computing MAX */
+ i__3 = 0, i__4 = ndcol - *kbot;
+ nu = kdu - max(i__3,i__4) - k1 + 1;
+
+/* ==== Horizontal Multiply ==== */
+
+ i__3 = jbot;
+ i__4 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__4 < 0 ? jcol >= i__3 :
+ jcol <= i__3; jcol += i__4) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+ sgemm_("C", "N", &nu, &jlen, &nu, &c_b15, &u[k1 + k1 *
+ u_dim1], ldu, &h__[incol + k1 + jcol * h_dim1],
+ ldh, &c_b29, &wh[wh_offset], ldwh);
+ slacpy_("ALL", &nu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + k1 + jcol * h_dim1], ldh);
+/* L160: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__4 = max(*ktop,incol) - 1;
+ i__3 = *nv;
+ for (jrow = jtop; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(*ktop,incol) - jrow;
+ jlen = min(i__5,i__7);
+ sgemm_("N", "N", &jlen, &nu, &nu, &c_b15, &h__[jrow + (
+ incol + k1) * h_dim1], ldh, &u[k1 + k1 * u_dim1],
+ ldu, &c_b29, &wv[wv_offset], ldwv);
+ slacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + k1) * h_dim1], ldh);
+/* L170: */
+ }
+
+/* ==== Z multiply (also vertical) ==== */
+
+ if (*wantz) {
+ i__3 = *ihiz;
+ i__4 = *nv;
+ for (jrow = *iloz; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+ sgemm_("N", "N", &jlen, &nu, &nu, &c_b15, &z__[jrow +
+ (incol + k1) * z_dim1], ldz, &u[k1 + k1 *
+ u_dim1], ldu, &c_b29, &wv[wv_offset], ldwv);
+ slacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &z__[
+ jrow + (incol + k1) * z_dim1], ldz)
+ ;
+/* L180: */
+ }
+ }
+ } else {
+
+/*
+ ==== Updates exploiting U's 2-by-2 block structure.
+ . (I2, I4, J2, J4 are the last rows and columns
+ . of the blocks.) ====
+*/
+
+ i2 = (kdu + 1) / 2;
+ i4 = kdu;
+ j2 = i4 - i2;
+ j4 = kdu;
+
+/*
+ ==== KZS and KNZ deal with the band of zeros
+ . along the diagonal of one of the triangular
+ . blocks. ====
+*/
+
+ kzs = j4 - j2 - (ns + 1);
+ knz = ns + 1;
+
+/* ==== Horizontal multiply ==== */
+
+ i__4 = jbot;
+ i__3 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__3 < 0 ? jcol >= i__4 :
+ jcol <= i__4; jcol += i__3) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy bottom of H to top+KZS of scratch ====
+ (The first KZS rows get multiplied by zero.) ====
+*/
+
+ slacpy_("ALL", &knz, &jlen, &h__[incol + 1 + j2 + jcol *
+ h_dim1], ldh, &wh[kzs + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ slaset_("ALL", &kzs, &jlen, &c_b29, &c_b29, &wh[wh_offset]
+ , ldwh);
+ strmm_("L", "U", "C", "N", &knz, &jlen, &c_b15, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wh[kzs + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Multiply top of H by U11' ==== */
+
+ sgemm_("C", "N", &i2, &jlen, &j2, &c_b15, &u[u_offset],
+ ldu, &h__[incol + 1 + jcol * h_dim1], ldh, &c_b15,
+ &wh[wh_offset], ldwh);
+
+/* ==== Copy top of H to bottom of WH ==== */
+
+ slacpy_("ALL", &j2, &jlen, &h__[incol + 1 + jcol * h_dim1]
+ , ldh, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ strmm_("L", "L", "C", "N", &j2, &jlen, &c_b15, &u[(i2 + 1)
+ * u_dim1 + 1], ldu, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ sgemm_("C", "N", &i__5, &jlen, &i__7, &c_b15, &u[j2 + 1 +
+ (i2 + 1) * u_dim1], ldu, &h__[incol + 1 + j2 +
+ jcol * h_dim1], ldh, &c_b15, &wh[i2 + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Copy it back ==== */
+
+ slacpy_("ALL", &kdu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + 1 + jcol * h_dim1], ldh);
+/* L190: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__3 = max(incol,*ktop) - 1;
+ i__4 = *nv;
+ for (jrow = jtop; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(incol,*ktop) - jrow;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of H to scratch (the first KZS
+ . columns get multiplied by zero) ====
+*/
+
+ slacpy_("ALL", &jlen, &knz, &h__[jrow + (incol + 1 + j2) *
+ h_dim1], ldh, &wv[(kzs + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ slaset_("ALL", &jlen, &kzs, &c_b29, &c_b29, &wv[wv_offset]
+ , ldwv);
+ strmm_("R", "U", "N", "N", &jlen, &knz, &c_b15, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ sgemm_("N", "N", &jlen, &i2, &j2, &c_b15, &h__[jrow + (
+ incol + 1) * h_dim1], ldh, &u[u_offset], ldu, &
+ c_b15, &wv[wv_offset], ldwv)
+ ;
+
+/* ==== Copy left of H to right of scratch ==== */
+
+ slacpy_("ALL", &jlen, &j2, &h__[jrow + (incol + 1) *
+ h_dim1], ldh, &wv[(i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ strmm_("R", "L", "N", "N", &jlen, &i__5, &c_b15, &u[(i2 +
+ 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) * wv_dim1 + 1]
+ , ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ sgemm_("N", "N", &jlen, &i__5, &i__7, &c_b15, &h__[jrow +
+ (incol + 1 + j2) * h_dim1], ldh, &u[j2 + 1 + (i2
+ + 1) * u_dim1], ldu, &c_b15, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Copy it back ==== */
+
+ slacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + 1) * h_dim1], ldh);
+/* L200: */
+ }
+
+/* ==== Multiply Z (also vertical) ==== */
+
+ if (*wantz) {
+ i__4 = *ihiz;
+ i__3 = *nv;
+ for (jrow = *iloz; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of Z to left of scratch (first
+ . KZS columns get multiplied by zero) ====
+*/
+
+ slacpy_("ALL", &jlen, &knz, &z__[jrow + (incol + 1 +
+ j2) * z_dim1], ldz, &wv[(kzs + 1) * wv_dim1 +
+ 1], ldwv);
+
+/* ==== Multiply by U12 ==== */
+
+ slaset_("ALL", &jlen, &kzs, &c_b29, &c_b29, &wv[
+ wv_offset], ldwv);
+ strmm_("R", "U", "N", "N", &jlen, &knz, &c_b15, &u[j2
+ + 1 + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1)
+ * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ sgemm_("N", "N", &jlen, &i2, &j2, &c_b15, &z__[jrow +
+ (incol + 1) * z_dim1], ldz, &u[u_offset], ldu,
+ &c_b15, &wv[wv_offset], ldwv);
+
+/* ==== Copy left of Z to right of scratch ==== */
+
+ slacpy_("ALL", &jlen, &j2, &z__[jrow + (incol + 1) *
+ z_dim1], ldz, &wv[(i2 + 1) * wv_dim1 + 1],
+ ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ strmm_("R", "L", "N", "N", &jlen, &i__5, &c_b15, &u[(
+ i2 + 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ sgemm_("N", "N", &jlen, &i__5, &i__7, &c_b15, &z__[
+ jrow + (incol + 1 + j2) * z_dim1], ldz, &u[j2
+ + 1 + (i2 + 1) * u_dim1], ldu, &c_b15, &wv[(
+ i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Copy the result back to Z ==== */
+
+ slacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &
+ z__[jrow + (incol + 1) * z_dim1], ldz);
+/* L210: */
+ }
+ }
+ }
+ }
+/* L220: */
+ }
+
+/* ==== End of SLAQR5 ==== */
+
+ return 0;
+} /* slaqr5_ */
+
+/* Subroutine */ int slarf_(char *side, integer *m, integer *n, real *v,
+ integer *incv, real *tau, real *c__, integer *ldc, real *work)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset;
+ real r__1;
+
+ /* Local variables */
+ static integer i__;
+ static logical applyleft;
+ extern /* Subroutine */ int sger_(integer *, integer *, real *, real *,
+ integer *, real *, integer *, real *, integer *);
+ extern logical lsame_(char *, char *);
+ static integer lastc;
+ extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *,
+ real *, integer *, real *, integer *, real *, real *, integer *);
+ static integer lastv;
+ extern integer ilaslc_(integer *, integer *, real *, integer *), ilaslr_(
+ integer *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLARF applies a real elementary reflector H to a real m by n matrix
+ C, from either the left or the right. H is represented in the form
+
+ H = I - tau * v * v'
+
+ where tau is a real scalar and v is a real vector.
+
+ If tau = 0, then H is taken to be the unit matrix.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': form H * C
+ = 'R': form C * H
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ V (input) REAL array, dimension
+ (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+ or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+ The vector v in the representation of H. V is not used if
+ TAU = 0.
+
+ INCV (input) INTEGER
+ The increment between elements of v. INCV <> 0.
+
+ TAU (input) REAL
+ The value tau in the representation of H.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+ or C * H if SIDE = 'R'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) REAL array, dimension
+ (N) if SIDE = 'L'
+ or (M) if SIDE = 'R'
+
+ =====================================================================
+*/
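+
+/*
+   ==== The code below first finds the last non-zero entry of V and
+   .    the last non-zero column (row) of C, then forms H*C (C*H) as
+   .    a GEMV followed by a rank-1 GER update:
+   .        w := C'*v,  C := C - tau*v*w'   (SIDE = 'L')
+   .        w := C*v,   C := C - tau*w*v'   (SIDE = 'R') ====
+*/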
+
+
+ /* Parameter adjustments */
+ --v;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ applyleft = lsame_(side, "L");
+ lastv = 0;
+ lastc = 0;
+ if (*tau != 0.f) {
+/*
+ Set up variables for scanning V. LASTV begins pointing to the end
+ of V.
+*/
+ if (applyleft) {
+ lastv = *m;
+ } else {
+ lastv = *n;
+ }
+ if (*incv > 0) {
+ i__ = (lastv - 1) * *incv + 1;
+ } else {
+ i__ = 1;
+ }
+/* Look for the last non-zero row in V. */
+ while(lastv > 0 && v[i__] == 0.f) {
+ --lastv;
+ i__ -= *incv;
+ }
+ if (applyleft) {
+/* Scan for the last non-zero column in C(1:lastv,:). */
+ lastc = ilaslc_(&lastv, n, &c__[c_offset], ldc);
+ } else {
+/* Scan for the last non-zero row in C(:,1:lastv). */
+ lastc = ilaslr_(m, &lastv, &c__[c_offset], ldc);
+ }
+ }
+/*
+ Note that lastc.eq.0 renders the BLAS operations null; no special
+ case is needed at this level.
+*/
+ if (applyleft) {
+
+/* Form H * C */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastv,1:lastc)' * v(1:lastv,1) */
+
+ sgemv_("Transpose", &lastv, &lastc, &c_b15, &c__[c_offset], ldc, &
+ v[1], incv, &c_b29, &work[1], &c__1);
+
+/* C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)' */
+
+ r__1 = -(*tau);
+ sger_(&lastv, &lastc, &r__1, &v[1], incv, &work[1], &c__1, &c__[
+ c_offset], ldc);
+ }
+ } else {
+
+/* Form C * H */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1) */
+
+ sgemv_("No transpose", &lastc, &lastv, &c_b15, &c__[c_offset],
+ ldc, &v[1], incv, &c_b29, &work[1], &c__1);
+
+/* C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)' */
+
+ r__1 = -(*tau);
+ sger_(&lastc, &lastv, &r__1, &work[1], &c__1, &v[1], incv, &c__[
+ c_offset], ldc);
+ }
+ }
+ return 0;
+
+/* End of SLARF */
+
+} /* slarf_ */
+
+/* Subroutine */ int slarfb_(char *side, char *trans, char *direct, char *
+ storev, integer *m, integer *n, integer *k, real *v, integer *ldv,
+ real *t, integer *ldt, real *c__, integer *ldc, real *work, integer *
+ ldwork)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1,
+ work_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+ static integer lastc;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer lastv;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), strmm_(char *, char *, char *, char *, integer *,
+ integer *, real *, real *, integer *, real *, integer *);
+ extern integer ilaslc_(integer *, integer *, real *, integer *), ilaslr_(
+ integer *, integer *, real *, integer *);
+ static char transt[1];
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLARFB applies a real block reflector H or its transpose H' to a
+ real m by n matrix C, from either the left or the right.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply H or H' from the Left
+ = 'R': apply H or H' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply H (No transpose)
+ = 'T': apply H' (Transpose)
+
+ DIRECT (input) CHARACTER*1
+ Indicates how H is formed from a product of elementary
+ reflectors
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Indicates how the vectors which define the elementary
+ reflectors are stored:
+ = 'C': Columnwise
+ = 'R': Rowwise
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ K (input) INTEGER
+ The order of the matrix T (= the number of elementary
+ reflectors whose product defines the block reflector).
+
+ V (input) REAL array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,M) if STOREV = 'R' and SIDE = 'L'
+ (LDV,N) if STOREV = 'R' and SIDE = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+ if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+ if STOREV = 'R', LDV >= K.
+
+ T (input) REAL array, dimension (LDT,K)
+ The triangular k by k matrix T in the representation of the
+ block reflector.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by H*C or H'*C or C*H or C*H'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) REAL array, dimension (LDWORK,K)
+
+ LDWORK (input) INTEGER
+ The leading dimension of the array WORK.
+ If SIDE = 'L', LDWORK >= max(1,N);
+ if SIDE = 'R', LDWORK >= max(1,M).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ work_dim1 = *ldwork;
+ work_offset = 1 + work_dim1;
+ work -= work_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(trans, "N")) {
+ *(unsigned char *)transt = 'T';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ if (lsame_(storev, "C")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 ) (first K rows)
+ ( V2 )
+ where V1 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+/* L10: */
+ }
+
+/* W := W * V1 */
+
+ strmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2 */
+
+ i__1 = lastv - *k;
+ sgemm_("Transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[*k + 1 + c_dim1], ldc, &v[*k + 1 +
+ v_dim1], ldv, &c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ strmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - V2 * W' */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[*k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork, &c_b15, &c__[*k + 1 +
+ c_dim1], ldc);
+ }
+
+/* W := W * V1' */
+
+ strmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L40: */
+ }
+
+/* W := W * V1 */
+
+ strmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2 */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[*k +
+ 1 + v_dim1], ldv, &c_b15, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ strmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2' */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "Transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[*k + 1 +
+ v_dim1], ldv, &c_b15, &c__[(*k + 1) * c_dim1 + 1],
+ ldc);
+ }
+
+/* W := W * V1' */
+
+ strmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] -= work[i__ + j * work_dim1];
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 )
+ ( V2 ) (last K rows)
+ where V2 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+/* L70: */
+ }
+
+/* W := W * V2 */
+
+ strmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1 */
+
+ i__1 = lastv - *k;
+ sgemm_("Transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ strmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1 * W' */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[v_offset], ldv, &work[work_offset],
+ ldwork, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ strmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[lastv - *k + j + i__ * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[(*n - *k + j) * c_dim1 + 1], &c__1, &
+ work[j * work_dim1 + 1], &c__1);
+/* L100: */
+ }
+
+/* W := W * V2 */
+
+ strmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1 */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b15, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ strmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1' */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "Transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[v_offset],
+ ldv, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ strmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + (lastv - *k + j) * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L110: */
+ }
+/* L120: */
+ }
+ }
+ }
+
+ } else if (lsame_(storev, "R")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 V2 ) (V1: first K columns)
+ where V1 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+/* L130: */
+ }
+
+/* W := W * V1' */
+
+ strmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2' */
+
+ i__1 = lastv - *k;
+ sgemm_("Transpose", "Transpose", &lastc, k, &i__1, &c_b15,
+ &c__[*k + 1 + c_dim1], ldc, &v[(*k + 1) * v_dim1
+ + 1], ldv, &c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ strmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - V2' * W' */
+
+ i__1 = lastv - *k;
+ sgemm_("Transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[(*k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork, &c_b15, &c__[*k + 1 +
+ c_dim1], ldc);
+ }
+
+/* W := W * V1 */
+
+ strmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];
+/* L140: */
+ }
+/* L150: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L160: */
+ }
+
+/* W := W * V1' */
+
+ strmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2' */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "Transpose", &lastc, k, &i__1, &
+ c_b15, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[(*k +
+ 1) * v_dim1 + 1], ldv, &c_b15, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ strmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2 */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[(*k + 1) *
+ v_dim1 + 1], ldv, &c_b15, &c__[(*k + 1) * c_dim1
+ + 1], ldc);
+ }
+
+/* W := W * V1 */
+
+ strmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + j * c_dim1] -= work[i__ + j * work_dim1];
+/* L170: */
+ }
+/* L180: */
+ }
+
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 V2 ) (V2: last K columns)
+ where V2 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+/* L190: */
+ }
+
+/* W := W * V2' */
+
+ strmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1' */
+
+ i__1 = lastv - *k;
+ sgemm_("Transpose", "Transpose", &lastc, k, &i__1, &c_b15,
+ &c__[c_offset], ldc, &v[v_offset], ldv, &c_b15, &
+ work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ strmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b15, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1' * W' */
+
+ i__1 = lastv - *k;
+ sgemm_("Transpose", "Transpose", &i__1, &lastc, k, &
+ c_b151, &v[v_offset], ldv, &work[work_offset],
+ ldwork, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ strmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[lastv - *k + j + i__ * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L200: */
+ }
+/* L210: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilaslc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilaslr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ scopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1,
+ &work[j * work_dim1 + 1], &c__1);
+/* L220: */
+ }
+
+/* W := W * V2' */
+
+ strmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1' */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "Transpose", &lastc, k, &i__1, &
+ c_b15, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b15, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ strmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b15,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1 */
+
+ i__1 = lastv - *k;
+ sgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ c_b151, &work[work_offset], ldwork, &v[v_offset],
+ ldv, &c_b15, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ strmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b15, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ c__[i__ + (lastv - *k + j) * c_dim1] -= work[i__ + j *
+ work_dim1];
+/* L230: */
+ }
+/* L240: */
+ }
+
+ }
+
+ }
+ }
+
+ return 0;
+
+/* End of SLARFB */
+
+} /* slarfb_ */
+
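+/*
+   Illustrative sketch (not part of the LAPACK/f2c source; excluded from
+   the build): forming the triangular factor T of a block reflector
+   H = I - V*T*V' with slarft_ (defined further below in this file) and
+   applying H to a 4 x 2 matrix from the left with slarfb_.  All numeric
+   values are hypothetical.
+*/
+#if 0
+static void example_slarfb(void)
+{
+    integer m = 4, n = 2, k = 2, ldv = 4, ldt = 2, ldc = 4, ldwork = 2;
+    real v[8] = {1.f, .5f, .25f, 0.f,      /* reflector vector 1 */
+                 0.f, 1.f, .5f, .25f};     /* reflector vector 2 */
+    real tau[2] = {1.5f, 1.2f};            /* scalar factors */
+    real t[4];                             /* k x k triangular factor T */
+    real c[8] = {1.f, 2.f, 3.f, 4.f,       /* column 1 of C */
+                 5.f, 6.f, 7.f, 8.f};      /* column 2 of C */
+    real work[4];                          /* ldwork x k workspace */
+
+    slarft_("Forward", "Columnwise", &m, &k, v, &ldv, tau, t, &ldt);
+
+    /* On return c holds H * C. */
+    slarfb_("Left", "No transpose", "Forward", "Columnwise", &m, &n, &k,
+            v, &ldv, t, &ldt, c, &ldc, work, &ldwork);
+}
+#endif
+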
+/* Subroutine */ int slarfg_(integer *n, real *alpha, real *x, integer *incx,
+ real *tau)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1;
+
+ /* Local variables */
+ static integer j, knt;
+ static real beta;
+ extern doublereal snrm2_(integer *, real *, integer *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static real xnorm;
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ static real safmin, rsafmn;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLARFG generates a real elementary reflector H of order n, such
+ that
+
+ H * ( alpha ) = ( beta ), H' * H = I.
+ ( x ) ( 0 )
+
+ where alpha and beta are scalars, and x is an (n-1)-element real
+ vector. H is represented in the form
+
+ H = I - tau * ( 1 ) * ( 1 v' ) ,
+ ( v )
+
+ where tau is a real scalar and v is a real (n-1)-element
+ vector.
+
+ If the elements of x are all zero, then tau = 0 and H is taken to be
+ the unit matrix.
+
+ Otherwise 1 <= tau <= 2.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the elementary reflector.
+
+ ALPHA (input/output) REAL
+ On entry, the value alpha.
+ On exit, it is overwritten with the value beta.
+
+ X (input/output) REAL array, dimension
+ (1+(N-2)*abs(INCX))
+ On entry, the vector x.
+ On exit, it is overwritten with the vector v.
+
+ INCX (input) INTEGER
+ The increment between elements of X. INCX > 0.
+
+ TAU (output) REAL
+ The value tau.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n <= 1) {
+ *tau = 0.f;
+ return 0;
+ }
+
+ i__1 = *n - 1;
+ xnorm = snrm2_(&i__1, &x[1], incx);
+
+ if (xnorm == 0.f) {
+
+/* H = I */
+
+ *tau = 0.f;
+ } else {
+
+/* general case */
+
+ r__1 = slapy2_(alpha, &xnorm);
+ beta = -r_sign(&r__1, alpha);
+ safmin = slamch_("S") / slamch_("E");
+ knt = 0;
+ if (dabs(beta) < safmin) {
+
+/* XNORM, BETA may be inaccurate; scale X and recompute them */
+
+ rsafmn = 1.f / safmin;
+L10:
+ ++knt;
+ i__1 = *n - 1;
+ sscal_(&i__1, &rsafmn, &x[1], incx);
+ beta *= rsafmn;
+ *alpha *= rsafmn;
+ if (dabs(beta) < safmin) {
+ goto L10;
+ }
+
+/* New BETA is at most 1, at least SAFMIN */
+
+ i__1 = *n - 1;
+ xnorm = snrm2_(&i__1, &x[1], incx);
+ r__1 = slapy2_(alpha, &xnorm);
+ beta = -r_sign(&r__1, alpha);
+ }
+ *tau = (beta - *alpha) / beta;
+ i__1 = *n - 1;
+ r__1 = 1.f / (*alpha - beta);
+ sscal_(&i__1, &r__1, &x[1], incx);
+
+/* If ALPHA is subnormal, it may lose relative accuracy */
+
+ i__1 = knt;
+ for (j = 1; j <= i__1; ++j) {
+ beta *= safmin;
+/* L20: */
+ }
+ *alpha = beta;
+ }
+
+ return 0;
+
+/* End of SLARFG */
+
+} /* slarfg_ */
+
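+/*
+   Illustrative sketch (not part of the LAPACK/f2c source; excluded from
+   the build): generating an elementary reflector for the vector
+   (3, 4, 0)'.  Here xnorm = 4, so beta = -sign(sqrt(3^2 + 4^2), 3) = -5,
+   tau = (beta - alpha) / beta = 1.6 and v = (4, 0)' / (alpha - beta)
+   = (0.5, 0)'.  The resulting H maps (3, 4, 0)' to (-5, 0, 0)'.
+*/
+#if 0
+static void example_slarfg(void)
+{
+    integer n = 3, incx = 1;
+    real alpha = 3.f;              /* first element of the vector */
+    real x[2] = {4.f, 0.f};        /* remaining n-1 elements */
+    real tau;
+
+    /* On return alpha holds beta = -5 and x holds v = (0.5, 0)'. */
+    slarfg_(&n, &alpha, x, &incx, &tau);
+}
+#endif
+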
+/* Subroutine */ int slarft_(char *direct, char *storev, integer *n, integer *
+ k, real *v, integer *ldv, real *tau, real *t, integer *ldt)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2, i__3;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, prevlastv;
+ static real vii;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *,
+ real *, integer *, real *, integer *, real *, real *, integer *);
+ static integer lastv;
+ extern /* Subroutine */ int strmv_(char *, char *, char *, integer *,
+ real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLARFT forms the triangular factor T of a real block reflector H
+ of order n, which is defined as a product of k elementary reflectors.
+
+ If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+
+ If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+
+ If STOREV = 'C', the vector which defines the elementary reflector
+ H(i) is stored in the i-th column of the array V, and
+
+ H = I - V * T * V'
+
+ If STOREV = 'R', the vector which defines the elementary reflector
+ H(i) is stored in the i-th row of the array V, and
+
+ H = I - V' * T * V
+
+ Arguments
+ =========
+
+ DIRECT (input) CHARACTER*1
+ Specifies the order in which the elementary reflectors are
+ multiplied to form the block reflector:
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Specifies how the vectors which define the elementary
+ reflectors are stored (see also Further Details):
+ = 'C': columnwise
+ = 'R': rowwise
+
+ N (input) INTEGER
+ The order of the block reflector H. N >= 0.
+
+ K (input) INTEGER
+ The order of the triangular factor T (= the number of
+ elementary reflectors). K >= 1.
+
+ V (input/output) REAL array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,N) if STOREV = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i).
+
+ T (output) REAL array, dimension (LDT,K)
+ The k by k triangular factor T of the block reflector.
+ If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+ lower triangular. The rest of the array is not used.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ Further Details
+ ===============
+
+ The shape of the matrix V and the storage of the vectors which define
+ the H(i) is best illustrated by the following example with n = 5 and
+ k = 3. The elements equal to 1 are not stored; the corresponding
+ array elements are modified but restored on exit. The rest of the
+ array is not used.
+
+ DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+
+ V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+ ( v1 1 ) ( 1 v2 v2 v2 )
+ ( v1 v2 1 ) ( 1 v3 v3 )
+ ( v1 v2 v3 )
+ ( v1 v2 v3 )
+
+ DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+
+ V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+ ( v1 v2 v3 ) ( v2 v2 v2 1 )
+ ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+ ( 1 v3 )
+ ( 1 )
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ --tau;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (lsame_(direct, "F")) {
+ prevlastv = *n;
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ prevlastv = max(i__,prevlastv);
+ if (tau[i__] == 0.f) {
+
+/* H(i) = I */
+
+ i__2 = i__;
+ for (j = 1; j <= i__2; ++j) {
+ t[j + i__ * t_dim1] = 0.f;
+/* L10: */
+ }
+ } else {
+
+/* general case */
+
+ vii = v[i__ + i__ * v_dim1];
+ v[i__ + i__ * v_dim1] = 1.f;
+ if (lsame_(storev, "C")) {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ if (v[lastv + i__ * v_dim1] != 0.f) {
+ goto L15;
+ }
+ }
+L15:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)' * V(i:j,i) */
+
+ i__2 = j - i__ + 1;
+ i__3 = i__ - 1;
+ r__1 = -tau[i__];
+ sgemv_("Transpose", &i__2, &i__3, &r__1, &v[i__ + v_dim1],
+ ldv, &v[i__ + i__ * v_dim1], &c__1, &c_b29, &t[
+ i__ * t_dim1 + 1], &c__1);
+ } else {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ if (v[i__ + lastv * v_dim1] != 0.f) {
+ goto L16;
+ }
+ }
+L16:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)' */
+
+ i__2 = i__ - 1;
+ i__3 = j - i__ + 1;
+ r__1 = -tau[i__];
+ sgemv_("No transpose", &i__2, &i__3, &r__1, &v[i__ *
+ v_dim1 + 1], ldv, &v[i__ + i__ * v_dim1], ldv, &
+ c_b29, &t[i__ * t_dim1 + 1], &c__1);
+ }
+ v[i__ + i__ * v_dim1] = vii;
+
+/* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ strmv_("Upper", "No transpose", "Non-unit", &i__2, &t[
+ t_offset], ldt, &t[i__ * t_dim1 + 1], &c__1);
+ t[i__ + i__ * t_dim1] = tau[i__];
+ if (i__ > 1) {
+ prevlastv = max(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+/* L20: */
+ }
+ } else {
+ prevlastv = 1;
+ for (i__ = *k; i__ >= 1; --i__) {
+ if (tau[i__] == 0.f) {
+
+/* H(i) = I */
+
+ i__1 = *k;
+ for (j = i__; j <= i__1; ++j) {
+ t[j + i__ * t_dim1] = 0.f;
+/* L30: */
+ }
+ } else {
+
+/* general case */
+
+ if (i__ < *k) {
+ if (lsame_(storev, "C")) {
+ vii = v[*n - *k + i__ + i__ * v_dim1];
+ v[*n - *k + i__ + i__ * v_dim1] = 1.f;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ if (v[lastv + i__ * v_dim1] != 0.f) {
+ goto L35;
+ }
+ }
+L35:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(j:n-k+i,i+1:k)' * V(j:n-k+i,i)
+*/
+
+ i__1 = *n - *k + i__ - j + 1;
+ i__2 = *k - i__;
+ r__1 = -tau[i__];
+ sgemv_("Transpose", &i__1, &i__2, &r__1, &v[j + (i__
+ + 1) * v_dim1], ldv, &v[j + i__ * v_dim1], &
+ c__1, &c_b29, &t[i__ + 1 + i__ * t_dim1], &
+ c__1);
+ v[*n - *k + i__ + i__ * v_dim1] = vii;
+ } else {
+ vii = v[i__ + (*n - *k + i__) * v_dim1];
+ v[i__ + (*n - *k + i__) * v_dim1] = 1.f;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ if (v[i__ + lastv * v_dim1] != 0.f) {
+ goto L36;
+ }
+ }
+L36:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)'
+*/
+
+ i__1 = *k - i__;
+ i__2 = *n - *k + i__ - j + 1;
+ r__1 = -tau[i__];
+ sgemv_("No transpose", &i__1, &i__2, &r__1, &v[i__ +
+ 1 + j * v_dim1], ldv, &v[i__ + j * v_dim1],
+ ldv, &c_b29, &t[i__ + 1 + i__ * t_dim1], &
+ c__1);
+ v[i__ + (*n - *k + i__) * v_dim1] = vii;
+ }
+
+/* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) */
+
+ i__1 = *k - i__;
+ strmv_("Lower", "No transpose", "Non-unit", &i__1, &t[i__
+ + 1 + (i__ + 1) * t_dim1], ldt, &t[i__ + 1 + i__ *
+ t_dim1], &c__1)
+ ;
+ if (i__ > 1) {
+ prevlastv = min(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+ t[i__ + i__ * t_dim1] = tau[i__];
+ }
+/* L40: */
+ }
+ }
+ return 0;
+
+/* End of SLARFT */
+
+} /* slarft_ */
+
+/* Subroutine */ int slarfx_(char *side, integer *m, integer *n, real *v,
+ real *tau, real *c__, integer *ldc, real *work)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, i__1;
+
+ /* Local variables */
+ static integer j;
+ static real t1, t2, t3, t4, t5, t6, t7, t8, t9, v1, v2, v3, v4, v5, v6,
+ v7, v8, v9, t10, v10, sum;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLARFX applies a real elementary reflector H to a real m by n
+ matrix C, from either the left or the right. H is represented in the
+ form
+
+ H = I - tau * v * v'
+
+ where tau is a real scalar and v is a real vector.
+
+ If tau = 0, then H is taken to be the unit matrix
+
+ This version uses inline code if H has order < 11.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': form H * C
+ = 'R': form C * H
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ V (input) REAL array, dimension (M) if SIDE = 'L'
+ or (N) if SIDE = 'R'
+ The vector v in the representation of H.
+
+ TAU (input) REAL
+ The value tau in the representation of H.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+ or C * H if SIDE = 'R'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) REAL array, dimension
+ (N) if SIDE = 'L'
+ or (M) if SIDE = 'R'
+ WORK is not referenced if H has order < 11.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --v;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ if (*tau == 0.f) {
+ return 0;
+ }
+ if (lsame_(side, "L")) {
+
+/* Form H * C, where H has order m. */
+
+ switch (*m) {
+ case 1: goto L10;
+ case 2: goto L30;
+ case 3: goto L50;
+ case 4: goto L70;
+ case 5: goto L90;
+ case 6: goto L110;
+ case 7: goto L130;
+ case 8: goto L150;
+ case 9: goto L170;
+ case 10: goto L190;
+ }
+
+/* Code for general M */
+
+ slarf_(side, m, n, &v[1], &c__1, tau, &c__[c_offset], ldc, &work[1]);
+ goto L410;
+L10:
+
+/* Special code for 1 x 1 Householder */
+
+ t1 = 1.f - *tau * v[1] * v[1];
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ c__[j * c_dim1 + 1] = t1 * c__[j * c_dim1 + 1];
+/* L20: */
+ }
+ goto L410;
+L30:
+
+/* Special code for 2 x 2 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+/* L40: */
+ }
+ goto L410;
+L50:
+
+/* Special code for 3 x 3 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+/* L60: */
+ }
+ goto L410;
+L70:
+
+/* Special code for 4 x 4 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+/* L80: */
+ }
+ goto L410;
+L90:
+
+/* Special code for 5 x 5 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+/* L100: */
+ }
+ goto L410;
+L110:
+
+/* Special code for 6 x 6 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+/* L120: */
+ }
+ goto L410;
+L130:
+
+/* Special code for 7 x 7 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+/* L140: */
+ }
+ goto L410;
+L150:
+
+/* Special code for 8 x 8 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7] + v8 * c__[j * c_dim1 + 8];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+ c__[j * c_dim1 + 8] -= sum * t8;
+/* L160: */
+ }
+ goto L410;
+L170:
+
+/* Special code for 9 x 9 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7] + v8 * c__[j * c_dim1 + 8] + v9 * c__[j *
+ c_dim1 + 9];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+ c__[j * c_dim1 + 8] -= sum * t8;
+ c__[j * c_dim1 + 9] -= sum * t9;
+/* L180: */
+ }
+ goto L410;
+L190:
+
+/* Special code for 10 x 10 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ v10 = v[10];
+ t10 = *tau * v10;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j * c_dim1 + 1] + v2 * c__[j * c_dim1 + 2] + v3 *
+ c__[j * c_dim1 + 3] + v4 * c__[j * c_dim1 + 4] + v5 * c__[
+ j * c_dim1 + 5] + v6 * c__[j * c_dim1 + 6] + v7 * c__[j *
+ c_dim1 + 7] + v8 * c__[j * c_dim1 + 8] + v9 * c__[j *
+ c_dim1 + 9] + v10 * c__[j * c_dim1 + 10];
+ c__[j * c_dim1 + 1] -= sum * t1;
+ c__[j * c_dim1 + 2] -= sum * t2;
+ c__[j * c_dim1 + 3] -= sum * t3;
+ c__[j * c_dim1 + 4] -= sum * t4;
+ c__[j * c_dim1 + 5] -= sum * t5;
+ c__[j * c_dim1 + 6] -= sum * t6;
+ c__[j * c_dim1 + 7] -= sum * t7;
+ c__[j * c_dim1 + 8] -= sum * t8;
+ c__[j * c_dim1 + 9] -= sum * t9;
+ c__[j * c_dim1 + 10] -= sum * t10;
+/* L200: */
+ }
+ goto L410;
+ } else {
+
+/* Form C * H, where H has order n. */
+
+ switch (*n) {
+ case 1: goto L210;
+ case 2: goto L230;
+ case 3: goto L250;
+ case 4: goto L270;
+ case 5: goto L290;
+ case 6: goto L310;
+ case 7: goto L330;
+ case 8: goto L350;
+ case 9: goto L370;
+ case 10: goto L390;
+ }
+
+/* Code for general N */
+
+ slarf_(side, m, n, &v[1], &c__1, tau, &c__[c_offset], ldc, &work[1]);
+ goto L410;
+L210:
+
+/* Special code for 1 x 1 Householder */
+
+ t1 = 1.f - *tau * v[1] * v[1];
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ c__[j + c_dim1] = t1 * c__[j + c_dim1];
+/* L220: */
+ }
+ goto L410;
+L230:
+
+/* Special code for 2 x 2 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+/* L240: */
+ }
+ goto L410;
+L250:
+
+/* Special code for 3 x 3 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+/* L260: */
+ }
+ goto L410;
+L270:
+
+/* Special code for 4 x 4 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+/* L280: */
+ }
+ goto L410;
+L290:
+
+/* Special code for 5 x 5 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+/* L300: */
+ }
+ goto L410;
+L310:
+
+/* Special code for 6 x 6 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+/* L320: */
+ }
+ goto L410;
+L330:
+
+/* Special code for 7 x 7 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+/* L340: */
+ }
+ goto L410;
+L350:
+
+/* Special code for 8 x 8 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7] + v8 * c__[j + (c_dim1 << 3)];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+ c__[j + (c_dim1 << 3)] -= sum * t8;
+/* L360: */
+ }
+ goto L410;
+L370:
+
+/* Special code for 9 x 9 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7] + v8 * c__[j + (c_dim1 << 3)] + v9 * c__[
+ j + c_dim1 * 9];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+ c__[j + (c_dim1 << 3)] -= sum * t8;
+ c__[j + c_dim1 * 9] -= sum * t9;
+/* L380: */
+ }
+ goto L410;
+L390:
+
+/* Special code for 10 x 10 Householder */
+
+ v1 = v[1];
+ t1 = *tau * v1;
+ v2 = v[2];
+ t2 = *tau * v2;
+ v3 = v[3];
+ t3 = *tau * v3;
+ v4 = v[4];
+ t4 = *tau * v4;
+ v5 = v[5];
+ t5 = *tau * v5;
+ v6 = v[6];
+ t6 = *tau * v6;
+ v7 = v[7];
+ t7 = *tau * v7;
+ v8 = v[8];
+ t8 = *tau * v8;
+ v9 = v[9];
+ t9 = *tau * v9;
+ v10 = v[10];
+ t10 = *tau * v10;
+ i__1 = *m;
+ for (j = 1; j <= i__1; ++j) {
+ sum = v1 * c__[j + c_dim1] + v2 * c__[j + (c_dim1 << 1)] + v3 *
+ c__[j + c_dim1 * 3] + v4 * c__[j + (c_dim1 << 2)] + v5 *
+ c__[j + c_dim1 * 5] + v6 * c__[j + c_dim1 * 6] + v7 * c__[
+ j + c_dim1 * 7] + v8 * c__[j + (c_dim1 << 3)] + v9 * c__[
+ j + c_dim1 * 9] + v10 * c__[j + c_dim1 * 10];
+ c__[j + c_dim1] -= sum * t1;
+ c__[j + (c_dim1 << 1)] -= sum * t2;
+ c__[j + c_dim1 * 3] -= sum * t3;
+ c__[j + (c_dim1 << 2)] -= sum * t4;
+ c__[j + c_dim1 * 5] -= sum * t5;
+ c__[j + c_dim1 * 6] -= sum * t6;
+ c__[j + c_dim1 * 7] -= sum * t7;
+ c__[j + (c_dim1 << 3)] -= sum * t8;
+ c__[j + c_dim1 * 9] -= sum * t9;
+ c__[j + c_dim1 * 10] -= sum * t10;
+/* L400: */
+ }
+ goto L410;
+ }
+L410:
+ return 0;
+
+/* End of SLARFX */
+
+} /* slarfx_ */
+
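+/*
+   Illustrative sketch (not part of the LAPACK/f2c source; excluded from
+   the build): applying a 2 x 2 reflector with slarfx_, which takes the
+   unrolled "special code for 2 x 2 Householder" path above.  The values
+   of v and tau are hypothetical.
+*/
+#if 0
+static void example_slarfx(void)
+{
+    integer m = 2, n = 3, ldc = 2;
+    real v[2] = {1.f, .5f};
+    real tau = 1.6f;
+    real c[6] = {3.f, 4.f,      /* column 1 of C */
+                 1.f, 0.f,      /* column 2 of C */
+                 2.f, 2.f};     /* column 3 of C */
+    real work[1];               /* not referenced since m < 11 */
+
+    /* On return c holds H * C. */
+    slarfx_("Left", &m, &n, v, &tau, c, &ldc, work);
+}
+#endif
+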
+/* Subroutine */ int slartg_(real *f, real *g, real *cs, real *sn, real *r__)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__;
+ static real f1, g1, eps, scale;
+ static integer count;
+ static real safmn2, safmx2;
+ extern doublereal slamch_(char *);
+ static real safmin;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLARTG generates a plane rotation so that
+
+ [ CS SN ] . [ F ] = [ R ] where CS**2 + SN**2 = 1.
+ [ -SN CS ] [ G ] [ 0 ]
+
+ This is a slower, more accurate version of the BLAS1 routine SROTG,
+ with the following other differences:
+ F and G are unchanged on return.
+ If G=0, then CS=1 and SN=0.
+ If F=0 and (G .ne. 0), then CS=0 and SN=1 without doing any
+ floating point operations (saves work in SBDSQR when
+ there are zeros on the diagonal).
+
+ If F exceeds G in magnitude, CS will be positive.
+
+ Arguments
+ =========
+
+ F (input) REAL
+ The first component of vector to be rotated.
+
+ G (input) REAL
+ The second component of vector to be rotated.
+
+ CS (output) REAL
+ The cosine of the rotation.
+
+ SN (output) REAL
+ The sine of the rotation.
+
+ R (output) REAL
+ The nonzero component of the rotated vector.
+
+ This version has a few statements commented out for thread safety
+ (machine parameters are computed on each entry). 10 feb 03, SJH.
+
+ =====================================================================
+
+ LOGICAL FIRST
+ SAVE FIRST, SAFMX2, SAFMIN, SAFMN2
+ DATA FIRST / .TRUE. /
+
+ IF( FIRST ) THEN
+*/
+ safmin = slamch_("S");
+ eps = slamch_("E");
+ r__1 = slamch_("B");
+ i__1 = (integer) (log(safmin / eps) / log(slamch_("B")) / 2.f);
+ safmn2 = pow_ri(&r__1, &i__1);
+ safmx2 = 1.f / safmn2;
+/*
+ FIRST = .FALSE.
+ END IF
+*/
+ if (*g == 0.f) {
+ *cs = 1.f;
+ *sn = 0.f;
+ *r__ = *f;
+ } else if (*f == 0.f) {
+ *cs = 0.f;
+ *sn = 1.f;
+ *r__ = *g;
+ } else {
+ f1 = *f;
+ g1 = *g;
+/* Computing MAX */
+ r__1 = dabs(f1), r__2 = dabs(g1);
+ scale = dmax(r__1,r__2);
+ if (scale >= safmx2) {
+ count = 0;
+L10:
+ ++count;
+ f1 *= safmn2;
+ g1 *= safmn2;
+/* Computing MAX */
+ r__1 = dabs(f1), r__2 = dabs(g1);
+ scale = dmax(r__1,r__2);
+ if (scale >= safmx2) {
+ goto L10;
+ }
+/* Computing 2nd power */
+ r__1 = f1;
+/* Computing 2nd power */
+ r__2 = g1;
+ *r__ = sqrt(r__1 * r__1 + r__2 * r__2);
+ *cs = f1 / *r__;
+ *sn = g1 / *r__;
+ i__1 = count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ *r__ *= safmx2;
+/* L20: */
+ }
+ } else if (scale <= safmn2) {
+ count = 0;
+L30:
+ ++count;
+ f1 *= safmx2;
+ g1 *= safmx2;
+/* Computing MAX */
+ r__1 = dabs(f1), r__2 = dabs(g1);
+ scale = dmax(r__1,r__2);
+ if (scale <= safmn2) {
+ goto L30;
+ }
+/* Computing 2nd power */
+ r__1 = f1;
+/* Computing 2nd power */
+ r__2 = g1;
+ *r__ = sqrt(r__1 * r__1 + r__2 * r__2);
+ *cs = f1 / *r__;
+ *sn = g1 / *r__;
+ i__1 = count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ *r__ *= safmn2;
+/* L40: */
+ }
+ } else {
+/* Computing 2nd power */
+ r__1 = f1;
+/* Computing 2nd power */
+ r__2 = g1;
+ *r__ = sqrt(r__1 * r__1 + r__2 * r__2);
+ *cs = f1 / *r__;
+ *sn = g1 / *r__;
+ }
+ if (dabs(*f) > dabs(*g) && *cs < 0.f) {
+ *cs = -(*cs);
+ *sn = -(*sn);
+ *r__ = -(*r__);
+ }
+ }
+ return 0;
+
+/* End of SLARTG */
+
+} /* slartg_ */
+
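+/*
+   Illustrative sketch (not part of the LAPACK/f2c source; excluded from
+   the build): generating the plane rotation that zeroes the second
+   component of (3, 4)'.  For f = 3, g = 4 the routine returns cs = 0.6,
+   sn = 0.8 and r = 5, so that [ 0.6 0.8; -0.8 0.6 ] * (3, 4)' = (5, 0)'.
+*/
+#if 0
+static void example_slartg(void)
+{
+    real f = 3.f, g = 4.f, cs, sn, r;
+
+    slartg_(&f, &g, &cs, &sn, &r);   /* cs = 0.6, sn = 0.8, r = 5 */
+}
+#endif
+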
+/* Subroutine */ int slas2_(real *f, real *g, real *h__, real *ssmin, real *
+ ssmax)
+{
+ /* System generated locals */
+ real r__1, r__2;
+
+ /* Local variables */
+ static real c__, fa, ga, ha, as, at, au, fhmn, fhmx;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAS2 computes the singular values of the 2-by-2 matrix
+ [ F G ]
+ [ 0 H ].
+ On return, SSMIN is the smaller singular value and SSMAX is the
+ larger singular value.
+
+ Arguments
+ =========
+
+ F (input) REAL
+ The (1,1) element of the 2-by-2 matrix.
+
+ G (input) REAL
+ The (1,2) element of the 2-by-2 matrix.
+
+ H (input) REAL
+ The (2,2) element of the 2-by-2 matrix.
+
+ SSMIN (output) REAL
+ The smaller singular value.
+
+ SSMAX (output) REAL
+ The larger singular value.
+
+ Further Details
+ ===============
+
+ Barring over/underflow, all output quantities are correct to within
+ a few units in the last place (ulps), even in the absence of a guard
+ digit in addition/subtraction.
+
+ In IEEE arithmetic, the code works correctly if one matrix element is
+ infinite.
+
+ Overflow will not occur unless the largest singular value itself
+ overflows, or is within a few ulps of overflow. (On machines with
+ partial overflow, like the Cray, overflow may occur if the largest
+ singular value is within a factor of 2 of overflow.)
+
+ Underflow is harmless if underflow is gradual. Otherwise, results
+ may correspond to a matrix modified by perturbations of size near
+ the underflow threshold.
+
+ ====================================================================
+*/
+
+
+ fa = dabs(*f);
+ ga = dabs(*g);
+ ha = dabs(*h__);
+ fhmn = dmin(fa,ha);
+ fhmx = dmax(fa,ha);
+ if (fhmn == 0.f) {
+ *ssmin = 0.f;
+ if (fhmx == 0.f) {
+ *ssmax = ga;
+ } else {
+/* Computing 2nd power */
+ r__1 = dmin(fhmx,ga) / dmax(fhmx,ga);
+ *ssmax = dmax(fhmx,ga) * sqrt(r__1 * r__1 + 1.f);
+ }
+ } else {
+ if (ga < fhmx) {
+ as = fhmn / fhmx + 1.f;
+ at = (fhmx - fhmn) / fhmx;
+/* Computing 2nd power */
+ r__1 = ga / fhmx;
+ au = r__1 * r__1;
+ c__ = 2.f / (sqrt(as * as + au) + sqrt(at * at + au));
+ *ssmin = fhmn * c__;
+ *ssmax = fhmx / c__;
+ } else {
+ au = fhmx / ga;
+ if (au == 0.f) {
+
+/*
+ Avoid possible harmful underflow if exponent range
+ asymmetric (true SSMIN may not underflow even if
+ AU underflows)
+*/
+
+ *ssmin = fhmn * fhmx / ga;
+ *ssmax = ga;
+ } else {
+ as = fhmn / fhmx + 1.f;
+ at = (fhmx - fhmn) / fhmx;
+/* Computing 2nd power */
+ r__1 = as * au;
+/* Computing 2nd power */
+ r__2 = at * au;
+ c__ = 1.f / (sqrt(r__1 * r__1 + 1.f) + sqrt(r__2 * r__2 + 1.f)
+ );
+ *ssmin = fhmn * c__ * au;
+ *ssmin += *ssmin;
+ *ssmax = ga / (c__ + c__);
+ }
+ }
+ }
+ return 0;
+
+/* End of SLAS2 */
+
+} /* slas2_ */
+
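+/*
+   Illustrative sketch (not part of the LAPACK/f2c source; excluded from
+   the build): the singular values of the upper triangular 2-by-2 matrix
+   [ 1 0; 0 2 ] are simply 1 and 2, so slas2_ returns ssmin = 1 and
+   ssmax = 2.
+*/
+#if 0
+static void example_slas2(void)
+{
+    real f = 1.f, g = 0.f, h = 2.f, ssmin, ssmax;
+
+    slas2_(&f, &g, &h, &ssmin, &ssmax);   /* ssmin = 1, ssmax = 2 */
+}
+#endif
+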
+/* Subroutine */ int slascl_(char *type__, integer *kl, integer *ku, real *
+ cfrom, real *cto, integer *m, integer *n, real *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+
+ /* Local variables */
+ static integer i__, j, k1, k2, k3, k4;
+ static real mul, cto1;
+ static logical done;
+ static real ctoc;
+ extern logical lsame_(char *, char *);
+ static integer itype;
+ static real cfrom1;
+ extern doublereal slamch_(char *);
+ static real cfromc;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real bignum;
+ extern logical sisnan_(real *);
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASCL multiplies the M by N real matrix A by the real scalar
+ CTO/CFROM. This is done without over/underflow as long as the final
+ result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+ A may be full, upper triangular, lower triangular, upper Hessenberg,
+ or banded.
+
+ Arguments
+ =========
+
+ TYPE (input) CHARACTER*1
+ TYPE indicates the storage type of the input matrix.
+ = 'G': A is a full matrix.
+ = 'L': A is a lower triangular matrix.
+ = 'U': A is an upper triangular matrix.
+ = 'H': A is an upper Hessenberg matrix.
+ = 'B': A is a symmetric band matrix with lower bandwidth KL
+ and upper bandwidth KU and with only the lower
+ half stored.
+ = 'Q': A is a symmetric band matrix with lower bandwidth KL
+ and upper bandwidth KU and with only the upper
+ half stored.
+ = 'Z': A is a band matrix with lower bandwidth KL and upper
+ bandwidth KU.
+
+ KL (input) INTEGER
+ The lower bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ KU (input) INTEGER
+ The upper bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ CFROM (input) REAL
+ CTO (input) REAL
+ The matrix A is multiplied by CTO/CFROM. A(I,J) is computed
+ without over/underflow if the final result CTO*A(I,J)/CFROM
+ can be represented without over/underflow. CFROM must be
+ nonzero.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ The matrix to be multiplied by CTO/CFROM. See TYPE for the
+ storage type.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ INFO (output) INTEGER
+ 0 - successful exit
+ <0 - if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(type__, "G")) {
+ itype = 0;
+ } else if (lsame_(type__, "L")) {
+ itype = 1;
+ } else if (lsame_(type__, "U")) {
+ itype = 2;
+ } else if (lsame_(type__, "H")) {
+ itype = 3;
+ } else if (lsame_(type__, "B")) {
+ itype = 4;
+ } else if (lsame_(type__, "Q")) {
+ itype = 5;
+ } else if (lsame_(type__, "Z")) {
+ itype = 6;
+ } else {
+ itype = -1;
+ }
+
+ if (itype == -1) {
+ *info = -1;
+ } else if (*cfrom == 0.f || sisnan_(cfrom)) {
+ *info = -4;
+ } else if (sisnan_(cto)) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -6;
+ } else if (*n < 0 || itype == 4 && *n != *m || itype == 5 && *n != *m) {
+ *info = -7;
+ } else if (itype <= 3 && *lda < max(1,*m)) {
+ *info = -9;
+ } else if (itype >= 4) {
+/* Computing MAX */
+ i__1 = *m - 1;
+ if (*kl < 0 || *kl > max(i__1,0)) {
+ *info = -2;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *n - 1;
+ if (*ku < 0 || *ku > max(i__1,0) || (itype == 4 || itype == 5) &&
+ *kl != *ku) {
+ *info = -3;
+ } else if (itype == 4 && *lda < *kl + 1 || itype == 5 && *lda < *
+ ku + 1 || itype == 6 && *lda < (*kl << 1) + *ku + 1) {
+ *info = -9;
+ }
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASCL", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *m == 0) {
+ return 0;
+ }
+
+/* Get machine parameters */
+
+ smlnum = slamch_("S");
+ bignum = 1.f / smlnum;
+
+ cfromc = *cfrom;
+ ctoc = *cto;
+
+L10:
+ cfrom1 = cfromc * smlnum;
+ if (cfrom1 == cfromc) {
+/*
+ CFROMC is an inf. Multiply by a correctly signed zero for
+ finite CTOC, or a NaN if CTOC is infinite.
+*/
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ cto1 = ctoc;
+ } else {
+ cto1 = ctoc / bignum;
+ if (cto1 == ctoc) {
+/*
+ CTOC is either 0 or an inf. In both cases, CTOC itself
+ serves as the correct multiplication factor.
+*/
+ mul = ctoc;
+ done = TRUE_;
+ cfromc = 1.f;
+ } else if (dabs(cfrom1) > dabs(ctoc) && ctoc != 0.f) {
+ mul = smlnum;
+ done = FALSE_;
+ cfromc = cfrom1;
+ } else if (dabs(cto1) > dabs(cfromc)) {
+ mul = bignum;
+ done = FALSE_;
+ ctoc = cto1;
+ } else {
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ }
+ }
+
+ if (itype == 0) {
+
+/* Full matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (itype == 1) {
+
+/* Lower triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L40: */
+ }
+/* L50: */
+ }
+
+ } else if (itype == 2) {
+
+/* Upper triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L60: */
+ }
+/* L70: */
+ }
+
+ } else if (itype == 3) {
+
+/* Upper Hessenberg matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j + 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (itype == 4) {
+
+/* Lower half of a symmetric band matrix */
+
+ k3 = *kl + 1;
+ k4 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = k3, i__4 = k4 - j;
+ i__2 = min(i__3,i__4);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L100: */
+ }
+/* L110: */
+ }
+
+ } else if (itype == 5) {
+
+/* Upper half of a symmetric band matrix */
+
+ k1 = *ku + 2;
+ k3 = *ku + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = k1 - j;
+ i__3 = k3;
+ for (i__ = max(i__2,1); i__ <= i__3; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L120: */
+ }
+/* L130: */
+ }
+
+ } else if (itype == 6) {
+
+/* Band matrix */
+
+ k1 = *kl + *ku + 2;
+ k2 = *kl + 1;
+ k3 = (*kl << 1) + *ku + 1;
+ k4 = *kl + *ku + 1 + *m;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__3 = k1 - j;
+/* Computing MIN */
+ i__4 = k3, i__5 = k4 - j;
+ i__2 = min(i__4,i__5);
+ for (i__ = max(i__3,k2); i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] *= mul;
+/* L140: */
+ }
+/* L150: */
+ }
+
+ }
+
+ if (! done) {
+ goto L10;
+ }
+
+ return 0;
+
+/* End of SLASCL */
+
+} /* slascl_ */
+
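+/*
+   Illustrative sketch (not part of the LAPACK/f2c source; excluded from
+   the build): scaling a full 2 x 2 matrix by cto/cfrom = 0.5 with
+   slascl_.  KL and KU are ignored for TYPE = 'G'.
+*/
+#if 0
+static void example_slascl(void)
+{
+    integer kl = 0, ku = 0, m = 2, n = 2, lda = 2, info;
+    real cfrom = 2.f, cto = 1.f;
+    real a[4] = {2.f, 4.f, 6.f, 8.f};   /* stored column by column */
+
+    /* On return a holds {1, 2, 3, 4} and info = 0. */
+    slascl_("G", &kl, &ku, &cfrom, &cto, &m, &n, a, &lda, &info);
+}
+#endif
+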
+/* Subroutine */ int slasd0_(integer *n, integer *sqre, real *d__, real *e,
+ real *u, integer *ldu, real *vt, integer *ldvt, integer *smlsiz,
+ integer *iwork, real *work, integer *info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, vt_dim1, vt_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, m, i1, ic, lf, nd, ll, nl, nr, im1, ncc, nlf, nrf,
+ iwk, lvl, ndb1, nlp1, nrp1;
+ static real beta;
+ static integer idxq, nlvl;
+ static real alpha;
+ static integer inode, ndiml, idxqc, ndimr, itemp, sqrei;
+ extern /* Subroutine */ int slasd1_(integer *, integer *, integer *, real
+ *, real *, real *, real *, integer *, real *, integer *, integer *
+ , integer *, real *, integer *), xerbla_(char *, integer *), slasdq_(char *, integer *, integer *, integer *, integer
+ *, integer *, real *, real *, real *, integer *, real *, integer *
+ , real *, integer *, real *, integer *), slasdt_(integer *
+ , integer *, integer *, integer *, integer *, integer *, integer *
+ );
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ Using a divide and conquer approach, SLASD0 computes the singular
+ value decomposition (SVD) of a real upper bidiagonal N-by-M
+ matrix B with diagonal D and offdiagonal E, where M = N + SQRE.
+ The algorithm computes orthogonal matrices U and VT such that
+ B = U * S * VT. The singular values S are overwritten on D.
+
+ A related subroutine, SLASDA, computes only the singular values,
+ and optionally, the singular vectors in compact form.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ On entry, the row dimension of the upper bidiagonal matrix.
+ This is also the dimension of the main diagonal array D.
+
+ SQRE (input) INTEGER
+ Specifies the column dimension of the bidiagonal matrix.
+ = 0: The bidiagonal matrix has column dimension M = N;
+ = 1: The bidiagonal matrix has column dimension M = N+1;
+
+ D (input/output) REAL array, dimension (N)
+ On entry D contains the main diagonal of the bidiagonal
+ matrix.
+ On exit D, if INFO = 0, contains its singular values.
+
+ E (input) REAL array, dimension (M-1)
+ Contains the subdiagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+    U      (output) REAL array, dimension at least (LDU, N)
+ On exit, U contains the left singular vectors.
+
+ LDU (input) INTEGER
+ On entry, leading dimension of U.
+
+ VT (output) REAL array, dimension at least (LDVT, M)
+ On exit, VT' contains the right singular vectors.
+
+ LDVT (input) INTEGER
+ On entry, leading dimension of VT.
+
+ SMLSIZ (input) INTEGER
+ On entry, maximum size of the subproblems at the
+ bottom of the computation tree.
+
+ IWORK (workspace) INTEGER array, dimension (8*N)
+
+ WORK (workspace) REAL array, dimension (3*M**2+2*M)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --iwork;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -1;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -2;
+ }
+
+ m = *n + *sqre;
+
+ if (*ldu < *n) {
+ *info = -6;
+ } else if (*ldvt < m) {
+ *info = -8;
+ } else if (*smlsiz < 3) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD0", &i__1);
+ return 0;
+ }
+
+/* If the input matrix is too small, call SLASDQ to find the SVD. */
+
+ if (*n <= *smlsiz) {
+ slasdq_("U", sqre, n, &m, n, &c__0, &d__[1], &e[1], &vt[vt_offset],
+ ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[1], info);
+ return 0;
+ }
+
+/* Set up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+ idxq = ndimr + *n;
+ iwk = idxq + *n;
+ slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ For the nodes on bottom level of the tree, solve
+ their subproblems by SLASDQ.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ ncc = 0;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nlp1 = nl + 1;
+ nr = iwork[ndimr + i1];
+ nrp1 = nr + 1;
+ nlf = ic - nl;
+ nrf = ic + 1;
+ sqrei = 1;
+ slasdq_("U", &sqrei, &nl, &nlp1, &nl, &ncc, &d__[nlf], &e[nlf], &vt[
+ nlf + nlf * vt_dim1], ldvt, &u[nlf + nlf * u_dim1], ldu, &u[
+ nlf + nlf * u_dim1], ldu, &work[1], info);
+ if (*info != 0) {
+ return 0;
+ }
+ itemp = idxq + nlf - 2;
+ i__2 = nl;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[itemp + j] = j;
+/* L10: */
+ }
+ if (i__ == nd) {
+ sqrei = *sqre;
+ } else {
+ sqrei = 1;
+ }
+ nrp1 = nr + sqrei;
+ slasdq_("U", &sqrei, &nr, &nrp1, &nr, &ncc, &d__[nrf], &e[nrf], &vt[
+ nrf + nrf * vt_dim1], ldvt, &u[nrf + nrf * u_dim1], ldu, &u[
+ nrf + nrf * u_dim1], ldu, &work[1], info);
+ if (*info != 0) {
+ return 0;
+ }
+ itemp = idxq + ic;
+ i__2 = nr;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[itemp + j - 1] = j;
+/* L20: */
+ }
+/* L30: */
+ }
+
+/* Now conquer each subproblem bottom-up. */
+
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+
+/*
+ Find the first node LF and last node LL on the
+ current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ if (*sqre == 0 && i__ == ll) {
+ sqrei = *sqre;
+ } else {
+ sqrei = 1;
+ }
+ idxqc = idxq + nlf - 1;
+ alpha = d__[ic];
+ beta = e[ic];
+ slasd1_(&nl, &nr, &sqrei, &d__[nlf], &alpha, &beta, &u[nlf + nlf *
+ u_dim1], ldu, &vt[nlf + nlf * vt_dim1], ldvt, &iwork[
+ idxqc], &iwork[iwk], &work[1], info);
+ if (*info != 0) {
+ return 0;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+
+ return 0;
+
+/* End of SLASD0 */
+
+} /* slasd0_ */
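+
+/*
+   Editorial sketch (not part of the upstream LAPACK/f2c sources): a
+   hypothetical driver fragment showing how the slasd0_ workspaces would
+   be sized from the argument description above (IWORK of length 8*N,
+   WORK of length 3*M*M + 2*M, LDU >= N, LDVT >= M, SMLSIZ >= 3).  The
+   f2c types come from f2c.h, stdlib.h is assumed for malloc, and filling
+   D and E with actual bidiagonal data is left out.
+
+       integer n = 100, sqre = 0, smlsiz = 25, info = 0;
+       integer m = n + sqre, ldu = n, ldvt = m;
+       real *d = malloc(n * sizeof(real));
+       real *e = malloc((m - 1) * sizeof(real));
+       real *u = malloc(ldu * n * sizeof(real));
+       real *vt = malloc(ldvt * m * sizeof(real));
+       real *work = malloc((3 * m * m + 2 * m) * sizeof(real));
+       integer *iwork = malloc(8 * n * sizeof(integer));
+
+       ... fill d[0..n-1] and e[0..m-2] with the bidiagonal matrix ...
+
+       slasd0_(&n, &sqre, d, e, u, &ldu, vt, &ldvt, &smlsiz, iwork,
+               work, &info);
+
+   On success INFO == 0, D holds the singular values, U the left and
+   VT (transposed) the right singular vectors.
+*/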
+
+/* Subroutine */ int slasd1_(integer *nl, integer *nr, integer *sqre, real *
+ d__, real *alpha, real *beta, real *u, integer *ldu, real *vt,
+ integer *ldvt, integer *idxq, integer *iwork, real *work, integer *
+ info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, vt_dim1, vt_offset, i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, k, m, n, n1, n2, iq, iz, iu2, ldq, idx, ldu2, ivt2,
+ idxc, idxp, ldvt2;
+ extern /* Subroutine */ int slasd2_(integer *, integer *, integer *,
+ integer *, real *, real *, real *, real *, real *, integer *,
+ real *, integer *, real *, real *, integer *, real *, integer *,
+ integer *, integer *, integer *, integer *, integer *, integer *),
+ slasd3_(integer *, integer *, integer *, integer *, real *, real
+ *, integer *, real *, real *, integer *, real *, integer *, real *
+ , integer *, real *, integer *, integer *, integer *, real *,
+ integer *);
+ static integer isigma;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slascl_(
+ char *, integer *, integer *, real *, real *, integer *, integer *
+ , real *, integer *, integer *), slamrg_(integer *,
+ integer *, real *, integer *, integer *, integer *);
+ static real orgnrm;
+ static integer coltyp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLASD1 computes the SVD of an upper bidiagonal N-by-M matrix B,
+ where N = NL + NR + 1 and M = N + SQRE. SLASD1 is called from SLASD0.
+
+ A related subroutine SLASD7 handles the case in which the singular
+ values (and the singular vectors in factored form) are desired.
+
+ SLASD1 computes the SVD as follows:
+
+ ( D1(in) 0 0 0 )
+ B = U(in) * ( Z1' a Z2' b ) * VT(in)
+ ( 0 0 D2(in) 0 )
+
+ = U(out) * ( D(out) 0) * VT(out)
+
+ where Z' = (Z1' a Z2' b) = u' VT', and u is a vector of dimension M
+    with ALPHA and BETA in the NL+1-th and NL+2-th entries and zeros
+ elsewhere; and the entry b is empty if SQRE = 0.
+
+ The left singular vectors of the original matrix are stored in U, and
+ the transpose of the right singular vectors are stored in VT, and the
+ singular values are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple singular values or when there are zeros in
+    the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine SLASD2.
+
+ The second stage consists of calculating the updated
+ singular values. This is done by finding the square roots of the
+ roots of the secular equation via the routine SLASD4 (as called
+ by SLASD3). This routine also calculates the singular vectors of
+ the current problem.
+
+ The final stage consists of computing the updated singular vectors
+ directly using the updated singular values. The singular vectors
+ for the current problem are multiplied with the singular vectors
+ from the overall problem.
+
+ Arguments
+ =========
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ D (input/output) REAL array, dimension (NL+NR+1).
+ N = NL+NR+1
+           On entry D(1:NL) contains the singular values of the
+ upper block; and D(NL+2:N) contains the singular values of
+ the lower block. On exit D(1:N) contains the singular values
+ of the modified matrix.
+
+ ALPHA (input/output) REAL
+ Contains the diagonal element associated with the added row.
+
+ BETA (input/output) REAL
+ Contains the off-diagonal element associated with the added
+ row.
+
+ U (input/output) REAL array, dimension (LDU,N)
+ On entry U(1:NL, 1:NL) contains the left singular vectors of
+ the upper block; U(NL+2:N, NL+2:N) contains the left singular
+ vectors of the lower block. On exit U contains the left
+ singular vectors of the bidiagonal matrix.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= max( 1, N ).
+
+ VT (input/output) REAL array, dimension (LDVT,M)
+ where M = N + SQRE.
+ On entry VT(1:NL+1, 1:NL+1)' contains the right singular
+ vectors of the upper block; VT(NL+2:M, NL+2:M)' contains
+ the right singular vectors of the lower block. On exit
+ VT' contains the right singular vectors of the
+ bidiagonal matrix.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= max( 1, M ).
+
+ IDXQ (output) INTEGER array, dimension (N)
+ This contains the permutation which will reintegrate the
+ subproblem just solved back into sorted order, i.e.
+ D( IDXQ( I = 1, N ) ) will be in ascending order.
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ WORK (workspace) REAL array, dimension (3*M**2+2*M)
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --idxq;
+ --iwork;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*nl < 1) {
+ *info = -1;
+ } else if (*nr < 1) {
+ *info = -2;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -3;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD1", &i__1);
+ return 0;
+ }
+
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in SLASD2 and SLASD3.
+*/
+
+ ldu2 = n;
+ ldvt2 = m;
+
+ iz = 1;
+ isigma = iz + m;
+ iu2 = isigma + n;
+ ivt2 = iu2 + ldu2 * n;
+ iq = ivt2 + ldvt2 * m;
+
+ idx = 1;
+ idxc = idx + n;
+ coltyp = idxc + n;
+ idxp = coltyp + n;
+
+/*
+ Scale.
+
+ Computing MAX
+*/
+ r__1 = dabs(*alpha), r__2 = dabs(*beta);
+ orgnrm = dmax(r__1,r__2);
+ d__[*nl + 1] = 0.f;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = d__[i__], dabs(r__1)) > orgnrm) {
+ orgnrm = (r__1 = d__[i__], dabs(r__1));
+ }
+/* L10: */
+ }
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &n, &c__1, &d__[1], &n, info);
+ *alpha /= orgnrm;
+ *beta /= orgnrm;
+
+/* Deflate singular values. */
+
+ slasd2_(nl, nr, sqre, &k, &d__[1], &work[iz], alpha, beta, &u[u_offset],
+ ldu, &vt[vt_offset], ldvt, &work[isigma], &work[iu2], &ldu2, &
+ work[ivt2], &ldvt2, &iwork[idxp], &iwork[idx], &iwork[idxc], &
+ idxq[1], &iwork[coltyp], info);
+
+/* Solve Secular Equation and update singular vectors. */
+
+ ldq = k;
+ slasd3_(nl, nr, sqre, &k, &d__[1], &work[iq], &ldq, &work[isigma], &u[
+ u_offset], ldu, &work[iu2], &ldu2, &vt[vt_offset], ldvt, &work[
+ ivt2], &ldvt2, &iwork[idxc], &iwork[coltyp], &work[iz], info);
+ if (*info != 0) {
+ return 0;
+ }
+
+/* Unscale. */
+
+ slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, &n, &c__1, &d__[1], &n, info);
+
+/* Prepare the IDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = n - k;
+ slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]);
+
+ return 0;
+
+/* End of SLASD1 */
+
+} /* slasd1_ */
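+
+/*
+   Editorial note (not part of the upstream LAPACK/f2c sources): the
+   bookkeeping pointers set up in slasd1_ above split its two workspaces
+   as follows (1-based offsets, N = NL + NR + 1, M = N + SQRE, K <= N):
+
+       WORK:   Z(M) | DSIGMA(N) | U2(N*N) | VT2(M*M) | Q(K*K)
+               IZ=1   ISIGMA      IU2       IVT2       IQ
+
+       IWORK:  IDX(N) | IDXC(N) | COLTYP(N) | IDXP(N)
+               IDX=1    IDXC      COLTYP      IDXP
+
+   Summing the pieces and bounding N and K by M gives the workspace
+   sizes quoted in the header: at most 3*M**2 + 2*M reals and exactly
+   4*N integers.
+*/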
+
+/* Subroutine */ int slasd2_(integer *nl, integer *nr, integer *sqre, integer
+ *k, real *d__, real *z__, real *alpha, real *beta, real *u, integer *
+ ldu, real *vt, integer *ldvt, real *dsigma, real *u2, integer *ldu2,
+ real *vt2, integer *ldvt2, integer *idxp, integer *idx, integer *idxc,
+ integer *idxq, integer *coltyp, integer *info)
+{
+ /* System generated locals */
+ integer u_dim1, u_offset, u2_dim1, u2_offset, vt_dim1, vt_offset,
+ vt2_dim1, vt2_offset, i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real c__;
+ static integer i__, j, m, n;
+ static real s;
+ static integer k2;
+ static real z1;
+ static integer ct, jp;
+ static real eps, tau, tol;
+ static integer psm[4], nlp1, nlp2, idxi, idxj, ctot[4];
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *);
+ static integer idxjp, jprev;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), slamrg_(
+ integer *, integer *, real *, integer *, integer *, integer *);
+ static real hlftol;
+ extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *,
+ integer *, real *, integer *), slaset_(char *, integer *,
+ integer *, real *, real *, real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASD2 merges the two sets of singular values together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ singular values are close together or if there is a tiny entry in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ SLASD2 is called from SLASD1.
+
+ Arguments
+ =========
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ K (output) INTEGER
+ Contains the dimension of the non-deflated matrix,
+           This is the order of the related secular equation. 1 <= K <= N.
+
+ D (input/output) REAL array, dimension (N)
+ On entry D contains the singular values of the two submatrices
+ to be combined. On exit D contains the trailing (N-K) updated
+ singular values (those which were deflated) sorted into
+ increasing order.
+
+ Z (output) REAL array, dimension (N)
+ On exit Z contains the updating row vector in the secular
+ equation.
+
+ ALPHA (input) REAL
+ Contains the diagonal element associated with the added row.
+
+ BETA (input) REAL
+ Contains the off-diagonal element associated with the added
+ row.
+
+ U (input/output) REAL array, dimension (LDU,N)
+ On entry U contains the left singular vectors of two
+ submatrices in the two square blocks with corners at (1,1),
+ (NL, NL), and (NL+2, NL+2), (N,N).
+ On exit U contains the trailing (N-K) updated left singular
+ vectors (those which were deflated) in its last N-K columns.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= N.
+
+ VT (input/output) REAL array, dimension (LDVT,M)
+ On entry VT' contains the right singular vectors of two
+ submatrices in the two square blocks with corners at (1,1),
+ (NL+1, NL+1), and (NL+2, NL+2), (M,M).
+ On exit VT' contains the trailing (N-K) updated right singular
+ vectors (those which were deflated) in its last N-K columns.
+           In case SQRE = 1, the last row of VT spans the right null
+ space.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= M.
+
+ DSIGMA (output) REAL array, dimension (N)
+ Contains a copy of the diagonal elements (K-1 singular values
+ and one zero) in the secular equation.
+
+ U2 (output) REAL array, dimension (LDU2,N)
+ Contains a copy of the first K-1 left singular vectors which
+ will be used by SLASD3 in a matrix multiply (SGEMM) to solve
+ for the new left singular vectors. U2 is arranged into four
+ blocks. The first block contains a column with 1 at NL+1 and
+ zero everywhere else; the second block contains non-zero
+ entries only at and above NL; the third contains non-zero
+ entries only below NL+1; and the fourth is dense.
+
+ LDU2 (input) INTEGER
+ The leading dimension of the array U2. LDU2 >= N.
+
+ VT2 (output) REAL array, dimension (LDVT2,N)
+ VT2' contains a copy of the first K right singular vectors
+ which will be used by SLASD3 in a matrix multiply (SGEMM) to
+ solve for the new right singular vectors. VT2 is arranged into
+ three blocks. The first block contains a row that corresponds
+ to the special 0 diagonal element in SIGMA; the second block
+ contains non-zeros only at and before NL +1; the third block
+ contains non-zeros only at and after NL +2.
+
+ LDVT2 (input) INTEGER
+ The leading dimension of the array VT2. LDVT2 >= M.
+
+ IDXP (workspace) INTEGER array, dimension (N)
+ This will contain the permutation used to place deflated
+ values of D at the end of the array. On output IDXP(2:K)
+ points to the nondeflated D-values and IDXP(K+1:N)
+ points to the deflated singular values.
+
+ IDX (workspace) INTEGER array, dimension (N)
+ This will contain the permutation used to sort the contents of
+ D into ascending order.
+
+ IDXC (output) INTEGER array, dimension (N)
+ This will contain the permutation used to arrange the columns
+ of the deflated U matrix into three groups: the first group
+ contains non-zero entries only at and above NL, the second
+ contains non-zero entries only below NL+2, and the third is
+ dense.
+
+ IDXQ (input/output) INTEGER array, dimension (N)
+ This contains the permutation which separately sorts the two
+ sub-problems in D into ascending order. Note that entries in
+           the first half of this permutation must first be moved one
+ position backward; and entries in the second half
+ must first have NL+1 added to their values.
+
+ COLTYP (workspace/output) INTEGER array, dimension (N)
+ As workspace, this will contain a label which will indicate
+ which of the following types a column in the U2 matrix or a
+ row in the VT2 matrix is:
+ 1 : non-zero in the upper half only
+ 2 : non-zero in the lower half only
+ 3 : dense
+ 4 : deflated
+
+ On exit, it is an array of dimension 4, with COLTYP(I) being
+ the dimension of the I-th type columns.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --z__;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --dsigma;
+ u2_dim1 = *ldu2;
+ u2_offset = 1 + u2_dim1;
+ u2 -= u2_offset;
+ vt2_dim1 = *ldvt2;
+ vt2_offset = 1 + vt2_dim1;
+ vt2 -= vt2_offset;
+ --idxp;
+ --idx;
+ --idxc;
+ --idxq;
+ --coltyp;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*nl < 1) {
+ *info = -1;
+ } else if (*nr < 1) {
+ *info = -2;
+ } else if (*sqre != 1 && *sqre != 0) {
+ *info = -3;
+ }
+
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+ if (*ldu < n) {
+ *info = -10;
+ } else if (*ldvt < m) {
+ *info = -12;
+ } else if (*ldu2 < n) {
+ *info = -15;
+ } else if (*ldvt2 < m) {
+ *info = -17;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD2", &i__1);
+ return 0;
+ }
+
+ nlp1 = *nl + 1;
+ nlp2 = *nl + 2;
+
+/*
+ Generate the first part of the vector Z; and move the singular
+ values in the first part of D one position backward.
+*/
+
+ z1 = *alpha * vt[nlp1 + nlp1 * vt_dim1];
+ z__[1] = z1;
+ for (i__ = *nl; i__ >= 1; --i__) {
+ z__[i__ + 1] = *alpha * vt[i__ + nlp1 * vt_dim1];
+ d__[i__ + 1] = d__[i__];
+ idxq[i__ + 1] = idxq[i__] + 1;
+/* L10: */
+ }
+
+/* Generate the second part of the vector Z. */
+
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ z__[i__] = *beta * vt[i__ + nlp2 * vt_dim1];
+/* L20: */
+ }
+
+/* Initialize some reference arrays. */
+
+ i__1 = nlp1;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ coltyp[i__] = 1;
+/* L30: */
+ }
+ i__1 = n;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ coltyp[i__] = 2;
+/* L40: */
+ }
+
+/* Sort the singular values into increasing order */
+
+ i__1 = n;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ idxq[i__] += nlp1;
+/* L50: */
+ }
+
+/*
+    DSIGMA, IDXC, and the first column of U2
+ are used as storage space.
+*/
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ dsigma[i__] = d__[idxq[i__]];
+ u2[i__ + u2_dim1] = z__[idxq[i__]];
+ idxc[i__] = coltyp[idxq[i__]];
+/* L60: */
+ }
+
+ slamrg_(nl, nr, &dsigma[2], &c__1, &c__1, &idx[2]);
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ idxi = idx[i__] + 1;
+ d__[i__] = dsigma[idxi];
+ z__[i__] = u2[idxi + u2_dim1];
+ coltyp[i__] = idxc[idxi];
+/* L70: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ eps = slamch_("Epsilon");
+/* Computing MAX */
+ r__1 = dabs(*alpha), r__2 = dabs(*beta);
+ tol = dmax(r__1,r__2);
+/* Computing MAX */
+ r__2 = (r__1 = d__[n], dabs(r__1));
+ tol = eps * 8.f * dmax(r__2,tol);
+
+/*
+ There are 2 kinds of deflation -- first a value in the z-vector
+ is small, second two (or more) singular values are very close
+ together (their difference is small).
+
+ If the value in the z-vector is small, we simply permute the
+ array so that the corresponding singular value is moved to the
+ end.
+
+ If two values in the D-vector are close, we perform a two-sided
+ rotation designed to make one of the corresponding z-vector
+ entries zero, and then permute the array so that the deflated
+ singular value is moved to the end.
+
+ If there are multiple singular values then the problem deflates.
+    Here the number of equal singular values is found. As each equal
+ singular value is found, an elementary reflector is computed to
+ rotate the corresponding singular subspace so that the
+ corresponding components of Z are zero in this new basis.
+*/
+
+ *k = 1;
+ k2 = n + 1;
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ if ((r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ coltyp[j] = 4;
+ if (j == n) {
+ goto L120;
+ }
+ } else {
+ jprev = j;
+ goto L90;
+ }
+/* L80: */
+ }
+L90:
+ j = jprev;
+L100:
+ ++j;
+ if (j > n) {
+ goto L110;
+ }
+ if ((r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ coltyp[j] = 4;
+ } else {
+
+/* Check if singular values are close enough to allow deflation. */
+
+ if ((r__1 = d__[j] - d__[jprev], dabs(r__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ s = z__[jprev];
+ c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = slapy2_(&c__, &s);
+ c__ /= tau;
+ s = -s / tau;
+ z__[j] = tau;
+ z__[jprev] = 0.f;
+
+/*
+ Apply back the Givens rotation to the left and right
+ singular vector matrices.
+*/
+
+ idxjp = idxq[idx[jprev] + 1];
+ idxj = idxq[idx[j] + 1];
+ if (idxjp <= nlp1) {
+ --idxjp;
+ }
+ if (idxj <= nlp1) {
+ --idxj;
+ }
+ srot_(&n, &u[idxjp * u_dim1 + 1], &c__1, &u[idxj * u_dim1 + 1], &
+ c__1, &c__, &s);
+ srot_(&m, &vt[idxjp + vt_dim1], ldvt, &vt[idxj + vt_dim1], ldvt, &
+ c__, &s);
+ if (coltyp[j] != coltyp[jprev]) {
+ coltyp[j] = 3;
+ }
+ coltyp[jprev] = 4;
+ --k2;
+ idxp[k2] = jprev;
+ jprev = j;
+ } else {
+ ++(*k);
+ u2[*k + u2_dim1] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+ jprev = j;
+ }
+ }
+ goto L100;
+L110:
+
+/* Record the last singular value. */
+
+ ++(*k);
+ u2[*k + u2_dim1] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+
+L120:
+
+/*
+ Count up the total number of the various types of columns, then
+ form a permutation which positions the four column types into
+ four groups of uniform structure (although one or more of these
+ groups may be empty).
+*/
+
+ for (j = 1; j <= 4; ++j) {
+ ctot[j - 1] = 0;
+/* L130: */
+ }
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ ct = coltyp[j];
+ ++ctot[ct - 1];
+/* L140: */
+ }
+
+/* PSM(*) = Position in SubMatrix (of types 1 through 4) */
+
+ psm[0] = 2;
+ psm[1] = ctot[0] + 2;
+ psm[2] = psm[1] + ctot[1];
+ psm[3] = psm[2] + ctot[2];
+
+/*
+ Fill out the IDXC array so that the permutation which it induces
+ will place all type-1 columns first, all type-2 columns next,
+ then all type-3's, and finally all type-4's, starting from the
+ second column. This applies similarly to the rows of VT.
+*/
+
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ ct = coltyp[jp];
+ idxc[psm[ct - 1]] = j;
+ ++psm[ct - 1];
+/* L150: */
+ }
+
+/*
+ Sort the singular values and corresponding singular vectors into
+ DSIGMA, U2, and VT2 respectively. The singular values/vectors
+ which were not deflated go into the first K slots of DSIGMA, U2,
+ and VT2 respectively, while those which were deflated go into the
+ last N - K slots, except that the first column/row will be treated
+ separately.
+*/
+
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ dsigma[j] = d__[jp];
+ idxj = idxq[idx[idxp[idxc[j]]] + 1];
+ if (idxj <= nlp1) {
+ --idxj;
+ }
+ scopy_(&n, &u[idxj * u_dim1 + 1], &c__1, &u2[j * u2_dim1 + 1], &c__1);
+ scopy_(&m, &vt[idxj + vt_dim1], ldvt, &vt2[j + vt2_dim1], ldvt2);
+/* L160: */
+ }
+
+/* Determine DSIGMA(1), DSIGMA(2) and Z(1) */
+
+ dsigma[1] = 0.f;
+ hlftol = tol / 2.f;
+ if (dabs(dsigma[2]) <= hlftol) {
+ dsigma[2] = hlftol;
+ }
+ if (m > n) {
+ z__[1] = slapy2_(&z1, &z__[m]);
+ if (z__[1] <= tol) {
+ c__ = 1.f;
+ s = 0.f;
+ z__[1] = tol;
+ } else {
+ c__ = z1 / z__[1];
+ s = z__[m] / z__[1];
+ }
+ } else {
+ if (dabs(z1) <= tol) {
+ z__[1] = tol;
+ } else {
+ z__[1] = z1;
+ }
+ }
+
+/* Move the rest of the updating row to Z. */
+
+ i__1 = *k - 1;
+ scopy_(&i__1, &u2[u2_dim1 + 2], &c__1, &z__[2], &c__1);
+
+/*
+ Determine the first column of U2, the first row of VT2 and the
+ last row of VT.
+*/
+
+ slaset_("A", &n, &c__1, &c_b29, &c_b29, &u2[u2_offset], ldu2);
+ u2[nlp1 + u2_dim1] = 1.f;
+ if (m > n) {
+ i__1 = nlp1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ vt[m + i__ * vt_dim1] = -s * vt[nlp1 + i__ * vt_dim1];
+ vt2[i__ * vt2_dim1 + 1] = c__ * vt[nlp1 + i__ * vt_dim1];
+/* L170: */
+ }
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ vt2[i__ * vt2_dim1 + 1] = s * vt[m + i__ * vt_dim1];
+ vt[m + i__ * vt_dim1] = c__ * vt[m + i__ * vt_dim1];
+/* L180: */
+ }
+ } else {
+ scopy_(&m, &vt[nlp1 + vt_dim1], ldvt, &vt2[vt2_dim1 + 1], ldvt2);
+ }
+ if (m > n) {
+ scopy_(&m, &vt[m + vt_dim1], ldvt, &vt2[m + vt2_dim1], ldvt2);
+ }
+
+/*
+ The deflated singular values and their corresponding vectors go
+ into the back of D, U, and V respectively.
+*/
+
+ if (n > *k) {
+ i__1 = n - *k;
+ scopy_(&i__1, &dsigma[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ i__1 = n - *k;
+ slacpy_("A", &n, &i__1, &u2[(*k + 1) * u2_dim1 + 1], ldu2, &u[(*k + 1)
+ * u_dim1 + 1], ldu);
+ i__1 = n - *k;
+ slacpy_("A", &i__1, &m, &vt2[*k + 1 + vt2_dim1], ldvt2, &vt[*k + 1 +
+ vt_dim1], ldvt);
+ }
+
+/* Copy CTOT into COLTYP for referencing in SLASD3. */
+
+ for (j = 1; j <= 4; ++j) {
+ coltyp[j] = ctot[j - 1];
+/* L190: */
+ }
+
+ return 0;
+
+/* End of SLASD2 */
+
+} /* slasd2_ */
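+
+/*
+   Editorial sketch (not part of the upstream LAPACK/f2c sources): when
+   two singular values in slasd2_ above are within TOL of each other,
+   the deflation reduces to the following stand-alone arithmetic
+   (a hypothetical rewrite of the rotation applied to the z-vector;
+   slapy2_ returns sqrt(a*a + b*b) without intermediate overflow):
+
+       real s = z[jprev];                  -- z-entry to be zeroed out
+       real c = z[j];                      -- z-entry that absorbs it
+       real tau = slapy2_(&c, &s);         -- length of the 2-vector (c, s)
+       c = c / tau;
+       s = -s / tau;
+       z[j] = tau;                         -- combined weight
+       z[jprev] = 0.f;                     -- deflated entry
+
+   The same (c, s) pair is then applied as a Givens rotation to the
+   corresponding columns of U and rows of VT via srot_, after which the
+   deflated singular value can be moved to the tail of the problem.
+*/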
+
+/* Subroutine */ int slasd3_(integer *nl, integer *nr, integer *sqre, integer
+ *k, real *d__, real *q, integer *ldq, real *dsigma, real *u, integer *
+ ldu, real *u2, integer *ldu2, real *vt, integer *ldvt, real *vt2,
+ integer *ldvt2, integer *idxc, integer *ctot, real *z__, integer *
+ info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, u_dim1, u_offset, u2_dim1, u2_offset, vt_dim1,
+ vt_offset, vt2_dim1, vt2_offset, i__1, i__2;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, j, m, n, jc;
+ static real rho;
+ static integer nlp1, nlp2, nrp1;
+ static real temp;
+ extern doublereal snrm2_(integer *, real *, integer *);
+ static integer ctemp;
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer ktemp;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int slasd4_(integer *, integer *, real *, real *,
+ real *, real *, real *, real *, integer *), xerbla_(char *,
+ integer *), slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *,
+ real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLASD3 finds all the square roots of the roots of the secular
+ equation, as defined by the values in D and Z. It makes the
+ appropriate calls to SLASD4 and then updates the singular
+ vectors by matrix multiplication.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ SLASD3 is called from SLASD1.
+
+ Arguments
+ =========
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ K (input) INTEGER
+          The size of the secular equation, 1 <= K <= N.
+
+ D (output) REAL array, dimension(K)
+ On exit the square roots of the roots of the secular equation,
+ in ascending order.
+
+ Q (workspace) REAL array,
+ dimension at least (LDQ,K).
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= K.
+
+ DSIGMA (input/output) REAL array, dimension(K)
+ The first K elements of this array contain the old roots
+ of the deflated updating problem. These are the poles
+ of the secular equation.
+
+ U (output) REAL array, dimension (LDU, N)
+ The last N - K columns of this matrix contain the deflated
+ left singular vectors.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= N.
+
+ U2 (input) REAL array, dimension (LDU2, N)
+ The first K columns of this matrix contain the non-deflated
+ left singular vectors for the split problem.
+
+ LDU2 (input) INTEGER
+ The leading dimension of the array U2. LDU2 >= N.
+
+ VT (output) REAL array, dimension (LDVT, M)
+ The last M - K columns of VT' contain the deflated
+ right singular vectors.
+
+ LDVT (input) INTEGER
+          The leading dimension of the array VT.  LDVT >= M.
+
+ VT2 (input/output) REAL array, dimension (LDVT2, N)
+ The first K columns of VT2' contain the non-deflated
+ right singular vectors for the split problem.
+
+ LDVT2 (input) INTEGER
+          The leading dimension of the array VT2.  LDVT2 >= M.
+
+ IDXC (input) INTEGER array, dimension (N)
+ The permutation used to arrange the columns of U (and rows of
+ VT) into three groups: the first group contains non-zero
+ entries only at and above (or before) NL +1; the second
+ contains non-zero entries only at and below (or after) NL+2;
+ and the third is dense. The first column of U and the row of
+ VT are treated separately, however.
+
+ The rows of the singular vectors found by SLASD4
+ must be likewise permuted before the matrix multiplies can
+ take place.
+
+ CTOT (input) INTEGER array, dimension (4)
+ A count of the total number of the various types of columns
+ in U (or rows in VT), as described in IDXC. The fourth column
+ type is any column which has been deflated.
+
+ Z (input/output) REAL array, dimension (K)
+ The first K elements of this array contain the components
+ of the deflation-adjusted updating row vector.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --dsigma;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ u2_dim1 = *ldu2;
+ u2_offset = 1 + u2_dim1;
+ u2 -= u2_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ vt2_dim1 = *ldvt2;
+ vt2_offset = 1 + vt2_dim1;
+ vt2 -= vt2_offset;
+ --idxc;
+ --ctot;
+ --z__;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*nl < 1) {
+ *info = -1;
+ } else if (*nr < 1) {
+ *info = -2;
+ } else if (*sqre != 1 && *sqre != 0) {
+ *info = -3;
+ }
+
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+ nlp1 = *nl + 1;
+ nlp2 = *nl + 2;
+
+ if (*k < 1 || *k > n) {
+ *info = -4;
+ } else if (*ldq < *k) {
+ *info = -7;
+ } else if (*ldu < n) {
+ *info = -10;
+ } else if (*ldu2 < n) {
+ *info = -12;
+ } else if (*ldvt < m) {
+ *info = -14;
+ } else if (*ldvt2 < m) {
+ *info = -16;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD3", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 1) {
+ d__[1] = dabs(z__[1]);
+ scopy_(&m, &vt2[vt2_dim1 + 1], ldvt2, &vt[vt_dim1 + 1], ldvt);
+ if (z__[1] > 0.f) {
+ scopy_(&n, &u2[u2_dim1 + 1], &c__1, &u[u_dim1 + 1], &c__1);
+ } else {
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ u[i__ + u_dim1] = -u2[i__ + u2_dim1];
+/* L10: */
+ }
+ }
+ return 0;
+ }
+
+/*
+ Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DSIGMA(I) if it is 1; this makes the subsequent
+ subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DSIGMA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DSIGMA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+ 2*DSIGMA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dsigma[i__] = slamc3_(&dsigma[i__], &dsigma[i__]) - dsigma[i__];
+/* L20: */
+ }
+
+/* Keep a copy of Z. */
+
+ scopy_(k, &z__[1], &c__1, &q[q_offset], &c__1);
+
+/* Normalize Z. */
+
+ rho = snrm2_(k, &z__[1], &c__1);
+ slascl_("G", &c__0, &c__0, &rho, &c_b15, k, &c__1, &z__[1], k, info);
+ rho *= rho;
+
+/* Find the new singular values. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ slasd4_(k, &j, &dsigma[1], &z__[1], &u[j * u_dim1 + 1], &rho, &d__[j],
+ &vt[j * vt_dim1 + 1], info);
+
+/* If the zero finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ return 0;
+ }
+/* L30: */
+ }
+
+/* Compute updated Z. */
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ z__[i__] = u[i__ + *k * u_dim1] * vt[i__ + *k * vt_dim1];
+ i__2 = i__ - 1;
+ for (j = 1; j <= i__2; ++j) {
+ z__[i__] *= u[i__ + j * u_dim1] * vt[i__ + j * vt_dim1] / (dsigma[
+ i__] - dsigma[j]) / (dsigma[i__] + dsigma[j]);
+/* L40: */
+ }
+ i__2 = *k - 1;
+ for (j = i__; j <= i__2; ++j) {
+ z__[i__] *= u[i__ + j * u_dim1] * vt[i__ + j * vt_dim1] / (dsigma[
+ i__] - dsigma[j + 1]) / (dsigma[i__] + dsigma[j + 1]);
+/* L50: */
+ }
+ r__2 = sqrt((r__1 = z__[i__], dabs(r__1)));
+ z__[i__] = r_sign(&r__2, &q[i__ + q_dim1]);
+/* L60: */
+ }
+
+/*
+ Compute left singular vectors of the modified diagonal matrix,
+ and store related information for the right singular vectors.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ vt[i__ * vt_dim1 + 1] = z__[1] / u[i__ * u_dim1 + 1] / vt[i__ *
+ vt_dim1 + 1];
+ u[i__ * u_dim1 + 1] = -1.f;
+ i__2 = *k;
+ for (j = 2; j <= i__2; ++j) {
+ vt[j + i__ * vt_dim1] = z__[j] / u[j + i__ * u_dim1] / vt[j + i__
+ * vt_dim1];
+ u[j + i__ * u_dim1] = dsigma[j] * vt[j + i__ * vt_dim1];
+/* L70: */
+ }
+ temp = snrm2_(k, &u[i__ * u_dim1 + 1], &c__1);
+ q[i__ * q_dim1 + 1] = u[i__ * u_dim1 + 1] / temp;
+ i__2 = *k;
+ for (j = 2; j <= i__2; ++j) {
+ jc = idxc[j];
+ q[j + i__ * q_dim1] = u[jc + i__ * u_dim1] / temp;
+/* L80: */
+ }
+/* L90: */
+ }
+
+/* Update the left singular vector matrix. */
+
+ if (*k == 2) {
+ sgemm_("N", "N", &n, k, k, &c_b15, &u2[u2_offset], ldu2, &q[q_offset],
+ ldq, &c_b29, &u[u_offset], ldu);
+ goto L100;
+ }
+ if (ctot[1] > 0) {
+ sgemm_("N", "N", nl, k, &ctot[1], &c_b15, &u2[(u2_dim1 << 1) + 1],
+ ldu2, &q[q_dim1 + 2], ldq, &c_b29, &u[u_dim1 + 1], ldu);
+ if (ctot[3] > 0) {
+ ktemp = ctot[1] + 2 + ctot[2];
+ sgemm_("N", "N", nl, k, &ctot[3], &c_b15, &u2[ktemp * u2_dim1 + 1]
+ , ldu2, &q[ktemp + q_dim1], ldq, &c_b15, &u[u_dim1 + 1],
+ ldu);
+ }
+ } else if (ctot[3] > 0) {
+ ktemp = ctot[1] + 2 + ctot[2];
+ sgemm_("N", "N", nl, k, &ctot[3], &c_b15, &u2[ktemp * u2_dim1 + 1],
+ ldu2, &q[ktemp + q_dim1], ldq, &c_b29, &u[u_dim1 + 1], ldu);
+ } else {
+ slacpy_("F", nl, k, &u2[u2_offset], ldu2, &u[u_offset], ldu);
+ }
+ scopy_(k, &q[q_dim1 + 1], ldq, &u[nlp1 + u_dim1], ldu);
+ ktemp = ctot[1] + 2;
+ ctemp = ctot[2] + ctot[3];
+ sgemm_("N", "N", nr, k, &ctemp, &c_b15, &u2[nlp2 + ktemp * u2_dim1], ldu2,
+ &q[ktemp + q_dim1], ldq, &c_b29, &u[nlp2 + u_dim1], ldu);
+
+/* Generate the right singular vectors. */
+
+L100:
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = snrm2_(k, &vt[i__ * vt_dim1 + 1], &c__1);
+ q[i__ + q_dim1] = vt[i__ * vt_dim1 + 1] / temp;
+ i__2 = *k;
+ for (j = 2; j <= i__2; ++j) {
+ jc = idxc[j];
+ q[i__ + j * q_dim1] = vt[jc + i__ * vt_dim1] / temp;
+/* L110: */
+ }
+/* L120: */
+ }
+
+/* Update the right singular vector matrix. */
+
+ if (*k == 2) {
+ sgemm_("N", "N", k, &m, k, &c_b15, &q[q_offset], ldq, &vt2[vt2_offset]
+ , ldvt2, &c_b29, &vt[vt_offset], ldvt);
+ return 0;
+ }
+ ktemp = ctot[1] + 1;
+ sgemm_("N", "N", k, &nlp1, &ktemp, &c_b15, &q[q_dim1 + 1], ldq, &vt2[
+ vt2_dim1 + 1], ldvt2, &c_b29, &vt[vt_dim1 + 1], ldvt);
+ ktemp = ctot[1] + 2 + ctot[2];
+ if (ktemp <= *ldvt2) {
+ sgemm_("N", "N", k, &nlp1, &ctot[3], &c_b15, &q[ktemp * q_dim1 + 1],
+ ldq, &vt2[ktemp + vt2_dim1], ldvt2, &c_b15, &vt[vt_dim1 + 1],
+ ldvt);
+ }
+
+ ktemp = ctot[1] + 1;
+ nrp1 = *nr + *sqre;
+ if (ktemp > 1) {
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ q[i__ + ktemp * q_dim1] = q[i__ + q_dim1];
+/* L130: */
+ }
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ vt2[ktemp + i__ * vt2_dim1] = vt2[i__ * vt2_dim1 + 1];
+/* L140: */
+ }
+ }
+ ctemp = ctot[2] + 1 + ctot[3];
+ sgemm_("N", "N", k, &nrp1, &ctemp, &c_b15, &q[ktemp * q_dim1 + 1], ldq, &
+ vt2[ktemp + nlp2 * vt2_dim1], ldvt2, &c_b29, &vt[nlp2 * vt_dim1 +
+ 1], ldvt);
+
+ return 0;
+
+/* End of SLASD3 */
+
+} /* slasd3_ */
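+
+/*
+   Editorial note (not part of the upstream LAPACK/f2c sources): the
+   guard-digit workaround in slasd3_ above amounts to the loop below,
+   repeated here as a stand-alone sketch.  slamc3_ simply returns the
+   sum of its two arguments, but going through a function call keeps an
+   optimizing compiler from folding (DSIGMA(I) + DSIGMA(I)) - DSIGMA(I)
+   back into DSIGMA(I):
+
+       for (i = 1; i <= k; ++i) {
+           dsigma[i] = slamc3_(&dsigma[i], &dsigma[i]) - dsigma[i];
+       }
+
+   On machines with a guard digit in add/subtract this leaves DSIGMA(I)
+   unchanged; on the historical Cray models without one it zeros the
+   bottommost bit, so the later differences DSIGMA(I) - DSIGMA(J) used
+   by the secular-equation solver stay accurate.
+*/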
+
+/* Subroutine */ int slasd4_(integer *n, integer *i__, real *d__, real *z__,
+ real *delta, real *rho, real *sigma, real *work, integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1;
+
+ /* Local variables */
+ static real a, b, c__;
+ static integer j;
+ static real w, dd[3];
+ static integer ii;
+ static real dw, zz[3];
+ static integer ip1;
+ static real eta, phi, eps, tau, psi;
+ static integer iim1, iip1;
+ static real dphi, dpsi;
+ static integer iter;
+ static real temp, prew, sg2lb, sg2ub, temp1, temp2, dtiim, delsq, dtiip;
+ static integer niter;
+ static real dtisq;
+ static logical swtch;
+ static real dtnsq;
+ extern /* Subroutine */ int slaed6_(integer *, logical *, real *, real *,
+ real *, real *, real *, integer *);
+ static real delsq2;
+ extern /* Subroutine */ int slasd5_(integer *, real *, real *, real *,
+ real *, real *, real *);
+ static real dtnsq1;
+ static logical swtch3;
+ extern doublereal slamch_(char *);
+ static logical orgati;
+ static real erretm, dtipsq, rhoinv;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the square root of the I-th updated
+ eigenvalue of a positive symmetric rank-one modification to
+ a positive diagonal matrix whose entries are given as the squares
+ of the corresponding entries in the array d, and that
+
+ 0 <= D(i) < D(j) for i < j
+
+ and that RHO > 0. This is arranged by the calling routine, and is
+ no loss in generality. The rank-one modified system is thus
+
+ diag( D ) * diag( D ) + RHO * Z * Z_transpose.
+
+ where we assume the Euclidean norm of Z is 1.
+
+ The method consists of approximating the rational functions in the
+ secular equation by simpler interpolating rational functions.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The length of all arrays.
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. 1 <= I <= N.
+
+ D (input) REAL array, dimension ( N )
+ The original eigenvalues. It is assumed that they are in
+ order, 0 <= D(I) < D(J) for I < J.
+
+ Z (input) REAL array, dimension (N)
+ The components of the updating vector.
+
+ DELTA (output) REAL array, dimension (N)
+ If N .ne. 1, DELTA contains (D(j) - sigma_I) in its j-th
+ component. If N = 1, then DELTA(1) = 1. The vector DELTA
+ contains the information necessary to construct the
+ (singular) eigenvectors.
+
+ RHO (input) REAL
+ The scalar in the symmetric updating formula.
+
+ SIGMA (output) REAL
+ The computed sigma_I, the I-th updated eigenvalue.
+
+ WORK (workspace) REAL array, dimension (N)
+ If N .ne. 1, WORK contains (D(j) + sigma_I) in its j-th
+ component. If N = 1, then WORK( 1 ) = 1.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ > 0: if INFO = 1, the updating process failed.
+
+ Internal Parameters
+ ===================
+
+ Logical variable ORGATI (origin-at-i?) is used for distinguishing
+ whether D(i) or D(i+1) is treated as the origin.
+
+ ORGATI = .true. origin at i
+ ORGATI = .false. origin at i+1
+
+ Logical variable SWTCH3 (switch-for-3-poles?) is for noting
+ if we are working with THREE poles!
+
+ MAXIT is the maximum number of iterations allowed for each
+ eigenvalue.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Since this routine is called in an inner loop, we do no argument
+ checking.
+
+ Quick return for N=1 and 2.
+*/
+
+ /* Parameter adjustments */
+ --work;
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ if (*n == 1) {
+
+/* Presumably, I=1 upon entry */
+
+ *sigma = sqrt(d__[1] * d__[1] + *rho * z__[1] * z__[1]);
+ delta[1] = 1.f;
+ work[1] = 1.f;
+ return 0;
+ }
+ if (*n == 2) {
+ slasd5_(i__, &d__[1], &z__[1], &delta[1], rho, sigma, &work[1]);
+ return 0;
+ }
+
+/* Compute machine epsilon */
+
+ eps = slamch_("Epsilon");
+ rhoinv = 1.f / *rho;
+
+/* The case I = N */
+
+ if (*i__ == *n) {
+
+/* Initialize some basic variables */
+
+ ii = *n - 1;
+ niter = 1;
+
+/* Calculate initial guess */
+
+ temp = *rho / 2.f;
+
+/*
+ If ||Z||_2 is not one, then TEMP should be set to
+ RHO * ||Z||_2^2 / TWO
+*/
+
+ temp1 = temp / (d__[*n] + sqrt(d__[*n] * d__[*n] + temp));
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[*n] + temp1;
+ delta[j] = d__[j] - d__[*n] - temp1;
+/* L10: */
+ }
+
+ psi = 0.f;
+ i__1 = *n - 2;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / (delta[j] * work[j]);
+/* L20: */
+ }
+
+ c__ = rhoinv + psi;
+ w = c__ + z__[ii] * z__[ii] / (delta[ii] * work[ii]) + z__[*n] * z__[*
+ n] / (delta[*n] * work[*n]);
+
+ if (w <= 0.f) {
+ temp1 = sqrt(d__[*n] * d__[*n] + *rho);
+ temp = z__[*n - 1] * z__[*n - 1] / ((d__[*n - 1] + temp1) * (d__[*
+ n] - d__[*n - 1] + *rho / (d__[*n] + temp1))) + z__[*n] *
+ z__[*n] / *rho;
+
+/*
+ The following TAU is to approximate
+ SIGMA_n^2 - D( N )*D( N )
+*/
+
+ if (c__ <= temp) {
+ tau = *rho;
+ } else {
+ delsq = (d__[*n] - d__[*n - 1]) * (d__[*n] + d__[*n - 1]);
+ a = -c__ * delsq + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*
+ n];
+ b = z__[*n] * z__[*n] * delsq;
+ if (a < 0.f) {
+ tau = b * 2.f / (sqrt(a * a + b * 4.f * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4.f * c__)) / (c__ * 2.f);
+ }
+ }
+
+/*
+ It can be proved that
+ D(N)^2+RHO/2 <= SIGMA_n^2 < D(N)^2+TAU <= D(N)^2+RHO
+*/
+
+ } else {
+ delsq = (d__[*n] - d__[*n - 1]) * (d__[*n] + d__[*n - 1]);
+ a = -c__ * delsq + z__[*n - 1] * z__[*n - 1] + z__[*n] * z__[*n];
+ b = z__[*n] * z__[*n] * delsq;
+
+/*
+ The following TAU is to approximate
+ SIGMA_n^2 - D( N )*D( N )
+*/
+
+ if (a < 0.f) {
+ tau = b * 2.f / (sqrt(a * a + b * 4.f * c__) - a);
+ } else {
+ tau = (a + sqrt(a * a + b * 4.f * c__)) / (c__ * 2.f);
+ }
+
+/*
+ It can be proved that
+ D(N)^2 < D(N)^2+TAU < SIGMA(N)^2 < D(N)^2+RHO/2
+*/
+
+ }
+
+/* The following ETA is to approximate SIGMA_n - D( N ) */
+
+ eta = tau / (d__[*n] + sqrt(d__[*n] * d__[*n] + tau));
+
+ *sigma = d__[*n] + eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] = d__[j] - d__[*i__] - eta;
+ work[j] = d__[j] + d__[*i__] + eta;
+/* L30: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (delta[j] * work[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L40: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / (delta[*n] * work[*n]);
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8.f + erretm - phi + rhoinv + dabs(tau) * (
+ dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ dtnsq1 = work[*n - 1] * delta[*n - 1];
+ dtnsq = work[*n] * delta[*n];
+ c__ = w - dtnsq1 * dpsi - dtnsq * dphi;
+ a = (dtnsq + dtnsq1) * w - dtnsq * dtnsq1 * (dpsi + dphi);
+ b = dtnsq * dtnsq1 * w;
+ if (c__ < 0.f) {
+ c__ = dabs(c__);
+ }
+ if (c__ == 0.f) {
+ eta = *rho - *sigma * *sigma;
+ } else if (a >= 0.f) {
+ eta = (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) / (
+ c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.f) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = eta - dtnsq;
+ if (temp > *rho) {
+ eta = *rho + dtnsq;
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(eta + *sigma * *sigma);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+ work[j] += eta;
+/* L50: */
+ }
+
+ *sigma += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L60: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / (work[*n] * delta[*n]);
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8.f + erretm - phi + rhoinv + dabs(tau) * (
+ dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+
+/* Main loop to update the values of the array DELTA */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 20; ++niter) {
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+/* Calculate the new step */
+
+ dtnsq1 = work[*n - 1] * delta[*n - 1];
+ dtnsq = work[*n] * delta[*n];
+ c__ = w - dtnsq1 * dpsi - dtnsq * dphi;
+ a = (dtnsq + dtnsq1) * w - dtnsq1 * dtnsq * (dpsi + dphi);
+ b = dtnsq1 * dtnsq * w;
+ if (a >= 0.f) {
+ eta = (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) /
+ (c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta > 0.f) {
+ eta = -w / (dpsi + dphi);
+ }
+ temp = eta - dtnsq;
+ if (temp <= 0.f) {
+ eta /= 2.f;
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(eta + *sigma * *sigma);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ delta[j] -= eta;
+ work[j] += eta;
+/* L70: */
+ }
+
+ *sigma += eta;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = ii;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L80: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ temp = z__[*n] / (work[*n] * delta[*n]);
+ phi = z__[*n] * temp;
+ dphi = temp * temp;
+ erretm = (-phi - psi) * 8.f + erretm - phi + rhoinv + dabs(tau) *
+ (dpsi + dphi);
+
+ w = rhoinv + phi + psi;
+/* L90: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+ goto L240;
+
+/* End for the case I = N */
+
+ } else {
+
+/* The case for I < N */
+
+ niter = 1;
+ ip1 = *i__ + 1;
+
+/* Calculate initial guess */
+
+ delsq = (d__[ip1] - d__[*i__]) * (d__[ip1] + d__[*i__]);
+ delsq2 = delsq / 2.f;
+ temp = delsq2 / (d__[*i__] + sqrt(d__[*i__] * d__[*i__] + delsq2));
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[*i__] + temp;
+ delta[j] = d__[j] - d__[*i__] - temp;
+/* L100: */
+ }
+
+ psi = 0.f;
+ i__1 = *i__ - 1;
+ for (j = 1; j <= i__1; ++j) {
+ psi += z__[j] * z__[j] / (work[j] * delta[j]);
+/* L110: */
+ }
+
+ phi = 0.f;
+ i__1 = *i__ + 2;
+ for (j = *n; j >= i__1; --j) {
+ phi += z__[j] * z__[j] / (work[j] * delta[j]);
+/* L120: */
+ }
+ c__ = rhoinv + psi + phi;
+ w = c__ + z__[*i__] * z__[*i__] / (work[*i__] * delta[*i__]) + z__[
+ ip1] * z__[ip1] / (work[ip1] * delta[ip1]);
+
+ if (w > 0.f) {
+
+/*
+ d(i)^2 < the ith sigma^2 < (d(i)^2+d(i+1)^2)/2
+
+ We choose d(i) as origin.
+*/
+
+ orgati = TRUE_;
+ sg2lb = 0.f;
+ sg2ub = delsq2;
+ a = c__ * delsq + z__[*i__] * z__[*i__] + z__[ip1] * z__[ip1];
+ b = z__[*i__] * z__[*i__] * delsq;
+ if (a > 0.f) {
+ tau = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ } else {
+ tau = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) /
+ (c__ * 2.f);
+ }
+
+/*
+ TAU now is an estimation of SIGMA^2 - D( I )^2. The
+ following, however, is the corresponding estimation of
+ SIGMA - D( I ).
+*/
+
+ eta = tau / (d__[*i__] + sqrt(d__[*i__] * d__[*i__] + tau));
+ } else {
+
+/*
+             (d(i)^2+d(i+1)^2)/2 <= the ith sigma^2 < d(i+1)^2
+
+ We choose d(i+1) as origin.
+*/
+
+ orgati = FALSE_;
+ sg2lb = -delsq2;
+ sg2ub = 0.f;
+ a = c__ * delsq - z__[*i__] * z__[*i__] - z__[ip1] * z__[ip1];
+ b = z__[ip1] * z__[ip1] * delsq;
+ if (a < 0.f) {
+ tau = b * 2.f / (a - sqrt((r__1 = a * a + b * 4.f * c__, dabs(
+ r__1))));
+ } else {
+ tau = -(a + sqrt((r__1 = a * a + b * 4.f * c__, dabs(r__1))))
+ / (c__ * 2.f);
+ }
+
+/*
+ TAU now is an estimation of SIGMA^2 - D( IP1 )^2. The
+ following, however, is the corresponding estimation of
+ SIGMA - D( IP1 ).
+*/
+
+ eta = tau / (d__[ip1] + sqrt((r__1 = d__[ip1] * d__[ip1] + tau,
+ dabs(r__1))));
+ }
+
+ if (orgati) {
+ ii = *i__;
+ *sigma = d__[*i__] + eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[*i__] + eta;
+ delta[j] = d__[j] - d__[*i__] - eta;
+/* L130: */
+ }
+ } else {
+ ii = *i__ + 1;
+ *sigma = d__[ip1] + eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] = d__[j] + d__[ip1] + eta;
+ delta[j] = d__[j] - d__[ip1] - eta;
+/* L140: */
+ }
+ }
+ iim1 = ii - 1;
+ iip1 = ii + 1;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L150: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.f;
+ phi = 0.f;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L160: */
+ }
+
+ w = rhoinv + phi + psi;
+
+/*
+ W is the value of the secular function with
+ its ii-th element removed.
+*/
+
+ swtch3 = FALSE_;
+ if (orgati) {
+ if (w < 0.f) {
+ swtch3 = TRUE_;
+ }
+ } else {
+ if (w > 0.f) {
+ swtch3 = TRUE_;
+ }
+ }
+ if (ii == 1 || ii == *n) {
+ swtch3 = FALSE_;
+ }
+
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w += temp;
+ erretm = (phi - psi) * 8.f + erretm + rhoinv * 2.f + dabs(temp) * 3.f
+ + dabs(tau) * dw;
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+ if (w <= 0.f) {
+ sg2lb = dmax(sg2lb,tau);
+ } else {
+ sg2ub = dmin(sg2ub,tau);
+ }
+
+/* Calculate the new step */
+
+ ++niter;
+ if (! swtch3) {
+ dtipsq = work[ip1] * delta[ip1];
+ dtisq = work[*i__] * delta[*i__];
+ if (orgati) {
+/* Computing 2nd power */
+ r__1 = z__[*i__] / dtisq;
+ c__ = w - dtipsq * dw + delsq * (r__1 * r__1);
+ } else {
+/* Computing 2nd power */
+ r__1 = z__[ip1] / dtipsq;
+ c__ = w - dtisq * dw - delsq * (r__1 * r__1);
+ }
+ a = (dtipsq + dtisq) * w - dtipsq * dtisq * dw;
+ b = dtipsq * dtisq * w;
+ if (c__ == 0.f) {
+ if (a == 0.f) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + dtipsq * dtipsq * (dpsi +
+ dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + dtisq * dtisq * (dpsi +
+ dphi);
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.f) {
+ eta = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1)))) /
+ (c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__, dabs(
+ r__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ dtiim = work[iim1] * delta[iim1];
+ dtiip = work[iip1] * delta[iip1];
+ temp = rhoinv + psi + phi;
+ if (orgati) {
+ temp1 = z__[iim1] / dtiim;
+ temp1 *= temp1;
+ c__ = temp - dtiip * (dpsi + dphi) - (d__[iim1] - d__[iip1]) *
+ (d__[iim1] + d__[iip1]) * temp1;
+ zz[0] = z__[iim1] * z__[iim1];
+ if (dpsi < temp1) {
+ zz[2] = dtiip * dtiip * dphi;
+ } else {
+ zz[2] = dtiip * dtiip * (dpsi - temp1 + dphi);
+ }
+ } else {
+ temp1 = z__[iip1] / dtiip;
+ temp1 *= temp1;
+ c__ = temp - dtiim * (dpsi + dphi) - (d__[iip1] - d__[iim1]) *
+ (d__[iim1] + d__[iip1]) * temp1;
+ if (dphi < temp1) {
+ zz[0] = dtiim * dtiim * dpsi;
+ } else {
+ zz[0] = dtiim * dtiim * (dpsi + (dphi - temp1));
+ }
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ zz[1] = z__[ii] * z__[ii];
+ dd[0] = dtiim;
+ dd[1] = delta[ii] * work[ii];
+ dd[2] = dtiip;
+ slaed6_(&niter, &orgati, &c__, dd, zz, &w, &eta, info);
+ if (*info != 0) {
+ goto L240;
+ }
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta >= 0.f) {
+ eta = -w / dw;
+ }
+ if (orgati) {
+ temp1 = work[*i__] * delta[*i__];
+ temp = eta - temp1;
+ } else {
+ temp1 = work[ip1] * delta[ip1];
+ temp = eta - temp1;
+ }
+ if (temp > sg2ub || temp < sg2lb) {
+ if (w < 0.f) {
+ eta = (sg2ub - tau) / 2.f;
+ } else {
+ eta = (sg2lb - tau) / 2.f;
+ }
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(*sigma * *sigma + eta);
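+
+/*
+   Illustrative note (added for exposition): ETA is a correction in the
+   squared variable (SIGMA**2), so the division above converts it into
+   the matching correction of SIGMA itself via the exact identity
+
+       sqrt( SIGMA**2 + ETA ) - SIGMA = ETA / ( SIGMA + sqrt( SIGMA**2 + ETA ) ).
+*/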
+
+ prew = w;
+
+ *sigma += eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] += eta;
+ delta[j] -= eta;
+/* L170: */
+ }
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L180: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.f;
+ phi = 0.f;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L190: */
+ }
+
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8.f + erretm + rhoinv * 2.f + dabs(temp) * 3.f
+ + dabs(tau) * dw;
+
+ if (w <= 0.f) {
+ sg2lb = dmax(sg2lb,tau);
+ } else {
+ sg2ub = dmin(sg2ub,tau);
+ }
+
+ swtch = FALSE_;
+ if (orgati) {
+ if (-w > dabs(prew) / 10.f) {
+ swtch = TRUE_;
+ }
+ } else {
+ if (w > dabs(prew) / 10.f) {
+ swtch = TRUE_;
+ }
+ }
+
+/* Main loop to update the values of the array DELTA and WORK */
+
+ iter = niter + 1;
+
+ for (niter = iter; niter <= 20; ++niter) {
+
+/* Test for convergence */
+
+ if (dabs(w) <= eps * erretm) {
+ goto L240;
+ }
+
+/* Calculate the new step */
+
+ if (! swtch3) {
+ dtipsq = work[ip1] * delta[ip1];
+ dtisq = work[*i__] * delta[*i__];
+ if (! swtch) {
+ if (orgati) {
+/* Computing 2nd power */
+ r__1 = z__[*i__] / dtisq;
+ c__ = w - dtipsq * dw + delsq * (r__1 * r__1);
+ } else {
+/* Computing 2nd power */
+ r__1 = z__[ip1] / dtipsq;
+ c__ = w - dtisq * dw - delsq * (r__1 * r__1);
+ }
+ } else {
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ if (orgati) {
+ dpsi += temp * temp;
+ } else {
+ dphi += temp * temp;
+ }
+ c__ = w - dtisq * dpsi - dtipsq * dphi;
+ }
+ a = (dtipsq + dtisq) * w - dtipsq * dtisq * dw;
+ b = dtipsq * dtisq * w;
+ if (c__ == 0.f) {
+ if (a == 0.f) {
+ if (! swtch) {
+ if (orgati) {
+ a = z__[*i__] * z__[*i__] + dtipsq * dtipsq *
+ (dpsi + dphi);
+ } else {
+ a = z__[ip1] * z__[ip1] + dtisq * dtisq * (
+ dpsi + dphi);
+ }
+ } else {
+ a = dtisq * dtisq * dpsi + dtipsq * dtipsq * dphi;
+ }
+ }
+ eta = b / a;
+ } else if (a <= 0.f) {
+ eta = (a - sqrt((r__1 = a * a - b * 4.f * c__, dabs(r__1))
+ )) / (c__ * 2.f);
+ } else {
+ eta = b * 2.f / (a + sqrt((r__1 = a * a - b * 4.f * c__,
+ dabs(r__1))));
+ }
+ } else {
+
+/* Interpolation using THREE most relevant poles */
+
+ dtiim = work[iim1] * delta[iim1];
+ dtiip = work[iip1] * delta[iip1];
+ temp = rhoinv + psi + phi;
+ if (swtch) {
+ c__ = temp - dtiim * dpsi - dtiip * dphi;
+ zz[0] = dtiim * dtiim * dpsi;
+ zz[2] = dtiip * dtiip * dphi;
+ } else {
+ if (orgati) {
+ temp1 = z__[iim1] / dtiim;
+ temp1 *= temp1;
+ temp2 = (d__[iim1] - d__[iip1]) * (d__[iim1] + d__[
+ iip1]) * temp1;
+ c__ = temp - dtiip * (dpsi + dphi) - temp2;
+ zz[0] = z__[iim1] * z__[iim1];
+ if (dpsi < temp1) {
+ zz[2] = dtiip * dtiip * dphi;
+ } else {
+ zz[2] = dtiip * dtiip * (dpsi - temp1 + dphi);
+ }
+ } else {
+ temp1 = z__[iip1] / dtiip;
+ temp1 *= temp1;
+ temp2 = (d__[iip1] - d__[iim1]) * (d__[iim1] + d__[
+ iip1]) * temp1;
+ c__ = temp - dtiim * (dpsi + dphi) - temp2;
+ if (dphi < temp1) {
+ zz[0] = dtiim * dtiim * dpsi;
+ } else {
+ zz[0] = dtiim * dtiim * (dpsi + (dphi - temp1));
+ }
+ zz[2] = z__[iip1] * z__[iip1];
+ }
+ }
+ dd[0] = dtiim;
+ dd[1] = delta[ii] * work[ii];
+ dd[2] = dtiip;
+ slaed6_(&niter, &orgati, &c__, dd, zz, &w, &eta, info);
+ if (*info != 0) {
+ goto L240;
+ }
+ }
+
+/*
+ Note, eta should be positive if w is negative, and
+ eta should be negative otherwise. However,
+ if for some reason caused by roundoff, eta*w > 0,
+ we simply use one Newton step instead. This way
+ will guarantee eta*w < 0.
+*/
+
+ if (w * eta >= 0.f) {
+ eta = -w / dw;
+ }
+ if (orgati) {
+ temp1 = work[*i__] * delta[*i__];
+ temp = eta - temp1;
+ } else {
+ temp1 = work[ip1] * delta[ip1];
+ temp = eta - temp1;
+ }
+ if (temp > sg2ub || temp < sg2lb) {
+ if (w < 0.f) {
+ eta = (sg2ub - tau) / 2.f;
+ } else {
+ eta = (sg2lb - tau) / 2.f;
+ }
+ }
+
+ tau += eta;
+ eta /= *sigma + sqrt(*sigma * *sigma + eta);
+
+ *sigma += eta;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ work[j] += eta;
+ delta[j] -= eta;
+/* L200: */
+ }
+
+ prew = w;
+
+/* Evaluate PSI and the derivative DPSI */
+
+ dpsi = 0.f;
+ psi = 0.f;
+ erretm = 0.f;
+ i__1 = iim1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ psi += z__[j] * temp;
+ dpsi += temp * temp;
+ erretm += psi;
+/* L210: */
+ }
+ erretm = dabs(erretm);
+
+/* Evaluate PHI and the derivative DPHI */
+
+ dphi = 0.f;
+ phi = 0.f;
+ i__1 = iip1;
+ for (j = *n; j >= i__1; --j) {
+ temp = z__[j] / (work[j] * delta[j]);
+ phi += z__[j] * temp;
+ dphi += temp * temp;
+ erretm += phi;
+/* L220: */
+ }
+
+ temp = z__[ii] / (work[ii] * delta[ii]);
+ dw = dpsi + dphi + temp * temp;
+ temp = z__[ii] * temp;
+ w = rhoinv + phi + psi + temp;
+ erretm = (phi - psi) * 8.f + erretm + rhoinv * 2.f + dabs(temp) *
+ 3.f + dabs(tau) * dw;
+ if (w * prew > 0.f && dabs(w) > dabs(prew) / 10.f) {
+ swtch = ! swtch;
+ }
+
+ if (w <= 0.f) {
+ sg2lb = dmax(sg2lb,tau);
+ } else {
+ sg2ub = dmin(sg2ub,tau);
+ }
+
+/* L230: */
+ }
+
+/* Return with INFO = 1, NITER = MAXIT and not converged */
+
+ *info = 1;
+
+ }
+
+L240:
+ return 0;
+
+/* End of SLASD4 */
+
+} /* slasd4_ */
+
+/* Subroutine */ int slasd5_(integer *i__, real *d__, real *z__, real *delta,
+ real *rho, real *dsigma, real *work)
+{
+ /* System generated locals */
+ real r__1;
+
+ /* Local variables */
+ static real b, c__, w, del, tau, delsq;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ This subroutine computes the square root of the I-th eigenvalue
+ of a positive symmetric rank-one modification of a 2-by-2 diagonal
+ matrix
+
+ diag( D ) * diag( D ) + RHO * Z * transpose(Z) .
+
+ The diagonal entries in the array D are assumed to satisfy
+
+ 0 <= D(i) < D(j) for i < j .
+
+ We also assume RHO > 0 and that the Euclidean norm of the vector
+ Z is one.
+
+ Arguments
+ =========
+
+ I (input) INTEGER
+ The index of the eigenvalue to be computed. I = 1 or I = 2.
+
+ D (input) REAL array, dimension (2)
+ The original eigenvalues. We assume 0 <= D(1) < D(2).
+
+ Z (input) REAL array, dimension (2)
+ The components of the updating vector.
+
+ DELTA (output) REAL array, dimension (2)
+ Contains (D(j) - sigma_I) in its j-th component.
+ The vector DELTA contains the information necessary
+ to construct the eigenvectors.
+
+ RHO (input) REAL
+ The scalar in the symmetric updating formula.
+
+ DSIGMA (output) REAL
+ The computed sigma_I, the I-th updated eigenvalue.
+
+ WORK (workspace) REAL array, dimension (2)
+ WORK contains (D(j) + sigma_I) in its j-th component.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ren-Cang Li, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --work;
+ --delta;
+ --z__;
+ --d__;
+
+ /* Function Body */
+ del = d__[2] - d__[1];
+ delsq = del * (d__[2] + d__[1]);
+ if (*i__ == 1) {
+ w = *rho * 4.f * (z__[2] * z__[2] / (d__[1] + d__[2] * 3.f) - z__[1] *
+ z__[1] / (d__[1] * 3.f + d__[2])) / del + 1.f;
+ if (w > 0.f) {
+ b = delsq + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[1] * z__[1] * delsq;
+
+/*
+ B > ZERO, always
+
+ The following TAU is DSIGMA * DSIGMA - D( 1 ) * D( 1 )
+*/
+
+ tau = c__ * 2.f / (b + sqrt((r__1 = b * b - c__ * 4.f, dabs(r__1))
+ ));
+
+/* The following TAU is DSIGMA - D( 1 ) */
+
+ tau /= d__[1] + sqrt(d__[1] * d__[1] + tau);
+ *dsigma = d__[1] + tau;
+ delta[1] = -tau;
+ delta[2] = del - tau;
+ work[1] = d__[1] * 2.f + tau;
+ work[2] = d__[1] + tau + d__[2];
+/*
+ DELTA( 1 ) = -Z( 1 ) / TAU
+ DELTA( 2 ) = Z( 2 ) / ( DEL-TAU )
+*/
+ } else {
+ b = -delsq + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * delsq;
+
+/* The following TAU is DSIGMA * DSIGMA - D( 2 ) * D( 2 ) */
+
+ if (b > 0.f) {
+ tau = c__ * -2.f / (b + sqrt(b * b + c__ * 4.f));
+ } else {
+ tau = (b - sqrt(b * b + c__ * 4.f)) / 2.f;
+ }
+
+/* The following TAU is DSIGMA - D( 2 ) */
+
+ tau /= d__[2] + sqrt((r__1 = d__[2] * d__[2] + tau, dabs(r__1)));
+ *dsigma = d__[2] + tau;
+ delta[1] = -(del + tau);
+ delta[2] = -tau;
+ work[1] = d__[1] + tau + d__[2];
+ work[2] = d__[2] * 2.f + tau;
+/*
+ DELTA( 1 ) = -Z( 1 ) / ( DEL+TAU )
+ DELTA( 2 ) = -Z( 2 ) / TAU
+*/
+ }
+/*
+ TEMP = SQRT( DELTA( 1 )*DELTA( 1 )+DELTA( 2 )*DELTA( 2 ) )
+ DELTA( 1 ) = DELTA( 1 ) / TEMP
+ DELTA( 2 ) = DELTA( 2 ) / TEMP
+*/
+ } else {
+
+/* Now I=2 */
+
+ b = -delsq + *rho * (z__[1] * z__[1] + z__[2] * z__[2]);
+ c__ = *rho * z__[2] * z__[2] * delsq;
+
+/* The following TAU is DSIGMA * DSIGMA - D( 2 ) * D( 2 ) */
+
+ if (b > 0.f) {
+ tau = (b + sqrt(b * b + c__ * 4.f)) / 2.f;
+ } else {
+ tau = c__ * 2.f / (-b + sqrt(b * b + c__ * 4.f));
+ }
+
+/* The following TAU is DSIGMA - D( 2 ) */
+
+ tau /= d__[2] + sqrt(d__[2] * d__[2] + tau);
+ *dsigma = d__[2] + tau;
+ delta[1] = -(del + tau);
+ delta[2] = -tau;
+ work[1] = d__[1] + tau + d__[2];
+ work[2] = d__[2] * 2.f + tau;
+/*
+ DELTA( 1 ) = -Z( 1 ) / ( DEL+TAU )
+ DELTA( 2 ) = -Z( 2 ) / TAU
+ TEMP = SQRT( DELTA( 1 )*DELTA( 1 )+DELTA( 2 )*DELTA( 2 ) )
+ DELTA( 1 ) = DELTA( 1 ) / TEMP
+ DELTA( 2 ) = DELTA( 2 ) / TEMP
+*/
+ }
+ return 0;
+
+/* End of SLASD5 */
+
+} /* slasd5_ */
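+
+/*
+   Illustrative sketch (added for exposition, not part of the LAPACK
+   source): a hypothetical caller of slasd5_ following the interface
+   documented above.  The numerical values are invented and only chosen
+   to satisfy the stated assumptions (0 <= D(1) < D(2), RHO > 0,
+   ||Z|| = 1); the snippet is kept inside this comment so that it is
+   not compiled with the library.
+
+       integer i1 = 1;
+       real d[2] = { 0.5f, 2.0f };
+       real z[2] = { 0.6f, 0.8f };
+       real rho = 1.0f;
+       real delta[2], work[2], dsigma;
+
+       slasd5_(&i1, d, z, delta, &rho, &dsigma, work);
+
+   On return, dsigma holds sigma_1, delta[j-1] = D(j) - sigma_1 and
+   work[j-1] = D(j) + sigma_1 for j = 1, 2.
+*/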
+
+/* Subroutine */ int slasd6_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, real *d__, real *vf, real *vl, real *alpha, real *beta,
+ integer *idxq, integer *perm, integer *givptr, integer *givcol,
+ integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real *
+ difl, real *difr, real *z__, integer *k, real *c__, real *s, real *
+ work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, givnum_dim1, givnum_offset,
+ poles_dim1, poles_offset, i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, m, n, n1, n2, iw, idx, idxc, idxp, ivfw, ivlw;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), slasd7_(integer *, integer *, integer *, integer *,
+ integer *, real *, real *, real *, real *, real *, real *, real *,
+ real *, real *, real *, integer *, integer *, integer *, integer
+ *, integer *, integer *, integer *, real *, integer *, real *,
+ real *, integer *), slasd8_(integer *, integer *, real *, real *,
+ real *, real *, real *, real *, integer *, real *, real *,
+ integer *);
+ static integer isigma;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slascl_(
+ char *, integer *, integer *, real *, real *, integer *, integer *
+ , real *, integer *, integer *), slamrg_(integer *,
+ integer *, real *, integer *, integer *, integer *);
+ static real orgnrm;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLASD6 computes the SVD of an updated upper bidiagonal matrix B
+ obtained by merging two smaller ones by appending a row. This
+ routine is used only for the problem which requires all singular
+ values and optionally singular vector matrices in factored form.
+ B is an N-by-M matrix with N = NL + NR + 1 and M = N + SQRE.
+ A related subroutine, SLASD1, handles the case in which all singular
+ values and singular vectors of the bidiagonal matrix are desired.
+
+ SLASD6 computes the SVD as follows:
+
+ ( D1(in) 0 0 0 )
+ B = U(in) * ( Z1' a Z2' b ) * VT(in)
+ ( 0 0 D2(in) 0 )
+
+ = U(out) * ( D(out) 0) * VT(out)
+
+ where Z' = (Z1' a Z2' b) = u' VT', and u is a vector of dimension M
+ with ALPHA and BETA in the (NL+1)-th and (NL+2)-th entries and zeros
+ elsewhere; and the entry b is empty if SQRE = 0.
+
+ The singular values of B can be computed using D1, D2, the first
+ components of all the right singular vectors of the lower block, and
+ the last components of all the right singular vectors of the upper
+ block. These components are stored and updated in VF and VL,
+ respectively, in SLASD6. Hence U and VT are not explicitly
+ referenced.
+
+ The singular values are stored in D. The algorithm consists of two
+ stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple singular values or if there is a zero
+ in the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine SLASD7.
+
+ The second stage consists of calculating the updated
+ singular values. This is done by finding the roots of the
+ secular equation via the routine SLASD4 (as called by SLASD8).
+ This routine also updates VF and VL and computes the distances
+ between the updated singular values and the old singular
+ values.
+
+ SLASD6 is called from SLASDA.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form:
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors in factored form as well.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ D (input/output) REAL array, dimension (NL+NR+1).
+ On entry D(1:NL) contains the singular values of the
+ upper block, and D(NL+2:N) contains the singular values
+ of the lower block. On exit D(1:N) contains the singular
+ values of the modified matrix.
+
+ VF (input/output) REAL array, dimension (M)
+ On entry, VF(1:NL+1) contains the first components of all
+ right singular vectors of the upper block; and VF(NL+2:M)
+ contains the first components of all right singular vectors
+ of the lower block. On exit, VF contains the first components
+ of all right singular vectors of the bidiagonal matrix.
+
+ VL (input/output) REAL array, dimension (M)
+ On entry, VL(1:NL+1) contains the last components of all
+ right singular vectors of the upper block; and VL(NL+2:M)
+ contains the last components of all right singular vectors of
+ the lower block. On exit, VL contains the last components of
+ all right singular vectors of the bidiagonal matrix.
+
+ ALPHA (input/output) REAL
+ Contains the diagonal element associated with the added row.
+
+ BETA (input/output) REAL
+ Contains the off-diagonal element associated with the added
+ row.
+
+ IDXQ (output) INTEGER array, dimension (N)
+ This contains the permutation which will reintegrate the
+ subproblem just solved back into sorted order, i.e.
+ D( IDXQ( I = 1, N ) ) will be in ascending order.
+
+ PERM (output) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) to be applied
+ to each block. Not referenced if ICOMPQ = 0.
+
+ GIVPTR (output) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem. Not referenced if ICOMPQ = 0.
+
+ GIVCOL (output) INTEGER array, dimension ( LDGCOL, 2 )
+ Each pair of numbers indicates a pair of columns to be used
+ in a Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (output) REAL array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value to be used in the
+ corresponding Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of GIVNUM and POLES, must be at least N.
+
+ POLES (output) REAL array, dimension ( LDGNUM, 2 )
+ On exit, POLES(1,*) is an array containing the new singular
+ values obtained from solving the secular equation, and
+ POLES(2,*) is an array containing the poles in the secular
+ equation. Not referenced if ICOMPQ = 0.
+
+ DIFL (output) REAL array, dimension ( N )
+ On exit, DIFL(I) is the distance between I-th updated
+ (undeflated) singular value and the I-th (undeflated) old
+ singular value.
+
+ DIFR (output) REAL array,
+ dimension ( LDGNUM, 2 ) if ICOMPQ = 1 and
+ dimension ( N ) if ICOMPQ = 0.
+ On exit, DIFR(I, 1) is the distance between I-th updated
+ (undeflated) singular value and the I+1-th (undeflated) old
+ singular value.
+
+ If ICOMPQ = 1, DIFR(1:K,2) is an array containing the
+ normalizing factors for the right singular vector matrix.
+
+ See SLASD8 for details on DIFL and DIFR.
+
+ Z (output) REAL array, dimension ( M )
+ The first elements of this array contain the components
+ of the deflation-adjusted updating row vector.
+
+ K (output) INTEGER
+ Contains the dimension of the non-deflated matrix.
+ This is the order of the related secular equation. 1 <= K <= N.
+
+ C (output) REAL
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (output) REAL
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ WORK (workspace) REAL array, dimension ( 4 * M )
+
+ IWORK (workspace) INTEGER array, dimension ( 3 * N )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --vf;
+ --vl;
+ --idxq;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ poles_dim1 = *ldgnum;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ --difl;
+ --difr;
+ --z__;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ } else if (*ldgcol < n) {
+ *info = -14;
+ } else if (*ldgnum < n) {
+ *info = -16;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD6", &i__1);
+ return 0;
+ }
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in SLASD7 and SLASD8.
+*/
+
+ isigma = 1;
+ iw = isigma + n;
+ ivfw = iw + m;
+ ivlw = ivfw + m;
+
+ idx = 1;
+ idxc = idx + n;
+ idxp = idxc + n;
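+
+/*
+   Illustrative note (added for exposition): the four real segments
+   ISIGMA, IW, IVFW and IVLW occupy N + 3*M entries of WORK, within the
+   documented size of 4*M (since N <= M), and the three integer segments
+   IDX, IDXC and IDXP occupy the documented 3*N entries of IWORK.
+*/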
+
+/*
+ Scale.
+
+ Computing MAX
+*/
+ r__1 = dabs(*alpha), r__2 = dabs(*beta);
+ orgnrm = dmax(r__1,r__2);
+ d__[*nl + 1] = 0.f;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((r__1 = d__[i__], dabs(r__1)) > orgnrm) {
+ orgnrm = (r__1 = d__[i__], dabs(r__1));
+ }
+/* L10: */
+ }
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &n, &c__1, &d__[1], &n, info);
+ *alpha /= orgnrm;
+ *beta /= orgnrm;
+
+/* Sort and Deflate singular values. */
+
+ slasd7_(icompq, nl, nr, sqre, k, &d__[1], &z__[1], &work[iw], &vf[1], &
+ work[ivfw], &vl[1], &work[ivlw], alpha, beta, &work[isigma], &
+ iwork[idx], &iwork[idxp], &idxq[1], &perm[1], givptr, &givcol[
+ givcol_offset], ldgcol, &givnum[givnum_offset], ldgnum, c__, s,
+ info);
+
+/* Solve Secular Equation, compute DIFL, DIFR, and update VF, VL. */
+
+ slasd8_(icompq, k, &d__[1], &z__[1], &vf[1], &vl[1], &difl[1], &difr[1],
+ ldgnum, &work[isigma], &work[iw], info);
+
+/* Save the poles if ICOMPQ = 1. */
+
+ if (*icompq == 1) {
+ scopy_(k, &d__[1], &c__1, &poles[poles_dim1 + 1], &c__1);
+ scopy_(k, &work[isigma], &c__1, &poles[(poles_dim1 << 1) + 1], &c__1);
+ }
+
+/* Unscale. */
+
+ slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, &n, &c__1, &d__[1], &n, info);
+
+/* Prepare the IDXQ sorting permutation. */
+
+ n1 = *k;
+ n2 = n - *k;
+ slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]);
+
+ return 0;
+
+/* End of SLASD6 */
+
+} /* slasd6_ */
+
+/* Subroutine */ int slasd7_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, integer *k, real *d__, real *z__, real *zw, real *vf,
+ real *vfw, real *vl, real *vlw, real *alpha, real *beta, real *dsigma,
+ integer *idx, integer *idxp, integer *idxq, integer *perm, integer *
+ givptr, integer *givcol, integer *ldgcol, real *givnum, integer *
+ ldgnum, real *c__, real *s, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, givnum_dim1, givnum_offset, i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, j, m, n, k2;
+ static real z1;
+ static integer jp;
+ static real eps, tau, tol;
+ static integer nlp1, nlp2, idxi, idxj;
+ extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+ integer *, real *, real *);
+ static integer idxjp, jprev;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ extern doublereal slapy2_(real *, real *), slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), slamrg_(
+ integer *, integer *, real *, integer *, integer *, integer *);
+ static real hlftol;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASD7 merges the two sets of singular values together into a single
+ sorted set. Then it tries to deflate the size of the problem. There
+ are two ways in which deflation can occur: when two or more singular
+ values are close together or if there is a tiny entry in the Z
+ vector. For each such occurrence the order of the related
+ secular equation problem is reduced by one.
+
+ SLASD7 is called from SLASD6.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed
+ in compact form, as follows:
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors of upper
+ bidiagonal matrix in compact form.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has
+ N = NL + NR + 1 rows and
+ M = N + SQRE >= N columns.
+
+ K (output) INTEGER
+ Contains the dimension of the non-deflated matrix, this is
+ the order of the related secular equation. 1 <= K <= N.
+
+ D (input/output) REAL array, dimension ( N )
+ On entry D contains the singular values of the two submatrices
+ to be combined. On exit D contains the trailing (N-K) updated
+ singular values (those which were deflated) sorted into
+ increasing order.
+
+ Z (output) REAL array, dimension ( M )
+ On exit Z contains the updating row vector in the secular
+ equation.
+
+ ZW (workspace) REAL array, dimension ( M )
+ Workspace for Z.
+
+ VF (input/output) REAL array, dimension ( M )
+ On entry, VF(1:NL+1) contains the first components of all
+ right singular vectors of the upper block; and VF(NL+2:M)
+ contains the first components of all right singular vectors
+ of the lower block. On exit, VF contains the first components
+ of all right singular vectors of the bidiagonal matrix.
+
+ VFW (workspace) REAL array, dimension ( M )
+ Workspace for VF.
+
+ VL (input/output) REAL array, dimension ( M )
+ On entry, VL(1:NL+1) contains the last components of all
+ right singular vectors of the upper block; and VL(NL+2:M)
+ contains the last components of all right singular vectors
+ of the lower block. On exit, VL contains the last components
+ of all right singular vectors of the bidiagonal matrix.
+
+ VLW (workspace) REAL array, dimension ( M )
+ Workspace for VL.
+
+ ALPHA (input) REAL
+ Contains the diagonal element associated with the added row.
+
+ BETA (input) REAL
+ Contains the off-diagonal element associated with the added
+ row.
+
+ DSIGMA (output) REAL array, dimension ( N )
+ Contains a copy of the diagonal elements (K-1 singular values
+ and one zero) in the secular equation.
+
+ IDX (workspace) INTEGER array, dimension ( N )
+ This will contain the permutation used to sort the contents of
+ D into ascending order.
+
+ IDXP (workspace) INTEGER array, dimension ( N )
+ This will contain the permutation used to place deflated
+ values of D at the end of the array. On output IDXP(2:K)
+ points to the nondeflated D-values and IDXP(K+1:N)
+ points to the deflated singular values.
+
+ IDXQ (input) INTEGER array, dimension ( N )
+ This contains the permutation which separately sorts the two
+ sub-problems in D into ascending order. Note that entries in
+ the first half of this permutation must first be moved one
+ position backward; and entries in the second half
+ must first have NL+1 added to their values.
+
+ PERM (output) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) to be applied
+ to each singular block. Not referenced if ICOMPQ = 0.
+
+ GIVPTR (output) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem. Not referenced if ICOMPQ = 0.
+
+ GIVCOL (output) INTEGER array, dimension ( LDGCOL, 2 )
+ Each pair of numbers indicates a pair of columns to be used
+ in a Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (output) REAL array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value to be used in the
+ corresponding Givens rotation. Not referenced if ICOMPQ = 0.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of GIVNUM, must be at least N.
+
+ C (output) REAL
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (output) REAL
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --z__;
+ --zw;
+ --vf;
+ --vfw;
+ --vl;
+ --vlw;
+ --dsigma;
+ --idx;
+ --idxp;
+ --idxq;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+
+ /* Function Body */
+ *info = 0;
+ n = *nl + *nr + 1;
+ m = n + *sqre;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ } else if (*ldgcol < n) {
+ *info = -22;
+ } else if (*ldgnum < n) {
+ *info = -24;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD7", &i__1);
+ return 0;
+ }
+
+ nlp1 = *nl + 1;
+ nlp2 = *nl + 2;
+ if (*icompq == 1) {
+ *givptr = 0;
+ }
+
+/*
+ Generate the first part of the vector Z and move the singular
+ values in the first part of D one position backward.
+*/
+
+ z1 = *alpha * vl[nlp1];
+ vl[nlp1] = 0.f;
+ tau = vf[nlp1];
+ for (i__ = *nl; i__ >= 1; --i__) {
+ z__[i__ + 1] = *alpha * vl[i__];
+ vl[i__] = 0.f;
+ vf[i__ + 1] = vf[i__];
+ d__[i__ + 1] = d__[i__];
+ idxq[i__ + 1] = idxq[i__] + 1;
+/* L10: */
+ }
+ vf[1] = tau;
+
+/* Generate the second part of the vector Z. */
+
+ i__1 = m;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ z__[i__] = *beta * vf[i__];
+ vf[i__] = 0.f;
+/* L20: */
+ }
+
+/* Sort the singular values into increasing order */
+
+ i__1 = n;
+ for (i__ = nlp2; i__ <= i__1; ++i__) {
+ idxq[i__] += nlp1;
+/* L30: */
+ }
+
+/* DSIGMA, ZW, VFW, and VLW are used as storage space. */
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ dsigma[i__] = d__[idxq[i__]];
+ zw[i__] = z__[idxq[i__]];
+ vfw[i__] = vf[idxq[i__]];
+ vlw[i__] = vl[idxq[i__]];
+/* L40: */
+ }
+
+ slamrg_(nl, nr, &dsigma[2], &c__1, &c__1, &idx[2]);
+
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ idxi = idx[i__] + 1;
+ d__[i__] = dsigma[idxi];
+ z__[i__] = zw[idxi];
+ vf[i__] = vfw[idxi];
+ vl[i__] = vlw[idxi];
+/* L50: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ eps = slamch_("Epsilon");
+/* Computing MAX */
+ r__1 = dabs(*alpha), r__2 = dabs(*beta);
+ tol = dmax(r__1,r__2);
+/* Computing MAX */
+ r__2 = (r__1 = d__[n], dabs(r__1));
+ tol = eps * 64.f * dmax(r__2,tol);
+
+/*
+ There are 2 kinds of deflation -- first a value in the z-vector
+ is small, second two (or more) singular values are very close
+ together (their difference is small).
+
+ If the value in the z-vector is small, we simply permute the
+ array so that the corresponding singular value is moved to the
+ end.
+
+ If two values in the D-vector are close, we perform a two-sided
+ rotation designed to make one of the corresponding z-vector
+ entries zero, and then permute the array so that the deflated
+ singular value is moved to the end.
+
+ If there are multiple singular values then the problem deflates.
+ Here the number of equal singular values is found. As each equal
+ singular value is found, an elementary reflector is computed to
+ rotate the corresponding singular subspace so that the
+ corresponding components of Z are zero in this new basis.
+*/
+
+ *k = 1;
+ k2 = n + 1;
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ if ((r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ if (j == n) {
+ goto L100;
+ }
+ } else {
+ jprev = j;
+ goto L70;
+ }
+/* L60: */
+ }
+L70:
+ j = jprev;
+L80:
+ ++j;
+ if (j > n) {
+ goto L90;
+ }
+ if ((r__1 = z__[j], dabs(r__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ idxp[k2] = j;
+ } else {
+
+/* Check if singular values are close enough to allow deflation. */
+
+ if ((r__1 = d__[j] - d__[jprev], dabs(r__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ *s = z__[jprev];
+ *c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = slapy2_(c__, s);
+ z__[j] = tau;
+ z__[jprev] = 0.f;
+ *c__ /= tau;
+ *s = -(*s) / tau;
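+
+/*
+   Illustrative note (added for exposition): C and S now define the
+   Givens rotation
+
+       [  C  S ] [ Z( JPREV ) ]   [  0  ]
+       [ -S  C ] [ Z( J )     ] = [ TAU ],
+
+   which zeroes the JPREV-th entry of Z; the srot_ calls below apply
+   the same rotation to VF and VL so that the singular vector
+   information stays consistent with the deflated problem.
+*/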
+
+/* Record the appropriate Givens rotation */
+
+ if (*icompq == 1) {
+ ++(*givptr);
+ idxjp = idxq[idx[jprev] + 1];
+ idxj = idxq[idx[j] + 1];
+ if (idxjp <= nlp1) {
+ --idxjp;
+ }
+ if (idxj <= nlp1) {
+ --idxj;
+ }
+ givcol[*givptr + (givcol_dim1 << 1)] = idxjp;
+ givcol[*givptr + givcol_dim1] = idxj;
+ givnum[*givptr + (givnum_dim1 << 1)] = *c__;
+ givnum[*givptr + givnum_dim1] = *s;
+ }
+ srot_(&c__1, &vf[jprev], &c__1, &vf[j], &c__1, c__, s);
+ srot_(&c__1, &vl[jprev], &c__1, &vl[j], &c__1, c__, s);
+ --k2;
+ idxp[k2] = jprev;
+ jprev = j;
+ } else {
+ ++(*k);
+ zw[*k] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+ jprev = j;
+ }
+ }
+ goto L80;
+L90:
+
+/* Record the last singular value. */
+
+ ++(*k);
+ zw[*k] = z__[jprev];
+ dsigma[*k] = d__[jprev];
+ idxp[*k] = jprev;
+
+L100:
+
+/*
+ Sort the singular values into DSIGMA. The singular values which
+ were not deflated go into the first K slots of DSIGMA, except
+ that DSIGMA(1) is treated separately.
+*/
+
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ dsigma[j] = d__[jp];
+ vfw[j] = vf[jp];
+ vlw[j] = vl[jp];
+/* L110: */
+ }
+ if (*icompq == 1) {
+ i__1 = n;
+ for (j = 2; j <= i__1; ++j) {
+ jp = idxp[j];
+ perm[j] = idxq[idx[jp] + 1];
+ if (perm[j] <= nlp1) {
+ --perm[j];
+ }
+/* L120: */
+ }
+ }
+
+/*
+ The deflated singular values go back into the last N - K slots of
+ D.
+*/
+
+ i__1 = n - *k;
+ scopy_(&i__1, &dsigma[*k + 1], &c__1, &d__[*k + 1], &c__1);
+
+/*
+ Determine DSIGMA(1), DSIGMA(2), Z(1), VF(1), VL(1), VF(M), and
+ VL(M).
+*/
+
+ dsigma[1] = 0.f;
+ hlftol = tol / 2.f;
+ if (dabs(dsigma[2]) <= hlftol) {
+ dsigma[2] = hlftol;
+ }
+ if (m > n) {
+ z__[1] = slapy2_(&z1, &z__[m]);
+ if (z__[1] <= tol) {
+ *c__ = 1.f;
+ *s = 0.f;
+ z__[1] = tol;
+ } else {
+ *c__ = z1 / z__[1];
+ *s = -z__[m] / z__[1];
+ }
+ srot_(&c__1, &vf[m], &c__1, &vf[1], &c__1, c__, s);
+ srot_(&c__1, &vl[m], &c__1, &vl[1], &c__1, c__, s);
+ } else {
+ if (dabs(z1) <= tol) {
+ z__[1] = tol;
+ } else {
+ z__[1] = z1;
+ }
+ }
+
+/* Restore Z, VF, and VL. */
+
+ i__1 = *k - 1;
+ scopy_(&i__1, &zw[2], &c__1, &z__[2], &c__1);
+ i__1 = n - 1;
+ scopy_(&i__1, &vfw[2], &c__1, &vf[2], &c__1);
+ i__1 = n - 1;
+ scopy_(&i__1, &vlw[2], &c__1, &vl[2], &c__1);
+
+ return 0;
+
+/* End of SLASD7 */
+
+} /* slasd7_ */
+
+/* Subroutine */ int slasd8_(integer *icompq, integer *k, real *d__, real *
+ z__, real *vf, real *vl, real *difl, real *difr, integer *lddifr,
+ real *dsigma, real *work, integer *info)
+{
+ /* System generated locals */
+ integer difr_dim1, difr_offset, i__1, i__2;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static real dj, rho;
+ static integer iwk1, iwk2, iwk3;
+ static real temp;
+ extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
+ static integer iwk2i, iwk3i;
+ extern doublereal snrm2_(integer *, real *, integer *);
+ static real diflj, difrj, dsigj;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ extern doublereal slamc3_(real *, real *);
+ extern /* Subroutine */ int slasd4_(integer *, integer *, real *, real *,
+ real *, real *, real *, real *, integer *), xerbla_(char *,
+ integer *);
+ static real dsigjp;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *,
+ real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLASD8 finds the square roots of the roots of the secular equation,
+ as defined by the values in DSIGMA and Z. It makes the appropriate
+ calls to SLASD4, and stores, for each element in D, the distance
+ to its two nearest poles (elements in DSIGMA). It also updates
+ the arrays VF and VL, the first and last components of all the
+ right singular vectors of the original bidiagonal matrix.
+
+ SLASD8 is called from SLASD6.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form in the calling routine:
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors in factored form as well.
+
+ K (input) INTEGER
+ The number of terms in the rational function to be solved
+ by SLASD4. K >= 1.
+
+ D (output) REAL array, dimension ( K )
+ On output, D contains the updated singular values.
+
+ Z (input/output) REAL array, dimension ( K )
+ On entry, the first K elements of this array contain the
+ components of the deflation-adjusted updating row vector.
+ On exit, Z is updated.
+
+ VF (input/output) REAL array, dimension ( K )
+ On entry, VF contains information passed through DBEDE8.
+ On exit, VF contains the first K components of the first
+ components of all right singular vectors of the bidiagonal
+ matrix.
+
+ VL (input/output) REAL array, dimension ( K )
+ On entry, VL contains information passed through DBEDE8.
+ On exit, VL contains the first K components of the last
+ components of all right singular vectors of the bidiagonal
+ matrix.
+
+ DIFL (output) REAL array, dimension ( K )
+ On exit, DIFL(I) = D(I) - DSIGMA(I).
+
+ DIFR (output) REAL array,
+ dimension ( LDDIFR, 2 ) if ICOMPQ = 1 and
+ dimension ( K ) if ICOMPQ = 0.
+ On exit, DIFR(I,1) = D(I) - DSIGMA(I+1), DIFR(K,1) is not
+ defined and will not be referenced.
+
+ If ICOMPQ = 1, DIFR(1:K,2) is an array containing the
+ normalizing factors for the right singular vector matrix.
+
+ LDDIFR (input) INTEGER
+ The leading dimension of DIFR, must be at least K.
+
+ DSIGMA (input/output) REAL array, dimension ( K )
+ On entry, the first K elements of this array contain the old
+ roots of the deflated updating problem. These are the poles
+ of the secular equation.
+ On exit, the elements of DSIGMA may be very slightly altered
+ in value.
+
+ WORK (workspace) REAL array, dimension at least 3 * K
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --z__;
+ --vf;
+ --vl;
+ --difl;
+ difr_dim1 = *lddifr;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ --dsigma;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*k < 1) {
+ *info = -2;
+ } else if (*lddifr < *k) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASD8", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*k == 1) {
+ d__[1] = dabs(z__[1]);
+ difl[1] = d__[1];
+ if (*icompq == 1) {
+ difl[2] = 1.f;
+ difr[(difr_dim1 << 1) + 1] = 1.f;
+ }
+ return 0;
+ }
+
+/*
+ Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
+ be computed with high relative accuracy (barring over/underflow).
+ This is a problem on machines without a guard digit in
+ add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
+ The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
+ which on any of these machines zeros out the bottommost
+ bit of DSIGMA(I) if it is 1; this makes the subsequent
+ subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
+ occurs. On binary machines with a guard digit (almost all
+ machines) it does not change DSIGMA(I) at all. On hexadecimal
+ and decimal machines with a guard digit, it slightly
+ changes the bottommost bits of DSIGMA(I). It does not account
+ for hexadecimal or decimal machines without guard digits
+ (we know of none). We use a subroutine call to compute
+ 2*DSIGMA(I) to prevent optimizing compilers from eliminating
+ this code.
+*/
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dsigma[i__] = slamc3_(&dsigma[i__], &dsigma[i__]) - dsigma[i__];
+/* L10: */
+ }
+
+/* Bookkeeping. */
+
+ iwk1 = 1;
+ iwk2 = iwk1 + *k;
+ iwk3 = iwk2 + *k;
+ iwk2i = iwk2 - 1;
+ iwk3i = iwk3 - 1;
+
+/* Normalize Z. */
+
+ rho = snrm2_(k, &z__[1], &c__1);
+ slascl_("G", &c__0, &c__0, &rho, &c_b15, k, &c__1, &z__[1], k, info);
+ rho *= rho;
+
+/* Initialize WORK(IWK3). */
+
+ slaset_("A", k, &c__1, &c_b15, &c_b15, &work[iwk3], k);
+
+/*
+ Compute the updated singular values, the arrays DIFL, DIFR,
+ and the updated Z.
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ slasd4_(k, &j, &dsigma[1], &z__[1], &work[iwk1], &rho, &d__[j], &work[
+ iwk2], info);
+
+/* If the root finder fails, the computation is terminated. */
+
+ if (*info != 0) {
+ return 0;
+ }
+ work[iwk3i + j] = work[iwk3i + j] * work[j] * work[iwk2i + j];
+ difl[j] = -work[j];
+ difr[j + difr_dim1] = -work[j + 1];
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[iwk3i + i__] = work[iwk3i + i__] * work[i__] * work[iwk2i +
+ i__] / (dsigma[i__] - dsigma[j]) / (dsigma[i__] + dsigma[
+ j]);
+/* L20: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ work[iwk3i + i__] = work[iwk3i + i__] * work[i__] * work[iwk2i +
+ i__] / (dsigma[i__] - dsigma[j]) / (dsigma[i__] + dsigma[
+ j]);
+/* L30: */
+ }
+/* L40: */
+ }
+
+/* Compute updated Z. */
+
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ r__2 = sqrt((r__1 = work[iwk3i + i__], dabs(r__1)));
+ z__[i__] = r_sign(&r__2, &z__[i__]);
+/* L50: */
+ }
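+
+/*
+   Illustrative note (added for exposition): at this point WORK(IWK3)
+   holds, for each I, the product
+
+       prod_j ( DSIGMA(I)**2 - D(j)**2 ) / prod_{j /= I} ( DSIGMA(I)**2 - DSIGMA(j)**2 ),
+
+   and the loop above has restored Z(I) as the signed square root of
+   that quantity, so that the singular values returned by SLASD4 are
+   exact for a nearby updating vector.
+*/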
+
+/* Update VF and VL. */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ diflj = difl[j];
+ dj = d__[j];
+ dsigj = -dsigma[j];
+ if (j < *k) {
+ difrj = -difr[j + difr_dim1];
+ dsigjp = -dsigma[j + 1];
+ }
+ work[j] = -z__[j] / diflj / (dsigma[j] + dj);
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[i__] = z__[i__] / (slamc3_(&dsigma[i__], &dsigj) - diflj) / (
+ dsigma[i__] + dj);
+/* L60: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ work[i__] = z__[i__] / (slamc3_(&dsigma[i__], &dsigjp) + difrj) /
+ (dsigma[i__] + dj);
+/* L70: */
+ }
+ temp = snrm2_(k, &work[1], &c__1);
+ work[iwk2i + j] = sdot_(k, &work[1], &c__1, &vf[1], &c__1) / temp;
+ work[iwk3i + j] = sdot_(k, &work[1], &c__1, &vl[1], &c__1) / temp;
+ if (*icompq == 1) {
+ difr[j + (difr_dim1 << 1)] = temp;
+ }
+/* L80: */
+ }
+
+ scopy_(k, &work[iwk2], &c__1, &vf[1], &c__1);
+ scopy_(k, &work[iwk3], &c__1, &vl[1], &c__1);
+
+ return 0;
+
+/* End of SLASD8 */
+
+} /* slasd8_ */
+
+/* Subroutine */ int slasda_(integer *icompq, integer *smlsiz, integer *n,
+ integer *sqre, real *d__, real *e, real *u, integer *ldu, real *vt,
+ integer *k, real *difl, real *difr, real *z__, real *poles, integer *
+ givptr, integer *givcol, integer *ldgcol, integer *perm, real *givnum,
+ real *c__, real *s, real *work, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, difl_dim1,
+ difl_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset,
+ poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset,
+ z_dim1, z_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, m, i1, ic, lf, nd, ll, nl, vf, nr, vl, im1, ncc,
+ nlf, nrf, vfi, iwk, vli, lvl, nru, ndb1, nlp1, lvl2, nrp1;
+ static real beta;
+ static integer idxq, nlvl;
+ static real alpha;
+ static integer inode, ndiml, ndimr, idxqi, itemp, sqrei;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), slasd6_(integer *, integer *, integer *, integer *,
+ real *, real *, real *, real *, real *, integer *, integer *,
+ integer *, integer *, integer *, real *, integer *, real *, real *
+ , real *, real *, integer *, real *, real *, real *, integer *,
+ integer *);
+ static integer nwork1, nwork2;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slasdq_(
+ char *, integer *, integer *, integer *, integer *, integer *,
+ real *, real *, real *, integer *, real *, integer *, real *,
+ integer *, real *, integer *), slasdt_(integer *, integer
+ *, integer *, integer *, integer *, integer *, integer *),
+ slaset_(char *, integer *, integer *, real *, real *, real *,
+ integer *);
+ static integer smlszp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ Using a divide and conquer approach, SLASDA computes the singular
+ value decomposition (SVD) of a real upper bidiagonal N-by-M matrix
+ B with diagonal D and offdiagonal E, where M = N + SQRE. The
+ algorithm computes the singular values in the SVD B = U * S * VT.
+ The orthogonal matrices U and VT are optionally computed in
+ compact form.
+
+ A related subroutine, SLASD0, computes the singular values and
+ the singular vectors in explicit form.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed
+ in compact form, as follows
+ = 0: Compute singular values only.
+ = 1: Compute singular vectors of upper bidiagonal
+ matrix in compact form.
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The row dimension of the upper bidiagonal matrix. This is
+ also the dimension of the main diagonal array D.
+
+ SQRE (input) INTEGER
+ Specifies the column dimension of the bidiagonal matrix.
+ = 0: The bidiagonal matrix has column dimension M = N;
+ = 1: The bidiagonal matrix has column dimension M = N + 1.
+
+ D (input/output) REAL array, dimension ( N )
+ On entry D contains the main diagonal of the bidiagonal
+ matrix. On exit D, if INFO = 0, contains its singular values.
+
+ E (input) REAL array, dimension ( M-1 )
+ Contains the subdiagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ U (output) REAL array,
+ dimension ( LDU, SMLSIZ ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, U contains the left
+ singular vector matrices of all subproblems at the bottom
+ level.
+
+ LDU (input) INTEGER, LDU = > N.
+ The leading dimension of arrays U, VT, DIFL, DIFR, POLES,
+ GIVNUM, and Z.
+
+ VT (output) REAL array,
+ dimension ( LDU, SMLSIZ+1 ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, VT' contains the right
+ singular vector matrices of all subproblems at the bottom
+ level.
+
+ K (output) INTEGER array, dimension ( N )
+ if ICOMPQ = 1 and dimension 1 if ICOMPQ = 0.
+ If ICOMPQ = 1, on exit, K(I) is the dimension of the I-th
+ secular equation on the computation tree.
+
+ DIFL (output) REAL array, dimension ( LDU, NLVL ),
+ where NLVL = floor(log_2 (N/SMLSIZ)).
+
+ DIFR (output) REAL array,
+ dimension ( LDU, 2 * NLVL ) if ICOMPQ = 1 and
+ dimension ( N ) if ICOMPQ = 0.
+ If ICOMPQ = 1, on exit, DIFL(1:N, I) and DIFR(1:N, 2 * I - 1)
+ record distances between singular values on the I-th
+ level and singular values on the (I -1)-th level, and
+ DIFR(1:N, 2 * I ) contains the normalizing factors for
+ the right singular vector matrix. See SLASD8 for details.
+
+ Z (output) REAL array,
+ dimension ( LDU, NLVL ) if ICOMPQ = 1 and
+ dimension ( N ) if ICOMPQ = 0.
+ The first K elements of Z(1, I) contain the components of
+ the deflation-adjusted updating row vector for subproblems
+ on the I-th level.
+
+ POLES (output) REAL array,
+ dimension ( LDU, 2 * NLVL ) if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, POLES(1, 2*I - 1) and
+ POLES(1, 2*I) contain the new and old singular values
+ involved in the secular equations on the I-th level.
+
+ GIVPTR (output) INTEGER array,
+ dimension ( N ) if ICOMPQ = 1, and not referenced if
+ ICOMPQ = 0. If ICOMPQ = 1, on exit, GIVPTR( I ) records
+ the number of Givens rotations performed on the I-th
+ problem on the computation tree.
+
+ GIVCOL (output) INTEGER array,
+ dimension ( LDGCOL, 2 * NLVL ) if ICOMPQ = 1, and not
+ referenced if ICOMPQ = 0. If ICOMPQ = 1, on exit, for each I,
+ GIVCOL(1, 2 *I - 1) and GIVCOL(1, 2 *I) record the locations
+ of Givens rotations performed on the I-th level on the
+ computation tree.
+
+ LDGCOL (input) INTEGER, LDGCOL >= N.
+ The leading dimension of arrays GIVCOL and PERM.
+
+ PERM (output) INTEGER array, dimension ( LDGCOL, NLVL )
+ if ICOMPQ = 1, and not referenced
+ if ICOMPQ = 0. If ICOMPQ = 1, on exit, PERM(1, I) records
+ permutations done on the I-th level of the computation tree.
+
+ GIVNUM (output) REAL array,
+ dimension ( LDU, 2 * NLVL ) if ICOMPQ = 1, and not
+ referenced if ICOMPQ = 0. If ICOMPQ = 1, on exit, for each I,
+ GIVNUM(1, 2 *I - 1) and GIVNUM(1, 2 *I) record the C- and S-
+ values of Givens rotations performed on the I-th level on
+ the computation tree.
+
+ C (output) REAL array,
+ dimension ( N ) if ICOMPQ = 1, and dimension 1 if ICOMPQ = 0.
+ If ICOMPQ = 1 and the I-th subproblem is not square, on exit,
+ C( I ) contains the C-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ S (output) REAL array, dimension ( N ) if
+ ICOMPQ = 1, and dimension 1 if ICOMPQ = 0. If ICOMPQ = 1
+ and the I-th subproblem is not square, on exit, S( I )
+ contains the S-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ WORK (workspace) REAL array, dimension
+ (6 * N + (SMLSIZ + 1)*(SMLSIZ + 1)).
+
+ IWORK (workspace) INTEGER array, dimension (7*N).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, a singular value did not converge
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ givnum_dim1 = *ldu;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ poles_dim1 = *ldu;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ z_dim1 = *ldu;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ difr_dim1 = *ldu;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ difl_dim1 = *ldu;
+ difl_offset = 1 + difl_dim1;
+ difl -= difl_offset;
+ vt_dim1 = *ldu;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ --k;
+ --givptr;
+ perm_dim1 = *ldgcol;
+ perm_offset = 1 + perm_dim1;
+ perm -= perm_offset;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ --c__;
+ --s;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*smlsiz < 3) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ } else if (*ldu < *n + *sqre) {
+ *info = -8;
+ } else if (*ldgcol < *n) {
+ *info = -17;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASDA", &i__1);
+ return 0;
+ }
+
+ m = *n + *sqre;
+
+/* If the input matrix is too small, call SLASDQ to find the SVD. */
+
+ if (*n <= *smlsiz) {
+ if (*icompq == 0) {
+ slasdq_("U", sqre, n, &c__0, &c__0, &c__0, &d__[1], &e[1], &vt[
+ vt_offset], ldu, &u[u_offset], ldu, &u[u_offset], ldu, &
+ work[1], info);
+ } else {
+ slasdq_("U", sqre, n, &m, n, &c__0, &d__[1], &e[1], &vt[vt_offset]
+ , ldu, &u[u_offset], ldu, &u[u_offset], ldu, &work[1],
+ info);
+ }
+ return 0;
+ }
+
+/* Book-keeping and set up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+ idxq = ndimr + *n;
+ iwk = idxq + *n;
+
+ ncc = 0;
+ nru = 0;
+
+ smlszp = *smlsiz + 1;
+ vf = 1;
+ vl = vf + m;
+ nwork1 = vl + m;
+ nwork2 = nwork1 + smlszp * smlszp;
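+
+/*
+   Illustrative note (added for exposition): WORK(VF) and WORK(VL) hold
+   the two M-vectors of first and last right-singular-vector components
+   that are threaded through SLASD6, WORK(NWORK1) holds an
+   (SMLSIZ+1)-by-(SMLSIZ+1) scratch matrix for the leaf-level SLASDQ
+   calls, and WORK(NWORK2) is the remaining scratch space.
+*/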
+
+ slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ for the nodes on bottom level of the tree, solve
+ their subproblems by SLASDQ.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nlp1 = nl + 1;
+ nr = iwork[ndimr + i1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ idxqi = idxq + nlf - 2;
+ vfi = vf + nlf - 1;
+ vli = vl + nlf - 1;
+ sqrei = 1;
+ if (*icompq == 0) {
+ slaset_("A", &nlp1, &nlp1, &c_b29, &c_b15, &work[nwork1], &smlszp);
+ slasdq_("U", &sqrei, &nl, &nlp1, &nru, &ncc, &d__[nlf], &e[nlf], &
+ work[nwork1], &smlszp, &work[nwork2], &nl, &work[nwork2],
+ &nl, &work[nwork2], info);
+ itemp = nwork1 + nl * smlszp;
+ scopy_(&nlp1, &work[nwork1], &c__1, &work[vfi], &c__1);
+ scopy_(&nlp1, &work[itemp], &c__1, &work[vli], &c__1);
+ } else {
+ slaset_("A", &nl, &nl, &c_b29, &c_b15, &u[nlf + u_dim1], ldu);
+ slaset_("A", &nlp1, &nlp1, &c_b29, &c_b15, &vt[nlf + vt_dim1],
+ ldu);
+ slasdq_("U", &sqrei, &nl, &nlp1, &nl, &ncc, &d__[nlf], &e[nlf], &
+ vt[nlf + vt_dim1], ldu, &u[nlf + u_dim1], ldu, &u[nlf +
+ u_dim1], ldu, &work[nwork1], info);
+ scopy_(&nlp1, &vt[nlf + vt_dim1], &c__1, &work[vfi], &c__1);
+ scopy_(&nlp1, &vt[nlf + nlp1 * vt_dim1], &c__1, &work[vli], &c__1)
+ ;
+ }
+ if (*info != 0) {
+ return 0;
+ }
+ i__2 = nl;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[idxqi + j] = j;
+/* L10: */
+ }
+ if (i__ == nd && *sqre == 0) {
+ sqrei = 0;
+ } else {
+ sqrei = 1;
+ }
+ idxqi += nlp1;
+ vfi += nlp1;
+ vli += nlp1;
+ nrp1 = nr + sqrei;
+ if (*icompq == 0) {
+ slaset_("A", &nrp1, &nrp1, &c_b29, &c_b15, &work[nwork1], &smlszp);
+ slasdq_("U", &sqrei, &nr, &nrp1, &nru, &ncc, &d__[nrf], &e[nrf], &
+ work[nwork1], &smlszp, &work[nwork2], &nr, &work[nwork2],
+ &nr, &work[nwork2], info);
+ itemp = nwork1 + (nrp1 - 1) * smlszp;
+ scopy_(&nrp1, &work[nwork1], &c__1, &work[vfi], &c__1);
+ scopy_(&nrp1, &work[itemp], &c__1, &work[vli], &c__1);
+ } else {
+ slaset_("A", &nr, &nr, &c_b29, &c_b15, &u[nrf + u_dim1], ldu);
+ slaset_("A", &nrp1, &nrp1, &c_b29, &c_b15, &vt[nrf + vt_dim1],
+ ldu);
+ slasdq_("U", &sqrei, &nr, &nrp1, &nr, &ncc, &d__[nrf], &e[nrf], &
+ vt[nrf + vt_dim1], ldu, &u[nrf + u_dim1], ldu, &u[nrf +
+ u_dim1], ldu, &work[nwork1], info);
+ scopy_(&nrp1, &vt[nrf + vt_dim1], &c__1, &work[vfi], &c__1);
+ scopy_(&nrp1, &vt[nrf + nrp1 * vt_dim1], &c__1, &work[vli], &c__1)
+ ;
+ }
+ if (*info != 0) {
+ return 0;
+ }
+ i__2 = nr;
+ for (j = 1; j <= i__2; ++j) {
+ iwork[idxqi + j] = j;
+/* L20: */
+ }
+/* L30: */
+ }
+
+/* Now conquer each subproblem bottom-up. */
+
+ j = pow_ii(&c__2, &nlvl);
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ Find the first node LF and last node LL on
+ the current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ if (i__ == ll) {
+ sqrei = *sqre;
+ } else {
+ sqrei = 1;
+ }
+ vfi = vf + nlf - 1;
+ vli = vl + nlf - 1;
+ idxqi = idxq + nlf - 1;
+ alpha = d__[ic];
+ beta = e[ic];
+ if (*icompq == 0) {
+ slasd6_(icompq, &nl, &nr, &sqrei, &d__[nlf], &work[vfi], &
+ work[vli], &alpha, &beta, &iwork[idxqi], &perm[
+ perm_offset], &givptr[1], &givcol[givcol_offset],
+ ldgcol, &givnum[givnum_offset], ldu, &poles[
+ poles_offset], &difl[difl_offset], &difr[difr_offset],
+ &z__[z_offset], &k[1], &c__[1], &s[1], &work[nwork1],
+ &iwork[iwk], info);
+ } else {
+ --j;
+ slasd6_(icompq, &nl, &nr, &sqrei, &d__[nlf], &work[vfi], &
+ work[vli], &alpha, &beta, &iwork[idxqi], &perm[nlf +
+ lvl * perm_dim1], &givptr[j], &givcol[nlf + lvl2 *
+ givcol_dim1], ldgcol, &givnum[nlf + lvl2 *
+ givnum_dim1], ldu, &poles[nlf + lvl2 * poles_dim1], &
+ difl[nlf + lvl * difl_dim1], &difr[nlf + lvl2 *
+ difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[j],
+ &s[j], &work[nwork1], &iwork[iwk], info);
+ }
+ if (*info != 0) {
+ return 0;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+
+ return 0;
+
+/* End of SLASDA */
+
+} /* slasda_ */
+
+/* Subroutine */ int slasdq_(char *uplo, integer *sqre, integer *n, integer *
+ ncvt, integer *nru, integer *ncc, real *d__, real *e, real *vt,
+ integer *ldvt, real *u, integer *ldu, real *c__, integer *ldc, real *
+ work, integer *info)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static real r__, cs, sn;
+ static integer np1, isub;
+ static real smin;
+ static integer sqre1;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int slasr_(char *, char *, char *, integer *,
+ integer *, real *, real *, real *, integer *);
+ static integer iuplo;
+ extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *,
+ integer *), xerbla_(char *, integer *), slartg_(real *,
+ real *, real *, real *, real *);
+ static logical rotate;
+ extern /* Subroutine */ int sbdsqr_(char *, integer *, integer *, integer
+ *, integer *, real *, real *, real *, integer *, real *, integer *
+ , real *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASDQ computes the singular value decomposition (SVD) of a real
+ (upper or lower) bidiagonal matrix with diagonal D and offdiagonal
+ E, accumulating the transformations if desired. Letting B denote
+ the input bidiagonal matrix, the algorithm computes orthogonal
+ matrices Q and P such that B = Q * S * P' (P' denotes the transpose
+ of P). The singular values S are overwritten on D.
+
+ The input matrix U is changed to U * Q if desired.
+ The input matrix VT is changed to P' * VT if desired.
+ The input matrix C is changed to Q' * C if desired.
+
+ See "Computing Small Singular Values of Bidiagonal Matrices With
+ Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+ LAPACK Working Note #3, for a detailed description of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ On entry, UPLO specifies whether the input bidiagonal matrix
+ is upper or lower bidiagonal, and whether it is square or
+ not.
+ UPLO = 'U' or 'u' B is upper bidiagonal.
+ UPLO = 'L' or 'l' B is lower bidiagonal.
+
+ SQRE (input) INTEGER
+ = 0: then the input matrix is N-by-N.
+ = 1: then the input matrix is N-by-(N+1) if UPLO = 'U' and
+ (N+1)-by-N if UPLO = 'L'.
+
+ The bidiagonal matrix has
+ N rows and M = N + SQRE >= N columns.
+
+ N (input) INTEGER
+ On entry, N specifies the number of rows and columns
+ in the matrix. N must be at least 0.
+
+ NCVT (input) INTEGER
+ On entry, NCVT specifies the number of columns of
+ the matrix VT. NCVT must be at least 0.
+
+ NRU (input) INTEGER
+ On entry, NRU specifies the number of rows of
+ the matrix U. NRU must be at least 0.
+
+ NCC (input) INTEGER
+ On entry, NCC specifies the number of columns of
+ the matrix C. NCC must be at least 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, D contains the diagonal entries of the
+ bidiagonal matrix whose SVD is desired. On normal exit,
+ D contains the singular values in ascending order.
+
+ E (input/output) REAL array.
+ dimension is (N-1) if SQRE = 0 and N if SQRE = 1.
+ On entry, the entries of E contain the offdiagonal entries
+ of the bidiagonal matrix whose SVD is desired. On normal
+ exit, E will contain 0. If the algorithm does not converge,
+ D and E will contain the diagonal and superdiagonal entries
+ of a bidiagonal matrix orthogonally equivalent to the one
+ given as input.
+
+ VT (input/output) REAL array, dimension (LDVT, NCVT)
+ On entry, contains a matrix which on exit has been
+ premultiplied by P', dimension N-by-NCVT if SQRE = 0
+ and (N+1)-by-NCVT if SQRE = 1 (not referenced if NCVT=0).
+
+ LDVT (input) INTEGER
+ On entry, LDVT specifies the leading dimension of VT as
+ declared in the calling (sub) program. LDVT must be at
+ least 1. If NCVT is nonzero LDVT must also be at least N.
+
+ U (input/output) REAL array, dimension (LDU, N)
+ On entry, contains a matrix which on exit has been
+ postmultiplied by Q, dimension NRU-by-N if SQRE = 0
+ and NRU-by-(N+1) if SQRE = 1 (not referenced if NRU=0).
+
+ LDU (input) INTEGER
+ On entry, LDU specifies the leading dimension of U as
+ declared in the calling (sub) program. LDU must be at
+ least max( 1, NRU ) .
+
+ C (input/output) REAL array, dimension (LDC, NCC)
+ On entry, contains an N-by-NCC matrix which on exit
+ has been premultiplied by Q', dimension N-by-NCC if SQRE = 0
+ and (N+1)-by-NCC if SQRE = 1 (not referenced if NCC=0).
+
+ LDC (input) INTEGER
+ On entry, LDC specifies the leading dimension of C as
+ declared in the calling (sub) program. LDC must be at
+ least 1. If NCC is nonzero, LDC must also be at least N.
+
+ WORK (workspace) REAL array, dimension (4*N)
+ Workspace. Only referenced if one of NCVT, NRU, or NCC is
+ nonzero, and if N is at least 2.
+
+ INFO (output) INTEGER
+ On exit, a value of 0 indicates a successful exit.
+ If INFO < 0, argument number -INFO is illegal.
+ If INFO > 0, the algorithm did not converge, and INFO
+ specifies how many superdiagonals did not converge.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ iuplo = 0;
+ if (lsame_(uplo, "U")) {
+ iuplo = 1;
+ }
+ if (lsame_(uplo, "L")) {
+ iuplo = 2;
+ }
+ if (iuplo == 0) {
+ *info = -1;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ncvt < 0) {
+ *info = -4;
+ } else if (*nru < 0) {
+ *info = -5;
+ } else if (*ncc < 0) {
+ *info = -6;
+ } else if (*ncvt == 0 && *ldvt < 1 || *ncvt > 0 && *ldvt < max(1,*n)) {
+ *info = -10;
+ } else if (*ldu < max(1,*nru)) {
+ *info = -12;
+ } else if (*ncc == 0 && *ldc < 1 || *ncc > 0 && *ldc < max(1,*n)) {
+ *info = -14;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASDQ", &i__1);
+ return 0;
+ }
+ if (*n == 0) {
+ return 0;
+ }
+
+/* ROTATE is true if any singular vectors desired, false otherwise */
+
+ rotate = *ncvt > 0 || *nru > 0 || *ncc > 0;
+ np1 = *n + 1;
+ sqre1 = *sqre;
+
+/*
+ If matrix non-square upper bidiagonal, rotate to be lower
+ bidiagonal. The rotations are on the right.
+*/
+
+ if (iuplo == 1 && sqre1 == 1) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (rotate) {
+ work[i__] = cs;
+ work[*n + i__] = sn;
+ }
+/* L10: */
+ }
+ slartg_(&d__[*n], &e[*n], &cs, &sn, &r__);
+ d__[*n] = r__;
+ e[*n] = 0.f;
+ if (rotate) {
+ work[*n] = cs;
+ work[*n + *n] = sn;
+ }
+ iuplo = 2;
+ sqre1 = 0;
+
+/* Update singular vectors if desired. */
+
+ if (*ncvt > 0) {
+ slasr_("L", "V", "F", &np1, ncvt, &work[1], &work[np1], &vt[
+ vt_offset], ldvt);
+ }
+ }
+
+/*
+ If matrix lower bidiagonal, rotate to be upper bidiagonal
+ by applying Givens rotations on the left.
+*/
+
+ if (iuplo == 2) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (rotate) {
+ work[i__] = cs;
+ work[*n + i__] = sn;
+ }
+/* L20: */
+ }
+
+/*
+ If matrix (N+1)-by-N lower bidiagonal, one additional
+ rotation is needed.
+*/
+
+ if (sqre1 == 1) {
+ slartg_(&d__[*n], &e[*n], &cs, &sn, &r__);
+ d__[*n] = r__;
+ if (rotate) {
+ work[*n] = cs;
+ work[*n + *n] = sn;
+ }
+ }
+
+/* Update singular vectors if desired. */
+
+ if (*nru > 0) {
+ if (sqre1 == 0) {
+ slasr_("R", "V", "F", nru, n, &work[1], &work[np1], &u[
+ u_offset], ldu);
+ } else {
+ slasr_("R", "V", "F", nru, &np1, &work[1], &work[np1], &u[
+ u_offset], ldu);
+ }
+ }
+ if (*ncc > 0) {
+ if (sqre1 == 0) {
+ slasr_("L", "V", "F", n, ncc, &work[1], &work[np1], &c__[
+ c_offset], ldc);
+ } else {
+ slasr_("L", "V", "F", &np1, ncc, &work[1], &work[np1], &c__[
+ c_offset], ldc);
+ }
+ }
+ }
+
+/*
+ Call SBDSQR to compute the SVD of the reduced real
+ N-by-N upper bidiagonal matrix.
+*/
+
+ sbdsqr_("U", n, ncvt, nru, ncc, &d__[1], &e[1], &vt[vt_offset], ldvt, &u[
+ u_offset], ldu, &c__[c_offset], ldc, &work[1], info);
+
+/*
+ Sort the singular values into ascending order (insertion sort on
+ singular values, but only one transposition per singular vector)
+*/
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Scan for smallest D(I). */
+
+ isub = i__;
+ smin = d__[i__];
+ i__2 = *n;
+ for (j = i__ + 1; j <= i__2; ++j) {
+ if (d__[j] < smin) {
+ isub = j;
+ smin = d__[j];
+ }
+/* L30: */
+ }
+ if (isub != i__) {
+
+/* Swap singular values and vectors. */
+
+ d__[isub] = d__[i__];
+ d__[i__] = smin;
+ if (*ncvt > 0) {
+ sswap_(ncvt, &vt[isub + vt_dim1], ldvt, &vt[i__ + vt_dim1],
+ ldvt);
+ }
+ if (*nru > 0) {
+ sswap_(nru, &u[isub * u_dim1 + 1], &c__1, &u[i__ * u_dim1 + 1]
+ , &c__1);
+ }
+ if (*ncc > 0) {
+ sswap_(ncc, &c__[isub + c_dim1], ldc, &c__[i__ + c_dim1], ldc)
+ ;
+ }
+ }
+/* L40: */
+ }
+
+ return 0;
+
+/* End of SLASDQ */
+
+} /* slasdq_ */
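+
+/*
+   Illustrative usage sketch (not part of the original f2c translation):
+   computing only the singular values of a small upper bidiagonal matrix
+   with SLASDQ.  It assumes the f2c typedefs (integer, real) already in
+   scope in this file; the example function name is hypothetical.  With
+   NCVT = NRU = NCC = 0 the vector arguments VT, U and C are not
+   referenced, so a dummy scalar can be passed for them.
+*/
+static int example_slasdq_values_only(real sv[3])
+{
+    integer sqre = 0, n = 3, ncvt = 0, nru = 0, ncc = 0;
+    integer ldvt = 1, ldu = 1, ldc = 1, info = 0;
+    real d[3] = { 3.f, 2.f, 1.f };    /* diagonal of B */
+    real e[2] = { .5f, .25f };        /* superdiagonal of B (SQRE = 0) */
+    real work[12];                    /* 4*N workspace */
+    real dummy = 0.f;                 /* stands in for VT, U and C */
+
+    slasdq_("U", &sqre, &n, &ncvt, &nru, &ncc, d, e,
+            &dummy, &ldvt, &dummy, &ldu, &dummy, &ldc, work, &info);
+    if (info != 0) {
+        return (int) info;            /* > 0 means no convergence */
+    }
+    sv[0] = d[0];                     /* singular values, ascending order */
+    sv[1] = d[1];
+    sv[2] = d[2];
+    return 0;
+}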
+
+/* Subroutine */ int slasdt_(integer *n, integer *lvl, integer *nd, integer *
+ inode, integer *ndiml, integer *ndimr, integer *msub)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+
+ /* Local variables */
+ static integer i__, il, ir, maxn;
+ static real temp;
+ static integer nlvl, llst, ncrnt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ SLASDT creates a tree of subproblems for bidiagonal divide and
+ conquer.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ On entry, the number of diagonal elements of the
+ bidiagonal matrix.
+
+ LVL (output) INTEGER
+ On exit, the number of levels on the computation tree.
+
+ ND (output) INTEGER
+ On exit, the number of nodes on the tree.
+
+ INODE (output) INTEGER array, dimension ( N )
+ On exit, centers of subproblems.
+
+ NDIML (output) INTEGER array, dimension ( N )
+ On exit, row dimensions of left children.
+
+ NDIMR (output) INTEGER array, dimension ( N )
+ On exit, row dimensions of right children.
+
+ MSUB (input) INTEGER
+ On entry, the maximum row dimension each subproblem at the
+ bottom of the tree can have.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Find the number of levels on the tree.
+*/
+
+ /* Parameter adjustments */
+ --ndimr;
+ --ndiml;
+ --inode;
+
+ /* Function Body */
+ maxn = max(1,*n);
+ temp = log((real) maxn / (real) (*msub + 1)) / log(2.f);
+ *lvl = (integer) temp + 1;
+
+ i__ = *n / 2;
+ inode[1] = i__ + 1;
+ ndiml[1] = i__;
+ ndimr[1] = *n - i__ - 1;
+ il = 0;
+ ir = 1;
+ llst = 1;
+ i__1 = *lvl - 1;
+ for (nlvl = 1; nlvl <= i__1; ++nlvl) {
+
+/*
+ Constructing the tree at (NLVL+1)-st level. The number of
+ nodes created on this level is LLST * 2.
+*/
+
+ i__2 = llst - 1;
+ for (i__ = 0; i__ <= i__2; ++i__) {
+ il += 2;
+ ir += 2;
+ ncrnt = llst + i__;
+ ndiml[il] = ndiml[ncrnt] / 2;
+ ndimr[il] = ndiml[ncrnt] - ndiml[il] - 1;
+ inode[il] = inode[ncrnt] - ndimr[il] - 1;
+ ndiml[ir] = ndimr[ncrnt] / 2;
+ ndimr[ir] = ndimr[ncrnt] - ndiml[ir] - 1;
+ inode[ir] = inode[ncrnt] + ndiml[ir] + 1;
+/* L10: */
+ }
+ llst <<= 1;
+/* L20: */
+ }
+ *nd = (llst << 1) - 1;
+
+ return 0;
+
+/* End of SLASDT */
+
+} /* slasdt_ */
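+
+/*
+   Illustrative sketch (not part of the original f2c translation): building
+   the divide-and-conquer subproblem tree for an N = 10 problem with leaf
+   subproblems of size at most MSUB = 2.  Assumes the f2c typedef (integer)
+   already in scope; the example function name is hypothetical.
+*/
+static void example_slasdt_tree(integer *lvl, integer *nd)
+{
+    integer n = 10, msub = 2;
+    integer inode[10], ndiml[10], ndimr[10];  /* dimension N suffices */
+
+    slasdt_(&n, lvl, nd, inode, ndiml, ndimr, &msub);
+    /* On return *lvl holds the number of tree levels and *nd the number
+       of nodes; inode[i], ndiml[i] and ndimr[i] give the center of node
+       i+1 and the row dimensions of its left and right children. */
+}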
+
+/* Subroutine */ int slaset_(char *uplo, integer *m, integer *n, real *alpha,
+ real *beta, real *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASET initializes an m-by-n matrix A to BETA on the diagonal and
+ ALPHA on the offdiagonals.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be set.
+ = 'U': Upper triangular part is set; the strictly lower
+ triangular part of A is not changed.
+ = 'L': Lower triangular part is set; the strictly upper
+ triangular part of A is not changed.
+ Otherwise: All of the matrix A is set.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ ALPHA (input) REAL
+ The constant to which the offdiagonal elements are to be set.
+
+ BETA (input) REAL
+ The constant to which the diagonal elements are to be set.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On exit, the leading m-by-n submatrix of A is set as follows:
+
+ if UPLO = 'U', A(i,j) = ALPHA, 1<=i<=j-1, 1<=j<=n,
+ if UPLO = 'L', A(i,j) = ALPHA, j+1<=i<=m, 1<=j<=n,
+ otherwise, A(i,j) = ALPHA, 1<=i<=m, 1<=j<=n, i.ne.j,
+
+ and, for all UPLO, A(i,i) = BETA, 1<=i<=min(m,n).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+
+/*
+ Set the strictly upper triangular or trapezoidal part of the
+ array to ALPHA.
+*/
+
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j - 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = *alpha;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+
+/*
+ Set the strictly lower triangular or trapezoidal part of the
+ array to ALPHA.
+*/
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = *alpha;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ } else {
+
+/* Set the leading m-by-n submatrix to ALPHA. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = *alpha;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+/* Set the first min(M,N) diagonal elements to BETA. */
+
+ i__1 = min(*m,*n);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ a[i__ + i__ * a_dim1] = *beta;
+/* L70: */
+ }
+
+ return 0;
+
+/* End of SLASET */
+
+} /* slaset_ */
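+
+/*
+   Illustrative sketch (not part of the original f2c translation): using
+   SLASET to form a 3-by-3 identity matrix by setting the off-diagonal
+   entries to ALPHA = 0 and the diagonal to BETA = 1.  Assumes the f2c
+   typedefs (integer, real); the example function name is hypothetical.
+*/
+static void example_slaset_identity(real a[9])
+{
+    integer m = 3, n = 3, lda = 3;
+    real zero = 0.f, one = 1.f;
+
+    /* "A": set the whole matrix, not just a triangular part. */
+    slaset_("A", &m, &n, &zero, &one, a, &lda);
+}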
+
+/* Subroutine */ int slasq1_(integer *n, real *d__, real *e, real *work,
+ integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ real r__1, r__2, r__3;
+
+ /* Local variables */
+ static integer i__;
+ static real eps;
+ extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *)
+ ;
+ static real scale;
+ static integer iinfo;
+ static real sigmn, sigmx;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), slasq2_(integer *, real *, integer *);
+ extern doublereal slamch_(char *);
+ static real safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slascl_(
+ char *, integer *, integer *, real *, real *, integer *, integer *
+ , real *, integer *, integer *), slasrt_(char *, integer *
+ , real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ SLASQ1 computes the singular values of a real N-by-N bidiagonal
+ matrix with diagonal D and off-diagonal E. The singular values
+ are computed to high relative accuracy, in the absence of
+ denormalization, underflow and overflow. The algorithm was first
+ presented in
+
+ "Accurate singular values and differential qd algorithms" by K. V.
+ Fernando and B. N. Parlett, Numer. Math., Vol-67, No. 2, pp. 191-230,
+ 1994,
+
+ and the present implementation is described in "An implementation of
+ the dqds Algorithm (Positive Case)", LAPACK Working Note.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of rows and columns in the matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, D contains the diagonal elements of the
+ bidiagonal matrix whose SVD is desired. On normal exit,
+ D contains the singular values in decreasing order.
+
+ E (input/output) REAL array, dimension (N)
+ On entry, elements E(1:N-1) contain the off-diagonal elements
+ of the bidiagonal matrix whose SVD is desired.
+ On exit, E is overwritten.
+
+ WORK (workspace) REAL array, dimension (4*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm failed
+ = 1, a split was marked by a positive value in E
+ = 2, current block of Z not diagonalized after 30*N
+ iterations (in inner while loop)
+ = 3, termination criterion of outer while loop not met
+ (program created more than N unreduced blocks)
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --work;
+ --e;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -2;
+ i__1 = -(*info);
+ xerbla_("SLASQ1", &i__1);
+ return 0;
+ } else if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+ d__[1] = dabs(d__[1]);
+ return 0;
+ } else if (*n == 2) {
+ slas2_(&d__[1], &e[1], &d__[2], &sigmn, &sigmx);
+ d__[1] = sigmx;
+ d__[2] = sigmn;
+ return 0;
+ }
+
+/* Estimate the largest singular value. */
+
+ sigmx = 0.f;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = (r__1 = d__[i__], dabs(r__1));
+/* Computing MAX */
+ r__2 = sigmx, r__3 = (r__1 = e[i__], dabs(r__1));
+ sigmx = dmax(r__2,r__3);
+/* L10: */
+ }
+ d__[*n] = (r__1 = d__[*n], dabs(r__1));
+
+/* Early return if SIGMX is zero (matrix is already diagonal). */
+
+ if (sigmx == 0.f) {
+ slasrt_("D", n, &d__[1], &iinfo);
+ return 0;
+ }
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ r__1 = sigmx, r__2 = d__[i__];
+ sigmx = dmax(r__1,r__2);
+/* L20: */
+ }
+
+/*
+ Copy D and E into WORK (in the Z format) and scale (squaring the
+ input data makes scaling by a power of the radix pointless).
+*/
+
+ eps = slamch_("Precision");
+ safmin = slamch_("Safe minimum");
+ scale = sqrt(eps / safmin);
+ scopy_(n, &d__[1], &c__1, &work[1], &c__2);
+ i__1 = *n - 1;
+ scopy_(&i__1, &e[1], &c__1, &work[2], &c__2);
+ i__1 = (*n << 1) - 1;
+ i__2 = (*n << 1) - 1;
+ slascl_("G", &c__0, &c__0, &sigmx, &scale, &i__1, &c__1, &work[1], &i__2,
+ &iinfo);
+
+/* Compute the q's and e's. */
+
+ i__1 = (*n << 1) - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing 2nd power */
+ r__1 = work[i__];
+ work[i__] = r__1 * r__1;
+/* L30: */
+ }
+ work[*n * 2] = 0.f;
+
+ slasq2_(n, &work[1], info);
+
+ if (*info == 0) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = sqrt(work[i__]);
+/* L40: */
+ }
+ slascl_("G", &c__0, &c__0, &scale, &sigmx, n, &c__1, &d__[1], n, &
+ iinfo);
+ }
+
+ return 0;
+
+/* End of SLASQ1 */
+
+} /* slasq1_ */
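+
+/*
+   Illustrative sketch (not part of the original f2c translation): singular
+   values of a 3-by-3 upper bidiagonal matrix via SLASQ1, which needs only
+   D, E and a 4*N workspace.  Assumes the f2c typedefs (integer, real); the
+   example function name is hypothetical.
+*/
+static int example_slasq1_values(real sv[3])
+{
+    integer n = 3, info = 0;
+    real d[3] = { 3.f, 2.f, 1.f };    /* diagonal */
+    real e[3] = { .5f, .25f, 0.f };   /* only e[0..n-2] are read */
+    real work[12];                    /* 4*N workspace */
+
+    slasq1_(&n, d, e, work, &info);
+    if (info != 0) {
+        return (int) info;
+    }
+    sv[0] = d[0];                     /* singular values, decreasing order */
+    sv[1] = d[1];
+    sv[2] = d[2];
+    return 0;
+}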
+
+/* Subroutine */ int slasq2_(integer *n, real *z__, integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real d__, e, g;
+ static integer k;
+ static real s, t;
+ static integer i0, i4, n0;
+ static real dn;
+ static integer pp;
+ static real dn1, dn2, dee, eps, tau, tol;
+ static integer ipn4;
+ static real tol2;
+ static logical ieee;
+ static integer nbig;
+ static real dmin__, emin, emax;
+ static integer kmin, ndiv, iter;
+ static real qmin, temp, qmax, zmax;
+ static integer splt;
+ static real dmin1, dmin2;
+ static integer nfail;
+ static real desig, trace, sigma;
+ static integer iinfo, ttype;
+ extern /* Subroutine */ int slasq3_(integer *, integer *, real *, integer
+ *, real *, real *, real *, real *, integer *, integer *, integer *
+ , logical *, integer *, real *, real *, real *, real *, real *,
+ real *, real *);
+ static real deemin;
+ extern doublereal slamch_(char *);
+ static integer iwhila, iwhilb;
+ static real oldemn, safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slasrt_(
+ char *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ SLASQ2 computes all the eigenvalues of the symmetric positive
+ definite tridiagonal matrix associated with the qd array Z to high
+ relative accuracy, in the absence of denormalization, underflow
+ and overflow.
+
+ To see the relation of Z to the tridiagonal matrix, let L be a
+ unit lower bidiagonal matrix with subdiagonals Z(2,4,6,,..) and
+ let U be an upper bidiagonal matrix with 1's above and diagonal
+ Z(1,3,5,,..). The tridiagonal is L*U or, if you prefer, the
+ symmetric tridiagonal to which it is similar.
+
+ Note : SLASQ2 defines a logical variable, IEEE, which is true
+ on machines which follow the IEEE-754 floating-point standard in their
+ handling of infinities and NaNs, and false otherwise. This variable
+ is passed to SLASQ3.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of rows and columns in the matrix. N >= 0.
+
+ Z (input/output) REAL array, dimension ( 4*N )
+ On entry Z holds the qd array. On exit, entries 1 to N hold
+ the eigenvalues in decreasing order, Z( 2*N+1 ) holds the
+ trace, and Z( 2*N+2 ) holds the sum of the eigenvalues. If
+ N > 2, then Z( 2*N+3 ) holds the iteration count, Z( 2*N+4 )
+ holds NDIVS/NIN^2, and Z( 2*N+5 ) holds the percentage of
+ shifts that failed.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if the i-th argument is a scalar and had an illegal
+ value, then INFO = -i; if the i-th argument is an
+ array and the j-th entry had an illegal value, then
+ INFO = -(i*100+j)
+ > 0: the algorithm failed
+ = 1, a split was marked by a positive value in E
+ = 2, current block of Z not diagonalized after 30*N
+ iterations (in inner while loop)
+ = 3, termination criterion of outer while loop not met
+ (program created more than N unreduced blocks)
+
+ Further Details
+ ===============
+ Local Variables: I0:N0 defines a current unreduced segment of Z.
+ The shifts are accumulated in SIGMA. Iteration count is in ITER.
+ Ping-pong is controlled by PP (alternates between 0 and 1).
+
+ =====================================================================
+
+
+ Test the input arguments.
+ (in case SLASQ2 is not called by SLASQ1)
+*/
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ *info = 0;
+ eps = slamch_("Precision");
+ safmin = slamch_("Safe minimum");
+ tol = eps * 100.f;
+/* Computing 2nd power */
+ r__1 = tol;
+ tol2 = r__1 * r__1;
+
+ if (*n < 0) {
+ *info = -1;
+ xerbla_("SLASQ2", &c__1);
+ return 0;
+ } else if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+
+/* 1-by-1 case. */
+
+ if (z__[1] < 0.f) {
+ *info = -201;
+ xerbla_("SLASQ2", &c__2);
+ }
+ return 0;
+ } else if (*n == 2) {
+
+/* 2-by-2 case. */
+
+ if (z__[2] < 0.f || z__[3] < 0.f) {
+ *info = -2;
+ xerbla_("SLASQ2", &c__2);
+ return 0;
+ } else if (z__[3] > z__[1]) {
+ d__ = z__[3];
+ z__[3] = z__[1];
+ z__[1] = d__;
+ }
+ z__[5] = z__[1] + z__[2] + z__[3];
+ if (z__[2] > z__[3] * tol2) {
+ t = (z__[1] - z__[3] + z__[2]) * .5f;
+ s = z__[3] * (z__[2] / t);
+ if (s <= t) {
+ s = z__[3] * (z__[2] / (t * (sqrt(s / t + 1.f) + 1.f)));
+ } else {
+ s = z__[3] * (z__[2] / (t + sqrt(t) * sqrt(t + s)));
+ }
+ t = z__[1] + (s + z__[2]);
+ z__[3] *= z__[1] / t;
+ z__[1] = t;
+ }
+ z__[2] = z__[3];
+ z__[6] = z__[2] + z__[1];
+ return 0;
+ }
+
+/* Check for negative data and compute sums of q's and e's. */
+
+ z__[*n * 2] = 0.f;
+ emin = z__[2];
+ qmax = 0.f;
+ zmax = 0.f;
+ d__ = 0.f;
+ e = 0.f;
+
+ i__1 = *n - 1 << 1;
+ for (k = 1; k <= i__1; k += 2) {
+ if (z__[k] < 0.f) {
+ *info = -(k + 200);
+ xerbla_("SLASQ2", &c__2);
+ return 0;
+ } else if (z__[k + 1] < 0.f) {
+ *info = -(k + 201);
+ xerbla_("SLASQ2", &c__2);
+ return 0;
+ }
+ d__ += z__[k];
+ e += z__[k + 1];
+/* Computing MAX */
+ r__1 = qmax, r__2 = z__[k];
+ qmax = dmax(r__1,r__2);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[k + 1];
+ emin = dmin(r__1,r__2);
+/* Computing MAX */
+ r__1 = max(qmax,zmax), r__2 = z__[k + 1];
+ zmax = dmax(r__1,r__2);
+/* L10: */
+ }
+ if (z__[(*n << 1) - 1] < 0.f) {
+ *info = -((*n << 1) + 199);
+ xerbla_("SLASQ2", &c__2);
+ return 0;
+ }
+ d__ += z__[(*n << 1) - 1];
+/* Computing MAX */
+ r__1 = qmax, r__2 = z__[(*n << 1) - 1];
+ qmax = dmax(r__1,r__2);
+ zmax = dmax(qmax,zmax);
+
+/* Check for diagonality. */
+
+ if (e == 0.f) {
+ i__1 = *n;
+ for (k = 2; k <= i__1; ++k) {
+ z__[k] = z__[(k << 1) - 1];
+/* L20: */
+ }
+ slasrt_("D", n, &z__[1], &iinfo);
+ z__[(*n << 1) - 1] = d__;
+ return 0;
+ }
+
+ trace = d__ + e;
+
+/* Check for zero data. */
+
+ if (trace == 0.f) {
+ z__[(*n << 1) - 1] = 0.f;
+ return 0;
+ }
+
+/*
+ Check whether the machine is IEEE conformable.
+
+ IEEE = ILAENV( 10, 'SLASQ2', 'N', 1, 2, 3, 4 ).EQ.1 .AND.
+ $ ILAENV( 11, 'SLASQ2', 'N', 1, 2, 3, 4 ).EQ.1
+
+ [11/15/2008] The case IEEE=.TRUE. has a problem in single precision with
+ some of the test matrices of type 16. The double precision code is fine.
+*/
+
+ ieee = FALSE_;
+
+/* Rearrange data for locality: Z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...). */
+
+ for (k = *n << 1; k >= 2; k += -2) {
+ z__[k * 2] = 0.f;
+ z__[(k << 1) - 1] = z__[k];
+ z__[(k << 1) - 2] = 0.f;
+ z__[(k << 1) - 3] = z__[k - 1];
+/* L30: */
+ }
+
+ i0 = 1;
+ n0 = *n;
+
+/* Reverse the qd-array, if warranted. */
+
+ if (z__[(i0 << 2) - 3] * 1.5f < z__[(n0 << 2) - 3]) {
+ ipn4 = i0 + n0 << 2;
+ i__1 = i0 + n0 - 1 << 1;
+ for (i4 = i0 << 2; i4 <= i__1; i4 += 4) {
+ temp = z__[i4 - 3];
+ z__[i4 - 3] = z__[ipn4 - i4 - 3];
+ z__[ipn4 - i4 - 3] = temp;
+ temp = z__[i4 - 1];
+ z__[i4 - 1] = z__[ipn4 - i4 - 5];
+ z__[ipn4 - i4 - 5] = temp;
+/* L40: */
+ }
+ }
+
+/* Initial split checking via dqd and Li's test. */
+
+ pp = 0;
+
+ for (k = 1; k <= 2; ++k) {
+
+ d__ = z__[(n0 << 2) + pp - 3];
+ i__1 = (i0 << 2) + pp;
+ for (i4 = (n0 - 1 << 2) + pp; i4 >= i__1; i4 += -4) {
+ if (z__[i4 - 1] <= tol2 * d__) {
+ z__[i4 - 1] = -0.f;
+ d__ = z__[i4 - 3];
+ } else {
+ d__ = z__[i4 - 3] * (d__ / (d__ + z__[i4 - 1]));
+ }
+/* L50: */
+ }
+
+/* dqd maps Z to ZZ plus Li's test. */
+
+ emin = z__[(i0 << 2) + pp + 1];
+ d__ = z__[(i0 << 2) + pp - 3];
+ i__1 = (n0 - 1 << 2) + pp;
+ for (i4 = (i0 << 2) + pp; i4 <= i__1; i4 += 4) {
+ z__[i4 - (pp << 1) - 2] = d__ + z__[i4 - 1];
+ if (z__[i4 - 1] <= tol2 * d__) {
+ z__[i4 - 1] = -0.f;
+ z__[i4 - (pp << 1) - 2] = d__;
+ z__[i4 - (pp << 1)] = 0.f;
+ d__ = z__[i4 + 1];
+ } else if (safmin * z__[i4 + 1] < z__[i4 - (pp << 1) - 2] &&
+ safmin * z__[i4 - (pp << 1) - 2] < z__[i4 + 1]) {
+ temp = z__[i4 + 1] / z__[i4 - (pp << 1) - 2];
+ z__[i4 - (pp << 1)] = z__[i4 - 1] * temp;
+ d__ *= temp;
+ } else {
+ z__[i4 - (pp << 1)] = z__[i4 + 1] * (z__[i4 - 1] / z__[i4 - (
+ pp << 1) - 2]);
+ d__ = z__[i4 + 1] * (d__ / z__[i4 - (pp << 1) - 2]);
+ }
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[i4 - (pp << 1)];
+ emin = dmin(r__1,r__2);
+/* L60: */
+ }
+ z__[(n0 << 2) - pp - 2] = d__;
+
+/* Now find qmax. */
+
+ qmax = z__[(i0 << 2) - pp - 2];
+ i__1 = (n0 << 2) - pp - 2;
+ for (i4 = (i0 << 2) - pp + 2; i4 <= i__1; i4 += 4) {
+/* Computing MAX */
+ r__1 = qmax, r__2 = z__[i4];
+ qmax = dmax(r__1,r__2);
+/* L70: */
+ }
+
+/* Prepare for the next iteration on K. */
+
+ pp = 1 - pp;
+/* L80: */
+ }
+
+/* Initialise variables to pass to SLASQ3. */
+
+ ttype = 0;
+ dmin1 = 0.f;
+ dmin2 = 0.f;
+ dn = 0.f;
+ dn1 = 0.f;
+ dn2 = 0.f;
+ g = 0.f;
+ tau = 0.f;
+
+ iter = 2;
+ nfail = 0;
+ ndiv = n0 - i0 << 1;
+
+ i__1 = *n + 1;
+ for (iwhila = 1; iwhila <= i__1; ++iwhila) {
+ if (n0 < 1) {
+ goto L170;
+ }
+
+/*
+ While array unfinished do
+
+ E(N0) holds the value of SIGMA when submatrix in I0:N0
+ splits from the rest of the array, but is negated.
+*/
+
+ desig = 0.f;
+ if (n0 == *n) {
+ sigma = 0.f;
+ } else {
+ sigma = -z__[(n0 << 2) - 1];
+ }
+ if (sigma < 0.f) {
+ *info = 1;
+ return 0;
+ }
+
+/*
+ Find last unreduced submatrix's top index I0, find QMAX and
+ EMIN. Find Gershgorin-type bound if Q's much greater than E's.
+*/
+
+ emax = 0.f;
+ if (n0 > i0) {
+ emin = (r__1 = z__[(n0 << 2) - 5], dabs(r__1));
+ } else {
+ emin = 0.f;
+ }
+ qmin = z__[(n0 << 2) - 3];
+ qmax = qmin;
+ for (i4 = n0 << 2; i4 >= 8; i4 += -4) {
+ if (z__[i4 - 5] <= 0.f) {
+ goto L100;
+ }
+ if (qmin >= emax * 4.f) {
+/* Computing MIN */
+ r__1 = qmin, r__2 = z__[i4 - 3];
+ qmin = dmin(r__1,r__2);
+/* Computing MAX */
+ r__1 = emax, r__2 = z__[i4 - 5];
+ emax = dmax(r__1,r__2);
+ }
+/* Computing MAX */
+ r__1 = qmax, r__2 = z__[i4 - 7] + z__[i4 - 5];
+ qmax = dmax(r__1,r__2);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[i4 - 5];
+ emin = dmin(r__1,r__2);
+/* L90: */
+ }
+ i4 = 4;
+
+L100:
+ i0 = i4 / 4;
+ pp = 0;
+
+ if (n0 - i0 > 1) {
+ dee = z__[(i0 << 2) - 3];
+ deemin = dee;
+ kmin = i0;
+ i__2 = (n0 << 2) - 3;
+ for (i4 = (i0 << 2) + 1; i4 <= i__2; i4 += 4) {
+ dee = z__[i4] * (dee / (dee + z__[i4 - 2]));
+ if (dee <= deemin) {
+ deemin = dee;
+ kmin = (i4 + 3) / 4;
+ }
+/* L110: */
+ }
+ if (kmin - i0 << 1 < n0 - kmin && deemin <= z__[(n0 << 2) - 3] *
+ .5f) {
+ ipn4 = i0 + n0 << 2;
+ pp = 2;
+ i__2 = i0 + n0 - 1 << 1;
+ for (i4 = i0 << 2; i4 <= i__2; i4 += 4) {
+ temp = z__[i4 - 3];
+ z__[i4 - 3] = z__[ipn4 - i4 - 3];
+ z__[ipn4 - i4 - 3] = temp;
+ temp = z__[i4 - 2];
+ z__[i4 - 2] = z__[ipn4 - i4 - 2];
+ z__[ipn4 - i4 - 2] = temp;
+ temp = z__[i4 - 1];
+ z__[i4 - 1] = z__[ipn4 - i4 - 5];
+ z__[ipn4 - i4 - 5] = temp;
+ temp = z__[i4];
+ z__[i4] = z__[ipn4 - i4 - 4];
+ z__[ipn4 - i4 - 4] = temp;
+/* L120: */
+ }
+ }
+ }
+
+/*
+ Put -(initial shift) into DMIN.
+
+ Computing MAX
+*/
+ r__1 = 0.f, r__2 = qmin - sqrt(qmin) * 2.f * sqrt(emax);
+ dmin__ = -dmax(r__1,r__2);
+
+/*
+ Now I0:N0 is unreduced.
+ PP = 0 for ping, PP = 1 for pong.
+ PP = 2 indicates that flipping was applied to the Z array
+ and that the tests for deflation upon entry in SLASQ3
+ should not be performed.
+*/
+
+ nbig = (n0 - i0 + 1) * 30;
+ i__2 = nbig;
+ for (iwhilb = 1; iwhilb <= i__2; ++iwhilb) {
+ if (i0 > n0) {
+ goto L150;
+ }
+
+/* While submatrix unfinished take a good dqds step. */
+
+ slasq3_(&i0, &n0, &z__[1], &pp, &dmin__, &sigma, &desig, &qmax, &
+ nfail, &iter, &ndiv, &ieee, &ttype, &dmin1, &dmin2, &dn, &
+ dn1, &dn2, &g, &tau);
+
+ pp = 1 - pp;
+
+/* When EMIN is very small check for splits. */
+
+ if (pp == 0 && n0 - i0 >= 3) {
+ if (z__[n0 * 4] <= tol2 * qmax || z__[(n0 << 2) - 1] <= tol2 *
+ sigma) {
+ splt = i0 - 1;
+ qmax = z__[(i0 << 2) - 3];
+ emin = z__[(i0 << 2) - 1];
+ oldemn = z__[i0 * 4];
+ i__3 = n0 - 3 << 2;
+ for (i4 = i0 << 2; i4 <= i__3; i4 += 4) {
+ if (z__[i4] <= tol2 * z__[i4 - 3] || z__[i4 - 1] <=
+ tol2 * sigma) {
+ z__[i4 - 1] = -sigma;
+ splt = i4 / 4;
+ qmax = 0.f;
+ emin = z__[i4 + 3];
+ oldemn = z__[i4 + 4];
+ } else {
+/* Computing MAX */
+ r__1 = qmax, r__2 = z__[i4 + 1];
+ qmax = dmax(r__1,r__2);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[i4 - 1];
+ emin = dmin(r__1,r__2);
+/* Computing MIN */
+ r__1 = oldemn, r__2 = z__[i4];
+ oldemn = dmin(r__1,r__2);
+ }
+/* L130: */
+ }
+ z__[(n0 << 2) - 1] = emin;
+ z__[n0 * 4] = oldemn;
+ i0 = splt + 1;
+ }
+ }
+
+/* L140: */
+ }
+
+ *info = 2;
+ return 0;
+
+/* end IWHILB */
+
+L150:
+
+/* L160: */
+ ;
+ }
+
+ *info = 3;
+ return 0;
+
+/* end IWHILA */
+
+L170:
+
+/* Move q's to the front. */
+
+ i__1 = *n;
+ for (k = 2; k <= i__1; ++k) {
+ z__[k] = z__[(k << 2) - 3];
+/* L180: */
+ }
+
+/* Sort and compute sum of eigenvalues. */
+
+ slasrt_("D", n, &z__[1], &iinfo);
+
+ e = 0.f;
+ for (k = *n; k >= 1; --k) {
+ e += z__[k];
+/* L190: */
+ }
+
+/* Store trace, sum(eigenvalues) and information on performance. */
+
+ z__[(*n << 1) + 1] = trace;
+ z__[(*n << 1) + 2] = e;
+ z__[(*n << 1) + 3] = (real) iter;
+/* Computing 2nd power */
+ i__1 = *n;
+ z__[(*n << 1) + 4] = (real) ndiv / (real) (i__1 * i__1);
+ z__[(*n << 1) + 5] = nfail * 100.f / (real) iter;
+ return 0;
+
+/* End of SLASQ2 */
+
+} /* slasq2_ */
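+
+/*
+   Illustrative sketch (not part of the original f2c translation): calling
+   SLASQ2 directly on a hand-packed qd array.  For the bidiagonal matrix B
+   with diagonal {3, 2, 1} and superdiagonal {0.5, 0.25}, the qd entries are
+   Z = (q1, e1, q2, e2, q3) = (d1^2, e1^2, d2^2, e2^2, d3^2), padded with
+   zeros to dimension 4*N; the eigenvalues returned in Z(1:N) are then the
+   squared singular values of B.  Assumes the f2c typedefs (integer, real);
+   the example function name is hypothetical.
+*/
+static int example_slasq2_squared_values(real ev[3])
+{
+    integer n = 3, info = 0;
+    real z[12] = { 9.f, .25f, 4.f, .0625f, 1.f, 0.f,
+                   0.f, 0.f,  0.f, 0.f,    0.f, 0.f };
+
+    slasq2_(&n, z, &info);
+    if (info != 0) {
+        return (int) info;
+    }
+    ev[0] = z[0];                     /* eigenvalues of B'*B, decreasing */
+    ev[1] = z[1];
+    ev[2] = z[2];
+    return 0;
+}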
+
+/* Subroutine */ int slasq3_(integer *i0, integer *n0, real *z__, integer *pp,
+ real *dmin__, real *sigma, real *desig, real *qmax, integer *nfail,
+ integer *iter, integer *ndiv, logical *ieee, integer *ttype, real *
+ dmin1, real *dmin2, real *dn, real *dn1, real *dn2, real *g, real *
+ tau)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real s, t;
+ static integer j4, nn;
+ static real eps, tol;
+ static integer n0in, ipn4;
+ static real tol2, temp;
+ extern /* Subroutine */ int slasq4_(integer *, integer *, real *, integer
+ *, integer *, real *, real *, real *, real *, real *, real *,
+ real *, integer *, real *), slasq5_(integer *, integer *, real *,
+ integer *, real *, real *, real *, real *, real *, real *, real *,
+ logical *), slasq6_(integer *, integer *, real *, integer *,
+ real *, real *, real *, real *, real *, real *);
+ extern doublereal slamch_(char *);
+ extern logical sisnan_(real *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- June 2010 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ SLASQ3 checks for deflation, computes a shift (TAU) and calls dqds.
+ In case of failure it changes shifts, and tries again until output
+ is positive.
+
+ Arguments
+ =========
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input/output) INTEGER
+ Last index.
+
+ Z (input) REAL array, dimension ( 4*N )
+ Z holds the qd array.
+
+ PP (input/output) INTEGER
+ PP=0 for ping, PP=1 for pong.
+ PP=2 indicates that flipping was applied to the Z array
+ and that the initial tests for deflation should not be
+ performed.
+
+ DMIN (output) REAL
+ Minimum value of d.
+
+ SIGMA (output) REAL
+ Sum of shifts used in current segment.
+
+ DESIG (input/output) REAL
+ Lower order part of SIGMA
+
+ QMAX (input) REAL
+ Maximum value of q.
+
+ NFAIL (output) INTEGER
+ Number of times shift was too big.
+
+ ITER (output) INTEGER
+ Number of iterations.
+
+ NDIV (output) INTEGER
+ Number of divisions.
+
+ IEEE (input) LOGICAL
+ Flag for IEEE or non IEEE arithmetic (passed to SLASQ5).
+
+ TTYPE (input/output) INTEGER
+ Shift type.
+
+ DMIN1 (input/output) REAL
+
+ DMIN2 (input/output) REAL
+
+ DN (input/output) REAL
+
+ DN1 (input/output) REAL
+
+ DN2 (input/output) REAL
+
+ G (input/output) REAL
+
+ TAU (input/output) REAL
+
+ These are passed as arguments in order to save their values
+ between calls to SLASQ3.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ n0in = *n0;
+ eps = slamch_("Precision");
+ tol = eps * 100.f;
+/* Computing 2nd power */
+ r__1 = tol;
+ tol2 = r__1 * r__1;
+
+/* Check for deflation. */
+
+L10:
+
+ if (*n0 < *i0) {
+ return 0;
+ }
+ if (*n0 == *i0) {
+ goto L20;
+ }
+ nn = (*n0 << 2) + *pp;
+ if (*n0 == *i0 + 1) {
+ goto L40;
+ }
+
+/* Check whether E(N0-1) is negligible, 1 eigenvalue. */
+
+ if (z__[nn - 5] > tol2 * (*sigma + z__[nn - 3]) && z__[nn - (*pp << 1) -
+ 4] > tol2 * z__[nn - 7]) {
+ goto L30;
+ }
+
+L20:
+
+ z__[(*n0 << 2) - 3] = z__[(*n0 << 2) + *pp - 3] + *sigma;
+ --(*n0);
+ goto L10;
+
+/* Check whether E(N0-2) is negligible, 2 eigenvalues. */
+
+L30:
+
+ if (z__[nn - 9] > tol2 * *sigma && z__[nn - (*pp << 1) - 8] > tol2 * z__[
+ nn - 11]) {
+ goto L50;
+ }
+
+L40:
+
+ if (z__[nn - 3] > z__[nn - 7]) {
+ s = z__[nn - 3];
+ z__[nn - 3] = z__[nn - 7];
+ z__[nn - 7] = s;
+ }
+ if (z__[nn - 5] > z__[nn - 3] * tol2) {
+ t = (z__[nn - 7] - z__[nn - 3] + z__[nn - 5]) * .5f;
+ s = z__[nn - 3] * (z__[nn - 5] / t);
+ if (s <= t) {
+ s = z__[nn - 3] * (z__[nn - 5] / (t * (sqrt(s / t + 1.f) + 1.f)));
+ } else {
+ s = z__[nn - 3] * (z__[nn - 5] / (t + sqrt(t) * sqrt(t + s)));
+ }
+ t = z__[nn - 7] + (s + z__[nn - 5]);
+ z__[nn - 3] *= z__[nn - 7] / t;
+ z__[nn - 7] = t;
+ }
+ z__[(*n0 << 2) - 7] = z__[nn - 7] + *sigma;
+ z__[(*n0 << 2) - 3] = z__[nn - 3] + *sigma;
+ *n0 += -2;
+ goto L10;
+
+L50:
+ if (*pp == 2) {
+ *pp = 0;
+ }
+
+/* Reverse the qd-array, if warranted. */
+
+ if (*dmin__ <= 0.f || *n0 < n0in) {
+ if (z__[(*i0 << 2) + *pp - 3] * 1.5f < z__[(*n0 << 2) + *pp - 3]) {
+ ipn4 = *i0 + *n0 << 2;
+ i__1 = *i0 + *n0 - 1 << 1;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ temp = z__[j4 - 3];
+ z__[j4 - 3] = z__[ipn4 - j4 - 3];
+ z__[ipn4 - j4 - 3] = temp;
+ temp = z__[j4 - 2];
+ z__[j4 - 2] = z__[ipn4 - j4 - 2];
+ z__[ipn4 - j4 - 2] = temp;
+ temp = z__[j4 - 1];
+ z__[j4 - 1] = z__[ipn4 - j4 - 5];
+ z__[ipn4 - j4 - 5] = temp;
+ temp = z__[j4];
+ z__[j4] = z__[ipn4 - j4 - 4];
+ z__[ipn4 - j4 - 4] = temp;
+/* L60: */
+ }
+ if (*n0 - *i0 <= 4) {
+ z__[(*n0 << 2) + *pp - 1] = z__[(*i0 << 2) + *pp - 1];
+ z__[(*n0 << 2) - *pp] = z__[(*i0 << 2) - *pp];
+ }
+/* Computing MIN */
+ r__1 = *dmin2, r__2 = z__[(*n0 << 2) + *pp - 1];
+ *dmin2 = dmin(r__1,r__2);
+/* Computing MIN */
+ r__1 = z__[(*n0 << 2) + *pp - 1], r__2 = z__[(*i0 << 2) + *pp - 1]
+ , r__1 = min(r__1,r__2), r__2 = z__[(*i0 << 2) + *pp + 3];
+ z__[(*n0 << 2) + *pp - 1] = dmin(r__1,r__2);
+/* Computing MIN */
+ r__1 = z__[(*n0 << 2) - *pp], r__2 = z__[(*i0 << 2) - *pp], r__1 =
+ min(r__1,r__2), r__2 = z__[(*i0 << 2) - *pp + 4];
+ z__[(*n0 << 2) - *pp] = dmin(r__1,r__2);
+/* Computing MAX */
+ r__1 = *qmax, r__2 = z__[(*i0 << 2) + *pp - 3], r__1 = max(r__1,
+ r__2), r__2 = z__[(*i0 << 2) + *pp + 1];
+ *qmax = dmax(r__1,r__2);
+ *dmin__ = -0.f;
+ }
+ }
+
+/* Choose a shift. */
+
+ slasq4_(i0, n0, &z__[1], pp, &n0in, dmin__, dmin1, dmin2, dn, dn1, dn2,
+ tau, ttype, g);
+
+/* Call dqds until DMIN > 0. */
+
+L70:
+
+ slasq5_(i0, n0, &z__[1], pp, tau, dmin__, dmin1, dmin2, dn, dn1, dn2,
+ ieee);
+
+ *ndiv += *n0 - *i0 + 2;
+ ++(*iter);
+
+/* Check status. */
+
+ if (*dmin__ >= 0.f && *dmin1 > 0.f) {
+
+/* Success. */
+
+ goto L90;
+
+ } else if (*dmin__ < 0.f && *dmin1 > 0.f && z__[(*n0 - 1 << 2) - *pp] <
+ tol * (*sigma + *dn1) && dabs(*dn) < tol * *sigma) {
+
+/* Convergence hidden by negative DN. */
+
+ z__[(*n0 - 1 << 2) - *pp + 2] = 0.f;
+ *dmin__ = 0.f;
+ goto L90;
+ } else if (*dmin__ < 0.f) {
+
+/* TAU too big. Select new TAU and try again. */
+
+ ++(*nfail);
+ if (*ttype < -22) {
+
+/* Failed twice. Play it safe. */
+
+ *tau = 0.f;
+ } else if (*dmin1 > 0.f) {
+
+/* Late failure. Gives excellent shift. */
+
+ *tau = (*tau + *dmin__) * (1.f - eps * 2.f);
+ *ttype += -11;
+ } else {
+
+/* Early failure. Divide by 4. */
+
+ *tau *= .25f;
+ *ttype += -12;
+ }
+ goto L70;
+ } else if (sisnan_(dmin__)) {
+
+/* NaN. */
+
+ if (*tau == 0.f) {
+ goto L80;
+ } else {
+ *tau = 0.f;
+ goto L70;
+ }
+ } else {
+
+/* Possible underflow. Play it safe. */
+
+ goto L80;
+ }
+
+/* Risk of underflow. */
+
+L80:
+ slasq6_(i0, n0, &z__[1], pp, dmin__, dmin1, dmin2, dn, dn1, dn2);
+ *ndiv += *n0 - *i0 + 2;
+ ++(*iter);
+ *tau = 0.f;
+
+L90:
+ if (*tau < *sigma) {
+ *desig += *tau;
+ t = *sigma + *desig;
+ *desig -= t - *sigma;
+ } else {
+ t = *sigma + *tau;
+ *desig = *sigma - (t - *tau) + *desig;
+ }
+ *sigma = t;
+
+ return 0;
+
+/* End of SLASQ3 */
+
+} /* slasq3_ */
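+
+/*
+   Side note (illustrative, not part of the original f2c translation): the
+   SIGMA/DESIG update at label L90 above is essentially a compensated
+   (Kahan-style) summation of the accumulated shifts, with DESIG carrying
+   the low-order part that plain addition would round away.  A minimal
+   standalone sketch of the same idea, on an arbitrary array of floats;
+   the function name is hypothetical.
+*/
+static float example_compensated_sum(const float *x, int n)
+{
+    float sum = 0.f;                  /* high-order part (role of SIGMA) */
+    float comp = 0.f;                 /* low-order part (role of DESIG)  */
+    int i;
+
+    for (i = 0; i < n; ++i) {
+        float y = x[i] + comp;        /* fold in the carried correction  */
+        float t = sum + y;            /* rounded running sum             */
+        comp = y - (t - sum);         /* what the rounding just lost     */
+        sum = t;
+    }
+    return sum;
+}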
+
+/* Subroutine */ int slasq4_(integer *i0, integer *n0, real *z__, integer *pp,
+ integer *n0in, real *dmin__, real *dmin1, real *dmin2, real *dn,
+ real *dn1, real *dn2, real *tau, integer *ttype, real *g)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real s, a2, b1, b2;
+ static integer i4, nn, np;
+ static real gam, gap1, gap2;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ SLASQ4 computes an approximation TAU to the smallest eigenvalue
+ using values of d from the previous transform.
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input) INTEGER
+ Last index.
+
+ Z (input) REAL array, dimension ( 4*N )
+ Z holds the qd array.
+
+ PP (input) INTEGER
+ PP=0 for ping, PP=1 for pong.
+
+ N0IN (input) INTEGER
+ The value of N0 at start of EIGTEST.
+
+ DMIN (input) REAL
+ Minimum value of d.
+
+ DMIN1 (input) REAL
+ Minimum value of d, excluding D( N0 ).
+
+ DMIN2 (input) REAL
+ Minimum value of d, excluding D( N0 ) and D( N0-1 ).
+
+ DN (input) REAL
+ d(N)
+
+ DN1 (input) REAL
+ d(N-1)
+
+ DN2 (input) REAL
+ d(N-2)
+
+ TAU (output) REAL
+ This is the shift.
+
+ TTYPE (output) INTEGER
+ Shift type.
+
+ G (input/output) REAL
+ G is passed as an argument in order to save its value between
+ calls to SLASQ4.
+
+ Further Details
+ ===============
+ CNST1 = 9/16
+
+ =====================================================================
+
+
+ A negative DMIN forces the shift to take that absolute value
+ TTYPE records the type of shift.
+*/
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ if (*dmin__ <= 0.f) {
+ *tau = -(*dmin__);
+ *ttype = -1;
+ return 0;
+ }
+
+ nn = (*n0 << 2) + *pp;
+ if (*n0in == *n0) {
+
+/* No eigenvalues deflated. */
+
+ if (*dmin__ == *dn || *dmin__ == *dn1) {
+
+ b1 = sqrt(z__[nn - 3]) * sqrt(z__[nn - 5]);
+ b2 = sqrt(z__[nn - 7]) * sqrt(z__[nn - 9]);
+ a2 = z__[nn - 7] + z__[nn - 5];
+
+/* Cases 2 and 3. */
+
+ if (*dmin__ == *dn && *dmin1 == *dn1) {
+ gap2 = *dmin2 - a2 - *dmin2 * .25f;
+ if (gap2 > 0.f && gap2 > b2) {
+ gap1 = a2 - *dn - b2 / gap2 * b2;
+ } else {
+ gap1 = a2 - *dn - (b1 + b2);
+ }
+ if (gap1 > 0.f && gap1 > b1) {
+/* Computing MAX */
+ r__1 = *dn - b1 / gap1 * b1, r__2 = *dmin__ * .5f;
+ s = dmax(r__1,r__2);
+ *ttype = -2;
+ } else {
+ s = 0.f;
+ if (*dn > b1) {
+ s = *dn - b1;
+ }
+ if (a2 > b1 + b2) {
+/* Computing MIN */
+ r__1 = s, r__2 = a2 - (b1 + b2);
+ s = dmin(r__1,r__2);
+ }
+/* Computing MAX */
+ r__1 = s, r__2 = *dmin__ * .333f;
+ s = dmax(r__1,r__2);
+ *ttype = -3;
+ }
+ } else {
+
+/* Case 4. */
+
+ *ttype = -4;
+ s = *dmin__ * .25f;
+ if (*dmin__ == *dn) {
+ gam = *dn;
+ a2 = 0.f;
+ if (z__[nn - 5] > z__[nn - 7]) {
+ return 0;
+ }
+ b2 = z__[nn - 5] / z__[nn - 7];
+ np = nn - 9;
+ } else {
+ np = nn - (*pp << 1);
+ b2 = z__[np - 2];
+ gam = *dn1;
+ if (z__[np - 4] > z__[np - 2]) {
+ return 0;
+ }
+ a2 = z__[np - 4] / z__[np - 2];
+ if (z__[nn - 9] > z__[nn - 11]) {
+ return 0;
+ }
+ b2 = z__[nn - 9] / z__[nn - 11];
+ np = nn - 13;
+ }
+
+/* Approximate contribution to norm squared from I < NN-1. */
+
+ a2 += b2;
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = np; i4 >= i__1; i4 += -4) {
+ if (b2 == 0.f) {
+ goto L20;
+ }
+ b1 = b2;
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b2 *= z__[i4] / z__[i4 - 2];
+ a2 += b2;
+ if (dmax(b2,b1) * 100.f < a2 || .563f < a2) {
+ goto L20;
+ }
+/* L10: */
+ }
+L20:
+ a2 *= 1.05f;
+
+/* Rayleigh quotient residual bound. */
+
+ if (a2 < .563f) {
+ s = gam * (1.f - sqrt(a2)) / (a2 + 1.f);
+ }
+ }
+ } else if (*dmin__ == *dn2) {
+
+/* Case 5. */
+
+ *ttype = -5;
+ s = *dmin__ * .25f;
+
+/* Compute contribution to norm squared from I > NN-2. */
+
+ np = nn - (*pp << 1);
+ b1 = z__[np - 2];
+ b2 = z__[np - 6];
+ gam = *dn2;
+ if (z__[np - 8] > b2 || z__[np - 4] > b1) {
+ return 0;
+ }
+ a2 = z__[np - 8] / b2 * (z__[np - 4] / b1 + 1.f);
+
+/* Approximate contribution to norm squared from I < NN-2. */
+
+ if (*n0 - *i0 > 2) {
+ b2 = z__[nn - 13] / z__[nn - 15];
+ a2 += b2;
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = nn - 17; i4 >= i__1; i4 += -4) {
+ if (b2 == 0.f) {
+ goto L40;
+ }
+ b1 = b2;
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b2 *= z__[i4] / z__[i4 - 2];
+ a2 += b2;
+ if (dmax(b2,b1) * 100.f < a2 || .563f < a2) {
+ goto L40;
+ }
+/* L30: */
+ }
+L40:
+ a2 *= 1.05f;
+ }
+
+ if (a2 < .563f) {
+ s = gam * (1.f - sqrt(a2)) / (a2 + 1.f);
+ }
+ } else {
+
+/* Case 6, no information to guide us. */
+
+ if (*ttype == -6) {
+ *g += (1.f - *g) * .333f;
+ } else if (*ttype == -18) {
+ *g = .083250000000000005f;
+ } else {
+ *g = .25f;
+ }
+ s = *g * *dmin__;
+ *ttype = -6;
+ }
+
+ } else if (*n0in == *n0 + 1) {
+
+/* One eigenvalue just deflated. Use DMIN1, DN1 for DMIN and DN. */
+
+ if (*dmin1 == *dn1 && *dmin2 == *dn2) {
+
+/* Cases 7 and 8. */
+
+ *ttype = -7;
+ s = *dmin1 * .333f;
+ if (z__[nn - 5] > z__[nn - 7]) {
+ return 0;
+ }
+ b1 = z__[nn - 5] / z__[nn - 7];
+ b2 = b1;
+ if (b2 == 0.f) {
+ goto L60;
+ }
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) {
+ a2 = b1;
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b1 *= z__[i4] / z__[i4 - 2];
+ b2 += b1;
+ if (dmax(b1,a2) * 100.f < b2) {
+ goto L60;
+ }
+/* L50: */
+ }
+L60:
+ b2 = sqrt(b2 * 1.05f);
+/* Computing 2nd power */
+ r__1 = b2;
+ a2 = *dmin1 / (r__1 * r__1 + 1.f);
+ gap2 = *dmin2 * .5f - a2;
+ if (gap2 > 0.f && gap2 > b2 * a2) {
+/* Computing MAX */
+ r__1 = s, r__2 = a2 * (1.f - a2 * 1.01f * (b2 / gap2) * b2);
+ s = dmax(r__1,r__2);
+ } else {
+/* Computing MAX */
+ r__1 = s, r__2 = a2 * (1.f - b2 * 1.01f);
+ s = dmax(r__1,r__2);
+ *ttype = -8;
+ }
+ } else {
+
+/* Case 9. */
+
+ s = *dmin1 * .25f;
+ if (*dmin1 == *dn1) {
+ s = *dmin1 * .5f;
+ }
+ *ttype = -9;
+ }
+
+ } else if (*n0in == *n0 + 2) {
+
+/*
+ Two eigenvalues deflated. Use DMIN2, DN2 for DMIN and DN.
+
+ Cases 10 and 11.
+*/
+
+ if (*dmin2 == *dn2 && z__[nn - 5] * 2.f < z__[nn - 7]) {
+ *ttype = -10;
+ s = *dmin2 * .333f;
+ if (z__[nn - 5] > z__[nn - 7]) {
+ return 0;
+ }
+ b1 = z__[nn - 5] / z__[nn - 7];
+ b2 = b1;
+ if (b2 == 0.f) {
+ goto L80;
+ }
+ i__1 = (*i0 << 2) - 1 + *pp;
+ for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) {
+ if (z__[i4] > z__[i4 - 2]) {
+ return 0;
+ }
+ b1 *= z__[i4] / z__[i4 - 2];
+ b2 += b1;
+ if (b1 * 100.f < b2) {
+ goto L80;
+ }
+/* L70: */
+ }
+L80:
+ b2 = sqrt(b2 * 1.05f);
+/* Computing 2nd power */
+ r__1 = b2;
+ a2 = *dmin2 / (r__1 * r__1 + 1.f);
+ gap2 = z__[nn - 7] + z__[nn - 9] - sqrt(z__[nn - 11]) * sqrt(z__[
+ nn - 9]) - a2;
+ if (gap2 > 0.f && gap2 > b2 * a2) {
+/* Computing MAX */
+ r__1 = s, r__2 = a2 * (1.f - a2 * 1.01f * (b2 / gap2) * b2);
+ s = dmax(r__1,r__2);
+ } else {
+/* Computing MAX */
+ r__1 = s, r__2 = a2 * (1.f - b2 * 1.01f);
+ s = dmax(r__1,r__2);
+ }
+ } else {
+ s = *dmin2 * .25f;
+ *ttype = -11;
+ }
+ } else if (*n0in > *n0 + 2) {
+
+/* Case 12, more than two eigenvalues deflated. No information. */
+
+ s = 0.f;
+ *ttype = -12;
+ }
+
+ *tau = s;
+ return 0;
+
+/* End of SLASQ4 */
+
+} /* slasq4_ */
+
+/* Subroutine */ int slasq5_(integer *i0, integer *n0, real *z__, integer *pp,
+ real *tau, real *dmin__, real *dmin1, real *dmin2, real *dn, real *
+ dnm1, real *dnm2, logical *ieee)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real d__;
+ static integer j4, j4p2;
+ static real emin, temp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ SLASQ5 computes one dqds transform in ping-pong form, one
+ version for IEEE machines and another for non-IEEE machines.
+
+ Arguments
+ =========
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input) INTEGER
+ Last index.
+
+ Z (input) REAL array, dimension ( 4*N )
+ Z holds the qd array. EMIN is stored in Z(4*N0) to avoid
+ an extra argument.
+
+ PP (input) INTEGER
+ PP=0 for ping, PP=1 for pong.
+
+ TAU (input) REAL
+ This is the shift.
+
+ DMIN (output) REAL
+ Minimum value of d.
+
+ DMIN1 (output) REAL
+ Minimum value of d, excluding D( N0 ).
+
+ DMIN2 (output) REAL
+ Minimum value of d, excluding D( N0 ) and D( N0-1 ).
+
+ DN (output) REAL
+ d(N0), the last value of d.
+
+ DNM1 (output) REAL
+ d(N0-1).
+
+ DNM2 (output) REAL
+ d(N0-2).
+
+ IEEE (input) LOGICAL
+ Flag for IEEE or non IEEE arithmetic.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ if (*n0 - *i0 - 1 <= 0) {
+ return 0;
+ }
+
+ j4 = (*i0 << 2) + *pp - 3;
+ emin = z__[j4 + 4];
+ d__ = z__[j4] - *tau;
+ *dmin__ = d__;
+ *dmin1 = -z__[j4];
+
+ if (*ieee) {
+
+/* Code for IEEE arithmetic. */
+
+ if (*pp == 0) {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 2] = d__ + z__[j4 - 1];
+ temp = z__[j4 + 1] / z__[j4 - 2];
+ d__ = d__ * temp - *tau;
+ *dmin__ = dmin(*dmin__,d__);
+ z__[j4] = z__[j4 - 1] * temp;
+/* Computing MIN */
+ r__1 = z__[j4];
+ emin = dmin(r__1,emin);
+/* L10: */
+ }
+ } else {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 3] = d__ + z__[j4];
+ temp = z__[j4 + 2] / z__[j4 - 3];
+ d__ = d__ * temp - *tau;
+ *dmin__ = dmin(*dmin__,d__);
+ z__[j4 - 1] = z__[j4] * temp;
+/* Computing MIN */
+ r__1 = z__[j4 - 1];
+ emin = dmin(r__1,emin);
+/* L20: */
+ }
+ }
+
+/* Unroll last two steps. */
+
+ *dnm2 = d__;
+ *dmin2 = *dmin__;
+ j4 = (*n0 - 2 << 2) - *pp;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm2 + z__[j4p2];
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau;
+ *dmin__ = dmin(*dmin__,*dnm1);
+
+ *dmin1 = *dmin__;
+ j4 += 4;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm1 + z__[j4p2];
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau;
+ *dmin__ = dmin(*dmin__,*dn);
+
+ } else {
+
+/* Code for non IEEE arithmetic. */
+
+ if (*pp == 0) {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 2] = d__ + z__[j4 - 1];
+ if (d__ < 0.f) {
+ return 0;
+ } else {
+ z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]);
+ d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]) - *tau;
+ }
+ *dmin__ = dmin(*dmin__,d__);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[j4];
+ emin = dmin(r__1,r__2);
+/* L30: */
+ }
+ } else {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 3] = d__ + z__[j4];
+ if (d__ < 0.f) {
+ return 0;
+ } else {
+ z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]);
+ d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]) - *tau;
+ }
+ *dmin__ = dmin(*dmin__,d__);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[j4 - 1];
+ emin = dmin(r__1,r__2);
+/* L40: */
+ }
+ }
+
+/* Unroll last two steps. */
+
+ *dnm2 = d__;
+ *dmin2 = *dmin__;
+ j4 = (*n0 - 2 << 2) - *pp;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm2 + z__[j4p2];
+ if (*dnm2 < 0.f) {
+ return 0;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau;
+ }
+ *dmin__ = dmin(*dmin__,*dnm1);
+
+ *dmin1 = *dmin__;
+ j4 += 4;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm1 + z__[j4p2];
+ if (*dnm1 < 0.f) {
+ return 0;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau;
+ }
+ *dmin__ = dmin(*dmin__,*dn);
+
+ }
+
+ z__[j4 + 2] = *dn;
+ z__[(*n0 << 2) - *pp] = emin;
+ return 0;
+
+/* End of SLASQ5 */
+
+} /* slasq5_ */
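+
+/*
+   Illustrative sketch (not part of the original f2c translation): the plain
+   dqds recurrence that SLASQ5 above implements in ping-pong storage.  One
+   step maps a qd array (q, e) and a shift TAU to a new qd array (qq, ee)
+   whose associated eigenvalues are shifted down by TAU; the running value
+   d is what the routine tracks as DMIN.  Plain 0-based C arrays are used
+   here instead of the 4-stride Z layout, and the function name is
+   hypothetical.
+*/
+static void example_plain_dqds_step(const float *q, const float *e,
+                                    float *qq, float *ee, int n, float tau)
+{
+    float d = q[0] - tau;
+    int i;
+
+    for (i = 0; i < n - 1; ++i) {
+        float t;
+        qq[i] = d + e[i];             /* new q entry                  */
+        t = q[i + 1] / qq[i];
+        ee[i] = e[i] * t;             /* new e entry                  */
+        d = d * t - tau;              /* next d; its minimum is DMIN  */
+    }
+    qq[n - 1] = d;                    /* last new q entry             */
+}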
+
+/* Subroutine */ int slasq6_(integer *i0, integer *n0, real *z__, integer *pp,
+ real *dmin__, real *dmin1, real *dmin2, real *dn, real *dnm1, real *
+ dnm2)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real d__;
+ static integer j4, j4p2;
+ static real emin, temp;
+ extern doublereal slamch_(char *);
+ static real safmin;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+
+ -- Contributed by Osni Marques of the Lawrence Berkeley National --
+ -- Laboratory and Beresford Parlett of the Univ. of California at --
+ -- Berkeley --
+ -- November 2008 --
+
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+
+
+ Purpose
+ =======
+
+ SLASQ6 computes one dqd (shift equal to zero) transform in
+ ping-pong form, with protection against underflow and overflow.
+
+ Arguments
+ =========
+
+ I0 (input) INTEGER
+ First index.
+
+ N0 (input) INTEGER
+ Last index.
+
+ Z (input) REAL array, dimension ( 4*N )
+ Z holds the qd array. EMIN is stored in Z(4*N0) to avoid
+ an extra argument.
+
+ PP (input) INTEGER
+ PP=0 for ping, PP=1 for pong.
+
+ DMIN (output) REAL
+ Minimum value of d.
+
+ DMIN1 (output) REAL
+ Minimum value of d, excluding D( N0 ).
+
+ DMIN2 (output) REAL
+ Minimum value of d, excluding D( N0 ) and D( N0-1 ).
+
+ DN (output) REAL
+ d(N0), the last value of d.
+
+ DNM1 (output) REAL
+ d(N0-1).
+
+ DNM2 (output) REAL
+ d(N0-2).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --z__;
+
+ /* Function Body */
+ if (*n0 - *i0 - 1 <= 0) {
+ return 0;
+ }
+
+ safmin = slamch_("Safe minimum");
+ j4 = (*i0 << 2) + *pp - 3;
+ emin = z__[j4 + 4];
+ d__ = z__[j4];
+ *dmin__ = d__;
+
+ if (*pp == 0) {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 2] = d__ + z__[j4 - 1];
+ if (z__[j4 - 2] == 0.f) {
+ z__[j4] = 0.f;
+ d__ = z__[j4 + 1];
+ *dmin__ = d__;
+ emin = 0.f;
+ } else if (safmin * z__[j4 + 1] < z__[j4 - 2] && safmin * z__[j4
+ - 2] < z__[j4 + 1]) {
+ temp = z__[j4 + 1] / z__[j4 - 2];
+ z__[j4] = z__[j4 - 1] * temp;
+ d__ *= temp;
+ } else {
+ z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]);
+ d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]);
+ }
+ *dmin__ = dmin(*dmin__,d__);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[j4];
+ emin = dmin(r__1,r__2);
+/* L10: */
+ }
+ } else {
+ i__1 = *n0 - 3 << 2;
+ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) {
+ z__[j4 - 3] = d__ + z__[j4];
+ if (z__[j4 - 3] == 0.f) {
+ z__[j4 - 1] = 0.f;
+ d__ = z__[j4 + 2];
+ *dmin__ = d__;
+ emin = 0.f;
+ } else if (safmin * z__[j4 + 2] < z__[j4 - 3] && safmin * z__[j4
+ - 3] < z__[j4 + 2]) {
+ temp = z__[j4 + 2] / z__[j4 - 3];
+ z__[j4 - 1] = z__[j4] * temp;
+ d__ *= temp;
+ } else {
+ z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]);
+ d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]);
+ }
+ *dmin__ = dmin(*dmin__,d__);
+/* Computing MIN */
+ r__1 = emin, r__2 = z__[j4 - 1];
+ emin = dmin(r__1,r__2);
+/* L20: */
+ }
+ }
+
+/* Unroll last two steps. */
+
+ *dnm2 = d__;
+ *dmin2 = *dmin__;
+ j4 = (*n0 - 2 << 2) - *pp;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm2 + z__[j4p2];
+ if (z__[j4 - 2] == 0.f) {
+ z__[j4] = 0.f;
+ *dnm1 = z__[j4p2 + 2];
+ *dmin__ = *dnm1;
+ emin = 0.f;
+ } else if (safmin * z__[j4p2 + 2] < z__[j4 - 2] && safmin * z__[j4 - 2] <
+ z__[j4p2 + 2]) {
+ temp = z__[j4p2 + 2] / z__[j4 - 2];
+ z__[j4] = z__[j4p2] * temp;
+ *dnm1 = *dnm2 * temp;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]);
+ }
+ *dmin__ = dmin(*dmin__,*dnm1);
+
+ *dmin1 = *dmin__;
+ j4 += 4;
+ j4p2 = j4 + (*pp << 1) - 1;
+ z__[j4 - 2] = *dnm1 + z__[j4p2];
+ if (z__[j4 - 2] == 0.f) {
+ z__[j4] = 0.f;
+ *dn = z__[j4p2 + 2];
+ *dmin__ = *dn;
+ emin = 0.f;
+ } else if (safmin * z__[j4p2 + 2] < z__[j4 - 2] && safmin * z__[j4 - 2] <
+ z__[j4p2 + 2]) {
+ temp = z__[j4p2 + 2] / z__[j4 - 2];
+ z__[j4] = z__[j4p2] * temp;
+ *dn = *dnm1 * temp;
+ } else {
+ z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]);
+ *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]);
+ }
+ *dmin__ = dmin(*dmin__,*dn);
+
+ z__[j4 + 2] = *dn;
+ z__[(*n0 << 2) - *pp] = emin;
+ return 0;
+
+/* End of SLASQ6 */
+
+} /* slasq6_ */
+
+/* Subroutine */ int slasr_(char *side, char *pivot, char *direct, integer *m,
+ integer *n, real *c__, real *s, real *a, integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, info;
+ static real temp;
+ extern logical lsame_(char *, char *);
+ static real ctemp, stemp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASR applies a sequence of plane rotations to a real matrix A,
+ from either the left or the right.
+
+ When SIDE = 'L', the transformation takes the form
+
+ A := P*A
+
+ and when SIDE = 'R', the transformation takes the form
+
+ A := A*P**T
+
+ where P is an orthogonal matrix consisting of a sequence of z plane
+ rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+ and P**T is the transpose of P.
+
+ When DIRECT = 'F' (Forward sequence), then
+
+ P = P(z-1) * ... * P(2) * P(1)
+
+ and when DIRECT = 'B' (Backward sequence), then
+
+ P = P(1) * P(2) * ... * P(z-1)
+
+ where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+
+ R(k) = ( c(k) s(k) )
+ = ( -s(k) c(k) ).
+
+ When PIVOT = 'V' (Variable pivot), the rotation is performed
+ for the plane (k,k+1), i.e., P(k) has the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears as a rank-2 modification to the identity matrix in
+ rows and columns k and k+1.
+
+ When PIVOT = 'T' (Top pivot), the rotation is performed for the
+ plane (1,k+1), so P(k) has the form
+
+ P(k) = ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears in rows and columns 1 and k+1.
+
+ Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+ performed for the plane (k,z), giving P(k) the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+
+ where R(k) appears in rows and columns k and z. The rotations are
+ performed without ever forming P(k) explicitly.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ Specifies whether the plane rotation matrix P is applied to
+ A on the left or the right.
+ = 'L': Left, compute A := P*A
+ = 'R': Right, compute A:= A*P**T
+
+ PIVOT (input) CHARACTER*1
+ Specifies the plane for which P(k) is a plane rotation
+ matrix.
+ = 'V': Variable pivot, the plane (k,k+1)
+ = 'T': Top pivot, the plane (1,k+1)
+ = 'B': Bottom pivot, the plane (k,z)
+
+ DIRECT (input) CHARACTER*1
+ Specifies whether P is a forward or backward sequence of
+ plane rotations.
+ = 'F': Forward, P = P(z-1)*...*P(2)*P(1)
+ = 'B': Backward, P = P(1)*P(2)*...*P(z-1)
+
+ M (input) INTEGER
+ The number of rows of the matrix A. If m <= 1, an immediate
+ return is effected.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. If n <= 1, an
+ immediate return is effected.
+
+ C (input) REAL array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The cosines c(k) of the plane rotations.
+
+ S (input) REAL array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The sines s(k) of the plane rotations. The 2-by-2 plane
+ rotation part of the matrix P(k), R(k), has the form
+ R(k) = ( c(k) s(k) )
+ ( -s(k) c(k) ).
+
+ A (input/output) REAL array, dimension (LDA,N)
+          The M-by-N matrix A.  On exit, A is overwritten by P*A if
+          SIDE = 'L' or by A*P**T if SIDE = 'R'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --c__;
+ --s;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! (lsame_(side, "L") || lsame_(side, "R"))) {
+ info = 1;
+ } else if (! (lsame_(pivot, "V") || lsame_(pivot,
+ "T") || lsame_(pivot, "B"))) {
+ info = 2;
+ } else if (! (lsame_(direct, "F") || lsame_(direct,
+ "B"))) {
+ info = 3;
+ } else if (*m < 0) {
+ info = 4;
+ } else if (*n < 0) {
+ info = 5;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("SLASR ", &info);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+ if (lsame_(side, "L")) {
+
+/* Form P * A */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[j + 1 + i__ * a_dim1];
+ a[j + 1 + i__ * a_dim1] = ctemp * temp - stemp *
+ a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * temp + ctemp * a[j
+ + i__ * a_dim1];
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[j + 1 + i__ * a_dim1];
+ a[j + 1 + i__ * a_dim1] = ctemp * temp - stemp *
+ a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * temp + ctemp * a[j
+ + i__ * a_dim1];
+/* L30: */
+ }
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = ctemp * temp - stemp * a[
+ i__ * a_dim1 + 1];
+ a[i__ * a_dim1 + 1] = stemp * temp + ctemp * a[
+ i__ * a_dim1 + 1];
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = ctemp * temp - stemp * a[
+ i__ * a_dim1 + 1];
+ a[i__ * a_dim1 + 1] = stemp * temp + ctemp * a[
+ i__ * a_dim1 + 1];
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * a[*m + i__ * a_dim1]
+ + ctemp * temp;
+ a[*m + i__ * a_dim1] = ctemp * a[*m + i__ *
+ a_dim1] - stemp * temp;
+/* L90: */
+ }
+ }
+/* L100: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[j + i__ * a_dim1];
+ a[j + i__ * a_dim1] = stemp * a[*m + i__ * a_dim1]
+ + ctemp * temp;
+ a[*m + i__ * a_dim1] = ctemp * a[*m + i__ *
+ a_dim1] - stemp * temp;
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+ }
+ }
+ } else if (lsame_(side, "R")) {
+
+/* Form A * P' */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[i__ + (j + 1) * a_dim1];
+ a[i__ + (j + 1) * a_dim1] = ctemp * temp - stemp *
+ a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * temp + ctemp * a[
+ i__ + j * a_dim1];
+/* L130: */
+ }
+ }
+/* L140: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[i__ + (j + 1) * a_dim1];
+ a[i__ + (j + 1) * a_dim1] = ctemp * temp - stemp *
+ a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * temp + ctemp * a[
+ i__ + j * a_dim1];
+/* L150: */
+ }
+ }
+/* L160: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = ctemp * temp - stemp * a[
+ i__ + a_dim1];
+ a[i__ + a_dim1] = stemp * temp + ctemp * a[i__ +
+ a_dim1];
+/* L170: */
+ }
+ }
+/* L180: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = ctemp * temp - stemp * a[
+ i__ + a_dim1];
+ a[i__ + a_dim1] = stemp * temp + ctemp * a[i__ +
+ a_dim1];
+/* L190: */
+ }
+ }
+/* L200: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * a[i__ + *n * a_dim1]
+ + ctemp * temp;
+ a[i__ + *n * a_dim1] = ctemp * a[i__ + *n *
+ a_dim1] - stemp * temp;
+/* L210: */
+ }
+ }
+/* L220: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1.f || stemp != 0.f) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = a[i__ + j * a_dim1];
+ a[i__ + j * a_dim1] = stemp * a[i__ + *n * a_dim1]
+ + ctemp * temp;
+ a[i__ + *n * a_dim1] = ctemp * a[i__ + *n *
+ a_dim1] - stemp * temp;
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SLASR */
+
+} /* slasr_ */
+
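+/*
+   Illustrative sketch, not part of LAPACK: the SIDE = 'L', PIVOT = 'V',
+   DIRECT = 'F' case of SLASR written out for a 0-based, column-major
+   m-by-n array with leading dimension lda and rotation arrays c, s of
+   length m-1.  It mirrors the L10/L20 loops above; the function name and
+   argument layout are ours.
+*/
+static void apply_left_rotations_sketch(int m, int n, const float *c,
+                                        const float *s, float *a, int lda)
+{
+    int i, j;
+    for (j = 0; j < m - 1; ++j) {
+        float cj = c[j], sj = s[j];
+        if (cj != 1.f || sj != 0.f) {
+            for (i = 0; i < n; ++i) {
+                /* rotate rows j and j+1 of column i */
+                float temp = a[(j + 1) + i * lda];
+                a[(j + 1) + i * lda] = cj * temp - sj * a[j + i * lda];
+                a[j + i * lda]       = sj * temp + cj * a[j + i * lda];
+            }
+        }
+    }
+}
+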
+/* Subroutine */ int slasrt_(char *id, integer *n, real *d__, integer *info)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static real d1, d2, d3;
+ static integer dir;
+ static real tmp;
+ static integer endd;
+ extern logical lsame_(char *, char *);
+ static integer stack[64] /* was [2][32] */;
+ static real dmnmx;
+ static integer start;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static integer stkpnt;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ Sort the numbers in D in increasing order (if ID = 'I') or
+ in decreasing order (if ID = 'D' ).
+
+ Use Quick Sort, reverting to Insertion sort on arrays of
+ size <= 20. Dimension of STACK limits N to about 2**32.
+
+ Arguments
+ =========
+
+ ID (input) CHARACTER*1
+ = 'I': sort D in increasing order;
+ = 'D': sort D in decreasing order.
+
+ N (input) INTEGER
+ The length of the array D.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the array to be sorted.
+ On exit, D has been sorted into increasing order
+ (D(1) <= ... <= D(N) ) or into decreasing order
+ (D(1) >= ... >= D(N) ), depending on ID.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+   Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+ dir = -1;
+ if (lsame_(id, "D")) {
+ dir = 0;
+ } else if (lsame_(id, "I")) {
+ dir = 1;
+ }
+ if (dir == -1) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLASRT", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 1) {
+ return 0;
+ }
+
+ stkpnt = 1;
+ stack[0] = 1;
+ stack[1] = *n;
+L10:
+ start = stack[(stkpnt << 1) - 2];
+ endd = stack[(stkpnt << 1) - 1];
+ --stkpnt;
+ if (endd - start <= 20 && endd - start > 0) {
+
+/* Do Insertion sort on D( START:ENDD ) */
+
+ if (dir == 0) {
+
+/* Sort into decreasing order */
+
+ i__1 = endd;
+ for (i__ = start + 1; i__ <= i__1; ++i__) {
+ i__2 = start + 1;
+ for (j = i__; j >= i__2; --j) {
+ if (d__[j] > d__[j - 1]) {
+ dmnmx = d__[j];
+ d__[j] = d__[j - 1];
+ d__[j - 1] = dmnmx;
+ } else {
+ goto L30;
+ }
+/* L20: */
+ }
+L30:
+ ;
+ }
+
+ } else {
+
+/* Sort into increasing order */
+
+ i__1 = endd;
+ for (i__ = start + 1; i__ <= i__1; ++i__) {
+ i__2 = start + 1;
+ for (j = i__; j >= i__2; --j) {
+ if (d__[j] < d__[j - 1]) {
+ dmnmx = d__[j];
+ d__[j] = d__[j - 1];
+ d__[j - 1] = dmnmx;
+ } else {
+ goto L50;
+ }
+/* L40: */
+ }
+L50:
+ ;
+ }
+
+ }
+
+ } else if (endd - start > 20) {
+
+/*
+ Partition D( START:ENDD ) and stack parts, largest one first
+
+ Choose partition entry as median of 3
+*/
+
+ d1 = d__[start];
+ d2 = d__[endd];
+ i__ = (start + endd) / 2;
+ d3 = d__[i__];
+ if (d1 < d2) {
+ if (d3 < d1) {
+ dmnmx = d1;
+ } else if (d3 < d2) {
+ dmnmx = d3;
+ } else {
+ dmnmx = d2;
+ }
+ } else {
+ if (d3 < d2) {
+ dmnmx = d2;
+ } else if (d3 < d1) {
+ dmnmx = d3;
+ } else {
+ dmnmx = d1;
+ }
+ }
+
+ if (dir == 0) {
+
+/* Sort into decreasing order */
+
+ i__ = start - 1;
+ j = endd + 1;
+L60:
+L70:
+ --j;
+ if (d__[j] < dmnmx) {
+ goto L70;
+ }
+L80:
+ ++i__;
+ if (d__[i__] > dmnmx) {
+ goto L80;
+ }
+ if (i__ < j) {
+ tmp = d__[i__];
+ d__[i__] = d__[j];
+ d__[j] = tmp;
+ goto L60;
+ }
+ if (j - start > endd - j - 1) {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ } else {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ }
+ } else {
+
+/* Sort into increasing order */
+
+ i__ = start - 1;
+ j = endd + 1;
+L90:
+L100:
+ --j;
+ if (d__[j] > dmnmx) {
+ goto L100;
+ }
+L110:
+ ++i__;
+ if (d__[i__] < dmnmx) {
+ goto L110;
+ }
+ if (i__ < j) {
+ tmp = d__[i__];
+ d__[i__] = d__[j];
+ d__[j] = tmp;
+ goto L90;
+ }
+ if (j - start > endd - j - 1) {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ } else {
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = j + 1;
+ stack[(stkpnt << 1) - 1] = endd;
+ ++stkpnt;
+ stack[(stkpnt << 1) - 2] = start;
+ stack[(stkpnt << 1) - 1] = j;
+ }
+ }
+ }
+ if (stkpnt > 0) {
+ goto L10;
+ }
+ return 0;
+
+/* End of SLASRT */
+
+} /* slasrt_ */
+
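+/*
+   Illustrative sketch, not part of LAPACK: the insertion sort that SLASRT
+   falls back to for subarrays of 20 or fewer elements, written for a plain
+   0-based float array and increasing order only.  The name is ours.
+*/
+static void insertion_sort_sketch(float *d, int n)
+{
+    int i, j;
+    for (i = 1; i < n; ++i) {
+        /* sink d[i] into the already sorted prefix d[0..i-1] */
+        for (j = i; j >= 1 && d[j] < d[j - 1]; --j) {
+            float tmp = d[j];
+            d[j] = d[j - 1];
+            d[j - 1] = tmp;
+        }
+    }
+}
+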
+/* Subroutine */ int slassq_(integer *n, real *x, integer *incx, real *scale,
+ real *sumsq)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer ix;
+ static real absxi;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASSQ returns the values scl and smsq such that
+
+ ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+
+ where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
+ assumed to be non-negative and scl returns the value
+
+ scl = max( scale, abs( x( i ) ) ).
+
+ scale and sumsq must be supplied in SCALE and SUMSQ and
+ scl and smsq are overwritten on SCALE and SUMSQ respectively.
+
+ The routine makes only one pass through the vector x.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of elements to be used from the vector X.
+
+ X (input) REAL array, dimension (N)
+ The vector for which a scaled sum of squares is computed.
+ x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+
+ INCX (input) INTEGER
+ The increment between successive values of the vector X.
+ INCX > 0.
+
+ SCALE (input/output) REAL
+ On entry, the value scale in the equation above.
+ On exit, SCALE is overwritten with scl , the scaling factor
+ for the sum of squares.
+
+ SUMSQ (input/output) REAL
+ On entry, the value sumsq in the equation above.
+ On exit, SUMSQ is overwritten with smsq , the basic sum of
+ squares from which scl has been factored out.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n > 0) {
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ if (x[ix] != 0.f) {
+ absxi = (r__1 = x[ix], dabs(r__1));
+ if (*scale < absxi) {
+/* Computing 2nd power */
+ r__1 = *scale / absxi;
+ *sumsq = *sumsq * (r__1 * r__1) + 1;
+ *scale = absxi;
+ } else {
+/* Computing 2nd power */
+ r__1 = absxi / *scale;
+ *sumsq += r__1 * r__1;
+ }
+ }
+/* L10: */
+ }
+ }
+ return 0;
+
+/* End of SLASSQ */
+
+} /* slassq_ */
+
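+/*
+   Illustrative sketch, not part of LAPACK: the scaled-sum-of-squares update
+   performed by SLASSQ, for a contiguous 0-based array.  Starting from
+   scale = 0, sumsq = 1, the 2-norm of x is scale * sqrt(sumsq) afterwards,
+   and no entry is ever squared at its full magnitude.  Names are ours.
+*/
+static void scaled_sumsq_sketch(const float *x, int n,
+                                float *scale, float *sumsq)
+{
+    int i;
+    for (i = 0; i < n; ++i) {
+        if (x[i] != 0.f) {
+            float absxi = x[i] < 0.f ? -x[i] : x[i];
+            if (*scale < absxi) {
+                float r = *scale / absxi;
+                *sumsq = *sumsq * (r * r) + 1.f;  /* re-scale the running sum */
+                *scale = absxi;
+            } else {
+                float r = absxi / *scale;
+                *sumsq += r * r;
+            }
+        }
+    }
+}
+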
+/* Subroutine */ int slasv2_(real *f, real *g, real *h__, real *ssmin, real *
+ ssmax, real *snr, real *csr, real *snl, real *csl)
+{
+ /* System generated locals */
+ real r__1;
+
+ /* Local variables */
+ static real a, d__, l, m, r__, s, t, fa, ga, ha, ft, gt, ht, mm, tt, clt,
+ crt, slt, srt;
+ static integer pmax;
+ static real temp;
+ static logical swap;
+ static real tsign;
+ static logical gasmal;
+ extern doublereal slamch_(char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASV2 computes the singular value decomposition of a 2-by-2
+ triangular matrix
+ [ F G ]
+ [ 0 H ].
+ On return, abs(SSMAX) is the larger singular value, abs(SSMIN) is the
+ smaller singular value, and (CSL,SNL) and (CSR,SNR) are the left and
+ right singular vectors for abs(SSMAX), giving the decomposition
+
+ [ CSL SNL ] [ F G ] [ CSR -SNR ] = [ SSMAX 0 ]
+ [-SNL CSL ] [ 0 H ] [ SNR CSR ] [ 0 SSMIN ].
+
+ Arguments
+ =========
+
+ F (input) REAL
+ The (1,1) element of the 2-by-2 matrix.
+
+ G (input) REAL
+ The (1,2) element of the 2-by-2 matrix.
+
+ H (input) REAL
+ The (2,2) element of the 2-by-2 matrix.
+
+ SSMIN (output) REAL
+ abs(SSMIN) is the smaller singular value.
+
+ SSMAX (output) REAL
+ abs(SSMAX) is the larger singular value.
+
+ SNL (output) REAL
+ CSL (output) REAL
+ The vector (CSL, SNL) is a unit left singular vector for the
+ singular value abs(SSMAX).
+
+ SNR (output) REAL
+ CSR (output) REAL
+ The vector (CSR, SNR) is a unit right singular vector for the
+ singular value abs(SSMAX).
+
+ Further Details
+ ===============
+
+ Any input parameter may be aliased with any output parameter.
+
+ Barring over/underflow and assuming a guard digit in subtraction, all
+ output quantities are correct to within a few units in the last
+ place (ulps).
+
+ In IEEE arithmetic, the code works correctly if one matrix element is
+ infinite.
+
+ Overflow will not occur unless the largest singular value itself
+ overflows or is within a few ulps of overflow. (On machines with
+ partial overflow, like the Cray, overflow may occur if the largest
+ singular value is within a factor of 2 of overflow.)
+
+ Underflow is harmless if underflow is gradual. Otherwise, results
+ may correspond to a matrix modified by perturbations of size near
+ the underflow threshold.
+
+ =====================================================================
+*/
+
+
+ ft = *f;
+ fa = dabs(ft);
+ ht = *h__;
+ ha = dabs(*h__);
+
+/*
+ PMAX points to the maximum absolute element of matrix
+ PMAX = 1 if F largest in absolute values
+ PMAX = 2 if G largest in absolute values
+ PMAX = 3 if H largest in absolute values
+*/
+
+ pmax = 1;
+ swap = ha > fa;
+ if (swap) {
+ pmax = 3;
+ temp = ft;
+ ft = ht;
+ ht = temp;
+ temp = fa;
+ fa = ha;
+ ha = temp;
+
+/* Now FA .ge. HA */
+
+ }
+ gt = *g;
+ ga = dabs(gt);
+ if (ga == 0.f) {
+
+/* Diagonal matrix */
+
+ *ssmin = ha;
+ *ssmax = fa;
+ clt = 1.f;
+ crt = 1.f;
+ slt = 0.f;
+ srt = 0.f;
+ } else {
+ gasmal = TRUE_;
+ if (ga > fa) {
+ pmax = 2;
+ if (fa / ga < slamch_("EPS")) {
+
+/* Case of very large GA */
+
+ gasmal = FALSE_;
+ *ssmax = ga;
+ if (ha > 1.f) {
+ *ssmin = fa / (ga / ha);
+ } else {
+ *ssmin = fa / ga * ha;
+ }
+ clt = 1.f;
+ slt = ht / gt;
+ srt = 1.f;
+ crt = ft / gt;
+ }
+ }
+ if (gasmal) {
+
+/* Normal case */
+
+ d__ = fa - ha;
+ if (d__ == fa) {
+
+/* Copes with infinite F or H */
+
+ l = 1.f;
+ } else {
+ l = d__ / fa;
+ }
+
+/* Note that 0 .le. L .le. 1 */
+
+ m = gt / ft;
+
+/* Note that abs(M) .le. 1/macheps */
+
+ t = 2.f - l;
+
+/* Note that T .ge. 1 */
+
+ mm = m * m;
+ tt = t * t;
+ s = sqrt(tt + mm);
+
+/* Note that 1 .le. S .le. 1 + 1/macheps */
+
+ if (l == 0.f) {
+ r__ = dabs(m);
+ } else {
+ r__ = sqrt(l * l + mm);
+ }
+
+/* Note that 0 .le. R .le. 1 + 1/macheps */
+
+ a = (s + r__) * .5f;
+
+/* Note that 1 .le. A .le. 1 + abs(M) */
+
+ *ssmin = ha / a;
+ *ssmax = fa * a;
+ if (mm == 0.f) {
+
+/* Note that M is very tiny */
+
+ if (l == 0.f) {
+ t = r_sign(&c_b3178, &ft) * r_sign(&c_b15, &gt);
+ } else {
+ t = gt / r_sign(&d__, &ft) + m / t;
+ }
+ } else {
+ t = (m / (s + t) + m / (r__ + l)) * (a + 1.f);
+ }
+ l = sqrt(t * t + 4.f);
+ crt = 2.f / l;
+ srt = t / l;
+ clt = (crt + srt * m) / a;
+ slt = ht / ft * srt / a;
+ }
+ }
+ if (swap) {
+ *csl = srt;
+ *snl = crt;
+ *csr = slt;
+ *snr = clt;
+ } else {
+ *csl = clt;
+ *snl = slt;
+ *csr = crt;
+ *snr = srt;
+ }
+
+/* Correct signs of SSMAX and SSMIN */
+
+ if (pmax == 1) {
+ tsign = r_sign(&c_b15, csr) * r_sign(&c_b15, csl) * r_sign(&c_b15, f);
+ }
+ if (pmax == 2) {
+ tsign = r_sign(&c_b15, snr) * r_sign(&c_b15, csl) * r_sign(&c_b15, g);
+ }
+ if (pmax == 3) {
+ tsign = r_sign(&c_b15, snr) * r_sign(&c_b15, snl) * r_sign(&c_b15,
+ h__);
+ }
+ *ssmax = r_sign(ssmax, &tsign);
+ r__1 = tsign * r_sign(&c_b15, f) * r_sign(&c_b15, h__);
+ *ssmin = r_sign(ssmin, &r__1);
+ return 0;
+
+/* End of SLASV2 */
+
+} /* slasv2_ */
+
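+/*
+   Illustrative sketch, not part of LAPACK: forming the product
+   [ csl snl; -snl csl ] * [ f g; 0 h ] * [ csr -snr; snr csr ]
+   from SLASV2's outputs.  Up to rounding it equals diag(ssmax, ssmin),
+   which is one way to check the decomposition quoted in the Purpose
+   section.  out is stored row-major as { (1,1), (1,2), (2,1), (2,2) };
+   the name and layout are ours.
+*/
+static void slasv2_product_sketch(float f, float g, float h,
+                                  float csl, float snl,
+                                  float csr, float snr, float out[4])
+{
+    /* left rotation applied to the rows of [ f g; 0 h ] */
+    float r11 = csl * f;
+    float r12 = csl * g + snl * h;
+    float r21 = -snl * f;
+    float r22 = -snl * g + csl * h;
+    /* right rotation applied to the columns */
+    out[0] = r11 * csr + r12 * snr;   /* ~ ssmax */
+    out[1] = -r11 * snr + r12 * csr;  /* ~ 0     */
+    out[2] = r21 * csr + r22 * snr;   /* ~ 0     */
+    out[3] = -r21 * snr + r22 * csr;  /* ~ ssmin */
+}
+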
+/* Subroutine */ int slaswp_(integer *n, real *a, integer *lda, integer *k1,
+ integer *k2, integer *ipiv, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, k, i1, i2, n32, ip, ix, ix0, inc;
+ static real temp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASWP performs a series of row interchanges on the matrix A.
+ One row interchange is initiated for each of rows K1 through K2 of A.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the matrix of column dimension N to which the row
+ interchanges will be applied.
+ On exit, the permuted matrix.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+
+ K1 (input) INTEGER
+ The first element of IPIV for which a row interchange will
+ be done.
+
+ K2 (input) INTEGER
+ The last element of IPIV for which a row interchange will
+ be done.
+
+ IPIV (input) INTEGER array, dimension (K2*abs(INCX))
+ The vector of pivot indices. Only the elements in positions
+ K1 through K2 of IPIV are accessed.
+ IPIV(K) = L implies rows K and L are to be interchanged.
+
+ INCX (input) INTEGER
+          The increment between successive values of IPIV.  If INCX
+          is negative, the pivots are applied in reverse order.
+
+ Further Details
+ ===============
+
+ Modified by
+ R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA
+
+ =====================================================================
+
+
+ Interchange row I with row IPIV(I) for each of rows K1 through K2.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ if (*incx > 0) {
+ ix0 = *k1;
+ i1 = *k1;
+ i2 = *k2;
+ inc = 1;
+ } else if (*incx < 0) {
+ ix0 = (1 - *k2) * *incx + 1;
+ i1 = *k2;
+ i2 = *k1;
+ inc = -1;
+ } else {
+ return 0;
+ }
+
+ n32 = *n / 32 << 5;
+ if (n32 != 0) {
+ i__1 = n32;
+ for (j = 1; j <= i__1; j += 32) {
+ ix = ix0;
+ i__2 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__3)
+ {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__4 = j + 31;
+ for (k = j; k <= i__4; ++k) {
+ temp = a[i__ + k * a_dim1];
+ a[i__ + k * a_dim1] = a[ip + k * a_dim1];
+ a[ip + k * a_dim1] = temp;
+/* L10: */
+ }
+ }
+ ix += *incx;
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ if (n32 != *n) {
+ ++n32;
+ ix = ix0;
+ i__1 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__3) {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__2 = *n;
+ for (k = n32; k <= i__2; ++k) {
+ temp = a[i__ + k * a_dim1];
+ a[i__ + k * a_dim1] = a[ip + k * a_dim1];
+ a[ip + k * a_dim1] = temp;
+/* L40: */
+ }
+ }
+ ix += *incx;
+/* L50: */
+ }
+ }
+
+ return 0;
+
+/* End of SLASWP */
+
+} /* slaswp_ */
+
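+/*
+   Illustrative sketch, not part of LAPACK: the row interchanges that SLASWP
+   performs, without the 32-column blocking, for a 0-based, column-major
+   array with leading dimension lda, pivot indices ipiv[i] for
+   i = k1, ..., k2-1 and the INCX = 1 case only.  Names are ours.
+*/
+static void apply_row_pivots_sketch(int n, float *a, int lda,
+                                    int k1, int k2, const int *ipiv)
+{
+    int i, k;
+    for (i = k1; i < k2; ++i) {
+        int ip = ipiv[i];                /* swap row i with row ip */
+        if (ip != i) {
+            for (k = 0; k < n; ++k) {
+                float temp = a[i + k * lda];
+                a[i + k * lda] = a[ip + k * lda];
+                a[ip + k * lda] = temp;
+            }
+        }
+    }
+}
+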
+/* Subroutine */ int slasy2_(logical *ltranl, logical *ltranr, integer *isgn,
+ integer *n1, integer *n2, real *tl, integer *ldtl, real *tr, integer *
+ ldtr, real *b, integer *ldb, real *scale, real *x, integer *ldx, real
+ *xnorm, integer *info)
+{
+ /* Initialized data */
+
+ static integer locu12[4] = { 3,4,1,2 };
+ static integer locl21[4] = { 2,1,4,3 };
+ static integer locu22[4] = { 4,3,2,1 };
+ static logical xswpiv[4] = { FALSE_,FALSE_,TRUE_,TRUE_ };
+ static logical bswpiv[4] = { FALSE_,TRUE_,FALSE_,TRUE_ };
+
+ /* System generated locals */
+ integer b_dim1, b_offset, tl_dim1, tl_offset, tr_dim1, tr_offset, x_dim1,
+ x_offset;
+ real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static real x2[2], l21, u11, u12;
+ static integer ip, jp;
+ static real u22, t16[16] /* was [4][4] */, gam, bet, eps, sgn, tmp[4],
+ tau1, btmp[4], smin;
+ static integer ipiv;
+ static real temp;
+ static integer jpiv[4];
+ static real xmax;
+ static integer ipsv, jpsv;
+ static logical bswap;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *), sswap_(integer *, real *, integer *, real *, integer *
+ );
+ static logical xswap;
+ extern doublereal slamch_(char *);
+ extern integer isamax_(integer *, real *, integer *);
+ static real smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLASY2 solves for the N1 by N2 matrix X, 1 <= N1,N2 <= 2, in
+
+ op(TL)*X + ISGN*X*op(TR) = SCALE*B,
+
+ where TL is N1 by N1, TR is N2 by N2, B is N1 by N2, and ISGN = 1 or
+ -1. op(T) = T or T', where T' denotes the transpose of T.
+
+ Arguments
+ =========
+
+ LTRANL (input) LOGICAL
+ On entry, LTRANL specifies the op(TL):
+ = .FALSE., op(TL) = TL,
+ = .TRUE., op(TL) = TL'.
+
+ LTRANR (input) LOGICAL
+ On entry, LTRANR specifies the op(TR):
+ = .FALSE., op(TR) = TR,
+ = .TRUE., op(TR) = TR'.
+
+ ISGN (input) INTEGER
+ On entry, ISGN specifies the sign of the equation
+ as described before. ISGN may only be 1 or -1.
+
+ N1 (input) INTEGER
+ On entry, N1 specifies the order of matrix TL.
+ N1 may only be 0, 1 or 2.
+
+ N2 (input) INTEGER
+ On entry, N2 specifies the order of matrix TR.
+ N2 may only be 0, 1 or 2.
+
+ TL (input) REAL array, dimension (LDTL,2)
+ On entry, TL contains an N1 by N1 matrix.
+
+ LDTL (input) INTEGER
+ The leading dimension of the matrix TL. LDTL >= max(1,N1).
+
+ TR (input) REAL array, dimension (LDTR,2)
+ On entry, TR contains an N2 by N2 matrix.
+
+ LDTR (input) INTEGER
+ The leading dimension of the matrix TR. LDTR >= max(1,N2).
+
+ B (input) REAL array, dimension (LDB,2)
+ On entry, the N1 by N2 matrix B contains the right-hand
+ side of the equation.
+
+ LDB (input) INTEGER
+ The leading dimension of the matrix B. LDB >= max(1,N1).
+
+ SCALE (output) REAL
+ On exit, SCALE contains the scale factor. SCALE is chosen
+ less than or equal to 1 to prevent the solution overflowing.
+
+ X (output) REAL array, dimension (LDX,2)
+ On exit, X contains the N1 by N2 solution.
+
+ LDX (input) INTEGER
+ The leading dimension of the matrix X. LDX >= max(1,N1).
+
+ XNORM (output) REAL
+ On exit, XNORM is the infinity-norm of the solution.
+
+ INFO (output) INTEGER
+ On exit, INFO is set to
+ 0: successful exit.
+ 1: TL and TR have too close eigenvalues, so TL or
+ TR is perturbed to get a nonsingular equation.
+ NOTE: In the interests of speed, this routine does not
+ check the inputs for errors.
+
+ =====================================================================
+*/
+
+ /* Parameter adjustments */
+ tl_dim1 = *ldtl;
+ tl_offset = 1 + tl_dim1;
+ tl -= tl_offset;
+ tr_dim1 = *ldtr;
+ tr_offset = 1 + tr_dim1;
+ tr -= tr_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+
+ /* Function Body */
+
+/* Do not check the input parameters for errors */
+
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n1 == 0 || *n2 == 0) {
+ return 0;
+ }
+
+/* Set constants to control overflow */
+
+ eps = slamch_("P");
+ smlnum = slamch_("S") / eps;
+ sgn = (real) (*isgn);
+
+ k = *n1 + *n1 + *n2 - 2;
+ switch (k) {
+ case 1: goto L10;
+ case 2: goto L20;
+ case 3: goto L30;
+ case 4: goto L50;
+ }
+
+/* 1 by 1: TL11*X + SGN*X*TR11 = B11 */
+
+L10:
+ tau1 = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ bet = dabs(tau1);
+ if (bet <= smlnum) {
+ tau1 = smlnum;
+ bet = smlnum;
+ *info = 1;
+ }
+
+ *scale = 1.f;
+ gam = (r__1 = b[b_dim1 + 1], dabs(r__1));
+ if (smlnum * gam > bet) {
+ *scale = 1.f / gam;
+ }
+
+ x[x_dim1 + 1] = b[b_dim1 + 1] * *scale / tau1;
+ *xnorm = (r__1 = x[x_dim1 + 1], dabs(r__1));
+ return 0;
+
+/*
+ 1 by 2:
+ TL11*[X11 X12] + ISGN*[X11 X12]*op[TR11 TR12] = [B11 B12]
+ [TR21 TR22]
+*/
+
+L20:
+
+/*
+ Computing MAX
+ Computing MAX
+*/
+ r__7 = (r__1 = tl[tl_dim1 + 1], dabs(r__1)), r__8 = (r__2 = tr[tr_dim1 +
+ 1], dabs(r__2)), r__7 = max(r__7,r__8), r__8 = (r__3 = tr[(
+ tr_dim1 << 1) + 1], dabs(r__3)), r__7 = max(r__7,r__8), r__8 = (
+ r__4 = tr[tr_dim1 + 2], dabs(r__4)), r__7 = max(r__7,r__8), r__8 =
+ (r__5 = tr[(tr_dim1 << 1) + 2], dabs(r__5));
+ r__6 = eps * dmax(r__7,r__8);
+ smin = dmax(r__6,smlnum);
+ tmp[0] = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ tmp[3] = tl[tl_dim1 + 1] + sgn * tr[(tr_dim1 << 1) + 2];
+ if (*ltranr) {
+ tmp[1] = sgn * tr[tr_dim1 + 2];
+ tmp[2] = sgn * tr[(tr_dim1 << 1) + 1];
+ } else {
+ tmp[1] = sgn * tr[(tr_dim1 << 1) + 1];
+ tmp[2] = sgn * tr[tr_dim1 + 2];
+ }
+ btmp[0] = b[b_dim1 + 1];
+ btmp[1] = b[(b_dim1 << 1) + 1];
+ goto L40;
+
+/*
+ 2 by 1:
+ op[TL11 TL12]*[X11] + ISGN* [X11]*TR11 = [B11]
+ [TL21 TL22] [X21] [X21] [B21]
+*/
+
+L30:
+/*
+ Computing MAX
+ Computing MAX
+*/
+ r__7 = (r__1 = tr[tr_dim1 + 1], dabs(r__1)), r__8 = (r__2 = tl[tl_dim1 +
+ 1], dabs(r__2)), r__7 = max(r__7,r__8), r__8 = (r__3 = tl[(
+ tl_dim1 << 1) + 1], dabs(r__3)), r__7 = max(r__7,r__8), r__8 = (
+ r__4 = tl[tl_dim1 + 2], dabs(r__4)), r__7 = max(r__7,r__8), r__8 =
+ (r__5 = tl[(tl_dim1 << 1) + 2], dabs(r__5));
+ r__6 = eps * dmax(r__7,r__8);
+ smin = dmax(r__6,smlnum);
+ tmp[0] = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ tmp[3] = tl[(tl_dim1 << 1) + 2] + sgn * tr[tr_dim1 + 1];
+ if (*ltranl) {
+ tmp[1] = tl[(tl_dim1 << 1) + 1];
+ tmp[2] = tl[tl_dim1 + 2];
+ } else {
+ tmp[1] = tl[tl_dim1 + 2];
+ tmp[2] = tl[(tl_dim1 << 1) + 1];
+ }
+ btmp[0] = b[b_dim1 + 1];
+ btmp[1] = b[b_dim1 + 2];
+L40:
+
+/*
+ Solve 2 by 2 system using complete pivoting.
+ Set pivots less than SMIN to SMIN.
+*/
+
+ ipiv = isamax_(&c__4, tmp, &c__1);
+ u11 = tmp[ipiv - 1];
+ if (dabs(u11) <= smin) {
+ *info = 1;
+ u11 = smin;
+ }
+ u12 = tmp[locu12[ipiv - 1] - 1];
+ l21 = tmp[locl21[ipiv - 1] - 1] / u11;
+ u22 = tmp[locu22[ipiv - 1] - 1] - u12 * l21;
+ xswap = xswpiv[ipiv - 1];
+ bswap = bswpiv[ipiv - 1];
+ if (dabs(u22) <= smin) {
+ *info = 1;
+ u22 = smin;
+ }
+ if (bswap) {
+ temp = btmp[1];
+ btmp[1] = btmp[0] - l21 * temp;
+ btmp[0] = temp;
+ } else {
+ btmp[1] -= l21 * btmp[0];
+ }
+ *scale = 1.f;
+ if (smlnum * 2.f * dabs(btmp[1]) > dabs(u22) || smlnum * 2.f * dabs(btmp[
+ 0]) > dabs(u11)) {
+/* Computing MAX */
+ r__1 = dabs(btmp[0]), r__2 = dabs(btmp[1]);
+ *scale = .5f / dmax(r__1,r__2);
+ btmp[0] *= *scale;
+ btmp[1] *= *scale;
+ }
+ x2[1] = btmp[1] / u22;
+ x2[0] = btmp[0] / u11 - u12 / u11 * x2[1];
+ if (xswap) {
+ temp = x2[1];
+ x2[1] = x2[0];
+ x2[0] = temp;
+ }
+ x[x_dim1 + 1] = x2[0];
+ if (*n1 == 1) {
+ x[(x_dim1 << 1) + 1] = x2[1];
+ *xnorm = (r__1 = x[x_dim1 + 1], dabs(r__1)) + (r__2 = x[(x_dim1 << 1)
+ + 1], dabs(r__2));
+ } else {
+ x[x_dim1 + 2] = x2[1];
+/* Computing MAX */
+ r__3 = (r__1 = x[x_dim1 + 1], dabs(r__1)), r__4 = (r__2 = x[x_dim1 +
+ 2], dabs(r__2));
+ *xnorm = dmax(r__3,r__4);
+ }
+ return 0;
+
+/*
+ 2 by 2:
+ op[TL11 TL12]*[X11 X12] +ISGN* [X11 X12]*op[TR11 TR12] = [B11 B12]
+ [TL21 TL22] [X21 X22] [X21 X22] [TR21 TR22] [B21 B22]
+
+ Solve equivalent 4 by 4 system using complete pivoting.
+ Set pivots less than SMIN to SMIN.
+*/
+
+L50:
+/* Computing MAX */
+ r__5 = (r__1 = tr[tr_dim1 + 1], dabs(r__1)), r__6 = (r__2 = tr[(tr_dim1 <<
+ 1) + 1], dabs(r__2)), r__5 = max(r__5,r__6), r__6 = (r__3 = tr[
+ tr_dim1 + 2], dabs(r__3)), r__5 = max(r__5,r__6), r__6 = (r__4 =
+ tr[(tr_dim1 << 1) + 2], dabs(r__4));
+ smin = dmax(r__5,r__6);
+/* Computing MAX */
+ r__5 = smin, r__6 = (r__1 = tl[tl_dim1 + 1], dabs(r__1)), r__5 = max(r__5,
+ r__6), r__6 = (r__2 = tl[(tl_dim1 << 1) + 1], dabs(r__2)), r__5 =
+ max(r__5,r__6), r__6 = (r__3 = tl[tl_dim1 + 2], dabs(r__3)), r__5
+ = max(r__5,r__6), r__6 = (r__4 = tl[(tl_dim1 << 1) + 2], dabs(
+ r__4));
+ smin = dmax(r__5,r__6);
+/* Computing MAX */
+ r__1 = eps * smin;
+ smin = dmax(r__1,smlnum);
+ btmp[0] = 0.f;
+ scopy_(&c__16, btmp, &c__0, t16, &c__1);
+ t16[0] = tl[tl_dim1 + 1] + sgn * tr[tr_dim1 + 1];
+ t16[5] = tl[(tl_dim1 << 1) + 2] + sgn * tr[tr_dim1 + 1];
+ t16[10] = tl[tl_dim1 + 1] + sgn * tr[(tr_dim1 << 1) + 2];
+ t16[15] = tl[(tl_dim1 << 1) + 2] + sgn * tr[(tr_dim1 << 1) + 2];
+ if (*ltranl) {
+ t16[4] = tl[tl_dim1 + 2];
+ t16[1] = tl[(tl_dim1 << 1) + 1];
+ t16[14] = tl[tl_dim1 + 2];
+ t16[11] = tl[(tl_dim1 << 1) + 1];
+ } else {
+ t16[4] = tl[(tl_dim1 << 1) + 1];
+ t16[1] = tl[tl_dim1 + 2];
+ t16[14] = tl[(tl_dim1 << 1) + 1];
+ t16[11] = tl[tl_dim1 + 2];
+ }
+ if (*ltranr) {
+ t16[8] = sgn * tr[(tr_dim1 << 1) + 1];
+ t16[13] = sgn * tr[(tr_dim1 << 1) + 1];
+ t16[2] = sgn * tr[tr_dim1 + 2];
+ t16[7] = sgn * tr[tr_dim1 + 2];
+ } else {
+ t16[8] = sgn * tr[tr_dim1 + 2];
+ t16[13] = sgn * tr[tr_dim1 + 2];
+ t16[2] = sgn * tr[(tr_dim1 << 1) + 1];
+ t16[7] = sgn * tr[(tr_dim1 << 1) + 1];
+ }
+ btmp[0] = b[b_dim1 + 1];
+ btmp[1] = b[b_dim1 + 2];
+ btmp[2] = b[(b_dim1 << 1) + 1];
+ btmp[3] = b[(b_dim1 << 1) + 2];
+
+/* Perform elimination */
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ xmax = 0.f;
+ for (ip = i__; ip <= 4; ++ip) {
+ for (jp = i__; jp <= 4; ++jp) {
+ if ((r__1 = t16[ip + (jp << 2) - 5], dabs(r__1)) >= xmax) {
+ xmax = (r__1 = t16[ip + (jp << 2) - 5], dabs(r__1));
+ ipsv = ip;
+ jpsv = jp;
+ }
+/* L60: */
+ }
+/* L70: */
+ }
+ if (ipsv != i__) {
+ sswap_(&c__4, &t16[ipsv - 1], &c__4, &t16[i__ - 1], &c__4);
+ temp = btmp[i__ - 1];
+ btmp[i__ - 1] = btmp[ipsv - 1];
+ btmp[ipsv - 1] = temp;
+ }
+ if (jpsv != i__) {
+ sswap_(&c__4, &t16[(jpsv << 2) - 4], &c__1, &t16[(i__ << 2) - 4],
+ &c__1);
+ }
+ jpiv[i__ - 1] = jpsv;
+ if ((r__1 = t16[i__ + (i__ << 2) - 5], dabs(r__1)) < smin) {
+ *info = 1;
+ t16[i__ + (i__ << 2) - 5] = smin;
+ }
+ for (j = i__ + 1; j <= 4; ++j) {
+ t16[j + (i__ << 2) - 5] /= t16[i__ + (i__ << 2) - 5];
+ btmp[j - 1] -= t16[j + (i__ << 2) - 5] * btmp[i__ - 1];
+ for (k = i__ + 1; k <= 4; ++k) {
+ t16[j + (k << 2) - 5] -= t16[j + (i__ << 2) - 5] * t16[i__ + (
+ k << 2) - 5];
+/* L80: */
+ }
+/* L90: */
+ }
+/* L100: */
+ }
+ if (dabs(t16[15]) < smin) {
+ t16[15] = smin;
+ }
+ *scale = 1.f;
+ if (smlnum * 8.f * dabs(btmp[0]) > dabs(t16[0]) || smlnum * 8.f * dabs(
+ btmp[1]) > dabs(t16[5]) || smlnum * 8.f * dabs(btmp[2]) > dabs(
+ t16[10]) || smlnum * 8.f * dabs(btmp[3]) > dabs(t16[15])) {
+/* Computing MAX */
+ r__1 = dabs(btmp[0]), r__2 = dabs(btmp[1]), r__1 = max(r__1,r__2),
+ r__2 = dabs(btmp[2]), r__1 = max(r__1,r__2), r__2 = dabs(btmp[
+ 3]);
+ *scale = .125f / dmax(r__1,r__2);
+ btmp[0] *= *scale;
+ btmp[1] *= *scale;
+ btmp[2] *= *scale;
+ btmp[3] *= *scale;
+ }
+ for (i__ = 1; i__ <= 4; ++i__) {
+ k = 5 - i__;
+ temp = 1.f / t16[k + (k << 2) - 5];
+ tmp[k - 1] = btmp[k - 1] * temp;
+ for (j = k + 1; j <= 4; ++j) {
+ tmp[k - 1] -= temp * t16[k + (j << 2) - 5] * tmp[j - 1];
+/* L110: */
+ }
+/* L120: */
+ }
+ for (i__ = 1; i__ <= 3; ++i__) {
+ if (jpiv[4 - i__ - 1] != 4 - i__) {
+ temp = tmp[4 - i__ - 1];
+ tmp[4 - i__ - 1] = tmp[jpiv[4 - i__ - 1] - 1];
+ tmp[jpiv[4 - i__ - 1] - 1] = temp;
+ }
+/* L130: */
+ }
+ x[x_dim1 + 1] = tmp[0];
+ x[x_dim1 + 2] = tmp[1];
+ x[(x_dim1 << 1) + 1] = tmp[2];
+ x[(x_dim1 << 1) + 2] = tmp[3];
+/* Computing MAX */
+ r__1 = dabs(tmp[0]) + dabs(tmp[2]), r__2 = dabs(tmp[1]) + dabs(tmp[3]);
+ *xnorm = dmax(r__1,r__2);
+ return 0;
+
+/* End of SLASY2 */
+
+} /* slasy2_ */
+
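+/*
+   Illustrative sketch, not part of LAPACK: the 1 by 1 case of SLASY2,
+   tl11*x + isgn*x*tr11 = scale*b11, with the same pivot perturbation and
+   right-hand-side scaling as the L10 branch above.  smlnum stands for the
+   value computed above from slamch; the function name is ours.
+*/
+static float sylvester_1x1_sketch(float tl11, float tr11, int isgn,
+                                  float b11, float smlnum, float *scale)
+{
+    float tau = tl11 + (float)isgn * tr11;
+    float bet = tau < 0.f ? -tau : tau;
+    float gam = b11 < 0.f ? -b11 : b11;
+    if (bet <= smlnum) {        /* nearly singular: perturb the pivot */
+        tau = smlnum;
+        bet = smlnum;
+    }
+    *scale = 1.f;
+    if (smlnum * gam > bet) {   /* scale down b to keep x representable */
+        *scale = 1.f / gam;
+    }
+    return b11 * *scale / tau;
+}
+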
+/* Subroutine */ int slatrd_(char *uplo, integer *n, integer *nb, real *a,
+ integer *lda, real *e, real *tau, real *w, integer *ldw)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, iw;
+ extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
+ static real alpha;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sgemv_(char *, integer *, integer *, real *, real *, integer *,
+ real *, integer *, real *, real *, integer *), saxpy_(
+ integer *, real *, real *, integer *, real *, integer *), ssymv_(
+ char *, integer *, real *, real *, integer *, real *, integer *,
+ real *, real *, integer *), slarfg_(integer *, real *,
+ real *, integer *, real *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLATRD reduces NB rows and columns of a real symmetric matrix A to
+ symmetric tridiagonal form by an orthogonal similarity
+ transformation Q' * A * Q, and returns the matrices V and W which are
+ needed to apply the transformation to the unreduced part of A.
+
+ If UPLO = 'U', SLATRD reduces the last NB rows and columns of a
+ matrix, of which the upper triangle is supplied;
+ if UPLO = 'L', SLATRD reduces the first NB rows and columns of a
+ matrix, of which the lower triangle is supplied.
+
+ This is an auxiliary routine called by SSYTRD.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ NB (input) INTEGER
+ The number of rows and columns to be reduced.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit:
+ if UPLO = 'U', the last NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+          the diagonal elements of A; the elements above the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors;
+ if UPLO = 'L', the first NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+          the diagonal elements of A; the elements below the diagonal,
+ with the array TAU, represent the orthogonal matrix Q as a
+ product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+          The leading dimension of the array A.  LDA >= max(1,N).
+
+ E (output) REAL array, dimension (N-1)
+ If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
+ elements of the last NB columns of the reduced matrix;
+ if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
+ the first NB columns of the reduced matrix.
+
+ TAU (output) REAL array, dimension (N-1)
+ The scalar factors of the elementary reflectors, stored in
+ TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
+ See Further Details.
+
+ W (output) REAL array, dimension (LDW,NB)
+ The n-by-nb matrix W required to update the unreduced part
+ of A.
+
+ LDW (input) INTEGER
+ The leading dimension of the array W. LDW >= max(1,N).
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n) H(n-1) . . . H(n-nb+1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
+ and tau in TAU(i-1).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
+ and tau in TAU(i).
+
+ The elements of the vectors v together form the n-by-nb matrix V
+ which is needed, with W, to apply the transformation to the unreduced
+ part of the matrix, using a symmetric rank-2k update of the form:
+ A := A - V*W' - W*V'.
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5 and nb = 2:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( a a a v4 v5 ) ( d )
+ ( a a v4 v5 ) ( 1 d )
+ ( a 1 v5 ) ( v1 1 a )
+ ( d 1 ) ( v1 v2 a a )
+ ( d ) ( v1 v2 a a a )
+
+ where d denotes a diagonal element of the reduced matrix, a denotes
+ an element of the original matrix that is unchanged, and vi denotes
+ an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --e;
+ --tau;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(uplo, "U")) {
+
+/* Reduce last NB columns of upper triangle */
+
+ i__1 = *n - *nb + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ iw = i__ - *n + *nb;
+ if (i__ < *n) {
+
+/* Update A(1:i,i) */
+
+ i__2 = *n - i__;
+ sgemv_("No transpose", &i__, &i__2, &c_b151, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &w[i__ + (iw + 1) * w_dim1], ldw, &
+ c_b15, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ sgemv_("No transpose", &i__, &i__2, &c_b151, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b15, &a[i__ * a_dim1 + 1], &c__1);
+ }
+ if (i__ > 1) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(1:i-2,i)
+*/
+
+ i__2 = i__ - 1;
+ slarfg_(&i__2, &a[i__ - 1 + i__ * a_dim1], &a[i__ * a_dim1 +
+ 1], &c__1, &tau[i__ - 1]);
+ e[i__ - 1] = a[i__ - 1 + i__ * a_dim1];
+ a[i__ - 1 + i__ * a_dim1] = 1.f;
+
+/* Compute W(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ ssymv_("Upper", &i__2, &c_b15, &a[a_offset], lda, &a[i__ *
+ a_dim1 + 1], &c__1, &c_b29, &w[iw * w_dim1 + 1], &
+ c__1);
+ if (i__ < *n) {
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &a[i__ * a_dim1 + 1], &c__1, &
+ c_b29, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[(i__ + 1)
+ * a_dim1 + 1], lda, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b15, &w[iw * w_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ * a_dim1 + 1], &c__1, &
+ c_b29, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &w[(iw + 1)
+ * w_dim1 + 1], ldw, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b15, &w[iw * w_dim1 + 1], &c__1);
+ }
+ i__2 = i__ - 1;
+ sscal_(&i__2, &tau[i__ - 1], &w[iw * w_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ alpha = tau[i__ - 1] * -.5f * sdot_(&i__2, &w[iw * w_dim1 + 1]
+ , &c__1, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ saxpy_(&i__2, &alpha, &a[i__ * a_dim1 + 1], &c__1, &w[iw *
+ w_dim1 + 1], &c__1);
+ }
+
+/* L10: */
+ }
+ } else {
+
+/* Reduce first NB columns of lower triangle */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:n,i) */
+
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + a_dim1],
+ lda, &w[i__ + w_dim1], ldw, &c_b15, &a[i__ + i__ * a_dim1]
+ , &c__1);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &w[i__ + w_dim1],
+ ldw, &a[i__ + a_dim1], lda, &c_b15, &a[i__ + i__ * a_dim1]
+ , &c__1);
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:n,i)
+*/
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*n) +
+ i__ * a_dim1], &c__1, &tau[i__]);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+ a[i__ + 1 + i__ * a_dim1] = 1.f;
+
+/* Compute W(i+1:n,i) */
+
+ i__2 = *n - i__;
+ ssymv_("Lower", &i__2, &c_b15, &a[i__ + 1 + (i__ + 1) *
+ a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b29, &w[i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &w[i__ + 1 + w_dim1]
+ , ldw, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &w[
+ i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[i__ + 1 +
+ a_dim1], lda, &w[i__ * w_dim1 + 1], &c__1, &c_b15, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + a_dim1]
+ , lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &w[
+ i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &w[i__ + 1 +
+ w_dim1], ldw, &w[i__ * w_dim1 + 1], &c__1, &c_b15, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ sscal_(&i__2, &tau[i__], &w[i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ alpha = tau[i__] * -.5f * sdot_(&i__2, &w[i__ + 1 + i__ *
+ w_dim1], &c__1, &a[i__ + 1 + i__ * a_dim1], &c__1);
+ i__2 = *n - i__;
+ saxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ }
+
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of SLATRD */
+
+} /* slatrd_ */
+
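+/*
+   Illustrative sketch, not part of LAPACK: the symmetric rank-2k update
+   A := A - V*W' - W*V' described under Further Details, applied to the
+   lower triangle of a 0-based, column-major n-by-n block with V and W
+   stored as n-by-nb arrays.  This is the update the caller makes with the
+   V and W produced here; names and layout are ours.
+*/
+static void rank2_update_lower_sketch(int n, int nb,
+                                      const float *v, int ldv,
+                                      const float *w, int ldw,
+                                      float *a, int lda)
+{
+    int i, j, k;
+    for (j = 0; j < n; ++j) {
+        for (i = j; i < n; ++i) {        /* lower triangle only */
+            float s = 0.f;
+            for (k = 0; k < nb; ++k) {
+                s += v[i + k * ldv] * w[j + k * ldw]
+                   + w[i + k * ldw] * v[j + k * ldv];
+            }
+            a[i + j * lda] -= s;
+        }
+    }
+}
+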
+/* Subroutine */ int slauu2_(char *uplo, integer *n, real *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ static real aii;
+ extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sgemv_(char *, integer *, integer *, real *, real *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAUU2 computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAUU2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ aii = a[i__ + i__ * a_dim1];
+ if (i__ < *n) {
+ i__2 = *n - i__ + 1;
+ a[i__ + i__ * a_dim1] = sdot_(&i__2, &a[i__ + i__ * a_dim1],
+ lda, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ sgemv_("No transpose", &i__2, &i__3, &c_b15, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ aii, &a[i__ * a_dim1 + 1], &c__1);
+ } else {
+ sscal_(&i__, &aii, &a[i__ * a_dim1 + 1], &c__1);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ aii = a[i__ + i__ * a_dim1];
+ if (i__ < *n) {
+ i__2 = *n - i__ + 1;
+ a[i__ + i__ * a_dim1] = sdot_(&i__2, &a[i__ + i__ * a_dim1], &
+ c__1, &a[i__ + i__ * a_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ sgemv_("Transpose", &i__2, &i__3, &c_b15, &a[i__ + 1 + a_dim1]
+ , lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &aii, &a[
+ i__ + a_dim1], lda);
+ } else {
+ sscal_(&i__, &aii, &a[i__ + a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of SLAUU2 */
+
+} /* slauu2_ */
+
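+/*
+   Illustrative sketch, not part of LAPACK: the upper triangle of U * U' for
+   an n-by-n upper triangular U stored 0-based and column-major with leading
+   dimension ldu, written out of place for clarity.  Entry (i,j) with i <= j
+   is the sum over k >= j of U(i,k)*U(j,k); SLAUU2 computes the same thing
+   in place with Level 2 BLAS, and SLAUUM blocks it over columns.  Names and
+   the out-of-place layout are ours.
+*/
+static void upper_uut_sketch(int n, const float *u, int ldu,
+                             float *c, int ldc)
+{
+    int i, j, k;
+    for (j = 0; j < n; ++j) {
+        for (i = 0; i <= j; ++i) {
+            float s = 0.f;
+            for (k = j; k < n; ++k) {   /* U(j,k) = 0 for k < j */
+                s += u[i + k * ldu] * u[j + k * ldu];
+            }
+            c[i + j * ldc] = s;
+        }
+    }
+}
+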
+/* Subroutine */ int slauum_(char *uplo, integer *n, real *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, ib, nb;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int strmm_(char *, char *, char *, char *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ ), ssyrk_(char *, char *, integer
+ *, integer *, real *, real *, integer *, real *, real *, integer *
+ ), slauu2_(char *, integer *, real *, integer *,
+ integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SLAUUM computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SLAUUM", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "SLAUUM", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ slauu2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ strmm_("Right", "Upper", "Transpose", "Non-unit", &i__3, &ib,
+ &c_b15, &a[i__ + i__ * a_dim1], lda, &a[i__ * a_dim1
+ + 1], lda)
+ ;
+ slauu2_("Upper", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ sgemm_("No transpose", "Transpose", &i__3, &ib, &i__4, &
+ c_b15, &a[(i__ + ib) * a_dim1 + 1], lda, &a[i__ +
+ (i__ + ib) * a_dim1], lda, &c_b15, &a[i__ *
+ a_dim1 + 1], lda);
+ i__3 = *n - i__ - ib + 1;
+ ssyrk_("Upper", "No transpose", &ib, &i__3, &c_b15, &a[
+ i__ + (i__ + ib) * a_dim1], lda, &c_b15, &a[i__ +
+ i__ * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ strmm_("Left", "Lower", "Transpose", "Non-unit", &ib, &i__3, &
+ c_b15, &a[i__ + i__ * a_dim1], lda, &a[i__ + a_dim1],
+ lda);
+ slauu2_("Lower", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ sgemm_("Transpose", "No transpose", &ib, &i__3, &i__4, &
+ c_b15, &a[i__ + ib + i__ * a_dim1], lda, &a[i__ +
+ ib + a_dim1], lda, &c_b15, &a[i__ + a_dim1], lda);
+ i__3 = *n - i__ - ib + 1;
+ ssyrk_("Lower", "Transpose", &ib, &i__3, &c_b15, &a[i__ +
+ ib + i__ * a_dim1], lda, &c_b15, &a[i__ + i__ *
+ a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of SLAUUM */
+
+} /* slauum_ */
+
+/* Subroutine */ int sorg2r_(integer *m, integer *n, integer *k, real *a,
+ integer *lda, real *tau, real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ slarf_(char *, integer *, integer *, real *, integer *, real *,
+ real *, integer *, real *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORG2R generates an m by n real matrix Q with orthonormal columns,
+ which is defined as the first n columns of a product of k elementary
+ reflectors of order m
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by SGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by SGEQRF in the first k columns of its array
+ argument A.
+ On exit, the m-by-n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEQRF.
+
+ WORK (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORG2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+/* Initialise columns k+1:n to columns of the unit matrix */
+
+ i__1 = *n;
+ for (j = *k + 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = 1; l <= i__2; ++l) {
+ a[l + j * a_dim1] = 0.f;
+/* L10: */
+ }
+ a[j + j * a_dim1] = 1.f;
+/* L20: */
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i) to A(i:m,i:n) from the left */
+
+ if (i__ < *n) {
+ a[i__ + i__ * a_dim1] = 1.f;
+ i__1 = *m - i__ + 1;
+ i__2 = *n - i__;
+ slarf_("Left", &i__1, &i__2, &a[i__ + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ }
+ if (i__ < *m) {
+ i__1 = *m - i__;
+ r__1 = -tau[i__];
+ sscal_(&i__1, &r__1, &a[i__ + 1 + i__ * a_dim1], &c__1);
+ }
+ a[i__ + i__ * a_dim1] = 1.f - tau[i__];
+
+/* Set A(1:i-1,i) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ a[l + i__ * a_dim1] = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of SORG2R */
+
+} /* sorg2r_ */
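+
+/*
+   A minimal usage sketch for sorg2r_, kept in a comment so nothing new is
+   compiled into this vendored file. The fixed sizes and the example_ helper
+   are hypothetical; it assumes a and tau hold an SGEQRF factorization and
+   uses the f2c "integer"/"real" typedefs from this file.
+
+       static void example_sorg2r(real *a, real *tau)
+       {
+           // a (lda x n, column-major) and tau hold the output of sgeqrf_
+           // applied to a 5-by-3 matrix; Q is formed explicitly in place.
+           integer m = 5, n = 3, k = 3, lda = 5, info = 0;
+           real work[3];    // the unblocked routine needs a workspace of length n
+           sorg2r_(&m, &n, &k, a, &lda, tau, work, &info);
+           // on success info == 0 and a holds the m-by-n matrix Q
+       }
+*/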
+
+/* Subroutine */ int sorgbr_(char *vect, integer *m, integer *n, integer *k,
+ real *a, integer *lda, real *tau, real *work, integer *lwork, integer
+ *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, nb, mn;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical wantq;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int sorglq_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *), sorgqr_(
+ integer *, integer *, integer *, real *, integer *, real *, real *
+ , integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORGBR generates one of the real orthogonal matrices Q or P**T
+ determined by SGEBRD when reducing a real matrix A to bidiagonal
+ form: A = Q * B * P**T. Q and P**T are defined as products of
+ elementary reflectors H(i) or G(i) respectively.
+
+ If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+ is of order M:
+ if m >= k, Q = H(1) H(2) . . . H(k) and SORGBR returns the first n
+ columns of Q, where m >= n >= k;
+ if m < k, Q = H(1) H(2) . . . H(m-1) and SORGBR returns Q as an
+ M-by-M matrix.
+
+ If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**T
+ is of order N:
+ if k < n, P**T = G(k) . . . G(2) G(1) and SORGBR returns the first m
+ rows of P**T, where n >= m >= k;
+ if k >= n, P**T = G(n-1) . . . G(2) G(1) and SORGBR returns P**T as
+ an N-by-N matrix.
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ Specifies whether the matrix Q or the matrix P**T is
+ required, as defined in the transformation applied by SGEBRD:
+ = 'Q': generate Q;
+ = 'P': generate P**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix Q or P**T to be returned.
+ M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q or P**T to be returned.
+ N >= 0.
+ If VECT = 'Q', M >= N >= min(M,K);
+ if VECT = 'P', N >= M >= min(N,K).
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original M-by-K
+ matrix reduced by SGEBRD.
+ If VECT = 'P', the number of rows in the original K-by-N
+ matrix reduced by SGEBRD.
+ K >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by SGEBRD.
+ On exit, the M-by-N matrix Q or P**T.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) REAL array, dimension
+ (min(M,K)) if VECT = 'Q'
+ (min(N,K)) if VECT = 'P'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i), which determines Q or P**T, as
+ returned by SGEBRD in its array argument TAUQ or TAUP.
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,min(M,N)).
+ For optimum performance LWORK >= min(M,N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(vect, "Q");
+ mn = min(*m,*n);
+ lquery = *lwork == -1;
+ if (! wantq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0 || wantq && (*n > *m || *n < min(*m,*k)) || ! wantq && (
+ *m > *n || *m < min(*n,*k))) {
+ *info = -3;
+ } else if (*k < 0) {
+ *info = -4;
+ } else if (*lda < max(1,*m)) {
+ *info = -6;
+ } else if (*lwork < max(1,mn) && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+ if (wantq) {
+ nb = ilaenv_(&c__1, "SORGQR", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ } else {
+ nb = ilaenv_(&c__1, "SORGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ }
+ lwkopt = max(1,mn) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORGBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ if (wantq) {
+
+/*
+ Form Q, determined by a call to SGEBRD to reduce an m-by-k
+ matrix
+*/
+
+ if (*m >= *k) {
+
+/* If m >= k, assume m >= n >= k */
+
+ sorgqr_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If m < k, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first row and column of Q
+ to those of the unit matrix
+*/
+
+ for (j = *m; j >= 2; --j) {
+ a[j * a_dim1 + 1] = 0.f;
+ i__1 = *m;
+ for (i__ = j + 1; i__ <= i__1; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + (j - 1) * a_dim1];
+/* L10: */
+ }
+/* L20: */
+ }
+ a[a_dim1 + 1] = 1.f;
+ i__1 = *m;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ a[i__ + a_dim1] = 0.f;
+/* L30: */
+ }
+ if (*m > 1) {
+
+/* Form Q(2:m,2:m) */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ sorgqr_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ } else {
+
+/*
+ Form P', determined by a call to SGEBRD to reduce a k-by-n
+ matrix
+*/
+
+ if (*k < *n) {
+
+/* If k < n, assume k <= m <= n */
+
+ sorglq_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If k >= n, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ row downward, and set the first row and column of P' to
+ those of the unit matrix
+*/
+
+ a[a_dim1 + 1] = 1.f;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ a[i__ + a_dim1] = 0.f;
+/* L40: */
+ }
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ for (i__ = j - 1; i__ >= 2; --i__) {
+ a[i__ + j * a_dim1] = a[i__ - 1 + j * a_dim1];
+/* L50: */
+ }
+ a[j * a_dim1 + 1] = 0.f;
+/* L60: */
+ }
+ if (*n > 1) {
+
+/* Form P'(2:n,2:n) */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ sorglq_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORGBR */
+
+} /* sorgbr_ */
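+
+/*
+   Hedged sketch of forming Q after SGEBRD with a workspace query, commented
+   out so the generated code above is unchanged. It assumes m >= n, the f2c
+   typedefs used here, and malloc/free from <stdlib.h>; the helper name is
+   hypothetical.
+
+       static void example_sorgbr_q(real *a, real *tauq, integer m, integer n,
+                                    integer lda)
+       {
+           // a and tauq hold the output of sgebrd_ on an m-by-n matrix with
+           // m >= n; form the m-by-n matrix Q of the bidiagonal reduction.
+           integer k = n, lwork = -1, info = 0;
+           real wkopt, *work;
+           sorgbr_("Q", &m, &n, &k, a, &lda, tauq, &wkopt, &lwork, &info);
+           lwork = (integer) wkopt;                       // optimal LWORK
+           work = (real *) malloc(lwork * sizeof(real));
+           sorgbr_("Q", &m, &n, &k, a, &lda, tauq, work, &lwork, &info);
+           free(work);
+       }
+*/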
+
+/* Subroutine */ int sorghr_(integer *n, integer *ilo, integer *ihi, real *a,
+ integer *lda, real *tau, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, j, nb, nh, iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORGHR generates a real orthogonal matrix Q which is defined as the
+ product of IHI-ILO elementary reflectors of order N, as returned by
+ SGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix Q. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of SGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by SGEHRD.
+ On exit, the N-by-N orthogonal matrix Q.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (input) REAL array, dimension (N-1)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEHRD.
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= IHI-ILO.
+ For optimum performance LWORK >= (IHI-ILO)*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,nh) && ! lquery) {
+ *info = -8;
+ }
+
+ if (*info == 0) {
+ nb = ilaenv_(&c__1, "SORGQR", " ", &nh, &nh, &nh, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ lwkopt = max(1,nh) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORGHR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+/*
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first ilo and the last n-ihi
+ rows and columns to those of the unit matrix
+*/
+
+ i__1 = *ilo + 1;
+ for (j = *ihi; j >= i__1; --j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.f;
+/* L10: */
+ }
+ i__2 = *ihi;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = a[i__ + (j - 1) * a_dim1];
+/* L20: */
+ }
+ i__2 = *n;
+ for (i__ = *ihi + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+ i__1 = *ilo;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.f;
+/* L50: */
+ }
+ a[j + j * a_dim1] = 1.f;
+/* L60: */
+ }
+ i__1 = *n;
+ for (j = *ihi + 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.f;
+/* L70: */
+ }
+ a[j + j * a_dim1] = 1.f;
+/* L80: */
+ }
+
+ if (nh > 0) {
+
+/* Generate Q(ilo+1:ihi,ilo+1:ihi) */
+
+ sorgqr_(&nh, &nh, &nh, &a[*ilo + 1 + (*ilo + 1) * a_dim1], lda, &tau[*
+ ilo], &work[1], lwork, &iinfo);
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORGHR */
+
+} /* sorghr_ */
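+
+/*
+   Commented-out usage sketch for sorghr_ (the generated code is untouched).
+   It assumes the whole matrix was reduced by SGEHRD with ILO = 1, IHI = N,
+   the f2c typedefs from this file, and malloc/free from <stdlib.h>; the
+   helper name is hypothetical.
+
+       static void example_sorghr(real *a, real *tau, integer n, integer lda)
+       {
+           // a and tau hold the output of sgehrd_; overwrite a with the
+           // n-by-n orthogonal matrix Q of the Hessenberg reduction.
+           integer ilo = 1, ihi = n, lwork = -1, info = 0;
+           real wkopt, *work;
+           sorghr_(&n, &ilo, &ihi, a, &lda, tau, &wkopt, &lwork, &info);
+           lwork = (integer) wkopt;                       // optimal LWORK
+           work = (real *) malloc(lwork * sizeof(real));
+           sorghr_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+           free(work);
+       }
+*/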
+
+/* Subroutine */ int sorgl2_(integer *m, integer *n, integer *k, real *a,
+ integer *lda, real *tau, real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ slarf_(char *, integer *, integer *, real *, integer *, real *,
+ real *, integer *, real *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORGL2 generates an m by n real matrix Q with orthonormal rows,
+ which is defined as the first m rows of a product of k elementary
+ reflectors of order n
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by SGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by SGELQF in the first k rows of its array argument A.
+ On exit, the m-by-n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGELQF.
+
+ WORK (workspace) REAL array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORGL2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ return 0;
+ }
+
+ if (*k < *m) {
+
+/* Initialise rows k+1:m to rows of the unit matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = *k + 1; l <= i__2; ++l) {
+ a[l + j * a_dim1] = 0.f;
+/* L10: */
+ }
+ if (j > *k && j <= *m) {
+ a[j + j * a_dim1] = 1.f;
+ }
+/* L20: */
+ }
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i) to A(i:m,i:n) from the right */
+
+ if (i__ < *n) {
+ if (i__ < *m) {
+ a[i__ + i__ * a_dim1] = 1.f;
+ i__1 = *m - i__;
+ i__2 = *n - i__ + 1;
+ slarf_("Right", &i__1, &i__2, &a[i__ + i__ * a_dim1], lda, &
+ tau[i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__1 = *n - i__;
+ r__1 = -tau[i__];
+ sscal_(&i__1, &r__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+ a[i__ + i__ * a_dim1] = 1.f - tau[i__];
+
+/* Set A(i,1:i-1) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ a[i__ + l * a_dim1] = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of SORGL2 */
+
+} /* sorgl2_ */
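+
+/*
+   Minimal sketch of the unblocked LQ case, kept as a comment. The fixed
+   sizes and the example_ helper are hypothetical; a and tau are assumed to
+   come from SGELQF, and the f2c typedefs of this file are used.
+
+       static void example_sorgl2(real *a, real *tau)
+       {
+           // a (lda x n) and tau hold the output of sgelqf_ on a 3-by-5
+           // matrix; form the 3-by-5 matrix Q with orthonormal rows in place.
+           integer m = 3, n = 5, k = 3, lda = 3, info = 0;
+           real work[3];    // the unblocked routine needs a workspace of length m
+           sorgl2_(&m, &n, &k, a, &lda, tau, work, &info);
+       }
+*/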
+
+/* Subroutine */ int sorglq_(integer *m, integer *n, integer *k, real *a,
+ integer *lda, real *tau, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int sorgl2_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *), slarfb_(char *, char *,
+ char *, char *, integer *, integer *, integer *, real *, integer *
+ , real *, integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORGLQ generates an M-by-N real matrix Q with orthonormal rows,
+ which is defined as the first M rows of a product of K elementary
+ reflectors of order N
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by SGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by SGELQF in the first k rows of its array argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGELQF.
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "SORGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*m) * nb;
+ work[1] = (real) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORGLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "SORGLQ", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SORGLQ", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk rows are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(kk+1:m,1:kk) to zero. */
+
+ i__1 = kk;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = kk + 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *m) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ sorgl2_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *n - i__ + 1;
+ slarft_("Forward", "Rowwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i+ib:m,i:n) from the right */
+
+ i__2 = *m - i__ - ib + 1;
+ i__3 = *n - i__ + 1;
+ slarfb_("Right", "Transpose", "Forward", "Rowwise", &i__2, &
+ i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib +
+ 1], &ldwork);
+ }
+
+/* Apply H' to columns i:n of current block */
+
+ i__2 = *n - i__ + 1;
+ sorgl2_(&ib, &i__2, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set columns 1:i-1 of current block to zero */
+
+ i__2 = i__ - 1;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = i__ + ib - 1;
+ for (l = i__; l <= i__3; ++l) {
+ a[l + j * a_dim1] = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1] = (real) iws;
+ return 0;
+
+/* End of SORGLQ */
+
+} /* sorglq_ */
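+
+/*
+   Sketch of the blocked LQ driver with a workspace query, commented out so
+   nothing is added to the compiled file. Assumes n >= m >= k, the f2c
+   typedefs used here, and malloc/free from <stdlib.h>; the helper name is
+   hypothetical.
+
+       static void example_sorglq(real *a, real *tau, integer m, integer n,
+                                  integer k, integer lda)
+       {
+           // a and tau hold an sgelqf_ factorization; form the first m rows
+           // of Q, letting the routine switch to blocked code when it pays off.
+           integer lwork = -1, info = 0;
+           real wkopt, *work;
+           sorglq_(&m, &n, &k, a, &lda, tau, &wkopt, &lwork, &info);  // query
+           lwork = (integer) wkopt;
+           work = (real *) malloc(lwork * sizeof(real));
+           sorglq_(&m, &n, &k, a, &lda, tau, work, &lwork, &info);
+           free(work);
+       }
+*/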
+
+/* Subroutine */ int sorgqr_(integer *m, integer *n, integer *k, real *a,
+ integer *lda, real *tau, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int sorg2r_(integer *, integer *, integer *, real
+ *, integer *, real *, real *, integer *), slarfb_(char *, char *,
+ char *, char *, integer *, integer *, integer *, real *, integer *
+ , real *, integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORGQR generates an M-by-N real matrix Q with orthonormal columns,
+ which is defined as the first N columns of a product of K elementary
+ reflectors of order M
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by SGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by SGEQRF in the first k columns of its array
+ argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEQRF.
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "SORGQR", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*n) * nb;
+ work[1] = (real) lwkopt;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORGQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "SORGQR", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SORGQR", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk columns are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(1:kk,kk+1:n) to zero. */
+
+ i__1 = *n;
+ for (j = kk + 1; j <= i__1; ++j) {
+ i__2 = kk;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ a[i__ + j * a_dim1] = 0.f;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *n) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ sorg2r_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *m - i__ + 1;
+ slarft_("Forward", "Columnwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i:m,i+ib:n) from the left */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__ - ib + 1;
+ slarfb_("Left", "No transpose", "Forward", "Columnwise", &
+ i__2, &i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[
+ 1], &ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &
+ work[ib + 1], &ldwork);
+ }
+
+/* Apply H to rows i:m of current block */
+
+ i__2 = *m - i__ + 1;
+ sorg2r_(&i__2, &ib, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set rows 1:i-1 of current block to zero */
+
+ i__2 = i__ + ib - 1;
+ for (j = i__; j <= i__2; ++j) {
+ i__3 = i__ - 1;
+ for (l = 1; l <= i__3; ++l) {
+ a[l + j * a_dim1] = 0.f;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1] = (real) iws;
+ return 0;
+
+/* End of SORGQR */
+
+} /* sorgqr_ */
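+
+/*
+   Sketch of the blocked QR driver with a workspace query, kept in a comment.
+   Assumes m >= n >= k, the f2c typedefs used in this file, and malloc/free
+   from <stdlib.h>; the helper name is hypothetical.
+
+       static void example_sorgqr(real *a, real *tau, integer m, integer n,
+                                  integer k, integer lda)
+       {
+           // a and tau hold an sgeqrf_ factorization; form the first n
+           // columns of Q explicitly with the blocked algorithm above.
+           integer lwork = -1, info = 0;
+           real wkopt, *work;
+           sorgqr_(&m, &n, &k, a, &lda, tau, &wkopt, &lwork, &info);  // query
+           lwork = (integer) wkopt;
+           work = (real *) malloc(lwork * sizeof(real));
+           sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, &info);
+           free(work);
+       }
+*/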
+
+/* Subroutine */ int sorm2l_(char *side, char *trans, integer *m, integer *n,
+ integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, mi, ni, nq;
+ static real aii;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORM2L overwrites the general real m by n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'T', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'T',
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by SGEQLF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'T': apply Q' (Transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) REAL array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ SGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEQLF.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) REAL array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORM2L", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) is applied to C(1:m-k+i,1:n) */
+
+ mi = *m - *k + i__;
+ } else {
+
+/* H(i) is applied to C(1:m,1:n-k+i) */
+
+ ni = *n - *k + i__;
+ }
+
+/* Apply H(i) */
+
+ aii = a[nq - *k + i__ + i__ * a_dim1];
+ a[nq - *k + i__ + i__ * a_dim1] = 1.f;
+ slarf_(side, &mi, &ni, &a[i__ * a_dim1 + 1], &c__1, &tau[i__], &c__[
+ c_offset], ldc, &work[1]);
+ a[nq - *k + i__ + i__ * a_dim1] = aii;
+/* L10: */
+ }
+ return 0;
+
+/* End of SORM2L */
+
+} /* sorm2l_ */
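+
+/*
+   Commented-out sketch of applying Q**T from a QL factorization. It assumes
+   a and tau come from SGEQLF, uses the f2c typedefs of this file and
+   malloc/free from <stdlib.h>; the helper name is hypothetical.
+
+       static void example_sorm2l(real *a, real *tau, real *c, integer m,
+                                  integer n, integer k, integer lda, integer ldc)
+       {
+           // a (lda x k, the last k columns of an sgeqlf_ factorization) and
+           // tau define Q of order m; overwrite the m-by-n matrix c with
+           // Q**T * c.
+           integer info = 0;
+           real *work = (real *) malloc(n * sizeof(real));  // length n for SIDE='L'
+           sorm2l_("Left", "Transpose", &m, &n, &k, a, &lda, tau, c,
+                   &ldc, work, &info);
+           free(work);
+       }
+*/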
+
+/* Subroutine */ int sorm2r_(char *side, char *trans, integer *m, integer *n,
+ integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static real aii;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORM2R overwrites the general real m by n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'T', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'T',
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by SGEQRF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'T': apply Q' (Transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) REAL array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ SGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEQRF.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) REAL array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORM2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.f;
+ slarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], &c__1, &tau[i__], &c__[
+ ic + jc * c_dim1], ldc, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+/* L10: */
+ }
+ return 0;
+
+/* End of SORM2R */
+
+} /* sorm2r_ */
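+
+/*
+   Commented-out sketch of applying Q from a QR factorization with the
+   unblocked routine. Assumes a and tau come from SGEQRF, the f2c typedefs of
+   this file, and malloc/free from <stdlib.h>; the helper name is hypothetical.
+
+       static void example_sorm2r(real *a, real *tau, real *c, integer m,
+                                  integer n, integer k, integer lda, integer ldc)
+       {
+           // a and tau hold the first k reflectors of an sgeqrf_ factorization
+           // (Q of order m); overwrite the m-by-n matrix c with Q * c.
+           integer info = 0;
+           real *work = (real *) malloc(n * sizeof(real));  // length n for SIDE='L'
+           sorm2r_("Left", "No transpose", &m, &n, &k, a, &lda, tau, c,
+                   &ldc, work, &info);
+           free(work);
+       }
+*/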
+
+/* Subroutine */ int sormbr_(char *vect, char *side, char *trans, integer *m,
+ integer *n, integer *k, real *a, integer *lda, real *tau, real *c__,
+ integer *ldc, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2];
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran, applyq;
+ static char transt[1];
+ extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ If VECT = 'Q', SORMBR overwrites the general real M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ If VECT = 'P', SORMBR overwrites the general real M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': P * C C * P
+ TRANS = 'T': P**T * C C * P**T
+
+ Here Q and P**T are the orthogonal matrices determined by SGEBRD when
+ reducing a real matrix A to bidiagonal form: A = Q * B * P**T. Q and
+ P**T are defined as products of elementary reflectors H(i) and G(i)
+ respectively.
+
+ Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+ order of the orthogonal matrix Q or P**T that is applied.
+
+ If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+ if nq >= k, Q = H(1) H(2) . . . H(k);
+ if nq < k, Q = H(1) H(2) . . . H(nq-1).
+
+ If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+ if k < nq, P = G(1) G(2) . . . G(k);
+ if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ = 'Q': apply Q or Q**T;
+ = 'P': apply P or P**T.
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q, Q**T, P or P**T from the Left;
+ = 'R': apply Q, Q**T, P or P**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q or P;
+ = 'T': Transpose, apply Q**T or P**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original
+ matrix reduced by SGEBRD.
+ If VECT = 'P', the number of rows in the original
+ matrix reduced by SGEBRD.
+ K >= 0.
+
+ A (input) REAL array, dimension
+ (LDA,min(nq,K)) if VECT = 'Q'
+ (LDA,nq) if VECT = 'P'
+ The vectors which define the elementary reflectors H(i) and
+ G(i), whose products determine the matrices Q and P, as
+ returned by SGEBRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If VECT = 'Q', LDA >= max(1,nq);
+ if VECT = 'P', LDA >= max(1,min(nq,K)).
+
+ TAU (input) REAL array, dimension (min(nq,K))
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i) which determines Q or P, as returned
+ by SGEBRD in the array argument TAUQ or TAUP.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q
+ or P*C or P**T*C or C*P or C*P**T.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ applyq = lsame_(vect, "Q");
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q or P and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! applyq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (! left && ! lsame_(side, "R")) {
+ *info = -2;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*k < 0) {
+ *info = -6;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = 1, i__2 = min(nq,*k);
+ if (applyq && *lda < max(1,nq) || ! applyq && *lda < max(i__1,i__2)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+ }
+
+ if (*info == 0) {
+ if (applyq) {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "SORMQR", ch__1, &i__1, n, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "SORMQR", ch__1, m, &i__1, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "SORMLQ", ch__1, &i__1, n, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "SORMLQ", ch__1, m, &i__1, &i__2, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORMBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ work[1] = 1.f;
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ if (applyq) {
+
+/* Apply Q */
+
+ if (nq >= *k) {
+
+/* Q was determined by a call to SGEBRD with nq >= k */
+
+ sormqr_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* Q was determined by a call to SGEBRD with nq < k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ sormqr_(side, trans, &mi, &ni, &i__1, &a[a_dim1 + 2], lda, &tau[1]
+ , &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ } else {
+
+/* Apply P */
+
+ if (notran) {
+ *(unsigned char *)transt = 'T';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+ if (nq > *k) {
+
+/* P was determined by a call to SGEBRD with nq > k */
+
+ sormlq_(side, transt, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* P was determined by a call to SGEBRD with nq <= k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ sormlq_(side, transt, &mi, &ni, &i__1, &a[(a_dim1 << 1) + 1], lda,
+ &tau[1], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &
+ iinfo);
+ }
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORMBR */
+
+} /* sormbr_ */
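+
+/*
+   Hedged sketch of applying Q**T from a bidiagonal reduction, kept in a
+   comment. It assumes a and tauq come from SGEBRD on an m-by-k matrix, uses
+   the f2c typedefs of this file and malloc/free from <stdlib.h>; the helper
+   name is hypothetical.
+
+       static void example_sormbr(real *a, real *tauq, real *c, integer m,
+                                  integer n, integer k, integer lda, integer ldc)
+       {
+           // Apply the orthogonal factor of the reduction, overwriting the
+           // m-by-n matrix c with Q**T * c.
+           integer lwork = -1, info = 0;
+           real wkopt, *work;
+           sormbr_("Q", "Left", "Transpose", &m, &n, &k, a, &lda, tauq, c,
+                   &ldc, &wkopt, &lwork, &info);          // workspace query
+           lwork = (integer) wkopt;
+           work = (real *) malloc(lwork * sizeof(real));
+           sormbr_("Q", "Left", "Transpose", &m, &n, &k, a, &lda, tauq, c,
+                   &ldc, work, &lwork, &info);
+           free(work);
+       }
+*/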
+
+/* Subroutine */ int sormhr_(char *side, char *trans, integer *m, integer *n,
+ integer *ilo, integer *ihi, real *a, integer *lda, real *tau, real *
+ c__, integer *ldc, real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, nh, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORMHR overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ IHI-ILO elementary reflectors, as returned by SGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of SGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ If SIDE = 'L', then 1 <= ILO <= IHI <= M, if M > 0, and
+ ILO = 1 and IHI = 0, if M = 0;
+ if SIDE = 'R', then 1 <= ILO <= IHI <= N, if N > 0, and
+ ILO = 1 and IHI = 0, if N = 0.
+
+ A (input) REAL array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by SGEHRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) REAL array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEHRD.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ left = lsame_(side, "L");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ilo < 1 || *ilo > max(1,nq)) {
+ *info = -5;
+ } else if (*ihi < min(*ilo,nq) || *ihi > nq) {
+ *info = -6;
+ } else if (*lda < max(1,nq)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+
+ if (*info == 0) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "SORMQR", ch__1, &nh, n, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "SORMQR", ch__1, m, &nh, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("SORMHR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nh == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ if (left) {
+ mi = nh;
+ ni = *n;
+ i1 = *ilo + 1;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = nh;
+ i1 = 1;
+ i2 = *ilo + 1;
+ }
+
+ sormqr_(side, trans, &mi, &ni, &nh, &a[*ilo + 1 + *ilo * a_dim1], lda, &
+ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORMHR */
+
+} /* sormhr_ */
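+
+/*
+   Commented-out sketch of applying the Hessenberg-reduction Q to a block of
+   vectors. Assumes a and tau come from SGEHRD with ILO = 1, IHI = N, the f2c
+   typedefs of this file, and malloc/free from <stdlib.h>; the helper name is
+   hypothetical.
+
+       static void example_sormhr(real *a, real *tau, real *c, integer n,
+                                  integer nrhs, integer lda, integer ldc)
+       {
+           // Overwrite the n-by-nrhs matrix c with Q * c.
+           integer ilo = 1, ihi = n, lwork = -1, info = 0;
+           real wkopt, *work;
+           sormhr_("Left", "No transpose", &n, &nrhs, &ilo, &ihi, a, &lda,
+                   tau, c, &ldc, &wkopt, &lwork, &info);  // workspace query
+           lwork = (integer) wkopt;
+           work = (real *) malloc(lwork * sizeof(real));
+           sormhr_("Left", "No transpose", &n, &nrhs, &ilo, &ihi, a, &lda,
+                   tau, c, &ldc, work, &lwork, &info);
+           free(work);
+       }
+*/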
+
+/* Subroutine */ int sorml2_(char *side, char *trans, integer *m, integer *n,
+ integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static real aii;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+ integer *, real *, real *, integer *, real *), xerbla_(
+ char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORML2 overwrites the general real m by n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'T', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'T',
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by SGELQF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'T': apply Q' (Transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) REAL array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ SGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGELQF.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the m by n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) REAL array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORML2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) */
+
+ aii = a[i__ + i__ * a_dim1];
+ a[i__ + i__ * a_dim1] = 1.f;
+ slarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], lda, &tau[i__], &c__[
+ ic + jc * c_dim1], ldc, &work[1]);
+ a[i__ + i__ * a_dim1] = aii;
+/* L10: */
+ }
+ return 0;
+
+/* End of SORML2 */
+
+} /* sorml2_ */
+
+/* Subroutine */ int sormlq_(char *side, char *trans, integer *m, integer *n,
+ integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static real t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int sorml2_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *), slarfb_(char *, char *, char *, char *
+ , integer *, integer *, integer *, real *, integer *, real *,
+ integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static logical notran;
+ static integer ldwork;
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORMLQ overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by SGELQF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) REAL array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ SGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGELQF.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "SORMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORMLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SORMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ sorml2_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ if (notran) {
+ *(unsigned char *)transt = 'T';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ slarft_("Forward", "Rowwise", &i__4, &ib, &a[i__ + i__ * a_dim1],
+ lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ slarfb_(side, transt, "Forward", "Rowwise", &mi, &ni, &ib, &a[i__
+ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc * c_dim1],
+ ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORMLQ */
+
+} /* sormlq_ */
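+
+/*
+   Editorial note (illustrative sketch, not part of the LAPACK sources):
+   the two-call LWORK = -1 workspace-query pattern documented above is
+   shared by SORMLQ, SORMQL, SORMQR and SORMTR.  The sizes and buffer
+   contents below are hypothetical; only the sormlq_ signature and the
+   query convention come from this file.  The block is fenced with
+   "#if 0" so it is never compiled.
+*/
+#if 0
+static void example_apply_q_from_lq(void)
+{
+    /* Hypothetical sizes: apply Q (of order m, from an LQ factorization
+       with k reflectors) to the m-by-n matrix C from the left. */
+    integer m = 4, n = 3, k = 4, lda = 4, ldc = 4, lwork, info;
+    real a[4 * 4], tau[4], c[4 * 3], query, *work;
+
+    /* a and tau are assumed to hold the output of sgelqf_, and c the
+       matrix to be multiplied; filling them in is omitted here. */
+
+    /* First call with LWORK = -1: only the optimal workspace size is
+       computed and returned in WORK(1); A, TAU and C are not referenced. */
+    lwork = -1;
+    sormlq_("L", "N", &m, &n, &k, a, &lda, tau, c, &ldc, &query, &lwork, &info);
+
+    /* Second call with a workspace of the reported size forms Q * C,
+       overwriting c.  (malloc/free assume <stdlib.h>.) */
+    lwork = (integer) query;
+    work = (real *) malloc(lwork * sizeof(real));
+    sormlq_("L", "N", &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info);
+    free(work);
+}
+#endif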
+
+/* Subroutine */ int sormql_(char *side, char *trans, integer *m, integer *n,
+ integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static real t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int sorm2l_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *), slarfb_(char *, char *, char *, char *
+ , integer *, integer *, integer *, real *, integer *, real *,
+ integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static logical notran;
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORMQL overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by SGEQLF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) REAL array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ SGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEQLF.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = max(1,*n);
+ } else {
+ nq = *n;
+ nw = max(1,*m);
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+
+ if (*info == 0) {
+ if (*m == 0 || *n == 0) {
+ lwkopt = 1;
+ } else {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where
+ NBMAX is used to define the local array T.
+
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "SORMQL", ch__1, m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = nw * nb;
+ }
+ work[1] = (real) lwkopt;
+
+ if (*lwork < nw && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORMQL", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SORMQL", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ sorm2l_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i+ib-1) . . . H(i+1) H(i)
+*/
+
+ i__4 = nq - *k + i__ + ib - 1;
+ slarft_("Backward", "Columnwise", &i__4, &ib, &a[i__ * a_dim1 + 1]
+ , lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
+
+ mi = *m - *k + i__ + ib - 1;
+ } else {
+
+/* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
+
+ ni = *n - *k + i__ + ib - 1;
+ }
+
+/* Apply H or H' */
+
+ slarfb_(side, trans, "Backward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ * a_dim1 + 1], lda, t, &c__65, &c__[c_offset], ldc, &
+ work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORMQL */
+
+} /* sormql_ */
+
+/* Subroutine */ int sormqr_(char *side, char *trans, integer *m, integer *n,
+ integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static real t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int sorm2r_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *), slarfb_(char *, char *, char *, char *
+ , integer *, integer *, integer *, real *, integer *, real *,
+ integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static logical notran;
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORMQR overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) REAL array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ SGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) REAL array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SGEQRF.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "SORMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SORMQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "SORMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ sorm2r_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ slarft_("Forward", "Columnwise", &i__4, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], t, &c__65)
+ ;
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ slarfb_(side, trans, "Forward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc *
+ c_dim1], ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORMQR */
+
+} /* sormqr_ */
+
+/* Subroutine */ int sormtr_(char *side, char *uplo, char *trans, integer *m,
+ integer *n, real *a, integer *lda, real *tau, real *c__, integer *ldc,
+ real *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2, i__3;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int sormql_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SORMTR overwrites the general real M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'T': Q**T * C C * Q**T
+
+ where Q is a real orthogonal matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ nq-1 elementary reflectors, as returned by SSYTRD:
+
+ if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+
+ if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**T from the Left;
+ = 'R': apply Q or Q**T from the Right.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A contains elementary reflectors
+ from SSYTRD;
+ = 'L': Lower triangle of A contains elementary reflectors
+ from SSYTRD.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'T': Transpose, apply Q**T.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ A (input) REAL array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by SSYTRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) REAL array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by SSYTRD.
+
+ C (input/output) REAL array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ *info = -2;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "T")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+ if (upper) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "SORMQL", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "SORMQL", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "SORMQR", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "SORMQR", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("SORMTR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nq == 1) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ }
+
+ if (upper) {
+
+/* Q was determined by a call to SSYTRD with UPLO = 'U' */
+
+ i__2 = nq - 1;
+ sormql_(side, trans, &mi, &ni, &i__2, &a[(a_dim1 << 1) + 1], lda, &
+ tau[1], &c__[c_offset], ldc, &work[1], lwork, &iinfo);
+ } else {
+
+/* Q was determined by a call to SSYTRD with UPLO = 'L' */
+
+ if (left) {
+ i1 = 2;
+ i2 = 1;
+ } else {
+ i1 = 1;
+ i2 = 2;
+ }
+ i__2 = nq - 1;
+ sormqr_(side, trans, &mi, &ni, &i__2, &a[a_dim1 + 2], lda, &tau[1], &
+ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SORMTR */
+
+} /* sormtr_ */
+
+/* Subroutine */ int spotf2_(char *uplo, integer *n, real *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ real r__1;
+
+ /* Local variables */
+ static integer j;
+ static real ajj;
+ extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+ sgemv_(char *, integer *, integer *, real *, real *, integer *,
+ real *, integer *, real *, real *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern logical sisnan_(real *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SPOTF2 computes the Cholesky factorization of a real symmetric
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U' * U , if UPLO = 'U', or
+ A = L * L', if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is stored.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ n by n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U'*U or A = L*L'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, the leading minor of order k is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SPOTF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute U(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j - 1;
+ ajj = a[j + j * a_dim1] - sdot_(&i__2, &a[j * a_dim1 + 1], &c__1,
+ &a[j * a_dim1 + 1], &c__1);
+ if (ajj <= 0.f || sisnan_(&ajj)) {
+ a[j + j * a_dim1] = ajj;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ a[j + j * a_dim1] = ajj;
+
+/* Compute elements J+1:N of row J. */
+
+ if (j < *n) {
+ i__2 = j - 1;
+ i__3 = *n - j;
+ sgemv_("Transpose", &i__2, &i__3, &c_b151, &a[(j + 1) *
+ a_dim1 + 1], lda, &a[j * a_dim1 + 1], &c__1, &c_b15, &
+ a[j + (j + 1) * a_dim1], lda);
+ i__2 = *n - j;
+ r__1 = 1.f / ajj;
+ sscal_(&i__2, &r__1, &a[j + (j + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute L(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j - 1;
+ ajj = a[j + j * a_dim1] - sdot_(&i__2, &a[j + a_dim1], lda, &a[j
+ + a_dim1], lda);
+ if (ajj <= 0.f || sisnan_(&ajj)) {
+ a[j + j * a_dim1] = ajj;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ a[j + j * a_dim1] = ajj;
+
+/* Compute elements J+1:N of column J. */
+
+ if (j < *n) {
+ i__2 = *n - j;
+ i__3 = j - 1;
+ sgemv_("No transpose", &i__2, &i__3, &c_b151, &a[j + 1 +
+ a_dim1], lda, &a[j + a_dim1], lda, &c_b15, &a[j + 1 +
+ j * a_dim1], &c__1);
+ i__2 = *n - j;
+ r__1 = 1.f / ajj;
+ sscal_(&i__2, &r__1, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+ goto L40;
+
+L30:
+ *info = j;
+
+L40:
+ return 0;
+
+/* End of SPOTF2 */
+
+} /* spotf2_ */
+
+/* Subroutine */ int spotrf_(char *uplo, integer *n, real *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer j, jb, nb;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int strsm_(char *, char *, char *, char *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ ), ssyrk_(char *, char *, integer
+ *, integer *, real *, real *, integer *, real *, real *, integer *
+ ), spotf2_(char *, integer *, real *, integer *,
+ integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SPOTRF computes the Cholesky factorization of a real symmetric
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U**T * U, if UPLO = 'U', or
+ A = L * L**T, if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the block version of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U**T*U or A = L*L**T.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the leading minor of order i is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SPOTRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "SPOTRF", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code. */
+
+ spotf2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code. */
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ ssyrk_("Upper", "Transpose", &jb, &i__3, &c_b151, &a[j *
+ a_dim1 + 1], lda, &c_b15, &a[j + j * a_dim1], lda);
+ spotf2_("Upper", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block row. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ sgemm_("Transpose", "No transpose", &jb, &i__3, &i__4, &
+ c_b151, &a[j * a_dim1 + 1], lda, &a[(j + jb) *
+ a_dim1 + 1], lda, &c_b15, &a[j + (j + jb) *
+ a_dim1], lda);
+ i__3 = *n - j - jb + 1;
+ strsm_("Left", "Upper", "Transpose", "Non-unit", &jb, &
+ i__3, &c_b15, &a[j + j * a_dim1], lda, &a[j + (j
+ + jb) * a_dim1], lda);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ ssyrk_("Lower", "No transpose", &jb, &i__3, &c_b151, &a[j +
+ a_dim1], lda, &c_b15, &a[j + j * a_dim1], lda);
+ spotf2_("Lower", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block column. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ sgemm_("No transpose", "Transpose", &i__3, &jb, &i__4, &
+ c_b151, &a[j + jb + a_dim1], lda, &a[j + a_dim1],
+ lda, &c_b15, &a[j + jb + j * a_dim1], lda);
+ i__3 = *n - j - jb + 1;
+ strsm_("Right", "Lower", "Transpose", "Non-unit", &i__3, &
+ jb, &c_b15, &a[j + j * a_dim1], lda, &a[j + jb +
+ j * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+ goto L40;
+
+L30:
+ *info = *info + j - 1;
+
+L40:
+ return 0;
+
+/* End of SPOTRF */
+
+} /* spotrf_ */
+
+/* Subroutine */ int spotri_(char *uplo, integer *n, real *a, integer *lda,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), slauum_(
+ char *, integer *, real *, integer *, integer *), strtri_(
+ char *, char *, integer *, real *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SPOTRI computes the inverse of a real symmetric positive definite
+ matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+ computed by SPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the triangular factor U or L from the Cholesky
+ factorization A = U**T*U or A = L*L**T, as computed by
+ SPOTRF.
+ On exit, the upper or lower triangle of the (symmetric)
+ inverse of A, overwriting the input factor U or L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the (i,i) element of the factor U or L is
+ zero, and the inverse could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SPOTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Invert the triangular Cholesky factor U or L. */
+
+ strtri_(uplo, "Non-unit", n, &a[a_offset], lda, info);
+ if (*info > 0) {
+ return 0;
+ }
+
+/* Form inv(U)*inv(U)' or inv(L)'*inv(L). */
+
+ slauum_(uplo, n, &a[a_offset], lda, info);
+
+ return 0;
+
+/* End of SPOTRI */
+
+} /* spotri_ */
+
+/* Subroutine */ int spotrs_(char *uplo, integer *n, integer *nrhs, real *a,
+ integer *lda, real *b, integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int strsm_(char *, char *, char *, char *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ ), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SPOTRS solves a system of linear equations A*X = B with a symmetric
+ positive definite matrix A using the Cholesky factorization
+ A = U**T*U or A = L*L**T computed by SPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) REAL array, dimension (LDA,N)
+ The triangular factor U or L from the Cholesky factorization
+ A = U**T*U or A = L*L**T, as computed by SPOTRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ B (input/output) REAL array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SPOTRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/*
+ Solve A*X = B where A = U'*U.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ strsm_("Left", "Upper", "Transpose", "Non-unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ strsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b15, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A*X = B where A = L*L'.
+
+ Solve L*X = B, overwriting B with X.
+*/
+
+ strsm_("Left", "Lower", "No transpose", "Non-unit", n, nrhs, &c_b15, &
+ a[a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ strsm_("Left", "Lower", "Transpose", "Non-unit", n, nrhs, &c_b15, &a[
+ a_offset], lda, &b[b_offset], ldb);
+ }
+
+ return 0;
+
+/* End of SPOTRS */
+
+} /* spotrs_ */
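+
+/*
+   Editorial note (illustrative sketch, not part of the LAPACK sources):
+   a minimal Cholesky solve combining spotrf_ and spotrs_ as documented
+   above.  The 2-by-2 matrix and right-hand side are hypothetical; only
+   the routine signatures come from this file.  Fenced with "#if 0" so
+   it is never compiled.
+*/
+#if 0
+static void example_cholesky_solve(void)
+{
+    /* Solve A*x = b for the symmetric positive definite matrix
+       A = [4 2; 2 3] with b = [6; 5], using the upper triangle. */
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info;
+    real a[4] = { 4.f, 2.f, 2.f, 3.f };  /* column-major; strictly lower
+                                            part is not referenced */
+    real b[2] = { 6.f, 5.f };
+
+    /* Factor A = U**T * U in place (upper triangle of a). */
+    spotrf_("U", &n, a, &lda, &info);
+    if (info == 0) {
+        /* Overwrite b with the solution x. */
+        spotrs_("U", &n, &nrhs, a, &lda, b, &ldb, &info);
+    }
+    /* On success, b now holds x = (1, 1). */
+}
+#endif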
+
+/* Subroutine */ int sstedc_(char *compz, integer *n, real *d__, real *e,
+ real *z__, integer *ldz, real *work, integer *lwork, integer *iwork,
+ integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2;
+ real r__1, r__2;
+
+ /* Local variables */
+ static integer i__, j, k, m;
+ static real p;
+ static integer ii, lgn;
+ static real eps, tiny;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+ integer *, real *, real *, integer *, real *, integer *, real *,
+ real *, integer *);
+ static integer lwmin, start;
+ extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *,
+ integer *), slaed0_(integer *, integer *, integer *, real *, real
+ *, real *, integer *, real *, integer *, real *, integer *,
+ integer *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer finish;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *,
+ real *, integer *), slaset_(char *, integer *, integer *,
+ real *, real *, real *, integer *);
+ static integer liwmin, icompz;
+ static real orgnrm;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *),
+ slasrt_(char *, integer *, real *, integer *);
+ static logical lquery;
+ static integer smlsiz;
+ extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *,
+ real *, integer *, real *, integer *);
+ static integer storez, strtrw;
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SSTEDC computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the divide and conquer method.
+ The eigenvectors of a full or band real symmetric matrix can also be
+ found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this
+ matrix to tridiagonal form.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none. See SLAED3 for details.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'I': Compute eigenvectors of tridiagonal matrix also.
+ = 'V': Compute eigenvectors of original dense symmetric
+ matrix also. On entry, Z contains the orthogonal
+ matrix used to reduce the original matrix to
+ tridiagonal form.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the subdiagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) REAL array, dimension (LDZ,N)
+ On entry, if COMPZ = 'V', then Z contains the orthogonal
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original symmetric matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1.
+ If eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If COMPZ = 'N' or N <= 1 then LWORK must be at least 1.
+ If COMPZ = 'V' and N > 1 then LWORK must be at least
+ ( 1 + 3*N + 2*N*lg N + 3*N**2 ),
+ where lg( N ) = smallest integer k such
+ that 2**k >= N.
+ If COMPZ = 'I' and N > 1 then LWORK must be at least
+ ( 1 + 4*N + N**2 ).
+ Note that for COMPZ = 'I' or 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LWORK need
+ only be max(1,2*(N-1)).
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If COMPZ = 'N' or N <= 1 then LIWORK must be at least 1.
+ If COMPZ = 'V' and N > 1 then LIWORK must be at least
+ ( 6 + 6*N + 5*N*lg N ).
+ If COMPZ = 'I' and N > 1 then LIWORK must be at least
+ ( 3 + 5*N ).
+ Note that for COMPZ = 'I' or 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LIWORK
+ need only be 1.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal size of the IWORK array,
+ returns this value as the first entry of the IWORK array, and
+ no error message related to LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1 || *liwork == -1;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+
+ if (*info == 0) {
+
+/* Compute the workspace requirements */
+
+ smlsiz = ilaenv_(&c__9, "SSTEDC", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+ if (*n <= 1 || icompz == 0) {
+ liwmin = 1;
+ lwmin = 1;
+ } else if (*n <= smlsiz) {
+ liwmin = 1;
+ lwmin = *n - 1 << 1;
+ } else {
+ lgn = (integer) (log((real) (*n)) / log(2.f));
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (icompz == 1) {
+/* Computing 2nd power */
+ i__1 = *n;
+ lwmin = *n * 3 + 1 + (*n << 1) * lgn + i__1 * i__1 * 3;
+ liwmin = *n * 6 + 6 + *n * 5 * lgn;
+ } else if (icompz == 2) {
+/* Computing 2nd power */
+ i__1 = *n;
+ lwmin = (*n << 2) + 1 + i__1 * i__1;
+ liwmin = *n * 5 + 3;
+ }
+ }
+ work[1] = (real) lwmin;
+ iwork[1] = liwmin;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -10;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SSTEDC", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*n == 1) {
+ if (icompz != 0) {
+ z__[z_dim1 + 1] = 1.f;
+ }
+ return 0;
+ }
+
+/*
+ If the following conditional clause is removed, then the routine
+ will use the Divide and Conquer routine to compute only the
+ eigenvalues, which requires (3N + 3N**2) real workspace and
+ (2 + 5N + 2N lg(N)) integer workspace.
+ Since on many architectures SSTERF is much faster than any other
+ algorithm for finding eigenvalues only, it is used here
+ as the default. If the conditional clause is removed, then
+ information on the size of workspace needs to be changed.
+
+ If COMPZ = 'N', use SSTERF to compute the eigenvalues.
+*/
+
+ if (icompz == 0) {
+ ssterf_(n, &d__[1], &e[1], info);
+ goto L50;
+ }
+
+/*
+ If N is smaller than the minimum divide size (SMLSIZ+1), then
+ solve the problem with another solver.
+*/
+
+ if (*n <= smlsiz) {
+
+ ssteqr_(compz, n, &d__[1], &e[1], &z__[z_offset], ldz, &work[1], info);
+
+ } else {
+
+/*
+ If COMPZ = 'V', the Z matrix must be stored elsewhere for later
+ use.
+*/
+
+ if (icompz == 1) {
+ storez = *n * *n + 1;
+ } else {
+ storez = 1;
+ }
+
+ if (icompz == 2) {
+ slaset_("Full", n, n, &c_b29, &c_b15, &z__[z_offset], ldz);
+ }
+
+/* Scale. */
+
+ orgnrm = slanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.f) {
+ goto L50;
+ }
+
+ eps = slamch_("Epsilon");
+
+ start = 1;
+
+/* while ( START <= N ) */
+
+L10:
+ if (start <= *n) {
+
+/*
+ Let FINISH be the position of the next subdiagonal entry
+ such that E( FINISH ) <= TINY or FINISH = N if no such
+ subdiagonal exists. The matrix identified by the elements
+ between START and FINISH constitutes an independent
+ sub-problem.
+*/
+
+ finish = start;
+L20:
+ if (finish < *n) {
+ tiny = eps * sqrt((r__1 = d__[finish], dabs(r__1))) * sqrt((
+ r__2 = d__[finish + 1], dabs(r__2)));
+ if ((r__1 = e[finish], dabs(r__1)) > tiny) {
+ ++finish;
+ goto L20;
+ }
+ }
+
+/* (Sub) Problem determined. Compute its size and solve it. */
+
+ m = finish - start + 1;
+ if (m == 1) {
+ start = finish + 1;
+ goto L10;
+ }
+ if (m > smlsiz) {
+
+/* Scale. */
+
+ orgnrm = slanst_("M", &m, &d__[start], &e[start]);
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &m, &c__1, &d__[
+ start], &m, info);
+ i__1 = m - 1;
+ i__2 = m - 1;
+ slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &i__1, &c__1, &e[
+ start], &i__2, info);
+
+ if (icompz == 1) {
+ strtrw = 1;
+ } else {
+ strtrw = start;
+ }
+ slaed0_(&icompz, n, &m, &d__[start], &e[start], &z__[strtrw +
+ start * z_dim1], ldz, &work[1], n, &work[storez], &
+ iwork[1], info);
+ if (*info != 0) {
+ *info = (*info / (m + 1) + start - 1) * (*n + 1) + *info %
+ (m + 1) + start - 1;
+ goto L50;
+ }
+
+/* Scale back. */
+
+ slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, &m, &c__1, &d__[
+ start], &m, info);
+
+ } else {
+ if (icompz == 1) {
+
+/*
+ Since QR won't update a Z matrix which is larger than
+ the length of D, we must solve the sub-problem in a
+ workspace and then multiply back into Z.
+*/
+
+ ssteqr_("I", &m, &d__[start], &e[start], &work[1], &m, &
+ work[m * m + 1], info);
+ slacpy_("A", n, &m, &z__[start * z_dim1 + 1], ldz, &work[
+ storez], n);
+ sgemm_("N", "N", n, &m, &m, &c_b15, &work[storez], n, &
+ work[1], &m, &c_b29, &z__[start * z_dim1 + 1],
+ ldz);
+ } else if (icompz == 2) {
+ ssteqr_("I", &m, &d__[start], &e[start], &z__[start +
+ start * z_dim1], ldz, &work[1], info);
+ } else {
+ ssterf_(&m, &d__[start], &e[start], info);
+ }
+ if (*info != 0) {
+ *info = start * (*n + 1) + finish;
+ goto L50;
+ }
+ }
+
+ start = finish + 1;
+ goto L10;
+ }
+
+/*
+ endwhile
+
+ If the problem split any number of times, then the eigenvalues
+ will not be properly ordered. Here we permute the eigenvalues
+ (and the associated eigenvectors) into ascending order.
+*/
+
+ if (m != *n) {
+ if (icompz == 0) {
+
+/* Use Quick Sort */
+
+ slasrt_("I", n, &d__[1], info);
+
+ } else {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L30: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ sswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k *
+ z_dim1 + 1], &c__1);
+ }
+/* L40: */
+ }
+ }
+ }
+ }
+
+L50:
+ work[1] = (real) lwmin;
+ iwork[1] = liwmin;
+
+ return 0;
+
+/* End of SSTEDC */
+
+} /* sstedc_ */
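+
+/*
+   Editorial note (illustrative sketch, not part of the LAPACK sources):
+   calling sstedc_ with COMPZ = 'I' on a small symmetric tridiagonal
+   matrix, using the LWORK/LIWORK = -1 query documented above.  The
+   diagonal and subdiagonal values are hypothetical; only the sstedc_
+   signature and the query convention come from this file.  Fenced with
+   "#if 0" so it is never compiled.
+*/
+#if 0
+static void example_tridiagonal_eigen(void)
+{
+    /* 3-by-3 symmetric tridiagonal matrix: diagonal d, subdiagonal e. */
+    integer n = 3, ldz = 3, lwork = -1, liwork = -1, info;
+    real d[3] = { 2.f, 2.f, 2.f };
+    real e[2] = { 1.f, 1.f };
+    real z[9], wquery, *work;
+    integer iwquery, *iwork;
+
+    /* Workspace query: the optimal sizes come back in WORK(1) and
+       IWORK(1); d, e and z are not modified. */
+    sstedc_("I", &n, d, e, z, &ldz, &wquery, &lwork, &iwquery, &liwork, &info);
+
+    lwork = (integer) wquery;
+    liwork = iwquery;
+    work = (real *) malloc(lwork * sizeof(real));        /* assumes <stdlib.h> */
+    iwork = (integer *) malloc(liwork * sizeof(integer));
+
+    /* Second call: d receives the eigenvalues in ascending order and the
+       columns of z the corresponding orthonormal eigenvectors. */
+    sstedc_("I", &n, d, e, z, &ldz, work, &lwork, iwork, &liwork, &info);
+
+    free(iwork);
+    free(work);
+}
+#endif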
+
+/* Subroutine */ int ssteqr_(char *compz, integer *n, real *d__, real *e,
+ real *z__, integer *ldz, real *work, integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2;
+ real r__1, r__2;
+
+ /* Local variables */
+ static real b, c__, f, g;
+ static integer i__, j, k, l, m;
+ static real p, r__, s;
+ static integer l1, ii, mm, lm1, mm1, nm1;
+ static real rt1, rt2, eps;
+ static integer lsv;
+ static real tst, eps2;
+ static integer lend, jtot;
+ extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *)
+ ;
+ extern logical lsame_(char *, char *);
+ static real anorm;
+ extern /* Subroutine */ int slasr_(char *, char *, char *, integer *,
+ integer *, real *, real *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer *);
+ static integer lendm1, lendp1;
+ extern /* Subroutine */ int slaev2_(real *, real *, real *, real *, real *
+ , real *, real *);
+ extern doublereal slapy2_(real *, real *);
+ static integer iscale;
+ extern doublereal slamch_(char *);
+ static real safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real safmax;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *);
+ static integer lendsv;
+ extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real *
+ ), slaset_(char *, integer *, integer *, real *, real *, real *,
+ integer *);
+ static real ssfmin;
+ static integer nmaxit, icompz;
+ static real ssfmax;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SSTEQR computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the implicit QL or QR method.
+ The eigenvectors of a full or band symmetric matrix can also be found
+ if SSYTRD or SSPTRD or SSBTRD has been used to reduce this matrix to
+ tridiagonal form.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'V': Compute eigenvalues and eigenvectors of the original
+ symmetric matrix. On entry, Z must contain the
+ orthogonal matrix used to reduce the original matrix
+ to tridiagonal form.
+ = 'I': Compute eigenvalues and eigenvectors of the
+ tridiagonal matrix. Z is initialized to the identity
+ matrix.
+
+ N (input) INTEGER
+ The order of the matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the (n-1) subdiagonal elements of the tridiagonal
+ matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) REAL array, dimension (LDZ, N)
+ On entry, if COMPZ = 'V', then Z contains the orthogonal
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original symmetric matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1, and if
+ eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace) REAL array, dimension (max(1,2*N-2))
+ If COMPZ = 'N', then WORK is not referenced.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm has failed to find all the eigenvalues in
+ a total of 30*N iterations; if INFO = i, then i
+ elements of E have not converged to zero; on exit, D
+ and E contain the elements of a symmetric tridiagonal
+ matrix which is orthogonally similar to the original
+ matrix.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SSTEQR", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ if (icompz == 2) {
+ z__[z_dim1 + 1] = 1.f;
+ }
+ return 0;
+ }
+
+/* Determine the unit roundoff and over/underflow thresholds. */
+
+ eps = slamch_("E");
+/* Computing 2nd power */
+ r__1 = eps;
+ eps2 = r__1 * r__1;
+ safmin = slamch_("S");
+ safmax = 1.f / safmin;
+ ssfmax = sqrt(safmax) / 3.f;
+ ssfmin = sqrt(safmin) / eps2;
+
+/*
+ Compute the eigenvalues and eigenvectors of the tridiagonal
+ matrix.
+*/
+
+ if (icompz == 2) {
+ slaset_("Full", n, n, &c_b29, &c_b15, &z__[z_offset], ldz);
+ }
+
+ nmaxit = *n * 30;
+ jtot = 0;
+
+/*
+ Determine where the matrix splits and choose QL or QR iteration
+ for each block, according to whether top or bottom diagonal
+ element is smaller.
+*/
+
+ l1 = 1;
+ nm1 = *n - 1;
+
+L10:
+ if (l1 > *n) {
+ goto L160;
+ }
+ if (l1 > 1) {
+ e[l1 - 1] = 0.f;
+ }
+ if (l1 <= nm1) {
+ i__1 = nm1;
+ for (m = l1; m <= i__1; ++m) {
+ tst = (r__1 = e[m], dabs(r__1));
+ if (tst == 0.f) {
+ goto L30;
+ }
+ if (tst <= sqrt((r__1 = d__[m], dabs(r__1))) * sqrt((r__2 = d__[m
+ + 1], dabs(r__2))) * eps) {
+ e[m] = 0.f;
+ goto L30;
+ }
+/* L20: */
+ }
+ }
+ m = *n;
+
+L30:
+ l = l1;
+ lsv = l;
+ lend = m;
+ lendsv = lend;
+ l1 = m + 1;
+ if (lend == l) {
+ goto L10;
+ }
+
+/* Scale submatrix in rows and columns L to LEND */
+
+ i__1 = lend - l + 1;
+ anorm = slanst_("I", &i__1, &d__[l], &e[l]);
+ iscale = 0;
+ if (anorm == 0.f) {
+ goto L10;
+ }
+ if (anorm > ssfmax) {
+ iscale = 1;
+ i__1 = lend - l + 1;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n,
+ info);
+ } else if (anorm < ssfmin) {
+ iscale = 2;
+ i__1 = lend - l + 1;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n,
+ info);
+ }
+
+/* Choose between QL and QR iteration */
+
+ if ((r__1 = d__[lend], dabs(r__1)) < (r__2 = d__[l], dabs(r__2))) {
+ lend = lsv;
+ l = lendsv;
+ }
+
+ if (lend > l) {
+
+/*
+ QL Iteration
+
+ Look for small subdiagonal element.
+*/
+
+L40:
+ if (l != lend) {
+ lendm1 = lend - 1;
+ i__1 = lendm1;
+ for (m = l; m <= i__1; ++m) {
+/* Computing 2nd power */
+ r__2 = (r__1 = e[m], dabs(r__1));
+ tst = r__2 * r__2;
+ if (tst <= eps2 * (r__1 = d__[m], dabs(r__1)) * (r__2 = d__[m
+ + 1], dabs(r__2)) + safmin) {
+ goto L60;
+ }
+/* L50: */
+ }
+ }
+
+ m = lend;
+
+L60:
+ if (m < lend) {
+ e[m] = 0.f;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L80;
+ }
+
+/*
+ If remaining matrix is 2-by-2, use SLAE2 or SLAEV2
+ to compute its eigensystem.
+*/
+
+ if (m == l + 1) {
+ if (icompz > 0) {
+ slaev2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2, &c__, &s);
+ work[l] = c__;
+ work[*n - 1 + l] = s;
+ slasr_("R", "V", "B", n, &c__2, &work[l], &work[*n - 1 + l], &
+ z__[l * z_dim1 + 1], ldz);
+ } else {
+ slae2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2);
+ }
+ d__[l] = rt1;
+ d__[l + 1] = rt2;
+ e[l] = 0.f;
+ l += 2;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l + 1] - p) / (e[l] * 2.f);
+ r__ = slapy2_(&g, &c_b15);
+ g = d__[m] - p + e[l] / (g + r_sign(&r__, &g));
+
+ s = 1.f;
+ c__ = 1.f;
+ p = 0.f;
+
+/* Inner loop */
+
+ mm1 = m - 1;
+ i__1 = l;
+ for (i__ = mm1; i__ >= i__1; --i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ slartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m - 1) {
+ e[i__ + 1] = r__;
+ }
+ g = d__[i__ + 1] - p;
+ r__ = (d__[i__] - g) * s + c__ * 2.f * b;
+ p = s * r__;
+ d__[i__ + 1] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = -s;
+ }
+
+/* L70: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = m - l + 1;
+ slasr_("R", "V", "B", n, &mm, &work[l], &work[*n - 1 + l], &z__[l
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[l] = g;
+ goto L40;
+
+/* Eigenvalue found. */
+
+L80:
+ d__[l] = p;
+
+ ++l;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+
+ } else {
+
+/*
+ QR Iteration
+
+ Look for small superdiagonal element.
+*/
+
+L90:
+ if (l != lend) {
+ lendp1 = lend + 1;
+ i__1 = lendp1;
+ for (m = l; m >= i__1; --m) {
+/* Computing 2nd power */
+ r__2 = (r__1 = e[m - 1], dabs(r__1));
+ tst = r__2 * r__2;
+ if (tst <= eps2 * (r__1 = d__[m], dabs(r__1)) * (r__2 = d__[m
+ - 1], dabs(r__2)) + safmin) {
+ goto L110;
+ }
+/* L100: */
+ }
+ }
+
+ m = lend;
+
+L110:
+ if (m > lend) {
+ e[m - 1] = 0.f;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L130;
+ }
+
+/*
+ If remaining matrix is 2-by-2, use SLAE2 or SLAEV2
+ to compute its eigensystem.
+*/
+
+ if (m == l - 1) {
+ if (icompz > 0) {
+ slaev2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2, &c__, &s)
+ ;
+ work[m] = c__;
+ work[*n - 1 + m] = s;
+ slasr_("R", "V", "F", n, &c__2, &work[m], &work[*n - 1 + m], &
+ z__[(l - 1) * z_dim1 + 1], ldz);
+ } else {
+ slae2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2);
+ }
+ d__[l - 1] = rt1;
+ d__[l] = rt2;
+ e[l - 1] = 0.f;
+ l += -2;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l - 1] - p) / (e[l - 1] * 2.f);
+ r__ = slapy2_(&g, &c_b15);
+ g = d__[m] - p + e[l - 1] / (g + r_sign(&r__, &g));
+
+ s = 1.f;
+ c__ = 1.f;
+ p = 0.f;
+
+/* Inner loop */
+
+ lm1 = l - 1;
+ i__1 = lm1;
+ for (i__ = m; i__ <= i__1; ++i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ slartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m) {
+ e[i__ - 1] = r__;
+ }
+ g = d__[i__] - p;
+ r__ = (d__[i__ + 1] - g) * s + c__ * 2.f * b;
+ p = s * r__;
+ d__[i__] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = s;
+ }
+
+/* L120: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = l - m + 1;
+ slasr_("R", "V", "F", n, &mm, &work[m], &work[*n - 1 + m], &z__[m
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[lm1] = g;
+ goto L90;
+
+/* Eigenvalue found. */
+
+L130:
+ d__[l] = p;
+
+ --l;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+
+ }
+
+/* Undo scaling if necessary */
+
+L140:
+ if (iscale == 1) {
+ i__1 = lendsv - lsv + 1;
+ slascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ slascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ } else if (iscale == 2) {
+ i__1 = lendsv - lsv + 1;
+ slascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ slascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ }
+
+/*
+ Check for no convergence to an eigenvalue after a total
+ of N*MAXIT iterations.
+*/
+
+ if (jtot < nmaxit) {
+ goto L10;
+ }
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.f) {
+ ++(*info);
+ }
+/* L150: */
+ }
+ goto L190;
+
+/* Order eigenvalues and eigenvectors. */
+
+L160:
+ if (icompz == 0) {
+
+/* Use Quick Sort */
+
+ slasrt_("I", n, &d__[1], info);
+
+ } else {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L170: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ sswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k * z_dim1 + 1],
+ &c__1);
+ }
+/* L180: */
+ }
+ }
+
+L190:
+ return 0;
+
+/* End of SSTEQR */
+
+} /* ssteqr_ */
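+
+/*
+   Illustrative sketch (not part of the original f2c translation): a minimal
+   call of ssteqr_ as documented above, computing all eigenvalues and
+   eigenvectors of a 4-by-4 symmetric tridiagonal matrix.  The values of d
+   and e are placeholders; COMPZ = "I" makes the routine initialize Z to the
+   identity itself, and WORK needs max(1, 2*N-2) entries.
+
+       integer n = 4, ldz = 4, info;
+       real d[4] = {2.f, 2.f, 2.f, 2.f};   // diagonal of T (placeholder)
+       real e[3] = {1.f, 1.f, 1.f};        // subdiagonal of T (placeholder)
+       real z[16];                         // eigenvectors, column-major, ldz*n
+       real work[6];                       // max(1, 2*n-2)
+       ssteqr_("I", &n, d, e, z, &ldz, work, &info);
+       // info == 0: d holds the eigenvalues in ascending order, the
+       // columns of z the corresponding orthonormal eigenvectors.
+*/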
+
+/* Subroutine */ int ssterf_(integer *n, real *d__, real *e, integer *info)
+{
+ /* System generated locals */
+ integer i__1;
+ real r__1, r__2, r__3;
+
+ /* Local variables */
+ static real c__;
+ static integer i__, l, m;
+ static real p, r__, s;
+ static integer l1;
+ static real bb, rt1, rt2, eps, rte;
+ static integer lsv;
+ static real eps2, oldc;
+ static integer lend, jtot;
+ extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *)
+ ;
+ static real gamma, alpha, sigma, anorm;
+ extern doublereal slapy2_(real *, real *);
+ static integer iscale;
+ static real oldgam;
+ extern doublereal slamch_(char *);
+ static real safmin;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real safmax;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *);
+ static integer lendsv;
+ static real ssfmin;
+ static integer nmaxit;
+ static real ssfmax;
+ extern doublereal slanst_(char *, integer *, real *, real *);
+ extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SSTERF computes all eigenvalues of a symmetric tridiagonal matrix
+ using the Pal-Walker-Kahan variant of the QL or QR algorithm.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix. N >= 0.
+
+ D (input/output) REAL array, dimension (N)
+ On entry, the n diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) REAL array, dimension (N-1)
+ On entry, the (n-1) subdiagonal elements of the tridiagonal
+ matrix.
+ On exit, E has been destroyed.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm failed to find all of the eigenvalues in
+ a total of 30*N iterations; if INFO = i, then i
+ elements of E have not converged to zero.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --e;
+ --d__;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n < 0) {
+ *info = -1;
+ i__1 = -(*info);
+ xerbla_("SSTERF", &i__1);
+ return 0;
+ }
+ if (*n <= 1) {
+ return 0;
+ }
+
+/* Determine the unit roundoff for this environment. */
+
+ eps = slamch_("E");
+/* Computing 2nd power */
+ r__1 = eps;
+ eps2 = r__1 * r__1;
+ safmin = slamch_("S");
+ safmax = 1.f / safmin;
+ ssfmax = sqrt(safmax) / 3.f;
+ ssfmin = sqrt(safmin) / eps2;
+
+/* Compute the eigenvalues of the tridiagonal matrix. */
+
+ nmaxit = *n * 30;
+ sigma = 0.f;
+ jtot = 0;
+
+/*
+ Determine where the matrix splits and choose QL or QR iteration
+ for each block, according to whether top or bottom diagonal
+ element is smaller.
+*/
+
+ l1 = 1;
+
+L10:
+ if (l1 > *n) {
+ goto L170;
+ }
+ if (l1 > 1) {
+ e[l1 - 1] = 0.f;
+ }
+ i__1 = *n - 1;
+ for (m = l1; m <= i__1; ++m) {
+ if ((r__3 = e[m], dabs(r__3)) <= sqrt((r__1 = d__[m], dabs(r__1))) *
+ sqrt((r__2 = d__[m + 1], dabs(r__2))) * eps) {
+ e[m] = 0.f;
+ goto L30;
+ }
+/* L20: */
+ }
+ m = *n;
+
+L30:
+ l = l1;
+ lsv = l;
+ lend = m;
+ lendsv = lend;
+ l1 = m + 1;
+ if (lend == l) {
+ goto L10;
+ }
+
+/* Scale submatrix in rows and columns L to LEND */
+
+ i__1 = lend - l + 1;
+ anorm = slanst_("I", &i__1, &d__[l], &e[l]);
+ iscale = 0;
+ if (anorm > ssfmax) {
+ iscale = 1;
+ i__1 = lend - l + 1;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n,
+ info);
+ } else if (anorm < ssfmin) {
+ iscale = 2;
+ i__1 = lend - l + 1;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ slascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n,
+ info);
+ }
+
+ i__1 = lend - 1;
+ for (i__ = l; i__ <= i__1; ++i__) {
+/* Computing 2nd power */
+ r__1 = e[i__];
+ e[i__] = r__1 * r__1;
+/* L40: */
+ }
+
+/* Choose between QL and QR iteration */
+
+ if ((r__1 = d__[lend], dabs(r__1)) < (r__2 = d__[l], dabs(r__2))) {
+ lend = lsv;
+ l = lendsv;
+ }
+
+ if (lend >= l) {
+
+/*
+ QL Iteration
+
+ Look for small subdiagonal element.
+*/
+
+L50:
+ if (l != lend) {
+ i__1 = lend - 1;
+ for (m = l; m <= i__1; ++m) {
+ if ((r__2 = e[m], dabs(r__2)) <= eps2 * (r__1 = d__[m] * d__[
+ m + 1], dabs(r__1))) {
+ goto L70;
+ }
+/* L60: */
+ }
+ }
+ m = lend;
+
+L70:
+ if (m < lend) {
+ e[m] = 0.f;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L90;
+ }
+
+/*
+ If remaining matrix is 2 by 2, use SLAE2 to compute its
+ eigenvalues.
+*/
+
+ if (m == l + 1) {
+ rte = sqrt(e[l]);
+ slae2_(&d__[l], &rte, &d__[l + 1], &rt1, &rt2);
+ d__[l] = rt1;
+ d__[l + 1] = rt2;
+ e[l] = 0.f;
+ l += 2;
+ if (l <= lend) {
+ goto L50;
+ }
+ goto L150;
+ }
+
+ if (jtot == nmaxit) {
+ goto L150;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ rte = sqrt(e[l]);
+ sigma = (d__[l + 1] - p) / (rte * 2.f);
+ r__ = slapy2_(&sigma, &c_b15);
+ sigma = p - rte / (sigma + r_sign(&r__, &sigma));
+
+ c__ = 1.f;
+ s = 0.f;
+ gamma = d__[m] - sigma;
+ p = gamma * gamma;
+
+/* Inner loop */
+
+ i__1 = l;
+ for (i__ = m - 1; i__ >= i__1; --i__) {
+ bb = e[i__];
+ r__ = p + bb;
+ if (i__ != m - 1) {
+ e[i__ + 1] = s * r__;
+ }
+ oldc = c__;
+ c__ = p / r__;
+ s = bb / r__;
+ oldgam = gamma;
+ alpha = d__[i__];
+ gamma = c__ * (alpha - sigma) - s * oldgam;
+ d__[i__ + 1] = oldgam + (alpha - gamma);
+ if (c__ != 0.f) {
+ p = gamma * gamma / c__;
+ } else {
+ p = oldc * bb;
+ }
+/* L80: */
+ }
+
+ e[l] = s * p;
+ d__[l] = sigma + gamma;
+ goto L50;
+
+/* Eigenvalue found. */
+
+L90:
+ d__[l] = p;
+
+ ++l;
+ if (l <= lend) {
+ goto L50;
+ }
+ goto L150;
+
+ } else {
+
+/*
+ QR Iteration
+
+ Look for small superdiagonal element.
+*/
+
+L100:
+ i__1 = lend + 1;
+ for (m = l; m >= i__1; --m) {
+ if ((r__2 = e[m - 1], dabs(r__2)) <= eps2 * (r__1 = d__[m] * d__[
+ m - 1], dabs(r__1))) {
+ goto L120;
+ }
+/* L110: */
+ }
+ m = lend;
+
+L120:
+ if (m > lend) {
+ e[m - 1] = 0.f;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L140;
+ }
+
+/*
+ If remaining matrix is 2 by 2, use SLAE2 to compute its
+ eigenvalues.
+*/
+
+ if (m == l - 1) {
+ rte = sqrt(e[l - 1]);
+ slae2_(&d__[l], &rte, &d__[l - 1], &rt1, &rt2);
+ d__[l] = rt1;
+ d__[l - 1] = rt2;
+ e[l - 1] = 0.f;
+ l += -2;
+ if (l >= lend) {
+ goto L100;
+ }
+ goto L150;
+ }
+
+ if (jtot == nmaxit) {
+ goto L150;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ rte = sqrt(e[l - 1]);
+ sigma = (d__[l - 1] - p) / (rte * 2.f);
+ r__ = slapy2_(&sigma, &c_b15);
+ sigma = p - rte / (sigma + r_sign(&r__, &sigma));
+
+ c__ = 1.f;
+ s = 0.f;
+ gamma = d__[m] - sigma;
+ p = gamma * gamma;
+
+/* Inner loop */
+
+ i__1 = l - 1;
+ for (i__ = m; i__ <= i__1; ++i__) {
+ bb = e[i__];
+ r__ = p + bb;
+ if (i__ != m) {
+ e[i__ - 1] = s * r__;
+ }
+ oldc = c__;
+ c__ = p / r__;
+ s = bb / r__;
+ oldgam = gamma;
+ alpha = d__[i__ + 1];
+ gamma = c__ * (alpha - sigma) - s * oldgam;
+ d__[i__] = oldgam + (alpha - gamma);
+ if (c__ != 0.f) {
+ p = gamma * gamma / c__;
+ } else {
+ p = oldc * bb;
+ }
+/* L130: */
+ }
+
+ e[l - 1] = s * p;
+ d__[l] = sigma + gamma;
+ goto L100;
+
+/* Eigenvalue found. */
+
+L140:
+ d__[l] = p;
+
+ --l;
+ if (l >= lend) {
+ goto L100;
+ }
+ goto L150;
+
+ }
+
+/* Undo scaling if necessary */
+
+L150:
+ if (iscale == 1) {
+ i__1 = lendsv - lsv + 1;
+ slascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ }
+ if (iscale == 2) {
+ i__1 = lendsv - lsv + 1;
+ slascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ }
+
+/*
+ Check for no convergence to an eigenvalue after a total
+ of N*MAXIT iterations.
+*/
+
+ if (jtot < nmaxit) {
+ goto L10;
+ }
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.f) {
+ ++(*info);
+ }
+/* L160: */
+ }
+ goto L180;
+
+/* Sort eigenvalues in increasing order. */
+
+L170:
+ slasrt_("I", n, &d__[1], info);
+
+L180:
+ return 0;
+
+/* End of SSTERF */
+
+} /* ssterf_ */
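+
+/*
+   Illustrative sketch (not part of the original f2c translation): ssterf_
+   needs only the diagonal and subdiagonal and no workspace, so an
+   eigenvalues-only call is as simple as the following (placeholder values).
+
+       integer n = 4, info;
+       real d[4] = {2.f, 2.f, 2.f, 2.f};
+       real e[3] = {1.f, 1.f, 1.f};
+       ssterf_(&n, d, e, &info);
+       // info == 0: d now holds the eigenvalues in ascending order
+*/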
+
+/* Subroutine */ int ssyevd_(char *jobz, char *uplo, integer *n, real *a,
+ integer *lda, real *w, real *work, integer *lwork, integer *iwork,
+ integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ real r__1;
+
+ /* Local variables */
+ static real eps;
+ static integer inde;
+ static real anrm, rmin, rmax;
+ static integer lopt;
+ static real sigma;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static integer lwmin, liopt;
+ static logical lower, wantz;
+ static integer indwk2, llwrk2, iscale;
+ extern doublereal slamch_(char *);
+ static real safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real bignum;
+ extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+ real *, integer *, integer *, real *, integer *, integer *);
+ static integer indtau;
+ extern /* Subroutine */ int sstedc_(char *, integer *, real *, real *,
+ real *, integer *, real *, integer *, integer *, integer *,
+ integer *), slacpy_(char *, integer *, integer *, real *,
+ integer *, real *, integer *);
+ static integer indwrk, liwmin;
+ extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *);
+ extern doublereal slansy_(char *, char *, integer *, real *, integer *,
+ real *);
+ static integer llwork;
+ static real smlnum;
+ static logical lquery;
+ extern /* Subroutine */ int sormtr_(char *, char *, char *, integer *,
+ integer *, real *, integer *, real *, real *, integer *, real *,
+ integer *, integer *), ssytrd_(char *,
+ integer *, real *, integer *, real *, real *, real *, real *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SSYEVD computes all eigenvalues and, optionally, eigenvectors of a
+ real symmetric matrix A. If eigenvectors are desired, it uses a
+ divide and conquer algorithm.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Because it makes heavy use of Level 3 BLAS, SSYEVD needs N**2 more
+ workspace than SSYEVX.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only;
+ = 'V': Compute eigenvalues and eigenvectors.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA, N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of A contains the
+ upper triangular part of the matrix A. If UPLO = 'L',
+ the leading N-by-N lower triangular part of A contains
+ the lower triangular part of the matrix A.
+ On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+ orthonormal eigenvectors of the matrix A.
+ If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+ or the upper triangle (if UPLO='U') of A, including the
+ diagonal, is destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ W (output) REAL array, dimension (N)
+ If INFO = 0, the eigenvalues in ascending order.
+
+ WORK (workspace/output) REAL array,
+ dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If N <= 1, LWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LWORK must be at least 2*N+1.
+ If JOBZ = 'V' and N > 1, LWORK must be at least
+ 1 + 6*N + 2*N**2.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal sizes of the WORK and IWORK
+ arrays, returns these values as the first entries of the WORK
+ and IWORK arrays, and no error message related to LWORK or
+ LIWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If N <= 1, LIWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+ If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK and
+ IWORK arrays, returns these values as the first entries of
+ the WORK and IWORK arrays, and no error message related to
+ LWORK or LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+ to converge; i off-diagonal elements of an intermediate
+ tridiagonal form did not converge to zero;
+ if INFO = i and JOBZ = 'V', then the algorithm failed
+ to compute an eigenvalue while working on the submatrix
+ lying in rows and columns INFO/(N+1) through
+ mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+ Modified by Francoise Tisseur, University of Tennessee.
+
+ Modified description of INFO. Sven, 16 Feb 05.
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --w;
+ --work;
+ --iwork;
+
+ /* Function Body */
+ wantz = lsame_(jobz, "V");
+ lower = lsame_(uplo, "L");
+ lquery = *lwork == -1 || *liwork == -1;
+
+ *info = 0;
+ if (! (wantz || lsame_(jobz, "N"))) {
+ *info = -1;
+ } else if (! (lower || lsame_(uplo, "U"))) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+
+ if (*info == 0) {
+ if (*n <= 1) {
+ liwmin = 1;
+ lwmin = 1;
+ lopt = lwmin;
+ liopt = liwmin;
+ } else {
+ if (wantz) {
+ liwmin = *n * 5 + 3;
+/* Computing 2nd power */
+ i__1 = *n;
+ lwmin = *n * 6 + 1 + (i__1 * i__1 << 1);
+ } else {
+ liwmin = 1;
+ lwmin = (*n << 1) + 1;
+ }
+/* Computing MAX */
+ i__1 = lwmin, i__2 = (*n << 1) + ilaenv_(&c__1, "SSYTRD", uplo, n,
+ &c_n1, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ lopt = max(i__1,i__2);
+ liopt = liwmin;
+ }
+ work[1] = (real) lopt;
+ iwork[1] = liopt;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -10;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SSYEVD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ w[1] = a[a_dim1 + 1];
+ if (wantz) {
+ a[a_dim1 + 1] = 1.f;
+ }
+ return 0;
+ }
+
+/* Get machine constants. */
+
+ safmin = slamch_("Safe minimum");
+ eps = slamch_("Precision");
+ smlnum = safmin / eps;
+ bignum = 1.f / smlnum;
+ rmin = sqrt(smlnum);
+ rmax = sqrt(bignum);
+
+/* Scale matrix to allowable range, if necessary. */
+
+ anrm = slansy_("M", uplo, n, &a[a_offset], lda, &work[1]);
+ iscale = 0;
+ if (anrm > 0.f && anrm < rmin) {
+ iscale = 1;
+ sigma = rmin / anrm;
+ } else if (anrm > rmax) {
+ iscale = 1;
+ sigma = rmax / anrm;
+ }
+ if (iscale == 1) {
+ slascl_(uplo, &c__0, &c__0, &c_b15, &sigma, n, n, &a[a_offset], lda,
+ info);
+ }
+
+/* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */
+
+ inde = 1;
+ indtau = inde + *n;
+ indwrk = indtau + *n;
+ llwork = *lwork - indwrk + 1;
+ indwk2 = indwrk + *n * *n;
+ llwrk2 = *lwork - indwk2 + 1;
+
+ ssytrd_(uplo, n, &a[a_offset], lda, &w[1], &work[inde], &work[indtau], &
+ work[indwrk], &llwork, &iinfo);
+
+/*
+ For eigenvalues only, call SSTERF. For eigenvectors, first call
+ SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+ tridiagonal matrix, then call SORMTR to multiply it by the
+ Householder transformations stored in A.
+*/
+
+ if (! wantz) {
+ ssterf_(n, &w[1], &work[inde], info);
+ } else {
+ sstedc_("I", n, &w[1], &work[inde], &work[indwrk], n, &work[indwk2], &
+ llwrk2, &iwork[1], liwork, info);
+ sormtr_("L", uplo, "N", n, n, &a[a_offset], lda, &work[indtau], &work[
+ indwrk], n, &work[indwk2], &llwrk2, &iinfo);
+ slacpy_("A", n, n, &work[indwrk], n, &a[a_offset], lda);
+ }
+
+/* If matrix was scaled, then rescale eigenvalues appropriately. */
+
+ if (iscale == 1) {
+ r__1 = 1.f / sigma;
+ sscal_(n, &r__1, &w[1], &c__1);
+ }
+
+ work[1] = (real) lopt;
+ iwork[1] = liopt;
+
+ return 0;
+
+/* End of SSYEVD */
+
+} /* ssyevd_ */
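+
+/*
+   Illustrative sketch (not part of the original f2c translation): the usual
+   two-step pattern for the ssyevd_ driver documented above -- query the
+   optimal workspace with LWORK = LIWORK = -1, allocate, then solve.  The
+   3-by-3 matrix below is a placeholder; A is column-major and only its
+   upper triangle is referenced with UPLO = "U".
+
+       integer n = 3, lda = 3, info;
+       integer lwork = -1, liwork = -1, iwkopt;
+       real a[9] = {4.f, 1.f, 0.f,  1.f, 3.f, 1.f,  0.f, 1.f, 2.f};
+       real w[3], wkopt;
+       ssyevd_("V", "U", &n, a, &lda, w, &wkopt, &lwork, &iwkopt, &liwork, &info);
+       lwork = (integer) wkopt;
+       liwork = iwkopt;
+       // allocate real work[lwork] and integer iwork[liwork], then repeat the
+       // call with those buffers; on success w holds the eigenvalues and a
+       // the orthonormal eigenvectors.
+*/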
+
+/* Subroutine */ int ssytd2_(char *uplo, integer *n, real *a, integer *lda,
+ real *d__, real *e, real *tau, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__;
+ static real taui;
+ extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
+ extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *,
+ integer *, real *, integer *, real *, integer *);
+ static real alpha;
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *,
+ real *, integer *), ssymv_(char *, integer *, real *, real *,
+ integer *, real *, integer *, real *, real *, integer *),
+ xerbla_(char *, integer *), slarfg_(integer *, real *,
+ real *, integer *, real *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SSYTD2 reduces a real symmetric matrix A to symmetric tridiagonal
+ form T by an orthogonal similarity transformation: Q' * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ symmetric matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the orthogonal
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the orthogonal matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) REAL array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) REAL array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SSYTD2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Reduce the upper triangle of A */
+
+ for (i__ = *n - 1; i__ >= 1; --i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(1:i-1,i+1)
+*/
+
+ slarfg_(&i__, &a[i__ + (i__ + 1) * a_dim1], &a[(i__ + 1) * a_dim1
+ + 1], &c__1, &taui);
+ e[i__] = a[i__ + (i__ + 1) * a_dim1];
+
+ if (taui != 0.f) {
+
+/* Apply H(i) from both sides to A(1:i,1:i) */
+
+ a[i__ + (i__ + 1) * a_dim1] = 1.f;
+
+/* Compute x := tau * A * v storing x in TAU(1:i) */
+
+ ssymv_(uplo, &i__, &taui, &a[a_offset], lda, &a[(i__ + 1) *
+ a_dim1 + 1], &c__1, &c_b29, &tau[1], &c__1)
+ ;
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ alpha = taui * -.5f * sdot_(&i__, &tau[1], &c__1, &a[(i__ + 1)
+ * a_dim1 + 1], &c__1);
+ saxpy_(&i__, &alpha, &a[(i__ + 1) * a_dim1 + 1], &c__1, &tau[
+ 1], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ ssyr2_(uplo, &i__, &c_b151, &a[(i__ + 1) * a_dim1 + 1], &c__1,
+ &tau[1], &c__1, &a[a_offset], lda);
+
+ a[i__ + (i__ + 1) * a_dim1] = e[i__];
+ }
+ d__[i__ + 1] = a[i__ + 1 + (i__ + 1) * a_dim1];
+ tau[i__] = taui;
+/* L10: */
+ }
+ d__[1] = a[a_dim1 + 1];
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(i+2:n,i)
+*/
+
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ slarfg_(&i__2, &a[i__ + 1 + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &taui);
+ e[i__] = a[i__ + 1 + i__ * a_dim1];
+
+ if (taui != 0.f) {
+
+/* Apply H(i) from both sides to A(i+1:n,i+1:n) */
+
+ a[i__ + 1 + i__ * a_dim1] = 1.f;
+
+/* Compute x := tau * A * v storing x in TAU(i:n-1) */
+
+ i__2 = *n - i__;
+ ssymv_(uplo, &i__2, &taui, &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b29, &tau[
+ i__], &c__1);
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ i__2 = *n - i__;
+ alpha = taui * -.5f * sdot_(&i__2, &tau[i__], &c__1, &a[i__ +
+ 1 + i__ * a_dim1], &c__1);
+ i__2 = *n - i__;
+ saxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ i__2 = *n - i__;
+ ssyr2_(uplo, &i__2, &c_b151, &a[i__ + 1 + i__ * a_dim1], &
+ c__1, &tau[i__], &c__1, &a[i__ + 1 + (i__ + 1) *
+ a_dim1], lda);
+
+ a[i__ + 1 + i__ * a_dim1] = e[i__];
+ }
+ d__[i__] = a[i__ + i__ * a_dim1];
+ tau[i__] = taui;
+/* L20: */
+ }
+ d__[*n] = a[*n + *n * a_dim1];
+ }
+
+ return 0;
+
+/* End of SSYTD2 */
+
+} /* ssytd2_ */
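+
+/*
+   Illustrative sketch (not part of the original f2c translation): ssytd2_ is
+   the unblocked kernel used by ssytrd_ below, but it can also be called
+   directly on small matrices (placeholder sizes).  On return D and E hold
+   the tridiagonal T, and A together with TAU encodes Q as a product of
+   elementary reflectors.
+
+       integer n = 4, lda = 4, info;
+       real a[16];                 // symmetric input, lower triangle referenced, filled elsewhere
+       real d[4], e[3], tau[3];
+       ssytd2_("L", &n, a, &lda, d, e, tau, &info);
+*/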
+
+/* Subroutine */ int ssytrd_(char *uplo, integer *n, real *a, integer *lda,
+ real *d__, real *e, real *tau, real *work, integer *lwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, nb, kk, nx, iws;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ static logical upper;
+ extern /* Subroutine */ int ssytd2_(char *, integer *, real *, integer *,
+ real *, real *, real *, integer *), ssyr2k_(char *, char *
+ , integer *, integer *, real *, real *, integer *, real *,
+ integer *, real *, real *, integer *), xerbla_(
+ char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int slatrd_(char *, integer *, integer *, real *,
+ integer *, real *, real *, real *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ SSYTRD reduces a real symmetric matrix A to real symmetric
+ tridiagonal form T by an orthogonal similarity transformation:
+ Q**T * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the symmetric matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the orthogonal
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the orthogonal matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) REAL array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) REAL array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) REAL array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a real scalar, and v is a real vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*lwork < 1 && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+
+/* Determine the block size. */
+
+ nb = ilaenv_(&c__1, "SSYTRD", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6,
+ (ftnlen)1);
+ lwkopt = *n * nb;
+ work[1] = (real) lwkopt;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("SSYTRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1] = 1.f;
+ return 0;
+ }
+
+ nx = *n;
+ iws = 1;
+ if (nb > 1 && nb < *n) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code).
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "SSYTRD", uplo, n, &c_n1, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *n) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code by setting NX = N.
+
+ Computing MAX
+*/
+ i__1 = *lwork / ldwork;
+ nb = max(i__1,1);
+ nbmin = ilaenv_(&c__2, "SSYTRD", uplo, n, &c_n1, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ if (nb < nbmin) {
+ nx = *n;
+ }
+ }
+ } else {
+ nx = *n;
+ }
+ } else {
+ nb = 1;
+ }
+
+ if (upper) {
+
+/*
+ Reduce the upper triangle of A.
+ Columns 1:kk are handled by the unblocked method.
+*/
+
+ kk = *n - (*n - nx + nb - 1) / nb * nb;
+ i__1 = kk + 1;
+ i__2 = -nb;
+ for (i__ = *n - nb + 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = i__ + nb - 1;
+ slatrd_(uplo, &i__3, &nb, &a[a_offset], lda, &e[1], &tau[1], &
+ work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(1:i-1,1:i-1), using an
+ update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = i__ - 1;
+ ssyr2k_(uplo, "No transpose", &i__3, &nb, &c_b151, &a[i__ *
+ a_dim1 + 1], lda, &work[1], &ldwork, &c_b15, &a[a_offset],
+ lda);
+
+/*
+ Copy superdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j - 1 + j * a_dim1] = e[j - 1];
+ d__[j] = a[j + j * a_dim1];
+/* L10: */
+ }
+/* L20: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ ssytd2_(uplo, &kk, &a[a_offset], lda, &d__[1], &e[1], &tau[1], &iinfo);
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__2 = *n - nx;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = *n - i__ + 1;
+ slatrd_(uplo, &i__3, &nb, &a[i__ + i__ * a_dim1], lda, &e[i__], &
+ tau[i__], &work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(i+nb:n,i+nb:n), using
+ an update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = *n - i__ - nb + 1;
+ ssyr2k_(uplo, "No transpose", &i__3, &nb, &c_b151, &a[i__ + nb +
+ i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b15, &a[
+ i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/*
+ Copy subdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ a[j + 1 + j * a_dim1] = e[j];
+ d__[j] = a[j + j * a_dim1];
+/* L30: */
+ }
+/* L40: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ i__1 = *n - i__ + 1;
+ ssytd2_(uplo, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__],
+ &tau[i__], &iinfo);
+ }
+
+ work[1] = (real) lwkopt;
+ return 0;
+
+/* End of SSYTRD */
+
+} /* ssytrd_ */
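+
+/*
+   Illustrative sketch (not part of the original f2c translation): blocked
+   reduction with ssytrd_, using the LWORK = -1 workspace query documented
+   above.  To form the explicit orthogonal factor Q one would afterwards
+   call SORGTR (not shown here); the tridiagonal D/E can be passed on to
+   ssterf_ or ssteqr_.  Placeholder sizes.
+
+       integer n = 4, lda = 4, lwork = -1, info;
+       real a[16];                 // symmetric input, upper triangle referenced, filled elsewhere
+       real d[4], e[3], tau[3], wkopt;
+       ssytrd_("U", &n, a, &lda, d, e, tau, &wkopt, &lwork, &info);   // workspace query
+       lwork = (integer) wkopt;
+       // allocate real work[lwork] and repeat the call with work in place
+       // of wkopt to perform the reduction.
+*/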
+
+/* Subroutine */ int strevc_(char *side, char *howmny, logical *select,
+ integer *n, real *t, integer *ldt, real *vl, integer *ldvl, real *vr,
+ integer *ldvr, integer *mm, integer *m, real *work, integer *info)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3;
+ real r__1, r__2, r__3, r__4;
+
+ /* Local variables */
+ static integer i__, j, k;
+ static real x[4] /* was [2][2] */;
+ static integer j1, j2, n2, ii, ki, ip, is;
+ static real wi, wr, rec, ulp, beta, emax;
+ static logical pair, allv;
+ static integer ierr;
+ static real unfl, ovfl, smin;
+ extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
+ static logical over;
+ static real vmax;
+ static integer jnxt;
+ static real scale;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static real remax;
+ static logical leftv;
+ extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *,
+ real *, integer *, real *, integer *, real *, real *, integer *);
+ static logical bothv;
+ static real vcrit;
+ static logical somev;
+ extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+ integer *);
+ static real xnorm;
+ extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *,
+ real *, integer *), slaln2_(logical *, integer *, integer *, real
+ *, real *, real *, integer *, real *, real *, real *, integer *,
+ real *, real *, real *, integer *, real *, real *, integer *),
+ slabad_(real *, real *);
+ extern doublereal slamch_(char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static real bignum;
+ extern integer isamax_(integer *, real *, integer *);
+ static logical rightv;
+ static real smlnum;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ STREVC computes some or all of the right and/or left eigenvectors of
+ a real upper quasi-triangular matrix T.
+ Matrices of this type are produced by the Schur factorization of
+ a real general matrix: A = Q*T*Q**T, as computed by SHSEQR.
+
+ The right eigenvector x and the left eigenvector y of T corresponding
+ to an eigenvalue w are defined by:
+
+ T*x = w*x, (y**H)*T = w*(y**H)
+
+ where y**H denotes the conjugate transpose of y.
+ The eigenvalues are not input to this routine, but are read directly
+ from the diagonal blocks of T.
+
+ This routine returns the matrices X and/or Y of right and left
+ eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+ input matrix. If Q is the orthogonal factor that reduces a matrix
+ A to Schur form T, then Q*X and Q*Y are the matrices of right and
+ left eigenvectors of A.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'R': compute right eigenvectors only;
+ = 'L': compute left eigenvectors only;
+ = 'B': compute both right and left eigenvectors.
+
+ HOWMNY (input) CHARACTER*1
+ = 'A': compute all right and/or left eigenvectors;
+ = 'B': compute all right and/or left eigenvectors,
+ backtransformed by the matrices in VR and/or VL;
+ = 'S': compute selected right and/or left eigenvectors,
+ as indicated by the logical array SELECT.
+
+ SELECT (input/output) LOGICAL array, dimension (N)
+ If HOWMNY = 'S', SELECT specifies the eigenvectors to be
+ computed.
+ If w(j) is a real eigenvalue, the corresponding real
+ eigenvector is computed if SELECT(j) is .TRUE..
+ If w(j) and w(j+1) are the real and imaginary parts of a
+ complex eigenvalue, the corresponding complex eigenvector is
+ computed if either SELECT(j) or SELECT(j+1) is .TRUE., and
+ on exit SELECT(j) is set to .TRUE. and SELECT(j+1) is set to
+ .FALSE..
+ Not referenced if HOWMNY = 'A' or 'B'.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input) REAL array, dimension (LDT,N)
+ The upper quasi-triangular matrix T in Schur canonical form.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ VL (input/output) REAL array, dimension (LDVL,MM)
+ On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must
+ contain an N-by-N matrix Q (usually the orthogonal matrix Q
+ of Schur vectors returned by SHSEQR).
+ On exit, if SIDE = 'L' or 'B', VL contains:
+ if HOWMNY = 'A', the matrix Y of left eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*Y;
+ if HOWMNY = 'S', the left eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VL, in the same order as their
+ eigenvalues.
+ A complex eigenvector corresponding to a complex eigenvalue
+ is stored in two consecutive columns, the first holding the
+ real part, and the second the imaginary part.
+ Not referenced if SIDE = 'R'.
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1, and if
+ SIDE = 'L' or 'B', LDVL >= N.
+
+ VR (input/output) REAL array, dimension (LDVR,MM)
+ On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
+ contain an N-by-N matrix Q (usually the orthogonal matrix Q
+ of Schur vectors returned by SHSEQR).
+ On exit, if SIDE = 'R' or 'B', VR contains:
+ if HOWMNY = 'A', the matrix X of right eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*X;
+ if HOWMNY = 'S', the right eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VR, in the same order as their
+ eigenvalues.
+ A complex eigenvector corresponding to a complex eigenvalue
+ is stored in two consecutive columns, the first holding the
+ real part and the second the imaginary part.
+ Not referenced if SIDE = 'L'.
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1, and if
+ SIDE = 'R' or 'B', LDVR >= N.
+
+ MM (input) INTEGER
+ The number of columns in the arrays VL and/or VR. MM >= M.
+
+ M (output) INTEGER
+ The number of columns in the arrays VL and/or VR actually
+ used to store the eigenvectors.
+ If HOWMNY = 'A' or 'B', M is set to N.
+ Each selected real eigenvector occupies one column and each
+ selected complex eigenvector occupies two columns.
+
+ WORK (workspace) REAL array, dimension (3*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The algorithm used in this program is basically backward (forward)
+ substitution, with scaling to make the code robust against
+ possible overflow.
+
+ Each eigenvector is normalized so that the element of largest
+ magnitude has magnitude 1; here the magnitude of a complex number
+ (x,y) is taken to be |x| + |y|.
+
+ =====================================================================
+
+
+ Decode and test the input parameters
+*/
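+
+/*
+   Illustrative sketch (not part of the original f2c translation) of the
+   interface documented above; the routine body follows.  Computing all
+   right eigenvectors of a quasi-triangular Schur factor T: SELECT is not
+   referenced when HOWMNY = "A", VL is not referenced when SIDE = "R" (but
+   LDVL must still be >= 1), and WORK needs 3*N entries.  Placeholder sizes.
+
+       integer n = 3, ldt = 3, ldvl = 1, ldvr = 3, mm = 3, m, info;
+       logical select[3];
+       real t[9];                  // Schur form from SHSEQR, column-major, filled elsewhere
+       real vl[1], vr[9], work[9];
+       strevc_("R", "A", select, &n, t, &ldt, vl, &ldvl, vr, &ldvr, &mm, &m, work, &info);
+       // on return m == n; the columns of vr hold the eigenvectors of T
+       // (one column per real eigenvalue, two per complex conjugate pair).
+*/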
+
+ /* Parameter adjustments */
+ --select;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+
+ /* Function Body */
+ bothv = lsame_(side, "B");
+ rightv = lsame_(side, "R") || bothv;
+ leftv = lsame_(side, "L") || bothv;
+
+ allv = lsame_(howmny, "A");
+ over = lsame_(howmny, "B");
+ somev = lsame_(howmny, "S");
+
+ *info = 0;
+ if (! rightv && ! leftv) {
+ *info = -1;
+ } else if (! allv && ! over && ! somev) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ldt < max(1,*n)) {
+ *info = -6;
+ } else if (*ldvl < 1 || leftv && *ldvl < *n) {
+ *info = -8;
+ } else if (*ldvr < 1 || rightv && *ldvr < *n) {
+ *info = -10;
+ } else {
+
+/*
+ Set M to the number of columns required to store the selected
+ eigenvectors, standardize the array SELECT if necessary, and
+ test MM.
+*/
+
+ if (somev) {
+ *m = 0;
+ pair = FALSE_;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (pair) {
+ pair = FALSE_;
+ select[j] = FALSE_;
+ } else {
+ if (j < *n) {
+ if (t[j + 1 + j * t_dim1] == 0.f) {
+ if (select[j]) {
+ ++(*m);
+ }
+ } else {
+ pair = TRUE_;
+ if (select[j] || select[j + 1]) {
+ select[j] = TRUE_;
+ *m += 2;
+ }
+ }
+ } else {
+ if (select[*n]) {
+ ++(*m);
+ }
+ }
+ }
+/* L10: */
+ }
+ } else {
+ *m = *n;
+ }
+
+ if (*mm < *m) {
+ *info = -11;
+ }
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("STREVC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Set the constants to control overflow. */
+
+ unfl = slamch_("Safe minimum");
+ ovfl = 1.f / unfl;
+ slabad_(&unfl, &ovfl);
+ ulp = slamch_("Precision");
+ smlnum = unfl * (*n / ulp);
+ bignum = (1.f - ulp) / smlnum;
+
+/*
+ Compute 1-norm of each column of strictly upper triangular
+ part of T to control overflow in triangular solver.
+*/
+
+ work[1] = 0.f;
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ work[j] = 0.f;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[j] += (r__1 = t[i__ + j * t_dim1], dabs(r__1));
+/* L20: */
+ }
+/* L30: */
+ }
+
+/*
+ Index IP is used to specify the real or complex eigenvalue:
+ IP = 0, real eigenvalue,
+ 1, first of conjugate complex pair: (wr,wi)
+ -1, second of conjugate complex pair: (wr,wi)
+*/
+
+ n2 = *n << 1;
+
+ if (rightv) {
+
+/* Compute right eigenvectors. */
+
+ ip = 0;
+ is = *m;
+ for (ki = *n; ki >= 1; --ki) {
+
+ if (ip == 1) {
+ goto L130;
+ }
+ if (ki == 1) {
+ goto L40;
+ }
+ if (t[ki + (ki - 1) * t_dim1] == 0.f) {
+ goto L40;
+ }
+ ip = -1;
+
+L40:
+ if (somev) {
+ if (ip == 0) {
+ if (! select[ki]) {
+ goto L130;
+ }
+ } else {
+ if (! select[ki - 1]) {
+ goto L130;
+ }
+ }
+ }
+
+/* Compute the KI-th eigenvalue (WR,WI). */
+
+ wr = t[ki + ki * t_dim1];
+ wi = 0.f;
+ if (ip != 0) {
+ wi = sqrt((r__1 = t[ki + (ki - 1) * t_dim1], dabs(r__1))) *
+ sqrt((r__2 = t[ki - 1 + ki * t_dim1], dabs(r__2)));
+ }
+/* Computing MAX */
+ r__1 = ulp * (dabs(wr) + dabs(wi));
+ smin = dmax(r__1,smlnum);
+
+ if (ip == 0) {
+
+/* Real right eigenvector */
+
+ work[ki + *n] = 1.f;
+
+/* Form right-hand side */
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ work[k + *n] = -t[k + ki * t_dim1];
+/* L50: */
+ }
+
+/*
+ Solve the upper quasi-triangular system:
+ (T(1:KI-1,1:KI-1) - WR)*X = SCALE*WORK.
+*/
+
+ jnxt = ki - 1;
+ for (j = ki - 1; j >= 1; --j) {
+ if (j > jnxt) {
+ goto L60;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j - 1;
+ if (j > 1) {
+ if (t[j + (j - 1) * t_dim1] != 0.f) {
+ j1 = j - 1;
+ jnxt = j - 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/* 1-by-1 diagonal block */
+
+ slaln2_(&c_false, &c__1, &c__1, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &c_b29, x, &c__2, &scale, &xnorm,
+ &ierr);
+
+/*
+ Scale X(1,1) to avoid overflow when updating
+ the right-hand side.
+*/
+
+ if (xnorm > 1.f) {
+ if (work[j] > bignum / xnorm) {
+ x[0] /= xnorm;
+ scale /= xnorm;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ sscal_(&ki, &scale, &work[*n + 1], &c__1);
+ }
+ work[j + *n] = x[0];
+
+/* Update right-hand side */
+
+ i__1 = j - 1;
+ r__1 = -x[0];
+ saxpy_(&i__1, &r__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+
+ } else {
+
+/* 2-by-2 diagonal block */
+
+ slaln2_(&c_false, &c__2, &c__1, &smin, &c_b15, &t[j -
+ 1 + (j - 1) * t_dim1], ldt, &c_b15, &c_b15, &
+ work[j - 1 + *n], n, &wr, &c_b29, x, &c__2, &
+ scale, &xnorm, &ierr);
+
+/*
+ Scale X(1,1) and X(2,1) to avoid overflow when
+ updating the right-hand side.
+*/
+
+ if (xnorm > 1.f) {
+/* Computing MAX */
+ r__1 = work[j - 1], r__2 = work[j];
+ beta = dmax(r__1,r__2);
+ if (beta > bignum / xnorm) {
+ x[0] /= xnorm;
+ x[1] /= xnorm;
+ scale /= xnorm;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ sscal_(&ki, &scale, &work[*n + 1], &c__1);
+ }
+ work[j - 1 + *n] = x[0];
+ work[j + *n] = x[1];
+
+/* Update right-hand side */
+
+ i__1 = j - 2;
+ r__1 = -x[0];
+ saxpy_(&i__1, &r__1, &t[(j - 1) * t_dim1 + 1], &c__1,
+ &work[*n + 1], &c__1);
+ i__1 = j - 2;
+ r__1 = -x[1];
+ saxpy_(&i__1, &r__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+ }
+L60:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VR and normalize. */
+
+ if (! over) {
+ scopy_(&ki, &work[*n + 1], &c__1, &vr[is * vr_dim1 + 1], &
+ c__1);
+
+ ii = isamax_(&ki, &vr[is * vr_dim1 + 1], &c__1);
+ remax = 1.f / (r__1 = vr[ii + is * vr_dim1], dabs(r__1));
+ sscal_(&ki, &remax, &vr[is * vr_dim1 + 1], &c__1);
+
+ i__1 = *n;
+ for (k = ki + 1; k <= i__1; ++k) {
+ vr[k + is * vr_dim1] = 0.f;
+/* L70: */
+ }
+ } else {
+ if (ki > 1) {
+ i__1 = ki - 1;
+ sgemv_("N", n, &i__1, &c_b15, &vr[vr_offset], ldvr, &
+ work[*n + 1], &c__1, &work[ki + *n], &vr[ki *
+ vr_dim1 + 1], &c__1);
+ }
+
+ ii = isamax_(n, &vr[ki * vr_dim1 + 1], &c__1);
+ remax = 1.f / (r__1 = vr[ii + ki * vr_dim1], dabs(r__1));
+ sscal_(n, &remax, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+
+ } else {
+
+/*
+ Complex right eigenvector.
+
+ Initial solve
+ [ (T(KI-1,KI-1) T(KI-1,KI) ) - (WR + I* WI)]*X = 0.
+ [ (T(KI,KI-1) T(KI,KI) ) ]
+*/
+
+ if ((r__1 = t[ki - 1 + ki * t_dim1], dabs(r__1)) >= (r__2 = t[
+ ki + (ki - 1) * t_dim1], dabs(r__2))) {
+ work[ki - 1 + *n] = 1.f;
+ work[ki + n2] = wi / t[ki - 1 + ki * t_dim1];
+ } else {
+ work[ki - 1 + *n] = -wi / t[ki + (ki - 1) * t_dim1];
+ work[ki + n2] = 1.f;
+ }
+ work[ki + *n] = 0.f;
+ work[ki - 1 + n2] = 0.f;
+
+/* Form right-hand side */
+
+ i__1 = ki - 2;
+ for (k = 1; k <= i__1; ++k) {
+ work[k + *n] = -work[ki - 1 + *n] * t[k + (ki - 1) *
+ t_dim1];
+ work[k + n2] = -work[ki + n2] * t[k + ki * t_dim1];
+/* L80: */
+ }
+
+/*
+ Solve upper quasi-triangular system:
+ (T(1:KI-2,1:KI-2) - (WR+i*WI))*X = SCALE*(WORK+i*WORK2)
+*/
+
+ jnxt = ki - 2;
+ for (j = ki - 2; j >= 1; --j) {
+ if (j > jnxt) {
+ goto L90;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j - 1;
+ if (j > 1) {
+ if (t[j + (j - 1) * t_dim1] != 0.f) {
+ j1 = j - 1;
+ jnxt = j - 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/* 1-by-1 diagonal block */
+
+ slaln2_(&c_false, &c__1, &c__2, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &wi, x, &c__2, &scale, &xnorm, &
+ ierr);
+
+/*
+ Scale X(1,1) and X(1,2) to avoid overflow when
+ updating the right-hand side.
+*/
+
+ if (xnorm > 1.f) {
+ if (work[j] > bignum / xnorm) {
+ x[0] /= xnorm;
+ x[2] /= xnorm;
+ scale /= xnorm;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ sscal_(&ki, &scale, &work[*n + 1], &c__1);
+ sscal_(&ki, &scale, &work[n2 + 1], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + n2] = x[2];
+
+/* Update the right-hand side */
+
+ i__1 = j - 1;
+ r__1 = -x[0];
+ saxpy_(&i__1, &r__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+ i__1 = j - 1;
+ r__1 = -x[2];
+ saxpy_(&i__1, &r__1, &t[j * t_dim1 + 1], &c__1, &work[
+ n2 + 1], &c__1);
+
+ } else {
+
+/* 2-by-2 diagonal block */
+
+ slaln2_(&c_false, &c__2, &c__2, &smin, &c_b15, &t[j -
+ 1 + (j - 1) * t_dim1], ldt, &c_b15, &c_b15, &
+ work[j - 1 + *n], n, &wr, &wi, x, &c__2, &
+ scale, &xnorm, &ierr);
+
+/*
+ Scale X to avoid overflow when updating
+ the right-hand side.
+*/
+
+ if (xnorm > 1.f) {
+/* Computing MAX */
+ r__1 = work[j - 1], r__2 = work[j];
+ beta = dmax(r__1,r__2);
+ if (beta > bignum / xnorm) {
+ rec = 1.f / xnorm;
+ x[0] *= rec;
+ x[2] *= rec;
+ x[1] *= rec;
+ x[3] *= rec;
+ scale *= rec;
+ }
+ }
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ sscal_(&ki, &scale, &work[*n + 1], &c__1);
+ sscal_(&ki, &scale, &work[n2 + 1], &c__1);
+ }
+ work[j - 1 + *n] = x[0];
+ work[j + *n] = x[1];
+ work[j - 1 + n2] = x[2];
+ work[j + n2] = x[3];
+
+/* Update the right-hand side */
+
+ i__1 = j - 2;
+ r__1 = -x[0];
+ saxpy_(&i__1, &r__1, &t[(j - 1) * t_dim1 + 1], &c__1,
+ &work[*n + 1], &c__1);
+ i__1 = j - 2;
+ r__1 = -x[1];
+ saxpy_(&i__1, &r__1, &t[j * t_dim1 + 1], &c__1, &work[
+ *n + 1], &c__1);
+ i__1 = j - 2;
+ r__1 = -x[2];
+ saxpy_(&i__1, &r__1, &t[(j - 1) * t_dim1 + 1], &c__1,
+ &work[n2 + 1], &c__1);
+ i__1 = j - 2;
+ r__1 = -x[3];
+ saxpy_(&i__1, &r__1, &t[j * t_dim1 + 1], &c__1, &work[
+ n2 + 1], &c__1);
+ }
+L90:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VR and normalize. */
+
+ if (! over) {
+ scopy_(&ki, &work[*n + 1], &c__1, &vr[(is - 1) * vr_dim1
+ + 1], &c__1);
+ scopy_(&ki, &work[n2 + 1], &c__1, &vr[is * vr_dim1 + 1], &
+ c__1);
+
+ emax = 0.f;
+ i__1 = ki;
+ for (k = 1; k <= i__1; ++k) {
+/* Computing MAX */
+ r__3 = emax, r__4 = (r__1 = vr[k + (is - 1) * vr_dim1]
+ , dabs(r__1)) + (r__2 = vr[k + is * vr_dim1],
+ dabs(r__2));
+ emax = dmax(r__3,r__4);
+/* L100: */
+ }
+
+ remax = 1.f / emax;
+ sscal_(&ki, &remax, &vr[(is - 1) * vr_dim1 + 1], &c__1);
+ sscal_(&ki, &remax, &vr[is * vr_dim1 + 1], &c__1);
+
+ i__1 = *n;
+ for (k = ki + 1; k <= i__1; ++k) {
+ vr[k + (is - 1) * vr_dim1] = 0.f;
+ vr[k + is * vr_dim1] = 0.f;
+/* L110: */
+ }
+
+ } else {
+
+ if (ki > 2) {
+ i__1 = ki - 2;
+ sgemv_("N", n, &i__1, &c_b15, &vr[vr_offset], ldvr, &
+ work[*n + 1], &c__1, &work[ki - 1 + *n], &vr[(
+ ki - 1) * vr_dim1 + 1], &c__1);
+ i__1 = ki - 2;
+ sgemv_("N", n, &i__1, &c_b15, &vr[vr_offset], ldvr, &
+ work[n2 + 1], &c__1, &work[ki + n2], &vr[ki *
+ vr_dim1 + 1], &c__1);
+ } else {
+ sscal_(n, &work[ki - 1 + *n], &vr[(ki - 1) * vr_dim1
+ + 1], &c__1);
+ sscal_(n, &work[ki + n2], &vr[ki * vr_dim1 + 1], &
+ c__1);
+ }
+
+ emax = 0.f;
+ i__1 = *n;
+ for (k = 1; k <= i__1; ++k) {
+/* Computing MAX */
+ r__3 = emax, r__4 = (r__1 = vr[k + (ki - 1) * vr_dim1]
+ , dabs(r__1)) + (r__2 = vr[k + ki * vr_dim1],
+ dabs(r__2));
+ emax = dmax(r__3,r__4);
+/* L120: */
+ }
+ remax = 1.f / emax;
+ sscal_(n, &remax, &vr[(ki - 1) * vr_dim1 + 1], &c__1);
+ sscal_(n, &remax, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+ }
+
+ --is;
+ if (ip != 0) {
+ --is;
+ }
+L130:
+ if (ip == 1) {
+ ip = 0;
+ }
+ if (ip == -1) {
+ ip = 1;
+ }
+/* L140: */
+ }
+ }
+
+ if (leftv) {
+
+/* Compute left eigenvectors. */
+
+ ip = 0;
+ is = 1;
+ i__1 = *n;
+ for (ki = 1; ki <= i__1; ++ki) {
+
+ if (ip == -1) {
+ goto L250;
+ }
+ if (ki == *n) {
+ goto L150;
+ }
+ if (t[ki + 1 + ki * t_dim1] == 0.f) {
+ goto L150;
+ }
+ ip = 1;
+
+L150:
+ if (somev) {
+ if (! select[ki]) {
+ goto L250;
+ }
+ }
+
+/* Compute the KI-th eigenvalue (WR,WI). */
+
+ wr = t[ki + ki * t_dim1];
+ wi = 0.f;
+ if (ip != 0) {
+ wi = sqrt((r__1 = t[ki + (ki + 1) * t_dim1], dabs(r__1))) *
+ sqrt((r__2 = t[ki + 1 + ki * t_dim1], dabs(r__2)));
+ }
+/* Computing MAX */
+ r__1 = ulp * (dabs(wr) + dabs(wi));
+ smin = dmax(r__1,smlnum);
+
+ if (ip == 0) {
+
+/* Real left eigenvector. */
+
+ work[ki + *n] = 1.f;
+
+/* Form right-hand side */
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ work[k + *n] = -t[ki + k * t_dim1];
+/* L160: */
+ }
+
+/*
+ Solve the quasi-triangular system:
+ (T(KI+1:N,KI+1:N) - WR)'*X = SCALE*WORK
+*/
+
+ vmax = 1.f;
+ vcrit = bignum;
+
+ jnxt = ki + 1;
+ i__2 = *n;
+ for (j = ki + 1; j <= i__2; ++j) {
+ if (j < jnxt) {
+ goto L170;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j + 1;
+ if (j < *n) {
+ if (t[j + 1 + j * t_dim1] != 0.f) {
+ j2 = j + 1;
+ jnxt = j + 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/*
+ 1-by-1 diagonal block
+
+ Scale if necessary to avoid overflow when forming
+ the right-hand side.
+*/
+
+ if (work[j] > vcrit) {
+ rec = 1.f / vmax;
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ vmax = 1.f;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 1;
+ work[j + *n] -= sdot_(&i__3, &t[ki + 1 + j * t_dim1],
+ &c__1, &work[ki + 1 + *n], &c__1);
+
+/* Solve (T(J,J)-WR)'*X = WORK */
+
+ slaln2_(&c_false, &c__1, &c__1, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &c_b29, x, &c__2, &scale, &xnorm,
+ &ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ }
+ work[j + *n] = x[0];
+/* Computing MAX */
+ r__2 = (r__1 = work[j + *n], dabs(r__1));
+ vmax = dmax(r__2,vmax);
+ vcrit = bignum / vmax;
+
+ } else {
+
+/*
+ 2-by-2 diagonal block
+
+ Scale if necessary to avoid overflow when forming
+ the right-hand side.
+
+ Computing MAX
+*/
+ r__1 = work[j], r__2 = work[j + 1];
+ beta = dmax(r__1,r__2);
+ if (beta > vcrit) {
+ rec = 1.f / vmax;
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ vmax = 1.f;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 1;
+ work[j + *n] -= sdot_(&i__3, &t[ki + 1 + j * t_dim1],
+ &c__1, &work[ki + 1 + *n], &c__1);
+
+ i__3 = j - ki - 1;
+ work[j + 1 + *n] -= sdot_(&i__3, &t[ki + 1 + (j + 1) *
+ t_dim1], &c__1, &work[ki + 1 + *n], &c__1);
+
+/*
+ Solve
+ [T(J,J)-WR T(J,J+1) ]'* X = SCALE*( WORK1 )
+ [T(J+1,J) T(J+1,J+1)-WR] ( WORK2 )
+*/
+
+ slaln2_(&c_true, &c__2, &c__1, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &c_b29, x, &c__2, &scale, &xnorm,
+ &ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + 1 + *n] = x[1];
+
+/* Computing MAX */
+ r__3 = (r__1 = work[j + *n], dabs(r__1)), r__4 = (
+ r__2 = work[j + 1 + *n], dabs(r__2)), r__3 =
+ max(r__3,r__4);
+ vmax = dmax(r__3,vmax);
+ vcrit = bignum / vmax;
+
+ }
+L170:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VL and normalize. */
+
+ if (! over) {
+ i__2 = *n - ki + 1;
+ scopy_(&i__2, &work[ki + *n], &c__1, &vl[ki + is *
+ vl_dim1], &c__1);
+
+ i__2 = *n - ki + 1;
+ ii = isamax_(&i__2, &vl[ki + is * vl_dim1], &c__1) + ki -
+ 1;
+ remax = 1.f / (r__1 = vl[ii + is * vl_dim1], dabs(r__1));
+ i__2 = *n - ki + 1;
+ sscal_(&i__2, &remax, &vl[ki + is * vl_dim1], &c__1);
+
+ i__2 = ki - 1;
+ for (k = 1; k <= i__2; ++k) {
+ vl[k + is * vl_dim1] = 0.f;
+/* L180: */
+ }
+
+ } else {
+
+ if (ki < *n) {
+ i__2 = *n - ki;
+ sgemv_("N", n, &i__2, &c_b15, &vl[(ki + 1) * vl_dim1
+ + 1], ldvl, &work[ki + 1 + *n], &c__1, &work[
+ ki + *n], &vl[ki * vl_dim1 + 1], &c__1);
+ }
+
+ ii = isamax_(n, &vl[ki * vl_dim1 + 1], &c__1);
+ remax = 1.f / (r__1 = vl[ii + ki * vl_dim1], dabs(r__1));
+ sscal_(n, &remax, &vl[ki * vl_dim1 + 1], &c__1);
+
+ }
+
+ } else {
+
+/*
+ Complex left eigenvector.
+
+ Initial solve:
+ ((T(KI,KI) T(KI,KI+1) )' - (WR - I* WI))*X = 0.
+ ((T(KI+1,KI) T(KI+1,KI+1)) )
+*/
+
+ if ((r__1 = t[ki + (ki + 1) * t_dim1], dabs(r__1)) >= (r__2 =
+ t[ki + 1 + ki * t_dim1], dabs(r__2))) {
+ work[ki + *n] = wi / t[ki + (ki + 1) * t_dim1];
+ work[ki + 1 + n2] = 1.f;
+ } else {
+ work[ki + *n] = 1.f;
+ work[ki + 1 + n2] = -wi / t[ki + 1 + ki * t_dim1];
+ }
+ work[ki + 1 + *n] = 0.f;
+ work[ki + n2] = 0.f;
+
+/* Form right-hand side */
+
+ i__2 = *n;
+ for (k = ki + 2; k <= i__2; ++k) {
+ work[k + *n] = -work[ki + *n] * t[ki + k * t_dim1];
+ work[k + n2] = -work[ki + 1 + n2] * t[ki + 1 + k * t_dim1]
+ ;
+/* L190: */
+ }
+
+/*
+ Solve complex quasi-triangular system:
+ ( T(KI+2,N:KI+2,N) - (WR-i*WI) )*X = WORK1+i*WORK2
+*/
+
+ vmax = 1.f;
+ vcrit = bignum;
+
+ jnxt = ki + 2;
+ i__2 = *n;
+ for (j = ki + 2; j <= i__2; ++j) {
+ if (j < jnxt) {
+ goto L200;
+ }
+ j1 = j;
+ j2 = j;
+ jnxt = j + 1;
+ if (j < *n) {
+ if (t[j + 1 + j * t_dim1] != 0.f) {
+ j2 = j + 1;
+ jnxt = j + 2;
+ }
+ }
+
+ if (j1 == j2) {
+
+/*
+ 1-by-1 diagonal block
+
+ Scale if necessary to avoid overflow when
+ forming the right-hand side elements.
+*/
+
+ if (work[j] > vcrit) {
+ rec = 1.f / vmax;
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &rec, &work[ki + n2], &c__1);
+ vmax = 1.f;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 2;
+ work[j + *n] -= sdot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + *n], &c__1);
+ i__3 = j - ki - 2;
+ work[j + n2] -= sdot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + n2], &c__1);
+
+/* Solve (T(J,J)-(WR-i*WI))*(X11+i*X12)= WK+I*WK2 */
+
+ r__1 = -wi;
+ slaln2_(&c_false, &c__1, &c__2, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &r__1, x, &c__2, &scale, &xnorm, &
+ ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &scale, &work[ki + n2], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + n2] = x[2];
+/* Computing MAX */
+ r__3 = (r__1 = work[j + *n], dabs(r__1)), r__4 = (
+ r__2 = work[j + n2], dabs(r__2)), r__3 = max(
+ r__3,r__4);
+ vmax = dmax(r__3,vmax);
+ vcrit = bignum / vmax;
+
+ } else {
+
+/*
+ 2-by-2 diagonal block
+
+ Scale if necessary to avoid overflow when forming
+ the right-hand side elements.
+
+ Computing MAX
+*/
+ r__1 = work[j], r__2 = work[j + 1];
+ beta = dmax(r__1,r__2);
+ if (beta > vcrit) {
+ rec = 1.f / vmax;
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &rec, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &rec, &work[ki + n2], &c__1);
+ vmax = 1.f;
+ vcrit = bignum;
+ }
+
+ i__3 = j - ki - 2;
+ work[j + *n] -= sdot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + *n], &c__1);
+
+ i__3 = j - ki - 2;
+ work[j + n2] -= sdot_(&i__3, &t[ki + 2 + j * t_dim1],
+ &c__1, &work[ki + 2 + n2], &c__1);
+
+ i__3 = j - ki - 2;
+ work[j + 1 + *n] -= sdot_(&i__3, &t[ki + 2 + (j + 1) *
+ t_dim1], &c__1, &work[ki + 2 + *n], &c__1);
+
+ i__3 = j - ki - 2;
+ work[j + 1 + n2] -= sdot_(&i__3, &t[ki + 2 + (j + 1) *
+ t_dim1], &c__1, &work[ki + 2 + n2], &c__1);
+
+/*
+ Solve 2-by-2 complex linear equation
+ ([T(j,j) T(j,j+1) ]'-(wr-i*wi)*I)*X = SCALE*B
+ ([T(j+1,j) T(j+1,j+1)] )
+*/
+
+ r__1 = -wi;
+ slaln2_(&c_true, &c__2, &c__2, &smin, &c_b15, &t[j +
+ j * t_dim1], ldt, &c_b15, &c_b15, &work[j + *
+ n], n, &wr, &r__1, x, &c__2, &scale, &xnorm, &
+ ierr);
+
+/* Scale if necessary */
+
+ if (scale != 1.f) {
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &scale, &work[ki + *n], &c__1);
+ i__3 = *n - ki + 1;
+ sscal_(&i__3, &scale, &work[ki + n2], &c__1);
+ }
+ work[j + *n] = x[0];
+ work[j + n2] = x[2];
+ work[j + 1 + *n] = x[1];
+ work[j + 1 + n2] = x[3];
+/* Computing MAX */
+ r__1 = dabs(x[0]), r__2 = dabs(x[2]), r__1 = max(r__1,
+ r__2), r__2 = dabs(x[1]), r__1 = max(r__1,
+ r__2), r__2 = dabs(x[3]), r__1 = max(r__1,
+ r__2);
+ vmax = dmax(r__1,vmax);
+ vcrit = bignum / vmax;
+
+ }
+L200:
+ ;
+ }
+
+/* Copy the vector x or Q*x to VL and normalize. */
+
+ if (! over) {
+ i__2 = *n - ki + 1;
+ scopy_(&i__2, &work[ki + *n], &c__1, &vl[ki + is *
+ vl_dim1], &c__1);
+ i__2 = *n - ki + 1;
+ scopy_(&i__2, &work[ki + n2], &c__1, &vl[ki + (is + 1) *
+ vl_dim1], &c__1);
+
+ emax = 0.f;
+ i__2 = *n;
+ for (k = ki; k <= i__2; ++k) {
+/* Computing MAX */
+ r__3 = emax, r__4 = (r__1 = vl[k + is * vl_dim1],
+ dabs(r__1)) + (r__2 = vl[k + (is + 1) *
+ vl_dim1], dabs(r__2));
+ emax = dmax(r__3,r__4);
+/* L220: */
+ }
+ remax = 1.f / emax;
+ i__2 = *n - ki + 1;
+ sscal_(&i__2, &remax, &vl[ki + is * vl_dim1], &c__1);
+ i__2 = *n - ki + 1;
+ sscal_(&i__2, &remax, &vl[ki + (is + 1) * vl_dim1], &c__1)
+ ;
+
+ i__2 = ki - 1;
+ for (k = 1; k <= i__2; ++k) {
+ vl[k + is * vl_dim1] = 0.f;
+ vl[k + (is + 1) * vl_dim1] = 0.f;
+/* L230: */
+ }
+ } else {
+ if (ki < *n - 1) {
+ i__2 = *n - ki - 1;
+ sgemv_("N", n, &i__2, &c_b15, &vl[(ki + 2) * vl_dim1
+ + 1], ldvl, &work[ki + 2 + *n], &c__1, &work[
+ ki + *n], &vl[ki * vl_dim1 + 1], &c__1);
+ i__2 = *n - ki - 1;
+ sgemv_("N", n, &i__2, &c_b15, &vl[(ki + 2) * vl_dim1
+ + 1], ldvl, &work[ki + 2 + n2], &c__1, &work[
+ ki + 1 + n2], &vl[(ki + 1) * vl_dim1 + 1], &
+ c__1);
+ } else {
+ sscal_(n, &work[ki + *n], &vl[ki * vl_dim1 + 1], &
+ c__1);
+ sscal_(n, &work[ki + 1 + n2], &vl[(ki + 1) * vl_dim1
+ + 1], &c__1);
+ }
+
+ emax = 0.f;
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+/* Computing MAX */
+ r__3 = emax, r__4 = (r__1 = vl[k + ki * vl_dim1],
+ dabs(r__1)) + (r__2 = vl[k + (ki + 1) *
+ vl_dim1], dabs(r__2));
+ emax = dmax(r__3,r__4);
+/* L240: */
+ }
+ remax = 1.f / emax;
+ sscal_(n, &remax, &vl[ki * vl_dim1 + 1], &c__1);
+ sscal_(n, &remax, &vl[(ki + 1) * vl_dim1 + 1], &c__1);
+
+ }
+
+ }
+
+ ++is;
+ if (ip != 0) {
+ ++is;
+ }
+L250:
+ if (ip == -1) {
+ ip = 0;
+ }
+ if (ip == 1) {
+ ip = -1;
+ }
+
+/* L260: */
+ }
+
+ }
+
+ return 0;
+
+/* End of STREVC */
+
+} /* strevc_ */
+
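+/*
+  Illustrative calling sketch (an added annotation, not part of the generated
+  LAPACK source): computing all right eigenvectors of a real Schur factor T
+  with STREVC.  The argument list below assumes the standard STREVC interface
+  (SIDE, HOWMNY, SELECT, N, T, LDT, VL, LDVL, VR, LDVR, MM, M, WORK, INFO)
+  and the f2c convention, used throughout this file, of passing every
+  argument by pointer.
+
+      integer n = 4, ldt = 4, ldvl = 1, ldvr = 4, mm = 4, m, info;
+      real t[16], vl[1], vr[16], work[12];    // WORK needs 3*N entries
+      logical select[4];                      // not referenced when HOWMNY = "A"
+      // ... t holds a quasi-triangular Schur form, e.g. from SHSEQR ...
+      strevc_("R", "A", select, &n, t, &ldt, vl, &ldvl, vr, &ldvr,
+              &mm, &m, work, &info);
+      // on return m == n and columns 1..m of vr hold the right eigenvectors
+*/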
+/* Subroutine */ int strexc_(char *compq, integer *n, real *t, integer *ldt,
+ real *q, integer *ldq, integer *ifst, integer *ilst, real *work,
+ integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, t_dim1, t_offset, i__1;
+
+ /* Local variables */
+ static integer nbf, nbl, here;
+ extern logical lsame_(char *, char *);
+ static logical wantq;
+ extern /* Subroutine */ int xerbla_(char *, integer *), slaexc_(
+ logical *, integer *, real *, integer *, real *, integer *,
+ integer *, integer *, integer *, real *, integer *);
+ static integer nbnext;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ STREXC reorders the real Schur factorization of a real matrix
+ A = Q*T*Q**T, so that the diagonal block of T with row index IFST is
+ moved to row ILST.
+
+ The real Schur form T is reordered by an orthogonal similarity
+ transformation Z**T*T*Z, and optionally the matrix Q of Schur vectors
+ is updated by postmultiplying it with Z.
+
+ T must be in Schur canonical form (as returned by SHSEQR), that is,
+ block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+ 2-by-2 diagonal block has its diagonal elements equal and its
+ off-diagonal elements of opposite sign.
+
+ Arguments
+ =========
+
+ COMPQ (input) CHARACTER*1
+ = 'V': update the matrix Q of Schur vectors;
+ = 'N': do not update Q.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) REAL array, dimension (LDT,N)
+            On entry, the upper quasi-triangular matrix T, in Schur
+            canonical form.
+ On exit, the reordered upper quasi-triangular matrix, again
+ in Schur canonical form.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ Q (input/output) REAL array, dimension (LDQ,N)
+ On entry, if COMPQ = 'V', the matrix Q of Schur vectors.
+ On exit, if COMPQ = 'V', Q has been postmultiplied by the
+ orthogonal transformation matrix Z which reorders T.
+ If COMPQ = 'N', Q is not referenced.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ IFST (input/output) INTEGER
+ ILST (input/output) INTEGER
+ Specify the reordering of the diagonal blocks of T.
+ The block with row index IFST is moved to row ILST, by a
+ sequence of transpositions between adjacent blocks.
+ On exit, if IFST pointed on entry to the second row of a
+ 2-by-2 block, it is changed to point to the first row; ILST
+ always points to the first row of the block in its final
+ position (which may differ from its input value by +1 or -1).
+ 1 <= IFST <= N; 1 <= ILST <= N.
+
+ WORK (workspace) REAL array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ = 1: two adjacent blocks were too close to swap (the problem
+ is very ill-conditioned); T may have been partially
+ reordered, and ILST points to the first row of the
+ current position of the block being moved.
+
+ =====================================================================
+
+
+ Decode and test the input arguments.
+*/
+
+ /* Parameter adjustments */
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(compq, "V");
+ if (! wantq && ! lsame_(compq, "N")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldt < max(1,*n)) {
+ *info = -4;
+ } else if (*ldq < 1 || wantq && *ldq < max(1,*n)) {
+ *info = -6;
+ } else if (*ifst < 1 || *ifst > *n) {
+ *info = -7;
+ } else if (*ilst < 1 || *ilst > *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("STREXC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 1) {
+ return 0;
+ }
+
+/*
+   Determine the first row of the specified block
+   and find out whether it is 1 by 1 or 2 by 2.
+*/
+
+ if (*ifst > 1) {
+ if (t[*ifst + (*ifst - 1) * t_dim1] != 0.f) {
+ --(*ifst);
+ }
+ }
+ nbf = 1;
+ if (*ifst < *n) {
+ if (t[*ifst + 1 + *ifst * t_dim1] != 0.f) {
+ nbf = 2;
+ }
+ }
+
+/*
+ Determine the first row of the final block
+   and find out whether it is 1 by 1 or 2 by 2.
+*/
+
+ if (*ilst > 1) {
+ if (t[*ilst + (*ilst - 1) * t_dim1] != 0.f) {
+ --(*ilst);
+ }
+ }
+ nbl = 1;
+ if (*ilst < *n) {
+ if (t[*ilst + 1 + *ilst * t_dim1] != 0.f) {
+ nbl = 2;
+ }
+ }
+
+ if (*ifst == *ilst) {
+ return 0;
+ }
+
+ if (*ifst < *ilst) {
+
+/* Update ILST */
+
+ if (nbf == 2 && nbl == 1) {
+ --(*ilst);
+ }
+ if (nbf == 1 && nbl == 2) {
+ ++(*ilst);
+ }
+
+ here = *ifst;
+
+L10:
+
+/* Swap block with next one below */
+
+ if (nbf == 1 || nbf == 2) {
+
+/* Current block either 1 by 1 or 2 by 2 */
+
+ nbnext = 1;
+ if (here + nbf + 1 <= *n) {
+ if (t[here + nbf + 1 + (here + nbf) * t_dim1] != 0.f) {
+ nbnext = 2;
+ }
+ }
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &here, &
+ nbf, &nbnext, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here += nbnext;
+
+/* Test if 2 by 2 block breaks into two 1 by 1 blocks */
+
+ if (nbf == 2) {
+ if (t[here + 1 + here * t_dim1] == 0.f) {
+ nbf = 3;
+ }
+ }
+
+ } else {
+
+/*
+ Current block consists of two 1 by 1 blocks each of which
+ must be swapped individually
+*/
+
+ nbnext = 1;
+ if (here + 3 <= *n) {
+ if (t[here + 3 + (here + 2) * t_dim1] != 0.f) {
+ nbnext = 2;
+ }
+ }
+ i__1 = here + 1;
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &i__1, &
+ c__1, &nbnext, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ if (nbnext == 1) {
+
+/* Swap two 1 by 1 blocks, no problems possible */
+
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &nbnext, &work[1], info);
+ ++here;
+ } else {
+
+/* Recompute NBNEXT in case 2 by 2 split */
+
+ if (t[here + 2 + (here + 1) * t_dim1] == 0.f) {
+ nbnext = 1;
+ }
+ if (nbnext == 2) {
+
+/* 2 by 2 Block did not split */
+
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &nbnext, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here += 2;
+ } else {
+
+/* 2 by 2 Block did split */
+
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &c__1, &work[1], info);
+ i__1 = here + 1;
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ i__1, &c__1, &c__1, &work[1], info);
+ here += 2;
+ }
+ }
+ }
+ if (here < *ilst) {
+ goto L10;
+ }
+
+ } else {
+
+ here = *ifst;
+L20:
+
+/* Swap block with next one above */
+
+ if (nbf == 1 || nbf == 2) {
+
+/* Current block either 1 by 1 or 2 by 2 */
+
+ nbnext = 1;
+ if (here >= 3) {
+ if (t[here - 1 + (here - 2) * t_dim1] != 0.f) {
+ nbnext = 2;
+ }
+ }
+ i__1 = here - nbnext;
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &i__1, &
+ nbnext, &nbf, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here -= nbnext;
+
+/* Test if 2 by 2 block breaks into two 1 by 1 blocks */
+
+ if (nbf == 2) {
+ if (t[here + 1 + here * t_dim1] == 0.f) {
+ nbf = 3;
+ }
+ }
+
+ } else {
+
+/*
+ Current block consists of two 1 by 1 blocks each of which
+ must be swapped individually
+*/
+
+ nbnext = 1;
+ if (here >= 3) {
+ if (t[here - 1 + (here - 2) * t_dim1] != 0.f) {
+ nbnext = 2;
+ }
+ }
+ i__1 = here - nbnext;
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &i__1, &
+ nbnext, &c__1, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ if (nbnext == 1) {
+
+/* Swap two 1 by 1 blocks, no problems possible */
+
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &nbnext, &c__1, &work[1], info);
+ --here;
+ } else {
+
+/* Recompute NBNEXT in case 2 by 2 split */
+
+ if (t[here + (here - 1) * t_dim1] == 0.f) {
+ nbnext = 1;
+ }
+ if (nbnext == 2) {
+
+/* 2 by 2 Block did not split */
+
+ i__1 = here - 1;
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ i__1, &c__2, &c__1, &work[1], info);
+ if (*info != 0) {
+ *ilst = here;
+ return 0;
+ }
+ here += -2;
+ } else {
+
+/* 2 by 2 Block did split */
+
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ here, &c__1, &c__1, &work[1], info);
+ i__1 = here - 1;
+ slaexc_(&wantq, n, &t[t_offset], ldt, &q[q_offset], ldq, &
+ i__1, &c__1, &c__1, &work[1], info);
+ here += -2;
+ }
+ }
+ }
+ if (here > *ilst) {
+ goto L20;
+ }
+ }
+ *ilst = here;
+
+ return 0;
+
+/* End of STREXC */
+
+} /* strexc_ */
+
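+/*
+  Illustrative calling sketch (an added annotation, not part of the generated
+  LAPACK source): using STREXC, whose f2c prototype appears below, to move the
+  diagonal block that starts at row 3 of a 4-by-4 real Schur factor up to
+  row 1 while updating the Schur vectors in Q.  The arrays are assumed to
+  already hold a valid Schur factorization in column-major order.
+
+      integer n = 4, ldt = 4, ldq = 4, ifst = 3, ilst = 1, info;
+      real t[16], q[16], work[4];
+      // ... t = Schur form, q = Schur vectors, both column-major ...
+      strexc_("V", &n, t, &ldt, q, &ldq, &ifst, &ilst, work, &info);
+      // info == 0 on success; info == 1 if two blocks were too close to swap
+*/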
+/* Subroutine */ int strti2_(char *uplo, char *diag, integer *n, real *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer j;
+ static real ajj;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int strmv_(char *, char *, char *, integer *,
+ real *, integer *, real *, integer *),
+ xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ STRTI2 computes the inverse of a real upper or lower triangular
+ matrix.
+
+ This is the Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the matrix A is upper or lower triangular.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ DIAG (input) CHARACTER*1
+ Specifies whether or not the matrix A is unit triangular.
+ = 'N': Non-unit triangular
+ = 'U': Unit triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading n by n upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("STRTI2", &i__1);
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (nounit) {
+ a[j + j * a_dim1] = 1.f / a[j + j * a_dim1];
+ ajj = -a[j + j * a_dim1];
+ } else {
+ ajj = -1.f;
+ }
+
+/* Compute elements 1:j-1 of j-th column. */
+
+ i__2 = j - 1;
+ strmv_("Upper", "No transpose", diag, &i__2, &a[a_offset], lda, &
+ a[j * a_dim1 + 1], &c__1);
+ i__2 = j - 1;
+ sscal_(&i__2, &ajj, &a[j * a_dim1 + 1], &c__1);
+/* L10: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix. */
+
+ for (j = *n; j >= 1; --j) {
+ if (nounit) {
+ a[j + j * a_dim1] = 1.f / a[j + j * a_dim1];
+ ajj = -a[j + j * a_dim1];
+ } else {
+ ajj = -1.f;
+ }
+ if (j < *n) {
+
+/* Compute elements j+1:n of j-th column. */
+
+ i__1 = *n - j;
+ strmv_("Lower", "No transpose", diag, &i__1, &a[j + 1 + (j +
+ 1) * a_dim1], lda, &a[j + 1 + j * a_dim1], &c__1);
+ i__1 = *n - j;
+ sscal_(&i__1, &ajj, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of STRTI2 */
+
+} /* strti2_ */
+
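+/*
+  Illustrative calling sketch (an added annotation, not part of the generated
+  LAPACK source): inverting a 3-by-3 non-unit upper triangular matrix in
+  place with STRTI2.  The array is column-major; the strictly lower triangle
+  is never referenced.
+
+      integer n = 3, lda = 3, info;
+      real a[9] = { 2.f, 0.f, 0.f,     // column 1
+                    1.f, 4.f, 0.f,     // column 2
+                    3.f, 5.f, 6.f };   // column 3
+      strti2_("U", "N", &n, a, &lda, &info);
+      // the upper triangle of a now holds inv(A); info == 0 on success
+*/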
+/* Subroutine */ int strtri_(char *uplo, char *diag, integer *n, real *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, i__1, i__2[2], i__3, i__4, i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer j, jb, nb, nn;
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int strmm_(char *, char *, char *, char *,
+ integer *, integer *, real *, real *, integer *, real *, integer *
+ ), strsm_(char *, char *, char *,
+ char *, integer *, integer *, real *, real *, integer *, real *,
+ integer *), strti2_(char *, char *
+ , integer *, real *, integer *, integer *),
+ xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ STRTRI computes the inverse of a real upper or lower triangular
+ matrix A.
+
+ This is the Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': A is upper triangular;
+ = 'L': A is lower triangular.
+
+ DIAG (input) CHARACTER*1
+ = 'N': A is non-unit triangular;
+ = 'U': A is unit triangular.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) REAL array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, A(i,i) is exactly zero. The triangular
+                 matrix is singular and its inverse cannot be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("STRTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Check for singularity if non-unit. */
+
+ if (nounit) {
+ i__1 = *n;
+ for (*info = 1; *info <= i__1; ++(*info)) {
+ if (a[*info + *info * a_dim1] == 0.f) {
+ return 0;
+ }
+/* L10: */
+ }
+ *info = 0;
+ }
+
+/*
+ Determine the block size for this environment.
+
+ Writing concatenation
+*/
+ i__2[0] = 1, a__1[0] = uplo;
+ i__2[1] = 1, a__1[1] = diag;
+ s_cat(ch__1, a__1, i__2, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "STRTRI", ch__1, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)2);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ strti2_(uplo, diag, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix */
+
+ i__1 = *n;
+ i__3 = nb;
+ for (j = 1; i__3 < 0 ? j >= i__1 : j <= i__1; j += i__3) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *n - j + 1;
+ jb = min(i__4,i__5);
+
+/* Compute rows 1:j-1 of current block column */
+
+ i__4 = j - 1;
+ strmm_("Left", "Upper", "No transpose", diag, &i__4, &jb, &
+ c_b15, &a[a_offset], lda, &a[j * a_dim1 + 1], lda);
+ i__4 = j - 1;
+ strsm_("Right", "Upper", "No transpose", diag, &i__4, &jb, &
+ c_b151, &a[j + j * a_dim1], lda, &a[j * a_dim1 + 1],
+ lda);
+
+/* Compute inverse of current diagonal block */
+
+ strti2_("Upper", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L20: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix */
+
+ nn = (*n - 1) / nb * nb + 1;
+ i__3 = -nb;
+ for (j = nn; i__3 < 0 ? j >= 1 : j <= 1; j += i__3) {
+/* Computing MIN */
+ i__1 = nb, i__4 = *n - j + 1;
+ jb = min(i__1,i__4);
+ if (j + jb <= *n) {
+
+/* Compute rows j+jb:n of current block column */
+
+ i__1 = *n - j - jb + 1;
+ strmm_("Left", "Lower", "No transpose", diag, &i__1, &jb,
+ &c_b15, &a[j + jb + (j + jb) * a_dim1], lda, &a[j
+ + jb + j * a_dim1], lda);
+ i__1 = *n - j - jb + 1;
+ strsm_("Right", "Lower", "No transpose", diag, &i__1, &jb,
+ &c_b151, &a[j + j * a_dim1], lda, &a[j + jb + j *
+ a_dim1], lda);
+ }
+
+/* Compute inverse of current diagonal block */
+
+ strti2_("Lower", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L30: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of STRTRI */
+
+} /* strtri_ */
+
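+/*
+  Illustrative calling sketch (an added annotation, not part of the generated
+  LAPACK source): STRTRI is the blocked (Level 3 BLAS) driver with the same
+  interface as STRTI2, and it additionally reports singularity through INFO
+  before attempting the inversion.
+
+      integer n = 2, lda = 2, info;
+      real a[4] = { 1.f, 0.f,     // column 1
+                    0.f, 0.f };   // column 2: A(2,2) == 0, so A is singular
+      strtri_("U", "N", &n, a, &lda, &info);
+      // info == 2 here: A(2,2) is exactly zero and the inverse is not computed
+*/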
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.f.patch b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.f.patch
new file mode 100644
index 00000000000..2e82d986e62
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_s_lapack.f.patch
@@ -0,0 +1,32 @@
+@@ -17359,5 +17359,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 15
+ END DO
++ 15 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -17371,5 +17372,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 16
+ END DO
++ 16 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -17415,5 +17417,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = 1, I-1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 35
+ END DO
++ 35 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
+@@ -17431,5 +17434,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 36
+ END DO
++ 36 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.c
new file mode 100644
index 00000000000..0f11f2e725e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.c
@@ -0,0 +1,29996 @@
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static doublecomplex c_b56 = {0.,0.};
+static doublecomplex c_b57 = {1.,0.};
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+static integer c__0 = 0;
+static integer c__65 = 65;
+static integer c__9 = 9;
+static integer c__6 = 6;
+static doublereal c_b328 = 0.;
+static doublereal c_b1034 = 1.;
+static integer c__12 = 12;
+static integer c__49 = 49;
+static doublereal c_b1276 = -1.;
+static integer c__13 = 13;
+static integer c__15 = 15;
+static integer c__14 = 14;
+static integer c__16 = 16;
+static logical c_false = FALSE_;
+static logical c_true = TRUE_;
+static doublereal c_b2435 = .5;
+
+/* Subroutine */ int zgebak_(char *job, char *side, integer *n, integer *ilo,
+ integer *ihi, doublereal *scale, integer *m, doublecomplex *v,
+ integer *ldv, integer *info)
+{
+ /* System generated locals */
+ integer v_dim1, v_offset, i__1;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal s;
+ static integer ii;
+ extern logical lsame_(char *, char *);
+ static logical leftv;
+ extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), xerbla_(char *, integer *),
+ zdscal_(integer *, doublereal *, doublecomplex *, integer *);
+ static logical rightv;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGEBAK forms the right or left eigenvectors of a complex general
+ matrix by backward transformation on the computed eigenvectors of the
+ balanced matrix output by ZGEBAL.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the type of backward transformation required:
+ = 'N', do nothing, return immediately;
+ = 'P', do backward transformation for permutation only;
+ = 'S', do backward transformation for scaling only;
+ = 'B', do backward transformations for both permutation and
+ scaling.
+ JOB must be the same as the argument JOB supplied to ZGEBAL.
+
+ SIDE (input) CHARACTER*1
+ = 'R': V contains right eigenvectors;
+ = 'L': V contains left eigenvectors.
+
+ N (input) INTEGER
+ The number of rows of the matrix V. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ The integers ILO and IHI determined by ZGEBAL.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ SCALE (input) DOUBLE PRECISION array, dimension (N)
+ Details of the permutation and scaling factors, as returned
+ by ZGEBAL.
+
+ M (input) INTEGER
+ The number of columns of the matrix V. M >= 0.
+
+ V (input/output) COMPLEX*16 array, dimension (LDV,M)
+ On entry, the matrix of right or left eigenvectors to be
+ transformed, as returned by ZHSEIN or ZTREVC.
+ On exit, V is overwritten by the transformed eigenvectors.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V. LDV >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Decode and Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --scale;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+
+ /* Function Body */
+ rightv = lsame_(side, "R");
+ leftv = lsame_(side, "L");
+
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (! rightv && ! leftv) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -7;
+ } else if (*ldv < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEBAK", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*m == 0) {
+ return 0;
+ }
+ if (lsame_(job, "N")) {
+ return 0;
+ }
+
+ if (*ilo == *ihi) {
+ goto L30;
+ }
+
+/* Backward balance */
+
+ if (lsame_(job, "S") || lsame_(job, "B")) {
+
+ if (rightv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = scale[i__];
+ zdscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L10: */
+ }
+ }
+
+ if (leftv) {
+ i__1 = *ihi;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+ s = 1. / scale[i__];
+ zdscal_(m, &s, &v[i__ + v_dim1], ldv);
+/* L20: */
+ }
+ }
+
+ }
+
+/*
+ Backward permutation
+
+ For I = ILO-1 step -1 until 1,
+ IHI+1 step 1 until N do --
+*/
+
+L30:
+ if (lsame_(job, "P") || lsame_(job, "B")) {
+ if (rightv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L40;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = (integer) scale[i__];
+ if (k == i__) {
+ goto L40;
+ }
+ zswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L40:
+ ;
+ }
+ }
+
+ if (leftv) {
+ i__1 = *n;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__ = ii;
+ if (i__ >= *ilo && i__ <= *ihi) {
+ goto L50;
+ }
+ if (i__ < *ilo) {
+ i__ = *ilo - ii;
+ }
+ k = (integer) scale[i__];
+ if (k == i__) {
+ goto L50;
+ }
+ zswap_(m, &v[i__ + v_dim1], ldv, &v[k + v_dim1], ldv);
+L50:
+ ;
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZGEBAK */
+
+} /* zgebak_ */
+
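+/*
+  Illustrative calling sketch (an added annotation, not part of the generated
+  LAPACK source): ZGEBAK is the last step of the usual eigenvector pipeline.
+  After ZGEBAL has balanced A and eigenvectors of the balanced matrix have
+  been computed (e.g. by ZHSEQR followed by ZTREVC), the same JOB, ILO, IHI
+  and SCALE are passed back to recover eigenvectors of the original matrix.
+
+      integer n = 4, lda = 4, ldv = 4, m = 4, ilo, ihi, info;
+      doublereal scale[4];
+      doublecomplex a[16], v[16];
+      zgebal_("B", &n, a, &lda, &ilo, &ihi, scale, &info);
+      // ... Hessenberg reduction, Schur form, eigenvectors of the balanced A in v ...
+      zgebak_("B", "R", &n, &ilo, &ihi, scale, &m, v, &ldv, &info);
+*/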
+/* Subroutine */ int zgebal_(char *job, integer *n, doublecomplex *a, integer
+ *lda, integer *ilo, integer *ihi, doublereal *scale, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal c__, f, g;
+ static integer i__, j, k, l, m;
+ static doublereal r__, s, ca, ra;
+ static integer ica, ira, iexc;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static doublereal sfmin1, sfmin2, sfmax1, sfmax2;
+
+ extern logical disnan_(doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), zdscal_(
+ integer *, doublereal *, doublecomplex *, integer *);
+ extern integer izamax_(integer *, doublecomplex *, integer *);
+ static logical noconv;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ ZGEBAL balances a general complex matrix A. This involves, first,
+ permuting A by a similarity transformation to isolate eigenvalues
+ in the first 1 to ILO-1 and last IHI+1 to N elements on the
+ diagonal; and second, applying a diagonal similarity transformation
+ to rows and columns ILO to IHI to make the rows and columns as
+ close in norm as possible. Both steps are optional.
+
+ Balancing may reduce the 1-norm of the matrix, and improve the
+ accuracy of the computed eigenvalues and/or eigenvectors.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ Specifies the operations to be performed on A:
+ = 'N': none: simply set ILO = 1, IHI = N, SCALE(I) = 1.0
+ for i = 1,...,N;
+ = 'P': permute only;
+ = 'S': scale only;
+ = 'B': both permute and scale.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the input matrix A.
+ On exit, A is overwritten by the balanced matrix.
+ If JOB = 'N', A is not referenced.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ ILO (output) INTEGER
+ IHI (output) INTEGER
+ ILO and IHI are set to integers such that on exit
+ A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N.
+ If JOB = 'N' or 'S', ILO = 1 and IHI = N.
+
+ SCALE (output) DOUBLE PRECISION array, dimension (N)
+ Details of the permutations and scaling factors applied to
+ A. If P(j) is the index of the row and column interchanged
+ with row and column j and D(j) is the scaling factor
+ applied to row and column j, then
+ SCALE(j) = P(j) for j = 1,...,ILO-1
+ = D(j) for j = ILO,...,IHI
+ = P(j) for j = IHI+1,...,N.
+ The order in which the interchanges are made is N to IHI+1,
+ then 1 to ILO-1.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The permutations consist of row and column interchanges which put
+ the matrix in the form
+
+ ( T1 X Y )
+ P A P = ( 0 B Z )
+ ( 0 0 T2 )
+
+ where T1 and T2 are upper triangular matrices whose eigenvalues lie
+ along the diagonal. The column indices ILO and IHI mark the starting
+ and ending columns of the submatrix B. Balancing consists of applying
+ a diagonal similarity transformation inv(D) * B * D to make the
+ 1-norms of each row of B and its corresponding column nearly equal.
+ The output matrix is
+
+ ( T1 X*D Y )
+ ( 0 inv(D)*B*D inv(D)*Z ).
+ ( 0 0 T2 )
+
+ Information about the permutations P and the diagonal matrix D is
+ returned in the vector SCALE.
+
+ This subroutine is based on the EISPACK routine CBAL.
+
+ Modified by Tzu-Yi Chen, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --scale;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(job, "N") && ! lsame_(job, "P") && ! lsame_(job, "S")
+ && ! lsame_(job, "B")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEBAL", &i__1);
+ return 0;
+ }
+
+ k = 1;
+ l = *n;
+
+ if (*n == 0) {
+ goto L210;
+ }
+
+ if (lsame_(job, "N")) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scale[i__] = 1.;
+/* L10: */
+ }
+ goto L210;
+ }
+
+ if (lsame_(job, "S")) {
+ goto L120;
+ }
+
+/* Permutation to isolate eigenvalues if possible */
+
+ goto L50;
+
+/* Row and column exchange. */
+
+L20:
+ scale[m] = (doublereal) j;
+ if (j == m) {
+ goto L30;
+ }
+
+ zswap_(&l, &a[j * a_dim1 + 1], &c__1, &a[m * a_dim1 + 1], &c__1);
+ i__1 = *n - k + 1;
+ zswap_(&i__1, &a[j + k * a_dim1], lda, &a[m + k * a_dim1], lda);
+
+L30:
+ switch (iexc) {
+ case 1: goto L40;
+ case 2: goto L80;
+ }
+
+/* Search for rows isolating an eigenvalue and push them down. */
+
+L40:
+ if (l == 1) {
+ goto L210;
+ }
+ --l;
+
+L50:
+ for (j = l; j >= 1; --j) {
+
+ i__1 = l;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ == j) {
+ goto L60;
+ }
+ i__2 = j + i__ * a_dim1;
+ if (a[i__2].r != 0. || d_imag(&a[j + i__ * a_dim1]) != 0.) {
+ goto L70;
+ }
+L60:
+ ;
+ }
+
+ m = l;
+ iexc = 1;
+ goto L20;
+L70:
+ ;
+ }
+
+ goto L90;
+
+/* Search for columns isolating an eigenvalue and push them left. */
+
+L80:
+ ++k;
+
+L90:
+ i__1 = l;
+ for (j = k; j <= i__1; ++j) {
+
+ i__2 = l;
+ for (i__ = k; i__ <= i__2; ++i__) {
+ if (i__ == j) {
+ goto L100;
+ }
+ i__3 = i__ + j * a_dim1;
+ if (a[i__3].r != 0. || d_imag(&a[i__ + j * a_dim1]) != 0.) {
+ goto L110;
+ }
+L100:
+ ;
+ }
+
+ m = k;
+ iexc = 2;
+ goto L20;
+L110:
+ ;
+ }
+
+L120:
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ scale[i__] = 1.;
+/* L130: */
+ }
+
+ if (lsame_(job, "P")) {
+ goto L210;
+ }
+
+/*
+ Balance the submatrix in rows K to L.
+
+ Iterative loop for norm reduction
+*/
+
+ sfmin1 = SAFEMINIMUM / PRECISION;
+ sfmax1 = 1. / sfmin1;
+ sfmin2 = sfmin1 * 2.;
+ sfmax2 = 1. / sfmin2;
+L140:
+ noconv = FALSE_;
+
+ i__1 = l;
+ for (i__ = k; i__ <= i__1; ++i__) {
+ c__ = 0.;
+ r__ = 0.;
+
+ i__2 = l;
+ for (j = k; j <= i__2; ++j) {
+ if (j == i__) {
+ goto L150;
+ }
+ i__3 = j + i__ * a_dim1;
+ c__ += (d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j + i__ *
+ a_dim1]), abs(d__2));
+ i__3 = i__ + j * a_dim1;
+ r__ += (d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[i__ + j *
+ a_dim1]), abs(d__2));
+L150:
+ ;
+ }
+ ica = izamax_(&l, &a[i__ * a_dim1 + 1], &c__1);
+ ca = z_abs(&a[ica + i__ * a_dim1]);
+ i__2 = *n - k + 1;
+ ira = izamax_(&i__2, &a[i__ + k * a_dim1], lda);
+ ra = z_abs(&a[i__ + (ira + k - 1) * a_dim1]);
+
+/* Guard against zero C or R due to underflow. */
+
+ if (c__ == 0. || r__ == 0.) {
+ goto L200;
+ }
+ g = r__ / 2.;
+ f = 1.;
+ s = c__ + r__;
+L160:
+/* Computing MAX */
+ d__1 = max(f,c__);
+/* Computing MIN */
+ d__2 = min(r__,g);
+ if (c__ >= g || max(d__1,ca) >= sfmax2 || min(d__2,ra) <= sfmin2) {
+ goto L170;
+ }
+ d__1 = c__ + f + ca + r__ + g + ra;
+ if (disnan_(&d__1)) {
+
+/* Exit if NaN to avoid infinite loop */
+
+ *info = -3;
+ i__2 = -(*info);
+ xerbla_("ZGEBAL", &i__2);
+ return 0;
+ }
+ f *= 2.;
+ c__ *= 2.;
+ ca *= 2.;
+ r__ /= 2.;
+ g /= 2.;
+ ra /= 2.;
+ goto L160;
+
+L170:
+ g = c__ / 2.;
+L180:
+/* Computing MIN */
+ d__1 = min(f,c__), d__1 = min(d__1,g);
+ if (g < r__ || max(r__,ra) >= sfmax2 || min(d__1,ca) <= sfmin2) {
+ goto L190;
+ }
+ f /= 2.;
+ c__ /= 2.;
+ g /= 2.;
+ ca /= 2.;
+ r__ *= 2.;
+ ra *= 2.;
+ goto L180;
+
+/* Now balance. */
+
+L190:
+ if (c__ + r__ >= s * .95) {
+ goto L200;
+ }
+ if (f < 1. && scale[i__] < 1.) {
+ if (f * scale[i__] <= sfmin1) {
+ goto L200;
+ }
+ }
+ if (f > 1. && scale[i__] > 1.) {
+ if (scale[i__] >= sfmax1 / f) {
+ goto L200;
+ }
+ }
+ g = 1. / f;
+ scale[i__] *= f;
+ noconv = TRUE_;
+
+ i__2 = *n - k + 1;
+ zdscal_(&i__2, &g, &a[i__ + k * a_dim1], lda);
+ zdscal_(&l, &f, &a[i__ * a_dim1 + 1], &c__1);
+
+L200:
+ ;
+ }
+
+ if (noconv) {
+ goto L140;
+ }
+
+L210:
+ *ilo = k;
+ *ihi = l;
+
+ return 0;
+
+/* End of ZGEBAL */
+
+} /* zgebal_ */
+
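+/*
+  Illustrative calling sketch (an added annotation, not part of the generated
+  LAPACK source): balancing a general complex matrix with ZGEBAL before an
+  eigenvalue computation.  On return SCALE records the permutations (outside
+  ILO..IHI) and the diagonal scaling factors (inside ILO..IHI) that ZGEBAK
+  later needs in order to back-transform eigenvectors.
+
+      integer n = 3, lda = 3, ilo, ihi, info;
+      doublereal scale[3];
+      doublecomplex a[9];
+      // ... fill a (column-major) with the matrix to be balanced ...
+      zgebal_("B", &n, a, &lda, &ilo, &ihi, scale, &info);
+      // a is overwritten by the balanced matrix; info == 0 on success
+*/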
+/* Subroutine */ int zgebd2_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublereal *d__, doublereal *e, doublecomplex *tauq,
+ doublecomplex *taup, doublecomplex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex alpha;
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGEBD2 reduces a complex general m by n matrix A to upper or lower
+ real bidiagonal form B by a unitary transformation: Q' * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the unitary matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the unitary matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+ TAUQ (output) COMPLEX*16 array dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix P. See Further Details.
+
+ WORK (workspace) COMPLEX*16 array, dimension (max(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in
+ A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in
+ A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, v and u are complex vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEBD2", &i__1);
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1, &
+ tauq[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Apply H(i)' to A(i:m,i+1:n) from the left */
+
+ if (i__ < *n) {
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ d_cnjg(&z__1, &tauq[i__]);
+ zlarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &
+ z__1, &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ }
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = d__[i__3], a[i__2].i = 0.;
+
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector G(i) to annihilate
+ A(i,i+2:n)
+*/
+
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Apply G(i) to A(i+1:m,i+1:n) from the right */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ zlarf_("Right", &i__2, &i__3, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &taup[i__], &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &work[1]);
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ i__3 = i__;
+ a[i__2].r = e[i__3], a[i__2].i = 0.;
+ } else {
+ i__2 = i__;
+ taup[i__2].r = 0., taup[i__2].i = 0.;
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector G(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ zlarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Apply G(i) to A(i+1:m,i:n) from the right */
+
+ if (i__ < *m) {
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ zlarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &
+ taup[i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = d__[i__3], a[i__2].i = 0.;
+
+ if (i__ < *m) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:m,i)
+*/
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1,
+ &tauq[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Apply H(i)' to A(i+1:m,i+1:n) from the left */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ d_cnjg(&z__1, &tauq[i__]);
+ zlarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &
+ c__1, &z__1, &a[i__ + 1 + (i__ + 1) * a_dim1], lda, &
+ work[1]);
+ i__2 = i__ + 1 + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = e[i__3], a[i__2].i = 0.;
+ } else {
+ i__2 = i__;
+ tauq[i__2].r = 0., tauq[i__2].i = 0.;
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of ZGEBD2 */
+
+} /* zgebd2_ */
+
+/* Subroutine */ int zgebrd_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublereal *d__, doublereal *e, doublecomplex *tauq,
+ doublecomplex *taup, doublecomplex *work, integer *lwork, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, nb, nx;
+ static doublereal ws;
+ static integer nbmin, iinfo, minmn;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), zgebd2_(integer *, integer *,
+ doublecomplex *, integer *, doublereal *, doublereal *,
+ doublecomplex *, doublecomplex *, doublecomplex *, integer *),
+ xerbla_(char *, integer *), zlabrd_(integer *, integer *,
+ integer *, doublecomplex *, integer *, doublereal *, doublereal *,
+ doublecomplex *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer ldwrkx, ldwrky, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGEBRD reduces a general complex M-by-N matrix A to upper or lower
+ bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the M-by-N general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the unitary matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the unitary matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+    TAUQ   (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix P. See Further Details.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,M,N).
+ For optimum performance LWORK >= (M+N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in
+ A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in
+ A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
+ ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
+ ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
+ ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
+ ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
+ ( v1 v2 v3 v4 v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MAX */
+ i__1 = 1, i__2 = ilaenv_(&c__1, "ZGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = max(i__1,i__2);
+ lwkopt = (*m + *n) * nb;
+ d__1 = (doublereal) lwkopt;
+ work[1].r = d__1, work[1].i = 0.;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = max(1,*m);
+ if (*lwork < max(i__1,*n) && ! lquery) {
+ *info = -10;
+ }
+ }
+ if (*info < 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEBRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ minmn = min(*m,*n);
+ if (minmn == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ ws = (doublereal) max(*m,*n);
+ ldwrkx = *m;
+ ldwrky = *n;
+
+ if (nb > 1 && nb < minmn) {
+
+/*
+ Set the crossover point NX.
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "ZGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+
+/* Determine when to switch from blocked to unblocked code. */
+
+ if (nx < minmn) {
+ ws = (doublereal) ((*m + *n) * nb);
+ if ((doublereal) (*lwork) < ws) {
+
+/*
+ Not enough work space for the optimal NB, consider using
+ a smaller block size.
+*/
+
+ nbmin = ilaenv_(&c__2, "ZGEBRD", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ if (*lwork >= (*m + *n) * nbmin) {
+ nb = *lwork / (*m + *n);
+ } else {
+ nb = 1;
+ nx = minmn;
+ }
+ }
+ }
+ } else {
+ nx = minmn;
+ }
+
+ i__1 = minmn - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+
+/*
+ Reduce rows and columns i:i+ib-1 to bidiagonal form and return
+ the matrices X and Y which are needed to update the unreduced
+ part of the matrix
+*/
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ + 1;
+ zlabrd_(&i__3, &i__4, &nb, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[
+ i__], &tauq[i__], &taup[i__], &work[1], &ldwrkx, &work[ldwrkx
+ * nb + 1], &ldwrky);
+
+/*
+ Update the trailing submatrix A(i+ib:m,i+ib:n), using
+ an update of the form A := A - V*Y' - X*U'
+*/
+
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", &i__3, &i__4, &nb, &
+ z__1, &a[i__ + nb + i__ * a_dim1], lda, &work[ldwrkx * nb +
+ nb + 1], &ldwrky, &c_b57, &a[i__ + nb + (i__ + nb) * a_dim1],
+ lda);
+ i__3 = *m - i__ - nb + 1;
+ i__4 = *n - i__ - nb + 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "No transpose", &i__3, &i__4, &nb, &z__1, &
+ work[nb + 1], &ldwrkx, &a[i__ + (i__ + nb) * a_dim1], lda, &
+ c_b57, &a[i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/* Copy diagonal and off-diagonal elements of B back into A */
+
+ if (*m >= *n) {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = d__[i__5], a[i__4].i = 0.;
+ i__4 = j + (j + 1) * a_dim1;
+ i__5 = j;
+ a[i__4].r = e[i__5], a[i__4].i = 0.;
+/* L10: */
+ }
+ } else {
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = d__[i__5], a[i__4].i = 0.;
+ i__4 = j + 1 + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = e[i__5], a[i__4].i = 0.;
+/* L20: */
+ }
+ }
+/* L30: */
+ }
+
+/* Use unblocked code to reduce the remainder of the matrix */
+
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ zgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], &
+ tauq[i__], &taup[i__], &work[1], &iinfo);
+ work[1].r = ws, work[1].i = 0.;
+ return 0;
+
+/* End of ZGEBRD */
+
+} /* zgebrd_ */
+
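+/*
+   Illustrative sketch, not part of the LAPACK sources: one way a caller
+   could use the LWORK = -1 workspace query described in the ZGEBRD
+   comments above.  The example function name, the 3-by-2 test size and
+   the fixed 64-element workspace are assumptions made for this sketch
+   only; the block is wrapped in "#if 0" so it is never compiled into
+   the library.
+*/
+#if 0
+static int example_zgebrd_query(doublecomplex *a)
+{
+    /* a points to a 3-by-2 column-major matrix with leading dimension 3. */
+    integer m = 3, n = 2, lda = 3, lwork, info;
+    doublereal d[2], e[1];                 /* min(m,n) and min(m,n)-1      */
+    doublecomplex tauq[2], taup[2];        /* min(m,n) reflector scalars   */
+    doublecomplex query, work[64];
+
+    /* Workspace query: with LWORK = -1 only WORK(1) is set, to the
+       optimal size (M+N)*NB; no reduction is performed.                  */
+    lwork = -1;
+    zgebrd_(&m, &n, a, &lda, d, e, tauq, taup, &query, &lwork, &info);
+
+    /* Any LWORK >= max(1,M,N) is legal; 64 is ample for this tiny case.  */
+    lwork = 64;
+    zgebrd_(&m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info);
+    return (int) info;
+}
+#endif
+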
+/* Subroutine */ int zgeev_(char *jobvl, char *jobvr, integer *n,
+ doublecomplex *a, integer *lda, doublecomplex *w, doublecomplex *vl,
+ integer *ldvl, doublecomplex *vr, integer *ldvr, doublecomplex *work,
+ integer *lwork, doublereal *rwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3;
+ doublereal d__1, d__2;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, k, ihi;
+ static doublereal scl;
+ static integer ilo;
+ static doublereal dum[1], eps;
+ static doublecomplex tmp;
+ static integer ibal;
+ static char side[1];
+ static doublereal anrm;
+ static integer ierr, itau, iwrk, nout;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *), dlabad_(doublereal *, doublereal *);
+ extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+ static logical scalea;
+
+ static doublereal cscale;
+ extern /* Subroutine */ int zgebak_(char *, char *, integer *, integer *,
+ integer *, doublereal *, integer *, doublecomplex *, integer *,
+ integer *), zgebal_(char *, integer *,
+ doublecomplex *, integer *, integer *, integer *, doublereal *,
+ integer *);
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical select[1];
+ extern /* Subroutine */ int zdscal_(integer *, doublereal *,
+ doublecomplex *, integer *);
+ static doublereal bignum;
+ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *,
+ integer *, doublereal *);
+ extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *), zlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublecomplex *,
+ integer *, integer *), zlacpy_(char *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *, integer *);
+ static integer minwrk, maxwrk;
+ static logical wantvl;
+ static doublereal smlnum;
+ static integer hswork, irwork;
+ extern /* Subroutine */ int zhseqr_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *), ztrevc_(char *, char *, logical *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *, integer *, doublecomplex *,
+ doublereal *, integer *);
+ static logical lquery, wantvr;
+ extern /* Subroutine */ int zunghr_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the
+ eigenvalues and, optionally, the left and/or right eigenvectors.
+
+ The right eigenvector v(j) of A satisfies
+ A * v(j) = lambda(j) * v(j)
+ where lambda(j) is its eigenvalue.
+ The left eigenvector u(j) of A satisfies
+ u(j)**H * A = lambda(j) * u(j)**H
+ where u(j)**H denotes the conjugate transpose of u(j).
+
+ The computed eigenvectors are normalized to have Euclidean norm
+ equal to 1 and largest component real.
+
+ Arguments
+ =========
+
+ JOBVL (input) CHARACTER*1
+ = 'N': left eigenvectors of A are not computed;
+            = 'V': left eigenvectors of A are computed.
+
+ JOBVR (input) CHARACTER*1
+ = 'N': right eigenvectors of A are not computed;
+ = 'V': right eigenvectors of A are computed.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the N-by-N matrix A.
+ On exit, A has been overwritten.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ W (output) COMPLEX*16 array, dimension (N)
+ W contains the computed eigenvalues.
+
+ VL (output) COMPLEX*16 array, dimension (LDVL,N)
+ If JOBVL = 'V', the left eigenvectors u(j) are stored one
+ after another in the columns of VL, in the same order
+ as their eigenvalues.
+ If JOBVL = 'N', VL is not referenced.
+ u(j) = VL(:,j), the j-th column of VL.
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1; if
+ JOBVL = 'V', LDVL >= N.
+
+ VR (output) COMPLEX*16 array, dimension (LDVR,N)
+ If JOBVR = 'V', the right eigenvectors v(j) are stored one
+ after another in the columns of VR, in the same order
+ as their eigenvalues.
+ If JOBVR = 'N', VR is not referenced.
+ v(j) = VR(:,j), the j-th column of VR.
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1; if
+ JOBVR = 'V', LDVR >= N.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,2*N).
+ For good performance, LWORK must generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension (2*N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = i, the QR algorithm failed to compute all the
+ eigenvalues, and no eigenvectors have been computed;
+                elements i+1:N of W contain eigenvalues which have
+ converged.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --w;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+ --rwork;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1;
+ wantvl = lsame_(jobvl, "V");
+ wantvr = lsame_(jobvr, "V");
+ if (! wantvl && ! lsame_(jobvl, "N")) {
+ *info = -1;
+ } else if (! wantvr && ! lsame_(jobvr, "N")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldvl < 1 || wantvl && *ldvl < *n) {
+ *info = -8;
+ } else if (*ldvr < 1 || wantvr && *ldvr < *n) {
+ *info = -10;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ CWorkspace refers to complex workspace, and RWorkspace to real
+ workspace. NB refers to the optimal block size for the
+ immediately following subroutine, as returned by ILAENV.
+ HSWORK refers to the workspace preferred by ZHSEQR, as
+ calculated below. HSWORK is computed assuming ILO=1 and IHI=N,
+ the worst case.)
+*/
+
+ if (*info == 0) {
+ if (*n == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ } else {
+ maxwrk = *n + *n * ilaenv_(&c__1, "ZGEHRD", " ", n, &c__1, n, &
+ c__0, (ftnlen)6, (ftnlen)1);
+ minwrk = *n << 1;
+ if (wantvl) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + (*n - 1) * ilaenv_(&c__1, "ZUNGHR",
+ " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ zhseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &w[1], &vl[
+ vl_offset], ldvl, &work[1], &c_n1, info);
+ } else if (wantvr) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n + (*n - 1) * ilaenv_(&c__1, "ZUNGHR",
+ " ", n, &c__1, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ zhseqr_("S", "V", n, &c__1, n, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[1], &c_n1, info);
+ } else {
+ zhseqr_("E", "N", n, &c__1, n, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[1], &c_n1, info);
+ }
+ hswork = (integer) work[1].r;
+/* Computing MAX */
+ i__1 = max(maxwrk,hswork);
+ maxwrk = max(i__1,minwrk);
+ }
+ work[1].r = (doublereal) maxwrk, work[1].i = 0.;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEEV ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = PRECISION;
+ smlnum = SAFEMINIMUM;
+ bignum = 1. / smlnum;
+ dlabad_(&smlnum, &bignum);
+ smlnum = sqrt(smlnum) / eps;
+ bignum = 1. / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = zlange_("M", n, n, &a[a_offset], lda, dum);
+ scalea = FALSE_;
+ if (anrm > 0. && anrm < smlnum) {
+ scalea = TRUE_;
+ cscale = smlnum;
+ } else if (anrm > bignum) {
+ scalea = TRUE_;
+ cscale = bignum;
+ }
+ if (scalea) {
+ zlascl_("G", &c__0, &c__0, &anrm, &cscale, n, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+/*
+ Balance the matrix
+ (CWorkspace: none)
+ (RWorkspace: need N)
+*/
+
+ ibal = 1;
+ zgebal_("B", n, &a[a_offset], lda, &ilo, &ihi, &rwork[ibal], &ierr);
+
+/*
+ Reduce to upper Hessenberg form
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: none)
+*/
+
+ itau = 1;
+ iwrk = itau + *n;
+ i__1 = *lwork - iwrk + 1;
+ zgehrd_(n, &ilo, &ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1,
+ &ierr);
+
+ if (wantvl) {
+
+/*
+ Want left eigenvectors
+ Copy Householder vectors to VL
+*/
+
+ *(unsigned char *)side = 'L';
+ zlacpy_("L", n, n, &a[a_offset], lda, &vl[vl_offset], ldvl)
+ ;
+
+/*
+ Generate unitary matrix in VL
+ (CWorkspace: need 2*N-1, prefer N+(N-1)*NB)
+ (RWorkspace: none)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ zunghr_(n, &ilo, &ihi, &vl[vl_offset], ldvl, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VL
+ (CWorkspace: need 1, prefer HSWORK (see comments) )
+ (RWorkspace: none)
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ zhseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &w[1], &vl[
+ vl_offset], ldvl, &work[iwrk], &i__1, info);
+
+ if (wantvr) {
+
+/*
+ Want left and right eigenvectors
+ Copy Schur vectors to VR
+*/
+
+ *(unsigned char *)side = 'B';
+ zlacpy_("F", n, n, &vl[vl_offset], ldvl, &vr[vr_offset], ldvr);
+ }
+
+ } else if (wantvr) {
+
+/*
+ Want right eigenvectors
+ Copy Householder vectors to VR
+*/
+
+ *(unsigned char *)side = 'R';
+ zlacpy_("L", n, n, &a[a_offset], lda, &vr[vr_offset], ldvr)
+ ;
+
+/*
+ Generate unitary matrix in VR
+ (CWorkspace: need 2*N-1, prefer N+(N-1)*NB)
+ (RWorkspace: none)
+*/
+
+ i__1 = *lwork - iwrk + 1;
+ zunghr_(n, &ilo, &ihi, &vr[vr_offset], ldvr, &work[itau], &work[iwrk],
+ &i__1, &ierr);
+
+/*
+ Perform QR iteration, accumulating Schur vectors in VR
+ (CWorkspace: need 1, prefer HSWORK (see comments) )
+ (RWorkspace: none)
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ zhseqr_("S", "V", n, &ilo, &ihi, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[iwrk], &i__1, info);
+
+ } else {
+
+/*
+ Compute eigenvalues only
+ (CWorkspace: need 1, prefer HSWORK (see comments) )
+ (RWorkspace: none)
+*/
+
+ iwrk = itau;
+ i__1 = *lwork - iwrk + 1;
+ zhseqr_("E", "N", n, &ilo, &ihi, &a[a_offset], lda, &w[1], &vr[
+ vr_offset], ldvr, &work[iwrk], &i__1, info);
+ }
+
+/* If INFO > 0 from ZHSEQR, then quit */
+
+ if (*info > 0) {
+ goto L50;
+ }
+
+ if (wantvl || wantvr) {
+
+/*
+ Compute left and/or right eigenvectors
+ (CWorkspace: need 2*N)
+ (RWorkspace: need 2*N)
+*/
+
+ irwork = ibal + *n;
+ ztrevc_(side, "B", select, n, &a[a_offset], lda, &vl[vl_offset], ldvl,
+ &vr[vr_offset], ldvr, n, &nout, &work[iwrk], &rwork[irwork],
+ &ierr);
+ }
+
+ if (wantvl) {
+
+/*
+ Undo balancing of left eigenvectors
+ (CWorkspace: none)
+ (RWorkspace: need N)
+*/
+
+ zgebak_("B", "L", n, &ilo, &ihi, &rwork[ibal], n, &vl[vl_offset],
+ ldvl, &ierr);
+
+/* Normalize left eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scl = 1. / dznrm2_(n, &vl[i__ * vl_dim1 + 1], &c__1);
+ zdscal_(n, &scl, &vl[i__ * vl_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + i__ * vl_dim1;
+/* Computing 2nd power */
+ d__1 = vl[i__3].r;
+/* Computing 2nd power */
+ d__2 = d_imag(&vl[k + i__ * vl_dim1]);
+ rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2;
+/* L10: */
+ }
+ k = idamax_(n, &rwork[irwork], &c__1);
+ d_cnjg(&z__2, &vl[k + i__ * vl_dim1]);
+ d__1 = sqrt(rwork[irwork + k - 1]);
+ z__1.r = z__2.r / d__1, z__1.i = z__2.i / d__1;
+ tmp.r = z__1.r, tmp.i = z__1.i;
+ zscal_(n, &tmp, &vl[i__ * vl_dim1 + 1], &c__1);
+ i__2 = k + i__ * vl_dim1;
+ i__3 = k + i__ * vl_dim1;
+ d__1 = vl[i__3].r;
+ z__1.r = d__1, z__1.i = 0.;
+ vl[i__2].r = z__1.r, vl[i__2].i = z__1.i;
+/* L20: */
+ }
+ }
+
+ if (wantvr) {
+
+/*
+ Undo balancing of right eigenvectors
+ (CWorkspace: none)
+ (RWorkspace: need N)
+*/
+
+ zgebak_("B", "R", n, &ilo, &ihi, &rwork[ibal], n, &vr[vr_offset],
+ ldvr, &ierr);
+
+/* Normalize right eigenvectors and make largest component real */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ scl = 1. / dznrm2_(n, &vr[i__ * vr_dim1 + 1], &c__1);
+ zdscal_(n, &scl, &vr[i__ * vr_dim1 + 1], &c__1);
+ i__2 = *n;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + i__ * vr_dim1;
+/* Computing 2nd power */
+ d__1 = vr[i__3].r;
+/* Computing 2nd power */
+ d__2 = d_imag(&vr[k + i__ * vr_dim1]);
+ rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2;
+/* L30: */
+ }
+ k = idamax_(n, &rwork[irwork], &c__1);
+ d_cnjg(&z__2, &vr[k + i__ * vr_dim1]);
+ d__1 = sqrt(rwork[irwork + k - 1]);
+ z__1.r = z__2.r / d__1, z__1.i = z__2.i / d__1;
+ tmp.r = z__1.r, tmp.i = z__1.i;
+ zscal_(n, &tmp, &vr[i__ * vr_dim1 + 1], &c__1);
+ i__2 = k + i__ * vr_dim1;
+ i__3 = k + i__ * vr_dim1;
+ d__1 = vr[i__3].r;
+ z__1.r = d__1, z__1.i = 0.;
+ vr[i__2].r = z__1.r, vr[i__2].i = z__1.i;
+/* L40: */
+ }
+ }
+
+/* Undo scaling if necessary */
+
+L50:
+ if (scalea) {
+ i__1 = *n - *info;
+/* Computing MAX */
+ i__3 = *n - *info;
+ i__2 = max(i__3,1);
+ zlascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &w[*info + 1]
+ , &i__2, &ierr);
+ if (*info > 0) {
+ i__1 = ilo - 1;
+ zlascl_("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, &w[1], n,
+ &ierr);
+ }
+ }
+
+ work[1].r = (doublereal) maxwrk, work[1].i = 0.;
+ return 0;
+
+/* End of ZGEEV */
+
+} /* zgeev_ */
+
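+/*
+   Illustrative sketch, not part of the LAPACK sources: computing the
+   eigenvalues of a small matrix with ZGEEV, eigenvalues only
+   (JOBVL = JOBVR = 'N').  The function name, the 2-by-2 data and the
+   fixed workspace sizes are assumptions made for this sketch only; the
+   block is wrapped in "#if 0" so it is never compiled into the library.
+*/
+#if 0
+static int example_zgeev_values(void)
+{
+    /* Column-major storage: columns (1,2) and (3,4), i.e. [[1,3],[2,4]]. */
+    doublecomplex a[4] = { {1.,0.}, {2.,0.}, {3.,0.}, {4.,0.} };
+    doublecomplex w[2], vdum[1], work[8];
+    doublereal rwork[4];                   /* 2*N real workspace           */
+    integer n = 2, lda = 2, ldv = 1, lwork = 8, info;
+
+    /* With JOBVL = JOBVR = 'N' the VL and VR arrays are not referenced,
+       so a one-element dummy with leading dimension 1 is sufficient.     */
+    zgeev_("N", "N", &n, a, &lda, w, vdum, &ldv, vdum, &ldv, work, &lwork,
+           rwork, &info);
+
+    /* On success (info == 0), w[0] and w[1] hold the eigenvalues and
+       work[0].r reports the optimal LWORK for this problem size.         */
+    return (int) info;
+}
+#endif
+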
+/* Subroutine */ int zgehd2_(integer *n, integer *ilo, integer *ihi,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex alpha;
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGEHD2 reduces a complex general matrix A to upper Hessenberg form H
+ by a unitary similarity transformation: Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to ZGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= max(1,N).
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the n by n general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the unitary matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) COMPLEX*16 array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) COMPLEX*16 array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEHD2", &i__1);
+ return 0;
+ }
+
+ i__1 = *ihi - 1;
+ for (i__ = *ilo; i__ <= i__1; ++i__) {
+
+/* Compute elementary reflector H(i) to annihilate A(i+2:ihi,i) */
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *ihi - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*n) + i__ * a_dim1], &c__1, &tau[
+ i__]);
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Apply H(i) to A(1:ihi,i+1:ihi) from the right */
+
+ i__2 = *ihi - i__;
+ zlarf_("Right", ihi, &i__2, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &a[(i__ + 1) * a_dim1 + 1], lda, &work[1]);
+
+/* Apply H(i)' to A(i+1:ihi,i+1:n) from the left */
+
+ i__2 = *ihi - i__;
+ i__3 = *n - i__;
+ d_cnjg(&z__1, &tau[i__]);
+ zlarf_("Left", &i__2, &i__3, &a[i__ + 1 + i__ * a_dim1], &c__1, &z__1,
+ &a[i__ + 1 + (i__ + 1) * a_dim1], lda, &work[1]);
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+/* L10: */
+ }
+
+ return 0;
+
+/* End of ZGEHD2 */
+
+} /* zgehd2_ */
+
+/* Subroutine */ int zgehrd_(integer *n, integer *ilo, integer *ihi,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublecomplex t[4160] /* was [65][64] */;
+ static integer ib;
+ static doublecomplex ei;
+ static integer nb, nh, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), ztrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *
+ , doublecomplex *, integer *),
+ zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zgehd2_(integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *), zlahr2_(integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(
+ char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+ ZGEHRD reduces a complex general matrix A to upper Hessenberg form H by
+    a unitary similarity transformation:  Q' * A * Q = H .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that A is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to ZGEBAL; otherwise they should be
+ set to 1 and N respectively. See Further Details.
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the N-by-N general matrix to be reduced.
+ On exit, the upper triangle and the first subdiagonal of A
+ are overwritten with the upper Hessenberg matrix H, and the
+ elements below the first subdiagonal, with the array TAU,
+ represent the unitary matrix Q as a product of elementary
+ reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) COMPLEX*16 array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
+ zero.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of (ihi-ilo) elementary
+ reflectors
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
+ exit in A(i+2:ihi,i), and tau in TAU(i).
+
+ The contents of A are illustrated by the following example, with
+ n = 7, ilo = 2 and ihi = 6:
+
+ on entry, on exit,
+
+ ( a a a a a a a ) ( a a h h h h a )
+ ( a a a a a a ) ( a h h h h a )
+ ( a a a a a a ) ( h h h h h h )
+ ( a a a a a a ) ( v2 h h h h h )
+ ( a a a a a a ) ( v2 v3 h h h h )
+ ( a a a a a a ) ( v2 v3 v4 h h h )
+ ( a ) ( a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This file is a slight modification of LAPACK-3.0's DGEHRD
+ subroutine incorporating improvements proposed by Quintana-Orti and
+ Van de Geijn (2006). (See DLAHR2.)
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+/* Computing MIN */
+ i__1 = 64, i__2 = ilaenv_(&c__1, "ZGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ lwkopt = *n * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEHRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
+
+ i__1 = *ilo - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ tau[i__2].r = 0., tau[i__2].i = 0.;
+/* L10: */
+ }
+ i__1 = *n - 1;
+ for (i__ = max(1,*ihi); i__ <= i__1; ++i__) {
+ i__2 = i__;
+ tau[i__2].r = 0., tau[i__2].i = 0.;
+/* L20: */
+ }
+
+/* Quick return if possible */
+
+ nh = *ihi - *ilo + 1;
+ if (nh <= 1) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+/*
+ Determine the block size
+
+ Computing MIN
+*/
+ i__1 = 64, i__2 = ilaenv_(&c__1, "ZGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nb = min(i__1,i__2);
+ nbmin = 2;
+ iws = 1;
+ if (nb > 1 && nb < nh) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code)
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "ZGEHRD", " ", n, ilo, ihi, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < nh) {
+
+/* Determine if workspace is large enough for blocked code */
+
+ iws = *n * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code
+
+ Computing MAX
+*/
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEHRD", " ", n, ilo, ihi, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ if (*lwork >= *n * nbmin) {
+ nb = *lwork / *n;
+ } else {
+ nb = 1;
+ }
+ }
+ }
+ }
+ ldwork = *n;
+
+ if (nb < nbmin || nb >= nh) {
+
+/* Use unblocked code below */
+
+ i__ = *ilo;
+
+ } else {
+
+/* Use blocked code */
+
+ i__1 = *ihi - 1 - nx;
+ i__2 = nb;
+ for (i__ = *ilo; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *ihi - i__;
+ ib = min(i__3,i__4);
+
+/*
+ Reduce columns i:i+ib-1 to Hessenberg form, returning the
+ matrices V and T of the block reflector H = I - V*T*V'
+ which performs the reduction, and also the matrix Y = A*V*T
+*/
+
+ zlahr2_(ihi, &i__, &ib, &a[i__ * a_dim1 + 1], lda, &tau[i__], t, &
+ c__65, &work[1], &ldwork);
+
+/*
+ Apply the block reflector H to A(1:ihi,i+ib:ihi) from the
+ right, computing A := A - Y * V'. V(i+ib,ib-1) must be set
+ to 1
+*/
+
+ i__3 = i__ + ib + (i__ + ib - 1) * a_dim1;
+ ei.r = a[i__3].r, ei.i = a[i__3].i;
+ i__3 = i__ + ib + (i__ + ib - 1) * a_dim1;
+ a[i__3].r = 1., a[i__3].i = 0.;
+ i__3 = *ihi - i__ - ib + 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", ihi, &i__3, &ib, &
+ z__1, &work[1], &ldwork, &a[i__ + ib + i__ * a_dim1], lda,
+ &c_b57, &a[(i__ + ib) * a_dim1 + 1], lda);
+ i__3 = i__ + ib + (i__ + ib - 1) * a_dim1;
+ a[i__3].r = ei.r, a[i__3].i = ei.i;
+
+/*
+ Apply the block reflector H to A(1:i,i+1:i+ib-1) from the
+ right
+*/
+
+ i__3 = ib - 1;
+ ztrmm_("Right", "Lower", "Conjugate transpose", "Unit", &i__, &
+ i__3, &c_b57, &a[i__ + 1 + i__ * a_dim1], lda, &work[1], &
+ ldwork);
+ i__3 = ib - 2;
+ for (j = 0; j <= i__3; ++j) {
+ z__1.r = -1., z__1.i = -0.;
+ zaxpy_(&i__, &z__1, &work[ldwork * j + 1], &c__1, &a[(i__ + j
+ + 1) * a_dim1 + 1], &c__1);
+/* L30: */
+ }
+
+/*
+ Apply the block reflector H to A(i+1:ihi,i+ib:n) from the
+ left
+*/
+
+ i__3 = *ihi - i__;
+ i__4 = *n - i__ - ib + 1;
+ zlarfb_("Left", "Conjugate transpose", "Forward", "Columnwise", &
+ i__3, &i__4, &ib, &a[i__ + 1 + i__ * a_dim1], lda, t, &
+ c__65, &a[i__ + 1 + (i__ + ib) * a_dim1], lda, &work[1], &
+ ldwork);
+/* L40: */
+ }
+ }
+
+/* Use unblocked code to reduce the rest of the matrix */
+
+ zgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo);
+ work[1].r = (doublereal) iws, work[1].i = 0.;
+
+ return 0;
+
+/* End of ZGEHRD */
+
+} /* zgehrd_ */
+
+/* Subroutine */ int zgelq2_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublecomplex *tau, doublecomplex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublecomplex alpha;
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ ZGELQ2 computes an LQ factorization of a complex m by n matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m by min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) COMPLEX*16 array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)', where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
+ A(i,i+1:n), and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGELQ2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i,i+1:n) */
+
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ zlarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &tau[i__]
+ );
+ if (i__ < *m) {
+
+/* Apply H(i) to A(i+1:m,i:n) from the right */
+
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ zlarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[
+ i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+/* L10: */
+ }
+ return 0;
+
+/* End of ZGELQ2 */
+
+} /* zgelq2_ */
+
+/* Subroutine */ int zgelqf_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int zgelq2_(integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *), xerbla_(
+ char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGELQF computes an LQ factorization of a complex M-by-N matrix A:
+ A = L * Q.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and below the diagonal of the array
+ contain the m-by-min(m,n) lower trapezoidal matrix L (L is
+ lower triangular if m <= n); the elements above the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)', where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
+ A(i,i+1:n), and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "ZGELQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *m * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGELQF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "ZGELQF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZGELQF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the LQ factorization of the current block
+ A(i:i+ib-1,i:n)
+*/
+
+ i__3 = *n - i__ + 1;
+ zgelq2_(&ib, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *n - i__ + 1;
+ zlarft_("Forward", "Rowwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i+ib:m,i:n) from the right */
+
+ i__3 = *m - i__ - ib + 1;
+ i__4 = *n - i__ + 1;
+ zlarfb_("Right", "No transpose", "Forward", "Rowwise", &i__3,
+ &i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+ ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib +
+ 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ zgelq2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1].r = (doublereal) iws, work[1].i = 0.;
+ return 0;
+
+/* End of ZGELQF */
+
+} /* zgelqf_ */
+
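+/*
+   Illustrative sketch, not part of the LAPACK sources: an LQ
+   factorization A = L * Q of a 2-by-3 matrix with ZGELQF.  The function
+   name, the test data and the fixed workspace size are assumptions made
+   for this sketch only; the block is wrapped in "#if 0" so it is never
+   compiled into the library.
+*/
+#if 0
+static int example_zgelqf_2x3(void)
+{
+    /* Column-major 2-by-3 matrix [[1,2,3],[4,5,6]].                      */
+    doublecomplex a[6] = { {1.,0.}, {4.,0.}, {2.,0.}, {5.,0.},
+                           {3.,0.}, {6.,0.} };
+    doublecomplex tau[2], work[8];         /* LWORK >= max(1,M)            */
+    integer m = 2, n = 3, lda = 2, lwork = 8, info;
+
+    zgelqf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+
+    /* On exit L occupies the diagonal and below; the entries above the
+       diagonal together with tau encode Q as elementary reflectors,
+       which zunglq_ (not shown here) could expand into an explicit Q.    */
+    return (int) info;
+}
+#endif
+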
+/* Subroutine */ int zgelsd_(integer *m, integer *n, integer *nrhs,
+ doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb,
+ doublereal *s, doublereal *rcond, integer *rank, doublecomplex *work,
+ integer *lwork, doublereal *rwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer ie, il, mm;
+ static doublereal eps, anrm, bnrm;
+ static integer itau, nlvl, iascl, ibscl;
+ static doublereal sfmin;
+ static integer minmn, maxmn, itaup, itauq, mnthr, nwork;
+ extern /* Subroutine */ int dlabad_(doublereal *, doublereal *);
+
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dlaset_(char *, integer *, integer
+ *, doublereal *, doublereal *, doublereal *, integer *),
+ xerbla_(char *, integer *), zgebrd_(integer *, integer *,
+ doublecomplex *, integer *, doublereal *, doublereal *,
+ doublecomplex *, doublecomplex *, doublecomplex *, integer *,
+ integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *,
+ integer *, doublereal *);
+ static doublereal bignum;
+ extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *, integer *
+ ), zlalsd_(char *, integer *, integer *, integer *, doublereal *,
+ doublereal *, doublecomplex *, integer *, doublereal *, integer *,
+ doublecomplex *, doublereal *, integer *, integer *),
+ zlascl_(char *, integer *, integer *, doublereal *, doublereal *,
+ integer *, integer *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, doublecomplex *, integer *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int zlacpy_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *),
+ zlaset_(char *, integer *, integer *, doublecomplex *,
+ doublecomplex *, doublecomplex *, integer *);
+ static integer liwork, minwrk, maxwrk;
+ static doublereal smlnum;
+ extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *
+ );
+ static integer lrwork;
+ static logical lquery;
+ static integer nrwork, smlsiz;
+ extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+    ZGELSD computes the minimum-norm solution to a complex linear least
+ squares problem:
+ minimize 2-norm(| b - A*x |)
+ using the singular value decomposition (SVD) of A. A is an M-by-N
+ matrix which may be rank-deficient.
+
+ Several right hand side vectors b and solution vectors x can be
+ handled in a single call; they are stored as the columns of the
+ M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+ matrix X.
+
+ The problem is solved in three steps:
+ (1) Reduce the coefficient matrix A to bidiagonal form with
+          Householder transformations, reducing the original problem
+ into a "bidiagonal least squares problem" (BLS)
+ (2) Solve the BLS using a divide and conquer approach.
+      (3) Apply back all the Householder transformations to solve
+ the original least squares problem.
+
+ The effective rank of A is determined by treating as zero those
+ singular values which are less than RCOND times the largest singular
+ value.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrices B and X. NRHS >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, A has been destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (input/output) COMPLEX*16 array, dimension (LDB,NRHS)
+ On entry, the M-by-NRHS right hand side matrix B.
+ On exit, B is overwritten by the N-by-NRHS solution matrix X.
+ If m >= n and RANK = n, the residual sum-of-squares for
+ the solution in the i-th column is given by the sum of
+ squares of the modulus of elements n+1:m in that column.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M,N).
+
+ S (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The singular values of A in decreasing order.
+ The condition number of A in the 2-norm = S(1)/S(min(m,n)).
+
+ RCOND (input) DOUBLE PRECISION
+ RCOND is used to determine the effective rank of A.
+ Singular values S(i) <= RCOND*S(1) are treated as zero.
+ If RCOND < 0, machine precision is used instead.
+
+ RANK (output) INTEGER
+ The effective rank of A, i.e., the number of singular values
+ which are greater than RCOND*S(1).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK must be at least 1.
+ The exact minimum amount of workspace needed depends on M,
+ N and NRHS. As long as LWORK is at least
+ 2*N + N*NRHS
+ if M is greater than or equal to N or
+ 2*M + M*NRHS
+ if M is less than N, the code will execute correctly.
+ For good performance, LWORK should generally be larger.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the array WORK and the
+ minimum sizes of the arrays RWORK and IWORK, and returns
+ these values as the first entries of the WORK, RWORK and
+ IWORK arrays, and no error message related to LWORK is issued
+ by XERBLA.
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
+ LRWORK >=
+ 10*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS +
+ MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS )
+ if M is greater than or equal to N or
+ 10*M + 2*M*SMLSIZ + 8*M*NLVL + 3*SMLSIZ*NRHS +
+ MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS )
+ if M is less than N, the code will execute correctly.
+ SMLSIZ is returned by ILAENV and is equal to the maximum
+ size of the subproblems at the bottom of the computation
+ tree (usually about 25), and
+ NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+ On exit, if INFO = 0, RWORK(1) returns the minimum LRWORK.
+
+ IWORK (workspace) INTEGER array, dimension (MAX(1,LIWORK))
+ LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN),
+ where MINMN = MIN( M,N ).
+ On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: the algorithm for computing the SVD failed to converge;
+ if INFO = i, i off-diagonal elements of an intermediate
+ bidiagonal form did not converge to zero.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input arguments.
+*/
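+
+/*
+    Illustrative sketch (not part of the reference LAPACK source): how a
+    caller might use the LWORK = -1 workspace query described in the
+    comment above before calling this routine for real.  The argument
+    order is taken from the prototype earlier in this file; the scalar
+    names and the malloc-based allocation are illustrative only.
+
+        doublecomplex wkopt;
+        doublereal rwkopt;
+        integer iwkopt, lwork = -1, info, rank;
+        zgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, &rank,
+                &wkopt, &lwork, &rwkopt, &iwkopt, &info);
+        lwork = (integer) wkopt.r;
+        work  = malloc(lwork * sizeof(doublecomplex));
+        rwork = malloc((integer) rwkopt * sizeof(doublereal));
+        iwork = malloc(iwkopt * sizeof(integer));
+        zgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, &rank,
+                work, &lwork, rwork, iwork, &info);
+*/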
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --s;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ maxmn = max(*m,*n);
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldb < max(1,maxmn)) {
+ *info = -7;
+ }
+
+/*
+ Compute workspace.
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ NB refers to the optimal block size for the immediately
+ following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0) {
+ minwrk = 1;
+ maxwrk = 1;
+ liwork = 1;
+ lrwork = 1;
+ if (minmn > 0) {
+ smlsiz = ilaenv_(&c__9, "ZGELSD", " ", &c__0, &c__0, &c__0, &c__0,
+ (ftnlen)6, (ftnlen)1);
+ mnthr = ilaenv_(&c__6, "ZGELSD", " ", m, n, nrhs, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = (integer) (log((doublereal) minmn / (doublereal) (smlsiz +
+ 1)) / log(2.)) + 1;
+ nlvl = max(i__1,0);
+ liwork = minmn * 3 * nlvl + minmn * 11;
+ mm = *m;
+ if (*m >= *n && *m >= mnthr) {
+
+/*
+ Path 1a - overdetermined, with many more rows than
+ columns.
+*/
+
+ mm = *n;
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *n * ilaenv_(&c__1, "ZGEQRF", " ", m, n,
+ &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *nrhs * ilaenv_(&c__1, "ZUNMQR", "LC",
+ m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+ }
+ if (*m >= *n) {
+
+/*
+ Path 1 - overdetermined or exactly determined.
+
+ Computing MAX
+ Computing 2nd power
+*/
+ i__3 = smlsiz + 1;
+ i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1);
+ lrwork = *n * 10 + (*n << 1) * smlsiz + (*n << 3) * nlvl +
+ smlsiz * 3 * *nrhs + max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (mm + *n) * ilaenv_(&c__1,
+ "ZGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *nrhs * ilaenv_(&c__1,
+ "ZUNMBR", "QLC", &mm, nrhs, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+ "ZUNMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * *nrhs;
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = (*n << 1) + mm, i__2 = (*n << 1) + *n * *nrhs;
+ minwrk = max(i__1,i__2);
+ }
+ if (*n > *m) {
+/*
+ Computing MAX
+ Computing 2nd power
+*/
+ i__3 = smlsiz + 1;
+ i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1);
+ lrwork = *m * 10 + (*m << 1) * smlsiz + (*m << 3) * nlvl +
+ smlsiz * 3 * *nrhs + max(i__1,i__2);
+ if (*n >= mnthr) {
+
+/*
+ Path 2a - underdetermined, with many more columns
+ than rows.
+*/
+
+ maxwrk = *m + *m * ilaenv_(&c__1, "ZGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) *
+ ilaenv_(&c__1, "ZGEBRD", " ", m, m, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs *
+ ilaenv_(&c__1, "ZUNMBR", "QLC", m, nrhs, m, &c_n1,
+ (ftnlen)6, (ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) *
+ ilaenv_(&c__1, "ZUNMLQ", "LC", n, nrhs, m, &c_n1,
+ (ftnlen)6, (ftnlen)2);
+ maxwrk = max(i__1,i__2);
+ if (*nrhs > 1) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+ } else {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 1);
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+/*
+ XXX: Ensure the Path 2a case below is triggered. The workspace
+ calculation should use queries for all routines eventually.
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4),
+ i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4)
+ ;
+ maxwrk = max(i__1,i__2);
+ } else {
+
+/* Path 2 - underdetermined. */
+
+ maxwrk = (*m << 1) + (*n + *m) * ilaenv_(&c__1, "ZGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *nrhs * ilaenv_(&c__1,
+ "ZUNMBR", "QLC", m, nrhs, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * *nrhs;
+ maxwrk = max(i__1,i__2);
+ }
+/* Computing MAX */
+ i__1 = (*m << 1) + *n, i__2 = (*m << 1) + *m * *nrhs;
+ minwrk = max(i__1,i__2);
+ }
+ }
+ minwrk = min(minwrk,maxwrk);
+ work[1].r = (doublereal) maxwrk, work[1].i = 0.;
+ iwork[1] = liwork;
+ rwork[1] = (doublereal) lrwork;
+
+ if (*lwork < minwrk && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGELSD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*m == 0 || *n == 0) {
+ *rank = 0;
+ return 0;
+ }
+
+/* Get machine parameters. */
+
+ eps = PRECISION;
+ sfmin = SAFEMINIMUM;
+ smlnum = sfmin / eps;
+ bignum = 1. / smlnum;
+ dlabad_(&smlnum, &bignum);
+
+/* Scale A if max entry outside range [SMLNUM,BIGNUM]. */
+
+ anrm = zlange_("M", m, n, &a[a_offset], lda, &rwork[1]);
+ iascl = 0;
+ if (anrm > 0. && anrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM */
+
+ zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 1;
+ } else if (anrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda,
+ info);
+ iascl = 2;
+ } else if (anrm == 0.) {
+
+/* Matrix all zero. Return zero solution. */
+
+ i__1 = max(*m,*n);
+ zlaset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+ dlaset_("F", &minmn, &c__1, &c_b328, &c_b328, &s[1], &c__1)
+ ;
+ *rank = 0;
+ goto L10;
+ }
+
+/* Scale B if max entry outside range [SMLNUM,BIGNUM]. */
+
+ bnrm = zlange_("M", m, nrhs, &b[b_offset], ldb, &rwork[1]);
+ ibscl = 0;
+ if (bnrm > 0. && bnrm < smlnum) {
+
+/* Scale matrix norm up to SMLNUM. */
+
+ zlascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 1;
+ } else if (bnrm > bignum) {
+
+/* Scale matrix norm down to BIGNUM. */
+
+ zlascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb,
+ info);
+ ibscl = 2;
+ }
+
+/* If M < N make sure B(M+1:N,:) = 0 */
+
+ if (*m < *n) {
+ i__1 = *n - *m;
+ zlaset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1], ldb);
+ }
+
+/* Overdetermined case. */
+
+ if (*m >= *n) {
+
+/* Path 1 - overdetermined or exactly determined. */
+
+ mm = *m;
+ if (*m >= mnthr) {
+
+/* Path 1a - overdetermined, with many more rows than columns */
+
+ mm = *n;
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R.
+ (RWorkspace: need N)
+ (CWorkspace: need N, prefer N*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+
+/*
+ Multiply B by transpose(Q).
+ (RWorkspace: need N)
+ (CWorkspace: need NRHS, prefer NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunmqr_("L", "C", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below R. */
+
+ if (*n > 1) {
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ zlaset_("L", &i__1, &i__2, &c_b56, &c_b56, &a[a_dim1 + 2],
+ lda);
+ }
+ }
+
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+ ie = 1;
+ nrwork = ie + *n;
+
+/*
+ Bidiagonalize R in A.
+ (RWorkspace: need N)
+ (CWorkspace: need 2*N+MM, prefer 2*N+(MM+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(&mm, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq], &
+ work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of R.
+ (CWorkspace: need 2*N+NRHS, prefer 2*N+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "C", &mm, nrhs, n, &a[a_offset], lda, &work[itauq],
+ &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ zlalsd_("U", &smlsiz, n, nrhs, &s[1], &rwork[ie], &b[b_offset], ldb,
+ rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1], info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of R. */
+
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], &
+ b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max(
+ i__1,*nrhs), i__2 = *n - *m * 3;
+ if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,i__2)) {
+
+/*
+ Path 2a - underdetermined, with many more columns than rows
+ and sufficient workspace for an efficient algorithm.
+*/
+
+ ldwork = *m;
+/*
+ Computing MAX
+ Computing MAX
+*/
+ i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+ max(i__3,*nrhs), i__4 = *n - *m * 3;
+ i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda +
+ *m + *m * *nrhs;
+ if (*lwork >= max(i__1,i__2)) {
+ ldwork = *lda;
+ }
+ itau = 1;
+ nwork = *m + 1;
+
+/*
+ Compute A=L*Q.
+ (CWorkspace: need 2*M, prefer M+M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+ info);
+ il = nwork;
+
+/* Copy L to WORK(IL), zeroing out above its diagonal. */
+
+ zlacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ zlaset_("U", &i__1, &i__2, &c_b56, &c_b56, &work[il + ldwork], &
+ ldwork);
+ itauq = il + ldwork * *m;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+ ie = 1;
+ nrwork = ie + *m;
+
+/*
+ Bidiagonalize L in WORK(IL).
+ (RWorkspace: need M)
+ (CWorkspace: need M*M+4*M, prefer M*M+4*M+2*M*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(m, m, &work[il], &ldwork, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors of L.
+ (CWorkspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "C", m, nrhs, m, &work[il], &ldwork, &work[
+ itauq], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ zlalsd_("U", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1],
+ info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of L. */
+
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[
+ itaup], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Zero out below first M rows of B. */
+
+ i__1 = *n - *m;
+ zlaset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1],
+ ldb);
+ nwork = itau + *m;
+
+/*
+ Multiply transpose(Q) by B.
+ (CWorkspace: need NRHS, prefer NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunmlq_("L", "C", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[
+ b_offset], ldb, &work[nwork], &i__1, info);
+
+ } else {
+
+/* Path 2 - remaining underdetermined cases. */
+
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+ ie = 1;
+ nrwork = ie + *m;
+
+/*
+ Bidiagonalize A.
+ (RWorkspace: need M)
+ (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, info);
+
+/*
+ Multiply B by transpose of left bidiagonalizing vectors.
+ (CWorkspace: need 2*M+NRHS, prefer 2*M+NRHS*NB)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "C", m, nrhs, n, &a[a_offset], lda, &work[itauq]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/* Solve the bidiagonal least squares problem. */
+
+ zlalsd_("L", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset],
+ ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1],
+ info);
+ if (*info != 0) {
+ goto L10;
+ }
+
+/* Multiply B by right bidiagonalizing vectors of A. */
+
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup]
+ , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+ }
+ }
+
+/* Undo scaling. */
+
+ if (iascl == 1) {
+ zlascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb,
+ info);
+ dlascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ } else if (iascl == 2) {
+ zlascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb,
+ info);
+ dlascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, info);
+ }
+ if (ibscl == 1) {
+ zlascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ } else if (ibscl == 2) {
+ zlascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb,
+ info);
+ }
+
+L10:
+ work[1].r = (doublereal) maxwrk, work[1].i = 0.;
+ iwork[1] = liwork;
+ rwork[1] = (doublereal) lrwork;
+ return 0;
+
+/* End of ZGELSD */
+
+} /* zgelsd_ */
+
+/* Subroutine */ int zgeqr2_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublecomplex *tau, doublecomplex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublecomplex alpha;
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ ZGEQR2 computes a QR factorization of a complex m by n matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(m,n) by n upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors (see Further Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace) COMPLEX*16 array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
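+
+/*
+    Illustrative note (not part of the reference LAPACK source): with the
+    factored form described above, Q is never built explicitly.  Applying
+    a single reflector H(i) = I - tau * v * v' to a vector x amounts to
+
+        w = v' * x                 (conjugated dot product, a scalar)
+        x = x - tau * v * w
+
+    The loop below applies H(i)' from the left instead, which is why TAU(i)
+    is conjugated (d_cnjg) before the call to ZLARF.
+*/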
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEQR2", &i__1);
+ return 0;
+ }
+
+ k = min(*m,*n);
+
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ zlarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
+ , &c__1, &tau[i__]);
+ if (i__ < *n) {
+
+/* Apply H(i)' to A(i:m,i+1:n) from the left */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ d_cnjg(&z__1, &tau[i__]);
+ zlarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &z__1,
+ &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of ZGEQR2 */
+
+} /* zgeqr2_ */
+
+/* Subroutine */ int zgeqrf_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int zgeqr2_(integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *), xerbla_(
+ char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGEQRF computes a QR factorization of a complex M-by-N matrix A:
+ A = Q * R.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit, the elements on and above the diagonal of the array
+ contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+ upper triangular if m >= n); the elements below the diagonal,
+ with the array TAU, represent the unitary matrix Q as a
+ product of min(m,n) elementary reflectors (see Further
+ Details).
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ TAU (output) COMPLEX*16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of elementary reflectors
+
+ Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+ and tau in TAU(i).
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
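+
+/*
+    Illustrative sketch (not part of the reference LAPACK source): a
+    caller-side use of the LWORK = -1 query described above, assuming the
+    zgeqrf_ interface declared earlier in this file; the names and the
+    allocation are illustrative only.
+
+        doublecomplex wkopt;
+        integer lwork = -1, info;
+        zgeqrf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);
+        lwork = (integer) wkopt.r;     (typically N*NB for the blocked code)
+        work  = malloc(lwork * sizeof(doublecomplex));
+        zgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+*/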
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "ZGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ lwkopt = *n * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGEQRF", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ k = min(*m,*n);
+ if (k == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQRF", " ", m, n, &c_n1, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQRF", " ", m, n, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < k && nx < k) {
+
+/* Use blocked code initially */
+
+ i__1 = k - nx;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = k - i__ + 1;
+ ib = min(i__3,nb);
+
+/*
+ Compute the QR factorization of the current block
+ A(i:m,i:i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ zgeqr2_(&i__3, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+ 1], &iinfo);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__3 = *m - i__ + 1;
+ zlarft_("Forward", "Columnwise", &i__3, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i:m,i+ib:n) from the left */
+
+ i__3 = *m - i__ + 1;
+ i__4 = *n - i__ - ib + 1;
+ zlarfb_("Left", "Conjugate transpose", "Forward", "Columnwise"
+ , &i__3, &i__4, &ib, &a[i__ + i__ * a_dim1], lda, &
+ work[1], &ldwork, &a[i__ + (i__ + ib) * a_dim1], lda,
+ &work[ib + 1], &ldwork);
+ }
+/* L10: */
+ }
+ } else {
+ i__ = 1;
+ }
+
+/* Use unblocked code to factor the last or only block. */
+
+ if (i__ <= k) {
+ i__2 = *m - i__ + 1;
+ i__1 = *n - i__ + 1;
+ zgeqr2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+ , &iinfo);
+ }
+
+ work[1].r = (doublereal) iws, work[1].i = 0.;
+ return 0;
+
+/* End of ZGEQRF */
+
+} /* zgeqrf_ */
+
+/* Subroutine */ int zgesdd_(char *jobz, integer *m, integer *n,
+ doublecomplex *a, integer *lda, doublereal *s, doublecomplex *u,
+ integer *ldu, doublecomplex *vt, integer *ldvt, doublecomplex *work,
+ integer *lwork, doublereal *rwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, u_dim1, u_offset, vt_dim1, vt_offset, i__1,
+ i__2, i__3;
+
+ /* Local variables */
+ static integer i__, ie, il, ir, iu, blk;
+ static doublereal dum[1], eps;
+ static integer iru, ivt, iscl;
+ static doublereal anrm;
+ static integer idum[1], ierr, itau, irvt;
+ extern logical lsame_(char *, char *);
+ static integer chunk, minmn;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer wrkbl, itaup, itauq;
+ static logical wntqa;
+ static integer nwork;
+ static logical wntqn, wntqo, wntqs;
+ extern /* Subroutine */ int zlacp2_(char *, integer *, integer *,
+ doublereal *, integer *, doublecomplex *, integer *);
+ static integer mnthr1, mnthr2;
+ extern /* Subroutine */ int dbdsdc_(char *, char *, integer *, doublereal
+ *, doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, integer *, integer *);
+
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), xerbla_(char *, integer *),
+ zgebrd_(integer *, integer *, doublecomplex *, integer *,
+ doublereal *, doublereal *, doublecomplex *, doublecomplex *,
+ doublecomplex *, integer *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static doublereal bignum;
+ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *,
+ integer *, doublereal *);
+ extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *, integer *
+ ), zlacrm_(integer *, integer *, doublecomplex *, integer *,
+ doublereal *, integer *, doublecomplex *, integer *, doublereal *)
+ , zlarcm_(integer *, integer *, doublereal *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublereal *), zlascl_(char *, integer *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublecomplex *, integer *,
+ integer *), zgeqrf_(integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *, integer *
+ );
+ static integer ldwrkl;
+ extern /* Subroutine */ int zlacpy_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *),
+ zlaset_(char *, integer *, integer *, doublecomplex *,
+ doublecomplex *, doublecomplex *, integer *);
+ static integer ldwrkr, minwrk, ldwrku, maxwrk;
+ extern /* Subroutine */ int zungbr_(char *, integer *, integer *, integer
+ *, doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *);
+ static integer ldwkvt;
+ static doublereal smlnum;
+ static logical wntqas;
+ extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *
+ ), zunglq_(integer *, integer *, integer *
+ , doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *);
+ static integer nrwork;
+ extern /* Subroutine */ int zungqr_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+ 8-15-00: Improve consistency of WS calculations (eca)
+
+
+ Purpose
+ =======
+
+ ZGESDD computes the singular value decomposition (SVD) of a complex
+ M-by-N matrix A, optionally computing the left and/or right singular
+ vectors, by using a divide-and-conquer method. The SVD is written
+
+ A = U * SIGMA * conjugate-transpose(V)
+
+ where SIGMA is an M-by-N matrix which is zero except for its
+ min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+ V is an N-by-N unitary matrix. The diagonal elements of SIGMA
+ are the singular values of A; they are real and non-negative, and
+ are returned in descending order. The first min(m,n) columns of
+ U and V are the left and right singular vectors of A.
+
+ Note that the routine returns VT = V**H, not V.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ Specifies options for computing all or part of the matrix U:
+ = 'A': all M columns of U and all N rows of V**H are
+ returned in the arrays U and VT;
+ = 'S': the first min(M,N) columns of U and the first
+ min(M,N) rows of V**H are returned in the arrays U
+ and VT;
+ = 'O': If M >= N, the first N columns of U are overwritten
+ in the array A and all rows of V**H are returned in
+ the array VT;
+ otherwise, all columns of U are returned in the
+ array U and the first M rows of V**H are overwritten
+ in the array A;
+ = 'N': no columns of U or rows of V**H are computed.
+
+ M (input) INTEGER
+ The number of rows of the input matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the input matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the M-by-N matrix A.
+ On exit,
+ if JOBZ = 'O', A is overwritten with the first N columns
+ of U (the left singular vectors, stored
+ columnwise) if M >= N;
+ A is overwritten with the first M rows
+ of V**H (the right singular vectors, stored
+ rowwise) otherwise.
+ if JOBZ .ne. 'O', the contents of A are destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ S (output) DOUBLE PRECISION array, dimension (min(M,N))
+ The singular values of A, sorted so that S(i) >= S(i+1).
+
+ U (output) COMPLEX*16 array, dimension (LDU,UCOL)
+ UCOL = M if JOBZ = 'A' or JOBZ = 'O' and M < N;
+ UCOL = min(M,N) if JOBZ = 'S'.
+ If JOBZ = 'A' or JOBZ = 'O' and M < N, U contains the M-by-M
+ unitary matrix U;
+ if JOBZ = 'S', U contains the first min(M,N) columns of U
+ (the left singular vectors, stored columnwise);
+ if JOBZ = 'O' and M >= N, or JOBZ = 'N', U is not referenced.
+
+ LDU (input) INTEGER
+ The leading dimension of the array U. LDU >= 1; if
+ JOBZ = 'S' or 'A' or JOBZ = 'O' and M < N, LDU >= M.
+
+ VT (output) COMPLEX*16 array, dimension (LDVT,N)
+ If JOBZ = 'A' or JOBZ = 'O' and M >= N, VT contains the
+ N-by-N unitary matrix V**H;
+ if JOBZ = 'S', VT contains the first min(M,N) rows of
+ V**H (the right singular vectors, stored rowwise);
+ if JOBZ = 'O' and M < N, or JOBZ = 'N', VT is not referenced.
+
+ LDVT (input) INTEGER
+ The leading dimension of the array VT. LDVT >= 1; if
+ JOBZ = 'A' or JOBZ = 'O' and M >= N, LDVT >= N;
+ if JOBZ = 'S', LDVT >= min(M,N).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ if JOBZ = 'N', LWORK >= 2*min(M,N)+max(M,N).
+ if JOBZ = 'O',
+ LWORK >= 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+ if JOBZ = 'S' or 'A',
+ LWORK >= min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+ For good performance, LWORK should generally be larger.
+
+ If LWORK = -1, a workspace query is assumed. The optimal
+ size for the WORK array is calculated and stored in WORK(1),
+ and no other work except argument checking is performed.
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
+ If JOBZ = 'N', LRWORK >= 5*min(M,N).
+ Otherwise,
+ LRWORK >= min(M,N)*max(5*min(M,N)+7,2*max(M,N)+2*min(M,N)+1)
+
+ IWORK (workspace) INTEGER array, dimension (8*min(M,N))
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The updating process of DBDSDC did not converge.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Huan Ren, Computer Science Division, University of
+ California at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
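+
+/*
+    Illustrative sketch (not part of the reference LAPACK source): an
+    economy-size SVD (JOBZ = 'S') combined with the LWORK = -1 query
+    described above, assuming the zgesdd_ interface declared earlier in
+    this file.  RWORK and IWORK are assumed to be allocated to the bounds
+    quoted in the argument list; all names are illustrative only.
+
+        doublecomplex wkopt;
+        integer lwork = -1, info;
+        zgesdd_("S", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+                &wkopt, &lwork, rwork, iwork, &info);
+        lwork = (integer) wkopt.r;
+        work = malloc(lwork * sizeof(doublecomplex));
+        zgesdd_("S", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+                work, &lwork, rwork, iwork, &info);
+*/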
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --s;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ vt_dim1 = *ldvt;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ minmn = min(*m,*n);
+ mnthr1 = (integer) (minmn * 17. / 9.);
+ mnthr2 = (integer) (minmn * 5. / 3.);
+ wntqa = lsame_(jobz, "A");
+ wntqs = lsame_(jobz, "S");
+ wntqas = wntqa || wntqs;
+ wntqo = lsame_(jobz, "O");
+ wntqn = lsame_(jobz, "N");
+ minwrk = 1;
+ maxwrk = 1;
+
+ if (! (wntqa || wntqs || wntqo || wntqn)) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*ldu < 1 || wntqas && *ldu < *m || wntqo && *m < *n && *ldu < *
+ m) {
+ *info = -8;
+ } else if (*ldvt < 1 || wntqa && *ldvt < *n || wntqs && *ldvt < minmn ||
+ wntqo && *m >= *n && *ldvt < *n) {
+ *info = -10;
+ }
+
+/*
+ Compute workspace
+ (Note: Comments in the code beginning "Workspace:" describe the
+ minimal amount of workspace needed at that point in the code,
+ as well as the preferred amount for good performance.
+ CWorkspace refers to complex workspace, and RWorkspace to
+ real workspace. NB refers to the optimal block size for the
+ immediately following subroutine, as returned by ILAENV.)
+*/
+
+ if (*info == 0 && *m > 0 && *n > 0) {
+ if (*m >= *n) {
+
+/*
+ There is no complex work space needed for bidiagonal SVD
+ The real work space needed for bidiagonal SVD is BDSPAC
+ for computing singular values and singular vectors; BDSPAN
+ for computing singular values only.
+ BDSPAC = 5*N*N + 7*N
+ BDSPAN = MAX(7*N+4, 3*N+2+SMLSIZ*(SMLSIZ+8))
+*/
+
+ if (*m >= mnthr1) {
+ if (wntqn) {
+
+/* Path 1 (M much larger than N, JOBZ='N') */
+
+ maxwrk = *n + *n * ilaenv_(&c__1, "ZGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ minwrk = *n * 3;
+ } else if (wntqo) {
+
+/* Path 2 (M much larger than N, JOBZ='O') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "ZGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "ZUNGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *n + *n * *n + wrkbl;
+ minwrk = (*n << 1) * *n + *n * 3;
+ } else if (wntqs) {
+
+/* Path 3 (M much larger than N, JOBZ='S') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "ZGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *n * ilaenv_(&c__1, "ZUNGQR",
+ " ", m, n, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *n * *n + wrkbl;
+ minwrk = *n * *n + *n * 3;
+ } else if (wntqa) {
+
+/* Path 4 (M much larger than N, JOBZ='A') */
+
+ wrkbl = *n + *n * ilaenv_(&c__1, "ZGEQRF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *n + *m * ilaenv_(&c__1, "ZUNGQR",
+ " ", m, m, n, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + (*n << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", n, n, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *n * *n + wrkbl;
+ minwrk = *n * *n + (*n << 1) + *m;
+ }
+ } else if (*m >= mnthr2) {
+
+/* Path 5 (M much larger than N, but not as much as MNTHR1) */
+
+ maxwrk = (*n << 1) + (*m + *n) * ilaenv_(&c__1, "ZGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*n << 1) + *m;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "P", n, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "Q", m, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *n * *n;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "P", n, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "Q", m, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "P", n, n, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ }
+ } else {
+
+/* Path 6 (M at least N, but not much larger) */
+
+ maxwrk = (*n << 1) + (*m + *n) * ilaenv_(&c__1, "ZGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*n << 1) + *m;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", m, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *n * *n;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", m, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "PRC", n, n, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*n << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ }
+ }
+ } else {
+
+/*
+ There is no complex work space needed for bidiagonal SVD
+ The real work space needed for bidiagonal SVD is BDSPAC
+ for computing singular values and singular vectors; BDSPAN
+ for computing singular values only.
+ BDSPAC = 5*M*M + 7*M
+ BDSPAN = MAX(7*M+4, 3*M+2+SMLSIZ*(SMLSIZ+8))
+*/
+
+ if (*n >= mnthr1) {
+ if (wntqn) {
+
+/* Path 1t (N much larger than M, JOBZ='N') */
+
+ maxwrk = *m + *m * ilaenv_(&c__1, "ZGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ maxwrk = max(i__1,i__2);
+ minwrk = *m * 3;
+ } else if (wntqo) {
+
+/* Path 2t (N much larger than M, JOBZ='O') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "ZGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "ZUNGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *n + *m * *m + wrkbl;
+ minwrk = (*m << 1) * *m + *m * 3;
+ } else if (wntqs) {
+
+/* Path 3t (N much larger than M, JOBZ='S') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "ZGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *m * ilaenv_(&c__1, "ZUNGLQ",
+ " ", m, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *m + wrkbl;
+ minwrk = *m * *m + *m * 3;
+ } else if (wntqa) {
+
+/* Path 4t (N much larger than M, JOBZ='A') */
+
+ wrkbl = *m + *m * ilaenv_(&c__1, "ZGELQF", " ", m, n, &
+ c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = *m + *n * ilaenv_(&c__1, "ZUNGLQ",
+ " ", n, n, m, &c_n1, (ftnlen)6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + (*m << 1) * ilaenv_(&
+ c__1, "ZGEBRD", " ", m, m, &c_n1, &c_n1, (ftnlen)
+ 6, (ftnlen)1);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = wrkbl, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", m, m, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ wrkbl = max(i__1,i__2);
+ maxwrk = *m * *m + wrkbl;
+ minwrk = *m * *m + (*m << 1) + *n;
+ }
+ } else if (*n >= mnthr2) {
+
+/* Path 5t (N much larger than M, but not as much as MNTHR1) */
+
+ maxwrk = (*m << 1) + (*m + *n) * ilaenv_(&c__1, "ZGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*m << 1) + *n;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "P", m, n, m, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *m * *m;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "P", m, n, m, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "P", n, n, m, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "Q", m, m, n, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ maxwrk = max(i__1,i__2);
+ }
+ } else {
+
+/* Path 6t (N greater than M, but not much larger) */
+
+ maxwrk = (*m << 1) + (*m + *n) * ilaenv_(&c__1, "ZGEBRD",
+ " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ minwrk = (*m << 1) + *n;
+ if (wntqo) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "PRC", m, n, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNMBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ maxwrk += *m * *n;
+ minwrk += *m * *m;
+ } else if (wntqs) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "PRC", m, n, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ } else if (wntqa) {
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *n * ilaenv_(&c__1,
+ "ZUNGBR", "PRC", n, n, m, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+/* Computing MAX */
+ i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+ "ZUNGBR", "QLN", m, m, n, &c_n1, (ftnlen)6, (
+ ftnlen)3);
+ maxwrk = max(i__1,i__2);
+ }
+ }
+ }
+ maxwrk = max(maxwrk,minwrk);
+ }
+ if (*info == 0) {
+ work[1].r = (doublereal) maxwrk, work[1].i = 0.;
+ if (*lwork < minwrk && *lwork != -1) {
+ *info = -13;
+ }
+ }
+
+/* Quick returns */
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGESDD", &i__1);
+ return 0;
+ }
+ if (*lwork == -1) {
+ return 0;
+ }
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Get machine constants */
+
+ eps = PRECISION;
+ smlnum = sqrt(SAFEMINIMUM) / eps;
+ bignum = 1. / smlnum;
+
+/* Scale A if max element outside range [SMLNUM,BIGNUM] */
+
+ anrm = zlange_("M", m, n, &a[a_offset], lda, dum);
+ iscl = 0;
+ if (anrm > 0. && anrm < smlnum) {
+ iscl = 1;
+ zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, &
+ ierr);
+ } else if (anrm > bignum) {
+ iscl = 1;
+ zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, &
+ ierr);
+ }
+
+ if (*m >= *n) {
+
+/*
+ A has at least as many rows as columns. If A has sufficiently
+ more rows than columns, first reduce using the QR
+ decomposition (if sufficient workspace available)
+*/
+
+ if (*m >= mnthr1) {
+
+ if (wntqn) {
+
+/*
+ Path 1 (M much larger than N, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: need 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Zero out below R */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ zlaset_("L", &i__1, &i__2, &c_b56, &c_b56, &a[a_dim1 + 2],
+ lda);
+ ie = 1;
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (CWorkspace: need 3*N, prefer 2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+ nrwork = ie + *n;
+
+/*
+ Perform bidiagonal SVD, compute singular values only
+ (CWorkspace: 0)
+ (RWorkspace: need BDSPAN)
+*/
+
+ dbdsdc_("U", "N", n, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2 (M much larger than N, JOBZ='O')
+ N left singular vectors to be overwritten on A and
+ N right singular vectors to be computed in VT
+*/
+
+ iu = 1;
+
+/* WORK(IU) is N by N */
+
+ ldwrku = *n;
+ ir = iu + ldwrku * *n;
+ if (*lwork >= *m * *n + *n * *n + *n * 3) {
+
+/* WORK(IR) is M by N */
+
+ ldwrkr = *m;
+ } else {
+ ldwrkr = (*lwork - *n * *n - *n * 3) / *n;
+ }
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (CWorkspace: need N*N+2*N, prefer M*N+N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy R to WORK( IR ), zeroing out below it */
+
+ zlacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ zlaset_("L", &i__1, &i__2, &c_b56, &c_b56, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zungqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in WORK(IR)
+ (CWorkspace: need N*N+3*N, prefer M*N+2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of R in WORK(IRU) and computing right singular vectors
+ of R in WORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *n;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix WORK(IU)
+ Overwrite WORK(IU) by the left singular vectors of R
+ (CWorkspace: need 2*N*N+3*N, prefer M*N+N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[iru], n, &work[iu], &ldwrku);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by the right singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer M*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IU), storing result in WORK(IR) and copying to A
+ (CWorkspace: need 2*N*N, prefer N*N+M*N)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *m;
+ i__2 = ldwrkr;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrkr);
+ zgemm_("N", "N", &chunk, n, n, &c_b57, &a[i__ + a_dim1],
+ lda, &work[iu], &ldwrku, &c_b56, &work[ir], &
+ ldwrkr);
+ zlacpy_("F", &chunk, n, &work[ir], &ldwrkr, &a[i__ +
+ a_dim1], lda);
+/* L10: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3 (M much larger than N, JOBZ='S')
+ N left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ ir = 1;
+
+/* WORK(IR) is N by N */
+
+ ldwrkr = *n;
+ itau = ir + ldwrkr * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R
+ (CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy R to WORK(IR), zeroing out below it */
+
+ zlacpy_("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ zlaset_("L", &i__2, &i__1, &c_b56, &c_b56, &work[ir + 1], &
+ ldwrkr);
+
+/*
+ Generate Q in A
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zungqr_(m, n, n, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in WORK(IR)
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgebrd_(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *n;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[iru], n, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", n, n, n, &work[ir], &ldwrkr, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, n, &work[ir], &ldwrkr, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in A by left singular vectors of R in
+ WORK(IR), storing result in U
+ (CWorkspace: need N*N)
+ (RWorkspace: 0)
+*/
+
+ zlacpy_("F", n, n, &u[u_offset], ldu, &work[ir], &ldwrkr);
+ zgemm_("N", "N", m, n, n, &c_b57, &a[a_offset], lda, &work[ir]
+ , &ldwrkr, &c_b56, &u[u_offset], ldu);
+
+ } else if (wntqa) {
+
+/*
+ Path 4 (M much larger than N, JOBZ='A')
+ M left singular vectors to be computed in U and
+ N right singular vectors to be computed in VT
+*/
+
+ iu = 1;
+
+/* WORK(IU) is N by N */
+
+ ldwrku = *n;
+ itau = iu + ldwrku * *n;
+ nwork = itau + *n;
+
+/*
+ Compute A=Q*R, copying result to U
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+ zlacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Generate Q in U
+ (CWorkspace: need N+M, prefer N+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zungqr_(m, m, n, &u[u_offset], ldu, &work[itau], &work[nwork],
+ &i__2, &ierr);
+
+/* Produce R in A, zeroing out below it */
+
+ i__2 = *n - 1;
+ i__1 = *n - 1;
+ zlaset_("L", &i__2, &i__1, &c_b56, &c_b56, &a[a_dim1 + 2],
+ lda);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize R in A
+ (CWorkspace: need 3*N, prefer 2*N+2*N*NB)
+ (RWorkspace: need N)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgebrd_(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+ iru = ie + *n;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix WORK(IU)
+ Overwrite WORK(IU) by left singular vectors of R
+ (CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[iru], n, &work[iu], &ldwrku);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", n, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of R
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply Q in U by left singular vectors of R in
+ WORK(IU), storing result in A
+ (CWorkspace: need N*N)
+ (RWorkspace: 0)
+*/
+
+ zgemm_("N", "N", m, n, n, &c_b57, &u[u_offset], ldu, &work[iu]
+ , &ldwrku, &c_b56, &a[a_offset], lda);
+
+/* Copy left singular vectors of A from A to U */
+
+ zlacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+
+ }
+
+ } else if (*m >= mnthr2) {
+
+/*
+ MNTHR2 <= M < MNTHR1
+
+ Path 5 (M much larger than N, but not as much as MNTHR1)
+ Reduce to bidiagonal form without QR decomposition, use
+ ZUNGBR and matrix multiplication to compute singular vectors
+*/
+
+ ie = 1;
+ nrwork = ie + *n;
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*N+M, prefer 2*N+(M+N)*NB)
+ (RWorkspace: need N)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ dbdsdc_("U", "N", n, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ iu = nwork;
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zungbr_("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Generate Q in A
+ (CWorkspace: need 2*N, prefer N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zungbr_("Q", m, n, n, &a[a_offset], lda, &work[itauq], &work[
+ nwork], &i__2, &ierr);
+
+ if (*lwork >= *m * *n + *n * 3) {
+
+/* WORK( IU ) is M by N */
+
+ ldwrku = *m;
+ } else {
+
+/* WORK(IU) is LDWRKU by N */
+
+ ldwrku = (*lwork - *n * 3) / *n;
+ }
+ nwork = iu + ldwrku * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in WORK(IU), copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ zlarcm_(n, n, &rwork[irvt], n, &vt[vt_offset], ldvt, &work[iu]
+ , &ldwrku, &rwork[nrwork]);
+ zlacpy_("F", n, n, &work[iu], &ldwrku, &vt[vt_offset], ldvt);
+
+/*
+ Multiply Q in A by real matrix RWORK(IRU), storing the
+ result in WORK(IU), copying to A
+ (CWorkspace: need N*N, prefer M*N)
+ (Rworkspace: need 3*N*N, prefer N*N+2*M*N)
+*/
+
+ nrwork = irvt;
+ i__2 = *m;
+ i__1 = ldwrku;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrku);
+ zlacrm_(&chunk, n, &a[i__ + a_dim1], lda, &rwork[iru], n,
+ &work[iu], &ldwrku, &rwork[nrwork]);
+ zlacpy_("F", &chunk, n, &work[iu], &ldwrku, &a[i__ +
+ a_dim1], lda);
+/* L20: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zungbr_("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__1, &ierr);
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zungbr_("Q", m, n, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ zlarcm_(n, n, &rwork[irvt], n, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ zlacpy_("F", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: need 0)
+ (Rworkspace: need N*N+2*M*N)
+*/
+
+ nrwork = irvt;
+ zlacrm_(m, n, &u[u_offset], ldu, &rwork[iru], n, &a[a_offset],
+ lda, &rwork[nrwork]);
+ zlacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ } else {
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zungbr_("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__1, &ierr);
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ zlarcm_(n, n, &rwork[irvt], n, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ zlacpy_("F", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: 0)
+ (Rworkspace: need 3*N*N)
+*/
+
+ nrwork = irvt;
+ zlacrm_(m, n, &u[u_offset], ldu, &rwork[iru], n, &a[a_offset],
+ lda, &rwork[nrwork]);
+ zlacpy_("F", m, n, &a[a_offset], lda, &u[u_offset], ldu);
+ }
+
+ } else {
+
+/*
+ M .LT. MNTHR2
+
+ Path 6 (M at least N, but not much larger)
+ Reduce to bidiagonal form without QR decomposition
+ Use ZUNMBR to compute singular vectors
+*/
+
+ ie = 1;
+ nrwork = ie + *n;
+ itauq = 1;
+ itaup = itauq + *n;
+ nwork = itaup + *n;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*N+M, prefer 2*N+(M+N)*NB)
+ (RWorkspace: need N)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, &ierr);
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ dbdsdc_("U", "N", n, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ iu = nwork;
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ if (*lwork >= *m * *n + *n * 3) {
+
+/* WORK( IU ) is M by N */
+
+ ldwrku = *m;
+ } else {
+
+/* WORK( IU ) is LDWRKU by N */
+
+ ldwrku = (*lwork - *n * 3) / *n;
+ }
+ nwork = iu + ldwrku * *n;
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: need 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+ if (*lwork >= *m * *n + *n * 3) {
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix WORK(IU)
+ Overwrite WORK(IU) by left singular vectors of A, copying
+ to A
+ (Cworkspace: need M*N+2*N, prefer M*N+N+N*NB)
+ (Rworkspace: need 0)
+*/
+
+ zlaset_("F", m, n, &c_b56, &c_b56, &work[iu], &ldwrku);
+ zlacp2_("F", n, n, &rwork[iru], n, &work[iu], &ldwrku);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &work[iu], &ldwrku, &work[nwork], &i__1, &
+ ierr);
+ zlacpy_("F", m, n, &work[iu], &ldwrku, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate Q in A
+ (Cworkspace: need 2*N, prefer N+N*NB)
+ (Rworkspace: need 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zungbr_("Q", m, n, n, &a[a_offset], lda, &work[itauq], &
+ work[nwork], &i__1, &ierr);
+
+/*
+ Multiply Q in A by real matrix RWORK(IRU), storing the
+ result in WORK(IU), copying to A
+ (CWorkspace: need N*N, prefer M*N)
+ (Rworkspace: need 3*N*N, prefer N*N+2*M*N)
+*/
+
+ nrwork = irvt;
+ i__1 = *m;
+ i__2 = ldwrku;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *m - i__ + 1;
+ chunk = min(i__3,ldwrku);
+ zlacrm_(&chunk, n, &a[i__ + a_dim1], lda, &rwork[iru],
+ n, &work[iu], &ldwrku, &rwork[nrwork]);
+ zlacpy_("F", &chunk, n, &work[iu], &ldwrku, &a[i__ +
+ a_dim1], lda);
+/* L30: */
+ }
+ }
+
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlaset_("F", m, n, &c_b56, &c_b56, &u[u_offset], ldu);
+ zlacp2_("F", n, n, &rwork[iru], n, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, n, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+ } else {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = nrwork;
+ irvt = iru + *n * *n;
+ nrwork = irvt + *n * *n;
+ dbdsdc_("U", "I", n, &s[1], &rwork[ie], &rwork[iru], n, &
+ rwork[irvt], n, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/* Set the right corner of U to identity matrix */
+
+ zlaset_("F", m, m, &c_b56, &c_b56, &u[u_offset], ldu);
+ if (*m > *n) {
+ i__2 = *m - *n;
+ i__1 = *m - *n;
+ zlaset_("F", &i__2, &i__1, &c_b56, &c_b57, &u[*n + 1 + (*
+ n + 1) * u_dim1], ldu);
+ }
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 2*N+M, prefer 2*N+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[iru], n, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 3*N, prefer 2*N+N*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", n, n, &rwork[irvt], n, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, n, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__2, &
+ ierr);
+ }
+
+ }
+
+ } else {
+
+/*
+ A has more columns than rows. If A has sufficiently more
+ columns than rows, first reduce using the LQ decomposition (if
+ sufficient workspace available)
+*/
+
+ if (*n >= mnthr1) {
+
+ if (wntqn) {
+
+/*
+ Path 1t (N much larger than M, JOBZ='N')
+ No singular vectors to be computed
+*/
+
+ itau = 1;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Zero out above L */
+
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ zlaset_("U", &i__2, &i__1, &c_b56, &c_b56, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = 1;
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (CWorkspace: need 3*M, prefer 2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgebrd_(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+ nrwork = ie + *m;
+
+/*
+ Perform bidiagonal SVD, compute singular values only
+ (CWorkspace: 0)
+                (RWorkspace: need BDSPAC)
+*/
+
+ dbdsdc_("U", "N", m, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+
+ } else if (wntqo) {
+
+/*
+ Path 2t (N much larger than M, JOBZ='O')
+ M right singular vectors to be overwritten on A and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+ ldwkvt = *m;
+
+/* WORK(IVT) is M by M */
+
+ il = ivt + ldwkvt * *m;
+ if (*lwork >= *m * *n + *m * *m + *m * 3) {
+
+/* WORK(IL) is M by N */
+
+ ldwrkl = *m;
+ chunk = *n;
+ } else {
+
+/* WORK(IL) is M by CHUNK */
+
+ ldwrkl = *m;
+ chunk = (*lwork - *m * *m - *m * 3) / *m;
+ }
+ itau = il + ldwrkl * chunk;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__2, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ zlacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__2 = *m - 1;
+ i__1 = *m - 1;
+ zlaset_("U", &i__2, &i__1, &c_b56, &c_b56, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zunglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__2, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL)
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgebrd_(m, m, &work[il], &ldwrkl, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *m;
+ irvt = iru + *m * *m;
+ nrwork = irvt + *m * *m;
+ dbdsdc_("U", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+                Copy real matrix RWORK(IRU) to complex matrix U
+                Overwrite U by left singular vectors of L
+                (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix WORK(IVT)
+ Overwrite WORK(IVT) by the right singular vectors of L
+                (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[irvt], m, &work[ivt], &ldwkvt);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IL) by Q
+ in A, storing result in WORK(IL) and copying to A
+                (CWorkspace: need 2*M*M, prefer M*M+M*N)
+ (RWorkspace: 0)
+*/
+
+ i__2 = *n;
+ i__1 = chunk;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ zgemm_("N", "N", m, &blk, m, &c_b57, &work[ivt], m, &a[
+ i__ * a_dim1 + 1], lda, &c_b56, &work[il], &
+ ldwrkl);
+ zlacpy_("F", m, &blk, &work[il], &ldwrkl, &a[i__ * a_dim1
+ + 1], lda);
+/* L40: */
+ }
+
+ } else if (wntqs) {
+
+/*
+ Path 3t (N much larger than M, JOBZ='S')
+ M right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ il = 1;
+
+/* WORK(IL) is M by M */
+
+ ldwrkl = *m;
+ itau = il + ldwrkl * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+
+/* Copy L to WORK(IL), zeroing out above it */
+
+ zlacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwrkl);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ zlaset_("U", &i__1, &i__2, &c_b56, &c_b56, &work[il + ldwrkl],
+ &ldwrkl);
+
+/*
+ Generate Q in A
+ (CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunglq_(m, n, m, &a[a_offset], lda, &work[itau], &work[nwork],
+ &i__1, &ierr);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in WORK(IL)
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(m, m, &work[il], &ldwrkl, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *m;
+ irvt = iru + *m * *m;
+ nrwork = irvt + *m * *m;
+ dbdsdc_("U", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of L
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, m, &work[il], &ldwrkl, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+                Overwrite VT by right singular vectors of L
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[irvt], m, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", m, m, m, &work[il], &ldwrkl, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Copy VT to WORK(IL), multiply right singular vectors of L
+ in WORK(IL) by Q in A, storing result in VT
+ (CWorkspace: need M*M)
+ (RWorkspace: 0)
+*/
+
+ zlacpy_("F", m, m, &vt[vt_offset], ldvt, &work[il], &ldwrkl);
+ zgemm_("N", "N", m, n, m, &c_b57, &work[il], &ldwrkl, &a[
+ a_offset], lda, &c_b56, &vt[vt_offset], ldvt);
+
+ } else if (wntqa) {
+
+/*
+                Path 4t (N much larger than M, JOBZ='A')
+ N right singular vectors to be computed in VT and
+ M left singular vectors to be computed in U
+*/
+
+ ivt = 1;
+
+/* WORK(IVT) is M by M */
+
+ ldwkvt = *m;
+ itau = ivt + ldwkvt * *m;
+ nwork = itau + *m;
+
+/*
+ Compute A=L*Q, copying result to VT
+ (CWorkspace: need 2*M, prefer M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &
+ i__1, &ierr);
+ zlacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+/*
+ Generate Q in VT
+ (CWorkspace: need M+N, prefer M+N*NB)
+ (RWorkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zunglq_(n, n, m, &vt[vt_offset], ldvt, &work[itau], &work[
+ nwork], &i__1, &ierr);
+
+/* Produce L in A, zeroing out above it */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ zlaset_("U", &i__1, &i__2, &c_b56, &c_b56, &a[(a_dim1 << 1) +
+ 1], lda);
+ ie = 1;
+ itauq = itau;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize L in A
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
+ (RWorkspace: need M)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &work[
+ itauq], &work[itaup], &work[nwork], &i__1, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ iru = ie + *m;
+ irvt = iru + *m * *m;
+ nrwork = irvt + *m * *m;
+ dbdsdc_("U", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of L
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, m, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix WORK(IVT)
+ Overwrite WORK(IVT) by right singular vectors of L
+ (CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
+ (RWorkspace: 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[irvt], m, &work[ivt], &ldwkvt);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", m, m, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__1, &
+ ierr);
+
+/*
+ Multiply right singular vectors of L in WORK(IVT) by
+ Q in VT, storing result in A
+ (CWorkspace: need M*M)
+ (RWorkspace: 0)
+*/
+
+ zgemm_("N", "N", m, n, m, &c_b57, &work[ivt], &ldwkvt, &vt[
+ vt_offset], ldvt, &c_b56, &a[a_offset], lda);
+
+/* Copy right singular vectors of A from A to VT */
+
+ zlacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+
+ }
+
+ } else if (*n >= mnthr2) {
+
+/*
+ MNTHR2 <= N < MNTHR1
+
+ Path 5t (N much larger than M, but not as much as MNTHR1)
+           Reduce to bidiagonal form without LQ decomposition, use
+ ZUNGBR and matrix multiplication to compute singular vectors
+*/
+
+
+ ie = 1;
+ nrwork = ie + *m;
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+ (RWorkspace: M)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__1, &ierr);
+
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ dbdsdc_("L", "N", m, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ ivt = nwork;
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__1, &ierr);
+
+/*
+ Generate P**H in A
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ i__1 = *lwork - nwork + 1;
+ zungbr_("P", m, n, m, &a[a_offset], lda, &work[itaup], &work[
+ nwork], &i__1, &ierr);
+
+ ldwkvt = *m;
+ if (*lwork >= *m * *n + *m * 3) {
+
+/* WORK( IVT ) is M by N */
+
+ nwork = ivt + ldwkvt * *n;
+ chunk = *n;
+ } else {
+
+/* WORK( IVT ) is M by CHUNK */
+
+ chunk = (*lwork - *m * 3) / *m;
+ nwork = ivt + ldwkvt * chunk;
+ }
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ dbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+                Multiply Q in U by real matrix RWORK(IRU), storing
+                the result in WORK(IVT), copying to U
+ (Cworkspace: need 0)
+ (Rworkspace: need 2*M*M)
+*/
+
+ zlacrm_(m, m, &u[u_offset], ldu, &rwork[iru], m, &work[ivt], &
+ ldwkvt, &rwork[nrwork]);
+ zlacpy_("F", m, m, &work[ivt], &ldwkvt, &u[u_offset], ldu);
+
+/*
+ Multiply RWORK(IRVT) by P**H in A, storing the
+ result in WORK(IVT), copying to A
+ (CWorkspace: need M*M, prefer M*N)
+ (Rworkspace: need 2*M*M, prefer 2*M*N)
+*/
+
+ nrwork = iru;
+ i__1 = *n;
+ i__2 = chunk;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ zlarcm_(m, &blk, &rwork[irvt], m, &a[i__ * a_dim1 + 1],
+ lda, &work[ivt], &ldwkvt, &rwork[nrwork]);
+ zlacpy_("F", m, &blk, &work[ivt], &ldwkvt, &a[i__ *
+ a_dim1 + 1], lda);
+/* L50: */
+ }
+ } else if (wntqs) {
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__2, &ierr);
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zungbr_("P", m, n, m, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ dbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: need 0)
+ (Rworkspace: need 3*M*M)
+*/
+
+ zlacrm_(m, m, &u[u_offset], ldu, &rwork[iru], m, &a[a_offset],
+ lda, &rwork[nrwork]);
+ zlacpy_("F", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need M*M+2*M*N)
+*/
+
+ nrwork = iru;
+ zlarcm_(m, n, &rwork[irvt], m, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ zlacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ } else {
+
+/*
+ Copy A to U, generate Q
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zungbr_("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
+ nwork], &i__2, &ierr);
+
+/*
+ Copy A to VT, generate P**H
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: 0)
+*/
+
+ zlacpy_("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ i__2 = *lwork - nwork + 1;
+ zungbr_("P", n, n, m, &vt[vt_offset], ldvt, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ dbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Multiply Q in U by real matrix RWORK(IRU), storing the
+ result in A, copying to U
+ (CWorkspace: need 0)
+ (Rworkspace: need 3*M*M)
+*/
+
+ zlacrm_(m, m, &u[u_offset], ldu, &rwork[iru], m, &a[a_offset],
+ lda, &rwork[nrwork]);
+ zlacpy_("F", m, m, &a[a_offset], lda, &u[u_offset], ldu);
+
+/*
+ Multiply real matrix RWORK(IRVT) by P**H in VT,
+ storing the result in A, copying to VT
+ (Cworkspace: need 0)
+ (Rworkspace: need M*M+2*M*N)
+*/
+
+ zlarcm_(m, n, &rwork[irvt], m, &vt[vt_offset], ldvt, &a[
+ a_offset], lda, &rwork[nrwork]);
+ zlacpy_("F", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
+ }
+
+ } else {
+
+/*
+ N .LT. MNTHR2
+
+ Path 6t (N greater than M, but not much larger)
+ Reduce to bidiagonal form without LQ decomposition
+ Use ZUNMBR to compute singular vectors
+*/
+
+ ie = 1;
+ nrwork = ie + *m;
+ itauq = 1;
+ itaup = itauq + *m;
+ nwork = itaup + *m;
+
+/*
+ Bidiagonalize A
+ (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+ (RWorkspace: M)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+ &work[itaup], &work[nwork], &i__2, &ierr);
+ if (wntqn) {
+
+/*
+ Compute singular values only
+ (Cworkspace: 0)
+                (Rworkspace: need BDSPAC)
+*/
+
+ dbdsdc_("L", "N", m, &s[1], &rwork[ie], dum, &c__1, dum, &
+ c__1, dum, idum, &rwork[nrwork], &iwork[1], info);
+ } else if (wntqo) {
+ ldwkvt = *m;
+ ivt = nwork;
+ if (*lwork >= *m * *n + *m * 3) {
+
+/* WORK( IVT ) is M by N */
+
+ zlaset_("F", m, n, &c_b56, &c_b56, &work[ivt], &ldwkvt);
+ nwork = ivt + ldwkvt * *n;
+ } else {
+
+/* WORK( IVT ) is M by CHUNK */
+
+ chunk = (*lwork - *m * 3) / *m;
+ nwork = ivt + ldwkvt * chunk;
+ }
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ dbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: need 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__2, &ierr);
+
+ if (*lwork >= *m * *n + *m * 3) {
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix WORK(IVT)
+ Overwrite WORK(IVT) by right singular vectors of A,
+ copying to A
+ (Cworkspace: need M*N+2*M, prefer M*N+M+M*NB)
+ (Rworkspace: need 0)
+*/
+
+ zlacp2_("F", m, m, &rwork[irvt], m, &work[ivt], &ldwkvt);
+ i__2 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", m, n, m, &a[a_offset], lda, &work[
+ itaup], &work[ivt], &ldwkvt, &work[nwork], &i__2,
+ &ierr);
+ zlacpy_("F", m, n, &work[ivt], &ldwkvt, &a[a_offset], lda);
+ } else {
+
+/*
+ Generate P**H in A
+ (Cworkspace: need 2*M, prefer M+M*NB)
+ (Rworkspace: need 0)
+*/
+
+ i__2 = *lwork - nwork + 1;
+ zungbr_("P", m, n, m, &a[a_offset], lda, &work[itaup], &
+ work[nwork], &i__2, &ierr);
+
+/*
+                   Multiply real matrix RWORK(IRVT) by P**H in A, storing the
+                   result in WORK(IVT), copying to A
+ (CWorkspace: need M*M, prefer M*N)
+ (Rworkspace: need 3*M*M, prefer M*M+2*M*N)
+*/
+
+ nrwork = iru;
+ i__2 = *n;
+ i__1 = chunk;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
+ i__1) {
+/* Computing MIN */
+ i__3 = *n - i__ + 1;
+ blk = min(i__3,chunk);
+ zlarcm_(m, &blk, &rwork[irvt], m, &a[i__ * a_dim1 + 1]
+ , lda, &work[ivt], &ldwkvt, &rwork[nrwork]);
+ zlacpy_("F", m, &blk, &work[ivt], &ldwkvt, &a[i__ *
+ a_dim1 + 1], lda);
+/* L60: */
+ }
+ }
+ } else if (wntqs) {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+ dbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: M*M)
+*/
+
+ zlacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: M*M)
+*/
+
+ zlaset_("F", m, n, &c_b56, &c_b56, &vt[vt_offset], ldvt);
+ zlacp2_("F", m, m, &rwork[irvt], m, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", m, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ } else {
+
+/*
+ Perform bidiagonal SVD, computing left singular vectors
+ of bidiagonal matrix in RWORK(IRU) and computing right
+ singular vectors of bidiagonal matrix in RWORK(IRVT)
+ (CWorkspace: need 0)
+ (RWorkspace: need BDSPAC)
+*/
+
+ irvt = nrwork;
+ iru = irvt + *m * *m;
+ nrwork = iru + *m * *m;
+
+ dbdsdc_("L", "I", m, &s[1], &rwork[ie], &rwork[iru], m, &
+ rwork[irvt], m, dum, idum, &rwork[nrwork], &iwork[1],
+ info);
+
+/*
+ Copy real matrix RWORK(IRU) to complex matrix U
+ Overwrite U by left singular vectors of A
+ (CWorkspace: need 3*M, prefer 2*M+M*NB)
+ (RWorkspace: M*M)
+*/
+
+ zlacp2_("F", m, m, &rwork[iru], m, &u[u_offset], ldu);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("Q", "L", "N", m, m, n, &a[a_offset], lda, &work[
+ itauq], &u[u_offset], ldu, &work[nwork], &i__1, &ierr);
+
+/* Set all of VT to identity matrix */
+
+ zlaset_("F", n, n, &c_b56, &c_b57, &vt[vt_offset], ldvt);
+
+/*
+ Copy real matrix RWORK(IRVT) to complex matrix VT
+ Overwrite VT by right singular vectors of A
+ (CWorkspace: need 2*M+N, prefer 2*M+N*NB)
+ (RWorkspace: M*M)
+*/
+
+ zlacp2_("F", m, m, &rwork[irvt], m, &vt[vt_offset], ldvt);
+ i__1 = *lwork - nwork + 1;
+ zunmbr_("P", "R", "C", n, n, m, &a[a_offset], lda, &work[
+ itaup], &vt[vt_offset], ldvt, &work[nwork], &i__1, &
+ ierr);
+ }
+
+ }
+
+ }
+
+/* Undo scaling if necessary */
+
+ if (iscl == 1) {
+ if (anrm > bignum) {
+ dlascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ if (*info != 0 && anrm > bignum) {
+ i__1 = minmn - 1;
+ dlascl_("G", &c__0, &c__0, &bignum, &anrm, &i__1, &c__1, &rwork[
+ ie], &minmn, &ierr);
+ }
+ if (anrm < smlnum) {
+ dlascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+ minmn, &ierr);
+ }
+ if (*info != 0 && anrm < smlnum) {
+ i__1 = minmn - 1;
+ dlascl_("G", &c__0, &c__0, &smlnum, &anrm, &i__1, &c__1, &rwork[
+ ie], &minmn, &ierr);
+ }
+ }
+
+/* Return optimal workspace in WORK(1) */
+
+ work[1].r = (doublereal) maxwrk, work[1].i = 0.;
+
+ return 0;
+
+/* End of ZGESDD */
+
+} /* zgesdd_ */
+
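+/*
+    Illustrative usage sketch (editor's addition, not part of the f2c
+    translation).  It assumes the conventional ZGESDD argument list and the
+    f2c.h typedefs (integer, doublereal, doublecomplex) used throughout this
+    file; the helper name example_zgesdd_call is hypothetical.  The block is
+    guarded out so it has no effect on the build.
+*/
+#if 0
+static void example_zgesdd_call(void)
+{
+    integer m = 4, n = 3, lda = 4, ldu = 4, ldvt = 3;
+    integer lwork, info = 0;
+    doublecomplex a[4 * 3];              /* input matrix, column-major    */
+    doublereal s[3];                     /* singular values               */
+    doublecomplex u[4 * 4], vt[3 * 3];   /* left / right singular vectors */
+    doublereal rwork[100];               /* generous bound for this size  */
+    integer iwork[8 * 3];
+    doublecomplex work[128], wkopt;
+
+    /* ... fill a[] with the matrix entries, column by column ... */
+
+    /* Workspace query: LWORK = -1 returns the optimal size in WORK(1) */
+    lwork = -1;
+    zgesdd_("S", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+            &wkopt, &lwork, rwork, iwork, &info);
+
+    /* Real call; a fixed array large enough for this 4-by-3 case is used
+       here, a production caller would allocate (integer) wkopt.r entries */
+    lwork = 128;
+    zgesdd_("S", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt,
+            work, &lwork, rwork, iwork, &info);
+
+    /* info == 0: s holds the singular values, u the left and vt the
+       conjugate-transposed right singular vectors of a */
+}
+#endif
+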
+/* Subroutine */ int zgesv_(integer *n, integer *nrhs, doublecomplex *a,
+ integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, integer *
+ info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern /* Subroutine */ int xerbla_(char *, integer *), zgetrf_(
+ integer *, integer *, doublecomplex *, integer *, integer *,
+ integer *), zgetrs_(char *, integer *, integer *, doublecomplex *,
+ integer *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGESV computes the solution to a complex system of linear equations
+ A * X = B,
+ where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+
+ The LU decomposition with partial pivoting and row interchanges is
+ used to factor A as
+ A = P * L * U,
+ where P is a permutation matrix, L is unit lower triangular, and U is
+ upper triangular. The factored form of A is then used to solve the
+ system of equations A * X = B.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of linear equations, i.e., the order of the
+ matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the N-by-N coefficient matrix A.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (output) INTEGER array, dimension (N)
+ The pivot indices that define the permutation matrix P;
+ row i of the matrix was interchanged with row IPIV(i).
+
+ B (input/output) COMPLEX*16 array, dimension (LDB,NRHS)
+          On entry, the N-by-NRHS right hand side matrix B.
+ On exit, if INFO = 0, the N-by-NRHS solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, so the solution could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*nrhs < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGESV ", &i__1);
+ return 0;
+ }
+
+/* Compute the LU factorization of A. */
+
+ zgetrf_(n, n, &a[a_offset], lda, &ipiv[1], info);
+ if (*info == 0) {
+
+/* Solve the system A*X = B, overwriting B with X. */
+
+ zgetrs_("No transpose", n, nrhs, &a[a_offset], lda, &ipiv[1], &b[
+ b_offset], ldb, info);
+ }
+ return 0;
+
+/* End of ZGESV */
+
+} /* zgesv_ */
+
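+/*
+    Illustrative usage sketch (editor's addition): solving a small complex
+    system with zgesv_ as documented above.  It assumes the f2c.h typedefs
+    used in this file; the helper name example_zgesv_call is hypothetical.
+    Matrices are stored column by column (Fortran order).  Guarded out so it
+    has no effect on the build.
+*/
+#if 0
+static void example_zgesv_call(void)
+{
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info = 0;
+    integer ipiv[2];
+
+    /* A = [ 1+i  2 ]   column-major: a[0]=A(1,1), a[1]=A(2,1),
+           [ 3    4 ]                 a[2]=A(1,2), a[3]=A(2,2)   */
+    doublecomplex a[4] = { {1., 1.}, {3., 0.}, {2., 0.}, {4., 0.} };
+
+    /* Right hand side b; overwritten with the solution x on exit */
+    doublecomplex b[2] = { {1., 0.}, {0., 1.} };
+
+    zgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
+
+    /* info == 0: b now holds x with A*x = original b;
+       info  > 0: U(info,info) is exactly zero, no solution was computed */
+}
+#endif
+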
+/* Subroutine */ int zgetf2_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, jp;
+ static doublereal sfmin;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *), zgeru_(integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *), zswap_(integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer izamax_(integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGETF2 computes an LU factorization of a general m-by-n matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the m by n matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, U(k,k) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGETF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Compute machine safe minimum */
+
+ sfmin = SAFEMINIMUM;
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+
+/* Find pivot and test for singularity. */
+
+ i__2 = *m - j + 1;
+ jp = j - 1 + izamax_(&i__2, &a[j + j * a_dim1], &c__1);
+ ipiv[j] = jp;
+ i__2 = jp + j * a_dim1;
+ if (a[i__2].r != 0. || a[i__2].i != 0.) {
+
+/* Apply the interchange to columns 1:N. */
+
+ if (jp != j) {
+ zswap_(n, &a[j + a_dim1], lda, &a[jp + a_dim1], lda);
+ }
+
+/* Compute elements J+1:M of J-th column. */
+
+ if (j < *m) {
+ if (z_abs(&a[j + j * a_dim1]) >= sfmin) {
+ i__2 = *m - j;
+ z_div(&z__1, &c_b57, &a[j + j * a_dim1]);
+ zscal_(&i__2, &z__1, &a[j + 1 + j * a_dim1], &c__1);
+ } else {
+ i__2 = *m - j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ + j * a_dim1;
+ z_div(&z__1, &a[j + i__ + j * a_dim1], &a[j + j *
+ a_dim1]);
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L20: */
+ }
+ }
+ }
+
+ } else if (*info == 0) {
+
+ *info = j;
+ }
+
+ if (j < min(*m,*n)) {
+
+/* Update trailing submatrix. */
+
+ i__2 = *m - j;
+ i__3 = *n - j;
+ z__1.r = -1., z__1.i = -0.;
+ zgeru_(&i__2, &i__3, &z__1, &a[j + 1 + j * a_dim1], &c__1, &a[j +
+ (j + 1) * a_dim1], lda, &a[j + 1 + (j + 1) * a_dim1], lda)
+ ;
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of ZGETF2 */
+
+} /* zgetf2_ */
+
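+/*
+    Illustrative usage sketch (editor's addition): one call to the unblocked
+    factorization documented above.  It assumes the f2c.h typedefs used in
+    this file; the helper name example_zgetf2_call is hypothetical.  Guarded
+    out so it has no effect on the build.
+*/
+#if 0
+static void example_zgetf2_call(void)
+{
+    integer m = 3, n = 3, lda = 3, info = 0;
+    integer ipiv[3];
+
+    /* 3-by-3 matrix stored column by column */
+    doublecomplex a[9] = {
+        {0., 1.}, {2., 0.}, {4., 0.},    /* column 1 */
+        {1., 0.}, {3., 0.}, {5., 0.},    /* column 2 */
+        {1., 1.}, {0., 2.}, {6., 0.}     /* column 3 */
+    };
+
+    zgetf2_(&m, &n, a, &lda, ipiv, &info);
+
+    /* On exit the strict lower triangle of a holds L (its unit diagonal is
+       not stored), the upper triangle holds U, and ipiv[i-1] = j means row i
+       was interchanged with row j (1-based) during the factorization. */
+}
+#endif
+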
+/* Subroutine */ int zgetrf_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, integer *ipiv, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, jb, nb, iinfo;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), ztrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *
+ , doublecomplex *, integer *),
+ zgetf2_(integer *, integer *, doublecomplex *, integer *, integer
+ *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *,
+ integer *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGETRF computes an LU factorization of a general M-by-N matrix A
+ using partial pivoting with row interchanges.
+
+ The factorization has the form
+ A = P * L * U
+ where P is a permutation matrix, L is lower triangular with unit
+ diagonal elements (lower trapezoidal if m > n), and U is upper
+ triangular (upper trapezoidal if m < n).
+
+ This is the right-looking Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the M-by-N matrix to be factored.
+ On exit, the factors L and U from the factorization
+ A = P*L*U; the unit diagonal elements of L are not stored.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ IPIV (output) INTEGER array, dimension (min(M,N))
+ The pivot indices; for 1 <= i <= min(M,N), row i of the
+ matrix was interchanged with row IPIV(i).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, U(i,i) is exactly zero. The factorization
+ has been completed, but the factor U is exactly
+ singular, and division by zero will occur if it is used
+ to solve a system of equations.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*m)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGETRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "ZGETRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+ 1);
+ if (nb <= 1 || nb >= min(*m,*n)) {
+
+/* Use unblocked code. */
+
+ zgetf2_(m, n, &a[a_offset], lda, &ipiv[1], info);
+ } else {
+
+/* Use blocked code. */
+
+ i__1 = min(*m,*n);
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+/* Computing MIN */
+ i__3 = min(*m,*n) - j + 1;
+ jb = min(i__3,nb);
+
+/*
+ Factor diagonal and subdiagonal blocks and test for exact
+ singularity.
+*/
+
+ i__3 = *m - j + 1;
+ zgetf2_(&i__3, &jb, &a[j + j * a_dim1], lda, &ipiv[j], &iinfo);
+
+/* Adjust INFO and the pivot indices. */
+
+ if (*info == 0 && iinfo > 0) {
+ *info = iinfo + j - 1;
+ }
+/* Computing MIN */
+ i__4 = *m, i__5 = j + jb - 1;
+ i__3 = min(i__4,i__5);
+ for (i__ = j; i__ <= i__3; ++i__) {
+ ipiv[i__] = j - 1 + ipiv[i__];
+/* L10: */
+ }
+
+/* Apply interchanges to columns 1:J-1. */
+
+ i__3 = j - 1;
+ i__4 = j + jb - 1;
+ zlaswp_(&i__3, &a[a_offset], lda, &j, &i__4, &ipiv[1], &c__1);
+
+ if (j + jb <= *n) {
+
+/* Apply interchanges to columns J+JB:N. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j + jb - 1;
+ zlaswp_(&i__3, &a[(j + jb) * a_dim1 + 1], lda, &j, &i__4, &
+ ipiv[1], &c__1);
+
+/* Compute block row of U. */
+
+ i__3 = *n - j - jb + 1;
+ ztrsm_("Left", "Lower", "No transpose", "Unit", &jb, &i__3, &
+ c_b57, &a[j + j * a_dim1], lda, &a[j + (j + jb) *
+ a_dim1], lda);
+ if (j + jb <= *m) {
+
+/* Update trailing submatrix. */
+
+ i__3 = *m - j - jb + 1;
+ i__4 = *n - j - jb + 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "No transpose", &i__3, &i__4, &jb,
+ &z__1, &a[j + jb + j * a_dim1], lda, &a[j + (j +
+ jb) * a_dim1], lda, &c_b57, &a[j + jb + (j + jb) *
+ a_dim1], lda);
+ }
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of ZGETRF */
+
+} /* zgetrf_ */
+
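+/*
+    Illustrative usage sketch (editor's addition): factor once with zgetrf_,
+    then reuse the factors for a right hand side via zgetrs_, which is the
+    same pattern zgesv_ follows above.  It assumes the f2c.h typedefs used in
+    this file; the helper name example_zgetrf_reuse is hypothetical.  Guarded
+    out so it has no effect on the build.
+*/
+#if 0
+static void example_zgetrf_reuse(void)
+{
+    integer n = 2, nrhs = 2, lda = 2, ldb = 2, info = 0;
+    integer ipiv[2];
+
+    doublecomplex a[4] = { {4., 0.}, {1., 1.}, {2., -1.}, {3., 0.} };
+    doublecomplex b[4] = { {1., 0.}, {0., 0.}, {0., 0.}, {1., 0.} };
+
+    /* Factor A = P*L*U once ... */
+    zgetrf_(&n, &n, a, &lda, ipiv, &info);
+
+    if (info == 0) {
+        /* ... then solve A*X = B for as many right hand sides as needed */
+        zgetrs_("N", &n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
+    }
+}
+#endif
+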
+/* Subroutine */ int zgetrs_(char *trans, integer *n, integer *nrhs,
+ doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b,
+ integer *ldb, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *),
+ xerbla_(char *, integer *);
+ static logical notran;
+ extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *,
+ integer *, integer *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZGETRS solves a system of linear equations
+ A * X = B, A**T * X = B, or A**H * X = B
+ with a general N-by-N matrix A using the LU factorization computed
+ by ZGETRF.
+
+ Arguments
+ =========
+
+ TRANS (input) CHARACTER*1
+ Specifies the form of the system of equations:
+ = 'N': A * X = B (No transpose)
+ = 'T': A**T * X = B (Transpose)
+ = 'C': A**H * X = B (Conjugate transpose)
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The factors L and U from the factorization A = P*L*U
+ as computed by ZGETRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ IPIV (input) INTEGER array, dimension (N)
+ The pivot indices from ZGETRF; for 1<=i<=N, row i of the
+ matrix was interchanged with row IPIV(i).
+
+ B (input/output) COMPLEX*16 array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ notran = lsame_(trans, "N");
+ if (! notran && ! lsame_(trans, "T") && ! lsame_(
+ trans, "C")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZGETRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (notran) {
+
+/*
+ Solve A * X = B.
+
+ Apply row interchanges to the right hand sides.
+*/
+
+ zlaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c__1);
+
+/* Solve L*X = B, overwriting B with X. */
+
+ ztrsm_("Left", "Lower", "No transpose", "Unit", n, nrhs, &c_b57, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b57, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A**T * X = B or A**H * X = B.
+
+        Solve U**T*X = B or U**H*X = B, overwriting B with X.
+*/
+
+ ztrsm_("Left", "Upper", trans, "Non-unit", n, nrhs, &c_b57, &a[
+ a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L**T*X = B or L**H*X = B, overwriting B with X. */
+
+ ztrsm_("Left", "Lower", trans, "Unit", n, nrhs, &c_b57, &a[a_offset],
+ lda, &b[b_offset], ldb);
+
+/* Apply row interchanges to the solution vectors. */
+
+ zlaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1);
+ }
+
+ return 0;
+
+/* End of ZGETRS */
+
+} /* zgetrs_ */
+
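+/*
+    Illustrative usage sketch (editor's addition): after a factorization by
+    zgetrf_ (or zgesv_), zgetrs_ can also solve the conjugate-transposed
+    system A**H * X = B by passing TRANS = 'C'.  It assumes the f2c.h
+    typedefs used in this file; the helper name example_zgetrs_ctrans is
+    hypothetical.  Guarded out so it has no effect on the build.
+*/
+#if 0
+static void example_zgetrs_ctrans(doublecomplex *lu, integer *ipiv)
+{
+    /* lu and ipiv hold the output of a previous 2-by-2 zgetrf_ call */
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info = 0;
+    doublecomplex b[2] = { {1., 0.}, {2., 0.} };
+
+    zgetrs_("C", &n, &nrhs, lu, &lda, ipiv, b, &ldb, &info);
+
+    /* b now holds the solution of A**H * x = original b */
+}
+#endif
+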
+/* Subroutine */ int zheevd_(char *jobz, char *uplo, integer *n,
+ doublecomplex *a, integer *lda, doublereal *w, doublecomplex *work,
+ integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork,
+ integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal eps;
+ static integer inde;
+ static doublereal anrm;
+ static integer imax;
+ static doublereal rmin, rmax;
+ static integer lopt;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ static doublereal sigma;
+ extern logical lsame_(char *, char *);
+ static integer iinfo, lwmin, liopt;
+ static logical lower;
+ static integer llrwk, lropt;
+ static logical wantz;
+ static integer indwk2, llwrk2;
+
+ static integer iscale;
+ static doublereal safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal bignum;
+ extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *,
+ integer *, doublereal *);
+ static integer indtau;
+ extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *,
+ integer *), zlascl_(char *, integer *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublecomplex *, integer *,
+ integer *), zstedc_(char *, integer *, doublereal *,
+ doublereal *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublereal *, integer *, integer *, integer *, integer
+ *);
+ static integer indrwk, indwrk, liwmin;
+ extern /* Subroutine */ int zhetrd_(char *, integer *, doublecomplex *,
+ integer *, doublereal *, doublereal *, doublecomplex *,
+ doublecomplex *, integer *, integer *), zlacpy_(char *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ integer *);
+ static integer lrwmin, llwork;
+ static doublereal smlnum;
+ static logical lquery;
+ extern /* Subroutine */ int zunmtr_(char *, char *, char *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*
+ -- LAPACK driver routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZHEEVD computes all eigenvalues and, optionally, eigenvectors of a
+ complex Hermitian matrix A. If eigenvectors are desired, it uses a
+ divide and conquer algorithm.
+
+ The divide and conquer algorithm makes very mild assumptions about
+ floating point arithmetic. It will work on machines with a guard
+ digit in add/subtract, or on those binary machines without guard
+ digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+ Cray-2. It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ JOBZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only;
+ = 'V': Compute eigenvalues and eigenvectors.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA, N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of A contains the
+ upper triangular part of the matrix A. If UPLO = 'L',
+ the leading N-by-N lower triangular part of A contains
+ the lower triangular part of the matrix A.
+ On exit, if JOBZ = 'V', then if INFO = 0, A contains the
+ orthonormal eigenvectors of the matrix A.
+ If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
+ or the upper triangle (if UPLO='U') of A, including the
+ diagonal, is destroyed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ W (output) DOUBLE PRECISION array, dimension (N)
+ If INFO = 0, the eigenvalues in ascending order.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK.
+ If N <= 1, LWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LWORK must be at least N + 1.
+ If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal sizes of the WORK, RWORK and
+ IWORK arrays, returns these values as the first entries of
+ the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ RWORK (workspace/output) DOUBLE PRECISION array,
+ dimension (LRWORK)
+ On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+
+ LRWORK (input) INTEGER
+ The dimension of the array RWORK.
+ If N <= 1, LRWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LRWORK must be at least N.
+ If JOBZ = 'V' and N > 1, LRWORK must be at least
+ 1 + 5*N + 2*N**2.
+
+ If LRWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If N <= 1, LIWORK must be at least 1.
+ If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
+ If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i and JOBZ = 'N', then the algorithm failed
+ to converge; i off-diagonal elements of an intermediate
+ tridiagonal form did not converge to zero;
+ if INFO = i and JOBZ = 'V', then the algorithm failed
+ to compute an eigenvalue while working on the submatrix
+ lying in rows and columns INFO/(N+1) through
+ mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ Modified description of INFO. Sven, 16 Feb 05.
+ =====================================================================
+
+
+ Test the input parameters.
+*/
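+
+/*
+   Illustrative sketch, not part of LAPACK: the workspace query described
+   above (LWORK = LRWORK = LIWORK = -1), written as a hypothetical caller
+   of this f2c interface might.  The variables n, a, lda, w, work, rwork
+   and iwork are assumed to be the caller's own.
+
+       integer lwork = -1, lrwork = -1, liwork = -1, info;
+       doublecomplex wkopt;
+       doublereal rwkopt;
+       integer iwkopt;
+
+       zheevd_("V", "L", &n, a, &lda, w, &wkopt, &lwork,
+               &rwkopt, &lrwork, &iwkopt, &liwork, &info);
+
+       lwork = (integer) wkopt.r;
+       lrwork = (integer) rwkopt;
+       liwork = iwkopt;
+       (allocate work, rwork and iwork of those lengths, then call
+        zheevd_ again with the real sizes to compute the decomposition)
+*/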
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --w;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ wantz = lsame_(jobz, "V");
+ lower = lsame_(uplo, "L");
+ lquery = *lwork == -1 || *lrwork == -1 || *liwork == -1;
+
+ *info = 0;
+ if (! (wantz || lsame_(jobz, "N"))) {
+ *info = -1;
+ } else if (! (lower || lsame_(uplo, "U"))) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+
+ if (*info == 0) {
+ if (*n <= 1) {
+ lwmin = 1;
+ lrwmin = 1;
+ liwmin = 1;
+ lopt = lwmin;
+ lropt = lrwmin;
+ liopt = liwmin;
+ } else {
+ if (wantz) {
+ lwmin = (*n << 1) + *n * *n;
+/* Computing 2nd power */
+ i__1 = *n;
+ lrwmin = *n * 5 + 1 + (i__1 * i__1 << 1);
+ liwmin = *n * 5 + 3;
+ } else {
+ lwmin = *n + 1;
+ lrwmin = *n;
+ liwmin = 1;
+ }
+/* Computing MAX */
+ i__1 = lwmin, i__2 = *n + ilaenv_(&c__1, "ZHETRD", uplo, n, &c_n1,
+ &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+ lopt = max(i__1,i__2);
+ lropt = lrwmin;
+ liopt = liwmin;
+ }
+ work[1].r = (doublereal) lopt, work[1].i = 0.;
+ rwork[1] = (doublereal) lropt;
+ iwork[1] = liopt;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*lrwork < lrwmin && ! lquery) {
+ *info = -10;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZHEEVD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ i__1 = a_dim1 + 1;
+ w[1] = a[i__1].r;
+ if (wantz) {
+ i__1 = a_dim1 + 1;
+ a[i__1].r = 1., a[i__1].i = 0.;
+ }
+ return 0;
+ }
+
+/* Get machine constants. */
+
+ safmin = SAFEMINIMUM;
+ eps = PRECISION;
+ smlnum = safmin / eps;
+ bignum = 1. / smlnum;
+ rmin = sqrt(smlnum);
+ rmax = sqrt(bignum);
+
+/* Scale matrix to allowable range, if necessary. */
+
+ anrm = zlanhe_("M", uplo, n, &a[a_offset], lda, &rwork[1]);
+ iscale = 0;
+ if (anrm > 0. && anrm < rmin) {
+ iscale = 1;
+ sigma = rmin / anrm;
+ } else if (anrm > rmax) {
+ iscale = 1;
+ sigma = rmax / anrm;
+ }
+ if (iscale == 1) {
+ zlascl_(uplo, &c__0, &c__0, &c_b1034, &sigma, n, n, &a[a_offset], lda,
+ info);
+ }
+
+/* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */
+
+ inde = 1;
+ indtau = 1;
+ indwrk = indtau + *n;
+ indrwk = inde + *n;
+ indwk2 = indwrk + *n * *n;
+ llwork = *lwork - indwrk + 1;
+ llwrk2 = *lwork - indwk2 + 1;
+ llrwk = *lrwork - indrwk + 1;
+ zhetrd_(uplo, n, &a[a_offset], lda, &w[1], &rwork[inde], &work[indtau], &
+ work[indwrk], &llwork, &iinfo);
+
+/*
+ For eigenvalues only, call DSTERF. For eigenvectors, first call
+ ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
+ tridiagonal matrix, then call ZUNMTR to multiply it to the
+ Householder transformations represented as Householder vectors in
+ A.
+*/
+
+ if (! wantz) {
+ dsterf_(n, &w[1], &rwork[inde], info);
+ } else {
+ zstedc_("I", n, &w[1], &rwork[inde], &work[indwrk], n, &work[indwk2],
+ &llwrk2, &rwork[indrwk], &llrwk, &iwork[1], liwork, info);
+ zunmtr_("L", uplo, "N", n, n, &a[a_offset], lda, &work[indtau], &work[
+ indwrk], n, &work[indwk2], &llwrk2, &iinfo);
+ zlacpy_("A", n, n, &work[indwrk], n, &a[a_offset], lda);
+ }
+
+/* If matrix was scaled, then rescale eigenvalues appropriately. */
+
+ if (iscale == 1) {
+ if (*info == 0) {
+ imax = *n;
+ } else {
+ imax = *info - 1;
+ }
+ d__1 = 1. / sigma;
+ dscal_(&imax, &d__1, &w[1], &c__1);
+ }
+
+ work[1].r = (doublereal) lopt, work[1].i = 0.;
+ rwork[1] = (doublereal) lropt;
+ iwork[1] = liopt;
+
+ return 0;
+
+/* End of ZHEEVD */
+
+} /* zheevd_ */
+
+/* Subroutine */ int zhetd2_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, doublereal *d__, doublereal *e, doublecomplex *tau,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex taui;
+ extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static doublecomplex alpha;
+ extern logical lsame_(char *, char *);
+ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+ extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, doublecomplex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(
+ char *, integer *), zlarfg_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZHETD2 reduces a complex Hermitian matrix A to real symmetric
+ tridiagonal form T by a unitary similarity transformation:
+ Q' * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ Hermitian matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the unitary
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the unitary matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) DOUBLE PRECISION array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) COMPLEX*16 array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
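+
+/*
+   Illustrative note, not part of LAPACK: the reflectors stored in A and
+   TAU (see Further Details above) can be accumulated into an explicit
+   unitary Q by the companion routine ZUNGTR.  The call below is only a
+   sketch; it assumes zungtr_ and a caller-provided workspace exist.
+
+       zhetd2_("L", &n, a, &lda, d, e, tau, &info);
+       zungtr_("L", &n, a, &lda, tau, work, &lwork, &info);
+*/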
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZHETD2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Reduce the upper triangle of A */
+
+ i__1 = *n + *n * a_dim1;
+ i__2 = *n + *n * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ for (i__ = *n - 1; i__ >= 1; --i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(1:i-1,i+1)
+*/
+
+ i__1 = i__ + (i__ + 1) * a_dim1;
+ alpha.r = a[i__1].r, alpha.i = a[i__1].i;
+ zlarfg_(&i__, &alpha, &a[(i__ + 1) * a_dim1 + 1], &c__1, &taui);
+ i__1 = i__;
+ e[i__1] = alpha.r;
+
+ if (taui.r != 0. || taui.i != 0.) {
+
+/* Apply H(i) from both sides to A(1:i,1:i) */
+
+ i__1 = i__ + (i__ + 1) * a_dim1;
+ a[i__1].r = 1., a[i__1].i = 0.;
+
+/* Compute x := tau * A * v storing x in TAU(1:i) */
+
+ zhemv_(uplo, &i__, &taui, &a[a_offset], lda, &a[(i__ + 1) *
+ a_dim1 + 1], &c__1, &c_b56, &tau[1], &c__1)
+ ;
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ z__3.r = -.5, z__3.i = -0.;
+ z__2.r = z__3.r * taui.r - z__3.i * taui.i, z__2.i = z__3.r *
+ taui.i + z__3.i * taui.r;
+ zdotc_(&z__4, &i__, &tau[1], &c__1, &a[(i__ + 1) * a_dim1 + 1]
+ , &c__1);
+ z__1.r = z__2.r * z__4.r - z__2.i * z__4.i, z__1.i = z__2.r *
+ z__4.i + z__2.i * z__4.r;
+ alpha.r = z__1.r, alpha.i = z__1.i;
+ zaxpy_(&i__, &alpha, &a[(i__ + 1) * a_dim1 + 1], &c__1, &tau[
+ 1], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ z__1.r = -1., z__1.i = -0.;
+ zher2_(uplo, &i__, &z__1, &a[(i__ + 1) * a_dim1 + 1], &c__1, &
+ tau[1], &c__1, &a[a_offset], lda);
+
+ } else {
+ i__1 = i__ + i__ * a_dim1;
+ i__2 = i__ + i__ * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ }
+ i__1 = i__ + (i__ + 1) * a_dim1;
+ i__2 = i__;
+ a[i__1].r = e[i__2], a[i__1].i = 0.;
+ i__1 = i__ + 1;
+ i__2 = i__ + 1 + (i__ + 1) * a_dim1;
+ d__[i__1] = a[i__2].r;
+ i__1 = i__;
+ tau[i__1].r = taui.r, tau[i__1].i = taui.i;
+/* L10: */
+ }
+ i__1 = a_dim1 + 1;
+ d__[1] = a[i__1].r;
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__1 = a_dim1 + 1;
+ i__2 = a_dim1 + 1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Generate elementary reflector H(i) = I - tau * v * v'
+ to annihilate A(i+2:n,i)
+*/
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*n) + i__ * a_dim1], &c__1, &
+ taui);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+
+ if (taui.r != 0. || taui.i != 0.) {
+
+/* Apply H(i) from both sides to A(i+1:n,i+1:n) */
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/*                 Compute x := tau * A * v storing x in TAU(i:n-1) */
+
+ i__2 = *n - i__;
+ zhemv_(uplo, &i__2, &taui, &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_b56, &tau[
+ i__], &c__1);
+
+/* Compute w := x - 1/2 * tau * (x'*v) * v */
+
+ z__3.r = -.5, z__3.i = -0.;
+ z__2.r = z__3.r * taui.r - z__3.i * taui.i, z__2.i = z__3.r *
+ taui.i + z__3.i * taui.r;
+ i__2 = *n - i__;
+ zdotc_(&z__4, &i__2, &tau[i__], &c__1, &a[i__ + 1 + i__ *
+ a_dim1], &c__1);
+ z__1.r = z__2.r * z__4.r - z__2.i * z__4.i, z__1.i = z__2.r *
+ z__4.i + z__2.i * z__4.r;
+ alpha.r = z__1.r, alpha.i = z__1.i;
+ i__2 = *n - i__;
+ zaxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &tau[
+ i__], &c__1);
+
+/*
+ Apply the transformation as a rank-2 update:
+ A := A - v * w' - w * v'
+*/
+
+ i__2 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zher2_(uplo, &i__2, &z__1, &a[i__ + 1 + i__ * a_dim1], &c__1,
+ &tau[i__], &c__1, &a[i__ + 1 + (i__ + 1) * a_dim1],
+ lda);
+
+ } else {
+ i__2 = i__ + 1 + (i__ + 1) * a_dim1;
+ i__3 = i__ + 1 + (i__ + 1) * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ }
+ i__2 = i__ + 1 + i__ * a_dim1;
+ i__3 = i__;
+ a[i__2].r = e[i__3], a[i__2].i = 0.;
+ i__2 = i__;
+ i__3 = i__ + i__ * a_dim1;
+ d__[i__2] = a[i__3].r;
+ i__2 = i__;
+ tau[i__2].r = taui.r, tau[i__2].i = taui.i;
+/* L20: */
+ }
+ i__1 = *n;
+ i__2 = *n + *n * a_dim1;
+ d__[i__1] = a[i__2].r;
+ }
+
+ return 0;
+
+/* End of ZHETD2 */
+
+} /* zhetd2_ */
+
+/* Subroutine */ int zhetrd_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, doublereal *d__, doublereal *e, doublecomplex *tau,
+ doublecomplex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, nb, kk, nx, iws;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ static logical upper;
+ extern /* Subroutine */ int zhetd2_(char *, integer *, doublecomplex *,
+ integer *, doublereal *, doublereal *, doublecomplex *, integer *), zher2k_(char *, char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublereal *, doublecomplex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlatrd_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublereal *, doublecomplex *,
+ doublecomplex *, integer *);
+ static integer ldwork, lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZHETRD reduces a complex Hermitian matrix A to real symmetric
+ tridiagonal form T by a unitary similarity transformation:
+ Q**H * A * Q = T.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit, if UPLO = 'U', the diagonal and first superdiagonal
+ of A are overwritten by the corresponding elements of the
+ tridiagonal matrix T, and the elements above the first
+ superdiagonal, with the array TAU, represent the unitary
+ matrix Q as a product of elementary reflectors; if UPLO
+ = 'L', the diagonal and first subdiagonal of A are over-
+ written by the corresponding elements of the tridiagonal
+ matrix T, and the elements below the first subdiagonal, with
+ the array TAU, represent the unitary matrix Q as a product
+ of elementary reflectors. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ D (output) DOUBLE PRECISION array, dimension (N)
+ The diagonal elements of the tridiagonal matrix T:
+ D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (N-1)
+ The off-diagonal elements of the tridiagonal matrix T:
+ E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
+
+ TAU (output) COMPLEX*16 array, dimension (N-1)
+ The scalar factors of the elementary reflectors (see Further
+ Details).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= 1.
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n-1) . . . H(2) H(1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
+ A(1:i-1,i+1), and tau in TAU(i).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(n-1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
+ and tau in TAU(i).
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( d e v2 v3 v4 ) ( d )
+ ( d e v3 v4 ) ( e d )
+ ( d e v4 ) ( v1 e d )
+ ( d e ) ( v1 v2 e d )
+ ( d ) ( v1 v2 v3 e d )
+
+ where d and e denote diagonal and off-diagonal elements of T, and vi
+ denotes an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ } else if (*lwork < 1 && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+
+/* Determine the block size. */
+
+ nb = ilaenv_(&c__1, "ZHETRD", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6,
+ (ftnlen)1);
+ lwkopt = *n * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZHETRD", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nx = *n;
+ iws = 1;
+ if (nb > 1 && nb < *n) {
+
+/*
+ Determine when to cross over from blocked to unblocked code
+ (last block is always handled by unblocked code).
+
+ Computing MAX
+*/
+ i__1 = nb, i__2 = ilaenv_(&c__3, "ZHETRD", uplo, n, &c_n1, &c_n1, &
+ c_n1, (ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *n) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: determine the
+ minimum value of NB, and reduce NB or force use of
+ unblocked code by setting NX = N.
+
+ Computing MAX
+*/
+ i__1 = *lwork / ldwork;
+ nb = max(i__1,1);
+ nbmin = ilaenv_(&c__2, "ZHETRD", uplo, n, &c_n1, &c_n1, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ if (nb < nbmin) {
+ nx = *n;
+ }
+ }
+ } else {
+ nx = *n;
+ }
+ } else {
+ nb = 1;
+ }
+
+ if (upper) {
+
+/*
+ Reduce the upper triangle of A.
+ Columns 1:kk are handled by the unblocked method.
+*/
+
+ kk = *n - (*n - nx + nb - 1) / nb * nb;
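+
+/*
+          KK = N - NB*ceil((N-NX)/NB): the trailing N-KK columns form whole
+          blocks of width NB for the blocked loop below, while the leading
+          KK (at most NX) columns are left to the unblocked ZHETD2 call.
+*/
+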
+ i__1 = kk + 1;
+ i__2 = -nb;
+ for (i__ = *n - nb + 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ +=
+ i__2) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = i__ + nb - 1;
+ zlatrd_(uplo, &i__3, &nb, &a[a_offset], lda, &e[1], &tau[1], &
+ work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(1:i-1,1:i-1), using an
+ update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zher2k_(uplo, "No transpose", &i__3, &nb, &z__1, &a[i__ * a_dim1
+ + 1], lda, &work[1], &ldwork, &c_b1034, &a[a_offset], lda);
+
+/*
+ Copy superdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j - 1 + j * a_dim1;
+ i__5 = j - 1;
+ a[i__4].r = e[i__5], a[i__4].i = 0.;
+ i__4 = j;
+ i__5 = j + j * a_dim1;
+ d__[i__4] = a[i__5].r;
+/* L10: */
+ }
+/* L20: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ zhetd2_(uplo, &kk, &a[a_offset], lda, &d__[1], &e[1], &tau[1], &iinfo);
+ } else {
+
+/* Reduce the lower triangle of A */
+
+ i__2 = *n - nx;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+
+/*
+ Reduce columns i:i+nb-1 to tridiagonal form and form the
+ matrix W which is needed to update the unreduced part of
+ the matrix
+*/
+
+ i__3 = *n - i__ + 1;
+ zlatrd_(uplo, &i__3, &nb, &a[i__ + i__ * a_dim1], lda, &e[i__], &
+ tau[i__], &work[1], &ldwork);
+
+/*
+ Update the unreduced submatrix A(i+nb:n,i+nb:n), using
+ an update of the form: A := A - V*W' - W*V'
+*/
+
+ i__3 = *n - i__ - nb + 1;
+ z__1.r = -1., z__1.i = -0.;
+ zher2k_(uplo, "No transpose", &i__3, &nb, &z__1, &a[i__ + nb +
+ i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b1034, &a[
+ i__ + nb + (i__ + nb) * a_dim1], lda);
+
+/*
+ Copy subdiagonal elements back into A, and diagonal
+ elements into D
+*/
+
+ i__3 = i__ + nb - 1;
+ for (j = i__; j <= i__3; ++j) {
+ i__4 = j + 1 + j * a_dim1;
+ i__5 = j;
+ a[i__4].r = e[i__5], a[i__4].i = 0.;
+ i__4 = j;
+ i__5 = j + j * a_dim1;
+ d__[i__4] = a[i__5].r;
+/* L30: */
+ }
+/* L40: */
+ }
+
+/* Use unblocked code to reduce the last or only block */
+
+ i__1 = *n - i__ + 1;
+ zhetd2_(uplo, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__],
+ &tau[i__], &iinfo);
+ }
+
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZHETRD */
+
+} /* zhetrd_ */
+
+/* Subroutine */ int zhseqr_(char *job, char *compz, integer *n, integer *ilo,
+ integer *ihi, doublecomplex *h__, integer *ldh, doublecomplex *w,
+ doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3[2];
+ doublereal d__1, d__2, d__3;
+ doublecomplex z__1;
+ char ch__1[2];
+
+ /* Local variables */
+ static doublecomplex hl[2401] /* was [49][49] */;
+ static integer kbot, nmin;
+ extern logical lsame_(char *, char *);
+ static logical initz;
+ static doublecomplex workl[49];
+ static logical wantt, wantz;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zlaqr0_(logical *, logical *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *), xerbla_(char *, integer *
+ );
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ integer *, integer *, doublecomplex *, integer *, integer *),
+ zlacpy_(char *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zlaset_(char *, integer *,
+ integer *, doublecomplex *, doublecomplex *, doublecomplex *,
+ integer *);
+ static logical lquery;
+
+
+/*
+ -- LAPACK computational routine (version 3.2.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ June 2010
+
+ Purpose
+ =======
+
+ ZHSEQR computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**H, where T is an upper triangular matrix (the
+ Schur form), and Z is the unitary matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input unitary
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the unitary matrix Q: A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+ Arguments
+ =========
+
+ JOB (input) CHARACTER*1
+ = 'E': compute eigenvalues only;
+ = 'S': compute eigenvalues and the Schur form T.
+
+ COMPZ (input) CHARACTER*1
+ = 'N': no Schur vectors are computed;
+ = 'I': Z is initialized to the unit matrix and the matrix Z
+ of Schur vectors of H is returned;
+            = 'V': Z must contain a unitary matrix Q on entry, and
+ the product Q*Z is returned.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
+ set by a previous call to ZGEBAL, and then passed to ZGEHRD
+ when the matrix output by ZGEBAL is reduced to Hessenberg
+ form. Otherwise ILO and IHI should be set to 1 and N
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) COMPLEX*16 array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and JOB = 'S', H contains the upper
+ triangular matrix T from the Schur decomposition (the
+ Schur form). If INFO = 0 and JOB = 'E', the contents of
+ H are unspecified on exit. (The output value of H when
+ INFO.GT.0 is given under the description of INFO below.)
+
+ Unlike earlier versions of ZHSEQR, this subroutine may
+            explicitly set H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+ or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ W (output) COMPLEX*16 array, dimension (N)
+ The computed eigenvalues. If JOB = 'S', the eigenvalues are
+ stored in the same order as on the diagonal of the Schur
+ form returned in H, with W(i) = H(i,i).
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,N)
+ If COMPZ = 'N', Z is not referenced.
+ If COMPZ = 'I', on entry Z need not be set and on exit,
+ if INFO = 0, Z contains the unitary matrix Z of the Schur
+ vectors of H. If COMPZ = 'V', on entry Z must contain an
+ N-by-N matrix Q, which is assumed to be equal to the unit
+ matrix except for the submatrix Z(ILO:IHI,ILO:IHI). On exit,
+ if INFO = 0, Z contains Q*Z.
+ Normally Q is the unitary matrix generated by ZUNGHR
+ after the call to ZGEHRD which formed the Hessenberg matrix
+ H. (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if COMPZ = 'I' or
+            COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (LWORK)
+ On exit, if INFO = 0, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient and delivers very good and sometimes
+ optimal performance. However, LWORK as large as 11*N
+ may be required for optimal performance. A workspace
+ query is recommended to determine the optimal workspace
+ size.
+
+ If LWORK = -1, then ZHSEQR does a workspace query.
+ In this case, ZHSEQR checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+            issued by XERBLA. Neither H nor Z is accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .LT. 0: if INFO = -i, the i-th argument had an illegal
+ value
+ .GT. 0: if INFO = i, ZHSEQR failed to compute all of
+                the eigenvalues. Elements 1:ilo-1 and i+1:n of W
+                contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+ If INFO .GT. 0 and JOB = 'E', then on exit, the
+ remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and JOB = 'S', then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is a unitary matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and COMPZ = 'V', then on exit
+
+ (final value of Z) = (initial value of Z)*U
+
+ where U is the unitary matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'I', then on exit
+ (final value of Z) = U
+ where U is the unitary matrix in (*) (regard-
+ less of the value of JOB.)
+
+ If INFO .GT. 0 and COMPZ = 'N', then Z is not
+ accessed.
+
+ ================================================================
+ Default values supplied by
+ ILAENV(ISPEC,'ZHSEQR',JOB(:1)//COMPZ(:1),N,ILO,IHI,LWORK).
+ It is suggested that these defaults be adjusted in order
+ to attain best performance in each particular
+ computational environment.
+
+ ISPEC=12: The ZLAHQR vs ZLAQR0 crossover point.
+ Default: 75. (Must be at least 11.)
+
+ ISPEC=13: Recommended deflation window size.
+ This depends on ILO, IHI and NS. NS is the
+ number of simultaneous shifts returned
+ by ILAENV(ISPEC=15). (See ISPEC=15 below.)
+ The default for (IHI-ILO+1).LE.500 is NS.
+ The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+
+ ISPEC=14: Nibble crossover point. (See IPARMQ for
+ details.) Default: 14% of deflation window
+ size.
+
+ ISPEC=15: Number of simultaneous shifts in a multishift
+ QR iteration.
+
+ If IHI-ILO+1 is ...
+
+ greater than ...but less ... the
+ or equal to ... than default is
+
+ 1 30 NS = 2(+)
+ 30 60 NS = 4(+)
+ 60 150 NS = 10(+)
+ 150 590 NS = **
+ 590 3000 NS = 64
+ 3000 6000 NS = 128
+ 6000 infinity NS = 256
+
+ (+) By default some or all matrices of this order
+ are passed to the implicit double shift routine
+ ZLAHQR and this parameter is ignored. See
+ ISPEC=12 above and comments in IPARMQ for
+ details.
+
+ (**) The asterisks (**) indicate an ad-hoc
+ function of N increasing from 10 to 64.
+
+ ISPEC=16: Select structured matrix multiply.
+ If the number of simultaneous shifts (specified
+ by ISPEC=15) is less than 14, then the default
+ for ISPEC=16 is 0. Otherwise the default for
+ ISPEC=16 is 2.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . ZLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== NL allocates some local workspace to help small matrices
+ . through a rare ZLAHQR failure. NL .GT. NTINY = 11 is
+ . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+ . mended. (The default value of NMIN is 75.) Using NL = 49
+ . allows up to six simultaneous shifts and a 16-by-16
+ . deflation window. ====
+
+ ==== Decode and check the input parameters. ====
+*/
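+
+/*
+   Illustrative sketch, not part of LAPACK: the reduction pipeline named
+   in Purpose above, as a hypothetical caller might chain it.  It assumes
+   zgehrd_ and zunghr_ are available in this build and that n, ilo, ihi,
+   a, lda, q, ldq, w, tau, work and lwork are the caller's variables.
+
+       zgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+       zlacpy_("A", &n, &n, a, &lda, q, &ldq);
+       zunghr_(&n, &ilo, &ihi, q, &ldq, tau, work, &lwork, &info);
+       zhseqr_("S", "V", &n, &ilo, &ihi, a, &lda, w, q, &ldq,
+               work, &lwork, &info);
+*/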
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ wantt = lsame_(job, "S");
+ initz = lsame_(compz, "I");
+ wantz = initz || lsame_(compz, "V");
+ d__1 = (doublereal) max(1,*n);
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ lquery = *lwork == -1;
+
+ *info = 0;
+ if (! lsame_(job, "E") && ! wantt) {
+ *info = -1;
+ } else if (! lsame_(compz, "N") && ! wantz) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -4;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -5;
+ } else if (*ldh < max(1,*n)) {
+ *info = -7;
+ } else if (*ldz < 1 || wantz && *ldz < max(1,*n)) {
+ *info = -10;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info != 0) {
+
+/* ==== Quick return in case of invalid argument. ==== */
+
+ i__1 = -(*info);
+ xerbla_("ZHSEQR", &i__1);
+ return 0;
+
+ } else if (*n == 0) {
+
+/* ==== Quick return in case N = 0; nothing to do. ==== */
+
+ return 0;
+
+ } else if (lquery) {
+
+/* ==== Quick return in case of a workspace query ==== */
+
+ zlaqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1], ilo,
+ ihi, &z__[z_offset], ldz, &work[1], lwork, info);
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+ Computing MAX
+*/
+ d__2 = work[1].r, d__3 = (doublereal) max(1,*n);
+ d__1 = max(d__2,d__3);
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ return 0;
+
+ } else {
+
+/* ==== copy eigenvalues isolated by ZGEBAL ==== */
+
+ if (*ilo > 1) {
+ i__1 = *ilo - 1;
+ i__2 = *ldh + 1;
+ zcopy_(&i__1, &h__[h_offset], &i__2, &w[1], &c__1);
+ }
+ if (*ihi < *n) {
+ i__1 = *n - *ihi;
+ i__2 = *ldh + 1;
+ zcopy_(&i__1, &h__[*ihi + 1 + (*ihi + 1) * h_dim1], &i__2, &w[*
+ ihi + 1], &c__1);
+ }
+
+/* ==== Initialize Z, if requested ==== */
+
+ if (initz) {
+ zlaset_("A", n, n, &c_b56, &c_b57, &z__[z_offset], ldz)
+ ;
+ }
+
+/* ==== Quick return if possible ==== */
+
+ if (*ilo == *ihi) {
+ i__1 = *ilo;
+ i__2 = *ilo + *ilo * h_dim1;
+ w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i;
+ return 0;
+ }
+
+/*
+ ==== ZLAHQR/ZLAQR0 crossover point ====
+
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = job;
+ i__3[1] = 1, a__1[1] = compz;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ nmin = ilaenv_(&c__12, "ZHSEQR", ch__1, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== ZLAQR0 for big matrices; ZLAHQR for small ones ==== */
+
+ if (*n > nmin) {
+ zlaqr0_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ ilo, ihi, &z__[z_offset], ldz, &work[1], lwork, info);
+ } else {
+
+/* ==== Small matrix ==== */
+
+ zlahqr_(&wantt, &wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ ilo, ihi, &z__[z_offset], ldz, info);
+
+ if (*info > 0) {
+
+/*
+ ==== A rare ZLAHQR failure! ZLAQR0 sometimes succeeds
+ . when ZLAHQR fails. ====
+*/
+
+ kbot = *info;
+
+ if (*n >= 49) {
+
+/*
+ ==== Larger matrices have enough subdiagonal scratch
+ . space to call ZLAQR0 directly. ====
+*/
+
+ zlaqr0_(&wantt, &wantz, n, ilo, &kbot, &h__[h_offset],
+ ldh, &w[1], ilo, ihi, &z__[z_offset], ldz, &work[
+ 1], lwork, info);
+
+ } else {
+
+/*
+ ==== Tiny matrices don't have enough subdiagonal
+ . scratch space to benefit from ZLAQR0. Hence,
+ . tiny matrices must be copied into a larger
+ . array before calling ZLAQR0. ====
+*/
+
+ zlacpy_("A", n, n, &h__[h_offset], ldh, hl, &c__49);
+ i__1 = *n + 1 + *n * 49 - 50;
+ hl[i__1].r = 0., hl[i__1].i = 0.;
+ i__1 = 49 - *n;
+ zlaset_("A", &c__49, &i__1, &c_b56, &c_b56, &hl[(*n + 1) *
+ 49 - 49], &c__49);
+ zlaqr0_(&wantt, &wantz, &c__49, ilo, &kbot, hl, &c__49, &
+ w[1], ilo, ihi, &z__[z_offset], ldz, workl, &
+ c__49, info);
+ if (wantt || *info != 0) {
+ zlacpy_("A", n, n, hl, &c__49, &h__[h_offset], ldh);
+ }
+ }
+ }
+ }
+
+/* ==== Clear out the trash, if necessary. ==== */
+
+ if ((wantt || *info != 0) && *n > 2) {
+ i__1 = *n - 2;
+ i__2 = *n - 2;
+ zlaset_("L", &i__1, &i__2, &c_b56, &c_b56, &h__[h_dim1 + 3], ldh);
+ }
+
+/*
+ ==== Ensure reported workspace size is backward-compatible with
+ . previous LAPACK versions. ====
+
+ Computing MAX
+*/
+ d__2 = (doublereal) max(1,*n), d__3 = work[1].r;
+ d__1 = max(d__2,d__3);
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ }
+
+/* ==== End of ZHSEQR ==== */
+
+ return 0;
+} /* zhseqr_ */
+
+/* Subroutine */ int zlabrd_(integer *m, integer *n, integer *nb,
+ doublecomplex *a, integer *lda, doublereal *d__, doublereal *e,
+ doublecomplex *tauq, doublecomplex *taup, doublecomplex *x, integer *
+ ldx, doublecomplex *y, integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex alpha;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *), zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *),
+ zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLABRD reduces the first NB rows and columns of a complex general
+ m by n matrix A to upper or lower real bidiagonal form by a unitary
+ transformation Q' * A * P, and returns the matrices X and Y which
+ are needed to apply the transformation to the unreduced part of A.
+
+ If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+ bidiagonal form.
+
+ This is an auxiliary routine called by ZGEBRD
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows in the matrix A.
+
+ N (input) INTEGER
+ The number of columns in the matrix A.
+
+ NB (input) INTEGER
+ The number of leading rows and columns of A to be reduced.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit, the first NB rows and columns of the matrix are
+ overwritten; the rest of the array is unchanged.
+ If m >= n, elements on and below the diagonal in the first NB
+ columns, with the array TAUQ, represent the unitary
+ matrix Q as a product of elementary reflectors; and
+ elements above the diagonal in the first NB rows, with the
+ array TAUP, represent the unitary matrix P as a product
+ of elementary reflectors.
+ If m < n, elements below the diagonal in the first NB
+ columns, with the array TAUQ, represent the unitary
+ matrix Q as a product of elementary reflectors, and
+ elements on and above the diagonal in the first NB rows,
+ with the array TAUP, represent the unitary matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) DOUBLE PRECISION array, dimension (NB)
+ The diagonal elements of the first NB rows and columns of
+ the reduced matrix. D(i) = A(i,i).
+
+ E (output) DOUBLE PRECISION array, dimension (NB)
+ The off-diagonal elements of the first NB rows and columns of
+ the reduced matrix.
+
+    TAUQ    (output) COMPLEX*16 array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX*16 array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the unitary matrix P. See Further Details.
+
+ X (output) COMPLEX*16 array, dimension (LDX,NB)
+ The m-by-nb matrix X required to update the unreduced part
+ of A.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. LDX >= max(1,M).
+
+ Y (output) COMPLEX*16 array, dimension (LDY,NB)
+ The n-by-nb matrix Y required to update the unreduced part
+ of A.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= max(1,N).
+
+ Further Details
+ ===============
+
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex
+ vectors.
+
+ If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
+ A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The elements of the vectors v and u together form the m-by-nb matrix
+ V and the nb-by-n matrix U' which are needed, with X and Y, to apply
+ the transformation to the unreduced part of the matrix, using a block
+ update of the form: A := A - V*Y' - X*U'.
+
+ The contents of A on exit are illustrated by the following examples
+ with nb = 2:
+
+ m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
+
+ ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 )
+ ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 )
+ ( v1 v2 a a a ) ( v1 1 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a ) ( v1 v2 a a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix which is unchanged,
+ vi denotes an element of the vector defining H(i), and ui an element
+ of the vector defining G(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
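+
+/*
+   Illustrative note, not part of LAPACK: after this routine returns, a
+   blocked caller such as ZGEBRD applies the update named in Further
+   Details to the unreduced trailing corner, schematically
+
+       A22 := A22 - V2 * Y2'     (one ZGEMM, "No transpose" x "Conjugate transpose")
+       A22 := A22 - X2 * U2      (one ZGEMM, "No transpose" x "No transpose")
+
+   where A22 is the (m-nb)-by-(n-nb) corner and V2, Y2, X2, U2 are the
+   parts of V, Y, X and U' that touch it.  Submatrix views are assumed to
+   be formed by the caller.
+*/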
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d__;
+ --e;
+ --tauq;
+ --taup;
+ x_dim1 = *ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (*m >= *n) {
+
+/* Reduce to upper bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:m,i) */
+
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[i__ + a_dim1], lda,
+ &y[i__ + y_dim1], ldy, &c_b57, &a[i__ + i__ * a_dim1], &
+ c__1);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &x[i__ + x_dim1], ldx,
+ &a[i__ * a_dim1 + 1], &c__1, &c_b57, &a[i__ + i__ *
+ a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+1:m,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1, &
+ tauq[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ if (i__ < *n) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ + (
+ i__ + 1) * a_dim1], lda, &a[i__ + i__ * a_dim1], &
+ c__1, &c_b56, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ a_dim1], lda, &a[i__ + i__ * a_dim1], &c__1, &c_b56, &
+ y[i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b57, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__ + 1;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &x[i__ +
+ x_dim1], ldx, &a[i__ + i__ * a_dim1], &c__1, &c_b56, &
+ y[i__ * y_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &z__1, &a[(i__ +
+ 1) * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &
+ c_b57, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ zscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+
+/* Update A(i,i+1:n) */
+
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ zlacgv_(&i__, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__, &z__1, &y[i__ + 1 +
+ y_dim1], ldy, &a[i__ + a_dim1], lda, &c_b57, &a[i__ +
+ (i__ + 1) * a_dim1], lda);
+ zlacgv_(&i__, &a[i__ + a_dim1], lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &x[i__ + x_dim1], ldx);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &z__1, &a[(i__ +
+ 1) * a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, &c_b57,
+ &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &x[i__ + x_dim1], ldx);
+
+/* Generate reflection P(i) to annihilate A(i,i+2:n) */
+
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + (i__ + 1) * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ zgemv_("No transpose", &i__2, &i__3, &c_b57, &a[i__ + 1 + (
+ i__ + 1) * a_dim1], lda, &a[i__ + (i__ + 1) * a_dim1],
+ lda, &c_b56, &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__;
+ zgemv_("Conjugate transpose", &i__2, &i__, &c_b57, &y[i__ + 1
+ + y_dim1], ldy, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b56, &x[i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__, &z__1, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ zgemv_("No transpose", &i__2, &i__3, &c_b57, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b56, &x[i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ zscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Reduce to lower bidiagonal form */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i,i:n) */
+
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &y[i__ + y_dim1], ldy,
+ &a[i__ + a_dim1], lda, &c_b57, &a[i__ + i__ * a_dim1],
+ lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &x[i__ + x_dim1], ldx);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &z__1, &a[i__ *
+ a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, &c_b57, &a[i__ +
+ i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &x[i__ + x_dim1], ldx);
+
+/* Generate reflection P(i) to annihilate A(i,i+1:n) */
+
+ i__2 = i__ + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__ + 1;
+/* Computing MIN */
+ i__3 = i__ + 1;
+ zlarfg_(&i__2, &alpha, &a[i__ + min(i__3,*n) * a_dim1], lda, &
+ taup[i__]);
+ i__2 = i__;
+ d__[i__2] = alpha.r;
+ if (i__ < *m) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute X(i+1:m,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__ + 1;
+ zgemv_("No transpose", &i__2, &i__3, &c_b57, &a[i__ + 1 + i__
+ * a_dim1], lda, &a[i__ + i__ * a_dim1], lda, &c_b56, &
+ x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &y[i__ +
+ y_dim1], ldy, &a[i__ + i__ * a_dim1], lda, &c_b56, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[i__ + 1 +
+ a_dim1], lda, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__ + 1;
+ zgemv_("No transpose", &i__2, &i__3, &c_b57, &a[i__ * a_dim1
+ + 1], lda, &a[i__ + i__ * a_dim1], lda, &c_b56, &x[
+ i__ * x_dim1 + 1], &c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], &c__1, &c_b57, &x[
+ i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *m - i__;
+ zscal_(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+
+/* Update A(i+1:m,i) */
+
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[i__ + 1 +
+ a_dim1], lda, &y[i__ + y_dim1], ldy, &c_b57, &a[i__ +
+ 1 + i__ * a_dim1], &c__1);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &y[i__ + y_dim1], ldy);
+ i__2 = *m - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__, &z__1, &x[i__ + 1 +
+ x_dim1], ldx, &a[i__ * a_dim1 + 1], &c__1, &c_b57, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+
+/* Generate reflection Q(i) to annihilate A(i+2:m,i) */
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *m - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*m) + i__ * a_dim1], &c__1,
+ &tauq[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute Y(i+1:n,i) */
+
+ i__2 = *m - i__;
+ i__3 = *n - i__;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + (i__ + 1) * a_dim1], lda, &a[i__ + 1 + i__ *
+ a_dim1], &c__1, &c_b56, &y[i__ + 1 + i__ * y_dim1], &
+ c__1);
+ i__2 = *m - i__;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &y[i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &y[i__ + 1 +
+ y_dim1], ldy, &y[i__ * y_dim1 + 1], &c__1, &c_b57, &y[
+ i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *m - i__;
+ zgemv_("Conjugate transpose", &i__2, &i__, &c_b57, &x[i__ + 1
+ + x_dim1], ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &y[i__ * y_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("Conjugate transpose", &i__, &i__2, &z__1, &a[(i__ + 1)
+ * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], &c__1, &
+ c_b57, &y[i__ + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - i__;
+ zscal_(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
+ } else {
+ i__2 = *n - i__ + 1;
+ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of ZLABRD */
+
+} /* zlabrd_ */
+
+/* Subroutine */ int zlacgv_(integer *n, doublecomplex *x, integer *incx)
+{
+ /* System generated locals */
+ integer i__1, i__2;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, ioff;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLACGV conjugates a complex vector of length N.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The length of the vector X. N >= 0.
+
+ X (input/output) COMPLEX*16 array, dimension
+ (1+(N-1)*abs(INCX))
+ On entry, the vector of length N to be conjugated.
+ On exit, X is overwritten with conjg(X).
+
+ INCX (input) INTEGER
+ The spacing between successive elements of X.
+
+ =====================================================================
+*/
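+
+/*
+   Illustrative note, not part of LAPACK: with column-major storage an
+   INCX equal to the leading dimension walks along a row, which is how
+   ZLABRD above conjugates row i of A, e.g.
+
+       zlacgv_(&len, &a[i__ + (i__ + 1) * a_dim1], lda);
+
+   where len = n - i is the number of entries of A(i,i+1:n) (names taken
+   from the ZLABRD call site above).
+*/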
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*incx == 1) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ d_cnjg(&z__1, &x[i__]);
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+/* L10: */
+ }
+ } else {
+ ioff = 1;
+ if (*incx < 0) {
+ ioff = 1 - (*n - 1) * *incx;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ioff;
+ d_cnjg(&z__1, &x[ioff]);
+ x[i__2].r = z__1.r, x[i__2].i = z__1.i;
+ ioff += *incx;
+/* L20: */
+ }
+ }
+ return 0;
+
+/* End of ZLACGV */
+
+} /* zlacgv_ */
+
+/* Subroutine */ int zlacp2_(char *uplo, integer *m, integer *n, doublereal *
+ a, integer *lda, doublecomplex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLACP2 copies all or part of a real two-dimensional matrix A to a
+ complex matrix B.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be copied to B.
+ = 'U': Upper triangular part
+ = 'L': Lower triangular part
+ Otherwise: All of the matrix A
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA,N)
+ The m by n matrix A. If UPLO = 'U', only the upper trapezium
+ is accessed; if UPLO = 'L', only the lower trapezium is
+ accessed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (output) COMPLEX*16 array, dimension (LDB,N)
+ On exit, B = A in the locations specified by UPLO.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4], b[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4], b[i__3].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4], b[i__3].i = 0.;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLACP2 */
+
+} /* zlacp2_ */
+
+/* Subroutine */ int zlacpy_(char *uplo, integer *m, integer *n,
+ doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLACPY copies all or part of a two-dimensional matrix A to another
+ matrix B.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be copied to B.
+ = 'U': Upper triangular part
+ = 'L': Lower triangular part
+ Otherwise: All of the matrix A
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The m by n matrix A. If UPLO = 'U', only the upper trapezium
+ is accessed; if UPLO = 'L', only the lower trapezium is
+ accessed.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ B (output) COMPLEX*16 array, dimension (LDB,N)
+ On exit, B = A in the locations specified by UPLO.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4].r, b[i__3].i = a[i__4].i;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4].r, b[i__3].i = a[i__4].i;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ i__4 = i__ + j * a_dim1;
+ b[i__3].r = a[i__4].r, b[i__3].i = a[i__4].i;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLACPY */
+
+} /* zlacpy_ */
+
+/* Subroutine */ int zlacrm_(integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublereal *b, integer *ldb, doublecomplex *c__,
+ integer *ldc, doublereal *rwork)
+{
+ /* System generated locals */
+ integer b_dim1, b_offset, a_dim1, a_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5;
+ doublereal d__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLACRM performs a very simple matrix-matrix multiplication:
+ C := A * B,
+ where A is M by N and complex; B is N by N and real;
+ C is M by N and complex.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A and of the matrix C.
+ M >= 0.
+
+ N (input) INTEGER
+ The number of columns and rows of the matrix B and
+ the number of columns of the matrix C.
+ N >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA, N)
+ A contains the M by N matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >=max(1,M).
+
+ B (input) DOUBLE PRECISION array, dimension (LDB, N)
+ B contains the N by N matrix B.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >=max(1,N).
+
+ C (output) COMPLEX*16 array, dimension (LDC, N)
+ C contains the M by N matrix C.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >=max(1,M).
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension (2*M*N)
+
+ =====================================================================
+
+
+ Quick return if possible.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --rwork;
+
+ /* Function Body */
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ rwork[(j - 1) * *m + i__] = a[i__3].r;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ l = *m * *n + 1;
+ dgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = l + (j - 1) * *m + i__ - 1;
+ c__[i__3].r = rwork[i__4], c__[i__3].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ rwork[(j - 1) * *m + i__] = d_imag(&a[i__ + j * a_dim1]);
+/* L50: */
+ }
+/* L60: */
+ }
+ dgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ d__1 = c__[i__4].r;
+ i__5 = l + (j - 1) * *m + i__ - 1;
+ z__1.r = d__1, z__1.i = rwork[i__5];
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+
+ return 0;
+
+/* End of ZLACRM */
+
+} /* zlacrm_ */
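+
+ /*
+    Illustrative sketch (not part of the original LAPACK source): ZLACRM
+    above forms C := A * B with complex A and real B by splitting A into
+    its real and imaginary parts and calling DGEMM twice, since
+    Re(C) = Re(A)*B and Im(C) = Im(A)*B.  The hypothetical helper below
+    shows the same idea with plain loops and column-major storage; the
+    function and argument names are invented for this sketch.
+ */
+ static void zlacrm_sketch(int m, int n, const double *ar, const double *ai,
+                           const double *b, double *cr, double *ci)
+ {
+     /* ar/ai: m-by-n real and imaginary parts of A; b: n-by-n real matrix;
+        cr/ci: m-by-n real and imaginary parts of C; all column-major. */
+     int i, j, l;
+     for (j = 0; j < n; ++j) {
+         for (i = 0; i < m; ++i) {
+             double sr = 0., si = 0.;
+             for (l = 0; l < n; ++l) {
+                 sr += ar[i + l * m] * b[l + j * n];  /* Re(C) = Re(A)*B */
+                 si += ai[i + l * m] * b[l + j * n];  /* Im(C) = Im(A)*B */
+             }
+             cr[i + j * m] = sr;
+             ci[i + j * m] = si;
+         }
+     }
+ }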
+
+/* Double Complex */ VOID zladiv_(doublecomplex * ret_val, doublecomplex *x,
+ doublecomplex *y)
+{
+ /* System generated locals */
+ doublereal d__1, d__2, d__3, d__4;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static doublereal zi, zr;
+ extern /* Subroutine */ int dladiv_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLADIV := X / Y, where X and Y are complex. The computation of X / Y
+ will not overflow on an intermediate step unless the result
+ overflows.
+
+ Arguments
+ =========
+
+ X (input) COMPLEX*16
+ Y (input) COMPLEX*16
+ The complex scalars X and Y.
+
+ =====================================================================
+*/
+
+
+ d__1 = x->r;
+ d__2 = d_imag(x);
+ d__3 = y->r;
+ d__4 = d_imag(y);
+ dladiv_(&d__1, &d__2, &d__3, &d__4, &zr, &zi);
+ z__1.r = zr, z__1.i = zi;
+ ret_val->r = z__1.r, ret_val->i = z__1.i;
+
+ return ;
+
+/* End of ZLADIV */
+
+} /* zladiv_ */
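+
+ /*
+    Illustrative sketch (not part of the original LAPACK source): ZLADIV
+    computes (a+bi)/(c+di) through DLADIV, which uses Smith's algorithm,
+    scaling by the ratio of the smaller to the larger of |c| and |d| so
+    that no intermediate quantity overflows unless the result itself does.
+    A hypothetical stand-alone version (fabs() comes from <math.h>, which
+    this translation unit already relies on):
+ */
+ static void zladiv_sketch(double a, double b, double c, double d,
+                           double *pr, double *pi)
+ {
+     double e, f;
+     if (fabs(d) <= fabs(c)) {
+         e = d / c;                /* |e| <= 1, so c + d*e stays bounded */
+         f = c + d * e;
+         *pr = (a + b * e) / f;    /* = (a*c + b*d) / (c*c + d*d) */
+         *pi = (b - a * e) / f;    /* = (b*c - a*d) / (c*c + d*d) */
+     } else {
+         e = c / d;
+         f = d + c * e;
+         *pr = (a * e + b) / f;
+         *pi = (b * e - a) / f;
+     }
+ }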
+
+/* Subroutine */ int zlaed0_(integer *qsiz, integer *n, doublereal *d__,
+ doublereal *e, doublecomplex *q, integer *ldq, doublecomplex *qstore,
+ integer *ldqs, doublereal *rwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, qstore_dim1, qstore_offset, i__1, i__2;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer i__, j, k, ll, iq, lgn, msd2, smm1, spm1, spm2;
+ static doublereal temp;
+ static integer curr, iperm;
+ extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *,
+ doublereal *, integer *);
+ static integer indxq, iwrem, iqptr, tlvls;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zlaed7_(integer *, integer *,
+ integer *, integer *, integer *, integer *, doublereal *,
+ doublecomplex *, integer *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, integer *, integer *,
+ doublereal *, doublecomplex *, doublereal *, integer *, integer *)
+ ;
+ static integer igivcl;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlacrm_(integer *, integer *, doublecomplex *,
+ integer *, doublereal *, integer *, doublecomplex *, integer *,
+ doublereal *);
+ static integer igivnm, submat, curprb, subpbs, igivpt;
+ extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *);
+ static integer curlvl, matsiz, iprmpt, smlsiz;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ Using the divide and conquer method, ZLAED0 computes all eigenvalues
+ of a symmetric tridiagonal matrix which is one diagonal block of
+ those from reducing a dense or band Hermitian matrix, and the
+ corresponding eigenvectors of the dense or band matrix.
+
+ Arguments
+ =========
+
+ QSIZ (input) INTEGER
+ The dimension of the unitary matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N if ICOMPQ = 1.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, the eigenvalues in ascending order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the off-diagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Q (input/output) COMPLEX*16 array, dimension (LDQ,N)
+ On entry, Q must contain a QSIZ x N matrix whose columns
+ are orthonormal. It is a part of the unitary matrix
+ that reduces the full dense Hermitian matrix to a
+ (reducible) symmetric tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ IWORK (workspace) INTEGER array,
+ the dimension of IWORK must be at least
+ 6 + 6*N + 5*N*lg N
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+
+ RWORK (workspace) DOUBLE PRECISION array,
+ dimension (1 + 3*N + 2*N*lg N + 3*N**2)
+ ( lg( N ) = smallest integer k
+ such that 2^k >= N )
+
+ QSTORE (workspace) COMPLEX*16 array, dimension (LDQS, N)
+ Used to store parts of
+ the eigenvector matrix when the updating matrix multiplies
+ take place.
+
+ LDQS (input) INTEGER
+ The leading dimension of the array QSTORE.
+ LDQS >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ =====================================================================
+
+ Warning: N could be as big as QSIZ!
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ qstore_dim1 = *ldqs;
+ qstore_offset = 1 + qstore_dim1;
+ qstore -= qstore_offset;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+/*
+ IF( ICOMPQ .LT. 0 .OR. ICOMPQ .GT. 2 ) THEN
+ INFO = -1
+ ELSE IF( ( ICOMPQ .EQ. 1 ) .AND. ( QSIZ .LT. MAX( 0, N ) ) )
+ $ THEN
+*/
+ if (*qsiz < max(0,*n)) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldq < max(1,*n)) {
+ *info = -6;
+ } else if (*ldqs < max(1,*n)) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLAED0", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ smlsiz = ilaenv_(&c__9, "ZLAED0", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+
+/*
+ Determine the size and placement of the submatrices, and save in
+ the leading elements of IWORK.
+*/
+
+ iwork[1] = *n;
+ subpbs = 1;
+ tlvls = 0;
+L10:
+ if (iwork[subpbs] > smlsiz) {
+ for (j = subpbs; j >= 1; --j) {
+ iwork[j * 2] = (iwork[j] + 1) / 2;
+ iwork[(j << 1) - 1] = iwork[j] / 2;
+/* L20: */
+ }
+ ++tlvls;
+ subpbs <<= 1;
+ goto L10;
+ }
+ i__1 = subpbs;
+ for (j = 2; j <= i__1; ++j) {
+ iwork[j] += iwork[j - 1];
+/* L30: */
+ }
+
+/*
+ Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1
+ using rank-1 modifications (cuts).
+*/
+
+ spm1 = subpbs - 1;
+ i__1 = spm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ submat = iwork[i__] + 1;
+ smm1 = submat - 1;
+ d__[smm1] -= (d__1 = e[smm1], abs(d__1));
+ d__[submat] -= (d__1 = e[smm1], abs(d__1));
+/* L40: */
+ }
+
+ indxq = (*n << 2) + 3;
+
+/*
+ Set up workspaces for eigenvalues only/accumulate new vectors
+ routine
+*/
+
+ temp = log((doublereal) (*n)) / log(2.);
+ lgn = (integer) temp;
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ iprmpt = indxq + *n + 1;
+ iperm = iprmpt + *n * lgn;
+ iqptr = iperm + *n * lgn;
+ igivpt = iqptr + *n + 2;
+ igivcl = igivpt + *n * lgn;
+
+ igivnm = 1;
+ iq = igivnm + (*n << 1) * lgn;
+/* Computing 2nd power */
+ i__1 = *n;
+ iwrem = iq + i__1 * i__1 + 1;
+/* Initialize pointers */
+ i__1 = subpbs;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ iwork[iprmpt + i__] = 1;
+ iwork[igivpt + i__] = 1;
+/* L50: */
+ }
+ iwork[iqptr] = 1;
+
+/*
+ Solve each submatrix eigenproblem at the bottom of the divide and
+ conquer tree.
+*/
+
+ curr = 0;
+ i__1 = spm1;
+ for (i__ = 0; i__ <= i__1; ++i__) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[1];
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 1] - iwork[i__];
+ }
+ ll = iq - 1 + iwork[iqptr + curr];
+ dsteqr_("I", &matsiz, &d__[submat], &e[submat], &rwork[ll], &matsiz, &
+ rwork[1], info);
+ zlacrm_(qsiz, &matsiz, &q[submat * q_dim1 + 1], ldq, &rwork[ll], &
+ matsiz, &qstore[submat * qstore_dim1 + 1], ldqs, &rwork[iwrem]
+ );
+/* Computing 2nd power */
+ i__2 = matsiz;
+ iwork[iqptr + curr + 1] = iwork[iqptr + curr] + i__2 * i__2;
+ ++curr;
+ if (*info > 0) {
+ *info = submat * (*n + 1) + submat + matsiz - 1;
+ return 0;
+ }
+ k = 1;
+ i__2 = iwork[i__ + 1];
+ for (j = submat; j <= i__2; ++j) {
+ iwork[indxq + j] = k;
+ ++k;
+/* L60: */
+ }
+/* L70: */
+ }
+
+/*
+ Successively merge eigensystems of adjacent submatrices
+ into eigensystem for the corresponding larger matrix.
+
+ while ( SUBPBS > 1 )
+*/
+
+ curlvl = 1;
+L80:
+ if (subpbs > 1) {
+ spm2 = subpbs - 2;
+ i__1 = spm2;
+ for (i__ = 0; i__ <= i__1; i__ += 2) {
+ if (i__ == 0) {
+ submat = 1;
+ matsiz = iwork[2];
+ msd2 = iwork[1];
+ curprb = 0;
+ } else {
+ submat = iwork[i__] + 1;
+ matsiz = iwork[i__ + 2] - iwork[i__];
+ msd2 = matsiz / 2;
+ ++curprb;
+ }
+
+/*
+ Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2)
+ into an eigensystem of size MATSIZ. ZLAED7 handles the case
+ when the eigenvectors of a full or band Hermitian matrix (which
+ was reduced to tridiagonal form) are desired.
+
+ I am free to use Q as a valuable working space until Loop 150.
+*/
+
+ zlaed7_(&matsiz, &msd2, qsiz, &tlvls, &curlvl, &curprb, &d__[
+ submat], &qstore[submat * qstore_dim1 + 1], ldqs, &e[
+ submat + msd2 - 1], &iwork[indxq + submat], &rwork[iq], &
+ iwork[iqptr], &iwork[iprmpt], &iwork[iperm], &iwork[
+ igivpt], &iwork[igivcl], &rwork[igivnm], &q[submat *
+ q_dim1 + 1], &rwork[iwrem], &iwork[subpbs + 1], info);
+ if (*info > 0) {
+ *info = submat * (*n + 1) + submat + matsiz - 1;
+ return 0;
+ }
+ iwork[i__ / 2 + 1] = iwork[i__ + 2];
+/* L90: */
+ }
+ subpbs /= 2;
+ ++curlvl;
+ goto L80;
+ }
+
+/*
+ end while
+
+ Re-merge the eigenvalues/vectors which were deflated at the final
+ merge step.
+*/
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ j = iwork[indxq + i__];
+ rwork[i__] = d__[j];
+ zcopy_(qsiz, &qstore[j * qstore_dim1 + 1], &c__1, &q[i__ * q_dim1 + 1]
+ , &c__1);
+/* L100: */
+ }
+ dcopy_(n, &rwork[1], &c__1, &d__[1], &c__1);
+
+ return 0;
+
+/* End of ZLAED0 */
+
+} /* zlaed0_ */
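+
+ /*
+    Illustrative sketch (not part of the original LAPACK source): the
+    workspace pointers set up in ZLAED0 above are sized with lg(N), the
+    smallest integer k such that 2**k >= N; the routine derives it from
+    log(N)/log(2) and then bumps the result (twice, to be safe against
+    rounding in either direction).  A hypothetical integer-only equivalent:
+ */
+ static int zlaed0_ceil_log2_sketch(int n)
+ {
+     int k = 0;
+     long p = 1;
+     while (p < n) {               /* smallest k with 2**k >= n */
+         p <<= 1;
+         ++k;
+     }
+     return k;
+ }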
+
+/* Subroutine */ int zlaed7_(integer *n, integer *cutpnt, integer *qsiz,
+ integer *tlvls, integer *curlvl, integer *curpbm, doublereal *d__,
+ doublecomplex *q, integer *ldq, doublereal *rho, integer *indxq,
+ doublereal *qstore, integer *qptr, integer *prmptr, integer *perm,
+ integer *givptr, integer *givcol, doublereal *givnum, doublecomplex *
+ work, doublereal *rwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, i__1, i__2;
+
+ /* Local variables */
+ static integer i__, k, n1, n2, iq, iw, iz, ptr, indx, curr, indxc, indxp;
+ extern /* Subroutine */ int dlaed9_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, integer *, integer *),
+ zlaed8_(integer *, integer *, integer *, doublecomplex *, integer
+ *, doublereal *, doublereal *, integer *, doublereal *,
+ doublereal *, doublecomplex *, integer *, doublereal *, integer *,
+ integer *, integer *, integer *, integer *, integer *,
+ doublereal *, integer *), dlaeda_(integer *, integer *, integer *,
+ integer *, integer *, integer *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, doublereal *,
+ integer *);
+ static integer idlmda;
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), xerbla_(char *, integer *), zlacrm_(integer *, integer *, doublecomplex *, integer *,
+ doublereal *, integer *, doublecomplex *, integer *, doublereal *
+ );
+ static integer coltyp;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLAED7 computes the updated eigensystem of a diagonal
+ matrix after modification by a rank-one symmetric matrix. This
+ routine is used only for the eigenproblem which requires all
+ eigenvalues and optionally eigenvectors of a dense or banded
+ Hermitian matrix that has been reduced to tridiagonal form.
+
+ T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out)
+
+ where Z = Q'u, u is a vector of length N with ones in the
+ CUTPNT-th and (CUTPNT + 1)-th elements and zeros elsewhere.
+
+ The eigenvectors of the original matrix are stored in Q, and the
+ eigenvalues are in D. The algorithm consists of three stages:
+
+ The first stage consists of deflating the size of the problem
+ when there are multiple eigenvalues or if there is a zero in
+ the Z vector. For each such occurrence the dimension of the
+ secular equation problem is reduced by one. This stage is
+ performed by the routine DLAED2.
+
+ The second stage consists of calculating the updated
+ eigenvalues. This is done by finding the roots of the secular
+ equation via the routine DLAED4 (as called by SLAED3).
+ This routine also calculates the eigenvectors of the current
+ problem.
+
+ The final stage consists of computing the updated eigenvectors
+ directly using the updated eigenvalues. The eigenvectors for
+ the current problem are multiplied with the eigenvectors from
+ the overall problem.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ CUTPNT (input) INTEGER
+ Contains the location of the last eigenvalue in the leading
+ sub-matrix. min(1,N) <= CUTPNT <= N.
+
+ QSIZ (input) INTEGER
+ The dimension of the unitary matrix used to reduce
+ the full matrix to tridiagonal form. QSIZ >= N.
+
+ TLVLS (input) INTEGER
+ The total number of merging levels in the overall divide and
+ conquer tree.
+
+ CURLVL (input) INTEGER
+ The current level in the overall merge routine,
+ 0 <= curlvl <= tlvls.
+
+ CURPBM (input) INTEGER
+ The current problem in the current level in the overall
+ merge routine (counting from upper left to lower right).
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the eigenvalues of the rank-1-perturbed matrix.
+ On exit, the eigenvalues of the repaired matrix.
+
+ Q (input/output) COMPLEX*16 array, dimension (LDQ,N)
+ On entry, the eigenvectors of the rank-1-perturbed matrix.
+ On exit, the eigenvectors of the repaired tridiagonal matrix.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ RHO (input) DOUBLE PRECISION
+ Contains the subdiagonal element used to create the rank-1
+ modification.
+
+ INDXQ (output) INTEGER array, dimension (N)
+ This contains the permutation which will reintegrate the
+ subproblem just solved back into sorted order,
+ i.e., D( INDXQ( I = 1, N ) ) will be in ascending order.
+
+ IWORK (workspace) INTEGER array, dimension (4*N)
+
+ RWORK (workspace) DOUBLE PRECISION array,
+ dimension (3*N+2*QSIZ*N)
+
+ WORK (workspace) COMPLEX*16 array, dimension (QSIZ*N)
+
+ QSTORE (input/output) DOUBLE PRECISION array, dimension (N**2+1)
+ Stores eigenvectors of submatrices encountered during
+ divide and conquer, packed together. QPTR points to
+ beginning of the submatrices.
+
+ QPTR (input/output) INTEGER array, dimension (N+2)
+ List of indices pointing to beginning of submatrices stored
+ in QSTORE. The submatrices are numbered starting at the
+ bottom left of the divide and conquer tree, from left to
+ right and bottom to top.
+
+ PRMPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in PERM a
+ level's permutation is stored. PRMPTR(i+1) - PRMPTR(i)
+ indicates the size of the permutation and also the size of
+ the full, non-deflated problem.
+
+ PERM (input) INTEGER array, dimension (N lg N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (input) INTEGER array, dimension (N lg N)
+ Contains a list of pointers which indicate where in GIVCOL a
+ level's Givens rotations are stored. GIVPTR(i+1) - GIVPTR(i)
+ indicates the number of Givens rotations.
+
+ GIVCOL (input) INTEGER array, dimension (2, N lg N)
+ Each pair of numbers indicates a pair of columns involved
+ in a Givens rotation.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension (2, N lg N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: if INFO = 1, an eigenvalue did not converge
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --indxq;
+ --qstore;
+ --qptr;
+ --prmptr;
+ --perm;
+ --givptr;
+ givcol -= 3;
+ givnum -= 3;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+/*
+ IF( ICOMPQ.LT.0 .OR. ICOMPQ.GT.1 ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 ) THEN
+*/
+ if (*n < 0) {
+ *info = -1;
+ } else if (min(1,*n) > *cutpnt || *n < *cutpnt) {
+ *info = -2;
+ } else if (*qsiz < *n) {
+ *info = -3;
+ } else if (*ldq < max(1,*n)) {
+ *info = -9;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLAED7", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/*
+ The following values are for bookkeeping purposes only. They are
+ integer pointers which indicate the portion of the workspace
+ used by a particular array in DLAED2 and SLAED3.
+*/
+
+ iz = 1;
+ idlmda = iz + *n;
+ iw = idlmda + *n;
+ iq = iw + *n;
+
+ indx = 1;
+ indxc = indx + *n;
+ coltyp = indxc + *n;
+ indxp = coltyp + *n;
+
+/*
+ Form the z-vector which consists of the last row of Q_1 and the
+ first row of Q_2.
+*/
+
+ ptr = pow_ii(&c__2, tlvls) + 1;
+ i__1 = *curlvl - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *tlvls - i__;
+ ptr += pow_ii(&c__2, &i__2);
+/* L10: */
+ }
+ curr = ptr + *curpbm;
+ dlaeda_(n, tlvls, curlvl, curpbm, &prmptr[1], &perm[1], &givptr[1], &
+ givcol[3], &givnum[3], &qstore[1], &qptr[1], &rwork[iz], &rwork[
+ iz + *n], info);
+
+/*
+ When solving the final problem, we no longer need the stored data,
+ so we will overwrite the data from this level onto the previously
+ used storage space.
+*/
+
+ if (*curlvl == *tlvls) {
+ qptr[curr] = 1;
+ prmptr[curr] = 1;
+ givptr[curr] = 1;
+ }
+
+/* Sort and Deflate eigenvalues. */
+
+ zlaed8_(&k, n, qsiz, &q[q_offset], ldq, &d__[1], rho, cutpnt, &rwork[iz],
+ &rwork[idlmda], &work[1], qsiz, &rwork[iw], &iwork[indxp], &iwork[
+ indx], &indxq[1], &perm[prmptr[curr]], &givptr[curr + 1], &givcol[
+ (givptr[curr] << 1) + 1], &givnum[(givptr[curr] << 1) + 1], info);
+ prmptr[curr + 1] = prmptr[curr] + *n;
+ givptr[curr + 1] += givptr[curr];
+
+/* Solve Secular Equation. */
+
+ if (k != 0) {
+ dlaed9_(&k, &c__1, &k, n, &d__[1], &rwork[iq], &k, rho, &rwork[idlmda]
+ , &rwork[iw], &qstore[qptr[curr]], &k, info);
+ zlacrm_(qsiz, &k, &work[1], qsiz, &qstore[qptr[curr]], &k, &q[
+ q_offset], ldq, &rwork[iq]);
+/* Computing 2nd power */
+ i__1 = k;
+ qptr[curr + 1] = qptr[curr] + i__1 * i__1;
+ if (*info != 0) {
+ return 0;
+ }
+
+/* Prepare the INDXQ sorting permutation. */
+
+ n1 = k;
+ n2 = *n - k;
+ dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &indxq[1]);
+ } else {
+ qptr[curr + 1] = qptr[curr];
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ indxq[i__] = i__;
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLAED7 */
+
+} /* zlaed7_ */
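+
+ /*
+    Illustrative sketch (not part of the original LAPACK source): the
+    "secular equation" solved inside ZLAED7 (via DLAED9/DLAED4) for the
+    rank-one update D + RHO*Z*Z' is
+        f(lambda) = 1 + RHO * sum_j Z(j)**2 / (D(j) - lambda) = 0,
+    with one root between each pair of consecutive non-deflated D values.
+    A hypothetical evaluator of f and its derivative, as a root finder
+    would use it (names invented for this sketch):
+ */
+ static void zlaed7_secular_sketch(int k, const double *d, const double *z,
+                                   double rho, double lambda,
+                                   double *f, double *fprime)
+ {
+     int j;
+     *f = 1.;
+     *fprime = 0.;
+     for (j = 0; j < k; ++j) {
+         double t = z[j] / (d[j] - lambda);
+         *f += rho * z[j] * t;     /* rho * z(j)**2 / (d(j) - lambda)      */
+         *fprime += rho * t * t;   /* monotone on each interval when rho>0 */
+     }
+ }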
+
+/* Subroutine */ int zlaed8_(integer *k, integer *n, integer *qsiz,
+ doublecomplex *q, integer *ldq, doublereal *d__, doublereal *rho,
+ integer *cutpnt, doublereal *z__, doublereal *dlamda, doublecomplex *
+ q2, integer *ldq2, doublereal *w, integer *indxp, integer *indx,
+ integer *indxq, integer *perm, integer *givptr, integer *givcol,
+ doublereal *givnum, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, q2_dim1, q2_offset, i__1;
+ doublereal d__1;
+
+ /* Local variables */
+ static doublereal c__;
+ static integer i__, j;
+ static doublereal s, t;
+ static integer k2, n1, n2, jp, n1p1;
+ static doublereal eps, tau, tol;
+ static integer jlam, imax, jmax;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *), dcopy_(integer *, doublereal *, integer *, doublereal
+ *, integer *), zdrot_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublereal *, doublereal *), zcopy_(
+ integer *, doublecomplex *, integer *, doublecomplex *, integer *)
+ ;
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *,
+ integer *, integer *, integer *), xerbla_(char *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ ZLAED8 merges the two sets of eigenvalues together into a single
+ sorted set. Then it tries to deflate the size of the problem.
+ There are two ways in which deflation can occur: when two or more
+ eigenvalues are close together or if there is a tiny element in the
+ Z vector. For each such occurrence the order of the related secular
+ equation problem is reduced by one.
+
+ Arguments
+ =========
+
+ K (output) INTEGER
+ Contains the number of non-deflated eigenvalues.
+ This is the order of the related secular equation.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ QSIZ (input) INTEGER
+ The dimension of the unitary matrix used to reduce
+ the dense or band matrix to tridiagonal form.
+ QSIZ >= N if ICOMPQ = 1.
+
+ Q (input/output) COMPLEX*16 array, dimension (LDQ,N)
+ On entry, Q contains the eigenvectors of the partially solved
+ system which has been previously updated in matrix
+ multiplies with other partially solved eigensystems.
+ On exit, Q contains the trailing (N-K) updated eigenvectors
+ (those which were deflated) in its last N-K columns.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max( 1, N ).
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, D contains the eigenvalues of the two submatrices to
+ be combined. On exit, D contains the trailing (N-K) updated
+ eigenvalues (those which were deflated) sorted into increasing
+ order.
+
+ RHO (input/output) DOUBLE PRECISION
+ Contains the off diagonal element associated with the rank-1
+ cut which originally split the two submatrices which are now
+ being recombined. RHO is modified during the computation to
+ the value required by DLAED3.
+
+ CUTPNT (input) INTEGER
+ Contains the location of the last eigenvalue in the leading
+ sub-matrix. MIN(1,N) <= CUTPNT <= N.
+
+ Z (input) DOUBLE PRECISION array, dimension (N)
+ On input this vector contains the updating vector (the last
+ row of the first sub-eigenvector matrix and the first row of
+ the second sub-eigenvector matrix). The contents of Z are
+ destroyed during the updating process.
+
+ DLAMDA (output) DOUBLE PRECISION array, dimension (N)
+ Contains a copy of the first K eigenvalues which will be used
+ by DLAED3 to form the secular equation.
+
+ Q2 (output) COMPLEX*16 array, dimension (LDQ2,N)
+ If ICOMPQ = 0, Q2 is not referenced. Otherwise,
+ Contains a copy of the first K eigenvectors which will be used
+ by DLAED7 in a matrix multiply (DGEMM) to update the new
+ eigenvectors.
+
+ LDQ2 (input) INTEGER
+ The leading dimension of the array Q2. LDQ2 >= max( 1, N ).
+
+ W (output) DOUBLE PRECISION array, dimension (N)
+ This will hold the first k values of the final
+ deflation-altered z-vector and will be passed to DLAED3.
+
+ INDXP (workspace) INTEGER array, dimension (N)
+ This will contain the permutation used to place deflated
+ values of D at the end of the array. On output INDXP(1:K)
+ points to the nondeflated D-values and INDXP(K+1:N)
+ points to the deflated eigenvalues.
+
+ INDX (workspace) INTEGER array, dimension (N)
+ This will contain the permutation used to sort the contents of
+ D into ascending order.
+
+ INDXQ (input) INTEGER array, dimension (N)
+ This contains the permutation which separately sorts the two
+ sub-problems in D into ascending order. Note that elements in
+ the second half of this permutation must first have CUTPNT
+ added to their values in order to be accurate.
+
+ PERM (output) INTEGER array, dimension (N)
+ Contains the permutations (from deflation and sorting) to be
+ applied to each eigenblock.
+
+ GIVPTR (output) INTEGER
+ Contains the number of Givens rotations which took place in
+ this subproblem.
+
+ GIVCOL (output) INTEGER array, dimension (2, N)
+ Each pair of numbers indicates a pair of columns involved
+ in a Givens rotation.
+
+ GIVNUM (output) DOUBLE PRECISION array, dimension (2, N)
+ Each number indicates the S value to be used in the
+ corresponding Givens rotation.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+ --d__;
+ --z__;
+ --dlamda;
+ q2_dim1 = *ldq2;
+ q2_offset = 1 + q2_dim1;
+ q2 -= q2_offset;
+ --w;
+ --indxp;
+ --indx;
+ --indxq;
+ --perm;
+ givcol -= 3;
+ givnum -= 3;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -2;
+ } else if (*qsiz < *n) {
+ *info = -3;
+ } else if (*ldq < max(1,*n)) {
+ *info = -5;
+ } else if (*cutpnt < min(1,*n) || *cutpnt > *n) {
+ *info = -8;
+ } else if (*ldq2 < max(1,*n)) {
+ *info = -12;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLAED8", &i__1);
+ return 0;
+ }
+
+/*
+ Need to initialize GIVPTR to 0 here in case of quick exit
+ to prevent unspecified behavior (usually a segfault)
+ when the IWORK array on entry to *stedc is not zeroed
+ (or at least the IWORK entries used in *laed7 for GIVPTR).
+*/
+
+ *givptr = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ n1 = *cutpnt;
+ n2 = *n - n1;
+ n1p1 = n1 + 1;
+
+ if (*rho < 0.) {
+ dscal_(&n2, &c_b1276, &z__[n1p1], &c__1);
+ }
+
+/* Normalize z so that norm(z) = 1 */
+
+ t = 1. / sqrt(2.);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ indx[j] = j;
+/* L10: */
+ }
+ dscal_(n, &t, &z__[1], &c__1);
+ *rho = (d__1 = *rho * 2., abs(d__1));
+
+/* Sort the eigenvalues into increasing order */
+
+ i__1 = *n;
+ for (i__ = *cutpnt + 1; i__ <= i__1; ++i__) {
+ indxq[i__] += *cutpnt;
+/* L20: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlamda[i__] = d__[indxq[i__]];
+ w[i__] = z__[indxq[i__]];
+/* L30: */
+ }
+ i__ = 1;
+ j = *cutpnt + 1;
+ dlamrg_(&n1, &n2, &dlamda[1], &c__1, &c__1, &indx[1]);
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ d__[i__] = dlamda[indx[i__]];
+ z__[i__] = w[indx[i__]];
+/* L40: */
+ }
+
+/* Calculate the allowable deflation tolerance */
+
+ imax = idamax_(n, &z__[1], &c__1);
+ jmax = idamax_(n, &d__[1], &c__1);
+ eps = EPSILON;
+ tol = eps * 8. * (d__1 = d__[jmax], abs(d__1));
+
+/*
+ If the rank-1 modifier is small enough, no more needs to be done
+ -- except to reorganize Q so that its columns correspond with the
+ elements in D.
+*/
+
+ if (*rho * (d__1 = z__[imax], abs(d__1)) <= tol) {
+ *k = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ perm[j] = indxq[indx[j]];
+ zcopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1 + 1]
+ , &c__1);
+/* L50: */
+ }
+ zlacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq);
+ return 0;
+ }
+
+/*
+ If there are multiple eigenvalues then the problem deflates. Here
+ the number of equal eigenvalues is found. As each equal
+ eigenvalue is found, an elementary reflector is computed to rotate
+ the corresponding eigensubspace so that the corresponding
+ components of Z are zero in this new basis.
+*/
+
+ *k = 0;
+ k2 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (*rho * (d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ if (j == *n) {
+ goto L100;
+ }
+ } else {
+ jlam = j;
+ goto L70;
+ }
+/* L60: */
+ }
+L70:
+ ++j;
+ if (j > *n) {
+ goto L90;
+ }
+ if (*rho * (d__1 = z__[j], abs(d__1)) <= tol) {
+
+/* Deflate due to small z component. */
+
+ --k2;
+ indxp[k2] = j;
+ } else {
+
+/* Check if eigenvalues are close enough to allow deflation. */
+
+ s = z__[jlam];
+ c__ = z__[j];
+
+/*
+ Find sqrt(a**2+b**2) without overflow or
+ destructive underflow.
+*/
+
+ tau = dlapy2_(&c__, &s);
+ t = d__[j] - d__[jlam];
+ c__ /= tau;
+ s = -s / tau;
+ if ((d__1 = t * c__ * s, abs(d__1)) <= tol) {
+
+/* Deflation is possible. */
+
+ z__[j] = tau;
+ z__[jlam] = 0.;
+
+/* Record the appropriate Givens rotation */
+
+ ++(*givptr);
+ givcol[(*givptr << 1) + 1] = indxq[indx[jlam]];
+ givcol[(*givptr << 1) + 2] = indxq[indx[j]];
+ givnum[(*givptr << 1) + 1] = c__;
+ givnum[(*givptr << 1) + 2] = s;
+ zdrot_(qsiz, &q[indxq[indx[jlam]] * q_dim1 + 1], &c__1, &q[indxq[
+ indx[j]] * q_dim1 + 1], &c__1, &c__, &s);
+ t = d__[jlam] * c__ * c__ + d__[j] * s * s;
+ d__[j] = d__[jlam] * s * s + d__[j] * c__ * c__;
+ d__[jlam] = t;
+ --k2;
+ i__ = 1;
+L80:
+ if (k2 + i__ <= *n) {
+ if (d__[jlam] < d__[indxp[k2 + i__]]) {
+ indxp[k2 + i__ - 1] = indxp[k2 + i__];
+ indxp[k2 + i__] = jlam;
+ ++i__;
+ goto L80;
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ } else {
+ indxp[k2 + i__ - 1] = jlam;
+ }
+ jlam = j;
+ } else {
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+ jlam = j;
+ }
+ }
+ goto L70;
+L90:
+
+/* Record the last eigenvalue. */
+
+ ++(*k);
+ w[*k] = z__[jlam];
+ dlamda[*k] = d__[jlam];
+ indxp[*k] = jlam;
+
+L100:
+
+/*
+ Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+ and Q2 respectively. The eigenvalues/vectors which were not
+ deflated go into the first K slots of DLAMDA and Q2 respectively,
+ while those which were deflated go into the last N - K slots.
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ jp = indxp[j];
+ dlamda[j] = d__[jp];
+ perm[j] = indxq[indx[jp]];
+ zcopy_(qsiz, &q[perm[j] * q_dim1 + 1], &c__1, &q2[j * q2_dim1 + 1], &
+ c__1);
+/* L110: */
+ }
+
+/*
+ The deflated eigenvalues and their corresponding vectors go back
+ into the last N - K slots of D and Q respectively.
+*/
+
+ if (*k < *n) {
+ i__1 = *n - *k;
+ dcopy_(&i__1, &dlamda[*k + 1], &c__1, &d__[*k + 1], &c__1);
+ i__1 = *n - *k;
+ zlacpy_("A", qsiz, &i__1, &q2[(*k + 1) * q2_dim1 + 1], ldq2, &q[(*k +
+ 1) * q_dim1 + 1], ldq);
+ }
+
+ return 0;
+
+/* End of ZLAED8 */
+
+} /* zlaed8_ */
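+
+ /*
+    Illustrative sketch (not part of the original LAPACK source): the
+    second deflation path in ZLAED8 above rotates a pair of nearly equal
+    eigenvalues so that one of the corresponding Z components becomes
+    exactly zero; the pair decouples whenever the coupling term |t*c*s|
+    introduced by the rotation is below the tolerance.  A hypothetical
+    scalar version of that 2-by-2 step (sqrt()/fabs() come from <math.h>;
+    the real code uses dlapy2_ to guard the hypotenuse against overflow):
+ */
+ static int zlaed8_deflate_pair_sketch(double *di, double *dj,
+                                       double *zi, double *zj, double tol)
+ {
+     /* Returns 1 and applies the rotation if the pair deflates, else 0. */
+     double s = *zi, c = *zj;
+     double tau = sqrt(c * c + s * s);
+     double t = *dj - *di;
+     double cc = c / tau, ss = -s / tau;
+     if (fabs(t * cc * ss) > tol) {
+         return 0;                       /* not close enough to deflate  */
+     }
+     *zj = tau;                          /* rotated z-pair is (0, tau)   */
+     *zi = 0.;
+     t   = *di * cc * cc + *dj * ss * ss;
+     *dj = *di * ss * ss + *dj * cc * cc;
+     *di = t;                            /* rotated diagonal entries     */
+     return 1;
+ }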
+
+/* Subroutine */ int zlahqr_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh,
+ doublecomplex *w, integer *iloz, integer *ihiz, doublecomplex *z__,
+ integer *ldz, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+ doublecomplex z__1, z__2, z__3, z__4, z__5, z__6, z__7;
+
+ /* Local variables */
+ static integer i__, j, k, l, m;
+ static doublereal s;
+ static doublecomplex t, u, v[2], x, y;
+ static integer i1, i2;
+ static doublecomplex t1;
+ static doublereal t2;
+ static doublecomplex v2;
+ static doublereal aa, ab, ba, bb, h10;
+ static doublecomplex h11;
+ static doublereal h21;
+ static doublecomplex h22, sc;
+ static integer nh, nz;
+ static doublereal sx;
+ static integer jhi;
+ static doublecomplex h11s;
+ static integer jlo, its;
+ static doublereal ulp;
+ static doublecomplex sum;
+ static doublereal tst;
+ static doublecomplex temp;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *);
+ static doublereal rtemp;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), dlabad_(doublereal *, doublereal *);
+
+ static doublereal safmin, safmax;
+ extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *);
+ extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *,
+ doublecomplex *);
+ static doublereal smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLAHQR is an auxiliary routine called by ZHSEQR to update the
+ eigenvalues and Schur decomposition already computed by ZHSEQR, by
+ dealing with the Hessenberg submatrix in rows and columns ILO to
+ IHI.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows and
+ columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless ILO = 1).
+ ZLAHQR works primarily with the Hessenberg submatrix in rows
+ and columns ILO to IHI, but applies transformations to all of
+ H if WANTT is .TRUE..
+ 1 <= ILO <= max(1,IHI); IHI <= N.
+
+ H (input/output) COMPLEX*16 array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO is zero and if WANTT is .TRUE., then H
+ is upper triangular in rows and columns ILO:IHI. If INFO
+ is zero and if WANTT is .FALSE., then the contents of H
+ are unspecified on exit. The output state of H in case
+ INFO is positive is described below under INFO.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH >= max(1,N).
+
+ W (output) COMPLEX*16 array, dimension (N)
+ The computed eigenvalues ILO to IHI are stored in the
+ corresponding elements of W. If WANTT is .TRUE., the
+ eigenvalues are stored in the same order as on the diagonal
+ of the Schur form returned in H, with W(i) = H(i,i).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE..
+ 1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,N)
+ If WANTZ is .TRUE., on entry Z must contain the current
+ matrix Z of transformations accumulated by ZHSEQR, and on
+ exit Z has been updated; transformations are applied only to
+ the submatrix Z(ILOZ:IHIZ,ILO:IHI).
+ If WANTZ is .FALSE., Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, ZLAHQR failed to compute all the
+ eigenvalues ILO to IHI in a total of 30 iterations
+ per eigenvalue; elements i+1:ihi of W contain
+ those eigenvalues which have been successfully
+ computed.
+
+ If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the
+ eigenvalues of the upper Hessenberg matrix
+ rows and columns ILO through INFO of the final,
+ output value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+ (*) (initial value of H)*U = U*(final value of H)
+ where U is an orthogonal matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+ (final value of Z) = (initial value of Z)*U
+ where U is the orthogonal matrix in (*)
+ (regardless of the value of WANTT.)
+
+ Further Details
+ ===============
+
+ 02-96 Based on modifications by
+ David Day, Sandia National Laboratory, USA
+
+ 12-04 Further modifications by
+ Ralph Byers, University of Kansas, USA
+ This is a modified version of ZLAHQR from LAPACK version 3.0.
+ It is (1) more robust against overflow and underflow and
+ (2) adopts the more conservative Ahues & Tisseur stopping
+ criterion (LAWN 122, 1997).
+
+ =========================================================
+*/
+
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+
+ /* Function Body */
+ *info = 0;
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*ilo == *ihi) {
+ i__1 = *ilo;
+ i__2 = *ilo + *ilo * h_dim1;
+ w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i;
+ return 0;
+ }
+
+/* ==== clear out the trash ==== */
+ i__1 = *ihi - 3;
+ for (j = *ilo; j <= i__1; ++j) {
+ i__2 = j + 2 + j * h_dim1;
+ h__[i__2].r = 0., h__[i__2].i = 0.;
+ i__2 = j + 3 + j * h_dim1;
+ h__[i__2].r = 0., h__[i__2].i = 0.;
+/* L10: */
+ }
+ if (*ilo <= *ihi - 2) {
+ i__1 = *ihi + (*ihi - 2) * h_dim1;
+ h__[i__1].r = 0., h__[i__1].i = 0.;
+ }
+/* ==== ensure that subdiagonal entries are real ==== */
+ if (*wantt) {
+ jlo = 1;
+ jhi = *n;
+ } else {
+ jlo = *ilo;
+ jhi = *ihi;
+ }
+ i__1 = *ihi;
+ for (i__ = *ilo + 1; i__ <= i__1; ++i__) {
+ if (d_imag(&h__[i__ + (i__ - 1) * h_dim1]) != 0.) {
+/*
+ ==== The following redundant normalization
+ . avoids problems with both gradual and
+ . sudden underflow in ABS(H(I,I-1)) ====
+*/
+ i__2 = i__ + (i__ - 1) * h_dim1;
+ i__3 = i__ + (i__ - 1) * h_dim1;
+ d__3 = (d__1 = h__[i__3].r, abs(d__1)) + (d__2 = d_imag(&h__[i__
+ + (i__ - 1) * h_dim1]), abs(d__2));
+ z__1.r = h__[i__2].r / d__3, z__1.i = h__[i__2].i / d__3;
+ sc.r = z__1.r, sc.i = z__1.i;
+ d_cnjg(&z__2, &sc);
+ d__1 = z_abs(&sc);
+ z__1.r = z__2.r / d__1, z__1.i = z__2.i / d__1;
+ sc.r = z__1.r, sc.i = z__1.i;
+ i__2 = i__ + (i__ - 1) * h_dim1;
+ d__1 = z_abs(&h__[i__ + (i__ - 1) * h_dim1]);
+ h__[i__2].r = d__1, h__[i__2].i = 0.;
+ i__2 = jhi - i__ + 1;
+ zscal_(&i__2, &sc, &h__[i__ + i__ * h_dim1], ldh);
+/* Computing MIN */
+ i__3 = jhi, i__4 = i__ + 1;
+ i__2 = min(i__3,i__4) - jlo + 1;
+ d_cnjg(&z__1, &sc);
+ zscal_(&i__2, &z__1, &h__[jlo + i__ * h_dim1], &c__1);
+ if (*wantz) {
+ i__2 = *ihiz - *iloz + 1;
+ d_cnjg(&z__1, &sc);
+ zscal_(&i__2, &z__1, &z__[*iloz + i__ * z_dim1], &c__1);
+ }
+ }
+/* L20: */
+ }
+
+ nh = *ihi - *ilo + 1;
+ nz = *ihiz - *iloz + 1;
+
+/* Set machine-dependent constants for the stopping criterion. */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) nh / ulp);
+
+/*
+ I1 and I2 are the indices of the first row and last column of H
+ to which transformations must be applied. If eigenvalues only are
+ being computed, I1 and I2 are set inside the main loop.
+*/
+
+ if (*wantt) {
+ i1 = 1;
+ i2 = *n;
+ }
+
+/*
+ The main loop begins here. I is the loop index and decreases from
+ IHI to ILO in steps of 1. Each iteration of the loop works
+ with the active submatrix in rows and columns L to I.
+ Eigenvalues I+1 to IHI have already converged. Either L = ILO, or
+ H(L,L-1) is negligible so that the matrix splits.
+*/
+
+ i__ = *ihi;
+L30:
+ if (i__ < *ilo) {
+ goto L150;
+ }
+
+/*
+ Perform QR iterations on rows and columns ILO to I until a
+ submatrix of order 1 splits off at the bottom because a
+ subdiagonal element has become negligible.
+*/
+
+ l = *ilo;
+ for (its = 0; its <= 30; ++its) {
+
+/* Look for a single small subdiagonal element. */
+
+ i__1 = l + 1;
+ for (k = i__; k >= i__1; --k) {
+ i__2 = k + (k - 1) * h_dim1;
+ if ((d__1 = h__[i__2].r, abs(d__1)) + (d__2 = d_imag(&h__[k + (k
+ - 1) * h_dim1]), abs(d__2)) <= smlnum) {
+ goto L50;
+ }
+ i__2 = k - 1 + (k - 1) * h_dim1;
+ i__3 = k + k * h_dim1;
+ tst = (d__1 = h__[i__2].r, abs(d__1)) + (d__2 = d_imag(&h__[k - 1
+ + (k - 1) * h_dim1]), abs(d__2)) + ((d__3 = h__[i__3].r,
+ abs(d__3)) + (d__4 = d_imag(&h__[k + k * h_dim1]), abs(
+ d__4)));
+ if (tst == 0.) {
+ if (k - 2 >= *ilo) {
+ i__2 = k - 1 + (k - 2) * h_dim1;
+ tst += (d__1 = h__[i__2].r, abs(d__1));
+ }
+ if (k + 1 <= *ihi) {
+ i__2 = k + 1 + k * h_dim1;
+ tst += (d__1 = h__[i__2].r, abs(d__1));
+ }
+ }
+/*
+ ==== The following is a conservative small subdiagonal
+ . deflation criterion due to Ahues & Tisseur (LAWN 122,
+ . 1997). It has better mathematical foundation and
+ . improves accuracy in some examples. ====
+*/
+ i__2 = k + (k - 1) * h_dim1;
+ if ((d__1 = h__[i__2].r, abs(d__1)) <= ulp * tst) {
+/* Computing MAX */
+ i__2 = k + (k - 1) * h_dim1;
+ i__3 = k - 1 + k * h_dim1;
+ d__5 = (d__1 = h__[i__2].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ k + (k - 1) * h_dim1]), abs(d__2)), d__6 = (d__3 =
+ h__[i__3].r, abs(d__3)) + (d__4 = d_imag(&h__[k - 1 +
+ k * h_dim1]), abs(d__4));
+ ab = max(d__5,d__6);
+/* Computing MIN */
+ i__2 = k + (k - 1) * h_dim1;
+ i__3 = k - 1 + k * h_dim1;
+ d__5 = (d__1 = h__[i__2].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ k + (k - 1) * h_dim1]), abs(d__2)), d__6 = (d__3 =
+ h__[i__3].r, abs(d__3)) + (d__4 = d_imag(&h__[k - 1 +
+ k * h_dim1]), abs(d__4));
+ ba = min(d__5,d__6);
+ i__2 = k - 1 + (k - 1) * h_dim1;
+ i__3 = k + k * h_dim1;
+ z__2.r = h__[i__2].r - h__[i__3].r, z__2.i = h__[i__2].i -
+ h__[i__3].i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+/* Computing MAX */
+ i__4 = k + k * h_dim1;
+ d__5 = (d__1 = h__[i__4].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ k + k * h_dim1]), abs(d__2)), d__6 = (d__3 = z__1.r,
+ abs(d__3)) + (d__4 = d_imag(&z__1), abs(d__4));
+ aa = max(d__5,d__6);
+ i__2 = k - 1 + (k - 1) * h_dim1;
+ i__3 = k + k * h_dim1;
+ z__2.r = h__[i__2].r - h__[i__3].r, z__2.i = h__[i__2].i -
+ h__[i__3].i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+/* Computing MIN */
+ i__4 = k + k * h_dim1;
+ d__5 = (d__1 = h__[i__4].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ k + k * h_dim1]), abs(d__2)), d__6 = (d__3 = z__1.r,
+ abs(d__3)) + (d__4 = d_imag(&z__1), abs(d__4));
+ bb = min(d__5,d__6);
+ s = aa + ab;
+/* Computing MAX */
+ d__1 = smlnum, d__2 = ulp * (bb * (aa / s));
+ if (ba * (ab / s) <= max(d__1,d__2)) {
+ goto L50;
+ }
+ }
+/* L40: */
+ }
+L50:
+ l = k;
+ if (l > *ilo) {
+
+/* H(L,L-1) is negligible */
+
+ i__1 = l + (l - 1) * h_dim1;
+ h__[i__1].r = 0., h__[i__1].i = 0.;
+ }
+
+/* Exit from loop if a submatrix of order 1 has split off. */
+
+ if (l >= i__) {
+ goto L140;
+ }
+
+/*
+ Now the active submatrix is in rows and columns L to I. If
+ eigenvalues only are being computed, only the active submatrix
+ need be transformed.
+*/
+
+ if (! (*wantt)) {
+ i1 = l;
+ i2 = i__;
+ }
+
+ if (its == 10) {
+
+/* Exceptional shift. */
+
+ i__1 = l + 1 + l * h_dim1;
+ s = (d__1 = h__[i__1].r, abs(d__1)) * .75;
+ i__1 = l + l * h_dim1;
+ z__1.r = s + h__[i__1].r, z__1.i = h__[i__1].i;
+ t.r = z__1.r, t.i = z__1.i;
+ } else if (its == 20) {
+
+/* Exceptional shift. */
+
+ i__1 = i__ + (i__ - 1) * h_dim1;
+ s = (d__1 = h__[i__1].r, abs(d__1)) * .75;
+ i__1 = i__ + i__ * h_dim1;
+ z__1.r = s + h__[i__1].r, z__1.i = h__[i__1].i;
+ t.r = z__1.r, t.i = z__1.i;
+ } else {
+
+/* Wilkinson's shift. */
+
+ i__1 = i__ + i__ * h_dim1;
+ t.r = h__[i__1].r, t.i = h__[i__1].i;
+ z_sqrt(&z__2, &h__[i__ - 1 + i__ * h_dim1]);
+ z_sqrt(&z__3, &h__[i__ + (i__ - 1) * h_dim1]);
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = z__2.r *
+ z__3.i + z__2.i * z__3.r;
+ u.r = z__1.r, u.i = z__1.i;
+ s = (d__1 = u.r, abs(d__1)) + (d__2 = d_imag(&u), abs(d__2));
+ if (s != 0.) {
+ i__1 = i__ - 1 + (i__ - 1) * h_dim1;
+ z__2.r = h__[i__1].r - t.r, z__2.i = h__[i__1].i - t.i;
+ z__1.r = z__2.r * .5, z__1.i = z__2.i * .5;
+ x.r = z__1.r, x.i = z__1.i;
+ sx = (d__1 = x.r, abs(d__1)) + (d__2 = d_imag(&x), abs(d__2));
+/* Computing MAX */
+ d__3 = s, d__4 = (d__1 = x.r, abs(d__1)) + (d__2 = d_imag(&x),
+ abs(d__2));
+ s = max(d__3,d__4);
+ z__5.r = x.r / s, z__5.i = x.i / s;
+ pow_zi(&z__4, &z__5, &c__2);
+ z__7.r = u.r / s, z__7.i = u.i / s;
+ pow_zi(&z__6, &z__7, &c__2);
+ z__3.r = z__4.r + z__6.r, z__3.i = z__4.i + z__6.i;
+ z_sqrt(&z__2, &z__3);
+ z__1.r = s * z__2.r, z__1.i = s * z__2.i;
+ y.r = z__1.r, y.i = z__1.i;
+ if (sx > 0.) {
+ z__1.r = x.r / sx, z__1.i = x.i / sx;
+ z__2.r = x.r / sx, z__2.i = x.i / sx;
+ if (z__1.r * y.r + d_imag(&z__2) * d_imag(&y) < 0.) {
+ z__3.r = -y.r, z__3.i = -y.i;
+ y.r = z__3.r, y.i = z__3.i;
+ }
+ }
+ z__4.r = x.r + y.r, z__4.i = x.i + y.i;
+ zladiv_(&z__3, &u, &z__4);
+ z__2.r = u.r * z__3.r - u.i * z__3.i, z__2.i = u.r * z__3.i +
+ u.i * z__3.r;
+ z__1.r = t.r - z__2.r, z__1.i = t.i - z__2.i;
+ t.r = z__1.r, t.i = z__1.i;
+ }
+ }
+
+/* Look for two consecutive small subdiagonal elements. */
+
+ i__1 = l + 1;
+ for (m = i__ - 1; m >= i__1; --m) {
+
+/*
+ Determine the effect of starting the single-shift QR
+ iteration at row M, and see if this would make H(M,M-1)
+ negligible.
+*/
+
+ i__2 = m + m * h_dim1;
+ h11.r = h__[i__2].r, h11.i = h__[i__2].i;
+ i__2 = m + 1 + (m + 1) * h_dim1;
+ h22.r = h__[i__2].r, h22.i = h__[i__2].i;
+ z__1.r = h11.r - t.r, z__1.i = h11.i - t.i;
+ h11s.r = z__1.r, h11s.i = z__1.i;
+ i__2 = m + 1 + m * h_dim1;
+ h21 = h__[i__2].r;
+ s = (d__1 = h11s.r, abs(d__1)) + (d__2 = d_imag(&h11s), abs(d__2))
+ + abs(h21);
+ z__1.r = h11s.r / s, z__1.i = h11s.i / s;
+ h11s.r = z__1.r, h11s.i = z__1.i;
+ h21 /= s;
+ v[0].r = h11s.r, v[0].i = h11s.i;
+ v[1].r = h21, v[1].i = 0.;
+ i__2 = m + (m - 1) * h_dim1;
+ h10 = h__[i__2].r;
+ if (abs(h10) * abs(h21) <= ulp * (((d__1 = h11s.r, abs(d__1)) + (
+ d__2 = d_imag(&h11s), abs(d__2))) * ((d__3 = h11.r, abs(
+ d__3)) + (d__4 = d_imag(&h11), abs(d__4)) + ((d__5 =
+ h22.r, abs(d__5)) + (d__6 = d_imag(&h22), abs(d__6)))))) {
+ goto L70;
+ }
+/* L60: */
+ }
+ i__1 = l + l * h_dim1;
+ h11.r = h__[i__1].r, h11.i = h__[i__1].i;
+ i__1 = l + 1 + (l + 1) * h_dim1;
+ h22.r = h__[i__1].r, h22.i = h__[i__1].i;
+ z__1.r = h11.r - t.r, z__1.i = h11.i - t.i;
+ h11s.r = z__1.r, h11s.i = z__1.i;
+ i__1 = l + 1 + l * h_dim1;
+ h21 = h__[i__1].r;
+ s = (d__1 = h11s.r, abs(d__1)) + (d__2 = d_imag(&h11s), abs(d__2)) +
+ abs(h21);
+ z__1.r = h11s.r / s, z__1.i = h11s.i / s;
+ h11s.r = z__1.r, h11s.i = z__1.i;
+ h21 /= s;
+ v[0].r = h11s.r, v[0].i = h11s.i;
+ v[1].r = h21, v[1].i = 0.;
+L70:
+
+/* Single-shift QR step */
+
+ i__1 = i__ - 1;
+ for (k = m; k <= i__1; ++k) {
+
+/*
+ The first iteration of this loop determines a reflection G
+ from the vector V and applies it from left and right to H,
+ thus creating a nonzero bulge below the subdiagonal.
+
+ Each subsequent iteration determines a reflection G to
+ restore the Hessenberg form in the (K-1)th column, and thus
+ chases the bulge one step toward the bottom of the active
+ submatrix.
+
+ V(2) is always real before the call to ZLARFG, and hence
+ after the call T2 ( = T1*V(2) ) is also real.
+*/
+
+ if (k > m) {
+ zcopy_(&c__2, &h__[k + (k - 1) * h_dim1], &c__1, v, &c__1);
+ }
+ zlarfg_(&c__2, v, &v[1], &c__1, &t1);
+ if (k > m) {
+ i__2 = k + (k - 1) * h_dim1;
+ h__[i__2].r = v[0].r, h__[i__2].i = v[0].i;
+ i__2 = k + 1 + (k - 1) * h_dim1;
+ h__[i__2].r = 0., h__[i__2].i = 0.;
+ }
+ v2.r = v[1].r, v2.i = v[1].i;
+ z__1.r = t1.r * v2.r - t1.i * v2.i, z__1.i = t1.r * v2.i + t1.i *
+ v2.r;
+ t2 = z__1.r;
+
+/*
+ Apply G from the left to transform the rows of the matrix
+ in columns K to I2.
+*/
+
+ i__2 = i2;
+ for (j = k; j <= i__2; ++j) {
+ d_cnjg(&z__3, &t1);
+ i__3 = k + j * h_dim1;
+ z__2.r = z__3.r * h__[i__3].r - z__3.i * h__[i__3].i, z__2.i =
+ z__3.r * h__[i__3].i + z__3.i * h__[i__3].r;
+ i__4 = k + 1 + j * h_dim1;
+ z__4.r = t2 * h__[i__4].r, z__4.i = t2 * h__[i__4].i;
+ z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
+ sum.r = z__1.r, sum.i = z__1.i;
+ i__3 = k + j * h_dim1;
+ i__4 = k + j * h_dim1;
+ z__1.r = h__[i__4].r - sum.r, z__1.i = h__[i__4].i - sum.i;
+ h__[i__3].r = z__1.r, h__[i__3].i = z__1.i;
+ i__3 = k + 1 + j * h_dim1;
+ i__4 = k + 1 + j * h_dim1;
+ z__2.r = sum.r * v2.r - sum.i * v2.i, z__2.i = sum.r * v2.i +
+ sum.i * v2.r;
+ z__1.r = h__[i__4].r - z__2.r, z__1.i = h__[i__4].i - z__2.i;
+ h__[i__3].r = z__1.r, h__[i__3].i = z__1.i;
+/* L80: */
+ }
+
+/*
+ Apply G from the right to transform the columns of the
+ matrix in rows I1 to min(K+2,I).
+
+ Computing MIN
+*/
+ i__3 = k + 2;
+ i__2 = min(i__3,i__);
+ for (j = i1; j <= i__2; ++j) {
+ i__3 = j + k * h_dim1;
+ z__2.r = t1.r * h__[i__3].r - t1.i * h__[i__3].i, z__2.i =
+ t1.r * h__[i__3].i + t1.i * h__[i__3].r;
+ i__4 = j + (k + 1) * h_dim1;
+ z__3.r = t2 * h__[i__4].r, z__3.i = t2 * h__[i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ sum.r = z__1.r, sum.i = z__1.i;
+ i__3 = j + k * h_dim1;
+ i__4 = j + k * h_dim1;
+ z__1.r = h__[i__4].r - sum.r, z__1.i = h__[i__4].i - sum.i;
+ h__[i__3].r = z__1.r, h__[i__3].i = z__1.i;
+ i__3 = j + (k + 1) * h_dim1;
+ i__4 = j + (k + 1) * h_dim1;
+ d_cnjg(&z__3, &v2);
+ z__2.r = sum.r * z__3.r - sum.i * z__3.i, z__2.i = sum.r *
+ z__3.i + sum.i * z__3.r;
+ z__1.r = h__[i__4].r - z__2.r, z__1.i = h__[i__4].i - z__2.i;
+ h__[i__3].r = z__1.r, h__[i__3].i = z__1.i;
+/* L90: */
+ }
+
+ if (*wantz) {
+
+/* Accumulate transformations in the matrix Z */
+
+ i__2 = *ihiz;
+ for (j = *iloz; j <= i__2; ++j) {
+ i__3 = j + k * z_dim1;
+ z__2.r = t1.r * z__[i__3].r - t1.i * z__[i__3].i, z__2.i =
+ t1.r * z__[i__3].i + t1.i * z__[i__3].r;
+ i__4 = j + (k + 1) * z_dim1;
+ z__3.r = t2 * z__[i__4].r, z__3.i = t2 * z__[i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ sum.r = z__1.r, sum.i = z__1.i;
+ i__3 = j + k * z_dim1;
+ i__4 = j + k * z_dim1;
+ z__1.r = z__[i__4].r - sum.r, z__1.i = z__[i__4].i -
+ sum.i;
+ z__[i__3].r = z__1.r, z__[i__3].i = z__1.i;
+ i__3 = j + (k + 1) * z_dim1;
+ i__4 = j + (k + 1) * z_dim1;
+ d_cnjg(&z__3, &v2);
+ z__2.r = sum.r * z__3.r - sum.i * z__3.i, z__2.i = sum.r *
+ z__3.i + sum.i * z__3.r;
+ z__1.r = z__[i__4].r - z__2.r, z__1.i = z__[i__4].i -
+ z__2.i;
+ z__[i__3].r = z__1.r, z__[i__3].i = z__1.i;
+/* L100: */
+ }
+ }
+
+ if (k == m && m > l) {
+
+/*
+ If the QR step was started at row M > L because two
+ consecutive small subdiagonals were found, then extra
+ scaling must be performed to ensure that H(M,M-1) remains
+ real.
+*/
+
+ z__1.r = 1. - t1.r, z__1.i = 0. - t1.i;
+ temp.r = z__1.r, temp.i = z__1.i;
+ d__1 = z_abs(&temp);
+ z__1.r = temp.r / d__1, z__1.i = temp.i / d__1;
+ temp.r = z__1.r, temp.i = z__1.i;
+ i__2 = m + 1 + m * h_dim1;
+ i__3 = m + 1 + m * h_dim1;
+ d_cnjg(&z__2, &temp);
+ z__1.r = h__[i__3].r * z__2.r - h__[i__3].i * z__2.i, z__1.i =
+ h__[i__3].r * z__2.i + h__[i__3].i * z__2.r;
+ h__[i__2].r = z__1.r, h__[i__2].i = z__1.i;
+ if (m + 2 <= i__) {
+ i__2 = m + 2 + (m + 1) * h_dim1;
+ i__3 = m + 2 + (m + 1) * h_dim1;
+ z__1.r = h__[i__3].r * temp.r - h__[i__3].i * temp.i,
+ z__1.i = h__[i__3].r * temp.i + h__[i__3].i *
+ temp.r;
+ h__[i__2].r = z__1.r, h__[i__2].i = z__1.i;
+ }
+ i__2 = i__;
+ for (j = m; j <= i__2; ++j) {
+ if (j != m + 1) {
+ if (i2 > j) {
+ i__3 = i2 - j;
+ zscal_(&i__3, &temp, &h__[j + (j + 1) * h_dim1],
+ ldh);
+ }
+ i__3 = j - i1;
+ d_cnjg(&z__1, &temp);
+ zscal_(&i__3, &z__1, &h__[i1 + j * h_dim1], &c__1);
+ if (*wantz) {
+ d_cnjg(&z__1, &temp);
+ zscal_(&nz, &z__1, &z__[*iloz + j * z_dim1], &
+ c__1);
+ }
+ }
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+
+/* Ensure that H(I,I-1) is real. */
+
+ i__1 = i__ + (i__ - 1) * h_dim1;
+ temp.r = h__[i__1].r, temp.i = h__[i__1].i;
+ if (d_imag(&temp) != 0.) {
+ rtemp = z_abs(&temp);
+ i__1 = i__ + (i__ - 1) * h_dim1;
+ h__[i__1].r = rtemp, h__[i__1].i = 0.;
+ z__1.r = temp.r / rtemp, z__1.i = temp.i / rtemp;
+ temp.r = z__1.r, temp.i = z__1.i;
+ if (i2 > i__) {
+ i__1 = i2 - i__;
+ d_cnjg(&z__1, &temp);
+ zscal_(&i__1, &z__1, &h__[i__ + (i__ + 1) * h_dim1], ldh);
+ }
+ i__1 = i__ - i1;
+ zscal_(&i__1, &temp, &h__[i1 + i__ * h_dim1], &c__1);
+ if (*wantz) {
+ zscal_(&nz, &temp, &z__[*iloz + i__ * z_dim1], &c__1);
+ }
+ }
+
+/* L130: */
+ }
+
+/* Failure to converge in remaining number of iterations */
+
+ *info = i__;
+ return 0;
+
+L140:
+
+/* H(I,I-1) is negligible: one eigenvalue has converged. */
+
+ i__1 = i__;
+ i__2 = i__ + i__ * h_dim1;
+ w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i;
+
+/*     Return to the start of the main loop with the new value of I. */
+
+ i__ = l - 1;
+ goto L30;
+
+L150:
+ return 0;
+
+/* End of ZLAHQR */
+
+} /* zlahqr_ */
+
+/* Subroutine */ int zlahr2_(integer *n, integer *k, integer *nb,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *t,
+ integer *ldt, doublecomplex *y, integer *ldy)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__1, i__2,
+ i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex ei;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *), zgemm_(char *, char *, integer *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *),
+ zcopy_(integer *, doublecomplex *, integer *, doublecomplex *,
+ integer *), ztrmm_(char *, char *, char *, char *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *),
+ zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *), ztrmv_(char *, char *, char *,
+ integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlarfg_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *,
+ doublecomplex *, integer *), zlacpy_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ -- April 2009 --
+
+
+ Purpose
+ =======
+
+    ZLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1)
+ matrix A so that elements below the k-th subdiagonal are zero. The
+    reduction is performed by a unitary similarity transformation
+ Q' * A * Q. The routine returns the matrices V and T which determine
+ Q as a block reflector I - V*T*V', and also the matrix Y = A * V * T.
+
+ This is an auxiliary routine called by ZGEHRD.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ K (input) INTEGER
+ The offset for the reduction. Elements below the k-th
+ subdiagonal in the first NB columns are reduced to zero.
+ K < N.
+
+ NB (input) INTEGER
+ The number of columns to be reduced.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N-K+1)
+ On entry, the n-by-(n-k+1) general matrix A.
+ On exit, the elements on and above the k-th subdiagonal in
+ the first NB columns are overwritten with the corresponding
+ elements of the reduced matrix; the elements below the k-th
+ subdiagonal, with the array TAU, represent the matrix Q as a
+ product of elementary reflectors. The other columns of A are
+ unchanged. See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (output) COMPLEX*16 array, dimension (NB)
+ The scalar factors of the elementary reflectors. See Further
+ Details.
+
+ T (output) COMPLEX*16 array, dimension (LDT,NB)
+ The upper triangular matrix T.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= NB.
+
+ Y (output) COMPLEX*16 array, dimension (LDY,NB)
+ The n-by-nb matrix Y.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+
+ The matrix Q is represented as a product of nb elementary reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
+ A(i+k+1:n,i), and tau in TAU(i).
+
+ The elements of the vectors v together form the (n-k+1)-by-nb matrix
+ V which is needed, with T and Y, to apply the transformation to the
+ unreduced part of the matrix, using an update of the form:
+ A := (I - V*T*V') * (A - Y*V').
+
+ The contents of A on exit are illustrated by the following example
+ with n = 7, k = 3 and nb = 2:
+
+ ( a a a a a )
+ ( a a a a a )
+ ( a a a a a )
+ ( h h a a a )
+ ( v1 h a a a )
+ ( v1 v2 a a a )
+ ( v1 v2 a a a )
+
+ where a denotes an element of the original matrix A, h denotes a
+ modified element of the upper Hessenberg matrix H, and vi denotes an
+ element of the vector defining H(i).
+
+ This subroutine is a slight modification of LAPACK-3.0's DLAHRD
+    incorporating improvements proposed by Quintana-Orti and van de
+    Geijn. Note that the entries of A(1:K,2:NB) differ from those
+ returned by the original LAPACK-3.0's DLAHRD routine. (This
+ subroutine is not backward compatible with LAPACK-3.0's DLAHRD.)
+
+ References
+ ==========
+
+ Gregorio Quintana-Orti and Robert van de Geijn, "Improving the
+ performance of reduction to Hessenberg form," ACM Transactions on
+ Mathematical Software, 32(2):180-194, June 2006.
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
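+
+/*
+   A rough sketch of the blocked update (the exact index ranges are
+   the ones used in the code below): with V, T and Y as described
+   above, the NB reflectors are held in the compact block form
+
+      Q = H(1) H(2) . . . H(nb) = I - V*T*V',
+
+   so the unreduced part of the matrix can later be updated as
+
+      A := (I - V*T*V') * (A - Y*V'),
+
+   i.e. with a few matrix-matrix (BLAS 3) products instead of NB
+   separate rank-1 updates.  The loop below accumulates V, T and
+   Y = A*V*T one column at a time.
+*/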
+
+ /* Parameter adjustments */
+ --tau;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ y_dim1 = *ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+
+ /* Function Body */
+ if (*n <= 1) {
+ return 0;
+ }
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ > 1) {
+
+/*
+ Update A(K+1:N,I)
+
+ Update I-th column of A - Y * V'
+*/
+
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[*k + i__ - 1 + a_dim1], lda);
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("NO TRANSPOSE", &i__2, &i__3, &z__1, &y[*k + 1 + y_dim1],
+ ldy, &a[*k + i__ - 1 + a_dim1], lda, &c_b57, &a[*k + 1 +
+ i__ * a_dim1], &c__1);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[*k + i__ - 1 + a_dim1], lda);
+
+/*
+ Apply I - V * T' * V' to this column (call it b) from the
+ left, using the last column of T as workspace
+
+ Let V = ( V1 ) and b = ( b1 ) (first I-1 rows)
+ ( V2 ) ( b2 )
+
+ where V1 is unit lower triangular
+
+ w := V1' * b1
+*/
+
+ i__2 = i__ - 1;
+ zcopy_(&i__2, &a[*k + 1 + i__ * a_dim1], &c__1, &t[*nb * t_dim1 +
+ 1], &c__1);
+ i__2 = i__ - 1;
+ ztrmv_("Lower", "Conjugate transpose", "UNIT", &i__2, &a[*k + 1 +
+ a_dim1], lda, &t[*nb * t_dim1 + 1], &c__1);
+
+/* w := w + V2'*b2 */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[*k + i__ +
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b57,
+ &t[*nb * t_dim1 + 1], &c__1);
+
+/* w := T'*w */
+
+ i__2 = i__ - 1;
+ ztrmv_("Upper", "Conjugate transpose", "NON-UNIT", &i__2, &t[
+ t_offset], ldt, &t[*nb * t_dim1 + 1], &c__1);
+
+/* b2 := b2 - V2*w */
+
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("NO TRANSPOSE", &i__2, &i__3, &z__1, &a[*k + i__ + a_dim1],
+ lda, &t[*nb * t_dim1 + 1], &c__1, &c_b57, &a[*k + i__ +
+ i__ * a_dim1], &c__1);
+
+/* b1 := b1 - V1*w */
+
+ i__2 = i__ - 1;
+ ztrmv_("Lower", "NO TRANSPOSE", "UNIT", &i__2, &a[*k + 1 + a_dim1]
+ , lda, &t[*nb * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zaxpy_(&i__2, &z__1, &t[*nb * t_dim1 + 1], &c__1, &a[*k + 1 + i__
+ * a_dim1], &c__1);
+
+ i__2 = *k + i__ - 1 + (i__ - 1) * a_dim1;
+ a[i__2].r = ei.r, a[i__2].i = ei.i;
+ }
+
+/*
+ Generate the elementary reflector H(I) to annihilate
+ A(K+I+1:N,I)
+*/
+
+ i__2 = *n - *k - i__ + 1;
+/* Computing MIN */
+ i__3 = *k + i__ + 1;
+ zlarfg_(&i__2, &a[*k + i__ + i__ * a_dim1], &a[min(i__3,*n) + i__ *
+ a_dim1], &c__1, &tau[i__]);
+ i__2 = *k + i__ + i__ * a_dim1;
+ ei.r = a[i__2].r, ei.i = a[i__2].i;
+ i__2 = *k + i__ + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute Y(K+1:N,I) */
+
+ i__2 = *n - *k;
+ i__3 = *n - *k - i__ + 1;
+ zgemv_("NO TRANSPOSE", &i__2, &i__3, &c_b57, &a[*k + 1 + (i__ + 1) *
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b56, &y[*
+ k + 1 + i__ * y_dim1], &c__1);
+ i__2 = *n - *k - i__ + 1;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[*k + i__ +
+ a_dim1], lda, &a[*k + i__ + i__ * a_dim1], &c__1, &c_b56, &t[
+ i__ * t_dim1 + 1], &c__1);
+ i__2 = *n - *k;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("NO TRANSPOSE", &i__2, &i__3, &z__1, &y[*k + 1 + y_dim1], ldy,
+ &t[i__ * t_dim1 + 1], &c__1, &c_b57, &y[*k + 1 + i__ * y_dim1]
+ , &c__1);
+ i__2 = *n - *k;
+ zscal_(&i__2, &tau[i__], &y[*k + 1 + i__ * y_dim1], &c__1);
+
+/* Compute T(1:I,I) */
+
+ i__2 = i__ - 1;
+ i__3 = i__;
+ z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i;
+ zscal_(&i__2, &z__1, &t[i__ * t_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ ztrmv_("Upper", "No Transpose", "NON-UNIT", &i__2, &t[t_offset], ldt,
+ &t[i__ * t_dim1 + 1], &c__1)
+ ;
+ i__2 = i__ + i__ * t_dim1;
+ i__3 = i__;
+ t[i__2].r = tau[i__3].r, t[i__2].i = tau[i__3].i;
+
+/* L10: */
+ }
+ i__1 = *k + *nb + *nb * a_dim1;
+ a[i__1].r = ei.r, a[i__1].i = ei.i;
+
+/* Compute Y(1:K,1:NB) */
+
+ zlacpy_("ALL", k, nb, &a[(a_dim1 << 1) + 1], lda, &y[y_offset], ldy);
+ ztrmm_("RIGHT", "Lower", "NO TRANSPOSE", "UNIT", k, nb, &c_b57, &a[*k + 1
+ + a_dim1], lda, &y[y_offset], ldy);
+ if (*n > *k + *nb) {
+ i__1 = *n - *k - *nb;
+ zgemm_("NO TRANSPOSE", "NO TRANSPOSE", k, nb, &i__1, &c_b57, &a[(*nb
+ + 2) * a_dim1 + 1], lda, &a[*k + 1 + *nb + a_dim1], lda, &
+ c_b57, &y[y_offset], ldy);
+ }
+ ztrmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b57, &t[
+ t_offset], ldt, &y[y_offset], ldy);
+
+ return 0;
+
+/* End of ZLAHR2 */
+
+} /* zlahr2_ */
+
+/* Subroutine */ int zlals0_(integer *icompq, integer *nl, integer *nr,
+ integer *sqre, integer *nrhs, doublecomplex *b, integer *ldb,
+ doublecomplex *bx, integer *ldbx, integer *perm, integer *givptr,
+ integer *givcol, integer *ldgcol, doublereal *givnum, integer *ldgnum,
+ doublereal *poles, doublereal *difl, doublereal *difr, doublereal *
+ z__, integer *k, doublereal *c__, doublereal *s, doublereal *rwork,
+ integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, difr_dim1, difr_offset, givnum_dim1,
+ givnum_offset, poles_dim1, poles_offset, b_dim1, b_offset,
+ bx_dim1, bx_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, m, n;
+ static doublereal dj;
+ static integer nlp1, jcol;
+ static doublereal temp;
+ static integer jrow;
+ extern doublereal dnrm2_(integer *, doublereal *, integer *);
+ static doublereal diflj, difrj, dsigj;
+ extern /* Subroutine */ int dgemv_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, doublereal *, integer *), zdrot_(integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublereal *, doublereal *);
+ extern doublereal dlamc3_(doublereal *, doublereal *);
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), xerbla_(char *, integer *);
+ static doublereal dsigjp;
+ extern /* Subroutine */ int zdscal_(integer *, doublereal *,
+ doublecomplex *, integer *), zlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublecomplex *
+ , integer *, integer *), zlacpy_(char *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLALS0 applies back the multiplying factors of either the left or the
+ right singular vector matrix of a diagonal matrix appended by a row
+ to the right hand side matrix B in solving the least squares problem
+ using the divide-and-conquer SVD approach.
+
+ For the left singular vector matrix, three types of orthogonal
+ matrices are involved:
+
+ (1L) Givens rotations: the number of such rotations is GIVPTR; the
+ pairs of columns/rows they were applied to are stored in GIVCOL;
+ and the C- and S-values of these rotations are stored in GIVNUM.
+
+ (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+ row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+ J-th row.
+
+ (3L) The left singular vector matrix of the remaining matrix.
+
+ For the right singular vector matrix, four types of orthogonal
+ matrices are involved:
+
+ (1R) The right singular vector matrix of the remaining matrix.
+
+ (2R) If SQRE = 1, one extra Givens rotation to generate the right
+ null space.
+
+ (3R) The inverse transformation of (2L).
+
+ (4R) The inverse transformation of (1L).
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether singular vectors are to be computed in
+ factored form:
+ = 0: Left singular vector matrix.
+ = 1: Right singular vector matrix.
+
+ NL (input) INTEGER
+ The row dimension of the upper block. NL >= 1.
+
+ NR (input) INTEGER
+ The row dimension of the lower block. NR >= 1.
+
+ SQRE (input) INTEGER
+ = 0: the lower block is an NR-by-NR square matrix.
+ = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+ The bidiagonal matrix has row dimension N = NL + NR + 1,
+ and column dimension M = N + SQRE.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) COMPLEX*16 array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M. On output, B contains
+ the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B. LDB must be at least
+ max(1,MAX( M, N ) ).
+
+ BX (workspace) COMPLEX*16 array, dimension ( LDBX, NRHS )
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ PERM (input) INTEGER array, dimension ( N )
+ The permutations (from deflation and sorting) applied
+ to the two blocks.
+
+ GIVPTR (input) INTEGER
+ The number of Givens rotations which took place in this
+ subproblem.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 )
+ Each pair of numbers indicates a pair of rows/columns
+ involved in a Givens rotation.
+
+ LDGCOL (input) INTEGER
+ The leading dimension of GIVCOL, must be at least N.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ Each number indicates the C or S value used in the
+ corresponding Givens rotation.
+
+ LDGNUM (input) INTEGER
+ The leading dimension of arrays DIFR, POLES and
+ GIVNUM, must be at least K.
+
+ POLES (input) DOUBLE PRECISION array, dimension ( LDGNUM, 2 )
+ On entry, POLES(1:K, 1) contains the new singular
+ values obtained from solving the secular equation, and
+ POLES(1:K, 2) is an array containing the poles in the secular
+ equation.
+
+ DIFL (input) DOUBLE PRECISION array, dimension ( K ).
+ On entry, DIFL(I) is the distance between I-th updated
+ (undeflated) singular value and the I-th (undeflated) old
+ singular value.
+
+ DIFR (input) DOUBLE PRECISION array, dimension ( LDGNUM, 2 ).
+ On entry, DIFR(I, 1) contains the distances between I-th
+            updated (undeflated) singular value and the (I+1)-th
+            (undeflated) old singular value, and DIFR(I, 2) is the
+ normalizing factor for the I-th right singular vector.
+
+ Z (input) DOUBLE PRECISION array, dimension ( K )
+            Contains the components of the deflation-adjusted updating row
+ vector.
+
+ K (input) INTEGER
+            Contains the dimension of the non-deflated matrix. This is
+            the order of the related secular equation. 1 <= K <= N.
+
+ C (input) DOUBLE PRECISION
+ C contains garbage if SQRE =0 and the C-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ S (input) DOUBLE PRECISION
+ S contains garbage if SQRE =0 and the S-value of a Givens
+ rotation related to the right null space if SQRE = 1.
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension
+ ( K*(1+NRHS) + 2*NRHS )
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
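+
+/*
+   A rough sketch of what follows: for ICOMPQ = 0 each right hand
+   side column of B is transformed as
+
+      B  --(1L) Givens rotations from GIVCOL/GIVNUM-->
+         --(2L) row permutation from PERM-->
+         --(3L) inverse of the left singular vector matrix-->  B,
+
+   while for ICOMPQ = 1 the right-side steps (1R)-(4R) listed above
+   are applied instead.  Because B and BX are complex but the
+   singular vectors are real, the dense products with the singular
+   vector matrices below are carried out as two real DGEMV calls,
+   one for the real and one for the imaginary parts.
+*/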
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ --perm;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ difr_dim1 = *ldgnum;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ poles_dim1 = *ldgnum;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ givnum_dim1 = *ldgnum;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ --difl;
+ --z__;
+ --rwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*nl < 1) {
+ *info = -2;
+ } else if (*nr < 1) {
+ *info = -3;
+ } else if (*sqre < 0 || *sqre > 1) {
+ *info = -4;
+ }
+
+ n = *nl + *nr + 1;
+
+ if (*nrhs < 1) {
+ *info = -5;
+ } else if (*ldb < n) {
+ *info = -7;
+ } else if (*ldbx < n) {
+ *info = -9;
+ } else if (*givptr < 0) {
+ *info = -11;
+ } else if (*ldgcol < n) {
+ *info = -13;
+ } else if (*ldgnum < n) {
+ *info = -15;
+ } else if (*k < 1) {
+ *info = -20;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLALS0", &i__1);
+ return 0;
+ }
+
+ m = n + *sqre;
+ nlp1 = *nl + 1;
+
+ if (*icompq == 0) {
+
+/*
+ Apply back orthogonal transformations from the left.
+
+ Step (1L): apply back the Givens rotations performed.
+*/
+
+ i__1 = *givptr;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ zdrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]);
+/* L10: */
+ }
+
+/* Step (2L): permute rows of B. */
+
+ zcopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx);
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ zcopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1],
+ ldbx);
+/* L20: */
+ }
+
+/*
+ Step (3L): apply the inverse of the left singular vector
+ matrix to BX.
+*/
+
+ if (*k == 1) {
+ zcopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb);
+ if (z__[1] < 0.) {
+ zdscal_(nrhs, &c_b1276, &b[b_offset], ldb);
+ }
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ diflj = difl[j];
+ dj = poles[j + poles_dim1];
+ dsigj = -poles[j + (poles_dim1 << 1)];
+ if (j < *k) {
+ difrj = -difr[j + difr_dim1];
+ dsigjp = -poles[j + 1 + (poles_dim1 << 1)];
+ }
+ if (z__[j] == 0. || poles[j + (poles_dim1 << 1)] == 0.) {
+ rwork[j] = 0.;
+ } else {
+ rwork[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj
+ / (poles[j + (poles_dim1 << 1)] + dj);
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0. || poles[i__ + (poles_dim1 << 1)] ==
+ 0.) {
+ rwork[i__] = 0.;
+ } else {
+ rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (dlamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigj) - diflj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L30: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[i__] == 0. || poles[i__ + (poles_dim1 << 1)] ==
+ 0.) {
+ rwork[i__] = 0.;
+ } else {
+ rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+ / (dlamc3_(&poles[i__ + (poles_dim1 << 1)], &
+ dsigjp) + difrj) / (poles[i__ + (poles_dim1 <<
+ 1)] + dj);
+ }
+/* L40: */
+ }
+ rwork[1] = -1.;
+ temp = dnrm2_(k, &rwork[1], &c__1);
+
+/*
+ Since B and BX are complex, the following call to DGEMV
+ is performed in two steps (real and imaginary parts).
+
+ CALL DGEMV( 'T', K, NRHS, ONE, BX, LDBX, WORK, 1, ZERO,
+ $ B( J, 1 ), LDB )
+*/
+
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ i__4 = jrow + jcol * bx_dim1;
+ rwork[i__] = bx[i__4].r;
+/* L50: */
+ }
+/* L60: */
+ }
+ dgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1);
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ rwork[i__] = d_imag(&bx[jrow + jcol * bx_dim1]);
+/* L70: */
+ }
+/* L80: */
+ }
+ dgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs],
+ &c__1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = j + jcol * b_dim1;
+ i__4 = jcol + *k;
+ i__5 = jcol + *k + *nrhs;
+ z__1.r = rwork[i__4], z__1.i = rwork[i__5];
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L90: */
+ }
+ zlascl_("G", &c__0, &c__0, &temp, &c_b1034, &c__1, nrhs, &b[j
+ + b_dim1], ldb, info);
+/* L100: */
+ }
+ }
+
+/* Move the deflated rows of BX to B also. */
+
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ zlacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1
+ + b_dim1], ldb);
+ }
+ } else {
+
+/*
+ Apply back the right orthogonal transformations.
+
+ Step (1R): apply back the new right singular vector matrix
+ to B.
+*/
+
+ if (*k == 1) {
+ zcopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx);
+ } else {
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ dsigj = poles[j + (poles_dim1 << 1)];
+ if (z__[j] == 0.) {
+ rwork[j] = 0.;
+ } else {
+ rwork[j] = -z__[j] / difl[j] / (dsigj + poles[j +
+ poles_dim1]) / difr[j + (difr_dim1 << 1)];
+ }
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.) {
+ rwork[i__] = 0.;
+ } else {
+ d__1 = -poles[i__ + 1 + (poles_dim1 << 1)];
+ rwork[i__] = z__[j] / (dlamc3_(&dsigj, &d__1) - difr[
+ i__ + difr_dim1]) / (dsigj + poles[i__ +
+ poles_dim1]) / difr[i__ + (difr_dim1 << 1)];
+ }
+/* L110: */
+ }
+ i__2 = *k;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ if (z__[j] == 0.) {
+ rwork[i__] = 0.;
+ } else {
+ d__1 = -poles[i__ + (poles_dim1 << 1)];
+ rwork[i__] = z__[j] / (dlamc3_(&dsigj, &d__1) - difl[
+ i__]) / (dsigj + poles[i__ + poles_dim1]) /
+ difr[i__ + (difr_dim1 << 1)];
+ }
+/* L120: */
+ }
+
+/*
+ Since B and BX are complex, the following call to DGEMV
+ is performed in two steps (real and imaginary parts).
+
+ CALL DGEMV( 'T', K, NRHS, ONE, B, LDB, WORK, 1, ZERO,
+ $ BX( J, 1 ), LDBX )
+*/
+
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[i__] = b[i__4].r;
+/* L130: */
+ }
+/* L140: */
+ }
+ dgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1);
+ i__ = *k + (*nrhs << 1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = *k;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++i__;
+ rwork[i__] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L150: */
+ }
+/* L160: */
+ }
+ dgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+ k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs],
+ &c__1);
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = j + jcol * bx_dim1;
+ i__4 = jcol + *k;
+ i__5 = jcol + *k + *nrhs;
+ z__1.r = rwork[i__4], z__1.i = rwork[i__5];
+ bx[i__3].r = z__1.r, bx[i__3].i = z__1.i;
+/* L170: */
+ }
+/* L180: */
+ }
+ }
+
+/*
+ Step (2R): if SQRE = 1, apply back the rotation that is
+ related to the right null space of the subproblem.
+*/
+
+ if (*sqre == 1) {
+ zcopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx);
+ zdrot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__,
+ s);
+ }
+ if (*k < max(m,n)) {
+ i__1 = n - *k;
+ zlacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 +
+ bx_dim1], ldbx);
+ }
+
+/* Step (3R): permute rows of B. */
+
+ zcopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb);
+ if (*sqre == 1) {
+ zcopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb);
+ }
+ i__1 = n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ zcopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1],
+ ldb);
+/* L190: */
+ }
+
+/* Step (4R): apply back the Givens rotations performed. */
+
+ for (i__ = *givptr; i__ >= 1; --i__) {
+ d__1 = -givnum[i__ + givnum_dim1];
+ zdrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+ b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+ (givnum_dim1 << 1)], &d__1);
+/* L200: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLALS0 */
+
+} /* zlals0_ */
+
+/* Subroutine */ int zlalsa_(integer *icompq, integer *smlsiz, integer *n,
+ integer *nrhs, doublecomplex *b, integer *ldb, doublecomplex *bx,
+ integer *ldbx, doublereal *u, integer *ldu, doublereal *vt, integer *
+ k, doublereal *difl, doublereal *difr, doublereal *z__, doublereal *
+ poles, integer *givptr, integer *givcol, integer *ldgcol, integer *
+ perm, doublereal *givnum, doublereal *c__, doublereal *s, doublereal *
+ rwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, difl_dim1,
+ difl_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset,
+ poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset,
+ z_dim1, z_offset, b_dim1, b_offset, bx_dim1, bx_offset, i__1,
+ i__2, i__3, i__4, i__5, i__6;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl,
+ ndb1, nlp1, lvl2, nrp1, jcol, nlvl, sqre, jrow, jimag;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer jreal, inode, ndiml, ndimr;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zlals0_(integer *, integer *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *, integer *, integer *,
+ integer *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *), dlasdt_(integer *, integer *, integer *
+ , integer *, integer *, integer *, integer *), xerbla_(char *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+    ZLALSA is an intermediate step in solving the least squares problem
+ by computing the SVD of the coefficient matrix in compact form (The
+    singular vectors are computed as products of simple orthogonal
+ matrices.).
+
+ If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector
+ matrix of an upper bidiagonal matrix to the right hand side; and if
+ ICOMPQ = 1, ZLALSA applies the right singular vector matrix to the
+ right hand side. The singular vector matrices were generated in
+ compact form by ZLALSA.
+
+ Arguments
+ =========
+
+ ICOMPQ (input) INTEGER
+ Specifies whether the left or the right singular vector
+ matrix is involved.
+ = 0: Left singular vector matrix
+ = 1: Right singular vector matrix
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The row and column dimensions of the upper bidiagonal matrix.
+
+ NRHS (input) INTEGER
+ The number of columns of B and BX. NRHS must be at least 1.
+
+ B (input/output) COMPLEX*16 array, dimension ( LDB, NRHS )
+ On input, B contains the right hand sides of the least
+ squares problem in rows 1 through M.
+ On output, B contains the solution X in rows 1 through N.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,MAX( M, N ) ).
+
+ BX (output) COMPLEX*16 array, dimension ( LDBX, NRHS )
+ On exit, the result of applying the left or right singular
+ vector matrix to B.
+
+ LDBX (input) INTEGER
+ The leading dimension of BX.
+
+ U (input) DOUBLE PRECISION array, dimension ( LDU, SMLSIZ ).
+ On entry, U contains the left singular vector matrices of all
+ subproblems at the bottom level.
+
+    LDU    (input) INTEGER, LDU >= N.
+ The leading dimension of arrays U, VT, DIFL, DIFR,
+ POLES, GIVNUM, and Z.
+
+ VT (input) DOUBLE PRECISION array, dimension ( LDU, SMLSIZ+1 ).
+ On entry, VT' contains the right singular vector matrices of
+ all subproblems at the bottom level.
+
+ K (input) INTEGER array, dimension ( N ).
+
+ DIFL (input) DOUBLE PRECISION array, dimension ( LDU, NLVL ).
+ where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1.
+
+ DIFR (input) DOUBLE PRECISION array, dimension ( LDU, 2 * NLVL ).
+ On entry, DIFL(*, I) and DIFR(*, 2 * I -1) record
+ distances between singular values on the I-th level and
+ singular values on the (I -1)-th level, and DIFR(*, 2 * I)
+ record the normalizing factors of the right singular vectors
+ matrices of subproblems on I-th level.
+
+ Z (input) DOUBLE PRECISION array, dimension ( LDU, NLVL ).
+ On entry, Z(1, I) contains the components of the deflation-
+ adjusted updating row vector for subproblems on the I-th
+ level.
+
+ POLES (input) DOUBLE PRECISION array, dimension ( LDU, 2 * NLVL ).
+ On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old
+ singular values involved in the secular equations on the I-th
+ level.
+
+ GIVPTR (input) INTEGER array, dimension ( N ).
+ On entry, GIVPTR( I ) records the number of Givens
+ rotations performed on the I-th problem on the computation
+ tree.
+
+ GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ).
+ On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the
+ locations of Givens rotations performed on the I-th level on
+ the computation tree.
+
+    LDGCOL (input) INTEGER, LDGCOL >= N.
+ The leading dimension of arrays GIVCOL and PERM.
+
+ PERM (input) INTEGER array, dimension ( LDGCOL, NLVL ).
+ On entry, PERM(*, I) records permutations done on the I-th
+ level of the computation tree.
+
+ GIVNUM (input) DOUBLE PRECISION array, dimension ( LDU, 2 * NLVL ).
+ On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S-
+ values of Givens rotations performed on the I-th level on the
+ computation tree.
+
+ C (input) DOUBLE PRECISION array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ C( I ) contains the C-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ S (input) DOUBLE PRECISION array, dimension ( N ).
+ On entry, if the I-th subproblem is not square,
+ S( I ) contains the S-value of a Givens rotation related to
+ the right null space of the I-th subproblem.
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension at least
+           MAX( (SMLSIZ+1)*NRHS*3, N*(1+NRHS) + 2*NRHS ).
+
+ IWORK (workspace) INTEGER array.
+ The dimension must be at least 3 * N
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
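+
+/*
+   A rough sketch of the traversal performed below: DLASDT sets up a
+   tree whose leaves are the smallest subproblems.  For ICOMPQ = 0
+   the explicit left singular vector matrices of the leaf subproblems
+   (computed by DLASDQ) are applied first, and ZLALS0 is then called
+   for the remaining subproblems while walking the tree bottom-up.
+   For ICOMPQ = 1 the order is reversed: ZLALS0 is called top-down,
+   and the explicit right singular vector matrices of the leaves are
+   applied last.
+*/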
+
+ /* Parameter adjustments */
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ bx_dim1 = *ldbx;
+ bx_offset = 1 + bx_dim1;
+ bx -= bx_offset;
+ givnum_dim1 = *ldu;
+ givnum_offset = 1 + givnum_dim1;
+ givnum -= givnum_offset;
+ poles_dim1 = *ldu;
+ poles_offset = 1 + poles_dim1;
+ poles -= poles_offset;
+ z_dim1 = *ldu;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ difr_dim1 = *ldu;
+ difr_offset = 1 + difr_dim1;
+ difr -= difr_offset;
+ difl_dim1 = *ldu;
+ difl_offset = 1 + difl_dim1;
+ difl -= difl_offset;
+ vt_dim1 = *ldu;
+ vt_offset = 1 + vt_dim1;
+ vt -= vt_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ --k;
+ --givptr;
+ perm_dim1 = *ldgcol;
+ perm_offset = 1 + perm_dim1;
+ perm -= perm_offset;
+ givcol_dim1 = *ldgcol;
+ givcol_offset = 1 + givcol_dim1;
+ givcol -= givcol_offset;
+ --c__;
+ --s;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*icompq < 0 || *icompq > 1) {
+ *info = -1;
+ } else if (*smlsiz < 3) {
+ *info = -2;
+ } else if (*n < *smlsiz) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < *n) {
+ *info = -6;
+ } else if (*ldbx < *n) {
+ *info = -8;
+ } else if (*ldu < *n) {
+ *info = -10;
+ } else if (*ldgcol < *n) {
+ *info = -19;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLALSA", &i__1);
+ return 0;
+ }
+
+/* Book-keeping and setting up the computation tree. */
+
+ inode = 1;
+ ndiml = inode + *n;
+ ndimr = ndiml + *n;
+
+ dlasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+ smlsiz);
+
+/*
+ The following code applies back the left singular vector factors.
+ For applying back the right singular vector factors, go to 170.
+*/
+
+ if (*icompq == 1) {
+ goto L170;
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by DLASDQ. The corresponding left and right singular vector
+ matrices are in explicit form. First apply back the left
+ singular vector matrices.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+ IC : center row of each node
+ NL : number of rows of left subproblem
+ NR : number of rows of right subproblem
+ NLF: starting row of the left subproblem
+ NRF: starting row of the right subproblem
+*/
+
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+
+/*
+ Since B and BX are complex, the following call to DGEMM
+ is performed in two steps (real and imaginary parts).
+
+ CALL DGEMM( 'T', 'N', NL, NRHS, NL, ONE, U( NLF, 1 ), LDU,
+ $ B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX )
+*/
+
+ j = nl * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nl - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L10: */
+ }
+/* L20: */
+ }
+ dgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, &
+ rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[1], &nl);
+ j = nl * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nl - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L30: */
+ }
+/* L40: */
+ }
+ dgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, &
+ rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[nl * *nrhs
+ + 1], &nl);
+ jreal = 0;
+ jimag = nl * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nl - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ z__1.r = rwork[i__5], z__1.i = rwork[i__6];
+ bx[i__4].r = z__1.r, bx[i__4].i = z__1.i;
+/* L50: */
+ }
+/* L60: */
+ }
+
+/*
+ Since B and BX are complex, the following call to DGEMM
+ is performed in two steps (real and imaginary parts).
+
+ CALL DGEMM( 'T', 'N', NR, NRHS, NR, ONE, U( NRF, 1 ), LDU,
+ $ B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX )
+*/
+
+ j = nr * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nr - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L70: */
+ }
+/* L80: */
+ }
+ dgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, &
+ rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[1], &nr);
+ j = nr * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nr - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L90: */
+ }
+/* L100: */
+ }
+ dgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, &
+ rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[nr * *nrhs
+ + 1], &nr);
+ jreal = 0;
+ jimag = nr * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nr - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ z__1.r = rwork[i__5], z__1.i = rwork[i__6];
+ bx[i__4].r = z__1.r, bx[i__4].i = z__1.i;
+/* L110: */
+ }
+/* L120: */
+ }
+
+/* L130: */
+ }
+
+/*
+ Next copy the rows of B that correspond to unchanged rows
+ in the bidiagonal matrix to BX.
+*/
+
+ i__1 = nd;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ic = iwork[inode + i__ - 1];
+ zcopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx);
+/* L140: */
+ }
+
+/*
+ Finally go through the left singular vector matrices of all
+ the other subproblems bottom-up on the tree.
+*/
+
+ j = pow_ii(&c__2, &nlvl);
+ sqre = 0;
+
+ for (lvl = nlvl; lvl >= 1; --lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ find the first node LF and last node LL on
+ the current level LVL
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__1 = lvl - 1;
+ lf = pow_ii(&c__2, &i__1);
+ ll = (lf << 1) - 1;
+ }
+ i__1 = ll;
+ for (i__ = lf; i__ <= i__1; ++i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ --j;
+ zlals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, &
+ b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &rwork[1], info);
+/* L150: */
+ }
+/* L160: */
+ }
+ goto L330;
+
+/* ICOMPQ = 1: applying back the right singular vector factors. */
+
+L170:
+
+/*
+    First go through the right singular vector matrices of all
+ the tree nodes top-down.
+*/
+
+ j = 0;
+ i__1 = nlvl;
+ for (lvl = 1; lvl <= i__1; ++lvl) {
+ lvl2 = (lvl << 1) - 1;
+
+/*
+ Find the first node LF and last node LL on
+ the current level LVL.
+*/
+
+ if (lvl == 1) {
+ lf = 1;
+ ll = 1;
+ } else {
+ i__2 = lvl - 1;
+ lf = pow_ii(&c__2, &i__2);
+ ll = (lf << 1) - 1;
+ }
+ i__2 = lf;
+ for (i__ = ll; i__ >= i__2; --i__) {
+ im1 = i__ - 1;
+ ic = iwork[inode + im1];
+ nl = iwork[ndiml + im1];
+ nr = iwork[ndimr + im1];
+ nlf = ic - nl;
+ nrf = ic + 1;
+ if (i__ == ll) {
+ sqre = 0;
+ } else {
+ sqre = 1;
+ }
+ ++j;
+ zlals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[
+ nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], &
+ givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+ givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+ poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+ lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+ j], &s[j], &rwork[1], info);
+/* L180: */
+ }
+/* L190: */
+ }
+
+/*
+ The nodes on the bottom level of the tree were solved
+ by DLASDQ. The corresponding right singular vector
+ matrices are in explicit form. Apply them back.
+*/
+
+ ndb1 = (nd + 1) / 2;
+ i__1 = nd;
+ for (i__ = ndb1; i__ <= i__1; ++i__) {
+ i1 = i__ - 1;
+ ic = iwork[inode + i1];
+ nl = iwork[ndiml + i1];
+ nr = iwork[ndimr + i1];
+ nlp1 = nl + 1;
+ if (i__ == nd) {
+ nrp1 = nr;
+ } else {
+ nrp1 = nr + 1;
+ }
+ nlf = ic - nl;
+ nrf = ic + 1;
+
+/*
+ Since B and BX are complex, the following call to DGEMM is
+ performed in two steps (real and imaginary parts).
+
+ CALL DGEMM( 'T', 'N', NLP1, NRHS, NLP1, ONE, VT( NLF, 1 ), LDU,
+ $ B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX )
+*/
+
+ j = nlp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nlp1 - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L200: */
+ }
+/* L210: */
+ }
+ dgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1],
+ ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[
+ 1], &nlp1);
+ j = nlp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nlp1 - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L220: */
+ }
+/* L230: */
+ }
+ dgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1],
+ ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[
+ nlp1 * *nrhs + 1], &nlp1);
+ jreal = 0;
+ jimag = nlp1 * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nlf + nlp1 - 1;
+ for (jrow = nlf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ z__1.r = rwork[i__5], z__1.i = rwork[i__6];
+ bx[i__4].r = z__1.r, bx[i__4].i = z__1.i;
+/* L240: */
+ }
+/* L250: */
+ }
+
+/*
+ Since B and BX are complex, the following call to DGEMM is
+ performed in two steps (real and imaginary parts).
+
+ CALL DGEMM( 'T', 'N', NRP1, NRHS, NRP1, ONE, VT( NRF, 1 ), LDU,
+ $ B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX )
+*/
+
+ j = nrp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nrp1 - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L260: */
+ }
+/* L270: */
+ }
+ dgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1],
+ ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[
+ 1], &nrp1);
+ j = nrp1 * *nrhs << 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nrp1 - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L280: */
+ }
+/* L290: */
+ }
+ dgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1],
+ ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[
+ nrp1 * *nrhs + 1], &nrp1);
+ jreal = 0;
+ jimag = nrp1 * *nrhs;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = nrf + nrp1 - 1;
+ for (jrow = nrf; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * bx_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ z__1.r = rwork[i__5], z__1.i = rwork[i__6];
+ bx[i__4].r = z__1.r, bx[i__4].i = z__1.i;
+/* L300: */
+ }
+/* L310: */
+ }
+
+/* L320: */
+ }
+
+L330:
+
+ return 0;
+
+/* End of ZLALSA */
+
+} /* zlalsa_ */
+
+/* Subroutine */ int zlalsd_(char *uplo, integer *smlsiz, integer *n, integer
+ *nrhs, doublereal *d__, doublereal *e, doublecomplex *b, integer *ldb,
+ doublereal *rcond, integer *rank, doublecomplex *work, doublereal *
+ rwork, integer *iwork, integer *info)
+{
+ /* System generated locals */
+ integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+ doublereal d__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer c__, i__, j, k;
+ static doublereal r__;
+ static integer s, u, z__;
+ static doublereal cs;
+ static integer bx;
+ static doublereal sn;
+ static integer st, vt, nm1, st1;
+ static doublereal eps;
+ static integer iwk;
+ static doublereal tol;
+ static integer difl, difr;
+ static doublereal rcnd;
+ static integer jcol, irwb, perm, nsub, nlvl, sqre, bxst, jrow, irwu,
+ jimag;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+ static integer jreal, irwib, poles, sizei, irwrb, nsize;
+ extern /* Subroutine */ int zdrot_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublereal *, doublereal *), zcopy_(
+ integer *, doublecomplex *, integer *, doublecomplex *, integer *)
+ ;
+ static integer irwvt, icmpq1, icmpq2;
+
+ extern /* Subroutine */ int dlasda_(integer *, integer *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *,
+ doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *, integer *, integer *,
+ doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+ integer *), dlascl_(char *, integer *, integer *, doublereal *,
+ doublereal *, integer *, integer *, doublereal *, integer *,
+ integer *);
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int dlasdq_(char *, integer *, integer *, integer
+ *, integer *, integer *, doublereal *, doublereal *, doublereal *,
+ integer *, doublereal *, integer *, doublereal *, integer *,
+ doublereal *, integer *), dlaset_(char *, integer *,
+ integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *), xerbla_(char *, integer *);
+ static integer givcol;
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int zlalsa_(integer *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+ doublereal *, integer *, doublereal *, integer *, doublereal *,
+ doublereal *, doublereal *, doublereal *, integer *, integer *,
+ integer *, integer *, doublereal *, doublereal *, doublereal *,
+ doublereal *, integer *, integer *), zlascl_(char *, integer *,
+ integer *, doublereal *, doublereal *, integer *, integer *,
+ doublecomplex *, integer *, integer *), dlasrt_(char *,
+ integer *, doublereal *, integer *), zlacpy_(char *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ integer *), zlaset_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, doublecomplex *, integer *);
+ static doublereal orgnrm;
+ static integer givnum, givptr, nrwork, irwwrk, smlszp;
+
+
+/*
+ -- LAPACK routine (version 3.2.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ June 2010
+
+
+ Purpose
+ =======
+
+ ZLALSD uses the singular value decomposition of A to solve the least
+ squares problem of finding X to minimize the Euclidean norm of each
+ column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+ are N-by-NRHS. The solution X overwrites B.
+
+ The singular values of A smaller than RCOND times the largest
+ singular value are treated as zero in solving the least squares
+ problem; in this case a minimum norm solution is returned.
+ The actual singular values are returned in D in ascending order.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': D and E define an upper bidiagonal matrix.
+ = 'L': D and E define a lower bidiagonal matrix.
+
+ SMLSIZ (input) INTEGER
+ The maximum size of the subproblems at the bottom of the
+ computation tree.
+
+ N (input) INTEGER
+ The dimension of the bidiagonal matrix. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of columns of B. NRHS must be at least 1.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry D contains the main diagonal of the bidiagonal
+ matrix. On exit, if INFO = 0, D contains its singular values.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ Contains the super-diagonal entries of the bidiagonal matrix.
+ On exit, E has been destroyed.
+
+ B (input/output) COMPLEX*16 array, dimension (LDB,NRHS)
+ On input, B contains the right hand sides of the least
+ squares problem. On output, B contains the solution X.
+
+ LDB (input) INTEGER
+ The leading dimension of B in the calling subprogram.
+ LDB must be at least max(1,N).
+
+ RCOND (input) DOUBLE PRECISION
+ The singular values of A less than or equal to RCOND times
+ the largest singular value are treated as zero in solving
+ the least squares problem. If RCOND is negative,
+ machine precision is used instead.
+ For example, if diag(S)*X=B were the least squares problem,
+ where diag(S) is a diagonal matrix of singular values, the
+ solution would be X(i) = B(i) / S(i) if S(i) is greater than
+ RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to
+ RCOND*max(S).
+
+ RANK (output) INTEGER
+ The number of singular values of A greater than RCOND times
+ the largest singular value.
+
+ WORK (workspace) COMPLEX*16 array, dimension at least
+ (N * NRHS).
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension at least
+ (9*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS +
+           MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS ) ),
+ where
+ NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+
+ IWORK (workspace) INTEGER array, dimension at least
+ (3*N*NLVL + 11*N).
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute a singular value while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through MOD(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Ming Gu and Ren-Cang Li, Computer Science Division, University of
+ California at Berkeley, USA
+ Osni Marques, LBNL/NERSC, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
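+
+/*
+   A rough sketch of the driver logic below: the matrix is first made
+   upper bidiagonal (rotating B along with it when UPLO = 'L') and
+   scaled by its largest entry (DLANST/DLASCL).  It is then split at
+   negligible entries of E into independent subproblems; each
+   subproblem is solved either directly by DLASDQ (size <= SMLSIZ) or
+   by the divide and conquer pair DLASDA/ZLALSA.  Singular values no
+   larger than
+
+      TOL = RCOND * max |D(I)|
+
+   (with machine precision substituted when RCOND is out of range)
+   are treated as zero, which determines RANK; the right singular
+   vectors are then applied back and the solution is unscaled.
+*/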
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+
+ if (*n < 0) {
+ *info = -3;
+ } else if (*nrhs < 1) {
+ *info = -4;
+ } else if (*ldb < 1 || *ldb < *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLALSD", &i__1);
+ return 0;
+ }
+
+ eps = EPSILON;
+
+/* Set up the tolerance. */
+
+ if (*rcond <= 0. || *rcond >= 1.) {
+ rcnd = eps;
+ } else {
+ rcnd = *rcond;
+ }
+
+ *rank = 0;
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ } else if (*n == 1) {
+ if (d__[1] == 0.) {
+ zlaset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+ } else {
+ *rank = 1;
+ zlascl_("G", &c__0, &c__0, &d__[1], &c_b1034, &c__1, nrhs, &b[
+ b_offset], ldb, info);
+ d__[1] = abs(d__[1]);
+ }
+ return 0;
+ }
+
+/* Rotate the matrix if it is lower bidiagonal. */
+
+ if (*(unsigned char *)uplo == 'L') {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ dlartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+ d__[i__] = r__;
+ e[i__] = sn * d__[i__ + 1];
+ d__[i__ + 1] = cs * d__[i__ + 1];
+ if (*nrhs == 1) {
+ zdrot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], &
+ c__1, &cs, &sn);
+ } else {
+ rwork[(i__ << 1) - 1] = cs;
+ rwork[i__ * 2] = sn;
+ }
+/* L10: */
+ }
+ if (*nrhs > 1) {
+ i__1 = *nrhs;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = *n - 1;
+ for (j = 1; j <= i__2; ++j) {
+ cs = rwork[(j << 1) - 1];
+ sn = rwork[j * 2];
+ zdrot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__
+ * b_dim1], &c__1, &cs, &sn);
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ }
+
+/* Scale. */
+
+ nm1 = *n - 1;
+ orgnrm = dlanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.) {
+ zlaset_("A", n, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+ return 0;
+ }
+
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, &c__1, &d__[1], n, info);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &nm1, &c__1, &e[1], &nm1,
+ info);
+
+/*
+ If N is smaller than the minimum divide size SMLSIZ, then solve
+ the problem with another solver.
+*/
+
+ if (*n <= *smlsiz) {
+ irwu = 1;
+ irwvt = irwu + *n * *n;
+ irwwrk = irwvt + *n * *n;
+ irwrb = irwwrk;
+ irwib = irwrb + *n * *nrhs;
+ irwb = irwib + *n * *nrhs;
+ dlaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwu], n);
+ dlaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwvt], n);
+ dlasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &rwork[irwvt], n,
+ &rwork[irwu], n, &rwork[irwwrk], &c__1, &rwork[irwwrk], info);
+ if (*info != 0) {
+ return 0;
+ }
+
+/*
+ In the real version, B is passed to DLASDQ and multiplied
+ internally by Q'. Here B is complex and that product is
+ computed below in two steps (real and imaginary parts).
+*/
+
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ i__3 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__3].r;
+/* L40: */
+ }
+/* L50: */
+ }
+ dgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwrb], n);
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L60: */
+ }
+/* L70: */
+ }
+ dgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwib], n);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__3 = jrow + jcol * b_dim1;
+ i__4 = jreal;
+ i__5 = jimag;
+ z__1.r = rwork[i__4], z__1.i = rwork[i__5];
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ tol = rcnd * (d__1 = d__[idamax_(n, &d__[1], &c__1)], abs(d__1));
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (d__[i__] <= tol) {
+ zlaset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[i__ + b_dim1],
+ ldb);
+ } else {
+ zlascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, &
+ b[i__ + b_dim1], ldb, info);
+ ++(*rank);
+ }
+/* L100: */
+ }
+
+/*
+ Since B is complex, the following call to DGEMM is performed
+ in two steps (real and imaginary parts). That is for V * B
+ (in the real version of the code V' is stored in WORK).
+
+ CALL DGEMM( 'T', 'N', N, NRHS, N, ONE, WORK, N, B, LDB, ZERO,
+ $ WORK( NWORK ), N )
+*/
+
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ i__3 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__3].r;
+/* L110: */
+ }
+/* L120: */
+ }
+ dgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwrb], n);
+ j = irwb - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L130: */
+ }
+/* L140: */
+ }
+ dgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb],
+ n, &c_b328, &rwork[irwib], n);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__1 = *nrhs;
+ for (jcol = 1; jcol <= i__1; ++jcol) {
+ i__2 = *n;
+ for (jrow = 1; jrow <= i__2; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__3 = jrow + jcol * b_dim1;
+ i__4 = jreal;
+ i__5 = jimag;
+ z__1.r = rwork[i__4], z__1.i = rwork[i__5];
+ b[i__3].r = z__1.r, b[i__3].i = z__1.i;
+/* L150: */
+ }
+/* L160: */
+ }
+
+/* Unscale. */
+
+ dlascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n,
+ info);
+ dlasrt_("D", n, &d__[1], info);
+ zlascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset],
+ ldb, info);
+
+ return 0;
+ }
+
+/* Book-keeping and setting up some constants. */
+
+ nlvl = (integer) (log((doublereal) (*n) / (doublereal) (*smlsiz + 1)) /
+ log(2.)) + 1;
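+
+/*
+   For illustration only: with N = 100 and SMLSIZ = 25 the expression
+   above gives NLVL = (integer) (log(100. / 26.) / log(2.)) + 1
+   = 1 + 1 = 2, i.e. two levels in the divide and conquer tree.
+*/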
+
+ smlszp = *smlsiz + 1;
+
+ u = 1;
+ vt = *smlsiz * *n + 1;
+ difl = vt + smlszp * *n;
+ difr = difl + nlvl * *n;
+ z__ = difr + (nlvl * *n << 1);
+ c__ = z__ + nlvl * *n;
+ s = c__ + *n;
+ poles = s + *n;
+ givnum = poles + (nlvl << 1) * *n;
+ nrwork = givnum + (nlvl << 1) * *n;
+ bx = 1;
+
+ irwrb = nrwork;
+ irwib = irwrb + *smlsiz * *nrhs;
+ irwb = irwib + *smlsiz * *nrhs;
+
+ sizei = *n + 1;
+ k = sizei + *n;
+ givptr = k + *n;
+ perm = givptr + *n;
+ givcol = perm + nlvl * *n;
+ iwk = givcol + (nlvl * *n << 1);
+
+ st = 1;
+ sqre = 0;
+ icmpq1 = 1;
+ icmpq2 = 0;
+ nsub = 0;
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = d__[i__], abs(d__1)) < eps) {
+ d__[i__] = d_sign(&eps, &d__[i__]);
+ }
+/* L170: */
+ }
+
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if ((d__1 = e[i__], abs(d__1)) < eps || i__ == nm1) {
+ ++nsub;
+ iwork[nsub] = st;
+
+/*
+ Subproblem found. First determine its size and then
+ apply divide and conquer on it.
+*/
+
+ if (i__ < nm1) {
+
+/* A subproblem with E(I) small for I < NM1. */
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else if ((d__1 = e[i__], abs(d__1)) >= eps) {
+
+/* A subproblem with E(NM1) not too small but I = NM1. */
+
+ nsize = *n - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ } else {
+
+/*
+                  A subproblem with E(NM1) small. This implies a
+ 1-by-1 subproblem at D(N), which is not solved
+ explicitly.
+*/
+
+ nsize = i__ - st + 1;
+ iwork[sizei + nsub - 1] = nsize;
+ ++nsub;
+ iwork[nsub] = *n;
+ iwork[sizei + nsub - 1] = 1;
+ zcopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n);
+ }
+ st1 = st - 1;
+ if (nsize == 1) {
+
+/*
+ This is a 1-by-1 subproblem and is not solved
+ explicitly.
+*/
+
+ zcopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n);
+ } else if (nsize <= *smlsiz) {
+
+/* This is a small subproblem and is solved by DLASDQ. */
+
+ dlaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[vt +
+ st1], n);
+ dlaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[u +
+ st1], n);
+ dlasdq_("U", &c__0, &nsize, &nsize, &nsize, &c__0, &d__[st], &
+ e[st], &rwork[vt + st1], n, &rwork[u + st1], n, &
+ rwork[nrwork], &c__1, &rwork[nrwork], info)
+ ;
+ if (*info != 0) {
+ return 0;
+ }
+
+/*
+ In the real version, B is passed to DLASDQ and multiplied
+ internally by Q'. Here B is complex and that product is
+ computed below in two steps (real and imaginary parts).
+*/
+
+ j = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++j;
+ i__4 = jrow + jcol * b_dim1;
+ rwork[j] = b[i__4].r;
+/* L180: */
+ }
+/* L190: */
+ }
+ dgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u +
+ st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb],
+ &nsize);
+ j = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++j;
+ rwork[j] = d_imag(&b[jrow + jcol * b_dim1]);
+/* L200: */
+ }
+/* L210: */
+ }
+ dgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u +
+ st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib],
+ &nsize);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * b_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ z__1.r = rwork[i__5], z__1.i = rwork[i__6];
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L220: */
+ }
+/* L230: */
+ }
+
+ zlacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx +
+ st1], n);
+ } else {
+
+/* A large problem. Solve it using divide and conquer. */
+
+ dlasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], &
+ rwork[u + st1], n, &rwork[vt + st1], &iwork[k + st1],
+ &rwork[difl + st1], &rwork[difr + st1], &rwork[z__ +
+ st1], &rwork[poles + st1], &iwork[givptr + st1], &
+ iwork[givcol + st1], n, &iwork[perm + st1], &rwork[
+ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &
+ rwork[nrwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ bxst = bx + st1;
+ zlalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, &
+ work[bxst], n, &rwork[u + st1], n, &rwork[vt + st1], &
+ iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1]
+ , &rwork[z__ + st1], &rwork[poles + st1], &iwork[
+ givptr + st1], &iwork[givcol + st1], n, &iwork[perm +
+ st1], &rwork[givnum + st1], &rwork[c__ + st1], &rwork[
+ s + st1], &rwork[nrwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+ st = i__ + 1;
+ }
+/* L240: */
+ }
+
+/* Apply the singular values and treat the tiny ones as zero. */
+
+ tol = rcnd * (d__1 = d__[idamax_(n, &d__[1], &c__1)], abs(d__1));
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+ Some of the elements in D can be negative because 1-by-1
+ subproblems were not solved explicitly.
+*/
+
+ if ((d__1 = d__[i__], abs(d__1)) <= tol) {
+ zlaset_("A", &c__1, nrhs, &c_b56, &c_b56, &work[bx + i__ - 1], n);
+ } else {
+ ++(*rank);
+ zlascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, &
+ work[bx + i__ - 1], n, info);
+ }
+ d__[i__] = (d__1 = d__[i__], abs(d__1));
+/* L250: */
+ }
+
+/* Now apply back the right singular vectors. */
+
+ icmpq2 = 1;
+ i__1 = nsub;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ st = iwork[i__];
+ st1 = st - 1;
+ nsize = iwork[sizei + i__ - 1];
+ bxst = bx + st1;
+ if (nsize == 1) {
+ zcopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb);
+ } else if (nsize <= *smlsiz) {
+
+/*
+ Since B and BX are complex, the following call to DGEMM
+ is performed in two steps (real and imaginary parts).
+
+ CALL DGEMM( 'T', 'N', NSIZE, NRHS, NSIZE, ONE,
+ $ RWORK( VT+ST1 ), N, RWORK( BXST ), N, ZERO,
+ $ B( ST, 1 ), LDB )
+*/
+
+ j = bxst - *n - 1;
+ jreal = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ j += *n;
+ i__3 = nsize;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++jreal;
+ i__4 = j + jrow;
+ rwork[jreal] = work[i__4].r;
+/* L260: */
+ }
+/* L270: */
+ }
+ dgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1],
+ n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb], &nsize);
+ j = bxst - *n - 1;
+ jimag = irwb - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ j += *n;
+ i__3 = nsize;
+ for (jrow = 1; jrow <= i__3; ++jrow) {
+ ++jimag;
+ rwork[jimag] = d_imag(&work[j + jrow]);
+/* L280: */
+ }
+/* L290: */
+ }
+ dgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1],
+ n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib], &nsize);
+ jreal = irwrb - 1;
+ jimag = irwib - 1;
+ i__2 = *nrhs;
+ for (jcol = 1; jcol <= i__2; ++jcol) {
+ i__3 = st + nsize - 1;
+ for (jrow = st; jrow <= i__3; ++jrow) {
+ ++jreal;
+ ++jimag;
+ i__4 = jrow + jcol * b_dim1;
+ i__5 = jreal;
+ i__6 = jimag;
+ z__1.r = rwork[i__5], z__1.i = rwork[i__6];
+ b[i__4].r = z__1.r, b[i__4].i = z__1.i;
+/* L300: */
+ }
+/* L310: */
+ }
+ } else {
+ zlalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st +
+ b_dim1], ldb, &rwork[u + st1], n, &rwork[vt + st1], &
+ iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1], &
+ rwork[z__ + st1], &rwork[poles + st1], &iwork[givptr +
+ st1], &iwork[givcol + st1], n, &iwork[perm + st1], &rwork[
+ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &rwork[
+ nrwork], &iwork[iwk], info);
+ if (*info != 0) {
+ return 0;
+ }
+ }
+/* L320: */
+ }
+
+/* Unscale and sort the singular values. */
+
+ dlascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n, info);
+ dlasrt_("D", n, &d__[1], info);
+ zlascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset], ldb,
+ info);
+
+ return 0;
+
+/* End of ZLALSD */
+
+} /* zlalsd_ */
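+
+/*
+    Illustrative sketch, assuming the f2c typedefs (integer, doublereal,
+    doublecomplex) and the dgemm_ prototype used elsewhere in this file.
+    ZLALSD above repeatedly replaces one complex GEMM by two real DGEMM
+    calls: the real and imaginary parts of the complex right-hand sides are
+    copied into real workspace, each part is multiplied by the real factor,
+    and the two results are reassembled.  The helper below is hypothetical
+    (not part of the translated routine) and expects RWORK to hold at least
+    4*n*nrhs doubles.
+*/
+#if 0   /* illustrative sketch only -- never compiled */
+static void split_complex_gemm_example(integer n, integer nrhs,
+        doublereal *vt, doublecomplex *b, doublereal *rwork)
+{
+    doublereal one = 1., zero = 0.;
+    doublereal *re = rwork, *im = re + n * nrhs;
+    doublereal *outr = im + n * nrhs, *outi = outr + n * nrhs;
+    integer i, j, k;
+
+    /* Split B (column major, leading dimension n) into real/imag parts. */
+    for (j = 0, k = 0; j < nrhs; ++j)
+        for (i = 0; i < n; ++i, ++k) {
+            re[k] = b[i + j * n].r;
+            im[k] = b[i + j * n].i;
+        }
+
+    /* Two real products in place of one complex product VT' * B. */
+    dgemm_("T", "N", &n, &nrhs, &n, &one, vt, &n, re, &n, &zero, outr, &n);
+    dgemm_("T", "N", &n, &nrhs, &n, &one, vt, &n, im, &n, &zero, outi, &n);
+
+    /* Reassemble the complex result back into B. */
+    for (j = 0, k = 0; j < nrhs; ++j)
+        for (i = 0; i < n; ++i, ++k) {
+            b[i + j * n].r = outr[k];
+            b[i + j * n].i = outi[k];
+        }
+}
+#endif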
+
+doublereal zlange_(char *norm, integer *m, integer *n, doublecomplex *a,
+ integer *lda, doublereal *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal ret_val, d__1, d__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal sum, scale;
+ extern logical lsame_(char *, char *);
+ static doublereal value;
+ extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *,
+ doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLANGE returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ complex matrix A.
+
+ Description
+ ===========
+
+ ZLANGE returns the value
+
+ ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in ZLANGE as described
+ above.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0. When M = 0,
+ ZLANGE is set to zero.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0. When N = 0,
+ ZLANGE is set to zero.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The m by n matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(M,1).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)),
+ where LWORK >= M when NORM = 'I'; otherwise, WORK is not
+ referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (min(*m,*n) == 0) {
+ value = 0.;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = z_abs(&a[i__ + j * a_dim1]);
+ value = max(d__1,d__2);
+/* L10: */
+ }
+/* L20: */
+ }
+ } else if (lsame_(norm, "O") || *(unsigned char *)
+ norm == '1') {
+
+/* Find norm1(A). */
+
+ value = 0.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.;
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ sum += z_abs(&a[i__ + j * a_dim1]);
+/* L30: */
+ }
+ value = max(value,sum);
+/* L40: */
+ }
+ } else if (lsame_(norm, "I")) {
+
+/* Find normI(A). */
+
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.;
+/* L50: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ work[i__] += z_abs(&a[i__ + j * a_dim1]);
+/* L60: */
+ }
+/* L70: */
+ }
+ value = 0.;
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = work[i__];
+ value = max(d__1,d__2);
+/* L80: */
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.;
+ sum = 1.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ zlassq_(m, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L90: */
+ }
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of ZLANGE */
+
+} /* zlange_ */
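+
+/*
+    Illustrative sketch (not part of the translated routine above) of how a
+    caller might use zlange_: WORK is referenced only for the infinity
+    norm, so a single M-element scratch array covers every NORM value.  The
+    2-by-2 matrix below is a made-up example, stored column major as
+    assumed throughout this file.
+*/
+#if 0   /* illustrative sketch only -- never compiled */
+static doublereal zlange_usage_example(void)
+{
+    static doublecomplex a[4] = {
+        {1., 0.}, {0., 2.},        /* column 1: a11, a21 */
+        {3., 0.}, {0., 4.}         /* column 2: a12, a22 */
+    };
+    integer m = 2, n = 2, lda = 2;
+    doublereal work[2];            /* needed only when NORM = 'I' */
+
+    doublereal frob = zlange_("F", &m, &n, a, &lda, work);
+    doublereal inf  = zlange_("I", &m, &n, a, &lda, work);
+    return frob + inf;
+}
+#endif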
+
+doublereal zlanhe_(char *norm, char *uplo, integer *n, doublecomplex *a,
+ integer *lda, doublereal *work)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublereal ret_val, d__1, d__2, d__3;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal sum, absa, scale;
+ extern logical lsame_(char *, char *);
+ static doublereal value;
+ extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *,
+ doublereal *, doublereal *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLANHE returns the value of the one norm, or the Frobenius norm, or
+ the infinity norm, or the element of largest absolute value of a
+ complex hermitian matrix A.
+
+ Description
+ ===========
+
+ ZLANHE returns the value
+
+ ZLANHE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+ (
+ ( norm1(A), NORM = '1', 'O' or 'o'
+ (
+ ( normI(A), NORM = 'I' or 'i'
+ (
+ ( normF(A), NORM = 'F', 'f', 'E' or 'e'
+
+ where norm1 denotes the one norm of a matrix (maximum column sum),
+ normI denotes the infinity norm of a matrix (maximum row sum) and
+ normF denotes the Frobenius norm of a matrix (square root of sum of
+ squares). Note that max(abs(A(i,j))) is not a consistent matrix norm.
+
+ Arguments
+ =========
+
+ NORM (input) CHARACTER*1
+ Specifies the value to be returned in ZLANHE as described
+ above.
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ hermitian matrix A is to be referenced.
+ = 'U': Upper triangular part of A is referenced
+ = 'L': Lower triangular part of A is referenced
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0. When N = 0, ZLANHE is
+ set to zero.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The hermitian matrix A. If UPLO = 'U', the leading n by n
+ upper triangular part of A contains the upper triangular part
+ of the matrix A, and the strictly lower triangular part of A
+ is not referenced. If UPLO = 'L', the leading n by n lower
+ triangular part of A contains the lower triangular part of
+ the matrix A, and the strictly upper triangular part of A is
+ not referenced. Note that the imaginary parts of the diagonal
+ elements need not be set and are assumed to be zero.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(N,1).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)),
+ where LWORK >= N when NORM = 'I' or '1' or 'O'; otherwise,
+ WORK is not referenced.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --work;
+
+ /* Function Body */
+ if (*n == 0) {
+ value = 0.;
+ } else if (lsame_(norm, "M")) {
+
+/* Find max(abs(A(i,j))). */
+
+ value = 0.;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = z_abs(&a[i__ + j * a_dim1]);
+ value = max(d__1,d__2);
+/* L10: */
+ }
+/* Computing MAX */
+ i__2 = j + j * a_dim1;
+ d__2 = value, d__3 = (d__1 = a[i__2].r, abs(d__1));
+ value = max(d__2,d__3);
+/* L20: */
+ }
+ } else {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = j + j * a_dim1;
+ d__2 = value, d__3 = (d__1 = a[i__2].r, abs(d__1));
+ value = max(d__2,d__3);
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = z_abs(&a[i__ + j * a_dim1]);
+ value = max(d__1,d__2);
+/* L30: */
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(norm, "I") || lsame_(norm, "O") || *(unsigned char *)norm == '1') {
+
+/* Find normI(A) ( = norm1(A), since A is hermitian). */
+
+ value = 0.;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = 0.;
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ absa = z_abs(&a[i__ + j * a_dim1]);
+ sum += absa;
+ work[i__] += absa;
+/* L50: */
+ }
+ i__2 = j + j * a_dim1;
+ work[j] = sum + (d__1 = a[i__2].r, abs(d__1));
+/* L60: */
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+ d__1 = value, d__2 = work[i__];
+ value = max(d__1,d__2);
+/* L70: */
+ }
+ } else {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ work[i__] = 0.;
+/* L80: */
+ }
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + j * a_dim1;
+ sum = work[j] + (d__1 = a[i__2].r, abs(d__1));
+ i__2 = *n;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ absa = z_abs(&a[i__ + j * a_dim1]);
+ sum += absa;
+ work[i__] += absa;
+/* L90: */
+ }
+ value = max(value,sum);
+/* L100: */
+ }
+ }
+ } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/* Find normF(A). */
+
+ scale = 0.;
+ sum = 1.;
+ if (lsame_(uplo, "U")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ i__2 = j - 1;
+ zlassq_(&i__2, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L110: */
+ }
+ } else {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n - j;
+ zlassq_(&i__2, &a[j + 1 + j * a_dim1], &c__1, &scale, &sum);
+/* L120: */
+ }
+ }
+ sum *= 2;
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ if (a[i__2].r != 0.) {
+ i__2 = i__ + i__ * a_dim1;
+ absa = (d__1 = a[i__2].r, abs(d__1));
+ if (scale < absa) {
+/* Computing 2nd power */
+ d__1 = scale / absa;
+ sum = sum * (d__1 * d__1) + 1.;
+ scale = absa;
+ } else {
+/* Computing 2nd power */
+ d__1 = absa / scale;
+ sum += d__1 * d__1;
+ }
+ }
+/* L130: */
+ }
+ value = scale * sqrt(sum);
+ }
+
+ ret_val = value;
+ return ret_val;
+
+/* End of ZLANHE */
+
+} /* zlanhe_ */
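+
+/*
+    Exposition note (not part of the translated routine above).  Because a
+    Hermitian matrix satisfies A(j,i) = conj(A(i,j)) and has a real
+    diagonal, zlanhe_ forms the Frobenius norm from a single triangle:
+    ZLASSQ accumulates the strictly upper (or lower) part, the running sum
+    is doubled, and the diagonal is folded in afterwards.  In effect it
+    uses
+
+        normF(A)^2 = 2 * sum_{i<j} |A(i,j)|^2 + sum_i A(i,i)^2.
+*/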
+
+/* Subroutine */ int zlaqr0_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh,
+ doublecomplex *w, integer *iloz, integer *ihiz, doublecomplex *z__,
+ integer *ldz, doublecomplex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6, d__7, d__8;
+ doublecomplex z__1, z__2, z__3, z__4, z__5;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal s;
+ static doublecomplex aa, bb, cc, dd;
+ static integer ld, nh, it, ks, kt, ku, kv, ls, ns, nw;
+ static doublecomplex tr2, det;
+ static integer inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl, kbot,
+ nmin;
+ static doublecomplex swap;
+ static integer ktop;
+ static doublecomplex zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int zlaqr3_(logical *, logical *, integer *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ integer *, integer *, doublecomplex *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *, integer *,
+ doublecomplex *, integer *, integer *, doublecomplex *, integer *
+ , doublecomplex *, integer *), zlaqr4_(logical *, logical *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *), zlaqr5_(logical *,
+ logical *, integer *, integer *, integer *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *, doublecomplex *, integer *,
+ integer *, doublecomplex *, integer *);
+ static integer nibble;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ static doublecomplex rtdisc;
+ static integer nwupbd;
+ static logical sorted;
+ extern /* Subroutine */ int zlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ integer *, integer *, doublecomplex *, integer *, integer *),
+ zlacpy_(char *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLAQR0 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**H, where T is an upper triangular matrix (the
+ Schur form), and Z is the unitary matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input unitary
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the unitary matrix Q: A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to ZGEBAL, and then passed to ZGEHRD when the
+ matrix output by ZGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) COMPLEX*16 array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H
+ contains the upper triangular matrix T from the Schur
+            decomposition (the Schur form).  If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ W (output) COMPLEX*16 array, dimension (N)
+ The computed eigenvalues of H(ILO:IHI,ILO:IHI) are stored
+ in W(ILO:IHI). If WANTT is .TRUE., then the eigenvalues are
+ stored in the same order as on the diagonal of the Schur
+ form returned in H, with W(i) = H(i,i).
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if WANTZ is .TRUE.
+            then LDZ.GE.MAX(1,IHIZ).  Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+ is sufficient, but LWORK typically as large as 6*N may
+ be required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then ZLAQR0 does a workspace query.
+ In this case, ZLAQR0 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, ZLAQR0 failed to compute all of
+                the eigenvalues.  Elements 1:ilo-1 and i+1:n of W
+                contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+            If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is a unitary matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+ (final value of Z(ILO:IHI,ILOZ:IHIZ)
+ = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
+
+ where U is the unitary matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . ZLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constant WILK1 is used to form the exceptional
+ . shifts. ====
+*/
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use ZLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ zlahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "ZLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+ . At this point N .GT. NTINY = 11, so there is at
+      .     least enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "ZLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to ZLAQR3 ====
+*/
+
+ i__1 = nwr + 1;
+ zlaqr3_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[h_offset],
+ ldh, n, &h__[h_offset], ldh, n, &h__[h_offset], ldh, &work[1],
+ &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(ZLAQR5, ZLAQR3) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1].r;
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ return 0;
+ }
+
+/* ==== ZLAHQR/ZLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "ZLAQR0", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "ZLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+      ==== Accumulate reflections during the sweep?  Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "ZLAQR0", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+      ==== NSMAX = the largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L80;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ i__3 = k + (k - 1) * h_dim1;
+ if (h__[i__3].r == 0. && h__[i__3].i == 0.) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+      .     size.  At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ i__2 = kwtop + (kwtop - 1) * h_dim1;
+ i__3 = kwtop - 1 + (kwtop - 2) * h_dim1;
+ if ((d__1 = h__[i__2].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ kwtop + (kwtop - 1) * h_dim1]), abs(d__2)) > (
+ d__3 = h__[i__3].r, abs(d__3)) + (d__4 = d_imag(&
+ h__[kwtop - 1 + (kwtop - 2) * h_dim1]), abs(d__4))
+ ) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+ . - an at-least-NW-but-more-is-better (NHV-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ zlaqr3_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[kv
+ + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1], ldh, &nve, &
+ h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if ZLAQR3
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . ZLAQR3 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+ i__2 = ks + 1;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ i__3 = i__;
+ i__4 = i__ + i__ * h_dim1;
+ i__5 = i__ + (i__ - 1) * h_dim1;
+ d__3 = ((d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[i__ + (i__ - 1) * h_dim1]), abs(
+ d__2))) * .75;
+ z__1.r = h__[i__4].r + d__3, z__1.i = h__[i__4].i;
+ w[i__3].r = z__1.r, w[i__3].i = z__1.i;
+ i__3 = i__ - 1;
+ i__4 = i__;
+ w[i__3].r = w[i__4].r, w[i__3].i = w[i__4].i;
+/* L30: */
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use ZLAQR4 or
+ . ZLAHQR on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ zlacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ if (ns > nmin) {
+ zlaqr4_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &w[ks], &c__1, &c__1,
+ zdum, &c__1, &work[1], lwork, &inf);
+ } else {
+ zlahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[
+ kt + h_dim1], ldh, &w[ks], &c__1, &c__1,
+ zdum, &c__1, &inf);
+ }
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. Scale to avoid
+ . overflows, underflows and subnormals.
+ . (The scale factor S can not be zero,
+ . because H(KBOT,KBOT-1) is nonzero.) ====
+*/
+
+ if (ks >= kbot) {
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ i__3 = kbot + (kbot - 1) * h_dim1;
+ i__4 = kbot - 1 + kbot * h_dim1;
+ i__5 = kbot + kbot * h_dim1;
+ s = (d__1 = h__[i__2].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[kbot - 1 + (kbot - 1) *
+ h_dim1]), abs(d__2)) + ((d__3 = h__[i__3]
+ .r, abs(d__3)) + (d__4 = d_imag(&h__[kbot
+ + (kbot - 1) * h_dim1]), abs(d__4))) + ((
+ d__5 = h__[i__4].r, abs(d__5)) + (d__6 =
+ d_imag(&h__[kbot - 1 + kbot * h_dim1]),
+ abs(d__6))) + ((d__7 = h__[i__5].r, abs(
+ d__7)) + (d__8 = d_imag(&h__[kbot + kbot *
+ h_dim1]), abs(d__8)));
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ aa.r = z__1.r, aa.i = z__1.i;
+ i__2 = kbot + (kbot - 1) * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ cc.r = z__1.r, cc.i = z__1.i;
+ i__2 = kbot - 1 + kbot * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ bb.r = z__1.r, bb.i = z__1.i;
+ i__2 = kbot + kbot * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ dd.r = z__1.r, dd.i = z__1.i;
+ z__2.r = aa.r + dd.r, z__2.i = aa.i + dd.i;
+ z__1.r = z__2.r / 2., z__1.i = z__2.i / 2.;
+ tr2.r = z__1.r, tr2.i = z__1.i;
+ z__3.r = aa.r - tr2.r, z__3.i = aa.i - tr2.i;
+ z__4.r = dd.r - tr2.r, z__4.i = dd.i - tr2.i;
+ z__2.r = z__3.r * z__4.r - z__3.i * z__4.i,
+ z__2.i = z__3.r * z__4.i + z__3.i *
+ z__4.r;
+ z__5.r = bb.r * cc.r - bb.i * cc.i, z__5.i = bb.r
+ * cc.i + bb.i * cc.r;
+ z__1.r = z__2.r - z__5.r, z__1.i = z__2.i -
+ z__5.i;
+ det.r = z__1.r, det.i = z__1.i;
+ z__2.r = -det.r, z__2.i = -det.i;
+ z_sqrt(&z__1, &z__2);
+ rtdisc.r = z__1.r, rtdisc.i = z__1.i;
+ i__2 = kbot - 1;
+ z__2.r = tr2.r + rtdisc.r, z__2.i = tr2.i +
+ rtdisc.i;
+ z__1.r = s * z__2.r, z__1.i = s * z__2.i;
+ w[i__2].r = z__1.r, w[i__2].i = z__1.i;
+ i__2 = kbot;
+ z__2.r = tr2.r - rtdisc.r, z__2.i = tr2.i -
+ rtdisc.i;
+ z__1.r = s * z__2.r, z__1.i = s * z__2.i;
+ w[i__2].r = z__1.r, w[i__2].i = z__1.i;
+
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/* ==== Sort the shifts (Helps a little) ==== */
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ i__5 = i__ + 1;
+ if ((d__1 = w[i__4].r, abs(d__1)) + (d__2 =
+ d_imag(&w[i__]), abs(d__2)) < (d__3 =
+ w[i__5].r, abs(d__3)) + (d__4 =
+ d_imag(&w[i__ + 1]), abs(d__4))) {
+ sorted = FALSE_;
+ i__4 = i__;
+ swap.r = w[i__4].r, swap.i = w[i__4].i;
+ i__4 = i__;
+ i__5 = i__ + 1;
+ w[i__4].r = w[i__5].r, w[i__4].i = w[i__5]
+ .i;
+ i__4 = i__ + 1;
+ w[i__4].r = swap.r, w[i__4].i = swap.i;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+ }
+
+/*
+ ==== If there are only two shifts, then use
+ . only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ i__2 = kbot;
+ i__3 = kbot + kbot * h_dim1;
+ z__2.r = w[i__2].r - h__[i__3].r, z__2.i = w[i__2].i -
+ h__[i__3].i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+ i__4 = kbot - 1;
+ i__5 = kbot + kbot * h_dim1;
+ z__4.r = w[i__4].r - h__[i__5].r, z__4.i = w[i__4].i -
+ h__[i__5].i;
+ z__3.r = z__4.r, z__3.i = z__4.i;
+ if ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1),
+ abs(d__2)) < (d__3 = z__3.r, abs(d__3)) + (d__4 =
+ d_imag(&z__3), abs(d__4))) {
+ i__2 = kbot - 1;
+ i__3 = kbot;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ } else {
+ i__2 = kbot;
+ i__3 = kbot - 1;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ }
+ }
+
+/*
+      ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+ . (KDU-by-NHo) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+      .     (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ zlaqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &w[ks], &
+ h__[h_offset], ldh, iloz, ihiz, &z__[z_offset], ldz, &
+ work[1], &c__3, &h__[ku + h_dim1], ldh, &nve, &h__[
+ kwv + h_dim1], ldh, &nho, &h__[ku + kwh * h_dim1],
+ ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L70:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L80:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+
+/* ==== End of ZLAQR0 ==== */
+
+ return 0;
+} /* zlaqr0_ */
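+
+/*
+    Illustrative sketch (not part of the translated routine above) of the
+    LWORK = -1 workspace-query protocol documented for ZLAQR0: call once
+    with LWORK = -1 to obtain the optimal size in WORK(1), allocate, then
+    call again to compute the Schur form.  The helper below is
+    hypothetical; it assumes <stdlib.h> for malloc/free, the f2c typedefs
+    used in this file, and caller-supplied arrays h (n-by-n Hessenberg),
+    w (eigenvalues) and z (Schur vectors), each with leading dimension n.
+*/
+#if 0   /* illustrative sketch only -- never compiled */
+static int zlaqr0_query_example(integer n, doublecomplex *h,
+                                doublecomplex *w, doublecomplex *z)
+{
+    integer ilo = 1, ihi = n, info = 0, lwork = -1;
+    doublecomplex query;
+    doublecomplex *work;
+
+    /* Phase 1: workspace query (LWORK = -1); H and Z are not accessed. */
+    zlaqr0_(&c_true, &c_true, &n, &ilo, &ihi, h, &n, w,
+            &ilo, &ihi, z, &n, &query, &lwork, &info);
+
+    lwork = (integer) query.r;
+    work = (doublecomplex *) malloc(lwork * sizeof(*work));
+    if (work == NULL) {
+        return -1;
+    }
+
+    /* Phase 2: compute the Schur form T (in H) and Schur vectors in Z. */
+    zlaqr0_(&c_true, &c_true, &n, &ilo, &ihi, h, &n, w,
+            &ilo, &ihi, z, &n, work, &lwork, &info);
+    free(work);
+    return (int) info;
+}
+#endif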
+
+/* Subroutine */ int zlaqr1_(integer *n, doublecomplex *h__, integer *ldh,
+ doublecomplex *s1, doublecomplex *s2, doublecomplex *v)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+ doublecomplex z__1, z__2, z__3, z__4, z__5, z__6, z__7, z__8;
+
+ /* Local variables */
+ static doublereal s;
+ static doublecomplex h21s, h31s;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ Given a 2-by-2 or 3-by-3 matrix H, ZLAQR1 sets v to a
+ scalar multiple of the first column of the product
+
+ (*) K = (H - s1*I)*(H - s2*I)
+
+ scaling to avoid overflows and most underflows.
+
+ This is useful for starting double implicit shift bulges
+ in the QR algorithm.
+
+
+ N (input) integer
+ Order of the matrix H. N must be either 2 or 3.
+
+ H (input) COMPLEX*16 array of dimension (LDH,N)
+ The 2-by-2 or 3-by-3 matrix H in (*).
+
+ LDH (input) integer
+ The leading dimension of H as declared in
+ the calling procedure. LDH.GE.N
+
+ S1 (input) COMPLEX*16
+ S2 S1 and S2 are the shifts defining K in (*) above.
+
+ V (output) COMPLEX*16 array of dimension N
+ A scalar multiple of the first column of the
+ matrix K in (*).
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --v;
+
+ /* Function Body */
+ if (*n == 2) {
+ i__1 = h_dim1 + 1;
+ z__2.r = h__[i__1].r - s2->r, z__2.i = h__[i__1].i - s2->i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+ i__2 = h_dim1 + 2;
+ s = (d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs(d__2)) + (
+ (d__3 = h__[i__2].r, abs(d__3)) + (d__4 = d_imag(&h__[h_dim1
+ + 2]), abs(d__4)));
+ if (s == 0.) {
+ v[1].r = 0., v[1].i = 0.;
+ v[2].r = 0., v[2].i = 0.;
+ } else {
+ i__1 = h_dim1 + 2;
+ z__1.r = h__[i__1].r / s, z__1.i = h__[i__1].i / s;
+ h21s.r = z__1.r, h21s.i = z__1.i;
+ i__1 = (h_dim1 << 1) + 1;
+ z__2.r = h21s.r * h__[i__1].r - h21s.i * h__[i__1].i, z__2.i =
+ h21s.r * h__[i__1].i + h21s.i * h__[i__1].r;
+ i__2 = h_dim1 + 1;
+ z__4.r = h__[i__2].r - s1->r, z__4.i = h__[i__2].i - s1->i;
+ i__3 = h_dim1 + 1;
+ z__6.r = h__[i__3].r - s2->r, z__6.i = h__[i__3].i - s2->i;
+ z__5.r = z__6.r / s, z__5.i = z__6.i / s;
+ z__3.r = z__4.r * z__5.r - z__4.i * z__5.i, z__3.i = z__4.r *
+ z__5.i + z__4.i * z__5.r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ v[1].r = z__1.r, v[1].i = z__1.i;
+ i__1 = h_dim1 + 1;
+ i__2 = (h_dim1 << 1) + 2;
+ z__4.r = h__[i__1].r + h__[i__2].r, z__4.i = h__[i__1].i + h__[
+ i__2].i;
+ z__3.r = z__4.r - s1->r, z__3.i = z__4.i - s1->i;
+ z__2.r = z__3.r - s2->r, z__2.i = z__3.i - s2->i;
+ z__1.r = h21s.r * z__2.r - h21s.i * z__2.i, z__1.i = h21s.r *
+ z__2.i + h21s.i * z__2.r;
+ v[2].r = z__1.r, v[2].i = z__1.i;
+ }
+ } else {
+ i__1 = h_dim1 + 1;
+ z__2.r = h__[i__1].r - s2->r, z__2.i = h__[i__1].i - s2->i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+ i__2 = h_dim1 + 2;
+ i__3 = h_dim1 + 3;
+ s = (d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs(d__2)) + (
+ (d__3 = h__[i__2].r, abs(d__3)) + (d__4 = d_imag(&h__[h_dim1
+ + 2]), abs(d__4))) + ((d__5 = h__[i__3].r, abs(d__5)) + (d__6
+ = d_imag(&h__[h_dim1 + 3]), abs(d__6)));
+ if (s == 0.) {
+ v[1].r = 0., v[1].i = 0.;
+ v[2].r = 0., v[2].i = 0.;
+ v[3].r = 0., v[3].i = 0.;
+ } else {
+ i__1 = h_dim1 + 2;
+ z__1.r = h__[i__1].r / s, z__1.i = h__[i__1].i / s;
+ h21s.r = z__1.r, h21s.i = z__1.i;
+ i__1 = h_dim1 + 3;
+ z__1.r = h__[i__1].r / s, z__1.i = h__[i__1].i / s;
+ h31s.r = z__1.r, h31s.i = z__1.i;
+ i__1 = h_dim1 + 1;
+ z__4.r = h__[i__1].r - s1->r, z__4.i = h__[i__1].i - s1->i;
+ i__2 = h_dim1 + 1;
+ z__6.r = h__[i__2].r - s2->r, z__6.i = h__[i__2].i - s2->i;
+ z__5.r = z__6.r / s, z__5.i = z__6.i / s;
+ z__3.r = z__4.r * z__5.r - z__4.i * z__5.i, z__3.i = z__4.r *
+ z__5.i + z__4.i * z__5.r;
+ i__3 = (h_dim1 << 1) + 1;
+ z__7.r = h__[i__3].r * h21s.r - h__[i__3].i * h21s.i, z__7.i =
+ h__[i__3].r * h21s.i + h__[i__3].i * h21s.r;
+ z__2.r = z__3.r + z__7.r, z__2.i = z__3.i + z__7.i;
+ i__4 = h_dim1 * 3 + 1;
+ z__8.r = h__[i__4].r * h31s.r - h__[i__4].i * h31s.i, z__8.i =
+ h__[i__4].r * h31s.i + h__[i__4].i * h31s.r;
+ z__1.r = z__2.r + z__8.r, z__1.i = z__2.i + z__8.i;
+ v[1].r = z__1.r, v[1].i = z__1.i;
+ i__1 = h_dim1 + 1;
+ i__2 = (h_dim1 << 1) + 2;
+ z__5.r = h__[i__1].r + h__[i__2].r, z__5.i = h__[i__1].i + h__[
+ i__2].i;
+ z__4.r = z__5.r - s1->r, z__4.i = z__5.i - s1->i;
+ z__3.r = z__4.r - s2->r, z__3.i = z__4.i - s2->i;
+ z__2.r = h21s.r * z__3.r - h21s.i * z__3.i, z__2.i = h21s.r *
+ z__3.i + h21s.i * z__3.r;
+ i__3 = h_dim1 * 3 + 2;
+ z__6.r = h__[i__3].r * h31s.r - h__[i__3].i * h31s.i, z__6.i =
+ h__[i__3].r * h31s.i + h__[i__3].i * h31s.r;
+ z__1.r = z__2.r + z__6.r, z__1.i = z__2.i + z__6.i;
+ v[2].r = z__1.r, v[2].i = z__1.i;
+ i__1 = h_dim1 + 1;
+ i__2 = h_dim1 * 3 + 3;
+ z__5.r = h__[i__1].r + h__[i__2].r, z__5.i = h__[i__1].i + h__[
+ i__2].i;
+ z__4.r = z__5.r - s1->r, z__4.i = z__5.i - s1->i;
+ z__3.r = z__4.r - s2->r, z__3.i = z__4.i - s2->i;
+ z__2.r = h31s.r * z__3.r - h31s.i * z__3.i, z__2.i = h31s.r *
+ z__3.i + h31s.i * z__3.r;
+ i__3 = (h_dim1 << 1) + 3;
+ z__6.r = h21s.r * h__[i__3].r - h21s.i * h__[i__3].i, z__6.i =
+ h21s.r * h__[i__3].i + h21s.i * h__[i__3].r;
+ z__1.r = z__2.r + z__6.r, z__1.i = z__2.i + z__6.i;
+ v[3].r = z__1.r, v[3].i = z__1.i;
+ }
+ }
+ return 0;
+} /* zlaqr1_ */
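+
+/*
+    Exposition note (not part of the translated routine above).  For the
+    2-by-2 case, the first column of K = (H - s1*I)*(H - s2*I) is
+
+        K(1,1) = (h11 - s1)*(h11 - s2) + h12*h21
+        K(2,1) = h21*(h11 + h22 - s1 - s2),
+
+    and zlaqr1_ returns this column divided by s = |h11 - s2| + |h21|,
+    which is why V is described only as a scalar multiple of K(:,1): the
+    scaling exists solely to avoid overflow and most underflows.
+*/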
+
+/* Subroutine */ int zlaqr2_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, doublecomplex *h__,
+ integer *ldh, integer *iloz, integer *ihiz, doublecomplex *z__,
+ integer *ldz, integer *ns, integer *nd, doublecomplex *sh,
+ doublecomplex *v, integer *ldv, integer *nh, doublecomplex *t,
+ integer *ldt, integer *nv, doublecomplex *wv, integer *ldwv,
+ doublecomplex *work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublecomplex s;
+ static integer jw;
+ static doublereal foo;
+ static integer kln;
+ static doublecomplex tau;
+ static integer knt;
+ static doublereal ulp;
+ static integer lwk1, lwk2;
+ static doublecomplex beta;
+ static integer kcol, info, ifst, ilst, ltop, krow;
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *);
+ static integer infqr;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer kwtop;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), dlabad_(doublereal *, doublereal *);
+
+ static doublereal safmin, safmax;
+ extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *), zlarfg_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *), zlahqr_(logical *,
+ logical *, integer *, integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *, integer *, doublecomplex *,
+ integer *, integer *), zlacpy_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *),
+ zlaset_(char *, integer *, integer *, doublecomplex *,
+ doublecomplex *, doublecomplex *, integer *);
+ static doublereal smlnum;
+ extern /* Subroutine */ int ztrexc_(char *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *, integer *, integer *,
+ integer *);
+ static integer lwkopt;
+ extern /* Subroutine */ int zunmhr_(char *, char *, integer *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *
+ );
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- April 2009 --
+
+
+ This subroutine is identical to ZLAQR3 except that it avoids
+ recursion by calling ZLAHQR instead of ZLAQR4.
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+    H and performs a unitary similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+    a unitary similarity transformation of H.  It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the unitary matrix Z is updated so
+          that the unitary Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the unitary matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) COMPLEX*16 array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by a unitary
+          similarity transformation, perturbed, and then returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the unitary
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+          The number of unconverged (i.e., approximate) eigenvalues
+ returned in SR and SI that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SH (output) COMPLEX*16 array, dimension KBOT
+ On output, approximate eigenvalues that may
+ be used for shifts are stored in SH(KBOT-ND-NS+1)
+          through SH(KBOT-ND).  Converged eigenvalues are
+ stored in SH(KBOT-ND+1) through SH(KBOT).
+
+ V (workspace) COMPLEX*16 array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) COMPLEX*16 array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) COMPLEX*16 array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+          The leading dimension of WV just as declared in the
+          calling subroutine.  NW .LE. LDWV.
+
+ WORK (workspace) COMPLEX*16 array, dimension LWORK.
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; ZLAQR2
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z are accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sh;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to ZGEHRD ==== */
+
+ i__1 = jw - 1;
+ zgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1].r;
+
+/* ==== Workspace query call to ZUNMHR ==== */
+
+ i__1 = jw - 1;
+ zunmhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1].r;
+
+/* ==== Optimal workspace ==== */
+
+ lwkopt = jw + max(lwk1,lwk2);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1].r = 1., work[1].i = 0.;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s.r = 0., s.i = 0.;
+ } else {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ s.r = h__[i__1].r, s.i = h__[i__1].i;
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ i__1 = kwtop;
+ i__2 = kwtop + kwtop * h_dim1;
+ sh[i__1].r = h__[i__2].r, sh[i__1].i = h__[i__2].i;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ i__1 = kwtop + kwtop * h_dim1;
+ d__5 = smlnum, d__6 = ulp * ((d__1 = h__[i__1].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[kwtop + kwtop * h_dim1]), abs(d__2)));
+ if ((d__3 = s.r, abs(d__3)) + (d__4 = d_imag(&s), abs(d__4)) <= max(
+ d__5,d__6)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ h__[i__1].r = 0., h__[i__1].i = 0.;
+ }
+ }
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ zlacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ zcopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ zlaset_("A", &jw, &jw, &c_b56, &c_b57, &v[v_offset], ldv);
+ zlahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[kwtop],
+ &c__1, &jw, &v[v_offset], ldv, &infqr);
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+ i__1 = jw;
+ for (knt = infqr + 1; knt <= i__1; ++knt) {
+
+/* ==== Small spike tip deflation test ==== */
+
+ i__2 = *ns + *ns * t_dim1;
+ foo = (d__1 = t[i__2].r, abs(d__1)) + (d__2 = d_imag(&t[*ns + *ns *
+ t_dim1]), abs(d__2));
+ if (foo == 0.) {
+ foo = (d__1 = s.r, abs(d__1)) + (d__2 = d_imag(&s), abs(d__2));
+ }
+ i__2 = *ns * v_dim1 + 1;
+/* Computing MAX */
+ d__5 = smlnum, d__6 = ulp * foo;
+ if (((d__1 = s.r, abs(d__1)) + (d__2 = d_imag(&s), abs(d__2))) * ((
+ d__3 = v[i__2].r, abs(d__3)) + (d__4 = d_imag(&v[*ns * v_dim1
+ + 1]), abs(d__4))) <= max(d__5,d__6)) {
+
+/* ==== One more converged eigenvalue ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== One undeflatable eigenvalue. Move it up out of the
+ . way. (ZTREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ ztrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst, &
+ ilst, &info);
+ ++ilst;
+ }
+/* L10: */
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s.r = 0., s.i = 0.;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting the diagonal of T improves accuracy for
+ . graded matrices. ====
+*/
+
+ i__1 = *ns;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ ifst = i__;
+ i__2 = *ns;
+ for (j = i__ + 1; j <= i__2; ++j) {
+ i__3 = j + j * t_dim1;
+ i__4 = ifst + ifst * t_dim1;
+ if ((d__1 = t[i__3].r, abs(d__1)) + (d__2 = d_imag(&t[j + j *
+ t_dim1]), abs(d__2)) > (d__3 = t[i__4].r, abs(d__3))
+ + (d__4 = d_imag(&t[ifst + ifst * t_dim1]), abs(d__4))
+ ) {
+ ifst = j;
+ }
+/* L20: */
+ }
+ ilst = i__;
+ if (ifst != ilst) {
+ ztrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &info);
+ }
+/* L30: */
+ }
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__1 = jw;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ i__2 = kwtop + i__ - 1;
+ i__3 = i__ + i__ * t_dim1;
+ sh[i__2].r = t[i__3].r, sh[i__2].i = t[i__3].i;
+/* L40: */
+ }
+
+
+ if (*ns < jw || s.r == 0. && s.i == 0.) {
+ if (*ns > 1 && (s.r != 0. || s.i != 0.)) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ zcopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ i__1 = *ns;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ d_cnjg(&z__1, &work[i__]);
+ work[i__2].r = z__1.r, work[i__2].i = z__1.i;
+/* L50: */
+ }
+ beta.r = work[1].r, beta.i = work[1].i;
+ zlarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1].r = 1., work[1].i = 0.;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ zlaset_("L", &i__1, &i__2, &c_b56, &c_b56, &t[t_dim1 + 3], ldt);
+
+ d_cnjg(&z__1, &tau);
+ zlarf_("L", ns, &jw, &work[1], &c__1, &z__1, &t[t_offset], ldt, &
+ work[jw + 1]);
+ zlarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ zlarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ zgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ d_cnjg(&z__2, &v[v_dim1 + 1]);
+ z__1.r = s.r * z__2.r - s.i * z__2.i, z__1.i = s.r * z__2.i + s.i
+ * z__2.r;
+ h__[i__1].r = z__1.r, h__[i__1].i = z__1.i;
+ }
+ zlacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ zcopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+    ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && (s.r != 0. || s.i != 0.)) {
+ i__1 = *lwork - jw;
+ zunmhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ zgemm_("N", "N", &kln, &jw, &jw, &c_b57, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b56, &wv[wv_offset],
+ ldwv);
+ zlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L60: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ zgemm_("C", "N", &jw, &kln, &jw, &c_b57, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b56, &t[t_offset],
+ ldt);
+ zlacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L70: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ zgemm_("N", "N", &kln, &jw, &jw, &c_b57, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b56, &wv[
+ wv_offset], ldwv);
+ zlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L80: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+
+/* ==== End of ZLAQR2 ==== */
+
+ return 0;
+} /* zlaqr2_ */
+
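+/*
+    ==== Illustrative sketch, not part of the translated LAPACK source
+    .    above: the small spike tip deflation test from the loop in
+    .    ZLAQR2, restated with C99 complex arithmetic for readability.
+    .    The helper names are invented for this example, and the block
+    .    is fenced with #if 0 so it never takes part in the build. ====
+*/
+#if 0
+#include <math.h>
+#include <complex.h>
+
+/* |Re z| + |Im z|, the cheap absolute value used throughout ZLAQR2/3. */
+static double cabs1_example(double complex z)
+{
+    return fabs(creal(z)) + fabs(cimag(z));
+}
+
+/* Nonzero when the eigenvalue in position NS of the deflation window
+   can be deflated: the spike contribution s * v(1,ns) is negligible
+   compared with t(ns,ns), or with |s| itself if that entry is zero. */
+static int spike_tip_deflatable(double complex s, double complex v1ns,
+                                double complex tnsns,
+                                double ulp, double smlnum)
+{
+    double foo = cabs1_example(tnsns);
+    if (foo == 0.) {
+        foo = cabs1_example(s);
+    }
+    return cabs1_example(s) * cabs1_example(v1ns) <= fmax(smlnum, ulp * foo);
+}
+#endif
+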
+/* Subroutine */ int zlaqr3_(logical *wantt, logical *wantz, integer *n,
+ integer *ktop, integer *kbot, integer *nw, doublecomplex *h__,
+ integer *ldh, integer *iloz, integer *ihiz, doublecomplex *z__,
+ integer *ldz, integer *ns, integer *nd, doublecomplex *sh,
+ doublecomplex *v, integer *ldv, integer *nh, doublecomplex *t,
+ integer *ldt, integer *nv, doublecomplex *wv, integer *ldwv,
+ doublecomplex *work, integer *lwork)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1,
+ wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublecomplex s;
+ static integer jw;
+ static doublereal foo;
+ static integer kln;
+ static doublecomplex tau;
+ static integer knt;
+ static doublereal ulp;
+ static integer lwk1, lwk2, lwk3;
+ static doublecomplex beta;
+ static integer kcol, info, nmin, ifst, ilst, ltop, krow;
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *);
+ static integer infqr;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer kwtop;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), dlabad_(doublereal *, doublereal *),
+ zlaqr4_(logical *, logical *, integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *
+ );
+
+ static doublereal safmin;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static doublereal safmax;
+ extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *), zlarfg_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *), zlahqr_(logical *,
+ logical *, integer *, integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *, integer *, doublecomplex *,
+ integer *, integer *), zlacpy_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *),
+ zlaset_(char *, integer *, integer *, doublecomplex *,
+ doublecomplex *, doublecomplex *, integer *);
+ static doublereal smlnum;
+ extern /* Subroutine */ int ztrexc_(char *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *, integer *, integer *,
+ integer *);
+ static integer lwkopt;
+ extern /* Subroutine */ int zunmhr_(char *, char *, integer *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *
+ );
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2.1) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ -- April 2009 --
+
+
+ ******************************************************************
+ Aggressive early deflation:
+
+ This subroutine accepts as input an upper Hessenberg matrix
+    H and performs a unitary similarity transformation
+ designed to detect and deflate fully converged eigenvalues from
+ a trailing principal submatrix. On output H has been over-
+ written by a new Hessenberg matrix that is a perturbation of
+    a unitary similarity transformation of H. It is to be
+ hoped that the final version of H has many zero subdiagonal
+ entries.
+
+ ******************************************************************
+ WANTT (input) LOGICAL
+ If .TRUE., then the Hessenberg matrix H is fully updated
+ so that the triangular Schur factor may be
+ computed (in cooperation with the calling subroutine).
+ If .FALSE., then only enough of H is updated to preserve
+ the eigenvalues.
+
+ WANTZ (input) LOGICAL
+ If .TRUE., then the unitary matrix Z is updated so
+        that the unitary Schur factor may be computed
+ (in cooperation with the calling subroutine).
+ If .FALSE., then Z is not referenced.
+
+ N (input) INTEGER
+ The order of the matrix H and (if WANTZ is .TRUE.) the
+ order of the unitary matrix Z.
+
+ KTOP (input) INTEGER
+ It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0.
+ KBOT and KTOP together determine an isolated block
+ along the diagonal of the Hessenberg matrix.
+
+ KBOT (input) INTEGER
+ It is assumed without a check that either
+ KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together
+ determine an isolated block along the diagonal of the
+ Hessenberg matrix.
+
+ NW (input) INTEGER
+ Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1).
+
+ H (input/output) COMPLEX*16 array, dimension (LDH,N)
+ On input the initial N-by-N section of H stores the
+ Hessenberg matrix undergoing aggressive early deflation.
+ On output H has been transformed by a unitary
+        similarity transformation, perturbed, and then returned
+ to Hessenberg form that (it is to be hoped) has some
+ zero subdiagonal entries.
+
+ LDH (input) integer
+ Leading dimension of H just as declared in the calling
+ subroutine. N .LE. LDH
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,N)
+ IF WANTZ is .TRUE., then on output, the unitary
+ similarity transformation mentioned above has been
+ accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ is .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer
+ The leading dimension of Z just as declared in the
+ calling subroutine. 1 .LE. LDZ.
+
+ NS (output) integer
+        The number of unconverged (i.e., approximate) eigenvalues
+        returned in SH that may be used as shifts by the
+ calling subroutine.
+
+ ND (output) integer
+ The number of converged eigenvalues uncovered by this
+ subroutine.
+
+ SH (output) COMPLEX*16 array, dimension KBOT
+ On output, approximate eigenvalues that may
+ be used for shifts are stored in SH(KBOT-ND-NS+1)
+        through SH(KBOT-ND). Converged eigenvalues are
+ stored in SH(KBOT-ND+1) through SH(KBOT).
+
+ V (workspace) COMPLEX*16 array, dimension (LDV,NW)
+ An NW-by-NW work array.
+
+ LDV (input) integer scalar
+ The leading dimension of V just as declared in the
+ calling subroutine. NW .LE. LDV
+
+ NH (input) integer scalar
+ The number of columns of T. NH.GE.NW.
+
+ T (workspace) COMPLEX*16 array, dimension (LDT,NW)
+
+ LDT (input) integer
+ The leading dimension of T just as declared in the
+ calling subroutine. NW .LE. LDT
+
+ NV (input) integer
+ The number of rows of work array WV available for
+ workspace. NV.GE.NW.
+
+ WV (workspace) COMPLEX*16 array, dimension (LDWV,NW)
+
+ LDWV (input) integer
+        The leading dimension of WV just as declared in the
+        calling subroutine. NW .LE. LDWV
+
+ WORK (workspace) COMPLEX*16 array, dimension LWORK.
+ On exit, WORK(1) is set to an estimate of the optimal value
+ of LWORK for the given values of N, NW, KTOP and KBOT.
+
+ LWORK (input) integer
+ The dimension of the work array WORK. LWORK = 2*NW
+ suffices, but greater efficiency may result from larger
+ values of LWORK.
+
+ If LWORK = -1, then a workspace query is assumed; ZLAQR3
+ only estimates the optimal workspace size for the given
+ values of N, NW, KTOP and KBOT. The estimate is returned
+ in WORK(1). No error message related to LWORK is issued
+ by XERBLA. Neither H nor Z are accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+
+ ==== Estimate optimal workspace. ====
+*/
+
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --sh;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ --work;
+
+ /* Function Body */
+/* Computing MIN */
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ if (jw <= 2) {
+ lwkopt = 1;
+ } else {
+
+/* ==== Workspace query call to ZGEHRD ==== */
+
+ i__1 = jw - 1;
+ zgehrd_(&jw, &c__1, &i__1, &t[t_offset], ldt, &work[1], &work[1], &
+ c_n1, &info);
+ lwk1 = (integer) work[1].r;
+
+/* ==== Workspace query call to ZUNMHR ==== */
+
+ i__1 = jw - 1;
+ zunmhr_("R", "N", &jw, &jw, &c__1, &i__1, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[1], &c_n1, &info);
+ lwk2 = (integer) work[1].r;
+
+/* ==== Workspace query call to ZLAQR4 ==== */
+
+ zlaqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[1],
+ &c__1, &jw, &v[v_offset], ldv, &work[1], &c_n1, &infqr);
+ lwk3 = (integer) work[1].r;
+
+/*
+ ==== Optimal workspace ====
+
+ Computing MAX
+*/
+ i__1 = jw + max(lwk1,lwk2);
+ lwkopt = max(i__1,lwk3);
+ }
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ return 0;
+ }
+
+/*
+ ==== Nothing to do ...
+ ... for an empty active block ... ====
+*/
+ *ns = 0;
+ *nd = 0;
+ work[1].r = 1., work[1].i = 0.;
+ if (*ktop > *kbot) {
+ return 0;
+ }
+/* ... nor for an empty deflation window. ==== */
+ if (*nw < 1) {
+ return 0;
+ }
+
+/* ==== Machine constants ==== */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) (*n) / ulp);
+
+/*
+ ==== Setup deflation window ====
+
+ Computing MIN
+*/
+ i__1 = *nw, i__2 = *kbot - *ktop + 1;
+ jw = min(i__1,i__2);
+ kwtop = *kbot - jw + 1;
+ if (kwtop == *ktop) {
+ s.r = 0., s.i = 0.;
+ } else {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ s.r = h__[i__1].r, s.i = h__[i__1].i;
+ }
+
+ if (*kbot == kwtop) {
+
+/* ==== 1-by-1 deflation window: not much to do ==== */
+
+ i__1 = kwtop;
+ i__2 = kwtop + kwtop * h_dim1;
+ sh[i__1].r = h__[i__2].r, sh[i__1].i = h__[i__2].i;
+ *ns = 1;
+ *nd = 0;
+/* Computing MAX */
+ i__1 = kwtop + kwtop * h_dim1;
+ d__5 = smlnum, d__6 = ulp * ((d__1 = h__[i__1].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[kwtop + kwtop * h_dim1]), abs(d__2)));
+ if ((d__3 = s.r, abs(d__3)) + (d__4 = d_imag(&s), abs(d__4)) <= max(
+ d__5,d__6)) {
+ *ns = 0;
+ *nd = 1;
+ if (kwtop > *ktop) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ h__[i__1].r = 0., h__[i__1].i = 0.;
+ }
+ }
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+/*
+ ==== Convert to spike-triangular form. (In case of a
+ . rare QR failure, this routine continues to do
+ . aggressive early deflation using that part of
+ . the deflation window that converged using INFQR
+ . here and there to keep track.) ====
+*/
+
+ zlacpy_("U", &jw, &jw, &h__[kwtop + kwtop * h_dim1], ldh, &t[t_offset],
+ ldt);
+ i__1 = jw - 1;
+ i__2 = *ldh + 1;
+ i__3 = *ldt + 1;
+ zcopy_(&i__1, &h__[kwtop + 1 + kwtop * h_dim1], &i__2, &t[t_dim1 + 2], &
+ i__3);
+
+ zlaset_("A", &jw, &jw, &c_b56, &c_b57, &v[v_offset], ldv);
+ nmin = ilaenv_(&c__12, "ZLAQR3", "SV", &jw, &c__1, &jw, lwork, (ftnlen)6,
+ (ftnlen)2);
+ if (jw > nmin) {
+ zlaqr4_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[
+ kwtop], &c__1, &jw, &v[v_offset], ldv, &work[1], lwork, &
+ infqr);
+ } else {
+ zlahqr_(&c_true, &c_true, &jw, &c__1, &jw, &t[t_offset], ldt, &sh[
+ kwtop], &c__1, &jw, &v[v_offset], ldv, &infqr);
+ }
+
+/* ==== Deflation detection loop ==== */
+
+ *ns = jw;
+ ilst = infqr + 1;
+ i__1 = jw;
+ for (knt = infqr + 1; knt <= i__1; ++knt) {
+
+/* ==== Small spike tip deflation test ==== */
+
+ i__2 = *ns + *ns * t_dim1;
+ foo = (d__1 = t[i__2].r, abs(d__1)) + (d__2 = d_imag(&t[*ns + *ns *
+ t_dim1]), abs(d__2));
+ if (foo == 0.) {
+ foo = (d__1 = s.r, abs(d__1)) + (d__2 = d_imag(&s), abs(d__2));
+ }
+ i__2 = *ns * v_dim1 + 1;
+/* Computing MAX */
+ d__5 = smlnum, d__6 = ulp * foo;
+ if (((d__1 = s.r, abs(d__1)) + (d__2 = d_imag(&s), abs(d__2))) * ((
+ d__3 = v[i__2].r, abs(d__3)) + (d__4 = d_imag(&v[*ns * v_dim1
+ + 1]), abs(d__4))) <= max(d__5,d__6)) {
+
+/* ==== One more converged eigenvalue ==== */
+
+ --(*ns);
+ } else {
+
+/*
+ ==== One undeflatable eigenvalue. Move it up out of the
+ . way. (ZTREXC can not fail in this case.) ====
+*/
+
+ ifst = *ns;
+ ztrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst, &
+ ilst, &info);
+ ++ilst;
+ }
+/* L10: */
+ }
+
+/* ==== Return to Hessenberg form ==== */
+
+ if (*ns == 0) {
+ s.r = 0., s.i = 0.;
+ }
+
+ if (*ns < jw) {
+
+/*
+ ==== sorting the diagonal of T improves accuracy for
+ . graded matrices. ====
+*/
+
+ i__1 = *ns;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ ifst = i__;
+ i__2 = *ns;
+ for (j = i__ + 1; j <= i__2; ++j) {
+ i__3 = j + j * t_dim1;
+ i__4 = ifst + ifst * t_dim1;
+ if ((d__1 = t[i__3].r, abs(d__1)) + (d__2 = d_imag(&t[j + j *
+ t_dim1]), abs(d__2)) > (d__3 = t[i__4].r, abs(d__3))
+ + (d__4 = d_imag(&t[ifst + ifst * t_dim1]), abs(d__4))
+ ) {
+ ifst = j;
+ }
+/* L20: */
+ }
+ ilst = i__;
+ if (ifst != ilst) {
+ ztrexc_("V", &jw, &t[t_offset], ldt, &v[v_offset], ldv, &ifst,
+ &ilst, &info);
+ }
+/* L30: */
+ }
+ }
+
+/* ==== Restore shift/eigenvalue array from T ==== */
+
+ i__1 = jw;
+ for (i__ = infqr + 1; i__ <= i__1; ++i__) {
+ i__2 = kwtop + i__ - 1;
+ i__3 = i__ + i__ * t_dim1;
+ sh[i__2].r = t[i__3].r, sh[i__2].i = t[i__3].i;
+/* L40: */
+ }
+
+
+ if (*ns < jw || s.r == 0. && s.i == 0.) {
+ if (*ns > 1 && (s.r != 0. || s.i != 0.)) {
+
+/* ==== Reflect spike back into lower triangle ==== */
+
+ zcopy_(ns, &v[v_offset], ldv, &work[1], &c__1);
+ i__1 = *ns;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ d_cnjg(&z__1, &work[i__]);
+ work[i__2].r = z__1.r, work[i__2].i = z__1.i;
+/* L50: */
+ }
+ beta.r = work[1].r, beta.i = work[1].i;
+ zlarfg_(ns, &beta, &work[2], &c__1, &tau);
+ work[1].r = 1., work[1].i = 0.;
+
+ i__1 = jw - 2;
+ i__2 = jw - 2;
+ zlaset_("L", &i__1, &i__2, &c_b56, &c_b56, &t[t_dim1 + 3], ldt);
+
+ d_cnjg(&z__1, &tau);
+ zlarf_("L", ns, &jw, &work[1], &c__1, &z__1, &t[t_offset], ldt, &
+ work[jw + 1]);
+ zlarf_("R", ns, ns, &work[1], &c__1, &tau, &t[t_offset], ldt, &
+ work[jw + 1]);
+ zlarf_("R", &jw, ns, &work[1], &c__1, &tau, &v[v_offset], ldv, &
+ work[jw + 1]);
+
+ i__1 = *lwork - jw;
+ zgehrd_(&jw, &c__1, ns, &t[t_offset], ldt, &work[1], &work[jw + 1]
+ , &i__1, &info);
+ }
+
+/* ==== Copy updated reduced window into place ==== */
+
+ if (kwtop > 1) {
+ i__1 = kwtop + (kwtop - 1) * h_dim1;
+ d_cnjg(&z__2, &v[v_dim1 + 1]);
+ z__1.r = s.r * z__2.r - s.i * z__2.i, z__1.i = s.r * z__2.i + s.i
+ * z__2.r;
+ h__[i__1].r = z__1.r, h__[i__1].i = z__1.i;
+ }
+ zlacpy_("U", &jw, &jw, &t[t_offset], ldt, &h__[kwtop + kwtop * h_dim1]
+ , ldh);
+ i__1 = jw - 1;
+ i__2 = *ldt + 1;
+ i__3 = *ldh + 1;
+ zcopy_(&i__1, &t[t_dim1 + 2], &i__2, &h__[kwtop + 1 + kwtop * h_dim1],
+ &i__3);
+
+/*
+    ==== Accumulate orthogonal matrix in order to update
+ . H and Z, if requested. ====
+*/
+
+ if (*ns > 1 && (s.r != 0. || s.i != 0.)) {
+ i__1 = *lwork - jw;
+ zunmhr_("R", "N", &jw, ns, &c__1, ns, &t[t_offset], ldt, &work[1],
+ &v[v_offset], ldv, &work[jw + 1], &i__1, &info);
+ }
+
+/* ==== Update vertical slab in H ==== */
+
+ if (*wantt) {
+ ltop = 1;
+ } else {
+ ltop = *ktop;
+ }
+ i__1 = kwtop - 1;
+ i__2 = *nv;
+ for (krow = ltop; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = kwtop - krow;
+ kln = min(i__3,i__4);
+ zgemm_("N", "N", &kln, &jw, &jw, &c_b57, &h__[krow + kwtop *
+ h_dim1], ldh, &v[v_offset], ldv, &c_b56, &wv[wv_offset],
+ ldwv);
+ zlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &h__[krow + kwtop *
+ h_dim1], ldh);
+/* L60: */
+ }
+
+/* ==== Update horizontal slab in H ==== */
+
+ if (*wantt) {
+ i__2 = *n;
+ i__1 = *nh;
+ for (kcol = *kbot + 1; i__1 < 0 ? kcol >= i__2 : kcol <= i__2;
+ kcol += i__1) {
+/* Computing MIN */
+ i__3 = *nh, i__4 = *n - kcol + 1;
+ kln = min(i__3,i__4);
+ zgemm_("C", "N", &jw, &kln, &jw, &c_b57, &v[v_offset], ldv, &
+ h__[kwtop + kcol * h_dim1], ldh, &c_b56, &t[t_offset],
+ ldt);
+ zlacpy_("A", &jw, &kln, &t[t_offset], ldt, &h__[kwtop + kcol *
+ h_dim1], ldh);
+/* L70: */
+ }
+ }
+
+/* ==== Update vertical slab in Z ==== */
+
+ if (*wantz) {
+ i__1 = *ihiz;
+ i__2 = *nv;
+ for (krow = *iloz; i__2 < 0 ? krow >= i__1 : krow <= i__1; krow +=
+ i__2) {
+/* Computing MIN */
+ i__3 = *nv, i__4 = *ihiz - krow + 1;
+ kln = min(i__3,i__4);
+ zgemm_("N", "N", &kln, &jw, &jw, &c_b57, &z__[krow + kwtop *
+ z_dim1], ldz, &v[v_offset], ldv, &c_b56, &wv[
+ wv_offset], ldwv);
+ zlacpy_("A", &kln, &jw, &wv[wv_offset], ldwv, &z__[krow +
+ kwtop * z_dim1], ldz);
+/* L80: */
+ }
+ }
+ }
+
+/* ==== Return the number of deflations ... ==== */
+
+ *nd = jw - *ns;
+
+/*
+ ==== ... and the number of shifts. (Subtracting
+ . INFQR from the spike length takes care
+ . of the case of a rare QR failure while
+ . calculating eigenvalues of the deflation
+ . window.) ====
+*/
+
+ *ns -= infqr;
+
+/* ==== Return optimal workspace. ==== */
+
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+
+/* ==== End of ZLAQR3 ==== */
+
+ return 0;
+} /* zlaqr3_ */
+
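+/*
+    ==== Illustrative sketch, not part of the translated LAPACK source
+    .    above: one way a caller might use the LWORK = -1 workspace
+    .    query documented for ZLAQR3 before allocating WORK.  The
+    .    helper name and argument handling are assumptions made for
+    .    this example only (it relies on the f2c typedefs already used
+    .    in this file), and the block is fenced with #if 0 so it never
+    .    takes part in the build. ====
+*/
+#if 0
+static integer zlaqr3_query_lwork(logical wantt, logical wantz, integer n,
+        integer ktop, integer kbot, integer nw,
+        doublecomplex *h, integer ldh, integer iloz, integer ihiz,
+        doublecomplex *z, integer ldz, doublecomplex *sh,
+        doublecomplex *v, integer ldv, integer nh,
+        doublecomplex *t, integer ldt, integer nv,
+        doublecomplex *wv, integer ldwv)
+{
+    doublecomplex query;            /* receives the estimate in WORK(1) */
+    integer ns, nd, lwork = -1;     /* LWORK = -1 requests a query only */
+
+    /* Leading dimensions must still satisfy their documented bounds
+       during the query; only the array contents go unreferenced. */
+    zlaqr3_(&wantt, &wantz, &n, &ktop, &kbot, &nw, h, &ldh, &iloz, &ihiz,
+            z, &ldz, &ns, &nd, sh, v, &ldv, &nh, t, &ldt, &nv, wv, &ldwv,
+            &query, &lwork);
+    return (integer) query.r;       /* recommended LWORK, rounded down */
+}
+#endif
+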
+/* Subroutine */ int zlaqr4_(logical *wantt, logical *wantz, integer *n,
+ integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh,
+ doublecomplex *w, integer *iloz, integer *ihiz, doublecomplex *z__,
+ integer *ldz, doublecomplex *work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6, d__7, d__8;
+ doublecomplex z__1, z__2, z__3, z__4, z__5;
+
+ /* Local variables */
+ static integer i__, k;
+ static doublereal s;
+ static doublecomplex aa, bb, cc, dd;
+ static integer ld, nh, it, ks, kt, ku, kv, ls, ns, nw;
+ static doublecomplex tr2, det;
+ static integer inf, kdu, nho, nve, kwh, nsr, nwr, kwv, ndec, ndfl, kbot,
+ nmin;
+ static doublecomplex swap;
+ static integer ktop;
+ static doublecomplex zdum[1] /* was [1][1] */;
+ static integer kacc22, itmax, nsmax, nwmax, kwtop;
+ extern /* Subroutine */ int zlaqr2_(logical *, logical *, integer *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ integer *, integer *, doublecomplex *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *, integer *,
+ doublecomplex *, integer *, integer *, doublecomplex *, integer *
+ , doublecomplex *, integer *), zlaqr5_(logical *, logical *,
+ integer *, integer *, integer *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, integer *, doublecomplex *, integer *,
+ integer *, doublecomplex *, integer *);
+ static integer nibble;
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static char jbcmpz[2];
+ static doublecomplex rtdisc;
+ static integer nwupbd;
+ static logical sorted;
+ extern /* Subroutine */ int zlahqr_(logical *, logical *, integer *,
+ integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+ integer *, integer *, doublecomplex *, integer *, integer *),
+ zlacpy_(char *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ This subroutine implements one level of recursion for ZLAQR0.
+ It is a complete implementation of the small bulge multi-shift
+ QR algorithm. It may be called by ZLAQR0 and, for large enough
+ deflation window size, it may be called by ZLAQR3. This
+ subroutine is identical to ZLAQR0 except that it calls ZLAQR2
+ instead of ZLAQR3.
+
+ Purpose
+ =======
+
+ ZLAQR4 computes the eigenvalues of a Hessenberg matrix H
+ and, optionally, the matrices T and Z from the Schur decomposition
+ H = Z T Z**H, where T is an upper triangular matrix (the
+ Schur form), and Z is the unitary matrix of Schur vectors.
+
+ Optionally Z may be postmultiplied into an input unitary
+ matrix Q so that this routine can give the Schur factorization
+ of a matrix A which has been reduced to the Hessenberg form H
+ by the unitary matrix Q: A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+ Arguments
+ =========
+
+ WANTT (input) LOGICAL
+ = .TRUE. : the full Schur form T is required;
+ = .FALSE.: only eigenvalues are required.
+
+ WANTZ (input) LOGICAL
+ = .TRUE. : the matrix of Schur vectors Z is required;
+ = .FALSE.: Schur vectors are not required.
+
+ N (input) INTEGER
+ The order of the matrix H. N .GE. 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ It is assumed that H is already upper triangular in rows
+ and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+ H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
+ previous call to ZGEBAL, and then passed to ZGEHRD when the
+ matrix output by ZGEBAL is reduced to Hessenberg form.
+ Otherwise, ILO and IHI should be set to 1 and N,
+ respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+ If N = 0, then ILO = 1 and IHI = 0.
+
+ H (input/output) COMPLEX*16 array, dimension (LDH,N)
+ On entry, the upper Hessenberg matrix H.
+ On exit, if INFO = 0 and WANTT is .TRUE., then H
+ contains the upper triangular matrix T from the Schur
+        decomposition (the Schur form). If INFO = 0 and WANTT is
+ .FALSE., then the contents of H are unspecified on exit.
+ (The output value of H when INFO.GT.0 is given under the
+ description of INFO below.)
+
+ This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+ j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
+
+ LDH (input) INTEGER
+ The leading dimension of the array H. LDH .GE. max(1,N).
+
+ W (output) COMPLEX*16 array, dimension (N)
+ The computed eigenvalues of H(ILO:IHI,ILO:IHI) are stored
+ in W(ILO:IHI). If WANTT is .TRUE., then the eigenvalues are
+ stored in the same order as on the diagonal of the Schur
+ form returned in H, with W(i) = H(i,i).
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,IHI)
+ If WANTZ is .FALSE., then Z is not referenced.
+ If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
+ replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
+ orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
+ (The output value of Z when INFO.GT.0 is given under
+ the description of INFO below.)
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. if WANTZ is .TRUE.
+        then LDZ.GE.MAX(1,IHIZ). Otherwise, LDZ.GE.1.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension LWORK
+ On exit, if LWORK = -1, WORK(1) returns an estimate of
+ the optimal value for LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK .GE. max(1,N)
+        is sufficient, but LWORK as large as 6*N may typically
+ be required for optimal performance. A workspace query
+ to determine the optimal workspace size is recommended.
+
+ If LWORK = -1, then ZLAQR4 does a workspace query.
+ In this case, ZLAQR4 checks the input parameters and
+ estimates the optimal workspace size for the given
+ values of N, ILO and IHI. The estimate is returned
+ in WORK(1). No error message related to LWORK is
+ issued by XERBLA. Neither H nor Z are accessed.
+
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ .GT. 0: if INFO = i, ZLAQR4 failed to compute all of
+        the eigenvalues. Elements 1:ilo-1 and i+1:n of W
+        contain those eigenvalues which have been
+ successfully computed. (Failures are rare.)
+
+        If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+ the remaining unconverged eigenvalues are the eigen-
+ values of the upper Hessenberg matrix rows and
+ columns ILO through INFO of the final, output
+ value of H.
+
+ If INFO .GT. 0 and WANTT is .TRUE., then on exit
+
+ (*) (initial value of H)*U = U*(final value of H)
+
+ where U is a unitary matrix. The final
+ value of H is upper Hessenberg and triangular in
+ rows and columns INFO+1 through IHI.
+
+ If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+
+ (final value of Z(ILO:IHI,ILOZ:IHIZ)
+ = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
+
+ where U is the unitary matrix in (*) (regard-
+ less of the value of WANTT.)
+
+ If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+ accessed.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ References:
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+ Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+ 929--947, 2002.
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part II: Aggressive Early Deflation, SIAM Journal
+ of Matrix Analysis, volume 23, pages 948--973, 2002.
+
+ ================================================================
+
+ ==== Matrices of order NTINY or smaller must be processed by
+ . ZLAHQR because of insufficient subdiagonal scratch space.
+ . (This is a hard limit.) ====
+
+ ==== Exceptional deflation windows: try to cure rare
+ . slow convergence by varying the size of the
+ . deflation window after KEXNW iterations. ====
+
+ ==== Exceptional shifts: try to cure rare slow convergence
+ . with ad-hoc exceptional shifts every KEXSH iterations.
+ . ====
+
+ ==== The constant WILK1 is used to form the exceptional
+ . shifts. ====
+*/
+ /* Parameter adjustments */
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ --w;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+/* ==== Quick return for N = 0: nothing to do. ==== */
+
+ if (*n == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ if (*n <= 11) {
+
+/* ==== Tiny matrices must use ZLAHQR. ==== */
+
+ lwkopt = 1;
+ if (*lwork != -1) {
+ zlahqr_(wantt, wantz, n, ilo, ihi, &h__[h_offset], ldh, &w[1],
+ iloz, ihiz, &z__[z_offset], ldz, info);
+ }
+ } else {
+
+/*
+ ==== Use small bulge multi-shift QR with aggressive early
+ . deflation on larger-than-tiny matrices. ====
+
+ ==== Hope for the best. ====
+*/
+
+ *info = 0;
+
+/* ==== Set up job flags for ILAENV. ==== */
+
+ if (*wantt) {
+ *(unsigned char *)jbcmpz = 'S';
+ } else {
+ *(unsigned char *)jbcmpz = 'E';
+ }
+ if (*wantz) {
+ *(unsigned char *)&jbcmpz[1] = 'V';
+ } else {
+ *(unsigned char *)&jbcmpz[1] = 'N';
+ }
+
+/*
+ ==== NWR = recommended deflation window size. At this
+ . point, N .GT. NTINY = 11, so there is enough
+ . subdiagonal workspace for NWR.GE.2 as required.
+ . (In fact, there is enough subdiagonal space for
+ . NWR.GE.3.) ====
+*/
+
+ nwr = ilaenv_(&c__13, "ZLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+ nwr = max(2,nwr);
+/* Computing MIN */
+ i__1 = *ihi - *ilo + 1, i__2 = (*n - 1) / 3, i__1 = min(i__1,i__2);
+ nwr = min(i__1,nwr);
+
+/*
+ ==== NSR = recommended number of simultaneous shifts.
+    .    At this point N .GT. NTINY = 11, so there is
+ . enough subdiagonal workspace for NSR to be even
+ . and greater than or equal to two as required. ====
+*/
+
+ nsr = ilaenv_(&c__15, "ZLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)6,
+ (ftnlen)2);
+/* Computing MIN */
+ i__1 = nsr, i__2 = (*n + 6) / 9, i__1 = min(i__1,i__2), i__2 = *ihi -
+ *ilo;
+ nsr = min(i__1,i__2);
+/* Computing MAX */
+ i__1 = 2, i__2 = nsr - nsr % 2;
+ nsr = max(i__1,i__2);
+
+/*
+ ==== Estimate optimal workspace ====
+
+ ==== Workspace query call to ZLAQR2 ====
+*/
+
+ i__1 = nwr + 1;
+ zlaqr2_(wantt, wantz, n, ilo, ihi, &i__1, &h__[h_offset], ldh, iloz,
+ ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[h_offset],
+ ldh, n, &h__[h_offset], ldh, n, &h__[h_offset], ldh, &work[1],
+ &c_n1);
+
+/*
+ ==== Optimal workspace = MAX(ZLAQR5, ZLAQR2) ====
+
+ Computing MAX
+*/
+ i__1 = nsr * 3 / 2, i__2 = (integer) work[1].r;
+ lwkopt = max(i__1,i__2);
+
+/* ==== Quick return in case of workspace query. ==== */
+
+ if (*lwork == -1) {
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+ return 0;
+ }
+
+/* ==== ZLAHQR/ZLAQR0 crossover point ==== */
+
+ nmin = ilaenv_(&c__12, "ZLAQR4", jbcmpz, n, ilo, ihi, lwork, (ftnlen)
+ 6, (ftnlen)2);
+ nmin = max(11,nmin);
+
+/* ==== Nibble crossover point ==== */
+
+ nibble = ilaenv_(&c__14, "ZLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ nibble = max(0,nibble);
+
+/*
+ ==== Accumulate reflections during ttswp? Use block
+ . 2-by-2 structure during matrix-matrix multiply? ====
+*/
+
+ kacc22 = ilaenv_(&c__16, "ZLAQR4", jbcmpz, n, ilo, ihi, lwork, (
+ ftnlen)6, (ftnlen)2);
+ kacc22 = max(0,kacc22);
+ kacc22 = min(2,kacc22);
+
+/*
+ ==== NWMAX = the largest possible deflation window for
+ . which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n - 1) / 3, i__2 = *lwork / 2;
+ nwmax = min(i__1,i__2);
+ nw = nwmax;
+
+/*
+ ==== NSMAX = the Largest number of simultaneous shifts
+ . for which there is sufficient workspace. ====
+
+ Computing MIN
+*/
+ i__1 = (*n + 6) / 9, i__2 = (*lwork << 1) / 3;
+ nsmax = min(i__1,i__2);
+ nsmax -= nsmax % 2;
+
+/* ==== NDFL: an iteration count restarted at deflation. ==== */
+
+ ndfl = 1;
+
+/*
+ ==== ITMAX = iteration limit ====
+
+ Computing MAX
+*/
+ i__1 = 10, i__2 = *ihi - *ilo + 1;
+ itmax = max(i__1,i__2) * 30;
+
+/* ==== Last row and column in the active block ==== */
+
+ kbot = *ihi;
+
+/* ==== Main Loop ==== */
+
+ i__1 = itmax;
+ for (it = 1; it <= i__1; ++it) {
+
+/* ==== Done when KBOT falls below ILO ==== */
+
+ if (kbot < *ilo) {
+ goto L80;
+ }
+
+/* ==== Locate active block ==== */
+
+ i__2 = *ilo + 1;
+ for (k = kbot; k >= i__2; --k) {
+ i__3 = k + (k - 1) * h_dim1;
+ if (h__[i__3].r == 0. && h__[i__3].i == 0.) {
+ goto L20;
+ }
+/* L10: */
+ }
+ k = *ilo;
+L20:
+ ktop = k;
+
+/*
+ ==== Select deflation window size:
+ . Typical Case:
+ . If possible and advisable, nibble the entire
+ . active block. If not, use size MIN(NWR,NWMAX)
+ . or MIN(NWR+1,NWMAX) depending upon which has
+ . the smaller corresponding subdiagonal entry
+ . (a heuristic).
+ .
+ . Exceptional Case:
+ . If there have been no deflations in KEXNW or
+ . more iterations, then vary the deflation window
+                .    size.  At first, because larger windows are,
+ . in general, more powerful than smaller ones,
+ . rapidly increase the window to the maximum possible.
+ . Then, gradually reduce the window size. ====
+*/
+
+ nh = kbot - ktop + 1;
+ nwupbd = min(nh,nwmax);
+ if (ndfl < 5) {
+ nw = min(nwupbd,nwr);
+ } else {
+/* Computing MIN */
+ i__2 = nwupbd, i__3 = nw << 1;
+ nw = min(i__2,i__3);
+ }
+ if (nw < nwmax) {
+ if (nw >= nh - 1) {
+ nw = nh;
+ } else {
+ kwtop = kbot - nw + 1;
+ i__2 = kwtop + (kwtop - 1) * h_dim1;
+ i__3 = kwtop - 1 + (kwtop - 2) * h_dim1;
+ if ((d__1 = h__[i__2].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ kwtop + (kwtop - 1) * h_dim1]), abs(d__2)) > (
+ d__3 = h__[i__3].r, abs(d__3)) + (d__4 = d_imag(&
+ h__[kwtop - 1 + (kwtop - 2) * h_dim1]), abs(d__4))
+ ) {
+ ++nw;
+ }
+ }
+ }
+ if (ndfl < 5) {
+ ndec = -1;
+ } else if (ndec >= 0 || nw >= nwupbd) {
+ ++ndec;
+ if (nw - ndec < 2) {
+ ndec = 0;
+ }
+ nw -= ndec;
+ }
+
+/*
+ ==== Aggressive early deflation:
+ . split workspace under the subdiagonal into
+ . - an nw-by-nw work array V in the lower
+ . left-hand-corner,
+ . - an NW-by-at-least-NW-but-more-is-better
+ . (NW-by-NHO) horizontal work array along
+ . the bottom edge,
+ . - an at-least-NW-but-more-is-better (NHV-by-NW)
+ . vertical work array along the left-hand-edge.
+ . ====
+*/
+
+ kv = *n - nw + 1;
+ kt = nw + 1;
+ nho = *n - nw - 1 - kt + 1;
+ kwv = nw + 2;
+ nve = *n - nw - kwv + 1;
+
+/* ==== Aggressive early deflation ==== */
+
+ zlaqr2_(wantt, wantz, n, &ktop, &kbot, &nw, &h__[h_offset], ldh,
+ iloz, ihiz, &z__[z_offset], ldz, &ls, &ld, &w[1], &h__[kv
+ + h_dim1], ldh, &nho, &h__[kv + kt * h_dim1], ldh, &nve, &
+ h__[kwv + h_dim1], ldh, &work[1], lwork);
+
+/* ==== Adjust KBOT accounting for new deflations. ==== */
+
+ kbot -= ld;
+
+/* ==== KS points to the shifts. ==== */
+
+ ks = kbot - ls + 1;
+
+/*
+ ==== Skip an expensive QR sweep if there is a (partly
+ . heuristic) reason to expect that many eigenvalues
+ . will deflate without it. Here, the QR sweep is
+ . skipped if many eigenvalues have just been deflated
+ . or if the remaining active block is small.
+*/
+
+ if (ld == 0 || ld * 100 <= nw * nibble && kbot - ktop + 1 > min(
+ nmin,nwmax)) {
+
+/*
+ ==== NS = nominal number of simultaneous shifts.
+ . This may be lowered (slightly) if ZLAQR2
+ . did not provide that many shifts. ====
+
+ Computing MIN
+ Computing MAX
+*/
+ i__4 = 2, i__5 = kbot - ktop;
+ i__2 = min(nsmax,nsr), i__3 = max(i__4,i__5);
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+
+/*
+ ==== If there have been no deflations
+ . in a multiple of KEXSH iterations,
+ . then try exceptional shifts.
+ . Otherwise use shifts provided by
+ . ZLAQR2 above or from the eigenvalues
+ . of a trailing principal submatrix. ====
+*/
+
+ if (ndfl % 6 == 0) {
+ ks = kbot - ns + 1;
+ i__2 = ks + 1;
+ for (i__ = kbot; i__ >= i__2; i__ += -2) {
+ i__3 = i__;
+ i__4 = i__ + i__ * h_dim1;
+ i__5 = i__ + (i__ - 1) * h_dim1;
+ d__3 = ((d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[i__ + (i__ - 1) * h_dim1]), abs(
+ d__2))) * .75;
+ z__1.r = h__[i__4].r + d__3, z__1.i = h__[i__4].i;
+ w[i__3].r = z__1.r, w[i__3].i = z__1.i;
+ i__3 = i__ - 1;
+ i__4 = i__;
+ w[i__3].r = w[i__4].r, w[i__3].i = w[i__4].i;
+/* L30: */
+ }
+ } else {
+
+/*
+ ==== Got NS/2 or fewer shifts? Use ZLAHQR
+ . on a trailing principal submatrix to
+ . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
+ . there is enough space below the subdiagonal
+ . to fit an NS-by-NS scratch array.) ====
+*/
+
+ if (kbot - ks + 1 <= ns / 2) {
+ ks = kbot - ns + 1;
+ kt = *n - ns + 1;
+ zlacpy_("A", &ns, &ns, &h__[ks + ks * h_dim1], ldh, &
+ h__[kt + h_dim1], ldh);
+ zlahqr_(&c_false, &c_false, &ns, &c__1, &ns, &h__[kt
+ + h_dim1], ldh, &w[ks], &c__1, &c__1, zdum, &
+ c__1, &inf);
+ ks += inf;
+
+/*
+ ==== In case of a rare QR failure use
+ . eigenvalues of the trailing 2-by-2
+ . principal submatrix. Scale to avoid
+ . overflows, underflows and subnormals.
+ . (The scale factor S can not be zero,
+ . because H(KBOT,KBOT-1) is nonzero.) ====
+*/
+
+ if (ks >= kbot) {
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ i__3 = kbot + (kbot - 1) * h_dim1;
+ i__4 = kbot - 1 + kbot * h_dim1;
+ i__5 = kbot + kbot * h_dim1;
+ s = (d__1 = h__[i__2].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[kbot - 1 + (kbot - 1) *
+ h_dim1]), abs(d__2)) + ((d__3 = h__[i__3]
+ .r, abs(d__3)) + (d__4 = d_imag(&h__[kbot
+ + (kbot - 1) * h_dim1]), abs(d__4))) + ((
+ d__5 = h__[i__4].r, abs(d__5)) + (d__6 =
+ d_imag(&h__[kbot - 1 + kbot * h_dim1]),
+ abs(d__6))) + ((d__7 = h__[i__5].r, abs(
+ d__7)) + (d__8 = d_imag(&h__[kbot + kbot *
+ h_dim1]), abs(d__8)));
+ i__2 = kbot - 1 + (kbot - 1) * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ aa.r = z__1.r, aa.i = z__1.i;
+ i__2 = kbot + (kbot - 1) * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ cc.r = z__1.r, cc.i = z__1.i;
+ i__2 = kbot - 1 + kbot * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ bb.r = z__1.r, bb.i = z__1.i;
+ i__2 = kbot + kbot * h_dim1;
+ z__1.r = h__[i__2].r / s, z__1.i = h__[i__2].i /
+ s;
+ dd.r = z__1.r, dd.i = z__1.i;
+ z__2.r = aa.r + dd.r, z__2.i = aa.i + dd.i;
+ z__1.r = z__2.r / 2., z__1.i = z__2.i / 2.;
+ tr2.r = z__1.r, tr2.i = z__1.i;
+ z__3.r = aa.r - tr2.r, z__3.i = aa.i - tr2.i;
+ z__4.r = dd.r - tr2.r, z__4.i = dd.i - tr2.i;
+ z__2.r = z__3.r * z__4.r - z__3.i * z__4.i,
+ z__2.i = z__3.r * z__4.i + z__3.i *
+ z__4.r;
+ z__5.r = bb.r * cc.r - bb.i * cc.i, z__5.i = bb.r
+ * cc.i + bb.i * cc.r;
+ z__1.r = z__2.r - z__5.r, z__1.i = z__2.i -
+ z__5.i;
+ det.r = z__1.r, det.i = z__1.i;
+ z__2.r = -det.r, z__2.i = -det.i;
+ z_sqrt(&z__1, &z__2);
+ rtdisc.r = z__1.r, rtdisc.i = z__1.i;
+ i__2 = kbot - 1;
+ z__2.r = tr2.r + rtdisc.r, z__2.i = tr2.i +
+ rtdisc.i;
+ z__1.r = s * z__2.r, z__1.i = s * z__2.i;
+ w[i__2].r = z__1.r, w[i__2].i = z__1.i;
+ i__2 = kbot;
+ z__2.r = tr2.r - rtdisc.r, z__2.i = tr2.i -
+ rtdisc.i;
+ z__1.r = s * z__2.r, z__1.i = s * z__2.i;
+ w[i__2].r = z__1.r, w[i__2].i = z__1.i;
+
+ ks = kbot - 1;
+ }
+ }
+
+ if (kbot - ks + 1 > ns) {
+
+/* ==== Sort the shifts (Helps a little) ==== */
+
+ sorted = FALSE_;
+ i__2 = ks + 1;
+ for (k = kbot; k >= i__2; --k) {
+ if (sorted) {
+ goto L60;
+ }
+ sorted = TRUE_;
+ i__3 = k - 1;
+ for (i__ = ks; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ i__5 = i__ + 1;
+ if ((d__1 = w[i__4].r, abs(d__1)) + (d__2 =
+ d_imag(&w[i__]), abs(d__2)) < (d__3 =
+ w[i__5].r, abs(d__3)) + (d__4 =
+ d_imag(&w[i__ + 1]), abs(d__4))) {
+ sorted = FALSE_;
+ i__4 = i__;
+ swap.r = w[i__4].r, swap.i = w[i__4].i;
+ i__4 = i__;
+ i__5 = i__ + 1;
+ w[i__4].r = w[i__5].r, w[i__4].i = w[i__5]
+ .i;
+ i__4 = i__ + 1;
+ w[i__4].r = swap.r, w[i__4].i = swap.i;
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+L60:
+ ;
+ }
+ }
+
+/*
+ ==== If there are only two shifts, then use
+ . only one. ====
+*/
+
+ if (kbot - ks + 1 == 2) {
+ i__2 = kbot;
+ i__3 = kbot + kbot * h_dim1;
+ z__2.r = w[i__2].r - h__[i__3].r, z__2.i = w[i__2].i -
+ h__[i__3].i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+ i__4 = kbot - 1;
+ i__5 = kbot + kbot * h_dim1;
+ z__4.r = w[i__4].r - h__[i__5].r, z__4.i = w[i__4].i -
+ h__[i__5].i;
+ z__3.r = z__4.r, z__3.i = z__4.i;
+ if ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1),
+ abs(d__2)) < (d__3 = z__3.r, abs(d__3)) + (d__4 =
+ d_imag(&z__3), abs(d__4))) {
+ i__2 = kbot - 1;
+ i__3 = kbot;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ } else {
+ i__2 = kbot;
+ i__3 = kbot - 1;
+ w[i__2].r = w[i__3].r, w[i__2].i = w[i__3].i;
+ }
+ }
+
+/*
+                ==== Use up to NS of the smallest magnitude
+ . shifts. If there aren't NS shifts available,
+ . then use them all, possibly dropping one to
+ . make the number of shifts even. ====
+
+ Computing MIN
+*/
+ i__2 = ns, i__3 = kbot - ks + 1;
+ ns = min(i__2,i__3);
+ ns -= ns % 2;
+ ks = kbot - ns + 1;
+
+/*
+ ==== Small-bulge multi-shift QR sweep:
+ . split workspace under the subdiagonal into
+ . - a KDU-by-KDU work array U in the lower
+ . left-hand-corner,
+ . - a KDU-by-at-least-KDU-but-more-is-better
+                .    (KDU-by-NHO) horizontal work array WH along
+ . the bottom edge,
+ . - and an at-least-KDU-but-more-is-better-by-KDU
+                .    (NVE-by-KDU) vertical work array WV along
+ . the left-hand-edge. ====
+*/
+
+ kdu = ns * 3 - 3;
+ ku = *n - kdu + 1;
+ kwh = kdu + 1;
+ nho = *n - kdu - 3 - (kdu + 1) + 1;
+ kwv = kdu + 4;
+ nve = *n - kdu - kwv + 1;
+
+/* ==== Small-bulge multi-shift QR sweep ==== */
+
+ zlaqr5_(wantt, wantz, &kacc22, n, &ktop, &kbot, &ns, &w[ks], &
+ h__[h_offset], ldh, iloz, ihiz, &z__[z_offset], ldz, &
+ work[1], &c__3, &h__[ku + h_dim1], ldh, &nve, &h__[
+ kwv + h_dim1], ldh, &nho, &h__[ku + kwh * h_dim1],
+ ldh);
+ }
+
+/* ==== Note progress (or the lack of it). ==== */
+
+ if (ld > 0) {
+ ndfl = 1;
+ } else {
+ ++ndfl;
+ }
+
+/*
+ ==== End of main loop ====
+ L70:
+*/
+ }
+
+/*
+ ==== Iteration limit exceeded. Set INFO to show where
+ . the problem occurred and exit. ====
+*/
+
+ *info = kbot;
+L80:
+ ;
+ }
+
+/* ==== Return the optimal value of LWORK. ==== */
+
+ d__1 = (doublereal) lwkopt;
+ z__1.r = d__1, z__1.i = 0.;
+ work[1].r = z__1.r, work[1].i = z__1.i;
+
+/* ==== End of ZLAQR4 ==== */
+
+ return 0;
+} /* zlaqr4_ */
+
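+/*
+    ==== Illustrative sketch, not part of the translated LAPACK source
+    .    above: the scaled trailing 2-by-2 shift computation that
+    .    ZLAQR4 falls back on when ZLAHQR returns too few shifts,
+    .    rewritten with C99 complex arithmetic for readability.  The
+    .    helper names are invented for this example, and the block is
+    .    fenced with #if 0 so it never takes part in the build. ====
+*/
+#if 0
+#include <math.h>
+#include <complex.h>
+
+/* |Re z| + |Im z|, the cheap absolute value used by the f2c code. */
+static double cabs1_sketch(double complex z)
+{
+    return fabs(creal(z)) + fabs(cimag(z));
+}
+
+/* Eigenvalues of the 2-by-2 block [[a, b], [c, d]]:
+       lambda = tr2 +/- sqrt(-det),
+   with tr2 = (a + d)/2 and det = (a - tr2)*(d - tr2) - b*c.
+   Entries are scaled by s to avoid overflow, underflow and subnormals;
+   s is nonzero in the calling context because H(KBOT,KBOT-1) != 0. */
+static void trailing_2x2_shifts(double complex a, double complex b,
+                                double complex c, double complex d,
+                                double complex *w1, double complex *w2)
+{
+    double s = cabs1_sketch(a) + cabs1_sketch(b)
+             + cabs1_sketch(c) + cabs1_sketch(d);
+    double complex aa = a / s, bb = b / s, cc = c / s, dd = d / s;
+    double complex tr2 = (aa + dd) / 2.;
+    double complex det = (aa - tr2) * (dd - tr2) - bb * cc;
+    double complex rtdisc = csqrt(-det);
+
+    *w1 = s * (tr2 + rtdisc);
+    *w2 = s * (tr2 - rtdisc);
+}
+#endif
+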
+/* Subroutine */ int zlaqr5_(logical *wantt, logical *wantz, integer *kacc22,
+ integer *n, integer *ktop, integer *kbot, integer *nshfts,
+ doublecomplex *s, doublecomplex *h__, integer *ldh, integer *iloz,
+ integer *ihiz, doublecomplex *z__, integer *ldz, doublecomplex *v,
+ integer *ldv, doublecomplex *u, integer *ldu, integer *nv,
+ doublecomplex *wv, integer *ldwv, integer *nh, doublecomplex *wh,
+ integer *ldwh)
+{
+ /* System generated locals */
+ integer h_dim1, h_offset, u_dim1, u_offset, v_dim1, v_offset, wh_dim1,
+ wh_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3,
+ i__4, i__5, i__6, i__7, i__8, i__9, i__10, i__11;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6, d__7, d__8, d__9, d__10;
+ doublecomplex z__1, z__2, z__3, z__4, z__5, z__6, z__7, z__8;
+
+ /* Local variables */
+ static integer j, k, m, i2, j2, i4, j4, k1;
+ static doublereal h11, h12, h21, h22;
+ static integer m22, ns, nu;
+ static doublecomplex vt[3];
+ static doublereal scl;
+ static integer kdu, kms;
+ static doublereal ulp;
+ static integer knz, kzs;
+ static doublereal tst1, tst2;
+ static doublecomplex beta;
+ static logical blk22, bmp22;
+ static integer mend, jcol, jlen, jbot, mbot, jtop, jrow, mtop;
+ static doublecomplex alpha;
+ static logical accum;
+ static integer ndcol, incol, krcol, nbmps;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), ztrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *
+ , doublecomplex *, integer *),
+ dlabad_(doublereal *, doublereal *), zlaqr1_(integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ doublecomplex *);
+
+ static doublereal safmin, safmax;
+ extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *);
+ static doublecomplex refsum;
+ extern /* Subroutine */ int zlacpy_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *),
+ zlaset_(char *, integer *, integer *, doublecomplex *,
+ doublecomplex *, doublecomplex *, integer *);
+ static integer mstart;
+ static doublereal smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..
+ November 2006
+
+
+ This auxiliary subroutine called by ZLAQR0 performs a
+ single small-bulge multi-shift QR sweep.
+
+ WANTT (input) logical scalar
+ WANTT = .true. if the triangular Schur factor
+ is being computed. WANTT is set to .false. otherwise.
+
+ WANTZ (input) logical scalar
+ WANTZ = .true. if the unitary Schur factor is being
+ computed. WANTZ is set to .false. otherwise.
+
+ KACC22 (input) integer with value 0, 1, or 2.
+ Specifies the computation mode of far-from-diagonal
+ orthogonal updates.
+ = 0: ZLAQR5 does not accumulate reflections and does not
+ use matrix-matrix multiply to update far-from-diagonal
+ matrix entries.
+ = 1: ZLAQR5 accumulates reflections and uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries.
+ = 2: ZLAQR5 accumulates reflections, uses matrix-matrix
+ multiply to update the far-from-diagonal matrix entries,
+ and takes advantage of 2-by-2 block structure during
+ matrix multiplies.
+
+ N (input) integer scalar
+ N is the order of the Hessenberg matrix H upon which this
+ subroutine operates.
+
+ KTOP (input) integer scalar
+ KBOT (input) integer scalar
+ These are the first and last rows and columns of an
+ isolated diagonal block upon which the QR sweep is to be
+ applied. It is assumed without a check that
+ either KTOP = 1 or H(KTOP,KTOP-1) = 0
+ and
+ either KBOT = N or H(KBOT+1,KBOT) = 0.
+
+ NSHFTS (input) integer scalar
+ NSHFTS gives the number of simultaneous shifts. NSHFTS
+ must be positive and even.
+
+ S (input/output) COMPLEX*16 array of size (NSHFTS)
+ S contains the shifts of origin that define the multi-
+ shift QR sweep. On output S may be reordered.
+
+ H (input/output) COMPLEX*16 array of size (LDH,N)
+ On input H contains a Hessenberg matrix. On output a
+        multi-shift QR sweep with shifts S(J) is applied
+ to the isolated diagonal block in rows and columns KTOP
+ through KBOT.
+
+ LDH (input) integer scalar
+ LDH is the leading dimension of H just as declared in the
+ calling procedure. LDH.GE.MAX(1,N).
+
+ ILOZ (input) INTEGER
+ IHIZ (input) INTEGER
+ Specify the rows of Z to which transformations must be
+ applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+
+ Z (input/output) COMPLEX*16 array of size (LDZ,IHI)
+ If WANTZ = .TRUE., then the QR Sweep unitary
+ similarity transformation is accumulated into
+ Z(ILOZ:IHIZ,ILO:IHI) from the right.
+ If WANTZ = .FALSE., then Z is unreferenced.
+
+ LDZ (input) integer scalar
+        LDZ is the leading dimension of Z just as declared in
+ the calling procedure. LDZ.GE.N.
+
+ V (workspace) COMPLEX*16 array of size (LDV,NSHFTS/2)
+
+ LDV (input) integer scalar
+ LDV is the leading dimension of V as declared in the
+ calling procedure. LDV.GE.3.
+
+ U (workspace) COMPLEX*16 array of size
+ (LDU,3*NSHFTS-3)
+
+ LDU (input) integer scalar
+        LDU is the leading dimension of U just as declared in
+        the calling subroutine. LDU.GE.3*NSHFTS-3.
+
+ NH (input) integer scalar
+ NH is the number of columns in array WH available for
+ workspace. NH.GE.1.
+
+ WH (workspace) COMPLEX*16 array of size (LDWH,NH)
+
+ LDWH (input) integer scalar
+ Leading dimension of WH just as declared in the
+ calling procedure. LDWH.GE.3*NSHFTS-3.
+
+ NV (input) integer scalar
+        NV is the number of rows in WV available for workspace.
+ NV.GE.1.
+
+ WV (workspace) COMPLEX*16 array of size
+ (LDWV,3*NSHFTS-3)
+
+ LDWV (input) integer scalar
+        LDWV is the leading dimension of WV as declared in
+        the calling subroutine. LDWV.GE.NV.
+
+ ================================================================
+ Based on contributions by
+ Karen Braman and Ralph Byers, Department of Mathematics,
+ University of Kansas, USA
+
+ ================================================================
+ Reference:
+
+ K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+ Algorithm Part I: Maintaining Well Focused Shifts, and
+ Level 3 Performance, SIAM Journal of Matrix Analysis,
+ volume 23, pages 929--947, 2002.
+
+ ================================================================
+
+
+ ==== If there are no shifts, then there is nothing to do. ====
+*/
+
+ /* Parameter adjustments */
+ --s;
+ h_dim1 = *ldh;
+ h_offset = 1 + h_dim1;
+ h__ -= h_offset;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ u_dim1 = *ldu;
+ u_offset = 1 + u_dim1;
+ u -= u_offset;
+ wv_dim1 = *ldwv;
+ wv_offset = 1 + wv_dim1;
+ wv -= wv_offset;
+ wh_dim1 = *ldwh;
+ wh_offset = 1 + wh_dim1;
+ wh -= wh_offset;
+
+ /* Function Body */
+ if (*nshfts < 2) {
+ return 0;
+ }
+
+/*
+ ==== If the active block is empty or 1-by-1, then there
+ . is nothing to do. ====
+*/
+
+ if (*ktop >= *kbot) {
+ return 0;
+ }
+
+/*
+ ==== NSHFTS is supposed to be even, but if it is odd,
+ . then simply reduce it by one. ====
+*/
+
+ ns = *nshfts - *nshfts % 2;
+
+/* ==== Machine constants for deflation ==== */
+
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ dlabad_(&safmin, &safmax);
+ ulp = PRECISION;
+ smlnum = safmin * ((doublereal) (*n) / ulp);
+
+/*
+ ==== Use accumulated reflections to update far-from-diagonal
+ . entries ? ====
+*/
+
+ accum = *kacc22 == 1 || *kacc22 == 2;
+
+/* ==== If so, exploit the 2-by-2 block structure? ==== */
+
+ blk22 = ns > 2 && *kacc22 == 2;
+
+/* ==== clear trash ==== */
+
+ if (*ktop + 2 <= *kbot) {
+ i__1 = *ktop + 2 + *ktop * h_dim1;
+ h__[i__1].r = 0., h__[i__1].i = 0.;
+ }
+
+/* ==== NBMPS = number of 2-shift bulges in the chain ==== */
+
+ nbmps = ns / 2;
+
+/* ==== KDU = width of slab ==== */
+
+ kdu = nbmps * 6 - 3;
+
+/* ==== Create and chase chains of NBMPS bulges ==== */
+
+ i__1 = *kbot - 2;
+ i__2 = nbmps * 3 - 2;
+ for (incol = (1 - nbmps) * 3 + *ktop - 1; i__2 < 0 ? incol >= i__1 :
+ incol <= i__1; incol += i__2) {
+ ndcol = incol + kdu;
+ if (accum) {
+ zlaset_("ALL", &kdu, &kdu, &c_b56, &c_b57, &u[u_offset], ldu);
+ }
+
+/*
+ ==== Near-the-diagonal bulge chase. The following loop
+ . performs the near-the-diagonal part of a small bulge
+ . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal
+ . chunk extends from column INCOL to column NDCOL
+ . (including both column INCOL and column NDCOL). The
+ . following loop chases a 3*NBMPS column long chain of
+ . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL
+    .    may be less than KTOP and NDCOL may be greater than
+ . KBOT indicating phantom columns from which to chase
+ . bulges before they are actually introduced or to which
+ . to chase bulges beyond column KBOT.) ====
+
+ Computing MIN
+*/
+ i__4 = incol + nbmps * 3 - 3, i__5 = *kbot - 2;
+ i__3 = min(i__4,i__5);
+ for (krcol = incol; krcol <= i__3; ++krcol) {
+
+/*
+            ==== Bulges numbered MTOP to MBOT are active double implicit
+            .    shift bulges.  There may or may not also be a small
+ . 2-by-2 bulge, if there is room. The inactive bulges
+ . (if any) must wait until the active bulges have moved
+ . down the diagonal to make room. The phantom matrix
+ . paradigm described above helps keep track. ====
+
+ Computing MAX
+*/
+ i__4 = 1, i__5 = (*ktop - 1 - krcol + 2) / 3 + 1;
+ mtop = max(i__4,i__5);
+/* Computing MIN */
+ i__4 = nbmps, i__5 = (*kbot - krcol) / 3;
+ mbot = min(i__4,i__5);
+ m22 = mbot + 1;
+ bmp22 = mbot < nbmps && krcol + (m22 - 1) * 3 == *kbot - 2;
+
+/*
+ ==== Generate reflections to chase the chain right
+ . one column. (The minimum value of K is KTOP-1.) ====
+*/
+
+ i__4 = mbot;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ if (k == *ktop - 1) {
+ zlaqr1_(&c__3, &h__[*ktop + *ktop * h_dim1], ldh, &s[(m <<
+ 1) - 1], &s[m * 2], &v[m * v_dim1 + 1]);
+ i__5 = m * v_dim1 + 1;
+ alpha.r = v[i__5].r, alpha.i = v[i__5].i;
+ zlarfg_(&c__3, &alpha, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+ } else {
+ i__5 = k + 1 + k * h_dim1;
+ beta.r = h__[i__5].r, beta.i = h__[i__5].i;
+ i__5 = m * v_dim1 + 2;
+ i__6 = k + 2 + k * h_dim1;
+ v[i__5].r = h__[i__6].r, v[i__5].i = h__[i__6].i;
+ i__5 = m * v_dim1 + 3;
+ i__6 = k + 3 + k * h_dim1;
+ v[i__5].r = h__[i__6].r, v[i__5].i = h__[i__6].i;
+ zlarfg_(&c__3, &beta, &v[m * v_dim1 + 2], &c__1, &v[m *
+ v_dim1 + 1]);
+
+/*
+ ==== A Bulge may collapse because of vigilant
+ . deflation or destructive underflow. In the
+ . underflow case, try the two-small-subdiagonals
+ . trick to try to reinflate the bulge. ====
+*/
+
+ i__5 = k + 3 + k * h_dim1;
+ i__6 = k + 3 + (k + 1) * h_dim1;
+ i__7 = k + 3 + (k + 2) * h_dim1;
+ if (h__[i__5].r != 0. || h__[i__5].i != 0. || (h__[i__6]
+ .r != 0. || h__[i__6].i != 0.) || h__[i__7].r ==
+ 0. && h__[i__7].i == 0.) {
+
+/* ==== Typical case: not collapsed (yet). ==== */
+
+ i__5 = k + 1 + k * h_dim1;
+ h__[i__5].r = beta.r, h__[i__5].i = beta.i;
+ i__5 = k + 2 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ i__5 = k + 3 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ } else {
+
+/*
+ ==== Atypical case: collapsed. Attempt to
+ . reintroduce ignoring H(K+1,K) and H(K+2,K).
+ . If the fill resulting from the new
+ . reflector is too large, then abandon it.
+ . Otherwise, use the new one. ====
+*/
+
+ zlaqr1_(&c__3, &h__[k + 1 + (k + 1) * h_dim1], ldh, &
+ s[(m << 1) - 1], &s[m * 2], vt);
+ alpha.r = vt[0].r, alpha.i = vt[0].i;
+ zlarfg_(&c__3, &alpha, &vt[1], &c__1, vt);
+ d_cnjg(&z__2, vt);
+ i__5 = k + 1 + k * h_dim1;
+ d_cnjg(&z__5, &vt[1]);
+ i__6 = k + 2 + k * h_dim1;
+ z__4.r = z__5.r * h__[i__6].r - z__5.i * h__[i__6].i,
+ z__4.i = z__5.r * h__[i__6].i + z__5.i * h__[
+ i__6].r;
+ z__3.r = h__[i__5].r + z__4.r, z__3.i = h__[i__5].i +
+ z__4.i;
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+ z__2.r * z__3.i + z__2.i * z__3.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+
+ i__5 = k + 2 + k * h_dim1;
+ z__3.r = refsum.r * vt[1].r - refsum.i * vt[1].i,
+ z__3.i = refsum.r * vt[1].i + refsum.i * vt[1]
+ .r;
+ z__2.r = h__[i__5].r - z__3.r, z__2.i = h__[i__5].i -
+ z__3.i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+ z__5.r = refsum.r * vt[2].r - refsum.i * vt[2].i,
+ z__5.i = refsum.r * vt[2].i + refsum.i * vt[2]
+ .r;
+ z__4.r = z__5.r, z__4.i = z__5.i;
+ i__6 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ i__8 = k + 2 + (k + 2) * h_dim1;
+ if ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1)
+ , abs(d__2)) + ((d__3 = z__4.r, abs(d__3)) + (
+ d__4 = d_imag(&z__4), abs(d__4))) > ulp * ((
+ d__5 = h__[i__6].r, abs(d__5)) + (d__6 =
+ d_imag(&h__[k + k * h_dim1]), abs(d__6)) + ((
+ d__7 = h__[i__7].r, abs(d__7)) + (d__8 =
+ d_imag(&h__[k + 1 + (k + 1) * h_dim1]), abs(
+ d__8))) + ((d__9 = h__[i__8].r, abs(d__9)) + (
+ d__10 = d_imag(&h__[k + 2 + (k + 2) * h_dim1])
+ , abs(d__10))))) {
+
+/*
+ ==== Starting a new bulge here would
+ . create non-negligible fill. Use
+ . the old one with trepidation. ====
+*/
+
+ i__5 = k + 1 + k * h_dim1;
+ h__[i__5].r = beta.r, h__[i__5].i = beta.i;
+ i__5 = k + 2 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ i__5 = k + 3 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ } else {
+
+/*
+                            ==== Starting a new bulge here would
+ . create only negligible fill.
+ . Replace the old reflector with
+ . the new one. ====
+*/
+
+ i__5 = k + 1 + k * h_dim1;
+ i__6 = k + 1 + k * h_dim1;
+ z__1.r = h__[i__6].r - refsum.r, z__1.i = h__[
+ i__6].i - refsum.i;
+ h__[i__5].r = z__1.r, h__[i__5].i = z__1.i;
+ i__5 = k + 2 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ i__5 = k + 3 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ i__5 = m * v_dim1 + 1;
+ v[i__5].r = vt[0].r, v[i__5].i = vt[0].i;
+ i__5 = m * v_dim1 + 2;
+ v[i__5].r = vt[1].r, v[i__5].i = vt[1].i;
+ i__5 = m * v_dim1 + 3;
+ v[i__5].r = vt[2].r, v[i__5].i = vt[2].i;
+ }
+ }
+ }
+/* L10: */
+ }
+
+/* ==== Generate a 2-by-2 reflection, if needed. ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ if (bmp22) {
+ if (k == *ktop - 1) {
+ zlaqr1_(&c__2, &h__[k + 1 + (k + 1) * h_dim1], ldh, &s[(
+ m22 << 1) - 1], &s[m22 * 2], &v[m22 * v_dim1 + 1])
+ ;
+ i__4 = m22 * v_dim1 + 1;
+ beta.r = v[i__4].r, beta.i = v[i__4].i;
+ zlarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ } else {
+ i__4 = k + 1 + k * h_dim1;
+ beta.r = h__[i__4].r, beta.i = h__[i__4].i;
+ i__4 = m22 * v_dim1 + 2;
+ i__5 = k + 2 + k * h_dim1;
+ v[i__4].r = h__[i__5].r, v[i__4].i = h__[i__5].i;
+ zlarfg_(&c__2, &beta, &v[m22 * v_dim1 + 2], &c__1, &v[m22
+ * v_dim1 + 1]);
+ i__4 = k + 1 + k * h_dim1;
+ h__[i__4].r = beta.r, h__[i__4].i = beta.i;
+ i__4 = k + 2 + k * h_dim1;
+ h__[i__4].r = 0., h__[i__4].i = 0.;
+ }
+ }
+
+/* ==== Multiply H by reflections from the left ==== */
+
+ if (accum) {
+ jbot = min(ndcol,*kbot);
+ } else if (*wantt) {
+ jbot = *n;
+ } else {
+ jbot = *kbot;
+ }
+ i__4 = jbot;
+ for (j = max(*ktop,krcol); j <= i__4; ++j) {
+/* Computing MIN */
+ i__5 = mbot, i__6 = (j - krcol + 2) / 3;
+ mend = min(i__5,i__6);
+ i__5 = mend;
+ for (m = mtop; m <= i__5; ++m) {
+ k = krcol + (m - 1) * 3;
+ d_cnjg(&z__2, &v[m * v_dim1 + 1]);
+ i__6 = k + 1 + j * h_dim1;
+ d_cnjg(&z__6, &v[m * v_dim1 + 2]);
+ i__7 = k + 2 + j * h_dim1;
+ z__5.r = z__6.r * h__[i__7].r - z__6.i * h__[i__7].i,
+ z__5.i = z__6.r * h__[i__7].i + z__6.i * h__[i__7]
+ .r;
+ z__4.r = h__[i__6].r + z__5.r, z__4.i = h__[i__6].i +
+ z__5.i;
+ d_cnjg(&z__8, &v[m * v_dim1 + 3]);
+ i__8 = k + 3 + j * h_dim1;
+ z__7.r = z__8.r * h__[i__8].r - z__8.i * h__[i__8].i,
+ z__7.i = z__8.r * h__[i__8].i + z__8.i * h__[i__8]
+ .r;
+ z__3.r = z__4.r + z__7.r, z__3.i = z__4.i + z__7.i;
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+ z__2.r * z__3.i + z__2.i * z__3.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__6 = k + 1 + j * h_dim1;
+ i__7 = k + 1 + j * h_dim1;
+ z__1.r = h__[i__7].r - refsum.r, z__1.i = h__[i__7].i -
+ refsum.i;
+ h__[i__6].r = z__1.r, h__[i__6].i = z__1.i;
+ i__6 = k + 2 + j * h_dim1;
+ i__7 = k + 2 + j * h_dim1;
+ i__8 = m * v_dim1 + 2;
+ z__2.r = refsum.r * v[i__8].r - refsum.i * v[i__8].i,
+ z__2.i = refsum.r * v[i__8].i + refsum.i * v[i__8]
+ .r;
+ z__1.r = h__[i__7].r - z__2.r, z__1.i = h__[i__7].i -
+ z__2.i;
+ h__[i__6].r = z__1.r, h__[i__6].i = z__1.i;
+ i__6 = k + 3 + j * h_dim1;
+ i__7 = k + 3 + j * h_dim1;
+ i__8 = m * v_dim1 + 3;
+ z__2.r = refsum.r * v[i__8].r - refsum.i * v[i__8].i,
+ z__2.i = refsum.r * v[i__8].i + refsum.i * v[i__8]
+ .r;
+ z__1.r = h__[i__7].r - z__2.r, z__1.i = h__[i__7].i -
+ z__2.i;
+ h__[i__6].r = z__1.r, h__[i__6].i = z__1.i;
+/* L20: */
+ }
+/* L30: */
+ }
+ if (bmp22) {
+ k = krcol + (m22 - 1) * 3;
+/* Computing MAX */
+ i__4 = k + 1;
+ i__5 = jbot;
+ for (j = max(i__4,*ktop); j <= i__5; ++j) {
+ d_cnjg(&z__2, &v[m22 * v_dim1 + 1]);
+ i__4 = k + 1 + j * h_dim1;
+ d_cnjg(&z__5, &v[m22 * v_dim1 + 2]);
+ i__6 = k + 2 + j * h_dim1;
+ z__4.r = z__5.r * h__[i__6].r - z__5.i * h__[i__6].i,
+ z__4.i = z__5.r * h__[i__6].i + z__5.i * h__[i__6]
+ .r;
+ z__3.r = h__[i__4].r + z__4.r, z__3.i = h__[i__4].i +
+ z__4.i;
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+ z__2.r * z__3.i + z__2.i * z__3.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__4 = k + 1 + j * h_dim1;
+ i__6 = k + 1 + j * h_dim1;
+ z__1.r = h__[i__6].r - refsum.r, z__1.i = h__[i__6].i -
+ refsum.i;
+ h__[i__4].r = z__1.r, h__[i__4].i = z__1.i;
+ i__4 = k + 2 + j * h_dim1;
+ i__6 = k + 2 + j * h_dim1;
+ i__7 = m22 * v_dim1 + 2;
+ z__2.r = refsum.r * v[i__7].r - refsum.i * v[i__7].i,
+ z__2.i = refsum.r * v[i__7].i + refsum.i * v[i__7]
+ .r;
+ z__1.r = h__[i__6].r - z__2.r, z__1.i = h__[i__6].i -
+ z__2.i;
+ h__[i__4].r = z__1.r, h__[i__4].i = z__1.i;
+/* L40: */
+ }
+ }
+
+/*
+ ==== Multiply H by reflections from the right.
+ . Delay filling in the last row until the
+ . vigilant deflation check is complete. ====
+*/
+
+ if (accum) {
+ jtop = max(*ktop,incol);
+ } else if (*wantt) {
+ jtop = 1;
+ } else {
+ jtop = *ktop;
+ }
+ i__5 = mbot;
+ for (m = mtop; m <= i__5; ++m) {
+ i__4 = m * v_dim1 + 1;
+ if (v[i__4].r != 0. || v[i__4].i != 0.) {
+ k = krcol + (m - 1) * 3;
+/* Computing MIN */
+ i__6 = *kbot, i__7 = k + 3;
+ i__4 = min(i__6,i__7);
+ for (j = jtop; j <= i__4; ++j) {
+ i__6 = m * v_dim1 + 1;
+ i__7 = j + (k + 1) * h_dim1;
+ i__8 = m * v_dim1 + 2;
+ i__9 = j + (k + 2) * h_dim1;
+ z__4.r = v[i__8].r * h__[i__9].r - v[i__8].i * h__[
+ i__9].i, z__4.i = v[i__8].r * h__[i__9].i + v[
+ i__8].i * h__[i__9].r;
+ z__3.r = h__[i__7].r + z__4.r, z__3.i = h__[i__7].i +
+ z__4.i;
+ i__10 = m * v_dim1 + 3;
+ i__11 = j + (k + 3) * h_dim1;
+ z__5.r = v[i__10].r * h__[i__11].r - v[i__10].i * h__[
+ i__11].i, z__5.i = v[i__10].r * h__[i__11].i
+ + v[i__10].i * h__[i__11].r;
+ z__2.r = z__3.r + z__5.r, z__2.i = z__3.i + z__5.i;
+ z__1.r = v[i__6].r * z__2.r - v[i__6].i * z__2.i,
+ z__1.i = v[i__6].r * z__2.i + v[i__6].i *
+ z__2.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__6 = j + (k + 1) * h_dim1;
+ i__7 = j + (k + 1) * h_dim1;
+ z__1.r = h__[i__7].r - refsum.r, z__1.i = h__[i__7].i
+ - refsum.i;
+ h__[i__6].r = z__1.r, h__[i__6].i = z__1.i;
+ i__6 = j + (k + 2) * h_dim1;
+ i__7 = j + (k + 2) * h_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 2]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = h__[i__7].r - z__2.r, z__1.i = h__[i__7].i -
+ z__2.i;
+ h__[i__6].r = z__1.r, h__[i__6].i = z__1.i;
+ i__6 = j + (k + 3) * h_dim1;
+ i__7 = j + (k + 3) * h_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 3]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = h__[i__7].r - z__2.r, z__1.i = h__[i__7].i -
+ z__2.i;
+ h__[i__6].r = z__1.r, h__[i__6].i = z__1.i;
+/* L50: */
+ }
+
+ if (accum) {
+
+/*
+ ==== Accumulate U. (If necessary, update Z later
+                   .     with an efficient matrix-matrix
+ . multiply.) ====
+*/
+
+ kms = k - incol;
+/* Computing MAX */
+ i__4 = 1, i__6 = *ktop - incol;
+ i__7 = kdu;
+ for (j = max(i__4,i__6); j <= i__7; ++j) {
+ i__4 = m * v_dim1 + 1;
+ i__6 = j + (kms + 1) * u_dim1;
+ i__8 = m * v_dim1 + 2;
+ i__9 = j + (kms + 2) * u_dim1;
+ z__4.r = v[i__8].r * u[i__9].r - v[i__8].i * u[
+ i__9].i, z__4.i = v[i__8].r * u[i__9].i +
+ v[i__8].i * u[i__9].r;
+ z__3.r = u[i__6].r + z__4.r, z__3.i = u[i__6].i +
+ z__4.i;
+ i__10 = m * v_dim1 + 3;
+ i__11 = j + (kms + 3) * u_dim1;
+ z__5.r = v[i__10].r * u[i__11].r - v[i__10].i * u[
+ i__11].i, z__5.i = v[i__10].r * u[i__11]
+ .i + v[i__10].i * u[i__11].r;
+ z__2.r = z__3.r + z__5.r, z__2.i = z__3.i +
+ z__5.i;
+ z__1.r = v[i__4].r * z__2.r - v[i__4].i * z__2.i,
+ z__1.i = v[i__4].r * z__2.i + v[i__4].i *
+ z__2.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__4 = j + (kms + 1) * u_dim1;
+ i__6 = j + (kms + 1) * u_dim1;
+ z__1.r = u[i__6].r - refsum.r, z__1.i = u[i__6].i
+ - refsum.i;
+ u[i__4].r = z__1.r, u[i__4].i = z__1.i;
+ i__4 = j + (kms + 2) * u_dim1;
+ i__6 = j + (kms + 2) * u_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 2]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = u[i__6].r - z__2.r, z__1.i = u[i__6].i -
+ z__2.i;
+ u[i__4].r = z__1.r, u[i__4].i = z__1.i;
+ i__4 = j + (kms + 3) * u_dim1;
+ i__6 = j + (kms + 3) * u_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 3]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = u[i__6].r - z__2.r, z__1.i = u[i__6].i -
+ z__2.i;
+ u[i__4].r = z__1.r, u[i__4].i = z__1.i;
+/* L60: */
+ }
+ } else if (*wantz) {
+
+/*
+ ==== U is not accumulated, so update Z
+ . now by multiplying by reflections
+ . from the right. ====
+*/
+
+ i__7 = *ihiz;
+ for (j = *iloz; j <= i__7; ++j) {
+ i__4 = m * v_dim1 + 1;
+ i__6 = j + (k + 1) * z_dim1;
+ i__8 = m * v_dim1 + 2;
+ i__9 = j + (k + 2) * z_dim1;
+ z__4.r = v[i__8].r * z__[i__9].r - v[i__8].i *
+ z__[i__9].i, z__4.i = v[i__8].r * z__[
+ i__9].i + v[i__8].i * z__[i__9].r;
+ z__3.r = z__[i__6].r + z__4.r, z__3.i = z__[i__6]
+ .i + z__4.i;
+ i__10 = m * v_dim1 + 3;
+ i__11 = j + (k + 3) * z_dim1;
+ z__5.r = v[i__10].r * z__[i__11].r - v[i__10].i *
+ z__[i__11].i, z__5.i = v[i__10].r * z__[
+ i__11].i + v[i__10].i * z__[i__11].r;
+ z__2.r = z__3.r + z__5.r, z__2.i = z__3.i +
+ z__5.i;
+ z__1.r = v[i__4].r * z__2.r - v[i__4].i * z__2.i,
+ z__1.i = v[i__4].r * z__2.i + v[i__4].i *
+ z__2.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__4 = j + (k + 1) * z_dim1;
+ i__6 = j + (k + 1) * z_dim1;
+ z__1.r = z__[i__6].r - refsum.r, z__1.i = z__[
+ i__6].i - refsum.i;
+ z__[i__4].r = z__1.r, z__[i__4].i = z__1.i;
+ i__4 = j + (k + 2) * z_dim1;
+ i__6 = j + (k + 2) * z_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 2]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = z__[i__6].r - z__2.r, z__1.i = z__[i__6]
+ .i - z__2.i;
+ z__[i__4].r = z__1.r, z__[i__4].i = z__1.i;
+ i__4 = j + (k + 3) * z_dim1;
+ i__6 = j + (k + 3) * z_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 3]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = z__[i__6].r - z__2.r, z__1.i = z__[i__6]
+ .i - z__2.i;
+ z__[i__4].r = z__1.r, z__[i__4].i = z__1.i;
+/* L70: */
+ }
+ }
+ }
+/* L80: */
+ }
+
+/* ==== Special case: 2-by-2 reflection (if needed) ==== */
+
+ k = krcol + (m22 - 1) * 3;
+ i__5 = m22 * v_dim1 + 1;
+ if (bmp22 && (v[i__5].r != 0. || v[i__5].i != 0.)) {
+/* Computing MIN */
+ i__7 = *kbot, i__4 = k + 3;
+ i__5 = min(i__7,i__4);
+ for (j = jtop; j <= i__5; ++j) {
+ i__7 = m22 * v_dim1 + 1;
+ i__4 = j + (k + 1) * h_dim1;
+ i__6 = m22 * v_dim1 + 2;
+ i__8 = j + (k + 2) * h_dim1;
+ z__3.r = v[i__6].r * h__[i__8].r - v[i__6].i * h__[i__8]
+ .i, z__3.i = v[i__6].r * h__[i__8].i + v[i__6].i *
+ h__[i__8].r;
+ z__2.r = h__[i__4].r + z__3.r, z__2.i = h__[i__4].i +
+ z__3.i;
+ z__1.r = v[i__7].r * z__2.r - v[i__7].i * z__2.i, z__1.i =
+ v[i__7].r * z__2.i + v[i__7].i * z__2.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__7 = j + (k + 1) * h_dim1;
+ i__4 = j + (k + 1) * h_dim1;
+ z__1.r = h__[i__4].r - refsum.r, z__1.i = h__[i__4].i -
+ refsum.i;
+ h__[i__7].r = z__1.r, h__[i__7].i = z__1.i;
+ i__7 = j + (k + 2) * h_dim1;
+ i__4 = j + (k + 2) * h_dim1;
+ d_cnjg(&z__3, &v[m22 * v_dim1 + 2]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i, z__2.i =
+ refsum.r * z__3.i + refsum.i * z__3.r;
+ z__1.r = h__[i__4].r - z__2.r, z__1.i = h__[i__4].i -
+ z__2.i;
+ h__[i__7].r = z__1.r, h__[i__7].i = z__1.i;
+/* L90: */
+ }
+
+ if (accum) {
+ kms = k - incol;
+/* Computing MAX */
+ i__5 = 1, i__7 = *ktop - incol;
+ i__4 = kdu;
+ for (j = max(i__5,i__7); j <= i__4; ++j) {
+ i__5 = m22 * v_dim1 + 1;
+ i__7 = j + (kms + 1) * u_dim1;
+ i__6 = m22 * v_dim1 + 2;
+ i__8 = j + (kms + 2) * u_dim1;
+ z__3.r = v[i__6].r * u[i__8].r - v[i__6].i * u[i__8]
+ .i, z__3.i = v[i__6].r * u[i__8].i + v[i__6]
+ .i * u[i__8].r;
+ z__2.r = u[i__7].r + z__3.r, z__2.i = u[i__7].i +
+ z__3.i;
+ z__1.r = v[i__5].r * z__2.r - v[i__5].i * z__2.i,
+ z__1.i = v[i__5].r * z__2.i + v[i__5].i *
+ z__2.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__5 = j + (kms + 1) * u_dim1;
+ i__7 = j + (kms + 1) * u_dim1;
+ z__1.r = u[i__7].r - refsum.r, z__1.i = u[i__7].i -
+ refsum.i;
+ u[i__5].r = z__1.r, u[i__5].i = z__1.i;
+ i__5 = j + (kms + 2) * u_dim1;
+ i__7 = j + (kms + 2) * u_dim1;
+ d_cnjg(&z__3, &v[m22 * v_dim1 + 2]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = u[i__7].r - z__2.r, z__1.i = u[i__7].i -
+ z__2.i;
+ u[i__5].r = z__1.r, u[i__5].i = z__1.i;
+/* L100: */
+ }
+ } else if (*wantz) {
+ i__4 = *ihiz;
+ for (j = *iloz; j <= i__4; ++j) {
+ i__5 = m22 * v_dim1 + 1;
+ i__7 = j + (k + 1) * z_dim1;
+ i__6 = m22 * v_dim1 + 2;
+ i__8 = j + (k + 2) * z_dim1;
+ z__3.r = v[i__6].r * z__[i__8].r - v[i__6].i * z__[
+ i__8].i, z__3.i = v[i__6].r * z__[i__8].i + v[
+ i__6].i * z__[i__8].r;
+ z__2.r = z__[i__7].r + z__3.r, z__2.i = z__[i__7].i +
+ z__3.i;
+ z__1.r = v[i__5].r * z__2.r - v[i__5].i * z__2.i,
+ z__1.i = v[i__5].r * z__2.i + v[i__5].i *
+ z__2.r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__5 = j + (k + 1) * z_dim1;
+ i__7 = j + (k + 1) * z_dim1;
+ z__1.r = z__[i__7].r - refsum.r, z__1.i = z__[i__7].i
+ - refsum.i;
+ z__[i__5].r = z__1.r, z__[i__5].i = z__1.i;
+ i__5 = j + (k + 2) * z_dim1;
+ i__7 = j + (k + 2) * z_dim1;
+ d_cnjg(&z__3, &v[m22 * v_dim1 + 2]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i,
+ z__2.i = refsum.r * z__3.i + refsum.i *
+ z__3.r;
+ z__1.r = z__[i__7].r - z__2.r, z__1.i = z__[i__7].i -
+ z__2.i;
+ z__[i__5].r = z__1.r, z__[i__5].i = z__1.i;
+/* L110: */
+ }
+ }
+ }
+
+/* ==== Vigilant deflation check ==== */
+
+ mstart = mtop;
+ if (krcol + (mstart - 1) * 3 < *ktop) {
+ ++mstart;
+ }
+ mend = mbot;
+ if (bmp22) {
+ ++mend;
+ }
+ if (krcol == *kbot - 2) {
+ ++mend;
+ }
+ i__4 = mend;
+ for (m = mstart; m <= i__4; ++m) {
+/* Computing MIN */
+ i__5 = *kbot - 1, i__7 = krcol + (m - 1) * 3;
+ k = min(i__5,i__7);
+
+/*
+ ==== The following convergence test requires that
+           .    the traditional small-compared-to-nearby-diagonals
+ . criterion and the Ahues & Tisseur (LAWN 122, 1997)
+ . criteria both be satisfied. The latter improves
+ . accuracy in some examples. Falling back on an
+ . alternate convergence criterion when TST1 or TST2
+ . is zero (as done here) is traditional but probably
+ . unnecessary. ====
+*/
+
+ i__5 = k + 1 + k * h_dim1;
+ if (h__[i__5].r != 0. || h__[i__5].i != 0.) {
+ i__5 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ tst1 = (d__1 = h__[i__5].r, abs(d__1)) + (d__2 = d_imag(&
+ h__[k + k * h_dim1]), abs(d__2)) + ((d__3 = h__[
+ i__7].r, abs(d__3)) + (d__4 = d_imag(&h__[k + 1 +
+ (k + 1) * h_dim1]), abs(d__4)));
+ if (tst1 == 0.) {
+ if (k >= *ktop + 1) {
+ i__5 = k + (k - 1) * h_dim1;
+ tst1 += (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + (k - 1) * h_dim1]), abs(
+ d__2));
+ }
+ if (k >= *ktop + 2) {
+ i__5 = k + (k - 2) * h_dim1;
+ tst1 += (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + (k - 2) * h_dim1]), abs(
+ d__2));
+ }
+ if (k >= *ktop + 3) {
+ i__5 = k + (k - 3) * h_dim1;
+ tst1 += (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + (k - 3) * h_dim1]), abs(
+ d__2));
+ }
+ if (k <= *kbot - 2) {
+ i__5 = k + 2 + (k + 1) * h_dim1;
+ tst1 += (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 2 + (k + 1) * h_dim1]),
+ abs(d__2));
+ }
+ if (k <= *kbot - 3) {
+ i__5 = k + 3 + (k + 1) * h_dim1;
+ tst1 += (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 3 + (k + 1) * h_dim1]),
+ abs(d__2));
+ }
+ if (k <= *kbot - 4) {
+ i__5 = k + 4 + (k + 1) * h_dim1;
+ tst1 += (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 4 + (k + 1) * h_dim1]),
+ abs(d__2));
+ }
+ }
+ i__5 = k + 1 + k * h_dim1;
+/* Computing MAX */
+ d__3 = smlnum, d__4 = ulp * tst1;
+ if ((d__1 = h__[i__5].r, abs(d__1)) + (d__2 = d_imag(&h__[
+ k + 1 + k * h_dim1]), abs(d__2)) <= max(d__3,d__4)
+ ) {
+/* Computing MAX */
+ i__5 = k + 1 + k * h_dim1;
+ i__7 = k + (k + 1) * h_dim1;
+ d__5 = (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 1 + k * h_dim1]), abs(d__2)),
+ d__6 = (d__3 = h__[i__7].r, abs(d__3)) + (
+ d__4 = d_imag(&h__[k + (k + 1) * h_dim1]),
+ abs(d__4));
+ h12 = max(d__5,d__6);
+/* Computing MIN */
+ i__5 = k + 1 + k * h_dim1;
+ i__7 = k + (k + 1) * h_dim1;
+ d__5 = (d__1 = h__[i__5].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 1 + k * h_dim1]), abs(d__2)),
+ d__6 = (d__3 = h__[i__7].r, abs(d__3)) + (
+ d__4 = d_imag(&h__[k + (k + 1) * h_dim1]),
+ abs(d__4));
+ h21 = min(d__5,d__6);
+ i__5 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ z__2.r = h__[i__5].r - h__[i__7].r, z__2.i = h__[i__5]
+ .i - h__[i__7].i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+/* Computing MAX */
+ i__6 = k + 1 + (k + 1) * h_dim1;
+ d__5 = (d__1 = h__[i__6].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 1 + (k + 1) * h_dim1]), abs(
+ d__2)), d__6 = (d__3 = z__1.r, abs(d__3)) + (
+ d__4 = d_imag(&z__1), abs(d__4));
+ h11 = max(d__5,d__6);
+ i__5 = k + k * h_dim1;
+ i__7 = k + 1 + (k + 1) * h_dim1;
+ z__2.r = h__[i__5].r - h__[i__7].r, z__2.i = h__[i__5]
+ .i - h__[i__7].i;
+ z__1.r = z__2.r, z__1.i = z__2.i;
+/* Computing MIN */
+ i__6 = k + 1 + (k + 1) * h_dim1;
+ d__5 = (d__1 = h__[i__6].r, abs(d__1)) + (d__2 =
+ d_imag(&h__[k + 1 + (k + 1) * h_dim1]), abs(
+ d__2)), d__6 = (d__3 = z__1.r, abs(d__3)) + (
+ d__4 = d_imag(&z__1), abs(d__4));
+ h22 = min(d__5,d__6);
+ scl = h11 + h12;
+ tst2 = h22 * (h11 / scl);
+
+/* Computing MAX */
+ d__1 = smlnum, d__2 = ulp * tst2;
+ if (tst2 == 0. || h21 * (h12 / scl) <= max(d__1,d__2))
+ {
+ i__5 = k + 1 + k * h_dim1;
+ h__[i__5].r = 0., h__[i__5].i = 0.;
+ }
+ }
+ }
+/* L120: */
+ }
+
+/*
+ ==== Fill in the last row of each bulge. ====
+
+ Computing MIN
+*/
+ i__4 = nbmps, i__5 = (*kbot - krcol - 1) / 3;
+ mend = min(i__4,i__5);
+ i__4 = mend;
+ for (m = mtop; m <= i__4; ++m) {
+ k = krcol + (m - 1) * 3;
+ i__5 = m * v_dim1 + 1;
+ i__7 = m * v_dim1 + 3;
+ z__2.r = v[i__5].r * v[i__7].r - v[i__5].i * v[i__7].i,
+ z__2.i = v[i__5].r * v[i__7].i + v[i__5].i * v[i__7]
+ .r;
+ i__6 = k + 4 + (k + 3) * h_dim1;
+ z__1.r = z__2.r * h__[i__6].r - z__2.i * h__[i__6].i, z__1.i =
+ z__2.r * h__[i__6].i + z__2.i * h__[i__6].r;
+ refsum.r = z__1.r, refsum.i = z__1.i;
+ i__5 = k + 4 + (k + 1) * h_dim1;
+ z__1.r = -refsum.r, z__1.i = -refsum.i;
+ h__[i__5].r = z__1.r, h__[i__5].i = z__1.i;
+ i__5 = k + 4 + (k + 2) * h_dim1;
+ z__2.r = -refsum.r, z__2.i = -refsum.i;
+ d_cnjg(&z__3, &v[m * v_dim1 + 2]);
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = z__2.r *
+ z__3.i + z__2.i * z__3.r;
+ h__[i__5].r = z__1.r, h__[i__5].i = z__1.i;
+ i__5 = k + 4 + (k + 3) * h_dim1;
+ i__7 = k + 4 + (k + 3) * h_dim1;
+ d_cnjg(&z__3, &v[m * v_dim1 + 3]);
+ z__2.r = refsum.r * z__3.r - refsum.i * z__3.i, z__2.i =
+ refsum.r * z__3.i + refsum.i * z__3.r;
+ z__1.r = h__[i__7].r - z__2.r, z__1.i = h__[i__7].i - z__2.i;
+ h__[i__5].r = z__1.r, h__[i__5].i = z__1.i;
+/* L130: */
+ }
+
+/*
+ ==== End of near-the-diagonal bulge chase. ====
+
+ L140:
+*/
+ }
+
+/*
+ ==== Use U (if accumulated) to update far-from-diagonal
+ . entries in H. If required, use U to update Z as
+ . well. ====
+*/
+
+ if (accum) {
+ if (*wantt) {
+ jtop = 1;
+ jbot = *n;
+ } else {
+ jtop = *ktop;
+ jbot = *kbot;
+ }
+ if (! blk22 || incol < *ktop || ndcol > *kbot || ns <= 2) {
+
+/*
+ ==== Updates not exploiting the 2-by-2 block
+ . structure of U. K1 and NU keep track of
+ . the location and size of U in the special
+ . cases of introducing bulges and chasing
+ . bulges off the bottom. In these special
+ . cases and in case the number of shifts
+ . is NS = 2, there is no 2-by-2 block
+ . structure to exploit. ====
+
+ Computing MAX
+*/
+ i__3 = 1, i__4 = *ktop - incol;
+ k1 = max(i__3,i__4);
+/* Computing MAX */
+ i__3 = 0, i__4 = ndcol - *kbot;
+ nu = kdu - max(i__3,i__4) - k1 + 1;
+
+/* ==== Horizontal Multiply ==== */
+
+ i__3 = jbot;
+ i__4 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__4 < 0 ? jcol >= i__3 :
+ jcol <= i__3; jcol += i__4) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+ zgemm_("C", "N", &nu, &jlen, &nu, &c_b57, &u[k1 + k1 *
+ u_dim1], ldu, &h__[incol + k1 + jcol * h_dim1],
+ ldh, &c_b56, &wh[wh_offset], ldwh);
+ zlacpy_("ALL", &nu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + k1 + jcol * h_dim1], ldh);
+/* L150: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__4 = max(*ktop,incol) - 1;
+ i__3 = *nv;
+ for (jrow = jtop; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(*ktop,incol) - jrow;
+ jlen = min(i__5,i__7);
+ zgemm_("N", "N", &jlen, &nu, &nu, &c_b57, &h__[jrow + (
+ incol + k1) * h_dim1], ldh, &u[k1 + k1 * u_dim1],
+ ldu, &c_b56, &wv[wv_offset], ldwv);
+ zlacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + k1) * h_dim1], ldh);
+/* L160: */
+ }
+
+/* ==== Z multiply (also vertical) ==== */
+
+ if (*wantz) {
+ i__3 = *ihiz;
+ i__4 = *nv;
+ for (jrow = *iloz; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+ zgemm_("N", "N", &jlen, &nu, &nu, &c_b57, &z__[jrow +
+ (incol + k1) * z_dim1], ldz, &u[k1 + k1 *
+ u_dim1], ldu, &c_b56, &wv[wv_offset], ldwv);
+ zlacpy_("ALL", &jlen, &nu, &wv[wv_offset], ldwv, &z__[
+ jrow + (incol + k1) * z_dim1], ldz)
+ ;
+/* L170: */
+ }
+ }
+ } else {
+
+/*
+ ==== Updates exploiting U's 2-by-2 block structure.
+ . (I2, I4, J2, J4 are the last rows and columns
+ . of the blocks.) ====
+*/
+
+ i2 = (kdu + 1) / 2;
+ i4 = kdu;
+ j2 = i4 - i2;
+ j4 = kdu;
+
+/*
+ ==== KZS and KNZ deal with the band of zeros
+ . along the diagonal of one of the triangular
+ . blocks. ====
+*/
+
+ kzs = j4 - j2 - (ns + 1);
+ knz = ns + 1;
+
+/* ==== Horizontal multiply ==== */
+
+ i__4 = jbot;
+ i__3 = *nh;
+ for (jcol = min(ndcol,*kbot) + 1; i__3 < 0 ? jcol >= i__4 :
+ jcol <= i__4; jcol += i__3) {
+/* Computing MIN */
+ i__5 = *nh, i__7 = jbot - jcol + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy bottom of H to top+KZS of scratch ====
+ (The first KZS rows get multiplied by zero.) ====
+*/
+
+ zlacpy_("ALL", &knz, &jlen, &h__[incol + 1 + j2 + jcol *
+ h_dim1], ldh, &wh[kzs + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ zlaset_("ALL", &kzs, &jlen, &c_b56, &c_b56, &wh[wh_offset]
+ , ldwh);
+ ztrmm_("L", "U", "C", "N", &knz, &jlen, &c_b57, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wh[kzs + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Multiply top of H by U11' ==== */
+
+ zgemm_("C", "N", &i2, &jlen, &j2, &c_b57, &u[u_offset],
+ ldu, &h__[incol + 1 + jcol * h_dim1], ldh, &c_b57,
+ &wh[wh_offset], ldwh);
+
+/* ==== Copy top of H to bottom of WH ==== */
+
+ zlacpy_("ALL", &j2, &jlen, &h__[incol + 1 + jcol * h_dim1]
+ , ldh, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U21' ==== */
+
+ ztrmm_("L", "L", "C", "N", &j2, &jlen, &c_b57, &u[(i2 + 1)
+ * u_dim1 + 1], ldu, &wh[i2 + 1 + wh_dim1], ldwh);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ zgemm_("C", "N", &i__5, &jlen, &i__7, &c_b57, &u[j2 + 1 +
+ (i2 + 1) * u_dim1], ldu, &h__[incol + 1 + j2 +
+ jcol * h_dim1], ldh, &c_b57, &wh[i2 + 1 + wh_dim1]
+ , ldwh);
+
+/* ==== Copy it back ==== */
+
+ zlacpy_("ALL", &kdu, &jlen, &wh[wh_offset], ldwh, &h__[
+ incol + 1 + jcol * h_dim1], ldh);
+/* L180: */
+ }
+
+/* ==== Vertical multiply ==== */
+
+ i__3 = max(incol,*ktop) - 1;
+ i__4 = *nv;
+ for (jrow = jtop; i__4 < 0 ? jrow >= i__3 : jrow <= i__3;
+ jrow += i__4) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = max(incol,*ktop) - jrow;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of H to scratch (the first KZS
+ . columns get multiplied by zero) ====
+*/
+
+ zlacpy_("ALL", &jlen, &knz, &h__[jrow + (incol + 1 + j2) *
+ h_dim1], ldh, &wv[(kzs + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ zlaset_("ALL", &jlen, &kzs, &c_b56, &c_b56, &wv[wv_offset]
+ , ldwv);
+ ztrmm_("R", "U", "N", "N", &jlen, &knz, &c_b57, &u[j2 + 1
+ + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ zgemm_("N", "N", &jlen, &i2, &j2, &c_b57, &h__[jrow + (
+ incol + 1) * h_dim1], ldh, &u[u_offset], ldu, &
+ c_b57, &wv[wv_offset], ldwv)
+ ;
+
+/* ==== Copy left of H to right of scratch ==== */
+
+ zlacpy_("ALL", &jlen, &j2, &h__[jrow + (incol + 1) *
+ h_dim1], ldh, &wv[(i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ ztrmm_("R", "L", "N", "N", &jlen, &i__5, &c_b57, &u[(i2 +
+ 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) * wv_dim1 + 1]
+ , ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ zgemm_("N", "N", &jlen, &i__5, &i__7, &c_b57, &h__[jrow +
+ (incol + 1 + j2) * h_dim1], ldh, &u[j2 + 1 + (i2
+ + 1) * u_dim1], ldu, &c_b57, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Copy it back ==== */
+
+ zlacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &h__[
+ jrow + (incol + 1) * h_dim1], ldh);
+/* L190: */
+ }
+
+/* ==== Multiply Z (also vertical) ==== */
+
+ if (*wantz) {
+ i__4 = *ihiz;
+ i__3 = *nv;
+ for (jrow = *iloz; i__3 < 0 ? jrow >= i__4 : jrow <= i__4;
+ jrow += i__3) {
+/* Computing MIN */
+ i__5 = *nv, i__7 = *ihiz - jrow + 1;
+ jlen = min(i__5,i__7);
+
+/*
+ ==== Copy right of Z to left of scratch (first
+ . KZS columns get multiplied by zero) ====
+*/
+
+ zlacpy_("ALL", &jlen, &knz, &z__[jrow + (incol + 1 +
+ j2) * z_dim1], ldz, &wv[(kzs + 1) * wv_dim1 +
+ 1], ldwv);
+
+/* ==== Multiply by U12 ==== */
+
+ zlaset_("ALL", &jlen, &kzs, &c_b56, &c_b56, &wv[
+ wv_offset], ldwv);
+ ztrmm_("R", "U", "N", "N", &jlen, &knz, &c_b57, &u[j2
+ + 1 + (kzs + 1) * u_dim1], ldu, &wv[(kzs + 1)
+ * wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U11 ==== */
+
+ zgemm_("N", "N", &jlen, &i2, &j2, &c_b57, &z__[jrow +
+ (incol + 1) * z_dim1], ldz, &u[u_offset], ldu,
+ &c_b57, &wv[wv_offset], ldwv);
+
+/* ==== Copy left of Z to right of scratch ==== */
+
+ zlacpy_("ALL", &jlen, &j2, &z__[jrow + (incol + 1) *
+ z_dim1], ldz, &wv[(i2 + 1) * wv_dim1 + 1],
+ ldwv);
+
+/* ==== Multiply by U21 ==== */
+
+ i__5 = i4 - i2;
+ ztrmm_("R", "L", "N", "N", &jlen, &i__5, &c_b57, &u[(
+ i2 + 1) * u_dim1 + 1], ldu, &wv[(i2 + 1) *
+ wv_dim1 + 1], ldwv);
+
+/* ==== Multiply by U22 ==== */
+
+ i__5 = i4 - i2;
+ i__7 = j4 - j2;
+ zgemm_("N", "N", &jlen, &i__5, &i__7, &c_b57, &z__[
+ jrow + (incol + 1 + j2) * z_dim1], ldz, &u[j2
+ + 1 + (i2 + 1) * u_dim1], ldu, &c_b57, &wv[(
+ i2 + 1) * wv_dim1 + 1], ldwv);
+
+/* ==== Copy the result back to Z ==== */
+
+ zlacpy_("ALL", &jlen, &kdu, &wv[wv_offset], ldwv, &
+ z__[jrow + (incol + 1) * z_dim1], ldz);
+/* L200: */
+ }
+ }
+ }
+ }
+/* L210: */
+ }
+
+/* ==== End of ZLAQR5 ==== */
+
+ return 0;
+} /* zlaqr5_ */
+
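+/*
+    The left/right updates above all follow one pattern: with the 3-element
+    reflector stored as v = (tau, v2, v3) -- v(1) holds the scalar tau
+    returned by ZLARFG -- the left sweep replaces each column slice
+    h = H(k+1:k+3, j) by h - refsum*(1, v2, v3)' with
+    refsum = conj(tau)*(h1 + conj(v2)*h2 + conj(v3)*h3); the right-hand and
+    accumulation sweeps use the same recipe with conjugation on the other
+    side.  The helper below is a minimal standalone sketch of that single
+    column update using plain real/imaginary double pairs instead of the f2c
+    doublecomplex type; its name is illustrative and it is not part of the
+    translated LAPACK source.
+*/
+static void reflect3_left_sketch(double tau_r, double tau_i,
+                                 double v2_r, double v2_i,
+                                 double v3_r, double v3_i,
+                                 double h[3][2])
+{
+    /* refsum = conj(tau) * (h1 + conj(v2)*h2 + conj(v3)*h3) */
+    double s_r = h[0][0] + (v2_r * h[1][0] + v2_i * h[1][1])
+                         + (v3_r * h[2][0] + v3_i * h[2][1]);
+    double s_i = h[0][1] + (v2_r * h[1][1] - v2_i * h[1][0])
+                         + (v3_r * h[2][1] - v3_i * h[2][0]);
+    double refsum_r = tau_r * s_r + tau_i * s_i;
+    double refsum_i = tau_r * s_i - tau_i * s_r;
+
+    /* h := h - refsum * (1, v2, v3)^T */
+    h[0][0] -= refsum_r;
+    h[0][1] -= refsum_i;
+    h[1][0] -= refsum_r * v2_r - refsum_i * v2_i;
+    h[1][1] -= refsum_r * v2_i + refsum_i * v2_r;
+    h[2][0] -= refsum_r * v3_r - refsum_i * v3_i;
+    h[2][1] -= refsum_r * v3_i + refsum_i * v3_r;
+}
+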
+/* Subroutine */ int zlarcm_(integer *m, integer *n, doublereal *a, integer *
+ lda, doublecomplex *b, integer *ldb, doublecomplex *c__, integer *ldc,
+ doublereal *rwork)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+ i__3, i__4, i__5;
+ doublereal d__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *,
+ integer *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, doublereal *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLARCM performs a very simple matrix-matrix multiplication:
+ C := A * B,
+ where A is M by M and real; B is M by N and complex;
+ C is M by N and complex.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A and of the matrix C.
+ M >= 0.
+
+ N (input) INTEGER
+            The number of columns of the matrix B and
+ the number of columns of the matrix C.
+ N >= 0.
+
+ A (input) DOUBLE PRECISION array, dimension (LDA, M)
+ A contains the M by M matrix A.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >=max(1,M).
+
+    B       (input) COMPLEX*16 array, dimension (LDB, N)
+ B contains the M by N matrix B.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >=max(1,M).
+
+    C       (output) COMPLEX*16 array, dimension (LDC, N)
+            On exit, C contains the M by N matrix A*B.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >=max(1,M).
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension (2*M*N)
+
+ =====================================================================
+
+
+ Quick return if possible.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --rwork;
+
+ /* Function Body */
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * b_dim1;
+ rwork[(j - 1) * *m + i__] = b[i__3].r;
+/* L10: */
+ }
+/* L20: */
+ }
+
+ l = *m * *n + 1;
+ dgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = l + (j - 1) * *m + i__ - 1;
+ c__[i__3].r = rwork[i__4], c__[i__3].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ rwork[(j - 1) * *m + i__] = d_imag(&b[i__ + j * b_dim1]);
+/* L50: */
+ }
+/* L60: */
+ }
+ dgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, &
+ c_b328, &rwork[l], m);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ d__1 = c__[i__4].r;
+ i__5 = l + (j - 1) * *m + i__ - 1;
+ z__1.r = d__1, z__1.i = rwork[i__5];
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L70: */
+ }
+/* L80: */
+ }
+
+ return 0;
+
+/* End of ZLARCM */
+
+} /* zlarcm_ */
+
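+/*
+    ZLARCM above exploits the fact that A is real: C = A*B splits into two
+    real products, Re(C) = A*Re(B) and Im(C) = A*Im(B), which is why the
+    routine stages Re(B) and Im(B) in RWORK and makes two DGEMM calls.  The
+    helper below is a minimal standalone sketch of that identity with plain
+    column-major double arrays and no workspace; its name is illustrative
+    and it is not part of the translated LAPACK source.
+*/
+static void real_times_complex_sketch(int m, int n,
+                                      const double *a,  /* m-by-m, real  */
+                                      const double *br, /* m-by-n, Re(B) */
+                                      const double *bi, /* m-by-n, Im(B) */
+                                      double *cr,       /* m-by-n, Re(C) */
+                                      double *ci)       /* m-by-n, Im(C) */
+{
+    int i, j, l;
+    for (j = 0; j < n; ++j) {
+        for (i = 0; i < m; ++i) {
+            double sr = 0., si = 0.;
+            for (l = 0; l < m; ++l) {
+                sr += a[i + l * m] * br[l + j * m]; /* (A * Re(B))(i,j) */
+                si += a[i + l * m] * bi[l + j * m]; /* (A * Im(B))(i,j) */
+            }
+            cr[i + j * m] = sr;
+            ci[i + j * m] = si;
+        }
+    }
+}
+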
+/* Subroutine */ int zlarf_(char *side, integer *m, integer *n, doublecomplex
+ *v, integer *incv, doublecomplex *tau, doublecomplex *c__, integer *
+ ldc, doublecomplex *work)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, i__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__;
+ static logical applyleft;
+ extern logical lsame_(char *, char *);
+ static integer lastc;
+ extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *);
+ static integer lastv;
+ extern integer ilazlc_(integer *, integer *, doublecomplex *, integer *),
+ ilazlr_(integer *, integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLARF applies a complex elementary reflector H to a complex M-by-N
+ matrix C, from either the left or the right. H is represented in the
+ form
+
+ H = I - tau * v * v'
+
+ where tau is a complex scalar and v is a complex vector.
+
+ If tau = 0, then H is taken to be the unit matrix.
+
+ To apply H' (the conjugate transpose of H), supply conjg(tau) instead
+    of tau.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': form H * C
+ = 'R': form C * H
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ V (input) COMPLEX*16 array, dimension
+ (1 + (M-1)*abs(INCV)) if SIDE = 'L'
+ or (1 + (N-1)*abs(INCV)) if SIDE = 'R'
+ The vector v in the representation of H. V is not used if
+ TAU = 0.
+
+ INCV (input) INTEGER
+ The increment between elements of v. INCV <> 0.
+
+ TAU (input) COMPLEX*16
+ The value tau in the representation of H.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by the matrix H * C if SIDE = 'L',
+ or C * H if SIDE = 'R'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX*16 array, dimension
+ (N) if SIDE = 'L'
+ or (M) if SIDE = 'R'
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --v;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ applyleft = lsame_(side, "L");
+ lastv = 0;
+ lastc = 0;
+ if (tau->r != 0. || tau->i != 0.) {
+/*
+ Set up variables for scanning V. LASTV begins pointing to the end
+ of V.
+*/
+ if (applyleft) {
+ lastv = *m;
+ } else {
+ lastv = *n;
+ }
+ if (*incv > 0) {
+ i__ = (lastv - 1) * *incv + 1;
+ } else {
+ i__ = 1;
+ }
+/* Look for the last non-zero row in V. */
+ for(;;) { /* while(complicated condition) */
+ i__1 = i__;
+ if (!(lastv > 0 && (v[i__1].r == 0. && v[i__1].i == 0.)))
+ break;
+ --lastv;
+ i__ -= *incv;
+ }
+ if (applyleft) {
+/* Scan for the last non-zero column in C(1:lastv,:). */
+ lastc = ilazlc_(&lastv, n, &c__[c_offset], ldc);
+ } else {
+/* Scan for the last non-zero row in C(:,1:lastv). */
+ lastc = ilazlr_(m, &lastv, &c__[c_offset], ldc);
+ }
+ }
+/*
+ Note that lastc.eq.0 renders the BLAS operations null; no special
+ case is needed at this level.
+*/
+ if (applyleft) {
+
+/* Form H * C */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastv,1:lastc)' * v(1:lastv,1) */
+
+ zgemv_("Conjugate transpose", &lastv, &lastc, &c_b57, &c__[
+ c_offset], ldc, &v[1], incv, &c_b56, &work[1], &c__1);
+
+/* C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)' */
+
+ z__1.r = -tau->r, z__1.i = -tau->i;
+ zgerc_(&lastv, &lastc, &z__1, &v[1], incv, &work[1], &c__1, &c__[
+ c_offset], ldc);
+ }
+ } else {
+
+/* Form C * H */
+
+ if (lastv > 0) {
+
+/* w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1) */
+
+ zgemv_("No transpose", &lastc, &lastv, &c_b57, &c__[c_offset],
+ ldc, &v[1], incv, &c_b56, &work[1], &c__1);
+
+/* C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)' */
+
+ z__1.r = -tau->r, z__1.i = -tau->i;
+ zgerc_(&lastc, &lastv, &z__1, &work[1], &c__1, &v[1], incv, &c__[
+ c_offset], ldc);
+ }
+ }
+ return 0;
+
+/* End of ZLARF */
+
+} /* zlarf_ */
+
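+/*
+    ZLARF above applies H = I - tau*v*v' from the left with one GEMV and one
+    rank-1 update:  w := C'*v,  then  C := C - tau*v*w'  (ZGEMV followed by
+    ZGERC with -tau).  The helper below is a minimal real-valued sketch of
+    those two steps, column by column, without the trailing-zero scans of v
+    and C done via ILAZLR/ILAZLC; its name is illustrative and it is not
+    part of the translated LAPACK source.
+*/
+static void apply_reflector_left_sketch(int m, int n, const double *v,
+                                        double tau, double *c, int ldc)
+{
+    int i, j;
+    for (j = 0; j < n; ++j) {
+        double w = 0.;                       /* w = v' * C(:,j)          */
+        for (i = 0; i < m; ++i)
+            w += v[i] * c[i + j * ldc];
+        for (i = 0; i < m; ++i)              /* C(:,j) -= tau * v * w    */
+            c[i + j * ldc] -= tau * v[i] * w;
+    }
+}
+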
+/* Subroutine */ int zlarfb_(char *side, char *trans, char *direct, char *
+ storev, integer *m, integer *n, integer *k, doublecomplex *v, integer
+ *ldv, doublecomplex *t, integer *ldt, doublecomplex *c__, integer *
+ ldc, doublecomplex *work, integer *ldwork)
+{
+ /* System generated locals */
+ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1,
+ work_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+ static integer lastc;
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer lastv;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), ztrmm_(char *, char *, char *, char *
+ , integer *, integer *, doublecomplex *, doublecomplex *, integer
+ *, doublecomplex *, integer *);
+ extern integer ilazlc_(integer *, integer *, doublecomplex *, integer *);
+ extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *)
+ ;
+ extern integer ilazlr_(integer *, integer *, doublecomplex *, integer *);
+ static char transt[1];
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLARFB applies a complex block reflector H or its transpose H' to a
+ complex M-by-N matrix C, from either the left or the right.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply H or H' from the Left
+ = 'R': apply H or H' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply H (No transpose)
+ = 'C': apply H' (Conjugate transpose)
+
+ DIRECT (input) CHARACTER*1
+ Indicates how H is formed from a product of elementary
+ reflectors
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Indicates how the vectors which define the elementary
+ reflectors are stored:
+ = 'C': Columnwise
+ = 'R': Rowwise
+
+ M (input) INTEGER
+ The number of rows of the matrix C.
+
+ N (input) INTEGER
+ The number of columns of the matrix C.
+
+ K (input) INTEGER
+ The order of the matrix T (= the number of elementary
+ reflectors whose product defines the block reflector).
+
+ V (input) COMPLEX*16 array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,M) if STOREV = 'R' and SIDE = 'L'
+ (LDV,N) if STOREV = 'R' and SIDE = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+ if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+ if STOREV = 'R', LDV >= K.
+
+ T (input) COMPLEX*16 array, dimension (LDT,K)
+ The triangular K-by-K matrix T in the representation of the
+ block reflector.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by H*C or H'*C or C*H or C*H'.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX*16 array, dimension (LDWORK,K)
+
+ LDWORK (input) INTEGER
+ The leading dimension of the array WORK.
+ If SIDE = 'L', LDWORK >= max(1,N);
+ if SIDE = 'R', LDWORK >= max(1,M).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ work_dim1 = *ldwork;
+ work_offset = 1 + work_dim1;
+ work -= work_offset;
+
+ /* Function Body */
+ if (*m <= 0 || *n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(trans, "N")) {
+ *(unsigned char *)transt = 'C';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ if (lsame_(storev, "C")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 ) (first K rows)
+ ( V2 )
+ where V1 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+ zlacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L10: */
+ }
+
+/* W := W * V1 */
+
+ ztrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2 */
+
+ i__1 = lastv - *k;
+ zgemm_("Conjugate transpose", "No transpose", &lastc, k, &
+ i__1, &c_b57, &c__[*k + 1 + c_dim1], ldc, &v[*k +
+ 1 + v_dim1], ldv, &c_b57, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ ztrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (*m > *k) {
+
+/* C2 := C2 - V2 * W' */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", &i__1, &
+ lastc, k, &z__1, &v[*k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork, &c_b57, &c__[*k + 1 +
+ c_dim1], ldc);
+ }
+
+/* W := W * V1' */
+
+ ztrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * c_dim1;
+ i__4 = j + i__ * c_dim1;
+ d_cnjg(&z__2, &work[i__ + j * work_dim1]);
+ z__1.r = c__[i__4].r - z__2.r, z__1.i = c__[i__4].i -
+ z__2.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L40: */
+ }
+
+/* W := W * V1 */
+
+ ztrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2 */
+
+ i__1 = lastv - *k;
+ zgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b57, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[*k +
+ 1 + v_dim1], ldv, &c_b57, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ztrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2' */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", &lastc, &
+ i__1, k, &z__1, &work[work_offset], ldwork, &v[*k
+ + 1 + v_dim1], ldv, &c_b57, &c__[(*k + 1) *
+ c_dim1 + 1], ldc);
+ }
+
+/* W := W * V1' */
+
+ ztrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ z__1.r = c__[i__4].r - work[i__5].r, z__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L50: */
+ }
+/* L60: */
+ }
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 )
+ ( V2 ) (last K rows)
+ where V2 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlr_(m, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+ zlacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L70: */
+ }
+
+/* W := W * V2 */
+
+ ztrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1 */
+
+ i__1 = lastv - *k;
+ zgemm_("Conjugate transpose", "No transpose", &lastc, k, &
+ i__1, &c_b57, &c__[c_offset], ldc, &v[v_offset],
+ ldv, &c_b57, &work[work_offset], ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ ztrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1 * W' */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", &i__1, &
+ lastc, k, &z__1, &v[v_offset], ldv, &work[
+ work_offset], ldwork, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ ztrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &
+ work[work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = lastv - *k + j + i__ * c_dim1;
+ i__4 = lastv - *k + j + i__ * c_dim1;
+ d_cnjg(&z__2, &work[i__ + j * work_dim1]);
+ z__1.r = c__[i__4].r - z__2.r, z__1.i = c__[i__4].i -
+ z__2.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlr_(n, k, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V = (C1*V1 + C2*V2) (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1,
+ &work[j * work_dim1 + 1], &c__1);
+/* L100: */
+ }
+
+/* W := W * V2 */
+
+ ztrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1 */
+
+ i__1 = lastv - *k;
+ zgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+ c_b57, &c__[c_offset], ldc, &v[v_offset], ldv, &
+ c_b57, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ztrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1' */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", &lastc, &
+ i__1, k, &z__1, &work[work_offset], ldwork, &v[
+ v_offset], ldv, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2' */
+
+ ztrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[lastv - *k + 1 + v_dim1], ldv, &
+ work[work_offset], ldwork);
+
+/* C2 := C2 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + (lastv - *k + j) * c_dim1;
+ i__4 = i__ + (lastv - *k + j) * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ z__1.r = c__[i__4].r - work[i__5].r, z__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L110: */
+ }
+/* L120: */
+ }
+ }
+ }
+
+ } else if (lsame_(storev, "R")) {
+
+ if (lsame_(direct, "F")) {
+
+/*
+ Let V = ( V1 V2 ) (V1: first K columns)
+ where V1 is unit upper triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C1'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1
+ + 1], &c__1);
+ zlacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L130: */
+ }
+
+/* W := W * V1' */
+
+ ztrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2'*V2' */
+
+ i__1 = lastv - *k;
+ zgemm_("Conjugate transpose", "Conjugate transpose", &
+ lastc, k, &i__1, &c_b57, &c__[*k + 1 + c_dim1],
+ ldc, &v[(*k + 1) * v_dim1 + 1], ldv, &c_b57, &
+ work[work_offset], ldwork)
+ ;
+ }
+
+/* W := W * T' or W * T */
+
+ ztrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - V2' * W' */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("Conjugate transpose", "Conjugate transpose", &
+ i__1, &lastc, k, &z__1, &v[(*k + 1) * v_dim1 + 1],
+ ldv, &work[work_offset], ldwork, &c_b57, &c__[*k
+ + 1 + c_dim1], ldc);
+ }
+
+/* W := W * V1 */
+
+ ztrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * c_dim1;
+ i__4 = j + i__ * c_dim1;
+ d_cnjg(&z__2, &work[i__ + j * work_dim1]);
+ z__1.r = c__[i__4].r - z__2.r, z__1.i = c__[i__4].i -
+ z__2.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L140: */
+ }
+/* L150: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C1
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j *
+ work_dim1 + 1], &c__1);
+/* L160: */
+ }
+
+/* W := W * V1' */
+
+ ztrmm_("Right", "Upper", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[v_offset], ldv, &work[
+ work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C2 * V2' */
+
+ i__1 = lastv - *k;
+ zgemm_("No transpose", "Conjugate transpose", &lastc, k, &
+ i__1, &c_b57, &c__[(*k + 1) * c_dim1 + 1], ldc, &
+ v[(*k + 1) * v_dim1 + 1], ldv, &c_b57, &work[
+ work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ztrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C2 := C2 - W * V2 */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ z__1, &work[work_offset], ldwork, &v[(*k + 1) *
+ v_dim1 + 1], ldv, &c_b57, &c__[(*k + 1) * c_dim1
+ + 1], ldc);
+ }
+
+/* W := W * V1 */
+
+ ztrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * c_dim1;
+ i__4 = i__ + j * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ z__1.r = c__[i__4].r - work[i__5].r, z__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L170: */
+ }
+/* L180: */
+ }
+
+ }
+
+ } else {
+
+/*
+ Let V = ( V1 V2 ) (V2: last K columns)
+ where V2 is unit lower triangular.
+*/
+
+ if (lsame_(side, "L")) {
+
+/*
+ Form H * C or H' * C where C = ( C1 )
+ ( C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlc_(k, m, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*
+ W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK)
+
+ W := C2'
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+ j * work_dim1 + 1], &c__1);
+ zlacgv_(&lastc, &work[j * work_dim1 + 1], &c__1);
+/* L190: */
+ }
+
+/* W := W * V2' */
+
+ ztrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[(lastv - *k + 1) * v_dim1 + 1],
+ ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1'*V1' */
+
+ i__1 = lastv - *k;
+ zgemm_("Conjugate transpose", "Conjugate transpose", &
+ lastc, k, &i__1, &c_b57, &c__[c_offset], ldc, &v[
+ v_offset], ldv, &c_b57, &work[work_offset],
+ ldwork);
+ }
+
+/* W := W * T' or W * T */
+
+ ztrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+ c_b57, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - V' * W' */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - V1' * W' */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("Conjugate transpose", "Conjugate transpose", &
+ i__1, &lastc, k, &z__1, &v[v_offset], ldv, &work[
+ work_offset], ldwork, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ ztrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C2 := C2 - W' */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = lastv - *k + j + i__ * c_dim1;
+ i__4 = lastv - *k + j + i__ * c_dim1;
+ d_cnjg(&z__2, &work[i__ + j * work_dim1]);
+ z__1.r = c__[i__4].r - z__2.r, z__1.i = c__[i__4].i -
+ z__2.i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L200: */
+ }
+/* L210: */
+ }
+
+ } else if (lsame_(side, "R")) {
+
+/*
+ Form C * H or C * H' where C = ( C1 C2 )
+
+ Computing MAX
+*/
+ i__1 = *k, i__2 = ilazlc_(k, n, &v[v_offset], ldv);
+ lastv = max(i__1,i__2);
+ lastc = ilazlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*
+ W := C * V' = (C1*V1' + C2*V2') (stored in WORK)
+
+ W := C2
+*/
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ zcopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1,
+ &work[j * work_dim1 + 1], &c__1);
+/* L220: */
+ }
+
+/* W := W * V2' */
+
+ ztrmm_("Right", "Lower", "Conjugate transpose", "Unit", &
+ lastc, k, &c_b57, &v[(lastv - *k + 1) * v_dim1 + 1],
+ ldv, &work[work_offset], ldwork);
+ if (lastv > *k) {
+
+/* W := W + C1 * V1' */
+
+ i__1 = lastv - *k;
+ zgemm_("No transpose", "Conjugate transpose", &lastc, k, &
+ i__1, &c_b57, &c__[c_offset], ldc, &v[v_offset],
+ ldv, &c_b57, &work[work_offset], ldwork);
+ }
+
+/* W := W * T or W * T' */
+
+ ztrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b57,
+ &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/* C := C - W * V */
+
+ if (lastv > *k) {
+
+/* C1 := C1 - W * V1 */
+
+ i__1 = lastv - *k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+ z__1, &work[work_offset], ldwork, &v[v_offset],
+ ldv, &c_b57, &c__[c_offset], ldc);
+ }
+
+/* W := W * V2 */
+
+ ztrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+ c_b57, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+ work_offset], ldwork);
+
+/* C1 := C1 - W */
+
+ i__1 = *k;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = lastc;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + (lastv - *k + j) * c_dim1;
+ i__4 = i__ + (lastv - *k + j) * c_dim1;
+ i__5 = i__ + j * work_dim1;
+ z__1.r = c__[i__4].r - work[i__5].r, z__1.i = c__[
+ i__4].i - work[i__5].i;
+ c__[i__3].r = z__1.r, c__[i__3].i = z__1.i;
+/* L230: */
+ }
+/* L240: */
+ }
+
+ }
+
+ }
+ }
+
+ return 0;
+
+/* End of ZLARFB */
+
+} /* zlarfb_ */
+
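+/*
+    For the columnwise/forward/left case above, ZLARFB realizes
+    H*C = C - V*T*(V'*C) with three multiplies -- W := C'*V, W := W*T',
+    C := C - V*W' -- using ZTRMM for the unit-triangular part of V and for
+    the triangular T.  The helper below is a minimal real-valued sketch of
+    the same identity with dense loops, ignoring the triangular structure
+    that the routine exploits; its name is illustrative and it is not part
+    of the translated LAPACK source.
+*/
+static void apply_block_reflector_left_sketch(int m, int n, int k,
+                                              const double *v, /* m-by-k         */
+                                              const double *t, /* k-by-k         */
+                                              double *c,       /* m-by-n         */
+                                              double *w)       /* n-by-k scratch */
+{
+    int i, j, p, q;
+    /* W = C' * V   (n-by-k), the quantity staged in WORK above */
+    for (j = 0; j < k; ++j)
+        for (i = 0; i < n; ++i) {
+            double s = 0.;
+            for (p = 0; p < m; ++p)
+                s += c[p + i * m] * v[p + j * m];
+            w[i + j * n] = s;
+        }
+    /* C := C - V * (T * V'*C), column by column of C */
+    for (i = 0; i < n; ++i)
+        for (q = 0; q < k; ++q) {
+            double s = 0.;   /* s = (T * V'*C)(q,i) = sum_j T(q,j)*W(i,j) */
+            for (j = 0; j < k; ++j)
+                s += t[q + j * k] * w[i + j * n];
+            for (p = 0; p < m; ++p)
+                c[p + i * m] -= v[p + q * m] * s;
+        }
+}
+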
+/* Subroutine */ int zlarfg_(integer *n, doublecomplex *alpha, doublecomplex *
+ x, integer *incx, doublecomplex *tau)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer j, knt;
+ static doublereal beta, alphi, alphr;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *);
+ static doublereal xnorm;
+ extern doublereal dlapy3_(doublereal *, doublereal *, doublereal *),
+ dznrm2_(integer *, doublecomplex *, integer *), dlamch_(char *);
+ static doublereal safmin;
+ extern /* Subroutine */ int zdscal_(integer *, doublereal *,
+ doublecomplex *, integer *);
+ static doublereal rsafmn;
+ extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *,
+ doublecomplex *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLARFG generates a complex elementary reflector H of order n, such
+ that
+
+ H' * ( alpha ) = ( beta ), H' * H = I.
+ ( x ) ( 0 )
+
+ where alpha and beta are scalars, with beta real, and x is an
+ (n-1)-element complex vector. H is represented in the form
+
+ H = I - tau * ( 1 ) * ( 1 v' ) ,
+ ( v )
+
+ where tau is a complex scalar and v is a complex (n-1)-element
+ vector. Note that H is not hermitian.
+
+ If the elements of x are all zero and alpha is real, then tau = 0
+ and H is taken to be the unit matrix.
+
+ Otherwise 1 <= real(tau) <= 2 and abs(tau-1) <= 1 .
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the elementary reflector.
+
+ ALPHA (input/output) COMPLEX*16
+ On entry, the value alpha.
+ On exit, it is overwritten with the value beta.
+
+ X (input/output) COMPLEX*16 array, dimension
+ (1+(N-2)*abs(INCX))
+ On entry, the vector x.
+ On exit, it is overwritten with the vector v.
+
+ INCX (input) INTEGER
+ The increment between elements of X. INCX > 0.
+
+ TAU (output) COMPLEX*16
+ The value tau.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n <= 0) {
+ tau->r = 0., tau->i = 0.;
+ return 0;
+ }
+
+ i__1 = *n - 1;
+ xnorm = dznrm2_(&i__1, &x[1], incx);
+ alphr = alpha->r;
+ alphi = d_imag(alpha);
+
+ if (xnorm == 0. && alphi == 0.) {
+
+/* H = I */
+
+ tau->r = 0., tau->i = 0.;
+ } else {
+
+/* general case */
+
+ d__1 = dlapy3_(&alphr, &alphi, &xnorm);
+ beta = -d_sign(&d__1, &alphr);
+ safmin = SAFEMINIMUM / EPSILON;
+ rsafmn = 1. / safmin;
+
+ knt = 0;
+ if (abs(beta) < safmin) {
+
+/* XNORM, BETA may be inaccurate; scale X and recompute them */
+
+L10:
+ ++knt;
+ i__1 = *n - 1;
+ zdscal_(&i__1, &rsafmn, &x[1], incx);
+ beta *= rsafmn;
+ alphi *= rsafmn;
+ alphr *= rsafmn;
+ if (abs(beta) < safmin) {
+ goto L10;
+ }
+
+/* New BETA is at most 1, at least SAFMIN */
+
+ i__1 = *n - 1;
+ xnorm = dznrm2_(&i__1, &x[1], incx);
+ z__1.r = alphr, z__1.i = alphi;
+ alpha->r = z__1.r, alpha->i = z__1.i;
+ d__1 = dlapy3_(&alphr, &alphi, &xnorm);
+ beta = -d_sign(&d__1, &alphr);
+ }
+ d__1 = (beta - alphr) / beta;
+ d__2 = -alphi / beta;
+ z__1.r = d__1, z__1.i = d__2;
+ tau->r = z__1.r, tau->i = z__1.i;
+ z__2.r = alpha->r - beta, z__2.i = alpha->i;
+ zladiv_(&z__1, &c_b57, &z__2);
+ alpha->r = z__1.r, alpha->i = z__1.i;
+ i__1 = *n - 1;
+ zscal_(&i__1, alpha, &x[1], incx);
+
+/* If ALPHA is subnormal, it may lose relative accuracy */
+
+ i__1 = knt;
+ for (j = 1; j <= i__1; ++j) {
+ beta *= safmin;
+/* L20: */
+ }
+ alpha->r = beta, alpha->i = 0.;
+ }
+
+ return 0;
+
+/* End of ZLARFG */
+
+} /* zlarfg_ */
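+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name and numeric values are made up.  It shows how a caller
+   might use zlarfg_ on a 3-vector (alpha, x(1), x(2)): on return alpha
+   holds beta, x holds v(2:n), and tau defines H = I - tau*(1;v)*(1;v)'
+   with H'*(alpha_in; x_in) = (beta; 0; 0), as documented above.
+*/
+static void zlarfg_usage_sketch(void)
+{
+    doublecomplex alpha, x[2], tau;
+    integer n = 3, incx = 1;
+
+    alpha.r = 3., alpha.i = 1.;
+    x[0].r = 1., x[0].i = 0.;
+    x[1].r = 0., x[1].i = 2.;
+    zlarfg_(&n, &alpha, x, &incx, &tau);
+    /* alpha.r now equals beta (alpha.i == 0.) and x holds the reflector
+       vector v(2:3). */
+}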
+
+/* Subroutine */ int zlarft_(char *direct, char *storev, integer *n, integer *
+ k, doublecomplex *v, integer *ldv, doublecomplex *tau, doublecomplex *
+ t, integer *ldt)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, prevlastv;
+ static doublecomplex vii;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *);
+ static integer lastv;
+ extern /* Subroutine */ int ztrmv_(char *, char *, char *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *), zlacgv_(integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLARFT forms the triangular factor T of a complex block reflector H
+ of order n, which is defined as a product of k elementary reflectors.
+
+ If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+
+ If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+
+ If STOREV = 'C', the vector which defines the elementary reflector
+ H(i) is stored in the i-th column of the array V, and
+
+ H = I - V * T * V'
+
+ If STOREV = 'R', the vector which defines the elementary reflector
+ H(i) is stored in the i-th row of the array V, and
+
+ H = I - V' * T * V
+
+ Arguments
+ =========
+
+ DIRECT (input) CHARACTER*1
+ Specifies the order in which the elementary reflectors are
+ multiplied to form the block reflector:
+ = 'F': H = H(1) H(2) . . . H(k) (Forward)
+ = 'B': H = H(k) . . . H(2) H(1) (Backward)
+
+ STOREV (input) CHARACTER*1
+ Specifies how the vectors which define the elementary
+ reflectors are stored (see also Further Details):
+ = 'C': columnwise
+ = 'R': rowwise
+
+ N (input) INTEGER
+ The order of the block reflector H. N >= 0.
+
+ K (input) INTEGER
+ The order of the triangular factor T (= the number of
+ elementary reflectors). K >= 1.
+
+ V (input/output) COMPLEX*16 array, dimension
+ (LDV,K) if STOREV = 'C'
+ (LDV,N) if STOREV = 'R'
+ The matrix V. See further details.
+
+ LDV (input) INTEGER
+ The leading dimension of the array V.
+ If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K.
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i).
+
+ T (output) COMPLEX*16 array, dimension (LDT,K)
+ The k by k triangular factor T of the block reflector.
+ If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is
+ lower triangular. The rest of the array is not used.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= K.
+
+ Further Details
+ ===============
+
+ The shape of the matrix V and the storage of the vectors which define
+ the H(i) is best illustrated by the following example with n = 5 and
+ k = 3. The elements equal to 1 are not stored; the corresponding
+ array elements are modified but restored on exit. The rest of the
+ array is not used.
+
+ DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+
+ V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+ ( v1 1 ) ( 1 v2 v2 v2 )
+ ( v1 v2 1 ) ( 1 v3 v3 )
+ ( v1 v2 v3 )
+ ( v1 v2 v3 )
+
+ DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+
+ V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+ ( v1 v2 v3 ) ( v2 v2 v2 1 )
+ ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+ ( 1 v3 )
+ ( 1 )
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ v_dim1 = *ldv;
+ v_offset = 1 + v_dim1;
+ v -= v_offset;
+ --tau;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+
+ /* Function Body */
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (lsame_(direct, "F")) {
+ prevlastv = *n;
+ i__1 = *k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ prevlastv = max(prevlastv,i__);
+ i__2 = i__;
+ if (tau[i__2].r == 0. && tau[i__2].i == 0.) {
+
+/* H(i) = I */
+
+ i__2 = i__;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = j + i__ * t_dim1;
+ t[i__3].r = 0., t[i__3].i = 0.;
+/* L10: */
+ }
+ } else {
+
+/* general case */
+
+ i__2 = i__ + i__ * v_dim1;
+ vii.r = v[i__2].r, vii.i = v[i__2].i;
+ i__2 = i__ + i__ * v_dim1;
+ v[i__2].r = 1., v[i__2].i = 0.;
+ if (lsame_(storev, "C")) {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ i__3 = lastv + i__ * v_dim1;
+ if (v[i__3].r != 0. || v[i__3].i != 0.) {
+ goto L15;
+ }
+ }
+L15:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)' * V(i:j,i) */
+
+ i__2 = j - i__ + 1;
+ i__3 = i__ - 1;
+ i__4 = i__;
+ z__1.r = -tau[i__4].r, z__1.i = -tau[i__4].i;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &z__1, &v[i__
+ + v_dim1], ldv, &v[i__ + i__ * v_dim1], &c__1, &
+ c_b56, &t[i__ * t_dim1 + 1], &c__1);
+ } else {
+/* Skip any trailing zeros. */
+ i__2 = i__ + 1;
+ for (lastv = *n; lastv >= i__2; --lastv) {
+ i__3 = i__ + lastv * v_dim1;
+ if (v[i__3].r != 0. || v[i__3].i != 0.) {
+ goto L16;
+ }
+ }
+L16:
+ j = min(lastv,prevlastv);
+
+/* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)' */
+
+ if (i__ < j) {
+ i__2 = j - i__;
+ zlacgv_(&i__2, &v[i__ + (i__ + 1) * v_dim1], ldv);
+ }
+ i__2 = i__ - 1;
+ i__3 = j - i__ + 1;
+ i__4 = i__;
+ z__1.r = -tau[i__4].r, z__1.i = -tau[i__4].i;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &v[i__ *
+ v_dim1 + 1], ldv, &v[i__ + i__ * v_dim1], ldv, &
+ c_b56, &t[i__ * t_dim1 + 1], &c__1);
+ if (i__ < j) {
+ i__2 = j - i__;
+ zlacgv_(&i__2, &v[i__ + (i__ + 1) * v_dim1], ldv);
+ }
+ }
+ i__2 = i__ + i__ * v_dim1;
+ v[i__2].r = vii.r, v[i__2].i = vii.i;
+
+/* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ ztrmv_("Upper", "No transpose", "Non-unit", &i__2, &t[
+ t_offset], ldt, &t[i__ * t_dim1 + 1], &c__1);
+ i__2 = i__ + i__ * t_dim1;
+ i__3 = i__;
+ t[i__2].r = tau[i__3].r, t[i__2].i = tau[i__3].i;
+ if (i__ > 1) {
+ prevlastv = max(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+/* L20: */
+ }
+ } else {
+ prevlastv = 1;
+ for (i__ = *k; i__ >= 1; --i__) {
+ i__1 = i__;
+ if (tau[i__1].r == 0. && tau[i__1].i == 0.) {
+
+/* H(i) = I */
+
+ i__1 = *k;
+ for (j = i__; j <= i__1; ++j) {
+ i__2 = j + i__ * t_dim1;
+ t[i__2].r = 0., t[i__2].i = 0.;
+/* L30: */
+ }
+ } else {
+
+/* general case */
+
+ if (i__ < *k) {
+ if (lsame_(storev, "C")) {
+ i__1 = *n - *k + i__ + i__ * v_dim1;
+ vii.r = v[i__1].r, vii.i = v[i__1].i;
+ i__1 = *n - *k + i__ + i__ * v_dim1;
+ v[i__1].r = 1., v[i__1].i = 0.;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ i__2 = lastv + i__ * v_dim1;
+ if (v[i__2].r != 0. || v[i__2].i != 0.) {
+ goto L35;
+ }
+ }
+L35:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(j:n-k+i,i+1:k)' * V(j:n-k+i,i)
+*/
+
+ i__1 = *n - *k + i__ - j + 1;
+ i__2 = *k - i__;
+ i__3 = i__;
+ z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i;
+ zgemv_("Conjugate transpose", &i__1, &i__2, &z__1, &v[
+ j + (i__ + 1) * v_dim1], ldv, &v[j + i__ *
+ v_dim1], &c__1, &c_b56, &t[i__ + 1 + i__ *
+ t_dim1], &c__1);
+ i__1 = *n - *k + i__ + i__ * v_dim1;
+ v[i__1].r = vii.r, v[i__1].i = vii.i;
+ } else {
+ i__1 = i__ + (*n - *k + i__) * v_dim1;
+ vii.r = v[i__1].r, vii.i = v[i__1].i;
+ i__1 = i__ + (*n - *k + i__) * v_dim1;
+ v[i__1].r = 1., v[i__1].i = 0.;
+/* Skip any leading zeros. */
+ i__1 = i__ - 1;
+ for (lastv = 1; lastv <= i__1; ++lastv) {
+ i__2 = i__ + lastv * v_dim1;
+ if (v[i__2].r != 0. || v[i__2].i != 0.) {
+ goto L36;
+ }
+ }
+L36:
+ j = max(lastv,prevlastv);
+
+/*
+ T(i+1:k,i) :=
+ - tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)'
+*/
+
+ i__1 = *n - *k + i__ - 1 - j + 1;
+ zlacgv_(&i__1, &v[i__ + j * v_dim1], ldv);
+ i__1 = *k - i__;
+ i__2 = *n - *k + i__ - j + 1;
+ i__3 = i__;
+ z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i;
+ zgemv_("No transpose", &i__1, &i__2, &z__1, &v[i__ +
+ 1 + j * v_dim1], ldv, &v[i__ + j * v_dim1],
+ ldv, &c_b56, &t[i__ + 1 + i__ * t_dim1], &
+ c__1);
+ i__1 = *n - *k + i__ - 1 - j + 1;
+ zlacgv_(&i__1, &v[i__ + j * v_dim1], ldv);
+ i__1 = i__ + (*n - *k + i__) * v_dim1;
+ v[i__1].r = vii.r, v[i__1].i = vii.i;
+ }
+
+/* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) */
+
+ i__1 = *k - i__;
+ ztrmv_("Lower", "No transpose", "Non-unit", &i__1, &t[i__
+ + 1 + (i__ + 1) * t_dim1], ldt, &t[i__ + 1 + i__ *
+ t_dim1], &c__1)
+ ;
+ if (i__ > 1) {
+ prevlastv = min(prevlastv,lastv);
+ } else {
+ prevlastv = lastv;
+ }
+ }
+ i__1 = i__ + i__ * t_dim1;
+ i__2 = i__;
+ t[i__1].r = tau[i__2].r, t[i__1].i = tau[i__2].i;
+ }
+/* L40: */
+ }
+ }
+ return 0;
+
+/* End of ZLARFT */
+
+} /* zlarft_ */
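+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name is made up.  Given k reflector vectors stored columnwise
+   in v (e.g. the unit lower trapezoid returned by a QR factorization) and
+   their scalar factors in tau, zlarft_ fills the upper triangular factor T
+   so that H(1)...H(k) = I - V*T*V', as described above.
+*/
+static void zlarft_usage_sketch(integer n, integer k, doublecomplex *v,
+        integer ldv, doublecomplex *tau, doublecomplex *t, integer ldt)
+{
+    /* Only the first character of the string arguments is significant. */
+    zlarft_("Forward", "Columnwise", &n, &k, v, &ldv, tau, t, &ldt);
+}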
+
+/* Subroutine */ int zlartg_(doublecomplex *f, doublecomplex *g, doublereal *
+ cs, doublecomplex *sn, doublecomplex *r__)
+{
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2, d__3, d__4, d__5, d__6, d__7, d__8, d__9, d__10;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static doublereal d__;
+ static integer i__;
+ static doublereal f2, g2;
+ static doublecomplex ff;
+ static doublereal di, dr;
+ static doublecomplex fs, gs;
+ static doublereal f2s, g2s, eps, scale;
+ static integer count;
+ static doublereal safmn2;
+ extern doublereal dlapy2_(doublereal *, doublereal *);
+ static doublereal safmx2;
+
+ static doublereal safmin;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLARTG generates a plane rotation so that
+
+ [ CS SN ] [ F ] [ R ]
+ [ __ ] . [ ] = [ ] where CS**2 + |SN|**2 = 1.
+ [ -SN CS ] [ G ] [ 0 ]
+
+ This is a faster version of the BLAS1 routine ZROTG, except for
+ the following differences:
+ F and G are unchanged on return.
+ If G=0, then CS=1 and SN=0.
+ If F=0, then CS=0 and SN is chosen so that R is real.
+
+ Arguments
+ =========
+
+ F (input) COMPLEX*16
+ The first component of vector to be rotated.
+
+ G (input) COMPLEX*16
+ The second component of vector to be rotated.
+
+ CS (output) DOUBLE PRECISION
+ The cosine of the rotation.
+
+ SN (output) COMPLEX*16
+ The sine of the rotation.
+
+ R (output) COMPLEX*16
+ The nonzero component of the rotated vector.
+
+ Further Details
+  ===============
+
+ 3-5-96 - Modified with a new algorithm by W. Kahan and J. Demmel
+
+ This version has a few statements commented out for thread safety
+ (machine parameters are computed on each entry). 10 feb 03, SJH.
+
+ =====================================================================
+
+ LOGICAL FIRST
+ SAVE FIRST, SAFMX2, SAFMIN, SAFMN2
+ DATA FIRST / .TRUE. /
+
+ IF( FIRST ) THEN
+*/
+ safmin = SAFEMINIMUM;
+ eps = EPSILON;
+ d__1 = BASE;
+ i__1 = (integer) (log(safmin / eps) / log(BASE) / 2.);
+ safmn2 = pow_di(&d__1, &i__1);
+ safmx2 = 1. / safmn2;
+/*
+ FIRST = .FALSE.
+ END IF
+ Computing MAX
+ Computing MAX
+*/
+ d__7 = (d__1 = f->r, abs(d__1)), d__8 = (d__2 = d_imag(f), abs(d__2));
+/* Computing MAX */
+ d__9 = (d__3 = g->r, abs(d__3)), d__10 = (d__4 = d_imag(g), abs(d__4));
+ d__5 = max(d__7,d__8), d__6 = max(d__9,d__10);
+ scale = max(d__5,d__6);
+ fs.r = f->r, fs.i = f->i;
+ gs.r = g->r, gs.i = g->i;
+ count = 0;
+ if (scale >= safmx2) {
+L10:
+ ++count;
+ z__1.r = safmn2 * fs.r, z__1.i = safmn2 * fs.i;
+ fs.r = z__1.r, fs.i = z__1.i;
+ z__1.r = safmn2 * gs.r, z__1.i = safmn2 * gs.i;
+ gs.r = z__1.r, gs.i = z__1.i;
+ scale *= safmn2;
+ if (scale >= safmx2) {
+ goto L10;
+ }
+ } else if (scale <= safmn2) {
+ if (g->r == 0. && g->i == 0.) {
+ *cs = 1.;
+ sn->r = 0., sn->i = 0.;
+ r__->r = f->r, r__->i = f->i;
+ return 0;
+ }
+L20:
+ --count;
+ z__1.r = safmx2 * fs.r, z__1.i = safmx2 * fs.i;
+ fs.r = z__1.r, fs.i = z__1.i;
+ z__1.r = safmx2 * gs.r, z__1.i = safmx2 * gs.i;
+ gs.r = z__1.r, gs.i = z__1.i;
+ scale *= safmx2;
+ if (scale <= safmn2) {
+ goto L20;
+ }
+ }
+/* Computing 2nd power */
+ d__1 = fs.r;
+/* Computing 2nd power */
+ d__2 = d_imag(&fs);
+ f2 = d__1 * d__1 + d__2 * d__2;
+/* Computing 2nd power */
+ d__1 = gs.r;
+/* Computing 2nd power */
+ d__2 = d_imag(&gs);
+ g2 = d__1 * d__1 + d__2 * d__2;
+ if (f2 <= max(g2,1.) * safmin) {
+
+/* This is a rare case: F is very small. */
+
+ if (f->r == 0. && f->i == 0.) {
+ *cs = 0.;
+ d__2 = g->r;
+ d__3 = d_imag(g);
+ d__1 = dlapy2_(&d__2, &d__3);
+ r__->r = d__1, r__->i = 0.;
+/* Do complex/real division explicitly with two real divisions */
+ d__1 = gs.r;
+ d__2 = d_imag(&gs);
+ d__ = dlapy2_(&d__1, &d__2);
+ d__1 = gs.r / d__;
+ d__2 = -d_imag(&gs) / d__;
+ z__1.r = d__1, z__1.i = d__2;
+ sn->r = z__1.r, sn->i = z__1.i;
+ return 0;
+ }
+ d__1 = fs.r;
+ d__2 = d_imag(&fs);
+ f2s = dlapy2_(&d__1, &d__2);
+/*
+ G2 and G2S are accurate
+ G2 is at least SAFMIN, and G2S is at least SAFMN2
+*/
+ g2s = sqrt(g2);
+/*
+ Error in CS from underflow in F2S is at most
+ UNFL / SAFMN2 .lt. sqrt(UNFL*EPS) .lt. EPS
+ If MAX(G2,ONE)=G2, then F2 .lt. G2*SAFMIN,
+ and so CS .lt. sqrt(SAFMIN)
+ If MAX(G2,ONE)=ONE, then F2 .lt. SAFMIN
+ and so CS .lt. sqrt(SAFMIN)/SAFMN2 = sqrt(EPS)
+ Therefore, CS = F2S/G2S / sqrt( 1 + (F2S/G2S)**2 ) = F2S/G2S
+*/
+ *cs = f2s / g2s;
+/*
+ Make sure abs(FF) = 1
+ Do complex/real division explicitly with 2 real divisions
+ Computing MAX
+*/
+ d__3 = (d__1 = f->r, abs(d__1)), d__4 = (d__2 = d_imag(f), abs(d__2));
+ if (max(d__3,d__4) > 1.) {
+ d__1 = f->r;
+ d__2 = d_imag(f);
+ d__ = dlapy2_(&d__1, &d__2);
+ d__1 = f->r / d__;
+ d__2 = d_imag(f) / d__;
+ z__1.r = d__1, z__1.i = d__2;
+ ff.r = z__1.r, ff.i = z__1.i;
+ } else {
+ dr = safmx2 * f->r;
+ di = safmx2 * d_imag(f);
+ d__ = dlapy2_(&dr, &di);
+ d__1 = dr / d__;
+ d__2 = di / d__;
+ z__1.r = d__1, z__1.i = d__2;
+ ff.r = z__1.r, ff.i = z__1.i;
+ }
+ d__1 = gs.r / g2s;
+ d__2 = -d_imag(&gs) / g2s;
+ z__2.r = d__1, z__2.i = d__2;
+ z__1.r = ff.r * z__2.r - ff.i * z__2.i, z__1.i = ff.r * z__2.i + ff.i
+ * z__2.r;
+ sn->r = z__1.r, sn->i = z__1.i;
+ z__2.r = *cs * f->r, z__2.i = *cs * f->i;
+ z__3.r = sn->r * g->r - sn->i * g->i, z__3.i = sn->r * g->i + sn->i *
+ g->r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ r__->r = z__1.r, r__->i = z__1.i;
+ } else {
+
+/*
+ This is the most common case.
+ Neither F2 nor F2/G2 are less than SAFMIN
+ F2S cannot overflow, and it is accurate
+*/
+
+ f2s = sqrt(g2 / f2 + 1.);
+/* Do the F2S(real)*FS(complex) multiply with two real multiplies */
+ d__1 = f2s * fs.r;
+ d__2 = f2s * d_imag(&fs);
+ z__1.r = d__1, z__1.i = d__2;
+ r__->r = z__1.r, r__->i = z__1.i;
+ *cs = 1. / f2s;
+ d__ = f2 + g2;
+/* Do complex/real division explicitly with two real divisions */
+ d__1 = r__->r / d__;
+ d__2 = d_imag(r__) / d__;
+ z__1.r = d__1, z__1.i = d__2;
+ sn->r = z__1.r, sn->i = z__1.i;
+ d_cnjg(&z__2, &gs);
+ z__1.r = sn->r * z__2.r - sn->i * z__2.i, z__1.i = sn->r * z__2.i +
+ sn->i * z__2.r;
+ sn->r = z__1.r, sn->i = z__1.i;
+ if (count != 0) {
+ if (count > 0) {
+ i__1 = count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ z__1.r = safmx2 * r__->r, z__1.i = safmx2 * r__->i;
+ r__->r = z__1.r, r__->i = z__1.i;
+/* L30: */
+ }
+ } else {
+ i__1 = -count;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ z__1.r = safmn2 * r__->r, z__1.i = safmn2 * r__->i;
+ r__->r = z__1.r, r__->i = z__1.i;
+/* L40: */
+ }
+ }
+ }
+ }
+ return 0;
+
+/* End of ZLARTG */
+
+} /* zlartg_ */
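+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name and values are made up.  zlartg_ returns cs, sn, r such
+   that [ cs  sn; -conj(sn)  cs ] * [ f; g ] = [ r; 0 ], as documented above.
+*/
+static void zlartg_usage_sketch(void)
+{
+    doublecomplex f, g, sn, r;
+    doublereal cs;
+
+    f.r = 1., f.i = 2.;
+    g.r = 3., g.i = -1.;
+    zlartg_(&f, &g, &cs, &sn, &r);
+    /* f and g are unchanged; cs*f + sn*g equals r up to rounding. */
+}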
+
+/* Subroutine */ int zlascl_(char *type__, integer *kl, integer *ku,
+ doublereal *cfrom, doublereal *cto, integer *m, integer *n,
+ doublecomplex *a, integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, k1, k2, k3, k4;
+ static doublereal mul, cto1;
+ static logical done;
+ static doublereal ctoc;
+ extern logical lsame_(char *, char *);
+ static integer itype;
+ static doublereal cfrom1;
+
+ static doublereal cfromc;
+ extern logical disnan_(doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ static doublereal bignum, smlnum;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLASCL multiplies the M by N complex matrix A by the real scalar
+ CTO/CFROM. This is done without over/underflow as long as the final
+ result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+ A may be full, upper triangular, lower triangular, upper Hessenberg,
+ or banded.
+
+ Arguments
+ =========
+
+ TYPE (input) CHARACTER*1
+          TYPE indicates the storage type of the input matrix.
+ = 'G': A is a full matrix.
+ = 'L': A is a lower triangular matrix.
+ = 'U': A is an upper triangular matrix.
+ = 'H': A is an upper Hessenberg matrix.
+ = 'B': A is a symmetric band matrix with lower bandwidth KL
+                  and upper bandwidth KU and with only the lower
+ half stored.
+ = 'Q': A is a symmetric band matrix with lower bandwidth KL
+                  and upper bandwidth KU and with only the upper
+ half stored.
+ = 'Z': A is a band matrix with lower bandwidth KL and upper
+ bandwidth KU.
+
+ KL (input) INTEGER
+ The lower bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ KU (input) INTEGER
+ The upper bandwidth of A. Referenced only if TYPE = 'B',
+ 'Q' or 'Z'.
+
+ CFROM (input) DOUBLE PRECISION
+ CTO (input) DOUBLE PRECISION
+ The matrix A is multiplied by CTO/CFROM. A(I,J) is computed
+ without over/underflow if the final result CTO*A(I,J)/CFROM
+ can be represented without over/underflow. CFROM must be
+ nonzero.
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ The matrix to be multiplied by CTO/CFROM. See TYPE for the
+ storage type.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ INFO (output) INTEGER
+ 0 - successful exit
+ <0 - if INFO = -i, the i-th argument had an illegal value.
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(type__, "G")) {
+ itype = 0;
+ } else if (lsame_(type__, "L")) {
+ itype = 1;
+ } else if (lsame_(type__, "U")) {
+ itype = 2;
+ } else if (lsame_(type__, "H")) {
+ itype = 3;
+ } else if (lsame_(type__, "B")) {
+ itype = 4;
+ } else if (lsame_(type__, "Q")) {
+ itype = 5;
+ } else if (lsame_(type__, "Z")) {
+ itype = 6;
+ } else {
+ itype = -1;
+ }
+
+ if (itype == -1) {
+ *info = -1;
+ } else if (*cfrom == 0. || disnan_(cfrom)) {
+ *info = -4;
+ } else if (disnan_(cto)) {
+ *info = -5;
+ } else if (*m < 0) {
+ *info = -6;
+ } else if (*n < 0 || itype == 4 && *n != *m || itype == 5 && *n != *m) {
+ *info = -7;
+ } else if (itype <= 3 && *lda < max(1,*m)) {
+ *info = -9;
+ } else if (itype >= 4) {
+/* Computing MAX */
+ i__1 = *m - 1;
+ if (*kl < 0 || *kl > max(i__1,0)) {
+ *info = -2;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = *n - 1;
+ if (*ku < 0 || *ku > max(i__1,0) || (itype == 4 || itype == 5) &&
+ *kl != *ku) {
+ *info = -3;
+ } else if (itype == 4 && *lda < *kl + 1 || itype == 5 && *lda < *
+ ku + 1 || itype == 6 && *lda < (*kl << 1) + *ku + 1) {
+ *info = -9;
+ }
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLASCL", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *m == 0) {
+ return 0;
+ }
+
+/* Get machine parameters */
+
+ smlnum = SAFEMINIMUM;
+ bignum = 1. / smlnum;
+
+ cfromc = *cfrom;
+ ctoc = *cto;
+
+L10:
+ cfrom1 = cfromc * smlnum;
+ if (cfrom1 == cfromc) {
+/*
+ CFROMC is an inf. Multiply by a correctly signed zero for
+ finite CTOC, or a NaN if CTOC is infinite.
+*/
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ cto1 = ctoc;
+ } else {
+ cto1 = ctoc / bignum;
+ if (cto1 == ctoc) {
+/*
+ CTOC is either 0 or an inf. In both cases, CTOC itself
+ serves as the correct multiplication factor.
+*/
+ mul = ctoc;
+ done = TRUE_;
+ cfromc = 1.;
+ } else if (abs(cfrom1) > abs(ctoc) && ctoc != 0.) {
+ mul = smlnum;
+ done = FALSE_;
+ cfromc = cfrom1;
+ } else if (abs(cto1) > abs(cfromc)) {
+ mul = bignum;
+ done = FALSE_;
+ ctoc = cto1;
+ } else {
+ mul = ctoc / cfromc;
+ done = TRUE_;
+ }
+ }
+
+ if (itype == 0) {
+
+/* Full matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L20: */
+ }
+/* L30: */
+ }
+
+ } else if (itype == 1) {
+
+/* Lower triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L40: */
+ }
+/* L50: */
+ }
+
+ } else if (itype == 2) {
+
+/* Upper triangular matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = min(j,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L60: */
+ }
+/* L70: */
+ }
+
+ } else if (itype == 3) {
+
+/* Upper Hessenberg matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j + 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L80: */
+ }
+/* L90: */
+ }
+
+ } else if (itype == 4) {
+
+/* Lower half of a symmetric band matrix */
+
+ k3 = *kl + 1;
+ k4 = *n + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = k3, i__4 = k4 - j;
+ i__2 = min(i__3,i__4);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L100: */
+ }
+/* L110: */
+ }
+
+ } else if (itype == 5) {
+
+/* Upper half of a symmetric band matrix */
+
+ k1 = *ku + 2;
+ k3 = *ku + 1;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = k1 - j;
+ i__3 = k3;
+ for (i__ = max(i__2,1); i__ <= i__3; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L120: */
+ }
+/* L130: */
+ }
+
+ } else if (itype == 6) {
+
+/* Band matrix */
+
+ k1 = *kl + *ku + 2;
+ k2 = *kl + 1;
+ k3 = (*kl << 1) + *ku + 1;
+ k4 = *kl + *ku + 1 + *m;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__3 = k1 - j;
+/* Computing MIN */
+ i__4 = k3, i__5 = k4 - j;
+ i__2 = min(i__4,i__5);
+ for (i__ = max(i__3,k2); i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + j * a_dim1;
+ z__1.r = mul * a[i__4].r, z__1.i = mul * a[i__4].i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L140: */
+ }
+/* L150: */
+ }
+
+ }
+
+ if (! done) {
+ goto L10;
+ }
+
+ return 0;
+
+/* End of ZLASCL */
+
+} /* zlascl_ */
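+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name is made up.  It scales a full column-major m-by-n matrix
+   by cto/cfrom with zlascl_; a nonzero info would indicate an illegal
+   argument.
+*/
+static void zlascl_usage_sketch(doublecomplex *a, integer m, integer n,
+        integer lda)
+{
+    doublereal cfrom = 2., cto = 3.;
+    integer kl = 0, ku = 0, info;
+
+    zlascl_("G", &kl, &ku, &cfrom, &cto, &m, &n, a, &lda, &info);
+}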
+
+/* Subroutine */ int zlaset_(char *uplo, integer *m, integer *n,
+ doublecomplex *alpha, doublecomplex *beta, doublecomplex *a, integer *
+ lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j;
+ extern logical lsame_(char *, char *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLASET initializes a 2-D array A to BETA on the diagonal and
+ ALPHA on the offdiagonals.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies the part of the matrix A to be set.
+ = 'U': Upper triangular part is set. The lower triangle
+ is unchanged.
+ = 'L': Lower triangular part is set. The upper triangle
+ is unchanged.
+ Otherwise: All of the matrix A is set.
+
+ M (input) INTEGER
+ On entry, M specifies the number of rows of A.
+
+ N (input) INTEGER
+ On entry, N specifies the number of columns of A.
+
+ ALPHA (input) COMPLEX*16
+ All the offdiagonal array elements are set to ALPHA.
+
+ BETA (input) COMPLEX*16
+ All the diagonal array elements are set to BETA.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the m by n matrix A.
+ On exit, A(i,j) = ALPHA, 1 <= i <= m, 1 <= j <= n, i.ne.j;
+ A(i,i) = BETA , 1 <= i <= min(m,n)
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ if (lsame_(uplo, "U")) {
+
+/*
+ Set the diagonal to BETA and the strictly upper triangular
+ part of the array to ALPHA.
+*/
+
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+/* Computing MIN */
+ i__3 = j - 1;
+ i__2 = min(i__3,*m);
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = alpha->r, a[i__3].i = alpha->i;
+/* L10: */
+ }
+/* L20: */
+ }
+ i__1 = min(*n,*m);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = beta->r, a[i__2].i = beta->i;
+/* L30: */
+ }
+
+ } else if (lsame_(uplo, "L")) {
+
+/*
+ Set the diagonal to BETA and the strictly lower triangular
+ part of the array to ALPHA.
+*/
+
+ i__1 = min(*m,*n);
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = alpha->r, a[i__3].i = alpha->i;
+/* L40: */
+ }
+/* L50: */
+ }
+ i__1 = min(*n,*m);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = beta->r, a[i__2].i = beta->i;
+/* L60: */
+ }
+
+ } else {
+
+/*
+ Set the array to BETA on the diagonal and ALPHA on the
+ offdiagonal.
+*/
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = alpha->r, a[i__3].i = alpha->i;
+/* L70: */
+ }
+/* L80: */
+ }
+ i__1 = min(*m,*n);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ a[i__2].r = beta->r, a[i__2].i = beta->i;
+/* L90: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLASET */
+
+} /* zlaset_ */
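+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name is made up.  It uses zlaset_ to set a 4x4 column-major
+   array to the identity: any UPLO other than 'U' or 'L' means the full
+   matrix is set, with BETA on the diagonal and ALPHA elsewhere.
+*/
+static void zlaset_usage_sketch(void)
+{
+    doublecomplex a[16];
+    doublecomplex zero, one;
+    integer m = 4, n = 4, lda = 4;
+
+    zero.r = 0., zero.i = 0.;
+    one.r = 1., one.i = 0.;
+    zlaset_("Full", &m, &n, &zero, &one, a, &lda);
+}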
+
+/* Subroutine */ int zlasr_(char *side, char *pivot, char *direct, integer *m,
+ integer *n, doublereal *c__, doublereal *s, doublecomplex *a,
+ integer *lda)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+ doublecomplex z__1, z__2, z__3;
+
+ /* Local variables */
+ static integer i__, j, info;
+ static doublecomplex temp;
+ extern logical lsame_(char *, char *);
+ static doublereal ctemp, stemp;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLASR applies a sequence of real plane rotations to a complex matrix
+ A, from either the left or the right.
+
+ When SIDE = 'L', the transformation takes the form
+
+ A := P*A
+
+ and when SIDE = 'R', the transformation takes the form
+
+ A := A*P**T
+
+ where P is an orthogonal matrix consisting of a sequence of z plane
+ rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+ and P**T is the transpose of P.
+
+ When DIRECT = 'F' (Forward sequence), then
+
+ P = P(z-1) * ... * P(2) * P(1)
+
+ and when DIRECT = 'B' (Backward sequence), then
+
+ P = P(1) * P(2) * ... * P(z-1)
+
+ where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+
+ R(k) = ( c(k) s(k) )
+ = ( -s(k) c(k) ).
+
+ When PIVOT = 'V' (Variable pivot), the rotation is performed
+ for the plane (k,k+1), i.e., P(k) has the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears as a rank-2 modification to the identity matrix in
+ rows and columns k and k+1.
+
+ When PIVOT = 'T' (Top pivot), the rotation is performed for the
+ plane (1,k+1), so P(k) has the form
+
+ P(k) = ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+
+ where R(k) appears in rows and columns 1 and k+1.
+
+ Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+ performed for the plane (k,z), giving P(k) the form
+
+ P(k) = ( 1 )
+ ( ... )
+ ( 1 )
+ ( c(k) s(k) )
+ ( 1 )
+ ( ... )
+ ( 1 )
+ ( -s(k) c(k) )
+
+ where R(k) appears in rows and columns k and z. The rotations are
+ performed without ever forming P(k) explicitly.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ Specifies whether the plane rotation matrix P is applied to
+ A on the left or the right.
+ = 'L': Left, compute A := P*A
+          = 'R':  Right, compute A := A*P**T
+
+ PIVOT (input) CHARACTER*1
+ Specifies the plane for which P(k) is a plane rotation
+ matrix.
+ = 'V': Variable pivot, the plane (k,k+1)
+ = 'T': Top pivot, the plane (1,k+1)
+ = 'B': Bottom pivot, the plane (k,z)
+
+ DIRECT (input) CHARACTER*1
+ Specifies whether P is a forward or backward sequence of
+ plane rotations.
+ = 'F': Forward, P = P(z-1)*...*P(2)*P(1)
+ = 'B': Backward, P = P(1)*P(2)*...*P(z-1)
+
+ M (input) INTEGER
+ The number of rows of the matrix A. If m <= 1, an immediate
+ return is effected.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. If n <= 1, an
+ immediate return is effected.
+
+ C (input) DOUBLE PRECISION array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The cosines c(k) of the plane rotations.
+
+ S (input) DOUBLE PRECISION array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ The sines s(k) of the plane rotations. The 2-by-2 plane
+ rotation part of the matrix P(k), R(k), has the form
+ R(k) = ( c(k) s(k) )
+ ( -s(k) c(k) ).
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+          The M-by-N matrix A.  On exit, A is overwritten by P*A if
+          SIDE = 'L' or by A*P**T if SIDE = 'R'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ =====================================================================
+
+
+ Test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --c__;
+ --s;
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ info = 0;
+ if (! (lsame_(side, "L") || lsame_(side, "R"))) {
+ info = 1;
+ } else if (! (lsame_(pivot, "V") || lsame_(pivot,
+ "T") || lsame_(pivot, "B"))) {
+ info = 2;
+ } else if (! (lsame_(direct, "F") || lsame_(direct,
+ "B"))) {
+ info = 3;
+ } else if (*m < 0) {
+ info = 4;
+ } else if (*n < 0) {
+ info = 5;
+ } else if (*lda < max(1,*m)) {
+ info = 9;
+ }
+ if (info != 0) {
+ xerbla_("ZLASR ", &info);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+ if (lsame_(side, "L")) {
+
+/* Form P * A */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + 1 + i__ * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = j + 1 + i__ * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__4 = j + i__ * a_dim1;
+ z__3.r = stemp * a[i__4].r, z__3.i = stemp * a[
+ i__4].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ i__3 = j + i__ * a_dim1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__4 = j + i__ * a_dim1;
+ z__3.r = ctemp * a[i__4].r, z__3.i = ctemp * a[
+ i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L10: */
+ }
+ }
+/* L20: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = j + 1 + i__ * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = j + 1 + i__ * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__3 = j + i__ * a_dim1;
+ z__3.r = stemp * a[i__3].r, z__3.i = stemp * a[
+ i__3].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = j + i__ * a_dim1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__3 = j + i__ * a_dim1;
+ z__3.r = ctemp * a[i__3].r, z__3.i = ctemp * a[
+ i__3].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L30: */
+ }
+ }
+/* L40: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = j + i__ * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__4 = i__ * a_dim1 + 1;
+ z__3.r = stemp * a[i__4].r, z__3.i = stemp * a[
+ i__4].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ i__3 = i__ * a_dim1 + 1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__4 = i__ * a_dim1 + 1;
+ z__3.r = ctemp * a[i__4].r, z__3.i = ctemp * a[
+ i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L50: */
+ }
+ }
+/* L60: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = j + i__ * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = j + i__ * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__3 = i__ * a_dim1 + 1;
+ z__3.r = stemp * a[i__3].r, z__3.i = stemp * a[
+ i__3].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = i__ * a_dim1 + 1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__3 = i__ * a_dim1 + 1;
+ z__3.r = ctemp * a[i__3].r, z__3.i = ctemp * a[
+ i__3].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L70: */
+ }
+ }
+/* L80: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *m - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = j + i__ * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = j + i__ * a_dim1;
+ i__4 = *m + i__ * a_dim1;
+ z__2.r = stemp * a[i__4].r, z__2.i = stemp * a[
+ i__4].i;
+ z__3.r = ctemp * temp.r, z__3.i = ctemp * temp.i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ i__3 = *m + i__ * a_dim1;
+ i__4 = *m + i__ * a_dim1;
+ z__2.r = ctemp * a[i__4].r, z__2.i = ctemp * a[
+ i__4].i;
+ z__3.r = stemp * temp.r, z__3.i = stemp * temp.i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L90: */
+ }
+ }
+/* L100: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *m - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = j + i__ * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = j + i__ * a_dim1;
+ i__3 = *m + i__ * a_dim1;
+ z__2.r = stemp * a[i__3].r, z__2.i = stemp * a[
+ i__3].i;
+ z__3.r = ctemp * temp.r, z__3.i = ctemp * temp.i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = *m + i__ * a_dim1;
+ i__3 = *m + i__ * a_dim1;
+ z__2.r = ctemp * a[i__3].r, z__2.i = ctemp * a[
+ i__3].i;
+ z__3.r = stemp * temp.r, z__3.i = stemp * temp.i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L110: */
+ }
+ }
+/* L120: */
+ }
+ }
+ }
+ } else if (lsame_(side, "R")) {
+
+/* Form A * P' */
+
+ if (lsame_(pivot, "V")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + (j + 1) * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = i__ + (j + 1) * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__4 = i__ + j * a_dim1;
+ z__3.r = stemp * a[i__4].r, z__3.i = stemp * a[
+ i__4].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ i__3 = i__ + j * a_dim1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__4 = i__ + j * a_dim1;
+ z__3.r = ctemp * a[i__4].r, z__3.i = ctemp * a[
+ i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L130: */
+ }
+ }
+/* L140: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + (j + 1) * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = i__ + (j + 1) * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__3 = i__ + j * a_dim1;
+ z__3.r = stemp * a[i__3].r, z__3.i = stemp * a[
+ i__3].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = i__ + j * a_dim1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__3 = i__ + j * a_dim1;
+ z__3.r = ctemp * a[i__3].r, z__3.i = ctemp * a[
+ i__3].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L150: */
+ }
+ }
+/* L160: */
+ }
+ }
+ } else if (lsame_(pivot, "T")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = i__ + j * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__4 = i__ + a_dim1;
+ z__3.r = stemp * a[i__4].r, z__3.i = stemp * a[
+ i__4].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ i__3 = i__ + a_dim1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__4 = i__ + a_dim1;
+ z__3.r = ctemp * a[i__4].r, z__3.i = ctemp * a[
+ i__4].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L170: */
+ }
+ }
+/* L180: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n; j >= 2; --j) {
+ ctemp = c__[j - 1];
+ stemp = s[j - 1];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = i__ + j * a_dim1;
+ z__2.r = ctemp * temp.r, z__2.i = ctemp * temp.i;
+ i__3 = i__ + a_dim1;
+ z__3.r = stemp * a[i__3].r, z__3.i = stemp * a[
+ i__3].i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = i__ + a_dim1;
+ z__2.r = stemp * temp.r, z__2.i = stemp * temp.i;
+ i__3 = i__ + a_dim1;
+ z__3.r = ctemp * a[i__3].r, z__3.i = ctemp * a[
+ i__3].i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L190: */
+ }
+ }
+/* L200: */
+ }
+ }
+ } else if (lsame_(pivot, "B")) {
+ if (lsame_(direct, "F")) {
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__2 = *m;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ temp.r = a[i__3].r, temp.i = a[i__3].i;
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + *n * a_dim1;
+ z__2.r = stemp * a[i__4].r, z__2.i = stemp * a[
+ i__4].i;
+ z__3.r = ctemp * temp.r, z__3.i = ctemp * temp.i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+ i__3 = i__ + *n * a_dim1;
+ i__4 = i__ + *n * a_dim1;
+ z__2.r = ctemp * a[i__4].r, z__2.i = ctemp * a[
+ i__4].i;
+ z__3.r = stemp * temp.r, z__3.i = stemp * temp.i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+/* L210: */
+ }
+ }
+/* L220: */
+ }
+ } else if (lsame_(direct, "B")) {
+ for (j = *n - 1; j >= 1; --j) {
+ ctemp = c__[j];
+ stemp = s[j];
+ if (ctemp != 1. || stemp != 0.) {
+ i__1 = *m;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ temp.r = a[i__2].r, temp.i = a[i__2].i;
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__ + *n * a_dim1;
+ z__2.r = stemp * a[i__3].r, z__2.i = stemp * a[
+ i__3].i;
+ z__3.r = ctemp * temp.r, z__3.i = ctemp * temp.i;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i +
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = i__ + *n * a_dim1;
+ i__3 = i__ + *n * a_dim1;
+ z__2.r = ctemp * a[i__3].r, z__2.i = ctemp * a[
+ i__3].i;
+ z__3.r = stemp * temp.r, z__3.i = stemp * temp.i;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i -
+ z__3.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L230: */
+ }
+ }
+/* L240: */
+ }
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZLASR */
+
+} /* zlasr_ */
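+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name is made up.  It applies the m-1 rotations given by the
+   cosine/sine arrays c and s to A from the left (A := P*A), with variable
+   pivot and forward ordering; only the first character of each string
+   argument is significant.
+*/
+static void zlasr_usage_sketch(doublecomplex *a, integer m, integer n,
+        integer lda, doublereal *c, doublereal *s)
+{
+    zlasr_("Left", "Variable", "Forward", &m, &n, c, s, a, &lda);
+}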
+
+/* Subroutine */ int zlassq_(integer *n, doublecomplex *x, integer *incx,
+ doublereal *scale, doublereal *sumsq)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3;
+ doublereal d__1;
+
+ /* Local variables */
+ static integer ix;
+ static doublereal temp1;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLASSQ returns the values scl and ssq such that
+
+ ( scl**2 )*ssq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+
+ where x( i ) = abs( X( 1 + ( i - 1 )*INCX ) ). The value of sumsq is
+ assumed to be at least unity and the value of ssq will then satisfy
+
+ 1.0 .le. ssq .le. ( sumsq + 2*n ).
+
+ scale is assumed to be non-negative and scl returns the value
+
+ scl = max( scale, abs( real( x( i ) ) ), abs( aimag( x( i ) ) ) ),
+ i
+
+ scale and sumsq must be supplied in SCALE and SUMSQ respectively.
+ SCALE and SUMSQ are overwritten by scl and ssq respectively.
+
+ The routine makes only one pass through the vector X.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of elements to be used from the vector X.
+
+  X       (input) COMPLEX*16 array, dimension (1+(N-1)*INCX)
+ The vector x as described above.
+ x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+
+ INCX (input) INTEGER
+ The increment between successive values of the vector X.
+ INCX > 0.
+
+ SCALE (input/output) DOUBLE PRECISION
+ On entry, the value scale in the equation above.
+ On exit, SCALE is overwritten with the value scl .
+
+ SUMSQ (input/output) DOUBLE PRECISION
+ On entry, the value sumsq in the equation above.
+ On exit, SUMSQ is overwritten with the value ssq .
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --x;
+
+ /* Function Body */
+ if (*n > 0) {
+ i__1 = (*n - 1) * *incx + 1;
+ i__2 = *incx;
+ for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+ i__3 = ix;
+ if (x[i__3].r != 0.) {
+ i__3 = ix;
+ temp1 = (d__1 = x[i__3].r, abs(d__1));
+ if (*scale < temp1) {
+/* Computing 2nd power */
+ d__1 = *scale / temp1;
+ *sumsq = *sumsq * (d__1 * d__1) + 1;
+ *scale = temp1;
+ } else {
+/* Computing 2nd power */
+ d__1 = temp1 / *scale;
+ *sumsq += d__1 * d__1;
+ }
+ }
+ if (d_imag(&x[ix]) != 0.) {
+ temp1 = (d__1 = d_imag(&x[ix]), abs(d__1));
+ if (*scale < temp1) {
+/* Computing 2nd power */
+ d__1 = *scale / temp1;
+ *sumsq = *sumsq * (d__1 * d__1) + 1;
+ *scale = temp1;
+ } else {
+/* Computing 2nd power */
+ d__1 = temp1 / *scale;
+ *sumsq += d__1 * d__1;
+ }
+ }
+/* L10: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLASSQ */
+
+} /* zlassq_ */
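+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name is made up.  This is the usual way zlassq_ is consumed:
+   accumulate (scale, sumsq) over a vector and recover the 2-norm as
+   scale*sqrt(sumsq) without intermediate overflow or underflow.
+*/
+static doublereal zlassq_norm_sketch(integer n, doublecomplex *x)
+{
+    doublereal scale = 0., sumsq = 1.;
+    integer incx = 1;
+
+    zlassq_(&n, x, &incx, &scale, &sumsq);
+    return scale * sqrt(sumsq);
+}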
+
+/* Subroutine */ int zlaswp_(integer *n, doublecomplex *a, integer *lda,
+ integer *k1, integer *k2, integer *ipiv, integer *incx)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+
+ /* Local variables */
+ static integer i__, j, k, i1, i2, n32, ip, ix, ix0, inc;
+ static doublecomplex temp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLASWP performs a series of row interchanges on the matrix A.
+ One row interchange is initiated for each of rows K1 through K2 of A.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of columns of the matrix A.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the matrix of column dimension N to which the row
+ interchanges will be applied.
+ On exit, the permuted matrix.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+
+ K1 (input) INTEGER
+ The first element of IPIV for which a row interchange will
+ be done.
+
+ K2 (input) INTEGER
+ The last element of IPIV for which a row interchange will
+ be done.
+
+ IPIV (input) INTEGER array, dimension (K2*abs(INCX))
+ The vector of pivot indices. Only the elements in positions
+ K1 through K2 of IPIV are accessed.
+ IPIV(K) = L implies rows K and L are to be interchanged.
+
+ INCX (input) INTEGER
+          The increment between successive values of IPIV.  If INCX
+ is negative, the pivots are applied in reverse order.
+
+ Further Details
+ ===============
+
+ Modified by
+ R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA
+
+ =====================================================================
+
+
+ Interchange row I with row IPIV(I) for each of rows K1 through K2.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+
+ /* Function Body */
+ if (*incx > 0) {
+ ix0 = *k1;
+ i1 = *k1;
+ i2 = *k2;
+ inc = 1;
+ } else if (*incx < 0) {
+ ix0 = (1 - *k2) * *incx + 1;
+ i1 = *k2;
+ i2 = *k1;
+ inc = -1;
+ } else {
+ return 0;
+ }
+
+ n32 = *n / 32 << 5;
+ if (n32 != 0) {
+ i__1 = n32;
+ for (j = 1; j <= i__1; j += 32) {
+ ix = ix0;
+ i__2 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__3)
+ {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__4 = j + 31;
+ for (k = j; k <= i__4; ++k) {
+ i__5 = i__ + k * a_dim1;
+ temp.r = a[i__5].r, temp.i = a[i__5].i;
+ i__5 = i__ + k * a_dim1;
+ i__6 = ip + k * a_dim1;
+ a[i__5].r = a[i__6].r, a[i__5].i = a[i__6].i;
+ i__5 = ip + k * a_dim1;
+ a[i__5].r = temp.r, a[i__5].i = temp.i;
+/* L10: */
+ }
+ }
+ ix += *incx;
+/* L20: */
+ }
+/* L30: */
+ }
+ }
+ if (n32 != *n) {
+ ++n32;
+ ix = ix0;
+ i__1 = i2;
+ i__3 = inc;
+ for (i__ = i1; i__3 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__3) {
+ ip = ipiv[ix];
+ if (ip != i__) {
+ i__2 = *n;
+ for (k = n32; k <= i__2; ++k) {
+ i__4 = i__ + k * a_dim1;
+ temp.r = a[i__4].r, temp.i = a[i__4].i;
+ i__4 = i__ + k * a_dim1;
+ i__5 = ip + k * a_dim1;
+ a[i__4].r = a[i__5].r, a[i__4].i = a[i__5].i;
+ i__4 = ip + k * a_dim1;
+ a[i__4].r = temp.r, a[i__4].i = temp.i;
+/* L40: */
+ }
+ }
+ ix += *incx;
+/* L50: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLASWP */
+
+} /* zlaswp_ */
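+
+/*
+   Illustrative usage sketch, not part of the f2c-translated LAPACK source;
+   the helper name is made up.  It applies the row interchanges recorded in
+   an LU-style pivot vector ipiv(1:k) to all n columns of a, in forward
+   order (incx = 1).
+*/
+static void zlaswp_usage_sketch(integer n, doublecomplex *a, integer lda,
+        integer k, integer *ipiv)
+{
+    integer k1 = 1, incx = 1;
+
+    zlaswp_(&n, a, &lda, &k1, &k, ipiv, &incx);
+}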
+
+/* Subroutine */ int zlatrd_(char *uplo, integer *n, integer *nb,
+ doublecomplex *a, integer *lda, doublereal *e, doublecomplex *tau,
+ doublecomplex *w, integer *ldw)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3;
+ doublereal d__1;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__, iw;
+ static doublecomplex alpha;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *);
+ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+ extern /* Subroutine */ int zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *),
+ zhemv_(char *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *), zaxpy_(integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *,
+ integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to
+ Hermitian tridiagonal form by a unitary similarity
+ transformation Q' * A * Q, and returns the matrices V and W which are
+ needed to apply the transformation to the unreduced part of A.
+
+ If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a
+ matrix, of which the upper triangle is supplied;
+ if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a
+ matrix, of which the lower triangle is supplied.
+
+ This is an auxiliary routine called by ZHETRD.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ Hermitian matrix A is stored:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A.
+
+ NB (input) INTEGER
+ The number of rows and columns to be reduced.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ n-by-n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n-by-n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+ On exit:
+ if UPLO = 'U', the last NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+ the diagonal elements of A; the elements above the diagonal
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors;
+ if UPLO = 'L', the first NB columns have been reduced to
+ tridiagonal form, with the diagonal elements overwriting
+ the diagonal elements of A; the elements below the diagonal
+ with the array TAU, represent the unitary matrix Q as a
+ product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ E (output) DOUBLE PRECISION array, dimension (N-1)
+ If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
+ elements of the last NB columns of the reduced matrix;
+ if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
+ the first NB columns of the reduced matrix.
+
+ TAU (output) COMPLEX*16 array, dimension (N-1)
+ The scalar factors of the elementary reflectors, stored in
+ TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
+ See Further Details.
+
+ W (output) COMPLEX*16 array, dimension (LDW,NB)
+ The n-by-nb matrix W required to update the unreduced part
+ of A.
+
+ LDW (input) INTEGER
+ The leading dimension of the array W. LDW >= max(1,N).
+
+ Further Details
+ ===============
+
+ If UPLO = 'U', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(n) H(n-1) . . . H(n-nb+1).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
+ and tau in TAU(i-1).
+
+ If UPLO = 'L', the matrix Q is represented as a product of elementary
+ reflectors
+
+ Q = H(1) H(2) . . . H(nb).
+
+ Each H(i) has the form
+
+ H(i) = I - tau * v * v'
+
+ where tau is a complex scalar, and v is a complex vector with
+ v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
+ and tau in TAU(i).
+
+ The elements of the vectors v together form the n-by-nb matrix V
+ which is needed, with W, to apply the transformation to the unreduced
+ part of the matrix, using a Hermitian rank-2k update of the form:
+ A := A - V*W' - W*V'.
+
+ The contents of A on exit are illustrated by the following examples
+ with n = 5 and nb = 2:
+
+ if UPLO = 'U': if UPLO = 'L':
+
+ ( a a a v4 v5 ) ( d )
+ ( a a v4 v5 ) ( 1 d )
+ ( a 1 v5 ) ( v1 1 a )
+ ( d 1 ) ( v1 v2 a a )
+ ( d ) ( v1 v2 a a a )
+
+ where d denotes a diagonal element of the reduced matrix, a denotes
+ an element of the original matrix that is unchanged, and vi denotes
+ an element of the vector defining H(i).
+
+ =====================================================================
+
+
+ Quick return if possible
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --e;
+ --tau;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+
+ if (lsame_(uplo, "U")) {
+
+/* Reduce last NB columns of upper triangle */
+
+ i__1 = *n - *nb + 1;
+ for (i__ = *n; i__ >= i__1; --i__) {
+ iw = i__ - *n + *nb;
+ if (i__ < *n) {
+
+/* Update A(1:i,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &w[i__ + (iw + 1) * w_dim1], ldw);
+ i__2 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__, &i__2, &z__1, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &w[i__ + (iw + 1) * w_dim1], ldw, &
+ c_b57, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &w[i__ + (iw + 1) * w_dim1], ldw);
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__, &i__2, &z__1, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ c_b57, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ }
+ if (i__ > 1) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(1:i-2,i)
+*/
+
+ i__2 = i__ - 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = i__ - 1;
+ zlarfg_(&i__2, &alpha, &a[i__ * a_dim1 + 1], &c__1, &tau[i__
+ - 1]);
+ i__2 = i__ - 1;
+ e[i__2] = alpha.r;
+ i__2 = i__ - 1 + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute W(1:i-1,i) */
+
+ i__2 = i__ - 1;
+ zhemv_("Upper", &i__2, &c_b57, &a[a_offset], lda, &a[i__ *
+ a_dim1 + 1], &c__1, &c_b56, &w[iw * w_dim1 + 1], &
+ c__1);
+ if (i__ < *n) {
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &w[(
+ iw + 1) * w_dim1 + 1], ldw, &a[i__ * a_dim1 + 1],
+ &c__1, &c_b56, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b57, &w[iw * w_dim1 + 1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[(
+ i__ + 1) * a_dim1 + 1], lda, &a[i__ * a_dim1 + 1],
+ &c__1, &c_b56, &w[i__ + 1 + iw * w_dim1], &c__1);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &w[(iw + 1) *
+ w_dim1 + 1], ldw, &w[i__ + 1 + iw * w_dim1], &
+ c__1, &c_b57, &w[iw * w_dim1 + 1], &c__1);
+ }
+ i__2 = i__ - 1;
+ zscal_(&i__2, &tau[i__ - 1], &w[iw * w_dim1 + 1], &c__1);
+ z__3.r = -.5, z__3.i = -0.;
+ i__2 = i__ - 1;
+ z__2.r = z__3.r * tau[i__2].r - z__3.i * tau[i__2].i, z__2.i =
+ z__3.r * tau[i__2].i + z__3.i * tau[i__2].r;
+ i__3 = i__ - 1;
+ zdotc_(&z__4, &i__3, &w[iw * w_dim1 + 1], &c__1, &a[i__ *
+ a_dim1 + 1], &c__1);
+ z__1.r = z__2.r * z__4.r - z__2.i * z__4.i, z__1.i = z__2.r *
+ z__4.i + z__2.i * z__4.r;
+ alpha.r = z__1.r, alpha.i = z__1.i;
+ i__2 = i__ - 1;
+ zaxpy_(&i__2, &alpha, &a[i__ * a_dim1 + 1], &c__1, &w[iw *
+ w_dim1 + 1], &c__1);
+ }
+
+/* L10: */
+ }
+ } else {
+
+/* Reduce first NB columns of lower triangle */
+
+ i__1 = *nb;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+
+/* Update A(i:n,i) */
+
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &w[i__ + w_dim1], ldw);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[i__ + a_dim1], lda,
+ &w[i__ + w_dim1], ldw, &c_b57, &a[i__ + i__ * a_dim1], &
+ c__1);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &w[i__ + w_dim1], ldw);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__ + 1;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &w[i__ + w_dim1], ldw,
+ &a[i__ + a_dim1], lda, &c_b57, &a[i__ + i__ * a_dim1], &
+ c__1);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = i__ + i__ * a_dim1;
+ d__1 = a[i__3].r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ if (i__ < *n) {
+
+/*
+ Generate elementary reflector H(i) to annihilate
+ A(i+2:n,i)
+*/
+
+ i__2 = i__ + 1 + i__ * a_dim1;
+ alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+ i__2 = *n - i__;
+/* Computing MIN */
+ i__3 = i__ + 2;
+ zlarfg_(&i__2, &alpha, &a[min(i__3,*n) + i__ * a_dim1], &c__1,
+ &tau[i__]);
+ i__2 = i__;
+ e[i__2] = alpha.r;
+ i__2 = i__ + 1 + i__ * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+
+/* Compute W(i+1:n,i) */
+
+ i__2 = *n - i__;
+ zhemv_("Lower", &i__2, &c_b57, &a[i__ + 1 + (i__ + 1) *
+ a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &w[i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &w[i__ +
+ 1 + w_dim1], ldw, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &w[i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[i__ + 1 +
+ a_dim1], lda, &w[i__ * w_dim1 + 1], &c__1, &c_b57, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ c_b56, &w[i__ * w_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &w[i__ + 1 +
+ w_dim1], ldw, &w[i__ * w_dim1 + 1], &c__1, &c_b57, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ i__2 = *n - i__;
+ zscal_(&i__2, &tau[i__], &w[i__ + 1 + i__ * w_dim1], &c__1);
+ z__3.r = -.5, z__3.i = -0.;
+ i__2 = i__;
+ z__2.r = z__3.r * tau[i__2].r - z__3.i * tau[i__2].i, z__2.i =
+ z__3.r * tau[i__2].i + z__3.i * tau[i__2].r;
+ i__3 = *n - i__;
+ zdotc_(&z__4, &i__3, &w[i__ + 1 + i__ * w_dim1], &c__1, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+ z__1.r = z__2.r * z__4.r - z__2.i * z__4.i, z__1.i = z__2.r *
+ z__4.i + z__2.i * z__4.r;
+ alpha.r = z__1.r, alpha.i = z__1.i;
+ i__2 = *n - i__;
+ zaxpy_(&i__2, &alpha, &a[i__ + 1 + i__ * a_dim1], &c__1, &w[
+ i__ + 1 + i__ * w_dim1], &c__1);
+ }
+
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLATRD */
+
+} /* zlatrd_ */
+
+/* Subroutine */ int zlatrs_(char *uplo, char *trans, char *diag, char *
+ normin, integer *n, doublecomplex *a, integer *lda, doublecomplex *x,
+ doublereal *scale, doublereal *cnorm, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+ doublereal d__1, d__2, d__3, d__4;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__, j;
+ static doublereal xj, rec, tjj;
+ static integer jinc;
+ static doublereal xbnd;
+ static integer imax;
+ static doublereal tmax;
+ static doublecomplex tjjs;
+ static doublereal xmax, grow;
+ extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *,
+ integer *);
+ extern logical lsame_(char *, char *);
+ static doublereal tscal;
+ static doublecomplex uscal;
+ static integer jlast;
+ static doublecomplex csumj;
+ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+ static logical upper;
+ extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+ extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *), ztrsv_(
+ char *, char *, char *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), dlabad_(
+ doublereal *, doublereal *);
+
+ extern integer idamax_(integer *, doublereal *, integer *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), zdscal_(
+ integer *, doublereal *, doublecomplex *, integer *);
+ static doublereal bignum;
+ extern integer izamax_(integer *, doublecomplex *, integer *);
+ extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *,
+ doublecomplex *);
+ static logical notran;
+ static integer jfirst;
+ extern doublereal dzasum_(integer *, doublecomplex *, integer *);
+ static doublereal smlnum;
+ static logical nounit;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLATRS solves one of the triangular systems
+
+ A * x = s*b, A**T * x = s*b, or A**H * x = s*b,
+
+ with scaling to prevent overflow. Here A is an upper or lower
+ triangular matrix, A**T denotes the transpose of A, A**H denotes the
+ conjugate transpose of A, x and b are n-element vectors, and s is a
+ scaling factor, usually less than or equal to 1, chosen so that the
+ components of x will be less than the overflow threshold. If the
+ unscaled problem will not cause overflow, the Level 2 BLAS routine
+ ZTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+ then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the matrix A is upper or lower triangular.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ TRANS (input) CHARACTER*1
+ Specifies the operation applied to A.
+ = 'N': Solve A * x = s*b (No transpose)
+ = 'T': Solve A**T * x = s*b (Transpose)
+ = 'C': Solve A**H * x = s*b (Conjugate transpose)
+
+ DIAG (input) CHARACTER*1
+ Specifies whether or not the matrix A is unit triangular.
+ = 'N': Non-unit triangular
+ = 'U': Unit triangular
+
+ NORMIN (input) CHARACTER*1
+ Specifies whether CNORM has been set or not.
+ = 'Y': CNORM contains the column norms on entry
+ = 'N': CNORM is not set on entry. On exit, the norms will
+ be computed and stored in CNORM.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The triangular matrix A. If UPLO = 'U', the leading n by n
+ upper triangular part of the array A contains the upper
+ triangular matrix, and the strictly lower triangular part of
+ A is not referenced. If UPLO = 'L', the leading n by n lower
+ triangular part of the array A contains the lower triangular
+ matrix, and the strictly upper triangular part of A is not
+ referenced. If DIAG = 'U', the diagonal elements of A are
+ also not referenced and are assumed to be 1.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max (1,N).
+
+ X (input/output) COMPLEX*16 array, dimension (N)
+ On entry, the right hand side b of the triangular system.
+ On exit, X is overwritten by the solution vector x.
+
+ SCALE (output) DOUBLE PRECISION
+ The scaling factor s for the triangular system
+ A * x = s*b, A**T * x = s*b, or A**H * x = s*b.
+ If SCALE = 0, the matrix A is singular or badly scaled, and
+ the vector x is an exact or approximate solution to A*x = 0.
+
+ CNORM (input or output) DOUBLE PRECISION array, dimension (N)
+
+ If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
+ contains the norm of the off-diagonal part of the j-th column
+ of A. If TRANS = 'N', CNORM(j) must be greater than or equal
+ to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
+ must be greater than or equal to the 1-norm.
+
+ If NORMIN = 'N', CNORM is an output argument and CNORM(j)
+ returns the 1-norm of the offdiagonal part of the j-th column
+ of A.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ Further Details
+ ======= =======
+
+ A rough bound on x is computed; if that is less than overflow, ZTRSV
+ is called, otherwise, specific code is used which checks for possible
+ overflow or divide-by-zero at every operation.
+
+ A columnwise scheme is used for solving A*x = b. The basic algorithm
+ if A is lower triangular is
+
+ x[1:n] := b[1:n]
+ for j = 1, ..., n
+ x(j) := x(j) / A(j,j)
+ x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j]
+ end
+
+ Define bounds on the components of x after j iterations of the loop:
+ M(j) = bound on x[1:j]
+ G(j) = bound on x[j+1:n]
+ Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}.
+
+ Then for iteration j+1 we have
+ M(j+1) <= G(j) / | A(j+1,j+1) |
+ G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] |
+ <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | )
+
+ where CNORM(j+1) is greater than or equal to the infinity-norm of
+ column j+1 of A, not counting the diagonal. Hence
+
+ G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | )
+ 1<=i<=j
+ and
+
+ |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| )
+ 1<=i< j
+
+ Since |x(j)| <= M(j), we use the Level 2 BLAS routine ZTRSV if the
+ reciprocal of the largest M(j), j=1,..,n, is larger than
+ max(underflow, 1/overflow).
+
+ The bound on x(j) is also used to determine when a step in the
+ columnwise method can be performed without fear of overflow. If
+ the computed bound is greater than a large constant, x is scaled to
+ prevent overflow, but if the bound overflows, x is set to 0, x(j) to
+ 1, and scale to 0, and a non-trivial solution to A*x = 0 is found.
+
+ Similarly, a row-wise scheme is used to solve A**T *x = b or
+ A**H *x = b. The basic algorithm for A upper triangular is
+
+ for j = 1, ..., n
+ x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j)
+ end
+
+ We simultaneously compute two bounds
+ G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j
+ M(j) = bound on x(i), 1<=i<=j
+
+ The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we
+ add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1.
+ Then the bound on x(j) is
+
+ M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) |
+
+ <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| )
+ 1<=i<=j
+
+ and we can safely call ZTRSV if 1/M(n) and 1/G(n) are both greater
+ than max(underflow, 1/overflow).
+
+ =====================================================================
+*/
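+
+/*
+   Illustrative usage sketch (not from the original LAPACK translation).
+   The local names below are invented for the example; a_loc holds a small
+   upper triangular matrix in column-major order and x_loc holds the right
+   hand side b on entry.
+*/
+#if 0
+    integer n_loc = 3, lda_loc = 3, info_loc;
+    doublereal scale_loc, cnorm_loc[3];
+    doublecomplex a_loc[9], x_loc[3];
+    /* ... fill the upper triangle of a_loc and the vector x_loc = b ... */
+    zlatrs_("Upper", "No transpose", "Non-unit", "N", &n_loc, a_loc,
+            &lda_loc, x_loc, &scale_loc, cnorm_loc, &info_loc);
+    /* On return x_loc solves U * x = scale_loc * b with scale_loc <= 1
+       (scale_loc = 0 only if U is singular); cnorm_loc now holds the
+       off-diagonal column norms and can be reused on a later call for the
+       same matrix by passing NORMIN = "Y". */
+#endif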
+
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --x;
+ --cnorm;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ notran = lsame_(trans, "N");
+ nounit = lsame_(diag, "N");
+
+/* Test the input parameters. */
+
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "T") && !
+ lsame_(trans, "C")) {
+ *info = -2;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -3;
+ } else if (! lsame_(normin, "Y") && ! lsame_(normin,
+ "N")) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*lda < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLATRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine machine dependent parameters to control overflow. */
+
+ smlnum = SAFEMINIMUM;
+ bignum = 1. / smlnum;
+ dlabad_(&smlnum, &bignum);
+ smlnum /= PRECISION;
+ bignum = 1. / smlnum;
+ *scale = 1.;
+
+ if (lsame_(normin, "N")) {
+
+/* Compute the 1-norm of each column, not including the diagonal. */
+
+ if (upper) {
+
+/* A is upper triangular. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j - 1;
+ cnorm[j] = dzasum_(&i__2, &a[j * a_dim1 + 1], &c__1);
+/* L10: */
+ }
+ } else {
+
+/* A is lower triangular. */
+
+ i__1 = *n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n - j;
+ cnorm[j] = dzasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1);
+/* L20: */
+ }
+ cnorm[*n] = 0.;
+ }
+ }
+
+/*
+ Scale the column norms by TSCAL if the maximum element in CNORM is
+ greater than BIGNUM/2.
+*/
+
+ imax = idamax_(n, &cnorm[1], &c__1);
+ tmax = cnorm[imax];
+ if (tmax <= bignum * .5) {
+ tscal = 1.;
+ } else {
+ tscal = .5 / (smlnum * tmax);
+ dscal_(n, &tscal, &cnorm[1], &c__1);
+ }
+
+/*
+ Compute a bound on the computed solution vector to see if the
+ Level 2 BLAS routine ZTRSV can be used.
+*/
+
+ xmax = 0.;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+ i__2 = j;
+ d__3 = xmax, d__4 = (d__1 = x[i__2].r / 2., abs(d__1)) + (d__2 =
+ d_imag(&x[j]) / 2., abs(d__2));
+ xmax = max(d__3,d__4);
+/* L30: */
+ }
+ xbnd = xmax;
+
+ if (notran) {
+
+/* Compute the growth in A * x = b. */
+
+ if (upper) {
+ jfirst = *n;
+ jlast = 1;
+ jinc = -1;
+ } else {
+ jfirst = 1;
+ jlast = *n;
+ jinc = 1;
+ }
+
+ if (tscal != 1.) {
+ grow = 0.;
+ goto L60;
+ }
+
+ if (nounit) {
+
+/*
+ A is non-unit triangular.
+
+ Compute GROW = 1/G(j) and XBND = 1/M(j).
+ Initially, G(0) = max{x(i), i=1,...,n}.
+*/
+
+ grow = .5 / max(xbnd,smlnum);
+ xbnd = grow;
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L60;
+ }
+
+ i__3 = j + j * a_dim1;
+ tjjs.r = a[i__3].r, tjjs.i = a[i__3].i;
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), abs(
+ d__2));
+
+ if (tjj >= smlnum) {
+
+/*
+ M(j) = G(j-1) / abs(A(j,j))
+
+ Computing MIN
+*/
+ d__1 = xbnd, d__2 = min(1.,tjj) * grow;
+ xbnd = min(d__1,d__2);
+ } else {
+
+/* M(j) could overflow, set XBND to 0. */
+
+ xbnd = 0.;
+ }
+
+ if (tjj + cnorm[j] >= smlnum) {
+
+/* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */
+
+ grow *= tjj / (tjj + cnorm[j]);
+ } else {
+
+/* G(j) could overflow, set GROW to 0. */
+
+ grow = 0.;
+ }
+/* L40: */
+ }
+ grow = xbnd;
+ } else {
+
+/*
+ A is unit triangular.
+
+ Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}.
+
+ Computing MIN
+*/
+ d__1 = 1., d__2 = .5 / max(xbnd,smlnum);
+ grow = min(d__1,d__2);
+ i__2 = jlast;
+ i__1 = jinc;
+ for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L60;
+ }
+
+/* G(j) = G(j-1)*( 1 + CNORM(j) ) */
+
+ grow *= 1. / (cnorm[j] + 1.);
+/* L50: */
+ }
+ }
+L60:
+
+ ;
+ } else {
+
+/* Compute the growth in A**T * x = b or A**H * x = b. */
+
+ if (upper) {
+ jfirst = 1;
+ jlast = *n;
+ jinc = 1;
+ } else {
+ jfirst = *n;
+ jlast = 1;
+ jinc = -1;
+ }
+
+ if (tscal != 1.) {
+ grow = 0.;
+ goto L90;
+ }
+
+ if (nounit) {
+
+/*
+ A is non-unit triangular.
+
+ Compute GROW = 1/G(j) and XBND = 1/M(j).
+ Initially, M(0) = max{x(i), i=1,...,n}.
+*/
+
+ grow = .5 / max(xbnd,smlnum);
+ xbnd = grow;
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L90;
+ }
+
+/* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */
+
+ xj = cnorm[j] + 1.;
+/* Computing MIN */
+ d__1 = grow, d__2 = xbnd / xj;
+ grow = min(d__1,d__2);
+
+ i__3 = j + j * a_dim1;
+ tjjs.r = a[i__3].r, tjjs.i = a[i__3].i;
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), abs(
+ d__2));
+
+ if (tjj >= smlnum) {
+
+/* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */
+
+ if (xj > tjj) {
+ xbnd *= tjj / xj;
+ }
+ } else {
+
+/* M(j) could overflow, set XBND to 0. */
+
+ xbnd = 0.;
+ }
+/* L70: */
+ }
+ grow = min(grow,xbnd);
+ } else {
+
+/*
+ A is unit triangular.
+
+ Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}.
+
+ Computing MIN
+*/
+ d__1 = 1., d__2 = .5 / max(xbnd,smlnum);
+ grow = min(d__1,d__2);
+ i__2 = jlast;
+ i__1 = jinc;
+ for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/* Exit the loop if the growth factor is too small. */
+
+ if (grow <= smlnum) {
+ goto L90;
+ }
+
+/* G(j) = ( 1 + CNORM(j) )*G(j-1) */
+
+ xj = cnorm[j] + 1.;
+ grow /= xj;
+/* L80: */
+ }
+ }
+L90:
+ ;
+ }
+
+ if (grow * tscal > smlnum) {
+
+/*
+ Use the Level 2 BLAS solve if the reciprocal of the bound on
+ elements of X is not too small.
+*/
+
+ ztrsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1);
+ } else {
+
+/* Use a Level 1 BLAS solve, scaling intermediate results. */
+
+ if (xmax > bignum * .5) {
+
+/*
+ Scale X so that its components are less than or equal to
+ BIGNUM in absolute value.
+*/
+
+ *scale = bignum * .5 / xmax;
+ zdscal_(n, scale, &x[1], &c__1);
+ xmax = bignum;
+ } else {
+ xmax *= 2.;
+ }
+
+ if (notran) {
+
+/* Solve A * x = b */
+
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */
+
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j]),
+ abs(d__2));
+ if (nounit) {
+ i__3 = j + j * a_dim1;
+ z__1.r = tscal * a[i__3].r, z__1.i = tscal * a[i__3].i;
+ tjjs.r = z__1.r, tjjs.i = z__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.;
+ if (tscal == 1.) {
+ goto L110;
+ }
+ }
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), abs(
+ d__2));
+ if (tjj > smlnum) {
+
+/* abs(A(j,j)) > SMLNUM: */
+
+ if (tjj < 1.) {
+ if (xj > tjj * bignum) {
+
+/* Scale x by 1/b(j). */
+
+ rec = 1. / xj;
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+ i__3 = j;
+ zladiv_(&z__1, &x[j], &tjjs);
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
+ , abs(d__2));
+ } else if (tjj > 0.) {
+
+/* 0 < abs(A(j,j)) <= SMLNUM: */
+
+ if (xj > tjj * bignum) {
+
+/*
+ Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM
+ to avoid overflow when dividing by A(j,j).
+*/
+
+ rec = tjj * bignum / xj;
+ if (cnorm[j] > 1.) {
+
+/*
+ Scale by 1/CNORM(j) to avoid overflow when
+ multiplying x(j) times column j.
+*/
+
+ rec /= cnorm[j];
+ }
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ i__3 = j;
+ zladiv_(&z__1, &x[j], &tjjs);
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
+ , abs(d__2));
+ } else {
+
+/*
+ A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and
+ scale = 0, and compute a solution to A*x = 0.
+*/
+
+ i__3 = *n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ x[i__4].r = 0., x[i__4].i = 0.;
+/* L100: */
+ }
+ i__3 = j;
+ x[i__3].r = 1., x[i__3].i = 0.;
+ xj = 1.;
+ *scale = 0.;
+ xmax = 0.;
+ }
+L110:
+
+/*
+ Scale x if necessary to avoid overflow when adding a
+ multiple of column j of A.
+*/
+
+ if (xj > 1.) {
+ rec = 1. / xj;
+ if (cnorm[j] > (bignum - xmax) * rec) {
+
+/* Scale x by 1/(2*abs(x(j))). */
+
+ rec *= .5;
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ }
+ } else if (xj * cnorm[j] > bignum - xmax) {
+
+/* Scale x by 1/2. */
+
+ zdscal_(n, &c_b2435, &x[1], &c__1);
+ *scale *= .5;
+ }
+
+ if (upper) {
+ if (j > 1) {
+
+/*
+ Compute the update
+ x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j)
+*/
+
+ i__3 = j - 1;
+ i__4 = j;
+ z__2.r = -x[i__4].r, z__2.i = -x[i__4].i;
+ z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
+ zaxpy_(&i__3, &z__1, &a[j * a_dim1 + 1], &c__1, &x[1],
+ &c__1);
+ i__3 = j - 1;
+ i__ = izamax_(&i__3, &x[1], &c__1);
+ i__3 = i__;
+ xmax = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(
+ &x[i__]), abs(d__2));
+ }
+ } else {
+ if (j < *n) {
+
+/*
+ Compute the update
+ x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j)
+*/
+
+ i__3 = *n - j;
+ i__4 = j;
+ z__2.r = -x[i__4].r, z__2.i = -x[i__4].i;
+ z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
+ zaxpy_(&i__3, &z__1, &a[j + 1 + j * a_dim1], &c__1, &
+ x[j + 1], &c__1);
+ i__3 = *n - j;
+ i__ = j + izamax_(&i__3, &x[j + 1], &c__1);
+ i__3 = i__;
+ xmax = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(
+ &x[i__]), abs(d__2));
+ }
+ }
+/* L120: */
+ }
+
+ } else if (lsame_(trans, "T")) {
+
+/* Solve A**T * x = b */
+
+ i__2 = jlast;
+ i__1 = jinc;
+ for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/*
+ Compute x(j) = b(j) - sum A(k,j)*x(k).
+ k<>j
+*/
+
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j]),
+ abs(d__2));
+ uscal.r = tscal, uscal.i = 0.;
+ rec = 1. / max(xmax,1.);
+ if (cnorm[j] > (bignum - xj) * rec) {
+
+/* If x(j) could overflow, scale x by 1/(2*XMAX). */
+
+ rec *= .5;
+ if (nounit) {
+ i__3 = j + j * a_dim1;
+ z__1.r = tscal * a[i__3].r, z__1.i = tscal * a[i__3]
+ .i;
+ tjjs.r = z__1.r, tjjs.i = z__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.;
+ }
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs),
+ abs(d__2));
+ if (tjj > 1.) {
+
+/*
+ Divide by A(j,j) when scaling x if A(j,j) > 1.
+
+ Computing MIN
+*/
+ d__1 = 1., d__2 = rec * tjj;
+ rec = min(d__1,d__2);
+ zladiv_(&z__1, &uscal, &tjjs);
+ uscal.r = z__1.r, uscal.i = z__1.i;
+ }
+ if (rec < 1.) {
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+
+ csumj.r = 0., csumj.i = 0.;
+ if (uscal.r == 1. && uscal.i == 0.) {
+
+/*
+ If the scaling needed for A in the dot product is 1,
+ call ZDOTU to perform the dot product.
+*/
+
+ if (upper) {
+ i__3 = j - 1;
+ zdotu_(&z__1, &i__3, &a[j * a_dim1 + 1], &c__1, &x[1],
+ &c__1);
+ csumj.r = z__1.r, csumj.i = z__1.i;
+ } else if (j < *n) {
+ i__3 = *n - j;
+ zdotu_(&z__1, &i__3, &a[j + 1 + j * a_dim1], &c__1, &
+ x[j + 1], &c__1);
+ csumj.r = z__1.r, csumj.i = z__1.i;
+ }
+ } else {
+
+/* Otherwise, use in-line code for the dot product. */
+
+ if (upper) {
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * a_dim1;
+ z__3.r = a[i__4].r * uscal.r - a[i__4].i *
+ uscal.i, z__3.i = a[i__4].r * uscal.i + a[
+ i__4].i * uscal.r;
+ i__5 = i__;
+ z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i,
+ z__2.i = z__3.r * x[i__5].i + z__3.i * x[
+ i__5].r;
+ z__1.r = csumj.r + z__2.r, z__1.i = csumj.i +
+ z__2.i;
+ csumj.r = z__1.r, csumj.i = z__1.i;
+/* L130: */
+ }
+ } else if (j < *n) {
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ i__4 = i__ + j * a_dim1;
+ z__3.r = a[i__4].r * uscal.r - a[i__4].i *
+ uscal.i, z__3.i = a[i__4].r * uscal.i + a[
+ i__4].i * uscal.r;
+ i__5 = i__;
+ z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i,
+ z__2.i = z__3.r * x[i__5].i + z__3.i * x[
+ i__5].r;
+ z__1.r = csumj.r + z__2.r, z__1.i = csumj.i +
+ z__2.i;
+ csumj.r = z__1.r, csumj.i = z__1.i;
+/* L140: */
+ }
+ }
+ }
+
+ z__1.r = tscal, z__1.i = 0.;
+ if (uscal.r == z__1.r && uscal.i == z__1.i) {
+
+/*
+ Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j)
+                    was not used to scale the dot product.
+*/
+
+ i__3 = j;
+ i__4 = j;
+ z__1.r = x[i__4].r - csumj.r, z__1.i = x[i__4].i -
+ csumj.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
+ , abs(d__2));
+ if (nounit) {
+ i__3 = j + j * a_dim1;
+ z__1.r = tscal * a[i__3].r, z__1.i = tscal * a[i__3]
+ .i;
+ tjjs.r = z__1.r, tjjs.i = z__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.;
+ if (tscal == 1.) {
+ goto L160;
+ }
+ }
+
+/* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
+
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs),
+ abs(d__2));
+ if (tjj > smlnum) {
+
+/* abs(A(j,j)) > SMLNUM: */
+
+ if (tjj < 1.) {
+ if (xj > tjj * bignum) {
+
+/* Scale X by 1/abs(x(j)). */
+
+ rec = 1. / xj;
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+ i__3 = j;
+ zladiv_(&z__1, &x[j], &tjjs);
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ } else if (tjj > 0.) {
+
+/* 0 < abs(A(j,j)) <= SMLNUM: */
+
+ if (xj > tjj * bignum) {
+
+/* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
+
+ rec = tjj * bignum / xj;
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ i__3 = j;
+ zladiv_(&z__1, &x[j], &tjjs);
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ } else {
+
+/*
+ A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and
+ scale = 0 and compute a solution to A**T *x = 0.
+*/
+
+ i__3 = *n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ x[i__4].r = 0., x[i__4].i = 0.;
+/* L150: */
+ }
+ i__3 = j;
+ x[i__3].r = 1., x[i__3].i = 0.;
+ *scale = 0.;
+ xmax = 0.;
+ }
+L160:
+ ;
+ } else {
+
+/*
+ Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot
+                    product has already been divided by A(j,j).
+*/
+
+ i__3 = j;
+ zladiv_(&z__2, &x[j], &tjjs);
+ z__1.r = z__2.r - csumj.r, z__1.i = z__2.i - csumj.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ }
+/* Computing MAX */
+ i__3 = j;
+ d__3 = xmax, d__4 = (d__1 = x[i__3].r, abs(d__1)) + (d__2 =
+ d_imag(&x[j]), abs(d__2));
+ xmax = max(d__3,d__4);
+/* L170: */
+ }
+
+ } else {
+
+/* Solve A**H * x = b */
+
+ i__1 = jlast;
+ i__2 = jinc;
+ for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/*
+ Compute x(j) = b(j) - sum A(k,j)*x(k).
+ k<>j
+*/
+
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j]),
+ abs(d__2));
+ uscal.r = tscal, uscal.i = 0.;
+ rec = 1. / max(xmax,1.);
+ if (cnorm[j] > (bignum - xj) * rec) {
+
+/* If x(j) could overflow, scale x by 1/(2*XMAX). */
+
+ rec *= .5;
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
+ tjjs.r = z__1.r, tjjs.i = z__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.;
+ }
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs),
+ abs(d__2));
+ if (tjj > 1.) {
+
+/*
+ Divide by A(j,j) when scaling x if A(j,j) > 1.
+
+ Computing MIN
+*/
+ d__1 = 1., d__2 = rec * tjj;
+ rec = min(d__1,d__2);
+ zladiv_(&z__1, &uscal, &tjjs);
+ uscal.r = z__1.r, uscal.i = z__1.i;
+ }
+ if (rec < 1.) {
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+
+ csumj.r = 0., csumj.i = 0.;
+ if (uscal.r == 1. && uscal.i == 0.) {
+
+/*
+ If the scaling needed for A in the dot product is 1,
+ call ZDOTC to perform the dot product.
+*/
+
+ if (upper) {
+ i__3 = j - 1;
+ zdotc_(&z__1, &i__3, &a[j * a_dim1 + 1], &c__1, &x[1],
+ &c__1);
+ csumj.r = z__1.r, csumj.i = z__1.i;
+ } else if (j < *n) {
+ i__3 = *n - j;
+ zdotc_(&z__1, &i__3, &a[j + 1 + j * a_dim1], &c__1, &
+ x[j + 1], &c__1);
+ csumj.r = z__1.r, csumj.i = z__1.i;
+ }
+ } else {
+
+/* Otherwise, use in-line code for the dot product. */
+
+ if (upper) {
+ i__3 = j - 1;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ d_cnjg(&z__4, &a[i__ + j * a_dim1]);
+ z__3.r = z__4.r * uscal.r - z__4.i * uscal.i,
+ z__3.i = z__4.r * uscal.i + z__4.i *
+ uscal.r;
+ i__4 = i__;
+ z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i,
+ z__2.i = z__3.r * x[i__4].i + z__3.i * x[
+ i__4].r;
+ z__1.r = csumj.r + z__2.r, z__1.i = csumj.i +
+ z__2.i;
+ csumj.r = z__1.r, csumj.i = z__1.i;
+/* L180: */
+ }
+ } else if (j < *n) {
+ i__3 = *n;
+ for (i__ = j + 1; i__ <= i__3; ++i__) {
+ d_cnjg(&z__4, &a[i__ + j * a_dim1]);
+ z__3.r = z__4.r * uscal.r - z__4.i * uscal.i,
+ z__3.i = z__4.r * uscal.i + z__4.i *
+ uscal.r;
+ i__4 = i__;
+ z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i,
+ z__2.i = z__3.r * x[i__4].i + z__3.i * x[
+ i__4].r;
+ z__1.r = csumj.r + z__2.r, z__1.i = csumj.i +
+ z__2.i;
+ csumj.r = z__1.r, csumj.i = z__1.i;
+/* L190: */
+ }
+ }
+ }
+
+ z__1.r = tscal, z__1.i = 0.;
+ if (uscal.r == z__1.r && uscal.i == z__1.i) {
+
+/*
+ Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j)
+                    was not used to scale the dot product.
+*/
+
+ i__3 = j;
+ i__4 = j;
+ z__1.r = x[i__4].r - csumj.r, z__1.i = x[i__4].i -
+ csumj.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ i__3 = j;
+ xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
+ , abs(d__2));
+ if (nounit) {
+ d_cnjg(&z__2, &a[j + j * a_dim1]);
+ z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
+ tjjs.r = z__1.r, tjjs.i = z__1.i;
+ } else {
+ tjjs.r = tscal, tjjs.i = 0.;
+ if (tscal == 1.) {
+ goto L210;
+ }
+ }
+
+/* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
+
+ tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs),
+ abs(d__2));
+ if (tjj > smlnum) {
+
+/* abs(A(j,j)) > SMLNUM: */
+
+ if (tjj < 1.) {
+ if (xj > tjj * bignum) {
+
+/* Scale X by 1/abs(x(j)). */
+
+ rec = 1. / xj;
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ }
+ i__3 = j;
+ zladiv_(&z__1, &x[j], &tjjs);
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ } else if (tjj > 0.) {
+
+/* 0 < abs(A(j,j)) <= SMLNUM: */
+
+ if (xj > tjj * bignum) {
+
+/* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
+
+ rec = tjj * bignum / xj;
+ zdscal_(n, &rec, &x[1], &c__1);
+ *scale *= rec;
+ xmax *= rec;
+ }
+ i__3 = j;
+ zladiv_(&z__1, &x[j], &tjjs);
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ } else {
+
+/*
+ A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and
+ scale = 0 and compute a solution to A**H *x = 0.
+*/
+
+ i__3 = *n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ i__4 = i__;
+ x[i__4].r = 0., x[i__4].i = 0.;
+/* L200: */
+ }
+ i__3 = j;
+ x[i__3].r = 1., x[i__3].i = 0.;
+ *scale = 0.;
+ xmax = 0.;
+ }
+L210:
+ ;
+ } else {
+
+/*
+ Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot
+                    product has already been divided by A(j,j).
+*/
+
+ i__3 = j;
+ zladiv_(&z__2, &x[j], &tjjs);
+ z__1.r = z__2.r - csumj.r, z__1.i = z__2.i - csumj.i;
+ x[i__3].r = z__1.r, x[i__3].i = z__1.i;
+ }
+/* Computing MAX */
+ i__3 = j;
+ d__3 = xmax, d__4 = (d__1 = x[i__3].r, abs(d__1)) + (d__2 =
+ d_imag(&x[j]), abs(d__2));
+ xmax = max(d__3,d__4);
+/* L220: */
+ }
+ }
+ *scale /= tscal;
+ }
+
+/* Scale the column norms by 1/TSCAL for return. */
+
+ if (tscal != 1.) {
+ d__1 = 1. / tscal;
+ dscal_(n, &d__1, &cnorm[1], &c__1);
+ }
+
+ return 0;
+
+/* End of ZLATRS */
+
+} /* zlatrs_ */
+
+/* Subroutine */ int zlauu2_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__;
+ static doublereal aii;
+ extern logical lsame_(char *, char *);
+ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+ extern /* Subroutine */ int zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *), zdscal_(
+ integer *, doublereal *, doublecomplex *, integer *), zlacgv_(
+ integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLAUU2 computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
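+
+/*
+   Worked example (not from the original LAPACK translation), n = 2,
+   UPLO = 'U', with ' denoting the conjugate transpose:
+
+       U = ( u11  u12 )      U * U' = ( |u11|^2 + |u12|^2   u12*conjg(u22) )
+           (  0   u22 )               (  conjg(u12)*u22         |u22|^2    )
+
+   Only the upper triangle of the product is written back into A; the
+   strictly lower triangular part of A is left untouched.
+*/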
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLAUU2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ aii = a[i__2].r;
+ if (i__ < *n) {
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = *n - i__;
+ zdotc_(&z__1, &i__3, &a[i__ + (i__ + 1) * a_dim1], lda, &a[
+ i__ + (i__ + 1) * a_dim1], lda);
+ d__1 = aii * aii + z__1.r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = *n - i__;
+ z__1.r = aii, z__1.i = 0.;
+ zgemv_("No transpose", &i__2, &i__3, &c_b57, &a[(i__ + 1) *
+ a_dim1 + 1], lda, &a[i__ + (i__ + 1) * a_dim1], lda, &
+ z__1, &a[i__ * a_dim1 + 1], &c__1);
+ i__2 = *n - i__;
+ zlacgv_(&i__2, &a[i__ + (i__ + 1) * a_dim1], lda);
+ } else {
+ zdscal_(&i__, &aii, &a[i__ * a_dim1 + 1], &c__1);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + i__ * a_dim1;
+ aii = a[i__2].r;
+ if (i__ < *n) {
+ i__2 = i__ + i__ * a_dim1;
+ i__3 = *n - i__;
+ zdotc_(&z__1, &i__3, &a[i__ + 1 + i__ * a_dim1], &c__1, &a[
+ i__ + 1 + i__ * a_dim1], &c__1);
+ d__1 = aii * aii + z__1.r;
+ a[i__2].r = d__1, a[i__2].i = 0.;
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[i__ + a_dim1], lda);
+ i__2 = *n - i__;
+ i__3 = i__ - 1;
+ z__1.r = aii, z__1.i = 0.;
+ zgemv_("Conjugate transpose", &i__2, &i__3, &c_b57, &a[i__ +
+ 1 + a_dim1], lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &
+ z__1, &a[i__ + a_dim1], lda);
+ i__2 = i__ - 1;
+ zlacgv_(&i__2, &a[i__ + a_dim1], lda);
+ } else {
+ zdscal_(&i__, &aii, &a[i__ + a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of ZLAUU2 */
+
+} /* zlauu2_ */
+
+/* Subroutine */ int zlauum_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, ib, nb;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), zherk_(char *, char *, integer *,
+ integer *, doublereal *, doublecomplex *, integer *, doublereal *,
+ doublecomplex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *),
+ zlauu2_(char *, integer *, doublecomplex *, integer *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZLAUUM computes the product U * U' or L' * L, where the triangular
+ factor U or L is stored in the upper or lower triangular part of
+ the array A.
+
+ If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+ overwriting the factor U in A.
+ If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+ overwriting the factor L in A.
+
+ This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the triangular factor stored in the array A
+ is upper or lower triangular:
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the triangular factor U or L. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the triangular factor U or L.
+ On exit, if UPLO = 'U', the upper triangle of A is
+ overwritten with the upper triangle of the product U * U';
+ if UPLO = 'L', the lower triangle of A is overwritten with
+ the lower triangle of the product L' * L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZLAUUM", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "ZLAUUM", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ zlauu2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute the product U * U'. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ ztrmm_("Right", "Upper", "Conjugate transpose", "Non-unit", &
+ i__3, &ib, &c_b57, &a[i__ + i__ * a_dim1], lda, &a[
+ i__ * a_dim1 + 1], lda);
+ zlauu2_("Upper", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ zgemm_("No transpose", "Conjugate transpose", &i__3, &ib,
+ &i__4, &c_b57, &a[(i__ + ib) * a_dim1 + 1], lda, &
+ a[i__ + (i__ + ib) * a_dim1], lda, &c_b57, &a[i__
+ * a_dim1 + 1], lda);
+ i__3 = *n - i__ - ib + 1;
+ zherk_("Upper", "No transpose", &ib, &i__3, &c_b1034, &a[
+ i__ + (i__ + ib) * a_dim1], lda, &c_b1034, &a[i__
+ + i__ * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the product L' * L. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
+/* Computing MIN */
+ i__3 = nb, i__4 = *n - i__ + 1;
+ ib = min(i__3,i__4);
+ i__3 = i__ - 1;
+ ztrmm_("Left", "Lower", "Conjugate transpose", "Non-unit", &
+ ib, &i__3, &c_b57, &a[i__ + i__ * a_dim1], lda, &a[
+ i__ + a_dim1], lda);
+ zlauu2_("Lower", &ib, &a[i__ + i__ * a_dim1], lda, info);
+ if (i__ + ib <= *n) {
+ i__3 = i__ - 1;
+ i__4 = *n - i__ - ib + 1;
+ zgemm_("Conjugate transpose", "No transpose", &ib, &i__3,
+ &i__4, &c_b57, &a[i__ + ib + i__ * a_dim1], lda, &
+ a[i__ + ib + a_dim1], lda, &c_b57, &a[i__ +
+ a_dim1], lda);
+ i__3 = *n - i__ - ib + 1;
+ zherk_("Lower", "Conjugate transpose", &ib, &i__3, &
+ c_b1034, &a[i__ + ib + i__ * a_dim1], lda, &
+ c_b1034, &a[i__ + i__ * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZLAUUM */
+
+} /* zlauum_ */
+
+/* Subroutine */ int zpotf2_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublereal d__1;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer j;
+ static doublereal ajj;
+ extern logical lsame_(char *, char *);
+ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *);
+ extern /* Subroutine */ int zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *);
+ static logical upper;
+ extern logical disnan_(doublereal *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), zdscal_(
+ integer *, doublereal *, doublecomplex *, integer *), zlacgv_(
+ integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZPOTF2 computes the Cholesky factorization of a complex Hermitian
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U' * U , if UPLO = 'U', or
+ A = L * L', if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the upper or lower triangular part of the
+ Hermitian matrix A is stored.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ n by n upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U'*U or A = L*L'.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+ > 0: if INFO = k, the leading minor of order k is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
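+
+/*
+   Worked example (not from the original LAPACK translation), n = 2,
+   UPLO = 'L'.  For the Hermitian positive definite matrix
+
+       A = (   4      2 - 2i )
+           ( 2 + 2i     11   )
+
+   the loop below produces, column by column,
+
+       L(1,1) = sqrt(4)               = 2
+       L(2,1) = A(2,1) / L(1,1)       = 1 + i
+       L(2,2) = sqrt(11 - |L(2,1)|^2) = 3
+
+   so that A = L * L' with L = ( 2, 0; 1+i, 3 ).
+*/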
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZPOTF2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute U(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j + j * a_dim1;
+ d__1 = a[i__2].r;
+ i__3 = j - 1;
+ zdotc_(&z__2, &i__3, &a[j * a_dim1 + 1], &c__1, &a[j * a_dim1 + 1]
+ , &c__1);
+ z__1.r = d__1 - z__2.r, z__1.i = -z__2.i;
+ ajj = z__1.r;
+ if (ajj <= 0. || disnan_(&ajj)) {
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.;
+
+/* Compute elements J+1:N of row J. */
+
+ if (j < *n) {
+ i__2 = j - 1;
+ zlacgv_(&i__2, &a[j * a_dim1 + 1], &c__1);
+ i__2 = j - 1;
+ i__3 = *n - j;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("Transpose", &i__2, &i__3, &z__1, &a[(j + 1) * a_dim1
+ + 1], lda, &a[j * a_dim1 + 1], &c__1, &c_b57, &a[j + (
+ j + 1) * a_dim1], lda);
+ i__2 = j - 1;
+ zlacgv_(&i__2, &a[j * a_dim1 + 1], &c__1);
+ i__2 = *n - j;
+ d__1 = 1. / ajj;
+ zdscal_(&i__2, &d__1, &a[j + (j + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+
+/* Compute L(J,J) and test for non-positive-definiteness. */
+
+ i__2 = j + j * a_dim1;
+ d__1 = a[i__2].r;
+ i__3 = j - 1;
+ zdotc_(&z__2, &i__3, &a[j + a_dim1], lda, &a[j + a_dim1], lda);
+ z__1.r = d__1 - z__2.r, z__1.i = -z__2.i;
+ ajj = z__1.r;
+ if (ajj <= 0. || disnan_(&ajj)) {
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.;
+ goto L30;
+ }
+ ajj = sqrt(ajj);
+ i__2 = j + j * a_dim1;
+ a[i__2].r = ajj, a[i__2].i = 0.;
+
+/* Compute elements J+1:N of column J. */
+
+ if (j < *n) {
+ i__2 = j - 1;
+ zlacgv_(&i__2, &a[j + a_dim1], lda);
+ i__2 = *n - j;
+ i__3 = j - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__2, &i__3, &z__1, &a[j + 1 + a_dim1]
+ , lda, &a[j + a_dim1], lda, &c_b57, &a[j + 1 + j *
+ a_dim1], &c__1);
+ i__2 = j - 1;
+ zlacgv_(&i__2, &a[j + a_dim1], lda);
+ i__2 = *n - j;
+ d__1 = 1. / ajj;
+ zdscal_(&i__2, &d__1, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+ goto L40;
+
+L30:
+ *info = j;
+
+L40:
+ return 0;
+
+/* End of ZPOTF2 */
+
+} /* zpotf2_ */
+
+/* Subroutine */ int zpotrf_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer j, jb, nb;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), zherk_(char *, char *, integer *,
+ integer *, doublereal *, doublecomplex *, integer *, doublereal *,
+ doublecomplex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *),
+ zpotf2_(char *, integer *, doublecomplex *, integer *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZPOTRF computes the Cholesky factorization of a complex Hermitian
+ positive definite matrix A.
+
+ The factorization has the form
+ A = U**H * U, if UPLO = 'U', or
+ A = L * L**H, if UPLO = 'L',
+ where U is an upper triangular matrix and L is lower triangular.
+
+ This is the block version of the algorithm, calling Level 3 BLAS.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the Hermitian matrix A. If UPLO = 'U', the leading
+ N-by-N upper triangular part of A contains the upper
+ triangular part of the matrix A, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of A contains the lower
+ triangular part of the matrix A, and the strictly upper
+ triangular part of A is not referenced.
+
+ On exit, if INFO = 0, the factor U or L from the Cholesky
+ factorization A = U**H*U or A = L*L**H.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the leading minor of order i is not
+ positive definite, and the factorization could not be
+ completed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
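+
+/*
+   Note (not from the original LAPACK translation): the blocked loop below
+   follows from the 2-by-2 block identity (UPLO = 'U', with ' denoting the
+   conjugate transpose)
+
+       ( A11  A12 )   ( U11'   0  ) ( U11  U12 )
+       ( A12' A22 ) = ( U12'  U22') (  0   U22 )
+
+   so A11 = U11'*U11, A12 = U11'*U12 and A22 = U12'*U12 + U22'*U22.  Hence
+   each diagonal block is factored by ZPOTF2 after the contributions of the
+   previously computed block rows have been subtracted (ZHERK), and each
+   new block row U12 is obtained from A12 by a GEMM update followed by a
+   triangular solve with U11' (ZTRSM).
+*/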
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZPOTRF", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Determine the block size for this environment. */
+
+ nb = ilaenv_(&c__1, "ZPOTRF", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code. */
+
+ zpotf2_(uplo, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code. */
+
+ if (upper) {
+
+/* Compute the Cholesky factorization A = U'*U. */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ zherk_("Upper", "Conjugate transpose", &jb, &i__3, &c_b1276, &
+ a[j * a_dim1 + 1], lda, &c_b1034, &a[j + j * a_dim1],
+ lda);
+ zpotf2_("Upper", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block row. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("Conjugate transpose", "No transpose", &jb, &i__3,
+ &i__4, &z__1, &a[j * a_dim1 + 1], lda, &a[(j + jb)
+ * a_dim1 + 1], lda, &c_b57, &a[j + (j + jb) *
+ a_dim1], lda);
+ i__3 = *n - j - jb + 1;
+ ztrsm_("Left", "Upper", "Conjugate transpose", "Non-unit",
+ &jb, &i__3, &c_b57, &a[j + j * a_dim1], lda, &a[
+ j + (j + jb) * a_dim1], lda);
+ }
+/* L10: */
+ }
+
+ } else {
+
+/* Compute the Cholesky factorization A = L*L'. */
+
+ i__2 = *n;
+ i__1 = nb;
+ for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
+
+/*
+ Update and factorize the current diagonal block and test
+ for non-positive-definiteness.
+
+ Computing MIN
+*/
+ i__3 = nb, i__4 = *n - j + 1;
+ jb = min(i__3,i__4);
+ i__3 = j - 1;
+ zherk_("Lower", "No transpose", &jb, &i__3, &c_b1276, &a[j +
+ a_dim1], lda, &c_b1034, &a[j + j * a_dim1], lda);
+ zpotf2_("Lower", &jb, &a[j + j * a_dim1], lda, info);
+ if (*info != 0) {
+ goto L30;
+ }
+ if (j + jb <= *n) {
+
+/* Compute the current block column. */
+
+ i__3 = *n - j - jb + 1;
+ i__4 = j - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemm_("No transpose", "Conjugate transpose", &i__3, &jb,
+ &i__4, &z__1, &a[j + jb + a_dim1], lda, &a[j +
+ a_dim1], lda, &c_b57, &a[j + jb + j * a_dim1],
+ lda);
+ i__3 = *n - j - jb + 1;
+ ztrsm_("Right", "Lower", "Conjugate transpose", "Non-unit"
+ , &i__3, &jb, &c_b57, &a[j + j * a_dim1], lda, &a[
+ j + jb + j * a_dim1], lda);
+ }
+/* L20: */
+ }
+ }
+ }
+ goto L40;
+
+L30:
+ *info = *info + j - 1;
+
+L40:
+ return 0;
+
+/* End of ZPOTRF */
+
+} /* zpotrf_ */
+
+/* Subroutine */ int zpotri_(char *uplo, integer *n, doublecomplex *a,
+ integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int xerbla_(char *, integer *), zlauum_(
+ char *, integer *, doublecomplex *, integer *, integer *),
+ ztrtri_(char *, char *, integer *, doublecomplex *, integer *,
+ integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZPOTRI computes the inverse of a complex Hermitian positive definite
+ matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+ computed by ZPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the triangular factor U or L from the Cholesky
+ factorization A = U**H*U or A = L*L**H, as computed by
+ ZPOTRF.
+ On exit, the upper or lower triangle of the (Hermitian)
+ inverse of A, overwriting the input factor U or L.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, the (i,i) element of the factor U or L is
+ zero, and the inverse could not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
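+
+/*
+   Illustrative usage sketch (not from the original LAPACK translation);
+   the local names are invented for the example.
+*/
+#if 0
+    integer n_loc = 2, lda_loc = 2, info_loc;
+    doublecomplex a_loc[4];
+    /* ... fill the upper triangle of a_loc with a Hermitian positive
+       definite matrix in column-major order ... */
+    zpotrf_("Upper", &n_loc, a_loc, &lda_loc, &info_loc);   /* A = U'*U */
+    if (info_loc == 0) {
+        zpotri_("Upper", &n_loc, a_loc, &lda_loc, &info_loc);
+    }
+    /* On success the upper triangle of a_loc holds the upper triangle of
+       inv(A); the lower triangle follows by conjugate transposition, since
+       inv(A) is Hermitian. */
+#endif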
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*lda < max(1,*n)) {
+ *info = -4;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZPOTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Invert the triangular Cholesky factor U or L. */
+
+ ztrtri_(uplo, "Non-unit", n, &a[a_offset], lda, info);
+ if (*info > 0) {
+ return 0;
+ }
+
+/* Form inv(U)*inv(U)' or inv(L)'*inv(L). */
+
+ zlauum_(uplo, n, &a[a_offset], lda, info);
+
+ return 0;
+
+/* End of ZPOTRI */
+
+} /* zpotri_ */
+
+/* Subroutine */ int zpotrs_(char *uplo, integer *n, integer *nrhs,
+ doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb,
+ integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+ /* Local variables */
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *),
+ xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZPOTRS solves a system of linear equations A*X = B with a Hermitian
+ positive definite matrix A using the Cholesky factorization
+ A = U**H*U or A = L*L**H computed by ZPOTRF.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A is stored;
+ = 'L': Lower triangle of A is stored.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of right hand sides, i.e., the number of columns
+ of the matrix B. NRHS >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,N)
+ The triangular factor U or L from the Cholesky factorization
+ A = U**H*U or A = L*L**H, as computed by ZPOTRF.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ B (input/output) COMPLEX*16 array, dimension (LDB,NRHS)
+ On entry, the right hand side matrix B.
+ On exit, the solution matrix X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. LDB >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
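+
+/*
+   Illustrative usage sketch (not from the original LAPACK translation):
+   the usual factor-then-solve sequence on a small real (stored as complex)
+   positive definite system.  The local names are invented for the example.
+*/
+#if 0
+    integer n_loc = 2, nrhs_loc = 1, lda_loc = 2, ldb_loc = 2, info_loc;
+    doublecomplex a_loc[4], b_loc[2];
+    a_loc[0].r = 4., a_loc[0].i = 0.;   /* A(1,1)                      */
+    a_loc[1].r = 2., a_loc[1].i = 0.;   /* A(2,1); UPLO = 'L', so only */
+    a_loc[3].r = 3., a_loc[3].i = 0.;   /* the lower triangle is read  */
+    b_loc[0].r = 6., b_loc[0].i = 0.;
+    b_loc[1].r = 5., b_loc[1].i = 0.;
+    zpotrf_("Lower", &n_loc, a_loc, &lda_loc, &info_loc);    /* A = L*L' */
+    if (info_loc == 0) {
+        zpotrs_("Lower", &n_loc, &nrhs_loc, a_loc, &lda_loc, b_loc,
+                &ldb_loc, &info_loc);
+    }
+    /* b_loc now holds the solution x = (1, 1). */
+#endif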
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = *ldb;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*nrhs < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*ldb < max(1,*n)) {
+ *info = -7;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZPOTRS", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0 || *nrhs == 0) {
+ return 0;
+ }
+
+ if (upper) {
+
+/*
+ Solve A*X = B where A = U'*U.
+
+ Solve U'*X = B, overwriting B with X.
+*/
+
+ ztrsm_("Left", "Upper", "Conjugate transpose", "Non-unit", n, nrhs, &
+ c_b57, &a[a_offset], lda, &b[b_offset], ldb);
+
+/* Solve U*X = B, overwriting B with X. */
+
+ ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b57, &
+ a[a_offset], lda, &b[b_offset], ldb);
+ } else {
+
+/*
+ Solve A*X = B where A = L*L'.
+
+ Solve L*X = B, overwriting B with X.
+*/
+
+ ztrsm_("Left", "Lower", "No transpose", "Non-unit", n, nrhs, &c_b57, &
+ a[a_offset], lda, &b[b_offset], ldb);
+
+/* Solve L'*X = B, overwriting B with X. */
+
+ ztrsm_("Left", "Lower", "Conjugate transpose", "Non-unit", n, nrhs, &
+ c_b57, &a[a_offset], lda, &b[b_offset], ldb);
+ }
+
+ return 0;
+
+/* End of ZPOTRS */
+
+} /* zpotrs_ */
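+
+/*
+   Usage sketch (illustrative only, not part of the f2c-translated LAPACK
+   source): solves A*X = B with zpotrs_ given an already computed Cholesky
+   factor.  The helper name and the 2x2 diagonal test data are
+   hypothetical; the f2c typedefs are assumed from this file's header.
+*/
+static int example_zpotrs_usage(void)
+{
+    /* A = diag(4, 9) with upper Cholesky factor U = diag(2, 3), stored
+       column-major with LDA = 2. */
+    doublecomplex u[4] = { {2., 0.}, {0., 0.}, {0., 0.}, {3., 0.} };
+    /* One right-hand side b = (8, 18)^T; overwritten with x = (2, 2)^T. */
+    doublecomplex b[2] = { {8., 0.}, {18., 0.} };
+    integer n = 2, nrhs = 1, lda = 2, ldb = 2, info = 0;
+
+    zpotrs_("U", &n, &nrhs, u, &lda, b, &ldb, &info);
+    return (int) info;   /* 0 on success */
+}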
+
+/* Subroutine */ int zrot_(integer *n, doublecomplex *cx, integer *incx,
+ doublecomplex *cy, integer *incy, doublereal *c__, doublecomplex *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ static integer i__, ix, iy;
+ static doublecomplex stemp;
+
+
+/*
+ -- LAPACK auxiliary routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZROT applies a plane rotation, where the cos (C) is real and the
+ sin (S) is complex, and the vectors CX and CY are complex.
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The number of elements in the vectors CX and CY.
+
+ CX (input/output) COMPLEX*16 array, dimension (N)
+ On input, the vector X.
+ On output, CX is overwritten with C*X + S*Y.
+
+ INCX (input) INTEGER
+           The increment between successive values of CX. INCX <> 0.
+
+ CY (input/output) COMPLEX*16 array, dimension (N)
+ On input, the vector Y.
+ On output, CY is overwritten with -CONJG(S)*X + C*Y.
+
+ INCY (input) INTEGER
+           The increment between successive values of CY. INCY <> 0.
+
+ C (input) DOUBLE PRECISION
+ S (input) COMPLEX*16
+ C and S define a rotation
+ [ C S ]
+ [ -conjg(S) C ]
+ where C*C + S*CONJG(S) = 1.0.
+
+ =====================================================================
+*/
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r, stemp.i = z__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ d_cnjg(&z__4, s);
+ i__4 = ix;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r *
+ cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r, stemp.i = z__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ d_cnjg(&z__4, s);
+ i__4 = i__;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r *
+ cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+/* L30: */
+ }
+ return 0;
+} /* zrot_ */
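+
+/*
+   Usage sketch (illustrative only, not part of the f2c-translated LAPACK
+   source): applies the plane rotation [c s; -conjg(s) c] to two length-2
+   vectors with zrot_.  The helper name and the data are hypothetical;
+   the f2c typedefs are assumed from this file's header.
+*/
+static int example_zrot_usage(void)
+{
+    doublecomplex cx[2] = { {1., 0.}, {0., 1.} };
+    doublecomplex cy[2] = { {0., 0.}, {1., 0.} };
+    doublecomplex s = {0.6, 0.};          /* real sine, for simplicity */
+    doublereal c = 0.8;                   /* c*c + s*conjg(s) = 1 */
+    integer n = 2, incx = 1, incy = 1;
+
+    /* On return cx = c*x + s*y and cy = -conjg(s)*x + c*y. */
+    zrot_(&n, cx, &incx, cy, &incy, &c, &s);
+    return 0;
+}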
+
+/* Subroutine */ int zstedc_(char *compz, integer *n, doublereal *d__,
+ doublereal *e, doublecomplex *z__, integer *ldz, doublecomplex *work,
+ integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork,
+ integer *liwork, integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2, i__3, i__4;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static integer i__, j, k, m;
+ static doublereal p;
+ static integer ii, ll, lgn;
+ static doublereal eps, tiny;
+ extern logical lsame_(char *, char *);
+ static integer lwmin, start;
+ extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), zlaed0_(integer *, integer *,
+ doublereal *, doublereal *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublereal *, integer *, integer *);
+
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *), dstedc_(char *, integer *,
+ doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+ integer *, integer *, integer *, integer *), dlaset_(
+ char *, integer *, integer *, doublereal *, doublereal *,
+ doublereal *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer finish;
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *,
+ integer *), zlacrm_(integer *, integer *, doublecomplex *,
+ integer *, doublereal *, integer *, doublecomplex *, integer *,
+ doublereal *);
+ static integer liwmin, icompz;
+ extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *,
+ doublereal *, doublereal *, integer *, doublereal *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *);
+ static doublereal orgnrm;
+ static integer lrwmin;
+ static logical lquery;
+ static integer smlsiz;
+ extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *,
+ doublereal *, doublecomplex *, integer *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZSTEDC computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the divide and conquer method.
+ The eigenvectors of a full or band complex Hermitian matrix can also
+ be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
+ matrix to tridiagonal form.
+
+ This code makes very mild assumptions about floating point
+ arithmetic. It will work on machines with a guard digit in
+ add/subtract, or on those binary machines without guard digits
+ which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+ It could conceivably fail on hexadecimal or decimal machines
+ without guard digits, but we know of none. See DLAED3 for details.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'I': Compute eigenvectors of tridiagonal matrix also.
+ = 'V': Compute eigenvectors of original Hermitian matrix
+ also. On entry, Z contains the unitary matrix used
+ to reduce the original matrix to tridiagonal form.
+
+ N (input) INTEGER
+ The dimension of the symmetric tridiagonal matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the subdiagonal elements of the tridiagonal matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ,N)
+ On entry, if COMPZ = 'V', then Z contains the unitary
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original Hermitian matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1.
+ If eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If COMPZ = 'N' or 'I', or N <= 1, LWORK must be at least 1.
+ If COMPZ = 'V' and N > 1, LWORK must be at least N*N.
+ Note that for COMPZ = 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LWORK need
+ only be 1.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal sizes of the WORK, RWORK and
+ IWORK arrays, returns these values as the first entries of
+ the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ RWORK (workspace/output) DOUBLE PRECISION array,
+ dimension (LRWORK)
+ On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
+
+ LRWORK (input) INTEGER
+ The dimension of the array RWORK.
+ If COMPZ = 'N' or N <= 1, LRWORK must be at least 1.
+ If COMPZ = 'V' and N > 1, LRWORK must be at least
+ 1 + 3*N + 2*N*lg N + 3*N**2 ,
+ where lg( N ) = smallest integer k such
+ that 2**k >= N.
+ If COMPZ = 'I' and N > 1, LRWORK must be at least
+ 1 + 4*N + 2*N**2 .
+ Note that for COMPZ = 'I' or 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LRWORK
+ need only be max(1,2*(N-1)).
+
+ If LRWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
+ On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+
+ LIWORK (input) INTEGER
+ The dimension of the array IWORK.
+ If COMPZ = 'N' or N <= 1, LIWORK must be at least 1.
+          If COMPZ = 'V' and N > 1, LIWORK must be at least
+ 6 + 6*N + 5*N*lg N.
+          If COMPZ = 'I' and N > 1, LIWORK must be at least
+ 3 + 5*N .
+ Note that for COMPZ = 'I' or 'V', then if N is less than or
+ equal to the minimum divide size, usually 25, then LIWORK
+ need only be 1.
+
+ If LIWORK = -1, then a workspace query is assumed; the
+ routine only calculates the optimal sizes of the WORK, RWORK
+ and IWORK arrays, returns these values as the first entries
+ of the WORK, RWORK and IWORK arrays, and no error message
+ related to LWORK or LRWORK or LIWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit.
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+ > 0: The algorithm failed to compute an eigenvalue while
+ working on the submatrix lying in rows and columns
+ INFO/(N+1) through mod(INFO,N+1).
+
+ Further Details
+ ===============
+
+ Based on contributions by
+ Jeff Rutter, Computer Science Division, University of California
+ at Berkeley, USA
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+ --rwork;
+ --iwork;
+
+ /* Function Body */
+ *info = 0;
+ lquery = *lwork == -1 || *lrwork == -1 || *liwork == -1;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+
+ if (*info == 0) {
+
+/* Compute the workspace requirements */
+
+ smlsiz = ilaenv_(&c__9, "ZSTEDC", " ", &c__0, &c__0, &c__0, &c__0, (
+ ftnlen)6, (ftnlen)1);
+ if (*n <= 1 || icompz == 0) {
+ lwmin = 1;
+ liwmin = 1;
+ lrwmin = 1;
+ } else if (*n <= smlsiz) {
+ lwmin = 1;
+ liwmin = 1;
+ lrwmin = *n - 1 << 1;
+ } else if (icompz == 1) {
+ lgn = (integer) (log((doublereal) (*n)) / log(2.));
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ if (pow_ii(&c__2, &lgn) < *n) {
+ ++lgn;
+ }
+ lwmin = *n * *n;
+/* Computing 2nd power */
+ i__1 = *n;
+ lrwmin = *n * 3 + 1 + (*n << 1) * lgn + i__1 * i__1 * 3;
+ liwmin = *n * 6 + 6 + *n * 5 * lgn;
+ } else if (icompz == 2) {
+ lwmin = 1;
+/* Computing 2nd power */
+ i__1 = *n;
+ lrwmin = (*n << 2) + 1 + (i__1 * i__1 << 1);
+ liwmin = *n * 5 + 3;
+ }
+ work[1].r = (doublereal) lwmin, work[1].i = 0.;
+ rwork[1] = (doublereal) lrwmin;
+ iwork[1] = liwmin;
+
+ if (*lwork < lwmin && ! lquery) {
+ *info = -8;
+ } else if (*lrwork < lrwmin && ! lquery) {
+ *info = -10;
+ } else if (*liwork < liwmin && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZSTEDC", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+ if (*n == 1) {
+ if (icompz != 0) {
+ i__1 = z_dim1 + 1;
+ z__[i__1].r = 1., z__[i__1].i = 0.;
+ }
+ return 0;
+ }
+
+/*
+ If the following conditional clause is removed, then the routine
+ will use the Divide and Conquer routine to compute only the
+ eigenvalues, which requires (3N + 3N**2) real workspace and
+ (2 + 5N + 2N lg(N)) integer workspace.
+ Since on many architectures DSTERF is much faster than any other
+ algorithm for finding eigenvalues only, it is used here
+ as the default. If the conditional clause is removed, then
+ information on the size of workspace needs to be changed.
+
+ If COMPZ = 'N', use DSTERF to compute the eigenvalues.
+*/
+
+ if (icompz == 0) {
+ dsterf_(n, &d__[1], &e[1], info);
+ goto L70;
+ }
+
+/*
+ If N is smaller than the minimum divide size (SMLSIZ+1), then
+ solve the problem with another solver.
+*/
+
+ if (*n <= smlsiz) {
+
+ zsteqr_(compz, n, &d__[1], &e[1], &z__[z_offset], ldz, &rwork[1],
+ info);
+
+ } else {
+
+/* If COMPZ = 'I', we simply call DSTEDC instead. */
+
+ if (icompz == 2) {
+ dlaset_("Full", n, n, &c_b328, &c_b1034, &rwork[1], n);
+ ll = *n * *n + 1;
+ i__1 = *lrwork - ll + 1;
+ dstedc_("I", n, &d__[1], &e[1], &rwork[1], n, &rwork[ll], &i__1, &
+ iwork[1], liwork, info);
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * z_dim1;
+ i__4 = (j - 1) * *n + i__;
+ z__[i__3].r = rwork[i__4], z__[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ goto L70;
+ }
+
+/*
+ From now on, only option left to be handled is COMPZ = 'V',
+ i.e. ICOMPZ = 1.
+
+ Scale.
+*/
+
+ orgnrm = dlanst_("M", n, &d__[1], &e[1]);
+ if (orgnrm == 0.) {
+ goto L70;
+ }
+
+ eps = EPSILON;
+
+ start = 1;
+
+/* while ( START <= N ) */
+
+L30:
+ if (start <= *n) {
+
+/*
+ Let FINISH be the position of the next subdiagonal entry
+ such that E( FINISH ) <= TINY or FINISH = N if no such
+ subdiagonal exists. The matrix identified by the elements
+ between START and FINISH constitutes an independent
+ sub-problem.
+*/
+
+ finish = start;
+L40:
+ if (finish < *n) {
+ tiny = eps * sqrt((d__1 = d__[finish], abs(d__1))) * sqrt((
+ d__2 = d__[finish + 1], abs(d__2)));
+ if ((d__1 = e[finish], abs(d__1)) > tiny) {
+ ++finish;
+ goto L40;
+ }
+ }
+
+/* (Sub) Problem determined. Compute its size and solve it. */
+
+ m = finish - start + 1;
+ if (m > smlsiz) {
+
+/* Scale. */
+
+ orgnrm = dlanst_("M", &m, &d__[start], &e[start]);
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &m, &c__1, &d__[
+ start], &m, info);
+ i__1 = m - 1;
+ i__2 = m - 1;
+ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &i__1, &c__1, &
+ e[start], &i__2, info);
+
+ zlaed0_(n, &m, &d__[start], &e[start], &z__[start * z_dim1 +
+ 1], ldz, &work[1], n, &rwork[1], &iwork[1], info);
+ if (*info > 0) {
+ *info = (*info / (m + 1) + start - 1) * (*n + 1) + *info %
+ (m + 1) + start - 1;
+ goto L70;
+ }
+
+/* Scale back. */
+
+ dlascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, &m, &c__1, &d__[
+ start], &m, info);
+
+ } else {
+ dsteqr_("I", &m, &d__[start], &e[start], &rwork[1], &m, &
+ rwork[m * m + 1], info);
+ zlacrm_(n, &m, &z__[start * z_dim1 + 1], ldz, &rwork[1], &m, &
+ work[1], n, &rwork[m * m + 1]);
+ zlacpy_("A", n, &m, &work[1], n, &z__[start * z_dim1 + 1],
+ ldz);
+ if (*info > 0) {
+ *info = start * (*n + 1) + finish;
+ goto L70;
+ }
+ }
+
+ start = finish + 1;
+ goto L30;
+ }
+
+/*
+ endwhile
+
+ If the problem split any number of times, then the eigenvalues
+ will not be properly ordered. Here we permute the eigenvalues
+ (and the associated eigenvectors) into ascending order.
+*/
+
+ if (m != *n) {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L50: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ zswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k * z_dim1
+ + 1], &c__1);
+ }
+/* L60: */
+ }
+ }
+ }
+
+L70:
+ work[1].r = (doublereal) lwmin, work[1].i = 0.;
+ rwork[1] = (doublereal) lrwmin;
+ iwork[1] = liwmin;
+
+ return 0;
+
+/* End of ZSTEDC */
+
+} /* zstedc_ */
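+
+/*
+   Usage sketch (illustrative only, not part of the f2c-translated LAPACK
+   source): the LWORK = LRWORK = LIWORK = -1 workspace query described in
+   the comments above, followed by a small COMPZ = 'I' call.  The helper
+   name and the 3x3 tridiagonal data are hypothetical; the f2c typedefs
+   are assumed from this file's header, and the fixed workspace sizes rely
+   on N being far below the minimum divide size.
+*/
+static int example_zstedc_usage(void)
+{
+    doublereal d[3] = { 2., 2., 2. };     /* diagonal */
+    doublereal e[2] = { 1., 1. };         /* subdiagonal */
+    doublecomplex z[9];                   /* eigenvectors, LDZ = 3 */
+    doublecomplex wquery;
+    doublereal rwquery;
+    integer iwquery;
+    integer n = 3, ldz = 3, lwork = -1, lrwork = -1, liwork = -1, info = 0;
+
+    /* Workspace query: the optimal sizes come back in the first entries
+       of the (length-1) workspace arguments. */
+    zstedc_("I", &n, d, e, z, &ldz, &wquery, &lwork, &rwquery, &lrwork,
+            &iwquery, &liwork, &info);
+    if (info != 0) {
+        return (int) info;
+    }
+
+    /* A real caller would allocate workspaces of the reported sizes; for
+       this tiny problem the minimum sizes fit comfortably on the stack. */
+    {
+        doublecomplex work[1];
+        doublereal rwork[32];
+        integer iwork[32];
+        lwork = 1;
+        lrwork = 32;
+        liwork = 32;
+        zstedc_("I", &n, d, e, z, &ldz, work, &lwork, rwork, &lrwork,
+                iwork, &liwork, &info);
+    }
+    return (int) info;   /* d holds ascending eigenvalues on success */
+}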
+
+/* Subroutine */ int zsteqr_(char *compz, integer *n, doublereal *d__,
+ doublereal *e, doublecomplex *z__, integer *ldz, doublereal *work,
+ integer *info)
+{
+ /* System generated locals */
+ integer z_dim1, z_offset, i__1, i__2;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ static doublereal b, c__, f, g;
+ static integer i__, j, k, l, m;
+ static doublereal p, r__, s;
+ static integer l1, ii, mm, lm1, mm1, nm1;
+ static doublereal rt1, rt2, eps;
+ static integer lsv;
+ static doublereal tst, eps2;
+ static integer lend, jtot;
+ extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal
+ *, doublereal *, doublereal *);
+ extern logical lsame_(char *, char *);
+ static doublereal anorm;
+ extern /* Subroutine */ int zlasr_(char *, char *, char *, integer *,
+ integer *, doublereal *, doublereal *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *,
+ integer *, doublecomplex *, integer *), dlaev2_(doublereal *,
+ doublereal *, doublereal *, doublereal *, doublereal *,
+ doublereal *, doublereal *);
+ static integer lendm1, lendp1;
+
+ static integer iscale;
+ extern /* Subroutine */ int dlascl_(char *, integer *, integer *,
+ doublereal *, doublereal *, integer *, integer *, doublereal *,
+ integer *, integer *);
+ static doublereal safmin;
+ extern /* Subroutine */ int dlartg_(doublereal *, doublereal *,
+ doublereal *, doublereal *, doublereal *);
+ static doublereal safmax;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *);
+ extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *,
+ integer *);
+ static integer lendsv;
+ static doublereal ssfmin;
+ static integer nmaxit, icompz;
+ static doublereal ssfmax;
+ extern /* Subroutine */ int zlaset_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZSTEQR computes all eigenvalues and, optionally, eigenvectors of a
+ symmetric tridiagonal matrix using the implicit QL or QR method.
+ The eigenvectors of a full or band complex Hermitian matrix can also
+ be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
+ matrix to tridiagonal form.
+
+ Arguments
+ =========
+
+ COMPZ (input) CHARACTER*1
+ = 'N': Compute eigenvalues only.
+ = 'V': Compute eigenvalues and eigenvectors of the original
+ Hermitian matrix. On entry, Z must contain the
+ unitary matrix used to reduce the original matrix
+ to tridiagonal form.
+ = 'I': Compute eigenvalues and eigenvectors of the
+ tridiagonal matrix. Z is initialized to the identity
+ matrix.
+
+ N (input) INTEGER
+ The order of the matrix. N >= 0.
+
+ D (input/output) DOUBLE PRECISION array, dimension (N)
+ On entry, the diagonal elements of the tridiagonal matrix.
+ On exit, if INFO = 0, the eigenvalues in ascending order.
+
+ E (input/output) DOUBLE PRECISION array, dimension (N-1)
+ On entry, the (n-1) subdiagonal elements of the tridiagonal
+ matrix.
+ On exit, E has been destroyed.
+
+ Z (input/output) COMPLEX*16 array, dimension (LDZ, N)
+ On entry, if COMPZ = 'V', then Z contains the unitary
+ matrix used in the reduction to tridiagonal form.
+ On exit, if INFO = 0, then if COMPZ = 'V', Z contains the
+ orthonormal eigenvectors of the original Hermitian matrix,
+ and if COMPZ = 'I', Z contains the orthonormal eigenvectors
+ of the symmetric tridiagonal matrix.
+ If COMPZ = 'N', then Z is not referenced.
+
+ LDZ (input) INTEGER
+ The leading dimension of the array Z. LDZ >= 1, and if
+ eigenvectors are desired, then LDZ >= max(1,N).
+
+ WORK (workspace) DOUBLE PRECISION array, dimension (max(1,2*N-2))
+ If COMPZ = 'N', then WORK is not referenced.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: the algorithm has failed to find all the eigenvalues in
+ a total of 30*N iterations; if INFO = i, then i
+ elements of E have not converged to zero; on exit, D
+ and E contain the elements of a symmetric tridiagonal
+ matrix which is unitarily similar to the original
+ matrix.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ --d__;
+ --e;
+ z_dim1 = *ldz;
+ z_offset = 1 + z_dim1;
+ z__ -= z_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+
+ if (lsame_(compz, "N")) {
+ icompz = 0;
+ } else if (lsame_(compz, "V")) {
+ icompz = 1;
+ } else if (lsame_(compz, "I")) {
+ icompz = 2;
+ } else {
+ icompz = -1;
+ }
+ if (icompz < 0) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldz < 1 || icompz > 0 && *ldz < max(1,*n)) {
+ *info = -6;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZSTEQR", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+ if (*n == 1) {
+ if (icompz == 2) {
+ i__1 = z_dim1 + 1;
+ z__[i__1].r = 1., z__[i__1].i = 0.;
+ }
+ return 0;
+ }
+
+/* Determine the unit roundoff and over/underflow thresholds. */
+
+ eps = EPSILON;
+/* Computing 2nd power */
+ d__1 = eps;
+ eps2 = d__1 * d__1;
+ safmin = SAFEMINIMUM;
+ safmax = 1. / safmin;
+ ssfmax = sqrt(safmax) / 3.;
+ ssfmin = sqrt(safmin) / eps2;
+
+/*
+ Compute the eigenvalues and eigenvectors of the tridiagonal
+ matrix.
+*/
+
+ if (icompz == 2) {
+ zlaset_("Full", n, n, &c_b56, &c_b57, &z__[z_offset], ldz);
+ }
+
+ nmaxit = *n * 30;
+ jtot = 0;
+
+/*
+ Determine where the matrix splits and choose QL or QR iteration
+ for each block, according to whether top or bottom diagonal
+ element is smaller.
+*/
+
+ l1 = 1;
+ nm1 = *n - 1;
+
+L10:
+ if (l1 > *n) {
+ goto L160;
+ }
+ if (l1 > 1) {
+ e[l1 - 1] = 0.;
+ }
+ if (l1 <= nm1) {
+ i__1 = nm1;
+ for (m = l1; m <= i__1; ++m) {
+ tst = (d__1 = e[m], abs(d__1));
+ if (tst == 0.) {
+ goto L30;
+ }
+ if (tst <= sqrt((d__1 = d__[m], abs(d__1))) * sqrt((d__2 = d__[m
+ + 1], abs(d__2))) * eps) {
+ e[m] = 0.;
+ goto L30;
+ }
+/* L20: */
+ }
+ }
+ m = *n;
+
+L30:
+ l = l1;
+ lsv = l;
+ lend = m;
+ lendsv = lend;
+ l1 = m + 1;
+ if (lend == l) {
+ goto L10;
+ }
+
+/* Scale submatrix in rows and columns L to LEND */
+
+ i__1 = lend - l + 1;
+ anorm = dlanst_("I", &i__1, &d__[l], &e[l]);
+ iscale = 0;
+ if (anorm == 0.) {
+ goto L10;
+ }
+ if (anorm > ssfmax) {
+ iscale = 1;
+ i__1 = lend - l + 1;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmax, &i__1, &c__1, &e[l], n,
+ info);
+ } else if (anorm < ssfmin) {
+ iscale = 2;
+ i__1 = lend - l + 1;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &d__[l], n,
+ info);
+ i__1 = lend - l;
+ dlascl_("G", &c__0, &c__0, &anorm, &ssfmin, &i__1, &c__1, &e[l], n,
+ info);
+ }
+
+/* Choose between QL and QR iteration */
+
+ if ((d__1 = d__[lend], abs(d__1)) < (d__2 = d__[l], abs(d__2))) {
+ lend = lsv;
+ l = lendsv;
+ }
+
+ if (lend > l) {
+
+/*
+ QL Iteration
+
+ Look for small subdiagonal element.
+*/
+
+L40:
+ if (l != lend) {
+ lendm1 = lend - 1;
+ i__1 = lendm1;
+ for (m = l; m <= i__1; ++m) {
+/* Computing 2nd power */
+ d__2 = (d__1 = e[m], abs(d__1));
+ tst = d__2 * d__2;
+ if (tst <= eps2 * (d__1 = d__[m], abs(d__1)) * (d__2 = d__[m
+ + 1], abs(d__2)) + safmin) {
+ goto L60;
+ }
+/* L50: */
+ }
+ }
+
+ m = lend;
+
+L60:
+ if (m < lend) {
+ e[m] = 0.;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L80;
+ }
+
+/*
+       If remaining matrix is 2-by-2, use DLAE2 or DLAEV2
+ to compute its eigensystem.
+*/
+
+ if (m == l + 1) {
+ if (icompz > 0) {
+ dlaev2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2, &c__, &s);
+ work[l] = c__;
+ work[*n - 1 + l] = s;
+ zlasr_("R", "V", "B", n, &c__2, &work[l], &work[*n - 1 + l], &
+ z__[l * z_dim1 + 1], ldz);
+ } else {
+ dlae2_(&d__[l], &e[l], &d__[l + 1], &rt1, &rt2);
+ }
+ d__[l] = rt1;
+ d__[l + 1] = rt2;
+ e[l] = 0.;
+ l += 2;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l + 1] - p) / (e[l] * 2.);
+ r__ = dlapy2_(&g, &c_b1034);
+ g = d__[m] - p + e[l] / (g + d_sign(&r__, &g));
+
+ s = 1.;
+ c__ = 1.;
+ p = 0.;
+
+/* Inner loop */
+
+ mm1 = m - 1;
+ i__1 = l;
+ for (i__ = mm1; i__ >= i__1; --i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ dlartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m - 1) {
+ e[i__ + 1] = r__;
+ }
+ g = d__[i__ + 1] - p;
+ r__ = (d__[i__] - g) * s + c__ * 2. * b;
+ p = s * r__;
+ d__[i__ + 1] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = -s;
+ }
+
+/* L70: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = m - l + 1;
+ zlasr_("R", "V", "B", n, &mm, &work[l], &work[*n - 1 + l], &z__[l
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[l] = g;
+ goto L40;
+
+/* Eigenvalue found. */
+
+L80:
+ d__[l] = p;
+
+ ++l;
+ if (l <= lend) {
+ goto L40;
+ }
+ goto L140;
+
+ } else {
+
+/*
+ QR Iteration
+
+ Look for small superdiagonal element.
+*/
+
+L90:
+ if (l != lend) {
+ lendp1 = lend + 1;
+ i__1 = lendp1;
+ for (m = l; m >= i__1; --m) {
+/* Computing 2nd power */
+ d__2 = (d__1 = e[m - 1], abs(d__1));
+ tst = d__2 * d__2;
+ if (tst <= eps2 * (d__1 = d__[m], abs(d__1)) * (d__2 = d__[m
+ - 1], abs(d__2)) + safmin) {
+ goto L110;
+ }
+/* L100: */
+ }
+ }
+
+ m = lend;
+
+L110:
+ if (m > lend) {
+ e[m - 1] = 0.;
+ }
+ p = d__[l];
+ if (m == l) {
+ goto L130;
+ }
+
+/*
+       If remaining matrix is 2-by-2, use DLAE2 or DLAEV2
+ to compute its eigensystem.
+*/
+
+ if (m == l - 1) {
+ if (icompz > 0) {
+ dlaev2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2, &c__, &s)
+ ;
+ work[m] = c__;
+ work[*n - 1 + m] = s;
+ zlasr_("R", "V", "F", n, &c__2, &work[m], &work[*n - 1 + m], &
+ z__[(l - 1) * z_dim1 + 1], ldz);
+ } else {
+ dlae2_(&d__[l - 1], &e[l - 1], &d__[l], &rt1, &rt2);
+ }
+ d__[l - 1] = rt1;
+ d__[l] = rt2;
+ e[l - 1] = 0.;
+ l += -2;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+ }
+
+ if (jtot == nmaxit) {
+ goto L140;
+ }
+ ++jtot;
+
+/* Form shift. */
+
+ g = (d__[l - 1] - p) / (e[l - 1] * 2.);
+ r__ = dlapy2_(&g, &c_b1034);
+ g = d__[m] - p + e[l - 1] / (g + d_sign(&r__, &g));
+
+ s = 1.;
+ c__ = 1.;
+ p = 0.;
+
+/* Inner loop */
+
+ lm1 = l - 1;
+ i__1 = lm1;
+ for (i__ = m; i__ <= i__1; ++i__) {
+ f = s * e[i__];
+ b = c__ * e[i__];
+ dlartg_(&g, &f, &c__, &s, &r__);
+ if (i__ != m) {
+ e[i__ - 1] = r__;
+ }
+ g = d__[i__] - p;
+ r__ = (d__[i__ + 1] - g) * s + c__ * 2. * b;
+ p = s * r__;
+ d__[i__] = g + p;
+ g = c__ * r__ - b;
+
+/* If eigenvectors are desired, then save rotations. */
+
+ if (icompz > 0) {
+ work[i__] = c__;
+ work[*n - 1 + i__] = s;
+ }
+
+/* L120: */
+ }
+
+/* If eigenvectors are desired, then apply saved rotations. */
+
+ if (icompz > 0) {
+ mm = l - m + 1;
+ zlasr_("R", "V", "F", n, &mm, &work[m], &work[*n - 1 + m], &z__[m
+ * z_dim1 + 1], ldz);
+ }
+
+ d__[l] -= p;
+ e[lm1] = g;
+ goto L90;
+
+/* Eigenvalue found. */
+
+L130:
+ d__[l] = p;
+
+ --l;
+ if (l >= lend) {
+ goto L90;
+ }
+ goto L140;
+
+ }
+
+/* Undo scaling if necessary */
+
+L140:
+ if (iscale == 1) {
+ i__1 = lendsv - lsv + 1;
+ dlascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ dlascl_("G", &c__0, &c__0, &ssfmax, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ } else if (iscale == 2) {
+ i__1 = lendsv - lsv + 1;
+ dlascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &d__[lsv],
+ n, info);
+ i__1 = lendsv - lsv;
+ dlascl_("G", &c__0, &c__0, &ssfmin, &anorm, &i__1, &c__1, &e[lsv], n,
+ info);
+ }
+
+/*
+ Check for no convergence to an eigenvalue after a total
+ of N*MAXIT iterations.
+*/
+
+ if (jtot == nmaxit) {
+ i__1 = *n - 1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (e[i__] != 0.) {
+ ++(*info);
+ }
+/* L150: */
+ }
+ return 0;
+ }
+ goto L10;
+
+/* Order eigenvalues and eigenvectors. */
+
+L160:
+ if (icompz == 0) {
+
+/* Use Quick Sort */
+
+ dlasrt_("I", n, &d__[1], info);
+
+ } else {
+
+/* Use Selection Sort to minimize swaps of eigenvectors */
+
+ i__1 = *n;
+ for (ii = 2; ii <= i__1; ++ii) {
+ i__ = ii - 1;
+ k = i__;
+ p = d__[i__];
+ i__2 = *n;
+ for (j = ii; j <= i__2; ++j) {
+ if (d__[j] < p) {
+ k = j;
+ p = d__[j];
+ }
+/* L170: */
+ }
+ if (k != i__) {
+ d__[k] = d__[i__];
+ d__[i__] = p;
+ zswap_(n, &z__[i__ * z_dim1 + 1], &c__1, &z__[k * z_dim1 + 1],
+ &c__1);
+ }
+/* L180: */
+ }
+ }
+ return 0;
+
+/* End of ZSTEQR */
+
+} /* zsteqr_ */
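+
+/*
+   Usage sketch (illustrative only, not part of the f2c-translated LAPACK
+   source): computes the eigenvalues and eigenvectors of a 2x2 symmetric
+   tridiagonal matrix with zsteqr_ and COMPZ = 'I'.  The helper name and
+   the test data are hypothetical; the f2c typedefs are assumed from this
+   file's header.
+*/
+static int example_zsteqr_usage(void)
+{
+    doublereal d[2] = { 1., 1. };         /* diagonal */
+    doublereal e[1] = { 1. };             /* subdiagonal */
+    doublecomplex z[4];                   /* eigenvectors, LDZ = 2 */
+    doublereal work[2];                   /* max(1, 2*N-2) = 2 */
+    integer n = 2, ldz = 2, info = 0;
+
+    /* The matrix [[1,1],[1,1]] has eigenvalues 0 and 2, returned in
+       ascending order in d; the columns of z hold the eigenvectors. */
+    zsteqr_("I", &n, d, e, z, &ldz, work, &info);
+    return (int) info;
+}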
+
+/* Subroutine */ int ztrevc_(char *side, char *howmny, logical *select,
+ integer *n, doublecomplex *t, integer *ldt, doublecomplex *vl,
+ integer *ldvl, doublecomplex *vr, integer *ldvr, integer *mm, integer
+ *m, doublecomplex *work, doublereal *rwork, integer *info)
+{
+ /* System generated locals */
+ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
+ i__2, i__3, i__4, i__5;
+ doublereal d__1, d__2, d__3;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j, k, ii, ki, is;
+ static doublereal ulp;
+ static logical allv;
+ static doublereal unfl, ovfl, smin;
+ static logical over;
+ static doublereal scale;
+ extern logical lsame_(char *, char *);
+ static doublereal remax;
+ static logical leftv, bothv;
+ extern /* Subroutine */ int zgemv_(char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *, doublecomplex *, doublecomplex *, integer *);
+ static logical somev;
+ extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *), dlabad_(doublereal *, doublereal *);
+
+ extern /* Subroutine */ int xerbla_(char *, integer *), zdscal_(
+ integer *, doublereal *, doublecomplex *, integer *);
+ extern integer izamax_(integer *, doublecomplex *, integer *);
+ static logical rightv;
+ extern doublereal dzasum_(integer *, doublecomplex *, integer *);
+ static doublereal smlnum;
+ extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublereal *, doublereal *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZTREVC computes some or all of the right and/or left eigenvectors of
+ a complex upper triangular matrix T.
+ Matrices of this type are produced by the Schur factorization of
+ a complex general matrix: A = Q*T*Q**H, as computed by ZHSEQR.
+
+ The right eigenvector x and the left eigenvector y of T corresponding
+ to an eigenvalue w are defined by:
+
+ T*x = w*x, (y**H)*T = w*(y**H)
+
+ where y**H denotes the conjugate transpose of the vector y.
+ The eigenvalues are not input to this routine, but are read directly
+ from the diagonal of T.
+
+ This routine returns the matrices X and/or Y of right and left
+ eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+ input matrix. If Q is the unitary factor that reduces a matrix A to
+ Schur form T, then Q*X and Q*Y are the matrices of right and left
+ eigenvectors of A.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'R': compute right eigenvectors only;
+ = 'L': compute left eigenvectors only;
+ = 'B': compute both right and left eigenvectors.
+
+ HOWMNY (input) CHARACTER*1
+ = 'A': compute all right and/or left eigenvectors;
+ = 'B': compute all right and/or left eigenvectors,
+ backtransformed using the matrices supplied in
+ VR and/or VL;
+ = 'S': compute selected right and/or left eigenvectors,
+ as indicated by the logical array SELECT.
+
+ SELECT (input) LOGICAL array, dimension (N)
+ If HOWMNY = 'S', SELECT specifies the eigenvectors to be
+ computed.
+ The eigenvector corresponding to the j-th eigenvalue is
+ computed if SELECT(j) = .TRUE..
+ Not referenced if HOWMNY = 'A' or 'B'.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) COMPLEX*16 array, dimension (LDT,N)
+ The upper triangular matrix T. T is modified, but restored
+ on exit.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ VL (input/output) COMPLEX*16 array, dimension (LDVL,MM)
+ On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must
+ contain an N-by-N matrix Q (usually the unitary matrix Q of
+ Schur vectors returned by ZHSEQR).
+ On exit, if SIDE = 'L' or 'B', VL contains:
+ if HOWMNY = 'A', the matrix Y of left eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*Y;
+ if HOWMNY = 'S', the left eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VL, in the same order as their
+ eigenvalues.
+ Not referenced if SIDE = 'R'.
+
+ LDVL (input) INTEGER
+ The leading dimension of the array VL. LDVL >= 1, and if
+ SIDE = 'L' or 'B', LDVL >= N.
+
+ VR (input/output) COMPLEX*16 array, dimension (LDVR,MM)
+ On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
+ contain an N-by-N matrix Q (usually the unitary matrix Q of
+ Schur vectors returned by ZHSEQR).
+ On exit, if SIDE = 'R' or 'B', VR contains:
+ if HOWMNY = 'A', the matrix X of right eigenvectors of T;
+ if HOWMNY = 'B', the matrix Q*X;
+ if HOWMNY = 'S', the right eigenvectors of T specified by
+ SELECT, stored consecutively in the columns
+ of VR, in the same order as their
+ eigenvalues.
+ Not referenced if SIDE = 'L'.
+
+ LDVR (input) INTEGER
+ The leading dimension of the array VR. LDVR >= 1, and if
+          SIDE = 'R' or 'B', LDVR >= N.
+
+ MM (input) INTEGER
+ The number of columns in the arrays VL and/or VR. MM >= M.
+
+ M (output) INTEGER
+ The number of columns in the arrays VL and/or VR actually
+ used to store the eigenvectors. If HOWMNY = 'A' or 'B', M
+ is set to N. Each selected eigenvector occupies one
+ column.
+
+ WORK (workspace) COMPLEX*16 array, dimension (2*N)
+
+ RWORK (workspace) DOUBLE PRECISION array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ Further Details
+ ===============
+
+ The algorithm used in this program is basically backward (forward)
+    substitution, with scaling to make the code robust against
+ possible overflow.
+
+ Each eigenvector is normalized so that the element of largest
+ magnitude has magnitude 1; here the magnitude of a complex number
+ (x,y) is taken to be |x| + |y|.
+
+ =====================================================================
+
+
+ Decode and test the input parameters
+*/
+
+ /* Parameter adjustments */
+ --select;
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ vl_dim1 = *ldvl;
+ vl_offset = 1 + vl_dim1;
+ vl -= vl_offset;
+ vr_dim1 = *ldvr;
+ vr_offset = 1 + vr_dim1;
+ vr -= vr_offset;
+ --work;
+ --rwork;
+
+ /* Function Body */
+ bothv = lsame_(side, "B");
+ rightv = lsame_(side, "R") || bothv;
+ leftv = lsame_(side, "L") || bothv;
+
+ allv = lsame_(howmny, "A");
+ over = lsame_(howmny, "B");
+ somev = lsame_(howmny, "S");
+
+/*
+ Set M to the number of columns required to store the selected
+ eigenvectors.
+*/
+
+ if (somev) {
+ *m = 0;
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (select[j]) {
+ ++(*m);
+ }
+/* L10: */
+ }
+ } else {
+ *m = *n;
+ }
+
+ *info = 0;
+ if (! rightv && ! leftv) {
+ *info = -1;
+ } else if (! allv && ! over && ! somev) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ldt < max(1,*n)) {
+ *info = -6;
+ } else if (*ldvl < 1 || leftv && *ldvl < *n) {
+ *info = -8;
+ } else if (*ldvr < 1 || rightv && *ldvr < *n) {
+ *info = -10;
+ } else if (*mm < *m) {
+ *info = -11;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZTREVC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible. */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Set the constants to control overflow. */
+
+ unfl = SAFEMINIMUM;
+ ovfl = 1. / unfl;
+ dlabad_(&unfl, &ovfl);
+ ulp = PRECISION;
+ smlnum = unfl * (*n / ulp);
+
+/* Store the diagonal elements of T in working array WORK. */
+
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + *n;
+ i__3 = i__ + i__ * t_dim1;
+ work[i__2].r = t[i__3].r, work[i__2].i = t[i__3].i;
+/* L20: */
+ }
+
+/*
+ Compute 1-norm of each column of strictly upper triangular
+ part of T to control overflow in triangular solver.
+*/
+
+ rwork[1] = 0.;
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ i__2 = j - 1;
+ rwork[j] = dzasum_(&i__2, &t[j * t_dim1 + 1], &c__1);
+/* L30: */
+ }
+
+ if (rightv) {
+
+/* Compute right eigenvectors. */
+
+ is = *m;
+ for (ki = *n; ki >= 1; --ki) {
+
+ if (somev) {
+ if (! select[ki]) {
+ goto L80;
+ }
+ }
+/* Computing MAX */
+ i__1 = ki + ki * t_dim1;
+ d__3 = ulp * ((d__1 = t[i__1].r, abs(d__1)) + (d__2 = d_imag(&t[
+ ki + ki * t_dim1]), abs(d__2)));
+ smin = max(d__3,smlnum);
+
+ work[1].r = 1., work[1].i = 0.;
+
+/* Form right-hand side. */
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k;
+ i__3 = k + ki * t_dim1;
+ z__1.r = -t[i__3].r, z__1.i = -t[i__3].i;
+ work[i__2].r = z__1.r, work[i__2].i = z__1.i;
+/* L40: */
+ }
+
+/*
+ Solve the triangular system:
+ (T(1:KI-1,1:KI-1) - T(KI,KI))*X = SCALE*WORK.
+*/
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k + k * t_dim1;
+ i__3 = k + k * t_dim1;
+ i__4 = ki + ki * t_dim1;
+ z__1.r = t[i__3].r - t[i__4].r, z__1.i = t[i__3].i - t[i__4]
+ .i;
+ t[i__2].r = z__1.r, t[i__2].i = z__1.i;
+ i__2 = k + k * t_dim1;
+ if ((d__1 = t[i__2].r, abs(d__1)) + (d__2 = d_imag(&t[k + k *
+ t_dim1]), abs(d__2)) < smin) {
+ i__3 = k + k * t_dim1;
+ t[i__3].r = smin, t[i__3].i = 0.;
+ }
+/* L50: */
+ }
+
+ if (ki > 1) {
+ i__1 = ki - 1;
+ zlatrs_("Upper", "No transpose", "Non-unit", "Y", &i__1, &t[
+ t_offset], ldt, &work[1], &scale, &rwork[1], info);
+ i__1 = ki;
+ work[i__1].r = scale, work[i__1].i = 0.;
+ }
+
+/* Copy the vector x or Q*x to VR and normalize. */
+
+ if (! over) {
+ zcopy_(&ki, &work[1], &c__1, &vr[is * vr_dim1 + 1], &c__1);
+
+ ii = izamax_(&ki, &vr[is * vr_dim1 + 1], &c__1);
+ i__1 = ii + is * vr_dim1;
+ remax = 1. / ((d__1 = vr[i__1].r, abs(d__1)) + (d__2 = d_imag(
+ &vr[ii + is * vr_dim1]), abs(d__2)));
+ zdscal_(&ki, &remax, &vr[is * vr_dim1 + 1], &c__1);
+
+ i__1 = *n;
+ for (k = ki + 1; k <= i__1; ++k) {
+ i__2 = k + is * vr_dim1;
+ vr[i__2].r = 0., vr[i__2].i = 0.;
+/* L60: */
+ }
+ } else {
+ if (ki > 1) {
+ i__1 = ki - 1;
+ z__1.r = scale, z__1.i = 0.;
+ zgemv_("N", n, &i__1, &c_b57, &vr[vr_offset], ldvr, &work[
+ 1], &c__1, &z__1, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+
+ ii = izamax_(n, &vr[ki * vr_dim1 + 1], &c__1);
+ i__1 = ii + ki * vr_dim1;
+ remax = 1. / ((d__1 = vr[i__1].r, abs(d__1)) + (d__2 = d_imag(
+ &vr[ii + ki * vr_dim1]), abs(d__2)));
+ zdscal_(n, &remax, &vr[ki * vr_dim1 + 1], &c__1);
+ }
+
+/* Set back the original diagonal elements of T. */
+
+ i__1 = ki - 1;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = k + k * t_dim1;
+ i__3 = k + *n;
+ t[i__2].r = work[i__3].r, t[i__2].i = work[i__3].i;
+/* L70: */
+ }
+
+ --is;
+L80:
+ ;
+ }
+ }
+
+ if (leftv) {
+
+/* Compute left eigenvectors. */
+
+ is = 1;
+ i__1 = *n;
+ for (ki = 1; ki <= i__1; ++ki) {
+
+ if (somev) {
+ if (! select[ki]) {
+ goto L130;
+ }
+ }
+/* Computing MAX */
+ i__2 = ki + ki * t_dim1;
+ d__3 = ulp * ((d__1 = t[i__2].r, abs(d__1)) + (d__2 = d_imag(&t[
+ ki + ki * t_dim1]), abs(d__2)));
+ smin = max(d__3,smlnum);
+
+ i__2 = *n;
+ work[i__2].r = 1., work[i__2].i = 0.;
+
+/* Form right-hand side. */
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ i__3 = k;
+ d_cnjg(&z__2, &t[ki + k * t_dim1]);
+ z__1.r = -z__2.r, z__1.i = -z__2.i;
+ work[i__3].r = z__1.r, work[i__3].i = z__1.i;
+/* L90: */
+ }
+
+/*
+ Solve the triangular system:
+ (T(KI+1:N,KI+1:N) - T(KI,KI))'*X = SCALE*WORK.
+*/
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ i__3 = k + k * t_dim1;
+ i__4 = k + k * t_dim1;
+ i__5 = ki + ki * t_dim1;
+ z__1.r = t[i__4].r - t[i__5].r, z__1.i = t[i__4].i - t[i__5]
+ .i;
+ t[i__3].r = z__1.r, t[i__3].i = z__1.i;
+ i__3 = k + k * t_dim1;
+ if ((d__1 = t[i__3].r, abs(d__1)) + (d__2 = d_imag(&t[k + k *
+ t_dim1]), abs(d__2)) < smin) {
+ i__4 = k + k * t_dim1;
+ t[i__4].r = smin, t[i__4].i = 0.;
+ }
+/* L100: */
+ }
+
+ if (ki < *n) {
+ i__2 = *n - ki;
+ zlatrs_("Upper", "Conjugate transpose", "Non-unit", "Y", &
+ i__2, &t[ki + 1 + (ki + 1) * t_dim1], ldt, &work[ki +
+ 1], &scale, &rwork[1], info);
+ i__2 = ki;
+ work[i__2].r = scale, work[i__2].i = 0.;
+ }
+
+/* Copy the vector x or Q*x to VL and normalize. */
+
+ if (! over) {
+ i__2 = *n - ki + 1;
+ zcopy_(&i__2, &work[ki], &c__1, &vl[ki + is * vl_dim1], &c__1)
+ ;
+
+ i__2 = *n - ki + 1;
+ ii = izamax_(&i__2, &vl[ki + is * vl_dim1], &c__1) + ki - 1;
+ i__2 = ii + is * vl_dim1;
+ remax = 1. / ((d__1 = vl[i__2].r, abs(d__1)) + (d__2 = d_imag(
+ &vl[ii + is * vl_dim1]), abs(d__2)));
+ i__2 = *n - ki + 1;
+ zdscal_(&i__2, &remax, &vl[ki + is * vl_dim1], &c__1);
+
+ i__2 = ki - 1;
+ for (k = 1; k <= i__2; ++k) {
+ i__3 = k + is * vl_dim1;
+ vl[i__3].r = 0., vl[i__3].i = 0.;
+/* L110: */
+ }
+ } else {
+ if (ki < *n) {
+ i__2 = *n - ki;
+ z__1.r = scale, z__1.i = 0.;
+ zgemv_("N", n, &i__2, &c_b57, &vl[(ki + 1) * vl_dim1 + 1],
+ ldvl, &work[ki + 1], &c__1, &z__1, &vl[ki *
+ vl_dim1 + 1], &c__1);
+ }
+
+ ii = izamax_(n, &vl[ki * vl_dim1 + 1], &c__1);
+ i__2 = ii + ki * vl_dim1;
+ remax = 1. / ((d__1 = vl[i__2].r, abs(d__1)) + (d__2 = d_imag(
+ &vl[ii + ki * vl_dim1]), abs(d__2)));
+ zdscal_(n, &remax, &vl[ki * vl_dim1 + 1], &c__1);
+ }
+
+/* Set back the original diagonal elements of T. */
+
+ i__2 = *n;
+ for (k = ki + 1; k <= i__2; ++k) {
+ i__3 = k + k * t_dim1;
+ i__4 = k + *n;
+ t[i__3].r = work[i__4].r, t[i__3].i = work[i__4].i;
+/* L120: */
+ }
+
+ ++is;
+L130:
+ ;
+ }
+ }
+
+ return 0;
+
+/* End of ZTREVC */
+
+} /* ztrevc_ */
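+
+/*
+   Usage sketch (illustrative only, not part of the f2c-translated LAPACK
+   source): computes all right eigenvectors of a 2x2 upper triangular
+   matrix with ztrevc_ (SIDE = 'R', HOWMNY = 'A').  The helper name and
+   the test matrix are hypothetical; the f2c typedefs are assumed from
+   this file's header.
+*/
+static int example_ztrevc_usage(void)
+{
+    /* T = [1 3; 0 2], column-major with LDT = 2; its eigenvalues are the
+       diagonal entries 1 and 2. */
+    doublecomplex t[4] = { {1., 0.}, {0., 0.}, {3., 0.}, {2., 0.} };
+    doublecomplex vr[4];                  /* right eigenvectors, LDVR = 2 */
+    doublecomplex vl[1];                  /* not referenced when SIDE='R' */
+    doublecomplex work[4];                /* 2*N */
+    doublereal rwork[2];                  /* N */
+    logical select[2] = { 0, 0 };         /* not referenced when HOWMNY='A' */
+    integer n = 2, ldt = 2, ldvl = 1, ldvr = 2, mm = 2, m = 0, info = 0;
+
+    ztrevc_("R", "A", select, &n, t, &ldt, vl, &ldvl, vr, &ldvr, &mm, &m,
+            work, rwork, &info);
+    return (int) info;   /* on success m == n and vr holds the vectors */
+}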
+
+/* Subroutine */ int ztrexc_(char *compq, integer *n, doublecomplex *t,
+ integer *ldt, doublecomplex *q, integer *ldq, integer *ifst, integer *
+ ilst, integer *info)
+{
+ /* System generated locals */
+ integer q_dim1, q_offset, t_dim1, t_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer k, m1, m2, m3;
+ static doublereal cs;
+ static doublecomplex t11, t22, sn, temp;
+ extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublereal *, doublecomplex *);
+ extern logical lsame_(char *, char *);
+ static logical wantq;
+ extern /* Subroutine */ int xerbla_(char *, integer *), zlartg_(
+ doublecomplex *, doublecomplex *, doublereal *, doublecomplex *,
+ doublecomplex *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZTREXC reorders the Schur factorization of a complex matrix
+ A = Q*T*Q**H, so that the diagonal element of T with row index IFST
+ is moved to row ILST.
+
+ The Schur form T is reordered by a unitary similarity transformation
+ Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
+    postmultiplying it with Z.
+
+ Arguments
+ =========
+
+ COMPQ (input) CHARACTER*1
+ = 'V': update the matrix Q of Schur vectors;
+ = 'N': do not update Q.
+
+ N (input) INTEGER
+ The order of the matrix T. N >= 0.
+
+ T (input/output) COMPLEX*16 array, dimension (LDT,N)
+ On entry, the upper triangular matrix T.
+ On exit, the reordered upper triangular matrix.
+
+ LDT (input) INTEGER
+ The leading dimension of the array T. LDT >= max(1,N).
+
+ Q (input/output) COMPLEX*16 array, dimension (LDQ,N)
+ On entry, if COMPQ = 'V', the matrix Q of Schur vectors.
+ On exit, if COMPQ = 'V', Q has been postmultiplied by the
+ unitary transformation matrix Z which reorders T.
+ If COMPQ = 'N', Q is not referenced.
+
+ LDQ (input) INTEGER
+ The leading dimension of the array Q. LDQ >= max(1,N).
+
+ IFST (input) INTEGER
+ ILST (input) INTEGER
+ Specify the reordering of the diagonal elements of T:
+ The element with row index IFST is moved to row ILST by a
+ sequence of transpositions between adjacent elements.
+ 1 <= IFST <= N; 1 <= ILST <= N.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Decode and test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ t_dim1 = *ldt;
+ t_offset = 1 + t_dim1;
+ t -= t_offset;
+ q_dim1 = *ldq;
+ q_offset = 1 + q_dim1;
+ q -= q_offset;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(compq, "V");
+ if (! lsame_(compq, "N") && ! wantq) {
+ *info = -1;
+ } else if (*n < 0) {
+ *info = -2;
+ } else if (*ldt < max(1,*n)) {
+ *info = -4;
+ } else if (*ldq < 1 || wantq && *ldq < max(1,*n)) {
+ *info = -6;
+ } else if (*ifst < 1 || *ifst > *n) {
+ *info = -7;
+ } else if (*ilst < 1 || *ilst > *n) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZTREXC", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 1 || *ifst == *ilst) {
+ return 0;
+ }
+
+ if (*ifst < *ilst) {
+
+/* Move the IFST-th diagonal element forward down the diagonal. */
+
+ m1 = 0;
+ m2 = -1;
+ m3 = 1;
+ } else {
+
+/* Move the IFST-th diagonal element backward up the diagonal. */
+
+ m1 = -1;
+ m2 = 0;
+ m3 = -1;
+ }
+
+ i__1 = *ilst + m2;
+ i__2 = m3;
+ for (k = *ifst + m1; i__2 < 0 ? k >= i__1 : k <= i__1; k += i__2) {
+
+/* Interchange the k-th and (k+1)-th diagonal elements. */
+
+ i__3 = k + k * t_dim1;
+ t11.r = t[i__3].r, t11.i = t[i__3].i;
+ i__3 = k + 1 + (k + 1) * t_dim1;
+ t22.r = t[i__3].r, t22.i = t[i__3].i;
+
+/* Determine the transformation to perform the interchange. */
+
+ z__1.r = t22.r - t11.r, z__1.i = t22.i - t11.i;
+ zlartg_(&t[k + (k + 1) * t_dim1], &z__1, &cs, &sn, &temp);
+
+/* Apply transformation to the matrix T. */
+
+ if (k + 2 <= *n) {
+ i__3 = *n - k - 1;
+ zrot_(&i__3, &t[k + (k + 2) * t_dim1], ldt, &t[k + 1 + (k + 2) *
+ t_dim1], ldt, &cs, &sn);
+ }
+ i__3 = k - 1;
+ d_cnjg(&z__1, &sn);
+ zrot_(&i__3, &t[k * t_dim1 + 1], &c__1, &t[(k + 1) * t_dim1 + 1], &
+ c__1, &cs, &z__1);
+
+ i__3 = k + k * t_dim1;
+ t[i__3].r = t22.r, t[i__3].i = t22.i;
+ i__3 = k + 1 + (k + 1) * t_dim1;
+ t[i__3].r = t11.r, t[i__3].i = t11.i;
+
+ if (wantq) {
+
+/* Accumulate transformation in the matrix Q. */
+
+ d_cnjg(&z__1, &sn);
+ zrot_(n, &q[k * q_dim1 + 1], &c__1, &q[(k + 1) * q_dim1 + 1], &
+ c__1, &cs, &z__1);
+ }
+
+/* L10: */
+ }
+
+ return 0;
+
+/* End of ZTREXC */
+
+} /* ztrexc_ */
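+
+/*
+   Usage sketch (illustrative only, not part of the f2c-translated LAPACK
+   source): moves the first diagonal entry of a 2x2 Schur factor to the
+   last position with ztrexc_, accumulating the unitary transformation in
+   Q.  The helper name and the test matrices are hypothetical; the f2c
+   typedefs are assumed from this file's header.
+*/
+static int example_ztrexc_usage(void)
+{
+    /* T = [1 3; 0 2] and Q = I, both column-major with leading dimension 2. */
+    doublecomplex t[4] = { {1., 0.}, {0., 0.}, {3., 0.}, {2., 0.} };
+    doublecomplex q[4] = { {1., 0.}, {0., 0.}, {0., 0.}, {1., 0.} };
+    integer n = 2, ldt = 2, ldq = 2, ifst = 1, ilst = 2, info = 0;
+
+    /* On return the diagonal of T reads (2, 1) and Q holds the rotation
+       that performed the interchange. */
+    ztrexc_("V", &n, t, &ldt, q, &ldq, &ifst, &ilst, &info);
+    return (int) info;
+}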
+
+/* Subroutine */ int ztrti2_(char *uplo, char *diag, integer *n,
+ doublecomplex *a, integer *lda, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer j;
+ static doublecomplex ajj;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *);
+ static logical upper;
+ extern /* Subroutine */ int ztrmv_(char *, char *, char *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZTRTI2 computes the inverse of a complex upper or lower triangular
+ matrix.
+
+ This is the Level 2 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ Specifies whether the matrix A is upper or lower triangular.
+ = 'U': Upper triangular
+ = 'L': Lower triangular
+
+ DIAG (input) CHARACTER*1
+ Specifies whether or not the matrix A is unit triangular.
+ = 'N': Non-unit triangular
+ = 'U': Unit triangular
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading n by n upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading n by n lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -k, the k-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZTRTI2", &i__1);
+ return 0;
+ }
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix. */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ if (nounit) {
+ i__2 = j + j * a_dim1;
+ z_div(&z__1, &c_b57, &a[j + j * a_dim1]);
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = j + j * a_dim1;
+ z__1.r = -a[i__2].r, z__1.i = -a[i__2].i;
+ ajj.r = z__1.r, ajj.i = z__1.i;
+ } else {
+ z__1.r = -1., z__1.i = -0.;
+ ajj.r = z__1.r, ajj.i = z__1.i;
+ }
+
+/* Compute elements 1:j-1 of j-th column. */
+
+ i__2 = j - 1;
+ ztrmv_("Upper", "No transpose", diag, &i__2, &a[a_offset], lda, &
+ a[j * a_dim1 + 1], &c__1);
+ i__2 = j - 1;
+ zscal_(&i__2, &ajj, &a[j * a_dim1 + 1], &c__1);
+/* L10: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix. */
+
+ for (j = *n; j >= 1; --j) {
+ if (nounit) {
+ i__1 = j + j * a_dim1;
+ z_div(&z__1, &c_b57, &a[j + j * a_dim1]);
+ a[i__1].r = z__1.r, a[i__1].i = z__1.i;
+ i__1 = j + j * a_dim1;
+ z__1.r = -a[i__1].r, z__1.i = -a[i__1].i;
+ ajj.r = z__1.r, ajj.i = z__1.i;
+ } else {
+ z__1.r = -1., z__1.i = -0.;
+ ajj.r = z__1.r, ajj.i = z__1.i;
+ }
+ if (j < *n) {
+
+/* Compute elements j+1:n of j-th column. */
+
+ i__1 = *n - j;
+ ztrmv_("Lower", "No transpose", diag, &i__1, &a[j + 1 + (j +
+ 1) * a_dim1], lda, &a[j + 1 + j * a_dim1], &c__1);
+ i__1 = *n - j;
+ zscal_(&i__1, &ajj, &a[j + 1 + j * a_dim1], &c__1);
+ }
+/* L20: */
+ }
+ }
+
+ return 0;
+
+/* End of ZTRTI2 */
+
+} /* ztrti2_ */
+
+/* Subroutine */ int ztrtri_(char *uplo, char *diag, integer *n,
+ doublecomplex *a, integer *lda, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, i__1, i__2, i__3[2], i__4, i__5;
+ doublecomplex z__1;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer j, jb, nb, nn;
+ extern logical lsame_(char *, char *);
+ static logical upper;
+ extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *,
+ integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+ doublecomplex *, integer *),
+ ztrsm_(char *, char *, char *, char *, integer *, integer *,
+ doublecomplex *, doublecomplex *, integer *, doublecomplex *,
+ integer *), ztrti2_(char *, char *
+ , integer *, doublecomplex *, integer *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical nounit;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZTRTRI computes the inverse of a complex upper or lower triangular
+ matrix A.
+
+ This is the Level 3 BLAS version of the algorithm.
+
+ Arguments
+ =========
+
+ UPLO (input) CHARACTER*1
+ = 'U': A is upper triangular;
+ = 'L': A is lower triangular.
+
+ DIAG (input) CHARACTER*1
+ = 'N': A is non-unit triangular;
+ = 'U': A is unit triangular.
+
+ N (input) INTEGER
+ The order of the matrix A. N >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the triangular matrix A. If UPLO = 'U', the
+ leading N-by-N upper triangular part of the array A contains
+ the upper triangular matrix, and the strictly lower
+ triangular part of A is not referenced. If UPLO = 'L', the
+ leading N-by-N lower triangular part of the array A contains
+ the lower triangular matrix, and the strictly upper
+ triangular part of A is not referenced. If DIAG = 'U', the
+ diagonal elements of A are also not referenced and are
+ assumed to be 1.
+ On exit, the (triangular) inverse of the original matrix, in
+ the same storage format.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+ > 0: if INFO = i, A(i,i) is exactly zero. The triangular
+ matrix is singular and its inverse can not be computed.
+
+ =====================================================================
+
+
+ Test the input parameters.
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+
+ /* Function Body */
+ *info = 0;
+ upper = lsame_(uplo, "U");
+ nounit = lsame_(diag, "N");
+ if (! upper && ! lsame_(uplo, "L")) {
+ *info = -1;
+ } else if (! nounit && ! lsame_(diag, "U")) {
+ *info = -2;
+ } else if (*n < 0) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZTRTRI", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ return 0;
+ }
+
+/* Check for singularity if non-unit. */
+
+ if (nounit) {
+ i__1 = *n;
+ for (*info = 1; *info <= i__1; ++(*info)) {
+ i__2 = *info + *info * a_dim1;
+ if (a[i__2].r == 0. && a[i__2].i == 0.) {
+ return 0;
+ }
+/* L10: */
+ }
+ *info = 0;
+ }
+
+/*
+ Determine the block size for this environment.
+
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = uplo;
+ i__3[1] = 1, a__1[1] = diag;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "ZTRTRI", ch__1, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (
+ ftnlen)2);
+ if (nb <= 1 || nb >= *n) {
+
+/* Use unblocked code */
+
+ ztrti2_(uplo, diag, n, &a[a_offset], lda, info);
+ } else {
+
+/* Use blocked code */
+
+ if (upper) {
+
+/* Compute inverse of upper triangular matrix */
+
+ i__1 = *n;
+ i__2 = nb;
+ for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *n - j + 1;
+ jb = min(i__4,i__5);
+
+/* Compute rows 1:j-1 of current block column */
+
+ i__4 = j - 1;
+ ztrmm_("Left", "Upper", "No transpose", diag, &i__4, &jb, &
+ c_b57, &a[a_offset], lda, &a[j * a_dim1 + 1], lda);
+ i__4 = j - 1;
+ z__1.r = -1., z__1.i = -0.;
+ ztrsm_("Right", "Upper", "No transpose", diag, &i__4, &jb, &
+ z__1, &a[j + j * a_dim1], lda, &a[j * a_dim1 + 1],
+ lda);
+
+/* Compute inverse of current diagonal block */
+
+ ztrti2_("Upper", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L20: */
+ }
+ } else {
+
+/* Compute inverse of lower triangular matrix */
+
+ nn = (*n - 1) / nb * nb + 1;
+ i__2 = -nb;
+ for (j = nn; i__2 < 0 ? j >= 1 : j <= 1; j += i__2) {
+/* Computing MIN */
+ i__1 = nb, i__4 = *n - j + 1;
+ jb = min(i__1,i__4);
+ if (j + jb <= *n) {
+
+/* Compute rows j+jb:n of current block column */
+
+ i__1 = *n - j - jb + 1;
+ ztrmm_("Left", "Lower", "No transpose", diag, &i__1, &jb,
+ &c_b57, &a[j + jb + (j + jb) * a_dim1], lda, &a[j
+ + jb + j * a_dim1], lda);
+ i__1 = *n - j - jb + 1;
+ z__1.r = -1., z__1.i = -0.;
+ ztrsm_("Right", "Lower", "No transpose", diag, &i__1, &jb,
+ &z__1, &a[j + j * a_dim1], lda, &a[j + jb + j *
+ a_dim1], lda);
+ }
+
+/* Compute inverse of current diagonal block */
+
+ ztrti2_("Lower", diag, &jb, &a[j + j * a_dim1], lda, info);
+/* L30: */
+ }
+ }
+ }
+
+ return 0;
+
+/* End of ZTRTRI */
+
+} /* ztrtri_ */
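+
+/*
+    Illustrative note (not part of the original LAPACK sources): the
+    blocked loops above rely on the partitioned inverse of a triangular
+    matrix.  For the upper triangular case,
+
+        [ A11  A12 ]^-1     [ inv(A11)   -inv(A11)*A12*inv(A22) ]
+        [  0   A22 ]      = [    0              inv(A22)        ]
+
+    At block column j, A11 already holds its own inverse from earlier
+    iterations, so the ZTRMM/ZTRSM pair forms -inv(A11)*A12*inv(A22)
+    in place, and ZTRTI2 then inverts the diagonal block A22 with
+    unblocked code.  The lower triangular branch is the mirror image,
+    sweeping the block columns from right to left.
+*/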
+
+/* Subroutine */ int zung2r_(integer *m, integer *n, integer *k,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *), zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNG2R generates an m by n complex matrix Q with orthonormal columns,
+ which is defined as the first n columns of a product of k elementary
+ reflectors of order m
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by ZGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by ZGEQRF in the first k columns of its array
+ argument A.
+ On exit, the m by n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEQRF.
+
+ WORK (workspace) COMPLEX*16 array, dimension (N)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNG2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ return 0;
+ }
+
+/* Initialise columns k+1:n to columns of the unit matrix */
+
+ i__1 = *n;
+ for (j = *k + 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = 1; l <= i__2; ++l) {
+ i__3 = l + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L10: */
+ }
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+/* L20: */
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i) to A(i:m,i:n) from the left */
+
+ if (i__ < *n) {
+ i__1 = i__ + i__ * a_dim1;
+ a[i__1].r = 1., a[i__1].i = 0.;
+ i__1 = *m - i__ + 1;
+ i__2 = *n - i__;
+ zlarf_("Left", &i__1, &i__2, &a[i__ + i__ * a_dim1], &c__1, &tau[
+ i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+ }
+ if (i__ < *m) {
+ i__1 = *m - i__;
+ i__2 = i__;
+ z__1.r = -tau[i__2].r, z__1.i = -tau[i__2].i;
+ zscal_(&i__1, &z__1, &a[i__ + 1 + i__ * a_dim1], &c__1);
+ }
+ i__1 = i__ + i__ * a_dim1;
+ i__2 = i__;
+ z__1.r = 1. - tau[i__2].r, z__1.i = 0. - tau[i__2].i;
+ a[i__1].r = z__1.r, a[i__1].i = z__1.i;
+
+/* Set A(1:i-1,i) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ i__2 = l + i__ * a_dim1;
+ a[i__2].r = 0., a[i__2].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of ZUNG2R */
+
+} /* zung2r_ */
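+
+/*
+    Illustrative note (not part of the original LAPACK sources): each
+    elementary reflector used above has the form
+
+        H(i) = I - tau(i) * v * v**H,    v(1:i-1) = 0,  v(i) = 1,
+
+    with v(i+1:m) stored below the diagonal of column i of A by ZGEQRF.
+    Working backwards from i = k to 1, the loop applies H(i) to the
+    trailing columns with ZLARF and then writes column i of the result
+    directly:  A(i,i) = 1 - tau(i),  A(i+1:m,i) = -tau(i)*v(i+1:m),
+    A(1:i-1,i) = 0.  On exit A holds the first n columns of
+    H(1) H(2) . . . H(k).
+*/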
+
+/* Subroutine */ int zungbr_(char *vect, integer *m, integer *n, integer *k,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+
+ /* Local variables */
+ static integer i__, j, nb, mn;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical wantq;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int zunglq_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *), zungqr_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNGBR generates one of the complex unitary matrices Q or P**H
+ determined by ZGEBRD when reducing a complex matrix A to bidiagonal
+ form: A = Q * B * P**H. Q and P**H are defined as products of
+ elementary reflectors H(i) or G(i) respectively.
+
+ If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+ is of order M:
+ if m >= k, Q = H(1) H(2) . . . H(k) and ZUNGBR returns the first n
+ columns of Q, where m >= n >= k;
+ if m < k, Q = H(1) H(2) . . . H(m-1) and ZUNGBR returns Q as an
+ M-by-M matrix.
+
+ If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**H
+ is of order N:
+ if k < n, P**H = G(k) . . . G(2) G(1) and ZUNGBR returns the first m
+ rows of P**H, where n >= m >= k;
+ if k >= n, P**H = G(n-1) . . . G(2) G(1) and ZUNGBR returns P**H as
+ an N-by-N matrix.
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ Specifies whether the matrix Q or the matrix P**H is
+ required, as defined in the transformation applied by ZGEBRD:
+ = 'Q': generate Q;
+ = 'P': generate P**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix Q or P**H to be returned.
+ M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q or P**H to be returned.
+ N >= 0.
+ If VECT = 'Q', M >= N >= min(M,K);
+ if VECT = 'P', N >= M >= min(N,K).
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original M-by-K
+ matrix reduced by ZGEBRD.
+ If VECT = 'P', the number of rows in the original K-by-N
+ matrix reduced by ZGEBRD.
+ K >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by ZGEBRD.
+ On exit, the M-by-N matrix Q or P**H.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= M.
+
+ TAU (input) COMPLEX*16 array, dimension
+ (min(M,K)) if VECT = 'Q'
+ (min(N,K)) if VECT = 'P'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i), which determines Q or P**H, as
+ returned by ZGEBRD in its array argument TAUQ or TAUP.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,min(M,N)).
+ For optimum performance LWORK >= min(M,N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ wantq = lsame_(vect, "Q");
+ mn = min(*m,*n);
+ lquery = *lwork == -1;
+ if (! wantq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (*m < 0) {
+ *info = -2;
+ } else if (*n < 0 || wantq && (*n > *m || *n < min(*m,*k)) || ! wantq && (
+ *m > *n || *m < min(*n,*k))) {
+ *info = -3;
+ } else if (*k < 0) {
+ *info = -4;
+ } else if (*lda < max(1,*m)) {
+ *info = -6;
+ } else if (*lwork < max(1,mn) && ! lquery) {
+ *info = -9;
+ }
+
+ if (*info == 0) {
+ if (wantq) {
+ nb = ilaenv_(&c__1, "ZUNGQR", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ } else {
+ nb = ilaenv_(&c__1, "ZUNGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ }
+ lwkopt = max(1,mn) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNGBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ if (wantq) {
+
+/*
+ Form Q, determined by a call to ZGEBRD to reduce an m-by-k
+ matrix
+*/
+
+ if (*m >= *k) {
+
+/* If m >= k, assume m >= n >= k */
+
+ zungqr_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If m < k, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first row and column of Q
+ to those of the unit matrix
+*/
+
+ for (j = *m; j >= 2; --j) {
+ i__1 = j * a_dim1 + 1;
+ a[i__1].r = 0., a[i__1].i = 0.;
+ i__1 = *m;
+ for (i__ = j + 1; i__ <= i__1; ++i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__ + (j - 1) * a_dim1;
+ a[i__2].r = a[i__3].r, a[i__2].i = a[i__3].i;
+/* L10: */
+ }
+/* L20: */
+ }
+ i__1 = a_dim1 + 1;
+ a[i__1].r = 1., a[i__1].i = 0.;
+ i__1 = *m;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ i__2 = i__ + a_dim1;
+ a[i__2].r = 0., a[i__2].i = 0.;
+/* L30: */
+ }
+ if (*m > 1) {
+
+/* Form Q(2:m,2:m) */
+
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ zungqr_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ } else {
+
+/*
+ Form P', determined by a call to ZGEBRD to reduce a k-by-n
+ matrix
+*/
+
+ if (*k < *n) {
+
+/* If k < n, assume k <= m <= n */
+
+ zunglq_(m, n, k, &a[a_offset], lda, &tau[1], &work[1], lwork, &
+ iinfo);
+
+ } else {
+
+/*
+ If k >= n, assume m = n
+
+ Shift the vectors which define the elementary reflectors one
+ row downward, and set the first row and column of P' to
+ those of the unit matrix
+*/
+
+ i__1 = a_dim1 + 1;
+ a[i__1].r = 1., a[i__1].i = 0.;
+ i__1 = *n;
+ for (i__ = 2; i__ <= i__1; ++i__) {
+ i__2 = i__ + a_dim1;
+ a[i__2].r = 0., a[i__2].i = 0.;
+/* L40: */
+ }
+ i__1 = *n;
+ for (j = 2; j <= i__1; ++j) {
+ for (i__ = j - 1; i__ >= 2; --i__) {
+ i__2 = i__ + j * a_dim1;
+ i__3 = i__ - 1 + j * a_dim1;
+ a[i__2].r = a[i__3].r, a[i__2].i = a[i__3].i;
+/* L50: */
+ }
+ i__2 = j * a_dim1 + 1;
+ a[i__2].r = 0., a[i__2].i = 0.;
+/* L60: */
+ }
+ if (*n > 1) {
+
+/* Form P'(2:n,2:n) */
+
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ zunglq_(&i__1, &i__2, &i__3, &a[(a_dim1 << 1) + 2], lda, &tau[
+ 1], &work[1], lwork, &iinfo);
+ }
+ }
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNGBR */
+
+} /* zungbr_ */
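+
+/*
+    Usage sketch (illustrative only; assumes zgebrd_ is available in
+    this file with its usual f2c signature, m >= n, and lwork large
+    enough for both calls):  after
+
+        zgebrd_(&m, &n, a, &lda, d, e, tauq, taup, work, &lwork, &info);
+
+    has reduced A to bidiagonal form A = Q * B * P**H, the first n
+    columns of Q can be formed explicitly with
+
+        zungbr_("Q", &m, &n, &n, a, &lda, tauq, work, &lwork, &info);
+
+    P**H is obtained analogously with VECT = "P" and TAUP, but on a
+    separate copy of the ZGEBRD output, since each call overwrites A.
+*/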
+
+/* Subroutine */ int zunghr_(integer *n, integer *ilo, integer *ihi,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, nb, nh, iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int zungqr_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNGHR generates a complex unitary matrix Q which is defined as the
+ product of IHI-ILO elementary reflectors of order N, as returned by
+ ZGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ N (input) INTEGER
+ The order of the matrix Q. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of ZGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the vectors which define the elementary reflectors,
+ as returned by ZGEHRD.
+ On exit, the N-by-N unitary matrix Q.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,N).
+
+ TAU (input) COMPLEX*16 array, dimension (N-1)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEHRD.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= IHI-ILO.
+ For optimum performance LWORK >= (IHI-ILO)*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ lquery = *lwork == -1;
+ if (*n < 0) {
+ *info = -1;
+ } else if (*ilo < 1 || *ilo > max(1,*n)) {
+ *info = -2;
+ } else if (*ihi < min(*ilo,*n) || *ihi > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*n)) {
+ *info = -5;
+ } else if (*lwork < max(1,nh) && ! lquery) {
+ *info = -8;
+ }
+
+ if (*info == 0) {
+ nb = ilaenv_(&c__1, "ZUNGQR", " ", &nh, &nh, &nh, &c_n1, (ftnlen)6, (
+ ftnlen)1);
+ lwkopt = max(1,nh) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNGHR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+/*
+ Shift the vectors which define the elementary reflectors one
+ column to the right, and set the first ilo and the last n-ihi
+ rows and columns to those of the unit matrix
+*/
+
+ i__1 = *ilo + 1;
+ for (j = *ihi; j >= i__1; --j) {
+ i__2 = j - 1;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L10: */
+ }
+ i__2 = *ihi;
+ for (i__ = j + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ i__4 = i__ + (j - 1) * a_dim1;
+ a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i;
+/* L20: */
+ }
+ i__2 = *n;
+ for (i__ = *ihi + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+ i__1 = *ilo;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L50: */
+ }
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+/* L60: */
+ }
+ i__1 = *n;
+ for (j = *ihi + 1; j <= i__1; ++j) {
+ i__2 = *n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L70: */
+ }
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+/* L80: */
+ }
+
+ if (nh > 0) {
+
+/* Generate Q(ilo+1:ihi,ilo+1:ihi) */
+
+ zungqr_(&nh, &nh, &nh, &a[*ilo + 1 + (*ilo + 1) * a_dim1], lda, &tau[*
+ ilo], &work[1], lwork, &iinfo);
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNGHR */
+
+} /* zunghr_ */
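+
+/*
+    Usage sketch (illustrative only; assumes zgehrd_ is available in
+    this file with its usual f2c signature):  after
+
+        zgehrd_(&n, &ilo, &ihi, a, &lda, tau, work, &lwork, &info);
+
+    has reduced A to upper Hessenberg form H = Q**H * A * Q, copying the
+    output into a second array q and calling
+
+        zunghr_(&n, &ilo, &ihi, q, &ldq, tau, work, &lwork, &info);
+
+    replaces the stored reflectors in q with the explicit n-by-n unitary
+    matrix Q, while the Hessenberg matrix itself remains available in
+    the upper triangle and first subdiagonal of a.
+*/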
+
+/* Subroutine */ int zungl2_(integer *m, integer *n, integer *k,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3;
+ doublecomplex z__1, z__2;
+
+ /* Local variables */
+ static integer i__, j, l;
+ extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
+ doublecomplex *, integer *), zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *), zlacgv_(integer *, doublecomplex *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNGL2 generates an m-by-n complex matrix Q with orthonormal rows,
+ which is defined as the first m rows of a product of k elementary
+ reflectors of order n
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by ZGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by ZGELQF in the first k rows of its array argument A.
+ On exit, the m by n matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGELQF.
+
+ WORK (workspace) COMPLEX*16 array, dimension (M)
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNGL2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ return 0;
+ }
+
+ if (*k < *m) {
+
+/* Initialise rows k+1:m to rows of the unit matrix */
+
+ i__1 = *n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (l = *k + 1; l <= i__2; ++l) {
+ i__3 = l + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L10: */
+ }
+ if (j > *k && j <= *m) {
+ i__2 = j + j * a_dim1;
+ a[i__2].r = 1., a[i__2].i = 0.;
+ }
+/* L20: */
+ }
+ }
+
+ for (i__ = *k; i__ >= 1; --i__) {
+
+/* Apply H(i)' to A(i:m,i:n) from the right */
+
+ if (i__ < *n) {
+ i__1 = *n - i__;
+ zlacgv_(&i__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ if (i__ < *m) {
+ i__1 = i__ + i__ * a_dim1;
+ a[i__1].r = 1., a[i__1].i = 0.;
+ i__1 = *m - i__;
+ i__2 = *n - i__ + 1;
+ d_cnjg(&z__1, &tau[i__]);
+ zlarf_("Right", &i__1, &i__2, &a[i__ + i__ * a_dim1], lda, &
+ z__1, &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+ }
+ i__1 = *n - i__;
+ i__2 = i__;
+ z__1.r = -tau[i__2].r, z__1.i = -tau[i__2].i;
+ zscal_(&i__1, &z__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__1 = *n - i__;
+ zlacgv_(&i__1, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+ i__1 = i__ + i__ * a_dim1;
+ d_cnjg(&z__2, &tau[i__]);
+ z__1.r = 1. - z__2.r, z__1.i = 0. - z__2.i;
+ a[i__1].r = z__1.r, a[i__1].i = z__1.i;
+
+/* Set A(i,1:i-1) to zero */
+
+ i__1 = i__ - 1;
+ for (l = 1; l <= i__1; ++l) {
+ i__2 = i__ + l * a_dim1;
+ a[i__2].r = 0., a[i__2].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+ return 0;
+
+/* End of ZUNGL2 */
+
+} /* zungl2_ */
+
+/* Subroutine */ int zunglq_(integer *m, integer *n, integer *k,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int zungl2_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static logical lquery;
+ static integer lwkopt;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNGLQ generates an M-by-N complex matrix Q with orthonormal rows,
+ which is defined as the first M rows of a product of K elementary
+ reflectors of order N
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by ZGELQF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. N >= M.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. M >= K >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the i-th row must contain the vector which defines
+ the elementary reflector H(i), for i = 1,2,...,k, as returned
+ by ZGELQF in the first k rows of its array argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGELQF.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,M).
+ For optimum performance LWORK >= M*NB, where NB is
+ the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit;
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "ZUNGLQ", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*m) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *m) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*m) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNGLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m <= 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *m;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "ZUNGLQ", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *m;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZUNGLQ", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk rows are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(kk+1:m,1:kk) to zero. */
+
+ i__1 = kk;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = *m;
+ for (i__ = kk + 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *m) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ zungl2_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *m) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *n - i__ + 1;
+ zlarft_("Forward", "Rowwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H' to A(i+ib:m,i:n) from the right */
+
+ i__2 = *m - i__ - ib + 1;
+ i__3 = *n - i__ + 1;
+ zlarfb_("Right", "Conjugate transpose", "Forward", "Rowwise",
+ &i__2, &i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[
+ 1], &ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[
+ ib + 1], &ldwork);
+ }
+
+/* Apply H' to columns i:n of current block */
+
+ i__2 = *n - i__ + 1;
+ zungl2_(&ib, &i__2, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set columns 1:i-1 of current block to zero */
+
+ i__2 = i__ - 1;
+ for (j = 1; j <= i__2; ++j) {
+ i__3 = i__ + ib - 1;
+ for (l = i__; l <= i__3; ++l) {
+ i__4 = l + j * a_dim1;
+ a[i__4].r = 0., a[i__4].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1].r = (doublereal) iws, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNGLQ */
+
+} /* zunglq_ */
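+
+/*
+    Usage sketch (illustrative only): like the other routines in this
+    file that take an LWORK argument, ZUNGLQ supports a workspace query.
+    With m, n, k, a, lda and tau set up as documented above:
+
+        integer lwork = -1, info;
+        doublecomplex wkopt;
+
+        zunglq_(&m, &n, &k, a, &lda, tau, &wkopt, &lwork, &info);
+        lwork = (integer) wkopt.r;
+        (allocate lwork doublecomplex elements for work, then)
+        zunglq_(&m, &n, &k, a, &lda, tau, work, &lwork, &info);
+
+    The first call performs no computation; it only validates the
+    arguments and returns the optimal workspace size in WORK(1).
+*/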
+
+/* Subroutine */ int zungqr_(integer *m, integer *n, integer *k,
+ doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+ /* Local variables */
+ static integer i__, j, l, ib, nb, ki, kk, nx, iws, nbmin, iinfo;
+ extern /* Subroutine */ int zung2r_(integer *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNGQR generates an M-by-N complex matrix Q with orthonormal columns,
+ which is defined as the first N columns of a product of K elementary
+ reflectors of order M
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by ZGEQRF.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix Q. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix Q. M >= N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines the
+ matrix Q. N >= K >= 0.
+
+ A (input/output) COMPLEX*16 array, dimension (LDA,N)
+ On entry, the i-th column must contain the vector which
+ defines the elementary reflector H(i), for i = 1,2,...,k, as
+ returned by ZGEQRF in the first k columns of its array
+ argument A.
+ On exit, the M-by-N matrix Q.
+
+ LDA (input) INTEGER
+ The first dimension of the array A. LDA >= max(1,M).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEQRF.
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK. LWORK >= max(1,N).
+ For optimum performance LWORK >= N*NB, where NB is the
+ optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument has an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nb = ilaenv_(&c__1, "ZUNGQR", " ", m, n, k, &c_n1, (ftnlen)6, (ftnlen)1);
+ lwkopt = max(1,*n) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ lquery = *lwork == -1;
+ if (*m < 0) {
+ *info = -1;
+ } else if (*n < 0 || *n > *m) {
+ *info = -2;
+ } else if (*k < 0 || *k > *n) {
+ *info = -3;
+ } else if (*lda < max(1,*m)) {
+ *info = -5;
+ } else if (*lwork < max(1,*n) && ! lquery) {
+ *info = -8;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNGQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*n <= 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nbmin = 2;
+ nx = 0;
+ iws = *n;
+ if (nb > 1 && nb < *k) {
+
+/*
+ Determine when to cross over from blocked to unblocked code.
+
+ Computing MAX
+*/
+ i__1 = 0, i__2 = ilaenv_(&c__3, "ZUNGQR", " ", m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)1);
+ nx = max(i__1,i__2);
+ if (nx < *k) {
+
+/* Determine if workspace is large enough for blocked code. */
+
+ ldwork = *n;
+ iws = ldwork * nb;
+ if (*lwork < iws) {
+
+/*
+ Not enough workspace to use optimal NB: reduce NB and
+ determine the minimum value of NB.
+*/
+
+ nb = *lwork / ldwork;
+/* Computing MAX */
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZUNGQR", " ", m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)1);
+ nbmin = max(i__1,i__2);
+ }
+ }
+ }
+
+ if (nb >= nbmin && nb < *k && nx < *k) {
+
+/*
+ Use blocked code after the last block.
+ The first kk columns are handled by the block method.
+*/
+
+ ki = (*k - nx - 1) / nb * nb;
+/* Computing MIN */
+ i__1 = *k, i__2 = ki + nb;
+ kk = min(i__1,i__2);
+
+/* Set A(1:kk,kk+1:n) to zero. */
+
+ i__1 = *n;
+ for (j = kk + 1; j <= i__1; ++j) {
+ i__2 = kk;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ i__3 = i__ + j * a_dim1;
+ a[i__3].r = 0., a[i__3].i = 0.;
+/* L10: */
+ }
+/* L20: */
+ }
+ } else {
+ kk = 0;
+ }
+
+/* Use unblocked code for the last or only block. */
+
+ if (kk < *n) {
+ i__1 = *m - kk;
+ i__2 = *n - kk;
+ i__3 = *k - kk;
+ zung2r_(&i__1, &i__2, &i__3, &a[kk + 1 + (kk + 1) * a_dim1], lda, &
+ tau[kk + 1], &work[1], &iinfo);
+ }
+
+ if (kk > 0) {
+
+/* Use blocked code */
+
+ i__1 = -nb;
+ for (i__ = ki + 1; i__1 < 0 ? i__ >= 1 : i__ <= 1; i__ += i__1) {
+/* Computing MIN */
+ i__2 = nb, i__3 = *k - i__ + 1;
+ ib = min(i__2,i__3);
+ if (i__ + ib <= *n) {
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__2 = *m - i__ + 1;
+ zlarft_("Forward", "Columnwise", &i__2, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/* Apply H to A(i:m,i+ib:n) from the left */
+
+ i__2 = *m - i__ + 1;
+ i__3 = *n - i__ - ib + 1;
+ zlarfb_("Left", "No transpose", "Forward", "Columnwise", &
+ i__2, &i__3, &ib, &a[i__ + i__ * a_dim1], lda, &work[
+ 1], &ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &
+ work[ib + 1], &ldwork);
+ }
+
+/* Apply H to rows i:m of current block */
+
+ i__2 = *m - i__ + 1;
+ zung2r_(&i__2, &ib, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &
+ work[1], &iinfo);
+
+/* Set rows 1:i-1 of current block to zero */
+
+ i__2 = i__ + ib - 1;
+ for (j = i__; j <= i__2; ++j) {
+ i__3 = i__ - 1;
+ for (l = 1; l <= i__3; ++l) {
+ i__4 = l + j * a_dim1;
+ a[i__4].r = 0., a[i__4].i = 0.;
+/* L30: */
+ }
+/* L40: */
+ }
+/* L50: */
+ }
+ }
+
+ work[1].r = (doublereal) iws, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNGQR */
+
+} /* zungqr_ */
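+
+/*
+    Usage sketch (illustrative only; assumes zgeqrf_ is available in
+    this file with its usual f2c signature, and lwork large enough for
+    both calls, e.g. obtained via an LWORK = -1 query).  To form the
+    explicit Q factor of an m-by-n matrix A with m >= n:
+
+        k = n;
+        zgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
+        zungqr_(&m, &n, &k, a, &lda, tau, work, &lwork, &info);
+
+    The first call overwrites A with R (upper triangle) and the
+    reflector vectors (below the diagonal); the second overwrites the
+    same array with the m-by-n matrix Q having orthonormal columns, so
+    R must be copied out beforehand if it is still needed.
+*/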
+
+/* Subroutine */ int zunm2l_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, mi, ni, nq;
+ static doublecomplex aii;
+ static logical left;
+ static doublecomplex taui;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNM2L overwrites the general complex m-by-n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'C', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'C',
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by ZGEQLF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q' (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ ZGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEQLF.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the m-by-n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX*16 array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNM2L", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) or H(i)' is applied to C(1:m-k+i,1:n) */
+
+ mi = *m - *k + i__;
+ } else {
+
+/* H(i) or H(i)' is applied to C(1:m,1:n-k+i) */
+
+ ni = *n - *k + i__;
+ }
+
+/* Apply H(i) or H(i)' */
+
+ if (notran) {
+ i__3 = i__;
+ taui.r = tau[i__3].r, taui.i = tau[i__3].i;
+ } else {
+ d_cnjg(&z__1, &tau[i__]);
+ taui.r = z__1.r, taui.i = z__1.i;
+ }
+ i__3 = nq - *k + i__ + i__ * a_dim1;
+ aii.r = a[i__3].r, aii.i = a[i__3].i;
+ i__3 = nq - *k + i__ + i__ * a_dim1;
+ a[i__3].r = 1., a[i__3].i = 0.;
+ zlarf_(side, &mi, &ni, &a[i__ * a_dim1 + 1], &c__1, &taui, &c__[
+ c_offset], ldc, &work[1]);
+ i__3 = nq - *k + i__ + i__ * a_dim1;
+ a[i__3].r = aii.r, a[i__3].i = aii.i;
+/* L10: */
+ }
+ return 0;
+
+/* End of ZUNM2L */
+
+} /* zunm2l_ */
+
+/* Subroutine */ int zunm2r_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static doublecomplex aii;
+ static logical left;
+ static doublecomplex taui;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNM2R overwrites the general complex m-by-n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'C', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'C',
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by ZGEQRF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q' (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ ZGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEQRF.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the m-by-n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX*16 array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNM2R", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) or H(i)' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) or H(i)' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) or H(i)' */
+
+ if (notran) {
+ i__3 = i__;
+ taui.r = tau[i__3].r, taui.i = tau[i__3].i;
+ } else {
+ d_cnjg(&z__1, &tau[i__]);
+ taui.r = z__1.r, taui.i = z__1.i;
+ }
+ i__3 = i__ + i__ * a_dim1;
+ aii.r = a[i__3].r, aii.i = a[i__3].i;
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = 1., a[i__3].i = 0.;
+ zlarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], &c__1, &taui, &c__[ic
+ + jc * c_dim1], ldc, &work[1]);
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = aii.r, a[i__3].i = aii.i;
+/* L10: */
+ }
+ return 0;
+
+/* End of ZUNM2R */
+
+} /* zunm2r_ */
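+
+/*
+    Usage sketch (illustrative only): ZUNM2R (and its blocked variant
+    ZUNMQR) applies Q or Q**H without ever forming Q explicitly.  For
+    example, after zgeqrf_ has factored an m-by-n matrix A (m >= n) as
+    A = Q * R, the least-squares problem min ||A*x - b|| over nrhs
+    right-hand sides stored in an m-by-nrhs array b (leading dimension
+    ldb, with work holding at least nrhs elements) can be solved by
+
+        zunm2r_("Left", "C", &m, &nrhs, &n, a, &lda, tau,
+                b, &ldb, work, &info);
+
+    which overwrites b with Q**H * b, followed by solving the
+    triangular system R * x = b(1:n,:) with a triangular solver such
+    as ztrsm_.
+*/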
+
+/* Subroutine */ int zunmbr_(char *vect, char *side, char *trans, integer *m,
+ integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex
+ *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *
+ lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2];
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static logical notran, applyq;
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ If VECT = 'Q', ZUNMBR overwrites the general complex M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ If VECT = 'P', ZUNMBR overwrites the general complex M-by-N matrix C
+ with
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': P * C C * P
+ TRANS = 'C': P**H * C C * P**H
+
+ Here Q and P**H are the unitary matrices determined by ZGEBRD when
+ reducing a complex matrix A to bidiagonal form: A = Q * B * P**H. Q
+ and P**H are defined as products of elementary reflectors H(i) and
+ G(i) respectively.
+
+ Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+ order of the unitary matrix Q or P**H that is applied.
+
+ If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+ if nq >= k, Q = H(1) H(2) . . . H(k);
+ if nq < k, Q = H(1) H(2) . . . H(nq-1).
+
+ If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+ if k < nq, P = G(1) G(2) . . . G(k);
+ if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+ Arguments
+ =========
+
+ VECT (input) CHARACTER*1
+ = 'Q': apply Q or Q**H;
+ = 'P': apply P or P**H.
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q, Q**H, P or P**H from the Left;
+ = 'R': apply Q, Q**H, P or P**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q or P;
+ = 'C': Conjugate transpose, apply Q**H or P**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ If VECT = 'Q', the number of columns in the original
+ matrix reduced by ZGEBRD.
+ If VECT = 'P', the number of rows in the original
+ matrix reduced by ZGEBRD.
+ K >= 0.
+
+ A (input) COMPLEX*16 array, dimension
+ (LDA,min(nq,K)) if VECT = 'Q'
+ (LDA,nq) if VECT = 'P'
+ The vectors which define the elementary reflectors H(i) and
+ G(i), whose products determine the matrices Q and P, as
+ returned by ZGEBRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If VECT = 'Q', LDA >= max(1,nq);
+ if VECT = 'P', LDA >= max(1,min(nq,K)).
+
+ TAU (input) COMPLEX*16 array, dimension (min(nq,K))
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i) or G(i) which determines Q or P, as returned
+ by ZGEBRD in the array argument TAUQ or TAUP.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q
+ or P*C or P**H*C or C*P or C*P**H.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M);
+ if N = 0 or M = 0, LWORK >= 1.
+ For optimum performance LWORK >= max(1,N*NB) if SIDE = 'L',
+ and LWORK >= max(1,M*NB) if SIDE = 'R', where NB is the
+ optimal blocksize. (NB = 0 if M = 0 or N = 0.)
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ applyq = lsame_(vect, "Q");
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q or P and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (*m == 0 || *n == 0) {
+ nw = 0;
+ }
+ if (! applyq && ! lsame_(vect, "P")) {
+ *info = -1;
+ } else if (! left && ! lsame_(side, "R")) {
+ *info = -2;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*k < 0) {
+ *info = -6;
+ } else /* if(complicated condition) */ {
+/* Computing MAX */
+ i__1 = 1, i__2 = min(nq,*k);
+ if (applyq && *lda < max(1,nq) || ! applyq && *lda < max(i__1,i__2)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+ }
+
+ if (*info == 0) {
+ if (nw > 0) {
+ if (applyq) {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "ZUNMQR", ch__1, &i__1, n, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "ZUNMQR", ch__1, m, &i__1, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *m - 1;
+ i__2 = *m - 1;
+ nb = ilaenv_(&c__1, "ZUNMLQ", ch__1, &i__1, n, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = *n - 1;
+ i__2 = *n - 1;
+ nb = ilaenv_(&c__1, "ZUNMLQ", ch__1, m, &i__1, &i__2, &
+ c_n1, (ftnlen)6, (ftnlen)2);
+ }
+ }
+/* Computing MAX */
+ i__1 = 1, i__2 = nw * nb;
+ lwkopt = max(i__1,i__2);
+ } else {
+ lwkopt = 1;
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNMBR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ if (applyq) {
+
+/* Apply Q */
+
+ if (nq >= *k) {
+
+/* Q was determined by a call to ZGEBRD with nq >= k */
+
+ zunmqr_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* Q was determined by a call to ZGEBRD with nq < k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ zunmqr_(side, trans, &mi, &ni, &i__1, &a[a_dim1 + 2], lda, &tau[1]
+ , &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ } else {
+
+/* Apply P */
+
+ if (notran) {
+ *(unsigned char *)transt = 'C';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+ if (nq > *k) {
+
+/* P was determined by a call to ZGEBRD with nq > k */
+
+ zunmlq_(side, transt, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], lwork, &iinfo);
+ } else if (nq > 1) {
+
+/* P was determined by a call to ZGEBRD with nq <= k */
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ i1 = 2;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ i1 = 1;
+ i2 = 2;
+ }
+ i__1 = nq - 1;
+ zunmlq_(side, transt, &mi, &ni, &i__1, &a[(a_dim1 << 1) + 1], lda,
+ &tau[1], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &
+ iinfo);
+ }
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNMBR */
+
+} /* zunmbr_ */
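+
+/*
+    Usage sketch (illustrative only): ZUNMBR lets SVD-style drivers
+    multiply by Q or P**H from the bidiagonal reduction without forming
+    either matrix.  For instance, with a, tauq and the dimensions taken
+    from a previous zgebrd_ call on an m-by-n matrix (m >= n), and c an
+    m-by-nrhs array with leading dimension ldc,
+
+        zunmbr_("Q", "L", "C", &m, &nrhs, &n, a, &lda, tauq,
+                c, &ldc, work, &lwork, &info);
+
+    overwrites c with Q**H * c.
+*/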
+
+/* Subroutine */ int zunmhr_(char *side, char *trans, integer *m, integer *n,
+ integer *ilo, integer *ihi, doublecomplex *a, integer *lda,
+ doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *
+ work, integer *lwork, integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, nh, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNMHR overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ IHI-ILO elementary reflectors, as returned by ZGEHRD:
+
+ Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q**H (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ ILO (input) INTEGER
+ IHI (input) INTEGER
+ ILO and IHI must have the same values as in the previous call
+ of ZGEHRD. Q is equal to the unit matrix except in the
+ submatrix Q(ilo+1:ihi,ilo+1:ihi).
+ If SIDE = 'L', then 1 <= ILO <= IHI <= M, if M > 0, and
+ ILO = 1 and IHI = 0, if M = 0;
+ if SIDE = 'R', then 1 <= ILO <= IHI <= N, if N > 0, and
+ ILO = 1 and IHI = 0, if N = 0.
+
+ A (input) COMPLEX*16 array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by ZGEHRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) COMPLEX*16 array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEHRD.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ nh = *ihi - *ilo;
+ left = lsame_(side, "L");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*ilo < 1 || *ilo > max(1,nq)) {
+ *info = -5;
+ } else if (*ihi < min(*ilo,nq) || *ihi > nq) {
+ *info = -6;
+ } else if (*lda < max(1,nq)) {
+ *info = -8;
+ } else if (*ldc < max(1,*m)) {
+ *info = -11;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -13;
+ }
+
+ if (*info == 0) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "ZUNMQR", ch__1, &nh, n, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ nb = ilaenv_(&c__1, "ZUNMQR", ch__1, m, &nh, &nh, &c_n1, (ftnlen)
+ 6, (ftnlen)2);
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("ZUNMHR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nh == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ if (left) {
+ mi = nh;
+ ni = *n;
+ i1 = *ilo + 1;
+ i2 = 1;
+ } else {
+ mi = *m;
+ ni = nh;
+ i1 = 1;
+ i2 = *ilo + 1;
+ }
+
+ zunmqr_(side, trans, &mi, &ni, &nh, &a[*ilo + 1 + *ilo * a_dim1], lda, &
+ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNMHR */
+
+} /* zunmhr_ */
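The ZUNM* headers in this file all document the same LWORK convention: pass LWORK = -1 to request a workspace query, in which case the routine only stores the optimal workspace size in WORK(1) and returns without applying Q. A minimal Python sketch of that two-call pattern follows; the routine name, block size, and error code below are made up for illustration and are not part of the vendored code.

    import numpy as np

    def toy_unm_routine(m, n, work, lwork):
        # Hypothetical stand-in following the LAPACK LWORK convention
        # documented in the ZUNMHR/ZUNMLQ/ZUNMQR headers above and below.
        nb = 64                          # pretend this is the optimal block size
        lwkopt = max(1, n) * nb
        if lwork == -1:                  # workspace query: report size, do no work
            work[0] = lwkopt
            return 0
        if lwork < max(1, n):            # mirrors the "*info = -12/-13" checks
            return -12
        # ... the actual application of Q would happen here ...
        work[0] = lwkopt
        return 0

    work = np.zeros(1)
    assert toy_unm_routine(100, 50, work, -1) == 0        # first call: query only
    work = np.zeros(int(work[0]))                         # allocate the reported optimum
    assert toy_unm_routine(100, 50, work, len(work)) == 0 # second call: real work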
+
+/* Subroutine */ int zunml2_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info)
+{
+ /* System generated locals */
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3;
+ doublecomplex z__1;
+
+ /* Local variables */
+ static integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+ static doublecomplex aii;
+ static logical left;
+ static doublecomplex taui;
+ extern logical lsame_(char *, char *);
+ extern /* Subroutine */ int zlarf_(char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *, doublecomplex *), xerbla_(char *, integer *), zlacgv_(integer *, doublecomplex *, integer *);
+ static logical notran;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNML2 overwrites the general complex m-by-n matrix C with
+
+ Q * C if SIDE = 'L' and TRANS = 'N', or
+
+ Q'* C if SIDE = 'L' and TRANS = 'C', or
+
+ C * Q if SIDE = 'R' and TRANS = 'N', or
+
+ C * Q' if SIDE = 'R' and TRANS = 'C',
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by ZGELQF. Q is of order m if SIDE = 'L' and of order n
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q' from the Left
+ = 'R': apply Q or Q' from the Right
+
+ TRANS (input) CHARACTER*1
+ = 'N': apply Q (No transpose)
+ = 'C': apply Q' (Conjugate transpose)
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX*16 array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ ZGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGELQF.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the m-by-n matrix C.
+ On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace) COMPLEX*16 array, dimension
+ (N) if SIDE = 'L',
+ (M) if SIDE = 'R'
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+
+/* NQ is the order of Q */
+
+ if (left) {
+ nq = *m;
+ } else {
+ nq = *n;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNML2", &i__1);
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ return 0;
+ }
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = 1;
+ } else {
+ i1 = *k;
+ i2 = 1;
+ i3 = -1;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+ if (left) {
+
+/* H(i) or H(i)' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H(i) or H(i)' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H(i) or H(i)' */
+
+ if (notran) {
+ d_cnjg(&z__1, &tau[i__]);
+ taui.r = z__1.r, taui.i = z__1.i;
+ } else {
+ i__3 = i__;
+ taui.r = tau[i__3].r, taui.i = tau[i__3].i;
+ }
+ if (i__ < nq) {
+ i__3 = nq - i__;
+ zlacgv_(&i__3, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+ i__3 = i__ + i__ * a_dim1;
+ aii.r = a[i__3].r, aii.i = a[i__3].i;
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = 1., a[i__3].i = 0.;
+ zlarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], lda, &taui, &c__[ic +
+ jc * c_dim1], ldc, &work[1]);
+ i__3 = i__ + i__ * a_dim1;
+ a[i__3].r = aii.r, a[i__3].i = aii.i;
+ if (i__ < nq) {
+ i__3 = nq - i__;
+ zlacgv_(&i__3, &a[i__ + (i__ + 1) * a_dim1], lda);
+ }
+/* L10: */
+ }
+ return 0;
+
+/* End of ZUNML2 */
+
+} /* zunml2_ */
+
+/* Subroutine */ int zunmlq_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int zunml2_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static logical notran;
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static char transt[1];
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNMLQ overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k)' . . . H(2)' H(1)'
+
+ as returned by ZGELQF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX*16 array, dimension
+ (LDA,M) if SIDE = 'L',
+ (LDA,N) if SIDE = 'R'
+ The i-th row must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ ZGELQF in the first k rows of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,K).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGELQF.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+          For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,*k)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "ZUNMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNMLQ", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZUNMLQ", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ zunml2_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ if (notran) {
+ *(unsigned char *)transt = 'C';
+ } else {
+ *(unsigned char *)transt = 'N';
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ zlarft_("Forward", "Rowwise", &i__4, &ib, &a[i__ + i__ * a_dim1],
+ lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ zlarfb_(side, transt, "Forward", "Rowwise", &mi, &ni, &ib, &a[i__
+ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc * c_dim1],
+ ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNMLQ */
+
+} /* zunmlq_ */
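ZUNMLQ above (like ZUNMQL and ZUNMQR further down) falls back from blocked to unblocked code when the caller's workspace is too small: if LWORK cannot hold NW*NB entries, NB is shrunk to what fits, and once NB drops below NBMIN (or K is already small) the unblocked routine is used instead. A compact Python sketch of that decision; the function name is hypothetical and the constants 64 and 2 simply mirror the values hard-wired above.

    def choose_block_size(k, nw, lwork, nb_opt=64, nbmin_opt=2):
        """Illustrative sketch of the NB/NBMIN fallback in the blocked drivers."""
        nb = min(64, nb_opt)          # NB is capped at NBMAX = 64 (size of the local T)
        nbmin = 2
        ldwork = nw
        if nb > 1 and nb < k:
            iws = nw * nb
            if lwork < iws:           # workspace too small for the optimal block
                nb = lwork // ldwork  # shrink NB to what the caller provided
                nbmin = max(2, nbmin_opt)
        use_blocked = not (nb < nbmin or nb >= k)
        return nb, use_blocked

    # plenty of workspace -> blocked path; minimal workspace -> unblocked path
    assert choose_block_size(k=200, nw=100, lwork=6400)[1] is True
    assert choose_block_size(k=200, nw=100, lwork=100)[1] is False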
+
+/* Subroutine */ int zunmql_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int zunm2l_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static logical notran;
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNMQL overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(k) . . . H(2) H(1)
+
+ as returned by ZGEQLF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+          = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ ZGEQLF in the last k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEQLF.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = max(1,*n);
+ } else {
+ nq = *n;
+ nw = max(1,*m);
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ }
+
+ if (*info == 0) {
+ if (*m == 0 || *n == 0) {
+ lwkopt = 1;
+ } else {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where
+ NBMAX is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "ZUNMQL", ch__1, m, n, k, &c_n1,
+ (ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = nw * nb;
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+
+ if (*lwork < nw && ! lquery) {
+ *info = -12;
+ }
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNMQL", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0) {
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZUNMQL", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ zunm2l_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && notran || ! left && ! notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ } else {
+ mi = *m;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i+ib-1) . . . H(i+1) H(i)
+*/
+
+ i__4 = nq - *k + i__ + ib - 1;
+ zlarft_("Backward", "Columnwise", &i__4, &ib, &a[i__ * a_dim1 + 1]
+ , lda, &tau[i__], t, &c__65);
+ if (left) {
+
+/* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
+
+ mi = *m - *k + i__ + ib - 1;
+ } else {
+
+/* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
+
+ ni = *n - *k + i__ + ib - 1;
+ }
+
+/* Apply H or H' */
+
+ zlarfb_(side, trans, "Backward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ * a_dim1 + 1], lda, t, &c__65, &c__[c_offset], ldc, &
+ work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNMQL */
+
+} /* zunmql_ */
+
+/* Subroutine */ int zunmqr_(char *side, char *trans, integer *m, integer *n,
+ integer *k, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4,
+ i__5;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i__;
+ static doublecomplex t[4160] /* was [65][64] */;
+ static integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer nbmin, iinfo;
+ extern /* Subroutine */ int zunm2r_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *,
+ integer *, integer *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *, doublecomplex *, integer *,
+ doublecomplex *, integer *);
+ static logical notran;
+ static integer ldwork;
+ extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *,
+ doublecomplex *, integer *, doublecomplex *, doublecomplex *,
+ integer *);
+ static integer lwkopt;
+ static logical lquery;
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNMQR overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix defined as the product of k
+ elementary reflectors
+
+ Q = H(1) H(2) . . . H(k)
+
+ as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
+ if SIDE = 'R'.
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ K (input) INTEGER
+ The number of elementary reflectors whose product defines
+ the matrix Q.
+ If SIDE = 'L', M >= K >= 0;
+ if SIDE = 'R', N >= K >= 0.
+
+ A (input) COMPLEX*16 array, dimension (LDA,K)
+ The i-th column must contain the vector which defines the
+ elementary reflector H(i), for i = 1,2,...,k, as returned by
+ ZGEQRF in the first k columns of its array argument A.
+ A is modified by the routine but restored on exit.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ If SIDE = 'L', LDA >= max(1,M);
+ if SIDE = 'R', LDA >= max(1,N).
+
+ TAU (input) COMPLEX*16 array, dimension (K)
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZGEQRF.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+ LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ notran = lsame_(trans, "N");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! notran && ! lsame_(trans, "C")) {
+ *info = -2;
+ } else if (*m < 0) {
+ *info = -3;
+ } else if (*n < 0) {
+ *info = -4;
+ } else if (*k < 0 || *k > nq) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+
+/*
+ Determine the block size. NB may be at most NBMAX, where NBMAX
+ is used to define the local array T.
+
+ Computing MIN
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 64, i__2 = ilaenv_(&c__1, "ZUNMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nb = min(i__1,i__2);
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__1 = -(*info);
+ xerbla_("ZUNMQR", &i__1);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || *k == 0) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ nbmin = 2;
+ ldwork = nw;
+ if (nb > 1 && nb < *k) {
+ iws = nw * nb;
+ if (*lwork < iws) {
+ nb = *lwork / ldwork;
+/*
+ Computing MAX
+ Writing concatenation
+*/
+ i__3[0] = 1, a__1[0] = side;
+ i__3[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+ i__1 = 2, i__2 = ilaenv_(&c__2, "ZUNMQR", ch__1, m, n, k, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ nbmin = max(i__1,i__2);
+ }
+ } else {
+ iws = nw;
+ }
+
+ if (nb < nbmin || nb >= *k) {
+
+/* Use unblocked code */
+
+ zunm2r_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+ c_offset], ldc, &work[1], &iinfo);
+ } else {
+
+/* Use blocked code */
+
+ if (left && ! notran || ! left && notran) {
+ i1 = 1;
+ i2 = *k;
+ i3 = nb;
+ } else {
+ i1 = (*k - 1) / nb * nb + 1;
+ i2 = 1;
+ i3 = -nb;
+ }
+
+ if (left) {
+ ni = *n;
+ jc = 1;
+ } else {
+ mi = *m;
+ ic = 1;
+ }
+
+ i__1 = i2;
+ i__2 = i3;
+ for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+ i__4 = nb, i__5 = *k - i__ + 1;
+ ib = min(i__4,i__5);
+
+/*
+ Form the triangular factor of the block reflector
+ H = H(i) H(i+1) . . . H(i+ib-1)
+*/
+
+ i__4 = nq - i__ + 1;
+ zlarft_("Forward", "Columnwise", &i__4, &ib, &a[i__ + i__ *
+ a_dim1], lda, &tau[i__], t, &c__65)
+ ;
+ if (left) {
+
+/* H or H' is applied to C(i:m,1:n) */
+
+ mi = *m - i__ + 1;
+ ic = i__;
+ } else {
+
+/* H or H' is applied to C(1:m,i:n) */
+
+ ni = *n - i__ + 1;
+ jc = i__;
+ }
+
+/* Apply H or H' */
+
+ zlarfb_(side, trans, "Forward", "Columnwise", &mi, &ni, &ib, &a[
+ i__ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc *
+ c_dim1], ldc, &work[1], &ldwork);
+/* L10: */
+ }
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNMQR */
+
+} /* zunmqr_ */
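ZUNMQR applies Q (or Q**H) from a ZGEQRF factorization without ever forming Q explicitly, by applying the elementary reflectors H(i) = I - tau(i)*v(i)*v(i)**H to C (blocked via ZLARFT/ZLARFB above, one reflector at a time in ZUNM2R). The NumPy sketch below reproduces the SIDE='L', TRANS='N' case for hand-built reflectors; the vectors and tau values are synthetic, chosen so each H(i) is exactly unitary, so this is an illustration of the technique rather than the vendored routine itself.

    import numpy as np

    rng = np.random.default_rng(0)
    m, n, k = 6, 4, 3

    # Reflectors stored ZGEQRF-style: column i of V holds v_i, with zeros
    # above row i and a 1 on the diagonal.
    V = np.zeros((m, k), dtype=complex)
    tau = np.zeros(k)
    for i in range(k):
        v = np.zeros(m, dtype=complex)
        v[i] = 1.0
        v[i + 1:] = (rng.standard_normal(m - i - 1)
                     + 1j * rng.standard_normal(m - i - 1))
        V[:, i] = v
        tau[i] = 2.0 / np.vdot(v, v).real   # makes H(i) = I - tau*v*v^H unitary

    C = rng.standard_normal((m, n)) + 1j * rng.standard_normal((m, n))

    # SIDE='L', TRANS='N': Q*C with Q = H(1) H(2) ... H(k).  The rightmost
    # factor H(k) must touch C first, so the loop runs backwards, just as the
    # backward i1/i2/i3 loop above does for this case.
    QC = C.copy()
    for i in range(k - 1, -1, -1):
        v = V[:, i]
        QC -= tau[i] * np.outer(v, v.conj() @ QC)

    # Cross-check against the explicitly formed Q (what ZUNGQR would build).
    Q = np.eye(m, dtype=complex)
    for i in range(k):
        v = V[:, i]
        Q = Q @ (np.eye(m) - tau[i] * np.outer(v, v.conj()))
    assert np.allclose(QC, Q @ C)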
+
+/* Subroutine */ int zunmtr_(char *side, char *uplo, char *trans, integer *m,
+ integer *n, doublecomplex *a, integer *lda, doublecomplex *tau,
+ doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork,
+ integer *info)
+{
+ /* System generated locals */
+ address a__1[2];
+ integer a_dim1, a_offset, c_dim1, c_offset, i__1[2], i__2, i__3;
+ char ch__1[2];
+
+ /* Local variables */
+ static integer i1, i2, nb, mi, ni, nq, nw;
+ static logical left;
+ extern logical lsame_(char *, char *);
+ static integer iinfo;
+ static logical upper;
+ extern /* Subroutine */ int xerbla_(char *, integer *);
+ extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+ integer *, integer *, ftnlen, ftnlen);
+ static integer lwkopt;
+ static logical lquery;
+ extern /* Subroutine */ int zunmql_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *,
+ integer *, doublecomplex *, integer *, doublecomplex *,
+ doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*
+ -- LAPACK routine (version 3.2) --
+ -- LAPACK is a software package provided by Univ. of Tennessee, --
+ -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+ November 2006
+
+
+ Purpose
+ =======
+
+ ZUNMTR overwrites the general complex M-by-N matrix C with
+
+ SIDE = 'L' SIDE = 'R'
+ TRANS = 'N': Q * C C * Q
+ TRANS = 'C': Q**H * C C * Q**H
+
+ where Q is a complex unitary matrix of order nq, with nq = m if
+ SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+ nq-1 elementary reflectors, as returned by ZHETRD:
+
+ if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+
+ if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+ Arguments
+ =========
+
+ SIDE (input) CHARACTER*1
+ = 'L': apply Q or Q**H from the Left;
+ = 'R': apply Q or Q**H from the Right.
+
+ UPLO (input) CHARACTER*1
+ = 'U': Upper triangle of A contains elementary reflectors
+ from ZHETRD;
+ = 'L': Lower triangle of A contains elementary reflectors
+ from ZHETRD.
+
+ TRANS (input) CHARACTER*1
+ = 'N': No transpose, apply Q;
+ = 'C': Conjugate transpose, apply Q**H.
+
+ M (input) INTEGER
+ The number of rows of the matrix C. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix C. N >= 0.
+
+ A (input) COMPLEX*16 array, dimension
+ (LDA,M) if SIDE = 'L'
+ (LDA,N) if SIDE = 'R'
+ The vectors which define the elementary reflectors, as
+ returned by ZHETRD.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A.
+ LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
+
+ TAU (input) COMPLEX*16 array, dimension
+ (M-1) if SIDE = 'L'
+ (N-1) if SIDE = 'R'
+ TAU(i) must contain the scalar factor of the elementary
+ reflector H(i), as returned by ZHETRD.
+
+ C (input/output) COMPLEX*16 array, dimension (LDC,N)
+ On entry, the M-by-N matrix C.
+ On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
+
+ LDC (input) INTEGER
+ The leading dimension of the array C. LDC >= max(1,M).
+
+ WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The dimension of the array WORK.
+ If SIDE = 'L', LWORK >= max(1,N);
+ if SIDE = 'R', LWORK >= max(1,M).
+ For optimum performance LWORK >= N*NB if SIDE = 'L', and
+          LWORK >= M*NB if SIDE = 'R', where NB is the optimal
+ blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value
+
+ =====================================================================
+
+
+ Test the input arguments
+*/
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --tau;
+ c_dim1 = *ldc;
+ c_offset = 1 + c_dim1;
+ c__ -= c_offset;
+ --work;
+
+ /* Function Body */
+ *info = 0;
+ left = lsame_(side, "L");
+ upper = lsame_(uplo, "U");
+ lquery = *lwork == -1;
+
+/* NQ is the order of Q and NW is the minimum dimension of WORK */
+
+ if (left) {
+ nq = *m;
+ nw = *n;
+ } else {
+ nq = *n;
+ nw = *m;
+ }
+ if (! left && ! lsame_(side, "R")) {
+ *info = -1;
+ } else if (! upper && ! lsame_(uplo, "L")) {
+ *info = -2;
+ } else if (! lsame_(trans, "N") && ! lsame_(trans,
+ "C")) {
+ *info = -3;
+ } else if (*m < 0) {
+ *info = -4;
+ } else if (*n < 0) {
+ *info = -5;
+ } else if (*lda < max(1,nq)) {
+ *info = -7;
+ } else if (*ldc < max(1,*m)) {
+ *info = -10;
+ } else if (*lwork < max(1,nw) && ! lquery) {
+ *info = -12;
+ }
+
+ if (*info == 0) {
+ if (upper) {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "ZUNMQL", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "ZUNMQL", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ } else {
+ if (left) {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *m - 1;
+ i__3 = *m - 1;
+ nb = ilaenv_(&c__1, "ZUNMQR", ch__1, &i__2, n, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ } else {
+/* Writing concatenation */
+ i__1[0] = 1, a__1[0] = side;
+ i__1[1] = 1, a__1[1] = trans;
+ s_cat(ch__1, a__1, i__1, &c__2, (ftnlen)2);
+ i__2 = *n - 1;
+ i__3 = *n - 1;
+ nb = ilaenv_(&c__1, "ZUNMQR", ch__1, m, &i__2, &i__3, &c_n1, (
+ ftnlen)6, (ftnlen)2);
+ }
+ }
+ lwkopt = max(1,nw) * nb;
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ }
+
+ if (*info != 0) {
+ i__2 = -(*info);
+ xerbla_("ZUNMTR", &i__2);
+ return 0;
+ } else if (lquery) {
+ return 0;
+ }
+
+/* Quick return if possible */
+
+ if (*m == 0 || *n == 0 || nq == 1) {
+ work[1].r = 1., work[1].i = 0.;
+ return 0;
+ }
+
+ if (left) {
+ mi = *m - 1;
+ ni = *n;
+ } else {
+ mi = *m;
+ ni = *n - 1;
+ }
+
+ if (upper) {
+
+/* Q was determined by a call to ZHETRD with UPLO = 'U' */
+
+ i__2 = nq - 1;
+ zunmql_(side, trans, &mi, &ni, &i__2, &a[(a_dim1 << 1) + 1], lda, &
+ tau[1], &c__[c_offset], ldc, &work[1], lwork, &iinfo);
+ } else {
+
+/* Q was determined by a call to ZHETRD with UPLO = 'L' */
+
+ if (left) {
+ i1 = 2;
+ i2 = 1;
+ } else {
+ i1 = 1;
+ i2 = 2;
+ }
+ i__2 = nq - 1;
+ zunmqr_(side, trans, &mi, &ni, &i__2, &a[a_dim1 + 2], lda, &tau[1], &
+ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo);
+ }
+ work[1].r = (doublereal) lwkopt, work[1].i = 0.;
+ return 0;
+
+/* End of ZUNMTR */
+
+} /* zunmtr_ */
+
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.f.patch b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.f.patch
new file mode 100644
index 00000000000..1e6fc8c0707
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/f2c_z_lapack.f.patch
@@ -0,0 +1,32 @@
+@@ -15278,5 +15278,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 15
+ END DO
++ 15 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -15290,5 +15291,6 @@
+ ! Skip any trailing zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 16
+ END DO
++ 16 CONTINUE
+ J = MIN( LASTV, PREVLASTV )
+@@ -15338,5 +15340,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = 1, I-1
+- IF( V( LASTV, I ).NE.ZERO ) EXIT
++ IF( V( LASTV, I ).NE.ZERO ) GO TO 35
+ END DO
++ 35 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
+@@ -15354,5 +15357,6 @@
+ ! Skip any leading zeros.
+ DO LASTV = N, I+1, -1
+- IF( V( I, LASTV ).NE.ZERO ) EXIT
++ IF( V( I, LASTV ).NE.ZERO ) GO TO 36
+ END DO
++ 36 CONTINUE
+ J = MAX( LASTV, PREVLASTV )
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/fortran.py b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/fortran.py
new file mode 100644
index 00000000000..3b6ac70f001
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/fortran.py
@@ -0,0 +1,124 @@
+from __future__ import division, absolute_import, print_function
+
+import re
+import itertools
+
+def isBlank(line):
+ return not line
+def isLabel(line):
+ return line[0].isdigit()
+def isComment(line):
+ return line[0] != ' '
+def isContinuation(line):
+ return line[5] != ' '
+
+COMMENT, STATEMENT, CONTINUATION = 0, 1, 2
+def lineType(line):
+ """Return the type of a line of Fortan code."""
+ if isBlank(line):
+ return COMMENT
+ elif isLabel(line):
+ return STATEMENT
+ elif isComment(line):
+ return COMMENT
+ elif isContinuation(line):
+ return CONTINUATION
+ else:
+ return STATEMENT
+
+class LineIterator(object):
+ """LineIterator(iterable)
+
+ Return rstrip()'d lines from iterable, while keeping a count of the
+ line number in the .lineno attribute.
+ """
+ def __init__(self, iterable):
+ object.__init__(self)
+ self.iterable = iter(iterable)
+ self.lineno = 0
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ self.lineno += 1
+ line = next(self.iterable)
+ line = line.rstrip()
+ return line
+
+ next = __next__
+
+
+class PushbackIterator(object):
+ """PushbackIterator(iterable)
+
+ Return an iterator for which items can be pushed back into.
+ Call the .pushback(item) method to have item returned as the next
+ value of .next().
+ """
+ def __init__(self, iterable):
+ object.__init__(self)
+ self.iterable = iter(iterable)
+ self.buffer = []
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ if self.buffer:
+ return self.buffer.pop()
+ else:
+ return next(self.iterable)
+
+ def pushback(self, item):
+ self.buffer.append(item)
+
+ next = __next__
+
+
+def fortranSourceLines(fo):
+ """Return an iterator over statement lines of a Fortran source file.
+
+ Comment and blank lines are stripped out, and continuation lines are
+ merged.
+ """
+ numberingiter = LineIterator(fo)
+ # add an extra '' at the end
+ with_extra = itertools.chain(numberingiter, [''])
+ pushbackiter = PushbackIterator(with_extra)
+ for line in pushbackiter:
+ t = lineType(line)
+ if t == COMMENT:
+ continue
+ elif t == STATEMENT:
+ lines = [line]
+ # this is where we need the extra '', so we don't finish reading
+ # the iterator when we don't want to handle that
+ for next_line in pushbackiter:
+ t = lineType(next_line)
+ if t == CONTINUATION:
+ lines.append(next_line[6:])
+ else:
+ pushbackiter.pushback(next_line)
+ break
+ yield numberingiter.lineno, ''.join(lines)
+ else:
+ raise ValueError("jammed: continuation line not expected: %s:%d" %
+ (fo.name, numberingiter.lineno))
+
+def getDependencies(filename):
+ """For a Fortran source file, return a list of routines declared as EXTERNAL
+ in it.
+ """
+ fo = open(filename)
+ external_pat = re.compile(r'^\s*EXTERNAL\s', re.I)
+ routines = []
+ for lineno, line in fortranSourceLines(fo):
+ m = external_pat.match(line)
+ if m:
+            names = line[m.end():].strip().split(',')
+ names = [n.strip().lower() for n in names]
+ names = [n for n in names if n]
+ routines.extend(names)
+ fo.close()
+ return routines
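fortranSourceLines above strips comment lines and merges fixed-form continuation lines (any non-blank character in column 6) into single statements, which is what lets getDependencies see a whole EXTERNAL declaration even when it spans several physical lines. A small self-contained sketch of that merging; it assumes the module above is importable as `fortran`, e.g. when run from this directory.

    import io
    import fortran  # the module defined above

    src = io.StringIO(
        "* a fixed-form comment line\n"
        "      SUBROUTINE FOO(X)\n"
        "      CALL BAR(X,\n"
        "     $         1.0D0)\n"
        "      END\n"
    )
    statements = [stmt for _lineno, stmt in fortran.fortranSourceLines(src)]
    # The comment is dropped and the CALL plus its continuation come back as
    # one merged statement, so three statements remain in total.
    assert len(statements) == 3
    assert "1.0D0" in statements[1]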
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/make_lite.py b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/make_lite.py
new file mode 100755
index 00000000000..61102d6ab07
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/make_lite.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+"""
+Usage: make_lite.py <wrapped_routines_file> <lapack_dir>
+
+Typical invocation:
+
+ make_lite.py wrapped_routines /tmp/lapack-3.x.x
+
+Requires the following to be on the path:
+ * f2c
+ * patch
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys
+import os
+import subprocess
+import shutil
+
+import fortran
+import clapack_scrub
+
+PY2 = sys.version_info < (3, 0)
+
+if PY2:
+ from distutils.spawn import find_executable as which
+else:
+ from shutil import which
+
+# Arguments to pass to f2c. You'll always want -A for ANSI C prototypes
+# Others of interest: -a to not make variables static by default
+# -C to check array subscripts
+F2C_ARGS = ['-A', '-Nx800']
+
+# The header to add to the top of the f2c_*.c file. Note that dlamch_() calls
+# will be replaced by the macros below by clapack_scrub.scrub_source()
+HEADER = '''\
+/*
+NOTE: This is generated code. Look in Misc/lapack_lite for information on
+ remaking this file.
+*/
+#include "f2c.h"
+
+#ifdef HAVE_CONFIG
+#include "config.h"
+#else
+extern doublereal dlamch_(char *);
+#define EPSILON dlamch_("Epsilon")
+#define SAFEMINIMUM dlamch_("Safe minimum")
+#define PRECISION dlamch_("Precision")
+#define BASE dlamch_("Base")
+#endif
+
+extern doublereal dlapy2_(doublereal *x, doublereal *y);
+
+/*
+f2c knows the exact rules for precedence, and so omits parentheses where not
+strictly necessary. Since this is generated code, we don't really care if
+it's readable, and we know what is written is correct. So don't warn about
+them.
+*/
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+'''
+
+class FortranRoutine(object):
+ """Wrapper for a Fortran routine in a file.
+ """
+ type = 'generic'
+ def __init__(self, name=None, filename=None):
+ self.filename = filename
+ if name is None:
+ root, ext = os.path.splitext(filename)
+ name = root
+ self.name = name
+ self._dependencies = None
+
+ def dependencies(self):
+ if self._dependencies is None:
+ deps = fortran.getDependencies(self.filename)
+ self._dependencies = [d.lower() for d in deps]
+ return self._dependencies
+
+ def __repr__(self):
+ return "FortranRoutine({!r}, filename={!r})".format(self.name, self.filename)
+
+class UnknownFortranRoutine(FortranRoutine):
+ """Wrapper for a Fortran routine for which the corresponding file
+ is not known.
+ """
+ type = 'unknown'
+ def __init__(self, name):
+ FortranRoutine.__init__(self, name=name, filename='<unknown>')
+
+ def dependencies(self):
+ return []
+
+class FortranLibrary(object):
+ """Container for a bunch of Fortran routines.
+ """
+ def __init__(self, src_dirs):
+ self._src_dirs = src_dirs
+ self.names_to_routines = {}
+
+ def _findRoutine(self, rname):
+ rname = rname.lower()
+ for s in self._src_dirs:
+ ffilename = os.path.join(s, rname + '.f')
+ if os.path.exists(ffilename):
+ return self._newFortranRoutine(rname, ffilename)
+ return UnknownFortranRoutine(rname)
+
+ def _newFortranRoutine(self, rname, filename):
+ return FortranRoutine(rname, filename)
+
+ def addIgnorableRoutine(self, rname):
+ """Add a routine that we don't want to consider when looking at
+ dependencies.
+ """
+ rname = rname.lower()
+ routine = UnknownFortranRoutine(rname)
+ self.names_to_routines[rname] = routine
+
+ def addRoutine(self, rname):
+ """Add a routine to the library.
+ """
+ self.getRoutine(rname)
+
+ def getRoutine(self, rname):
+ """Get a routine from the library. Will add if it's not found.
+ """
+ unique = []
+ rname = rname.lower()
+ routine = self.names_to_routines.get(rname, unique)
+ if routine is unique:
+ routine = self._findRoutine(rname)
+ self.names_to_routines[rname] = routine
+ return routine
+
+ def allRoutineNames(self):
+ """Return the names of all the routines.
+ """
+ return list(self.names_to_routines.keys())
+
+ def allRoutines(self):
+ """Return all the routines.
+ """
+ return list(self.names_to_routines.values())
+
+ def resolveAllDependencies(self):
+ """Try to add routines to the library to satisfy all the dependencies
+ for each routine in the library.
+
+ Returns a set of routine names that have the dependencies unresolved.
+ """
+ done_this = set()
+ last_todo = set()
+ while True:
+ todo = set(self.allRoutineNames()) - done_this
+ if todo == last_todo:
+ break
+ for rn in todo:
+ r = self.getRoutine(rn)
+ deps = r.dependencies()
+ for d in deps:
+ self.addRoutine(d)
+ done_this.add(rn)
+ last_todo = todo
+ return todo
+
+class LapackLibrary(FortranLibrary):
+ def _newFortranRoutine(self, rname, filename):
+ routine = FortranLibrary._newFortranRoutine(self, rname, filename)
+ if 'blas' in filename.lower():
+ routine.type = 'blas'
+ elif 'install' in filename.lower():
+ routine.type = 'config'
+ elif rname.startswith('z'):
+ routine.type = 'z_lapack'
+ elif rname.startswith('c'):
+ routine.type = 'c_lapack'
+ elif rname.startswith('s'):
+ routine.type = 's_lapack'
+ elif rname.startswith('d'):
+ routine.type = 'd_lapack'
+ else:
+ routine.type = 'lapack'
+ return routine
+
+ def allRoutinesByType(self, typename):
+ routines = sorted((r.name, r) for r in self.allRoutines() if r.type == typename)
+ return [a[1] for a in routines]
+
+def printRoutineNames(desc, routines):
+ print(desc)
+ for r in routines:
+ print('\t%s' % r.name)
+
+def getLapackRoutines(wrapped_routines, ignores, lapack_dir):
+ blas_src_dir = os.path.join(lapack_dir, 'BLAS', 'SRC')
+ if not os.path.exists(blas_src_dir):
+ blas_src_dir = os.path.join(lapack_dir, 'blas', 'src')
+ lapack_src_dir = os.path.join(lapack_dir, 'SRC')
+ if not os.path.exists(lapack_src_dir):
+ lapack_src_dir = os.path.join(lapack_dir, 'src')
+ install_src_dir = os.path.join(lapack_dir, 'INSTALL')
+ if not os.path.exists(install_src_dir):
+ install_src_dir = os.path.join(lapack_dir, 'install')
+
+ library = LapackLibrary([install_src_dir, blas_src_dir, lapack_src_dir])
+
+ for r in ignores:
+ library.addIgnorableRoutine(r)
+
+ for w in wrapped_routines:
+ library.addRoutine(w)
+
+ library.resolveAllDependencies()
+
+ return library
+
+def getWrappedRoutineNames(wrapped_routines_file):
+ routines = []
+ ignores = []
+ with open(wrapped_routines_file) as fo:
+ for line in fo:
+ line = line.strip()
+ if not line or line.startswith('#'):
+ continue
+ if line.startswith('IGNORE:'):
+ line = line[7:].strip()
+ ig = line.split()
+ ignores.extend(ig)
+ else:
+ routines.append(line)
+ return routines, ignores
+
+types = {'blas', 'lapack', 'd_lapack', 's_lapack', 'z_lapack', 'c_lapack', 'config'}
+
+def dumpRoutineNames(library, output_dir):
+ for typename in {'unknown'} | types:
+ routines = library.allRoutinesByType(typename)
+ filename = os.path.join(output_dir, typename + '_routines.lst')
+ with open(filename, 'w') as fo:
+ for r in routines:
+ deps = r.dependencies()
+ fo.write('%s: %s\n' % (r.name, ' '.join(deps)))
+
+def concatenateRoutines(routines, output_file):
+ with open(output_file, 'w') as output_fo:
+ for r in routines:
+ with open(r.filename, 'r') as fo:
+ source = fo.read()
+ output_fo.write(source)
+
+class F2CError(Exception):
+ pass
+
+def runF2C(fortran_filename, output_dir):
+ fortran_filename = fortran_filename.replace('\\', '/')
+ try:
+ subprocess.check_call(
+ ["f2c"] + F2C_ARGS + ['-d', output_dir, fortran_filename]
+ )
+ except subprocess.CalledProcessError:
+ raise F2CError
+
+def scrubF2CSource(c_file):
+ with open(c_file) as fo:
+ source = fo.read()
+ source = clapack_scrub.scrubSource(source, verbose=True)
+ with open(c_file, 'w') as fo:
+ fo.write(HEADER)
+ fo.write(source)
+
+def ensure_executable(name):
+ try:
+ which(name)
+ except:
+ raise SystemExit(name + ' not found')
+
+def main():
+ if len(sys.argv) != 3:
+ print(__doc__)
+ return
+ # Make sure that patch and f2c are found on path
+ ensure_executable('f2c')
+ ensure_executable('patch')
+
+ wrapped_routines_file = sys.argv[1]
+ lapack_src_dir = sys.argv[2]
+ output_dir = os.path.join(os.path.dirname(__file__), 'build')
+
+ try:
+ shutil.rmtree(output_dir)
+ except:
+ pass
+ os.makedirs(output_dir)
+
+ wrapped_routines, ignores = getWrappedRoutineNames(wrapped_routines_file)
+ library = getLapackRoutines(wrapped_routines, ignores, lapack_src_dir)
+
+ dumpRoutineNames(library, output_dir)
+
+ for typename in types:
+ fortran_file = os.path.join(output_dir, 'f2c_%s.f' % typename)
+ c_file = fortran_file[:-2] + '.c'
+ print('creating %s ...' % c_file)
+ routines = library.allRoutinesByType(typename)
+ concatenateRoutines(routines, fortran_file)
+
+        # apply the patch
+        patch_file = os.path.basename(fortran_file) + '.patch'
+        if os.path.exists(patch_file):
+            subprocess.check_call(['patch', '-u', fortran_file, patch_file])
+            print("Patched {}".format(fortran_file))
+        try:
+            runF2C(fortran_file, output_dir)
+        except F2CError:
+            print('f2c failed on %s' % fortran_file)
+            break
+        scrubF2CSource(c_file)
+
+        # patch any changes needed to the C file
+        c_patch_file = c_file + '.patch'
+        if os.path.exists(c_patch_file):
+            subprocess.check_call(['patch', '-u', c_file, c_patch_file])
+
+        print()
+
+ for fname in os.listdir(output_dir):
+ if fname.endswith('.c'):
+ print('Copying ' + fname)
+ shutil.copy(
+ os.path.join(output_dir, fname),
+ os.path.dirname(__file__),
+ )
+
+
+if __name__ == '__main__':
+ main()
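resolveAllDependencies above keeps pulling in the EXTERNAL dependencies of every routine already in the library until an iteration adds nothing new. The same fixed-point idea over a plain dictionary, with made-up routine names, looks like this:

    # Hypothetical dependency table; in make_lite these edges come from the
    # EXTERNAL declarations that fortran.getDependencies extracts.
    deps = {
        'top': ['helper_a', 'helper_b'],
        'helper_a': ['leaf'],
    }

    library = {'top'}            # the routine requested in wrapped_routines
    while True:
        todo = {d for name in library for d in deps.get(name, [])} - library
        if not todo:
            break                # fixed point reached: nothing new to add
        library |= todo

    assert library == {'top', 'helper_a', 'helper_b', 'leaf'}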
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/python_xerbla.c b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/python_xerbla.c
new file mode 100644
index 00000000000..dfc195556b4
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/python_xerbla.c
@@ -0,0 +1,48 @@
+#include "Python.h"
+
+#undef c_abs
+#include "f2c.h"
+
+/*
+ From the original manpage:
+ --------------------------
+ XERBLA is an error handler for the LAPACK routines.
+ It is called by an LAPACK routine if an input parameter has an invalid value.
+ A message is printed and execution stops.
+
+ Instead of printing a message and stopping the execution, a
+ ValueError is raised with the message.
+
+ Parameters:
+ -----------
+ srname: Subroutine name to use in error message, maximum six characters.
+ Spaces at the end are skipped.
+ info: Number of the invalid parameter.
+*/
+
+int xerbla_(char *srname, integer *info)
+{
+ static const char format[] = "On entry to %.*s" \
+ " parameter number %d had an illegal value";
+ char buf[sizeof(format) + 6 + 4]; /* 6 for name, 4 for param. num. */
+
+ int len = 0; /* length of subroutine name*/
+#ifdef WITH_THREAD
+ PyGILState_STATE save;
+#endif
+
+ while( len<6 && srname[len]!='\0' )
+ len++;
+ while( len && srname[len-1]==' ' )
+ len--;
+#ifdef WITH_THREAD
+ save = PyGILState_Ensure();
+#endif
+ PyOS_snprintf(buf, sizeof(buf), format, len, srname, *info);
+ PyErr_SetString(PyExc_ValueError, buf);
+#ifdef WITH_THREAD
+ PyGILState_Release(save);
+#endif
+
+ return 0;
+}
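Unlike the reference XERBLA, which prints a message and stops execution, the handler above raises a Python ValueError so the error can propagate back through lapack_lite. A rough Python rendering of the message it formats; the routine name and parameter number here are only example values.

    srname, info = "ZGESV ", 3   # example values a LAPACK routine might pass
    msg = ("On entry to %s parameter number %d had an illegal value"
           % (srname[:6].rstrip(), info))
    assert msg == "On entry to ZGESV parameter number 3 had an illegal value"
    # python_xerbla raises this text as: ValueError(msg)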
diff --git a/contrib/python/numpy/py2/numpy/linalg/lapack_lite/wrapped_routines b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/wrapped_routines
new file mode 100644
index 00000000000..0d99c724d23
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/lapack_lite/wrapped_routines
@@ -0,0 +1,51 @@
+ccopy
+cgeev
+cgelsd
+cgemm
+cgesdd
+cgesv
+cgetrf
+cheevd
+cpotrf
+cpotri
+cpotrs
+dcopy
+dgeev
+dgelsd
+dgemm
+dgeqrf
+dgesdd
+dgesv
+dgetrf
+dorgqr
+dpotrf
+dpotri
+dpotrs
+dsyevd
+scopy
+sgeev
+sgelsd
+sgemm
+sgesdd
+sgesv
+sgetrf
+spotrf
+spotri
+spotrs
+ssyevd
+zcopy
+zgeev
+zgelsd
+zgemm
+zgeqrf
+zgesdd
+zgesv
+zgetrf
+zheevd
+zpotrf
+zpotri
+zpotrs
+zungqr
+# need this b/c it's not properly declared as external in the BLAS source
+dcabs1
+IGNORE: xerbla
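This file is the input that getWrappedRoutineNames in make_lite.py parses: one routine name per line, '#' lines are comments, and an IGNORE: line names routines that must never be pulled in as dependencies. A tiny inline sketch of that parse, duplicated here rather than importing make_lite so it stays self-contained:

    sample_lines = [
        "zgeqrf",
        "# a comment",
        "IGNORE: xerbla",
        "",
    ]
    routines, ignores = [], []
    for line in sample_lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('IGNORE:'):
            ignores.extend(line[7:].split())
        else:
            routines.append(line)
    assert routines == ['zgeqrf']
    assert ignores == ['xerbla']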
diff --git a/contrib/python/numpy/py2/numpy/linalg/setup.py b/contrib/python/numpy/py2/numpy/linalg/setup.py
new file mode 100644
index 00000000000..66c07c9e1e4
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/setup.py
@@ -0,0 +1,60 @@
+from __future__ import division, print_function
+
+import os
+import sys
+
+def configuration(parent_package='', top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ from numpy.distutils.system_info import get_info
+ config = Configuration('linalg', parent_package, top_path)
+
+ config.add_data_dir('tests')
+
+ # Configure lapack_lite
+
+ src_dir = 'lapack_lite'
+ lapack_lite_src = [
+ os.path.join(src_dir, 'python_xerbla.c'),
+ os.path.join(src_dir, 'f2c_z_lapack.c'),
+ os.path.join(src_dir, 'f2c_c_lapack.c'),
+ os.path.join(src_dir, 'f2c_d_lapack.c'),
+ os.path.join(src_dir, 'f2c_s_lapack.c'),
+ os.path.join(src_dir, 'f2c_lapack.c'),
+ os.path.join(src_dir, 'f2c_blas.c'),
+ os.path.join(src_dir, 'f2c_config.c'),
+ os.path.join(src_dir, 'f2c.c'),
+ ]
+ all_sources = config.paths(lapack_lite_src)
+
+ lapack_info = get_info('lapack_opt', 0) # and {}
+
+ def get_lapack_lite_sources(ext, build_dir):
+ if not lapack_info:
+ print("### Warning: Using unoptimized lapack ###")
+ return all_sources
+ else:
+ if sys.platform == 'win32':
+ print("### Warning: python_xerbla.c is disabled ###")
+ return []
+ return [all_sources[0]]
+
+ config.add_extension(
+ 'lapack_lite',
+ sources=['lapack_litemodule.c', get_lapack_lite_sources],
+ depends=['lapack_lite/f2c.h'],
+ extra_info=lapack_info,
+ )
+
+ # umath_linalg module
+ config.add_extension(
+ '_umath_linalg',
+ sources=['umath_linalg.c.src', get_lapack_lite_sources],
+ depends=['lapack_lite/f2c.h'],
+ extra_info=lapack_info,
+ libraries=['npymath'],
+ )
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/linalg/tests/__init__.py b/contrib/python/numpy/py2/numpy/linalg/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/linalg/tests/test_build.py b/contrib/python/numpy/py2/numpy/linalg/tests/test_build.py
new file mode 100644
index 00000000000..921390da331
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/tests/test_build.py
@@ -0,0 +1,55 @@
+from __future__ import division, absolute_import, print_function
+
+from subprocess import PIPE, Popen
+import sys
+import re
+import pytest
+
+from numpy.linalg import lapack_lite
+from numpy.testing import assert_
+
+
+class FindDependenciesLdd(object):
+
+ def __init__(self):
+ self.cmd = ['ldd']
+
+ try:
+ p = Popen(self.cmd, stdout=PIPE, stderr=PIPE)
+ stdout, stderr = p.communicate()
+ except OSError:
+ raise RuntimeError("command %s cannot be run" % self.cmd)
+
+ def get_dependencies(self, lfile):
+ p = Popen(self.cmd + [lfile], stdout=PIPE, stderr=PIPE)
+ stdout, stderr = p.communicate()
+ if not (p.returncode == 0):
+ raise RuntimeError("failed dependencies check for %s" % lfile)
+
+ return stdout
+
+ def grep_dependencies(self, lfile, deps):
+ stdout = self.get_dependencies(lfile)
+
+ rdeps = dict([(dep, re.compile(dep)) for dep in deps])
+ founds = []
+ for l in stdout.splitlines():
+ for k, v in rdeps.items():
+ if v.search(l):
+ founds.append(k)
+
+ return founds
+
+
+class TestF77Mismatch(object):
+
+ @pytest.mark.skipif(not(sys.platform[:5] == 'linux'),
+ reason="no fortran compiler on non-Linux platform")
+ def test_lapack(self):
+ f = FindDependenciesLdd()
+ deps = f.grep_dependencies(lapack_lite.__file__,
+ [b'libg2c', b'libgfortran'])
+ assert_(len(deps) <= 1,
+ """Both g77 and gfortran runtimes linked in lapack_lite ! This is likely to
+cause random crashes and wrong results. See numpy INSTALL.txt for more
+information.""")
diff --git a/contrib/python/numpy/py2/numpy/linalg/tests/test_deprecations.py b/contrib/python/numpy/py2/numpy/linalg/tests/test_deprecations.py
new file mode 100644
index 00000000000..e12755e0d58
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/tests/test_deprecations.py
@@ -0,0 +1,22 @@
+"""Test deprecation and future warnings.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_warns
+
+
+def test_qr_mode_full_future_warning():
+ """Check mode='full' FutureWarning.
+
+ In numpy 1.8 the mode options 'full' and 'economic' in linalg.qr were
+ deprecated. The release date will probably be sometime in the summer
+ of 2013.
+
+ """
+ a = np.eye(2)
+ assert_warns(DeprecationWarning, np.linalg.qr, a, mode='full')
+ assert_warns(DeprecationWarning, np.linalg.qr, a, mode='f')
+ assert_warns(DeprecationWarning, np.linalg.qr, a, mode='economic')
+ assert_warns(DeprecationWarning, np.linalg.qr, a, mode='e')
diff --git a/contrib/python/numpy/py2/numpy/linalg/tests/test_linalg.py b/contrib/python/numpy/py2/numpy/linalg/tests/test_linalg.py
new file mode 100644
index 00000000000..235488c6e84
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/tests/test_linalg.py
@@ -0,0 +1,1964 @@
+""" Test functions for linalg module
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import itertools
+import traceback
+import textwrap
+import subprocess
+import pytest
+
+import numpy as np
+from numpy import array, single, double, csingle, cdouble, dot, identity, matmul
+from numpy import multiply, atleast_2d, inf, asarray
+from numpy import linalg
+from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError
+from numpy.linalg.linalg import _multi_dot_matrix_chain_order
+from numpy.testing import (
+ assert_, assert_equal, assert_raises, assert_array_equal,
+ assert_almost_equal, assert_allclose, suppress_warnings,
+ assert_raises_regex,
+ )
+
+
+def consistent_subclass(out, in_):
+ # For ndarray subclass input, our output should have the same subclass
+ # (non-ndarray input gets converted to ndarray).
+ return type(out) is (type(in_) if isinstance(in_, np.ndarray)
+ else np.ndarray)
+
+
+old_assert_almost_equal = assert_almost_equal
+
+
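+# Shadow numpy.testing's assert_almost_equal with a dtype-aware wrapper:
+# single-precision inputs are compared to fewer decimals than double-precision
+# ones; the original is kept as old_assert_almost_equal above.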
+def assert_almost_equal(a, b, single_decimal=6, double_decimal=12, **kw):
+ if asarray(a).dtype.type in (single, csingle):
+ decimal = single_decimal
+ else:
+ decimal = double_decimal
+ old_assert_almost_equal(a, b, decimal=decimal, **kw)
+
+
+def get_real_dtype(dtype):
+ return {single: single, double: double,
+ csingle: single, cdouble: double}[dtype]
+
+
+def get_complex_dtype(dtype):
+ return {single: csingle, double: cdouble,
+ csingle: csingle, cdouble: cdouble}[dtype]
+
+
+def get_rtol(dtype):
+ # Choose a safe rtol
+ if dtype in (single, csingle):
+ return 1e-5
+ else:
+ return 1e-11
+
+
+# used to categorize tests
+all_tags = {
+ 'square', 'nonsquare', 'hermitian', # mutually exclusive
+ 'generalized', 'size-0', 'strided' # optional additions
+}
+
+
+class LinalgCase(object):
+ def __init__(self, name, a, b, tags=set()):
+ """
+ A bundle of arguments to be passed to a test case, with an identifying
+ name, the operands a and b, and a set of tags to filter the tests
+ """
+ assert_(isinstance(name, str))
+ self.name = name
+ self.a = a
+ self.b = b
+ self.tags = frozenset(tags) # prevent shared tags
+
+ def check(self, do):
+ """
+ Run the function `do` on this test case, expanding arguments
+ """
+ do(self.a, self.b, tags=self.tags)
+
+ def __repr__(self):
+ return "<LinalgCase: %s>" % (self.name,)
+
+
+def apply_tag(tag, cases):
+ """
+ Add the given tag (a string) to each of the cases (a list of LinalgCase
+ objects)
+ """
+ assert tag in all_tags, "Invalid tag"
+ for case in cases:
+ case.tags = case.tags | {tag}
+ return cases
+
+
+#
+# Base test cases
+#
+
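+# Seed the global RNG so the randomly generated cases below are reproducible.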
+np.random.seed(1234)
+
+CASES = []
+
+# square test cases
+CASES += apply_tag('square', [
+ LinalgCase("single",
+ array([[1., 2.], [3., 4.]], dtype=single),
+ array([2., 1.], dtype=single)),
+ LinalgCase("double",
+ array([[1., 2.], [3., 4.]], dtype=double),
+ array([2., 1.], dtype=double)),
+ LinalgCase("double_2",
+ array([[1., 2.], [3., 4.]], dtype=double),
+ array([[2., 1., 4.], [3., 4., 6.]], dtype=double)),
+ LinalgCase("csingle",
+ array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=csingle),
+ array([2. + 1j, 1. + 2j], dtype=csingle)),
+ LinalgCase("cdouble",
+ array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=cdouble),
+ array([2. + 1j, 1. + 2j], dtype=cdouble)),
+ LinalgCase("cdouble_2",
+ array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=cdouble),
+ array([[2. + 1j, 1. + 2j, 1 + 3j], [1 - 2j, 1 - 3j, 1 - 6j]], dtype=cdouble)),
+ LinalgCase("0x0",
+ np.empty((0, 0), dtype=double),
+ np.empty((0,), dtype=double),
+ tags={'size-0'}),
+ LinalgCase("8x8",
+ np.random.rand(8, 8),
+ np.random.rand(8)),
+ LinalgCase("1x1",
+ np.random.rand(1, 1),
+ np.random.rand(1)),
+ LinalgCase("nonarray",
+ [[1, 2], [3, 4]],
+ [2, 1]),
+])
+
+# non-square test-cases
+CASES += apply_tag('nonsquare', [
+ LinalgCase("single_nsq_1",
+ array([[1., 2., 3.], [3., 4., 6.]], dtype=single),
+ array([2., 1.], dtype=single)),
+ LinalgCase("single_nsq_2",
+ array([[1., 2.], [3., 4.], [5., 6.]], dtype=single),
+ array([2., 1., 3.], dtype=single)),
+ LinalgCase("double_nsq_1",
+ array([[1., 2., 3.], [3., 4., 6.]], dtype=double),
+ array([2., 1.], dtype=double)),
+ LinalgCase("double_nsq_2",
+ array([[1., 2.], [3., 4.], [5., 6.]], dtype=double),
+ array([2., 1., 3.], dtype=double)),
+ LinalgCase("csingle_nsq_1",
+ array(
+ [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=csingle),
+ array([2. + 1j, 1. + 2j], dtype=csingle)),
+ LinalgCase("csingle_nsq_2",
+ array(
+ [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=csingle),
+ array([2. + 1j, 1. + 2j, 3. - 3j], dtype=csingle)),
+ LinalgCase("cdouble_nsq_1",
+ array(
+ [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=cdouble),
+ array([2. + 1j, 1. + 2j], dtype=cdouble)),
+ LinalgCase("cdouble_nsq_2",
+ array(
+ [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=cdouble),
+ array([2. + 1j, 1. + 2j, 3. - 3j], dtype=cdouble)),
+ LinalgCase("cdouble_nsq_1_2",
+ array(
+ [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=cdouble),
+ array([[2. + 1j, 1. + 2j], [1 - 1j, 2 - 2j]], dtype=cdouble)),
+ LinalgCase("cdouble_nsq_2_2",
+ array(
+ [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=cdouble),
+ array([[2. + 1j, 1. + 2j], [1 - 1j, 2 - 2j], [1 - 1j, 2 - 2j]], dtype=cdouble)),
+ LinalgCase("8x11",
+ np.random.rand(8, 11),
+ np.random.rand(8)),
+ LinalgCase("1x5",
+ np.random.rand(1, 5),
+ np.random.rand(1)),
+ LinalgCase("5x1",
+ np.random.rand(5, 1),
+ np.random.rand(5)),
+ LinalgCase("0x4",
+ np.random.rand(0, 4),
+ np.random.rand(0),
+ tags={'size-0'}),
+ LinalgCase("4x0",
+ np.random.rand(4, 0),
+ np.random.rand(4),
+ tags={'size-0'}),
+])
+
+# hermitian test-cases
+CASES += apply_tag('hermitian', [
+ LinalgCase("hsingle",
+ array([[1., 2.], [2., 1.]], dtype=single),
+ None),
+ LinalgCase("hdouble",
+ array([[1., 2.], [2., 1.]], dtype=double),
+ None),
+ LinalgCase("hcsingle",
+ array([[1., 2 + 3j], [2 - 3j, 1]], dtype=csingle),
+ None),
+ LinalgCase("hcdouble",
+ array([[1., 2 + 3j], [2 - 3j, 1]], dtype=cdouble),
+ None),
+ LinalgCase("hempty",
+ np.empty((0, 0), dtype=double),
+ None,
+ tags={'size-0'}),
+ LinalgCase("hnonarray",
+ [[1, 2], [2, 1]],
+ None),
+ LinalgCase("matrix_b_only",
+ array([[1., 2.], [2., 1.]]),
+ None),
+ LinalgCase("hmatrix_1x1",
+ np.random.rand(1, 1),
+ None),
+])
+
+
+#
+# Gufunc test cases
+#
+def _make_generalized_cases():
+ new_cases = []
+
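+ # For each base case build two "generalized" variants: a stack of three
+ # scaled copies (suffix _tile3) and a (3, 2)-shaped stack of identical
+ # copies (suffix _tile213), exercising the stacked/broadcasting code paths.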
+ for case in CASES:
+ if not isinstance(case.a, np.ndarray):
+ continue
+
+ a = np.array([case.a, 2 * case.a, 3 * case.a])
+ if case.b is None:
+ b = None
+ else:
+ b = np.array([case.b, 7 * case.b, 6 * case.b])
+ new_case = LinalgCase(case.name + "_tile3", a, b,
+ tags=case.tags | {'generalized'})
+ new_cases.append(new_case)
+
+ a = np.array([case.a] * 2 * 3).reshape((3, 2) + case.a.shape)
+ if case.b is None:
+ b = None
+ else:
+ b = np.array([case.b] * 2 * 3).reshape((3, 2) + case.b.shape)
+ new_case = LinalgCase(case.name + "_tile213", a, b,
+ tags=case.tags | {'generalized'})
+ new_cases.append(new_case)
+
+ return new_cases
+
+
+CASES += _make_generalized_cases()
+
+
+#
+# Generate stride combination variations of the above
+#
+def _stride_comb_iter(x):
+ """
+ Generate cartesian product of strides for all axes
+ """
+
+ if not isinstance(x, np.ndarray):
+ yield x, "nop"
+ return
+
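+ # Each of the trailing axes gets a small set of candidate strides
+ # (including negative ones); the cartesian product below yields
+ # differently-strided views of the same data, so the routines are also
+ # exercised on non-contiguous input.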
+ stride_set = [(1,)] * x.ndim
+ stride_set[-1] = (1, 3, -4)
+ if x.ndim > 1:
+ stride_set[-2] = (1, 3, -4)
+ if x.ndim > 2:
+ stride_set[-3] = (1, -4)
+
+ for repeats in itertools.product(*tuple(stride_set)):
+ new_shape = [abs(a * b) for a, b in zip(x.shape, repeats)]
+ slices = tuple([slice(None, None, repeat) for repeat in repeats])
+
+ # new array with different strides, but same data
+ xi = np.empty(new_shape, dtype=x.dtype)
+ xi.view(np.uint32).fill(0xdeadbeef)
+ xi = xi[slices]
+ xi[...] = x
+ xi = xi.view(x.__class__)
+ assert_(np.all(xi == x))
+ yield xi, "stride_" + "_".join(["%+d" % j for j in repeats])
+
+ # generate also zero strides if possible
+ if x.ndim >= 1 and x.shape[-1] == 1:
+ s = list(x.strides)
+ s[-1] = 0
+ xi = np.lib.stride_tricks.as_strided(x, strides=s)
+ yield xi, "stride_xxx_0"
+ if x.ndim >= 2 and x.shape[-2] == 1:
+ s = list(x.strides)
+ s[-2] = 0
+ xi = np.lib.stride_tricks.as_strided(x, strides=s)
+ yield xi, "stride_xxx_0_x"
+ if x.ndim >= 2 and x.shape[:-2] == (1, 1):
+ s = list(x.strides)
+ s[-1] = 0
+ s[-2] = 0
+ xi = np.lib.stride_tricks.as_strided(x, strides=s)
+ yield xi, "stride_xxx_0_0"
+
+
+def _make_strided_cases():
+ new_cases = []
+ for case in CASES:
+ for a, a_label in _stride_comb_iter(case.a):
+ for b, b_label in _stride_comb_iter(case.b):
+ new_case = LinalgCase(case.name + "_" + a_label + "_" + b_label, a, b,
+ tags=case.tags | {'strided'})
+ new_cases.append(new_case)
+ return new_cases
+
+
+CASES += _make_strided_cases()
+
+
+#
+# Test different routines against the above cases
+#
+class LinalgTestCase(object):
+ TEST_CASES = CASES
+
+ def check_cases(self, require=set(), exclude=set()):
+ """
+ Run self.do on each of the cases that has all of the tags in require and
+ none of the tags in exclude.
+ """
+ for case in self.TEST_CASES:
+ # filter by require and exclude
+ if case.tags & require != require:
+ continue
+ if case.tags & exclude:
+ continue
+
+ try:
+ case.check(self.do)
+ except Exception:
+ msg = "In test case: %r\n\n" % case
+ msg += traceback.format_exc()
+ raise AssertionError(msg)
+
+
+class LinalgSquareTestCase(LinalgTestCase):
+
+ def test_sq_cases(self):
+ self.check_cases(require={'square'},
+ exclude={'generalized', 'size-0'})
+
+ def test_empty_sq_cases(self):
+ self.check_cases(require={'square', 'size-0'},
+ exclude={'generalized'})
+
+
+class LinalgNonsquareTestCase(LinalgTestCase):
+
+ def test_nonsq_cases(self):
+ self.check_cases(require={'nonsquare'},
+ exclude={'generalized', 'size-0'})
+
+ def test_empty_nonsq_cases(self):
+ self.check_cases(require={'nonsquare', 'size-0'},
+ exclude={'generalized'})
+
+
+class HermitianTestCase(LinalgTestCase):
+
+ def test_herm_cases(self):
+ self.check_cases(require={'hermitian'},
+ exclude={'generalized', 'size-0'})
+
+ def test_empty_herm_cases(self):
+ self.check_cases(require={'hermitian', 'size-0'},
+ exclude={'generalized'})
+
+
+class LinalgGeneralizedSquareTestCase(LinalgTestCase):
+
+ @pytest.mark.slow
+ def test_generalized_sq_cases(self):
+ self.check_cases(require={'generalized', 'square'},
+ exclude={'size-0'})
+
+ @pytest.mark.slow
+ def test_generalized_empty_sq_cases(self):
+ self.check_cases(require={'generalized', 'square', 'size-0'})
+
+
+class LinalgGeneralizedNonsquareTestCase(LinalgTestCase):
+
+ @pytest.mark.slow
+ def test_generalized_nonsq_cases(self):
+ self.check_cases(require={'generalized', 'nonsquare'},
+ exclude={'size-0'})
+
+ @pytest.mark.slow
+ def test_generalized_empty_nonsq_cases(self):
+ self.check_cases(require={'generalized', 'nonsquare', 'size-0'})
+
+
+class HermitianGeneralizedTestCase(LinalgTestCase):
+
+ @pytest.mark.slow
+ def test_generalized_herm_cases(self):
+ self.check_cases(require={'generalized', 'hermitian'},
+ exclude={'size-0'})
+
+ @pytest.mark.slow
+ def test_generalized_empty_herm_cases(self):
+ self.check_cases(require={'generalized', 'hermitian', 'size-0'},
+ exclude={'none'})
+
+
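+# Reference implementation of stacked (broadcast) matrix multiplication used
+# to check the gufunc-based routines: loop explicitly over the leading
+# dimensions and apply plain dot() to each 2-D slice.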
+def dot_generalized(a, b):
+ a = asarray(a)
+ if a.ndim >= 3:
+ if a.ndim == b.ndim:
+ # matrix x matrix
+ new_shape = a.shape[:-1] + b.shape[-1:]
+ elif a.ndim == b.ndim + 1:
+ # matrix x vector
+ new_shape = a.shape[:-1]
+ else:
+ raise ValueError("Not implemented...")
+ r = np.empty(new_shape, dtype=np.common_type(a, b))
+ for c in itertools.product(*map(range, a.shape[:-2])):
+ r[c] = dot(a[c], b[c])
+ return r
+ else:
+ return dot(a, b)
+
+
+def identity_like_generalized(a):
+ a = asarray(a)
+ if a.ndim >= 3:
+ r = np.empty(a.shape, dtype=a.dtype)
+ r[...] = identity(a.shape[-2])
+ return r
+ else:
+ return identity(a.shape[0])
+
+
+class SolveCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+ # kept separate from TestSolve so it can be reused for testing with matrices.
+ def do(self, a, b, tags):
+ x = linalg.solve(a, b)
+ assert_almost_equal(b, dot_generalized(a, x))
+ assert_(consistent_subclass(x, b))
+
+
+class TestSolve(SolveCases):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ assert_equal(linalg.solve(x, x).dtype, dtype)
+
+ def test_0_size(self):
+ class ArraySubclass(np.ndarray):
+ pass
+ # Test system of 0x0 matrices
+ a = np.arange(8).reshape(2, 2, 2)
+ b = np.arange(6).reshape(1, 2, 3).view(ArraySubclass)
+
+ expected = linalg.solve(a, b)[:, 0:0, :]
+ result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0, :])
+ assert_array_equal(result, expected)
+ assert_(isinstance(result, ArraySubclass))
+
+ # Test errors for non-square and only b's dimension being 0
+ assert_raises(linalg.LinAlgError, linalg.solve, a[:, 0:0, 0:1], b)
+ assert_raises(ValueError, linalg.solve, a, b[:, 0:0, :])
+
+ # Test broadcasting error
+ b = np.arange(6).reshape(1, 3, 2) # broadcasting error
+ assert_raises(ValueError, linalg.solve, a, b)
+ assert_raises(ValueError, linalg.solve, a[0:0], b[0:0])
+
+ # Test zero "single equations" with 0x0 matrices.
+ b = np.arange(2).reshape(1, 2).view(ArraySubclass)
+ expected = linalg.solve(a, b)[:, 0:0]
+ result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0])
+ assert_array_equal(result, expected)
+ assert_(isinstance(result, ArraySubclass))
+
+ b = np.arange(3).reshape(1, 3)
+ assert_raises(ValueError, linalg.solve, a, b)
+ assert_raises(ValueError, linalg.solve, a[0:0], b[0:0])
+ assert_raises(ValueError, linalg.solve, a[:, 0:0, 0:0], b)
+
+ def test_0_size_k(self):
+ # test zero multiple equation (K=0) case.
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.arange(4).reshape(1, 2, 2)
+ b = np.arange(6).reshape(3, 2, 1).view(ArraySubclass)
+
+ expected = linalg.solve(a, b)[:, :, 0:0]
+ result = linalg.solve(a, b[:, :, 0:0])
+ assert_array_equal(result, expected)
+ assert_(isinstance(result, ArraySubclass))
+
+ # test both zero.
+ expected = linalg.solve(a, b)[:, 0:0, 0:0]
+ result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0, 0:0])
+ assert_array_equal(result, expected)
+ assert_(isinstance(result, ArraySubclass))
+
+
+class InvCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+ def do(self, a, b, tags):
+ a_inv = linalg.inv(a)
+ assert_almost_equal(dot_generalized(a, a_inv),
+ identity_like_generalized(a))
+ assert_(consistent_subclass(a_inv, a))
+
+
+class TestInv(InvCases):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ assert_equal(linalg.inv(x).dtype, dtype)
+
+ def test_0_size(self):
+ # Check that all kinds of 0-sized arrays work
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+ res = linalg.inv(a)
+ assert_(res.dtype.type is np.float64)
+ assert_equal(a.shape, res.shape)
+ assert_(isinstance(res, ArraySubclass))
+
+ a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+ res = linalg.inv(a)
+ assert_(res.dtype.type is np.complex64)
+ assert_equal(a.shape, res.shape)
+ assert_(isinstance(res, ArraySubclass))
+
+
+class EigvalsCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+ def do(self, a, b, tags):
+ ev = linalg.eigvals(a)
+ evalues, evectors = linalg.eig(a)
+ assert_almost_equal(ev, evalues)
+
+
+class TestEigvals(EigvalsCases):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ assert_equal(linalg.eigvals(x).dtype, dtype)
+ x = np.array([[1, 0.5], [-1, 1]], dtype=dtype)
+ assert_equal(linalg.eigvals(x).dtype, get_complex_dtype(dtype))
+
+ def test_0_size(self):
+ # Check that all kinds of 0-sized arrays work
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+ res = linalg.eigvals(a)
+ assert_(res.dtype.type is np.float64)
+ assert_equal((0, 1), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(res, np.ndarray))
+
+ a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+ res = linalg.eigvals(a)
+ assert_(res.dtype.type is np.complex64)
+ assert_equal((0,), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(res, np.ndarray))
+
+
+class EigCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+ def do(self, a, b, tags):
+ evalues, evectors = linalg.eig(a)
+ assert_allclose(dot_generalized(a, evectors),
+ np.asarray(evectors) * np.asarray(evalues)[..., None, :],
+ rtol=get_rtol(evalues.dtype))
+ assert_(consistent_subclass(evectors, a))
+
+
+class TestEig(EigCases):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ w, v = np.linalg.eig(x)
+ assert_equal(w.dtype, dtype)
+ assert_equal(v.dtype, dtype)
+
+ x = np.array([[1, 0.5], [-1, 1]], dtype=dtype)
+ w, v = np.linalg.eig(x)
+ assert_equal(w.dtype, get_complex_dtype(dtype))
+ assert_equal(v.dtype, get_complex_dtype(dtype))
+
+ def test_0_size(self):
+ # Check that all kinds of 0-sized arrays work
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+ res, res_v = linalg.eig(a)
+ assert_(res_v.dtype.type is np.float64)
+ assert_(res.dtype.type is np.float64)
+ assert_equal(a.shape, res_v.shape)
+ assert_equal((0, 1), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(a, np.ndarray))
+
+ a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+ res, res_v = linalg.eig(a)
+ assert_(res_v.dtype.type is np.complex64)
+ assert_(res.dtype.type is np.complex64)
+ assert_equal(a.shape, res_v.shape)
+ assert_equal((0,), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(a, np.ndarray))
+
+
+class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+ def do(self, a, b, tags):
+ u, s, vt = linalg.svd(a, 0)
+ assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :],
+ np.asarray(vt)),
+ rtol=get_rtol(u.dtype))
+ assert_(consistent_subclass(u, a))
+ assert_(consistent_subclass(vt, a))
+
+
+class TestSVD(SVDCases):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ u, s, vh = linalg.svd(x)
+ assert_equal(u.dtype, dtype)
+ assert_equal(s.dtype, get_real_dtype(dtype))
+ assert_equal(vh.dtype, dtype)
+ s = linalg.svd(x, compute_uv=False)
+ assert_equal(s.dtype, get_real_dtype(dtype))
+
+ def test_empty_identity(self):
+ """ Empty input should put an identity matrix in u or vh """
+ x = np.empty((4, 0))
+ u, s, vh = linalg.svd(x, compute_uv=True)
+ assert_equal(u.shape, (4, 4))
+ assert_equal(vh.shape, (0, 0))
+ assert_equal(u, np.eye(4))
+
+ x = np.empty((0, 4))
+ u, s, vh = linalg.svd(x, compute_uv=True)
+ assert_equal(u.shape, (0, 0))
+ assert_equal(vh.shape, (4, 4))
+ assert_equal(vh, np.eye(4))
+
+
+class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+ # cond(x, p) for p in (None, 2, -2)
+
+ def do(self, a, b, tags):
+ c = asarray(a) # a might be a matrix
+ if 'size-0' in tags:
+ assert_raises(LinAlgError, linalg.cond, c)
+ return
+
+ # +/-2 norms: the condition number is the ratio of the extreme singular values.
+ s = linalg.svd(c, compute_uv=False)
+ assert_almost_equal(
+ linalg.cond(a), s[..., 0] / s[..., -1],
+ single_decimal=5, double_decimal=11)
+ assert_almost_equal(
+ linalg.cond(a, 2), s[..., 0] / s[..., -1],
+ single_decimal=5, double_decimal=11)
+ assert_almost_equal(
+ linalg.cond(a, -2), s[..., -1] / s[..., 0],
+ single_decimal=5, double_decimal=11)
+
+ # Other norms
+ cinv = np.linalg.inv(c)
+ assert_almost_equal(
+ linalg.cond(a, 1),
+ abs(c).sum(-2).max(-1) * abs(cinv).sum(-2).max(-1),
+ single_decimal=5, double_decimal=11)
+ assert_almost_equal(
+ linalg.cond(a, -1),
+ abs(c).sum(-2).min(-1) * abs(cinv).sum(-2).min(-1),
+ single_decimal=5, double_decimal=11)
+ assert_almost_equal(
+ linalg.cond(a, np.inf),
+ abs(c).sum(-1).max(-1) * abs(cinv).sum(-1).max(-1),
+ single_decimal=5, double_decimal=11)
+ assert_almost_equal(
+ linalg.cond(a, -np.inf),
+ abs(c).sum(-1).min(-1) * abs(cinv).sum(-1).min(-1),
+ single_decimal=5, double_decimal=11)
+ assert_almost_equal(
+ linalg.cond(a, 'fro'),
+ np.sqrt((abs(c)**2).sum(-1).sum(-1)
+ * (abs(cinv)**2).sum(-1).sum(-1)),
+ single_decimal=5, double_decimal=11)
+
+
+class TestCond(CondCases):
+ def test_basic_nonsvd(self):
+ # Smoketest the non-svd norms
+ A = array([[1., 0, 1], [0, -2., 0], [0, 0, 3.]])
+ assert_almost_equal(linalg.cond(A, inf), 4)
+ assert_almost_equal(linalg.cond(A, -inf), 2/3)
+ assert_almost_equal(linalg.cond(A, 1), 4)
+ assert_almost_equal(linalg.cond(A, -1), 0.5)
+ assert_almost_equal(linalg.cond(A, 'fro'), np.sqrt(265 / 12))
+
+ def test_singular(self):
+ # Singular matrices have infinite condition number for
+ # positive norms, and negative norms shouldn't raise
+ # exceptions
+ As = [np.zeros((2, 2)), np.ones((2, 2))]
+ p_pos = [None, 1, 2, 'fro']
+ p_neg = [-1, -2]
+ for A, p in itertools.product(As, p_pos):
+ # Inversion may not hit exact infinity, so just check the
+ # number is large
+ assert_(linalg.cond(A, p) > 1e15)
+ for A, p in itertools.product(As, p_neg):
+ linalg.cond(A, p)
+
+ def test_nan(self):
+ # nans should be passed through, not converted to infs
+ ps = [None, 1, -1, 2, -2, 'fro']
+ p_pos = [None, 1, 2, 'fro']
+
+ A = np.ones((2, 2))
+ A[0,1] = np.nan
+ for p in ps:
+ c = linalg.cond(A, p)
+ assert_(isinstance(c, np.float_))
+ assert_(np.isnan(c))
+
+ A = np.ones((3, 2, 2))
+ A[1,0,1] = np.nan
+ for p in ps:
+ c = linalg.cond(A, p)
+ assert_(np.isnan(c[1]))
+ if p in p_pos:
+ assert_(c[0] > 1e15)
+ assert_(c[2] > 1e15)
+ else:
+ assert_(not np.isnan(c[0]))
+ assert_(not np.isnan(c[2]))
+
+ def test_stacked_singular(self):
+ # Check behavior when only some of the stacked matrices are
+ # singular
+ np.random.seed(1234)
+ A = np.random.rand(2, 2, 2, 2)
+ A[0,0] = 0
+ A[1,1] = 0
+
+ for p in (None, 1, 2, 'fro', -1, -2):
+ c = linalg.cond(A, p)
+ assert_equal(c[0,0], np.inf)
+ assert_equal(c[1,1], np.inf)
+ assert_(np.isfinite(c[0,1]))
+ assert_(np.isfinite(c[1,0]))
+
+
+class PinvCases(LinalgSquareTestCase,
+ LinalgNonsquareTestCase,
+ LinalgGeneralizedSquareTestCase,
+ LinalgGeneralizedNonsquareTestCase):
+
+ def do(self, a, b, tags):
+ a_ginv = linalg.pinv(a)
+ # `a @ a_ginv == I` does not hold if a is singular
+ dot = dot_generalized
+ assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11)
+ assert_(consistent_subclass(a_ginv, a))
+
+
+class TestPinv(PinvCases):
+ pass
+
+
+class DetCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+ def do(self, a, b, tags):
+ d = linalg.det(a)
+ (s, ld) = linalg.slogdet(a)
+ if asarray(a).dtype.type in (single, double):
+ ad = asarray(a).astype(double)
+ else:
+ ad = asarray(a).astype(cdouble)
+ ev = linalg.eigvals(ad)
+ assert_almost_equal(d, multiply.reduce(ev, axis=-1))
+ assert_almost_equal(s * np.exp(ld), multiply.reduce(ev, axis=-1))
+
+ s = np.atleast_1d(s)
+ ld = np.atleast_1d(ld)
+ m = (s != 0)
+ assert_almost_equal(np.abs(s[m]), 1)
+ assert_equal(ld[~m], -inf)
+
+
+class TestDet(DetCases):
+ def test_zero(self):
+ assert_equal(linalg.det([[0.0]]), 0.0)
+ assert_equal(type(linalg.det([[0.0]])), double)
+ assert_equal(linalg.det([[0.0j]]), 0.0)
+ assert_equal(type(linalg.det([[0.0j]])), cdouble)
+
+ assert_equal(linalg.slogdet([[0.0]]), (0.0, -inf))
+ assert_equal(type(linalg.slogdet([[0.0]])[0]), double)
+ assert_equal(type(linalg.slogdet([[0.0]])[1]), double)
+ assert_equal(linalg.slogdet([[0.0j]]), (0.0j, -inf))
+ assert_equal(type(linalg.slogdet([[0.0j]])[0]), cdouble)
+ assert_equal(type(linalg.slogdet([[0.0j]])[1]), double)
+
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ assert_equal(np.linalg.det(x).dtype, dtype)
+ ph, s = np.linalg.slogdet(x)
+ assert_equal(s.dtype, get_real_dtype(dtype))
+ assert_equal(ph.dtype, dtype)
+
+ def test_0_size(self):
+ a = np.zeros((0, 0), dtype=np.complex64)
+ res = linalg.det(a)
+ assert_equal(res, 1.)
+ assert_(res.dtype.type is np.complex64)
+ res = linalg.slogdet(a)
+ assert_equal(res, (1, 0))
+ assert_(res[0].dtype.type is np.complex64)
+ assert_(res[1].dtype.type is np.float32)
+
+ a = np.zeros((0, 0), dtype=np.float64)
+ res = linalg.det(a)
+ assert_equal(res, 1.)
+ assert_(res.dtype.type is np.float64)
+ res = linalg.slogdet(a)
+ assert_equal(res, (1, 0))
+ assert_(res[0].dtype.type is np.float64)
+ assert_(res[1].dtype.type is np.float64)
+
+
+class LstsqCases(LinalgSquareTestCase, LinalgNonsquareTestCase):
+
+ def do(self, a, b, tags):
+ arr = np.asarray(a)
+ m, n = arr.shape
+ u, s, vt = linalg.svd(a, 0)
+ x, residuals, rank, sv = linalg.lstsq(a, b, rcond=-1)
+ if m == 0:
+ assert_((x == 0).all())
+ if m <= n:
+ assert_almost_equal(b, dot(a, x))
+ assert_equal(rank, m)
+ else:
+ assert_equal(rank, n)
+ assert_almost_equal(sv, sv.__array_wrap__(s))
+ if rank == n and m > n:
+ expect_resids = (
+ np.asarray(abs(np.dot(a, x) - b)) ** 2).sum(axis=0)
+ expect_resids = np.asarray(expect_resids)
+ if np.asarray(b).ndim == 1:
+ expect_resids.shape = (1,)
+ assert_equal(residuals.shape, expect_resids.shape)
+ else:
+ expect_resids = np.array([]).view(type(x))
+ assert_almost_equal(residuals, expect_resids)
+ assert_(np.issubdtype(residuals.dtype, np.floating))
+ assert_(consistent_subclass(x, b))
+ assert_(consistent_subclass(residuals, b))
+
+
+class TestLstsq(LstsqCases):
+ def test_future_rcond(self):
+ a = np.array([[0., 1., 0., 1., 2., 0.],
+ [0., 2., 0., 0., 1., 0.],
+ [1., 0., 1., 0., 0., 4.],
+ [0., 0., 0., 2., 3., 0.]]).T
+
+ b = np.array([1, 0, 0, 0, 0, 0])
+ with suppress_warnings() as sup:
+ w = sup.record(FutureWarning, "`rcond` parameter will change")
+ x, residuals, rank, s = linalg.lstsq(a, b)
+ assert_(rank == 4)
+ x, residuals, rank, s = linalg.lstsq(a, b, rcond=-1)
+ assert_(rank == 4)
+ x, residuals, rank, s = linalg.lstsq(a, b, rcond=None)
+ assert_(rank == 3)
+ # Warning should be raised exactly once (first command)
+ assert_(len(w) == 1)
+
+ @pytest.mark.parametrize(["m", "n", "n_rhs"], [
+ (4, 2, 2),
+ (0, 4, 1),
+ (0, 4, 2),
+ (4, 0, 1),
+ (4, 0, 2),
+ (4, 2, 0),
+ (0, 0, 0)
+ ])
+ def test_empty_a_b(self, m, n, n_rhs):
+ a = np.arange(m * n).reshape(m, n)
+ b = np.ones((m, n_rhs))
+ x, residuals, rank, s = linalg.lstsq(a, b, rcond=None)
+ if m == 0:
+ assert_((x == 0).all())
+ assert_equal(x.shape, (n, n_rhs))
+ assert_equal(residuals.shape, ((n_rhs,) if m > n else (0,)))
+ if m > n and n_rhs > 0:
+ # residuals are exactly the squared norms of b's columns
+ r = b - np.dot(a, x)
+ assert_almost_equal(residuals, (r * r).sum(axis=-2))
+ assert_equal(rank, min(m, n))
+ assert_equal(s.shape, (min(m, n),))
+
+ def test_incompatible_dims(self):
+ # use modified version of docstring example
+ x = np.array([0, 1, 2, 3])
+ y = np.array([-1, 0.2, 0.9, 2.1, 3.3])
+ A = np.vstack([x, np.ones(len(x))]).T
+ with assert_raises_regex(LinAlgError, "Incompatible dimensions"):
+ linalg.lstsq(A, y, rcond=None)
+
+
[email protected]('dt', [np.dtype(c) for c in '?bBhHiIqQefdgFDGO'])
+class TestMatrixPower(object):
+
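+ # rshft_1 is a cyclic right-shift permutation matrix of order 4, so
+ # matrix_power(rshft_1, n) depends only on n modulo 4; test_large_power
+ # below relies on this.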
+ rshft_0 = np.eye(4)
+ rshft_1 = rshft_0[[3, 0, 1, 2]]
+ rshft_2 = rshft_0[[2, 3, 0, 1]]
+ rshft_3 = rshft_0[[1, 2, 3, 0]]
+ rshft_all = [rshft_0, rshft_1, rshft_2, rshft_3]
+ noninv = array([[1, 0], [0, 0]])
+ stacked = np.block([[[rshft_0]]]*2)
+ # FIXME: the 'e' dtype might work in the future
+ dtnoinv = [object, np.dtype('e'), np.dtype('g'), np.dtype('G')]
+
+ def test_large_power(self, dt):
+ rshft = self.rshft_1.astype(dt)
+ assert_equal(
+ matrix_power(rshft, 2**100 + 2**10 + 2**5 + 0), self.rshft_0)
+ assert_equal(
+ matrix_power(rshft, 2**100 + 2**10 + 2**5 + 1), self.rshft_1)
+ assert_equal(
+ matrix_power(rshft, 2**100 + 2**10 + 2**5 + 2), self.rshft_2)
+ assert_equal(
+ matrix_power(rshft, 2**100 + 2**10 + 2**5 + 3), self.rshft_3)
+
+ def test_power_is_zero(self, dt):
+ def tz(M):
+ mz = matrix_power(M, 0)
+ assert_equal(mz, identity_like_generalized(M))
+ assert_equal(mz.dtype, M.dtype)
+
+ for mat in self.rshft_all:
+ tz(mat.astype(dt))
+ if dt != object:
+ tz(self.stacked.astype(dt))
+
+ def test_power_is_one(self, dt):
+ def tz(mat):
+ mz = matrix_power(mat, 1)
+ assert_equal(mz, mat)
+ assert_equal(mz.dtype, mat.dtype)
+
+ for mat in self.rshft_all:
+ tz(mat.astype(dt))
+ if dt != object:
+ tz(self.stacked.astype(dt))
+
+ def test_power_is_two(self, dt):
+ def tz(mat):
+ mz = matrix_power(mat, 2)
+ mmul = matmul if mat.dtype != object else dot
+ assert_equal(mz, mmul(mat, mat))
+ assert_equal(mz.dtype, mat.dtype)
+
+ for mat in self.rshft_all:
+ tz(mat.astype(dt))
+ if dt != object:
+ tz(self.stacked.astype(dt))
+
+ def test_power_is_minus_one(self, dt):
+ def tz(mat):
+ invmat = matrix_power(mat, -1)
+ mmul = matmul if mat.dtype != object else dot
+ assert_almost_equal(
+ mmul(invmat, mat), identity_like_generalized(mat))
+
+ for mat in self.rshft_all:
+ if dt not in self.dtnoinv:
+ tz(mat.astype(dt))
+
+ def test_exceptions_bad_power(self, dt):
+ mat = self.rshft_0.astype(dt)
+ assert_raises(TypeError, matrix_power, mat, 1.5)
+ assert_raises(TypeError, matrix_power, mat, [1])
+
+ def test_exceptions_non_square(self, dt):
+ assert_raises(LinAlgError, matrix_power, np.array([1], dt), 1)
+ assert_raises(LinAlgError, matrix_power, np.array([[1], [2]], dt), 1)
+ assert_raises(LinAlgError, matrix_power, np.ones((4, 3, 2), dt), 1)
+
+ def test_exceptions_not_invertible(self, dt):
+ if dt in self.dtnoinv:
+ return
+ mat = self.noninv.astype(dt)
+ assert_raises(LinAlgError, matrix_power, mat, -1)
+
+
+
+class TestEigvalshCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+ def do(self, a, b, tags):
+ # note that eigenvalue arrays returned by eig must be sorted since
+ # their order isn't guaranteed.
+ ev = linalg.eigvalsh(a, 'L')
+ evalues, evectors = linalg.eig(a)
+ evalues.sort(axis=-1)
+ assert_allclose(ev, evalues, rtol=get_rtol(ev.dtype))
+
+ ev2 = linalg.eigvalsh(a, 'U')
+ assert_allclose(ev2, evalues, rtol=get_rtol(ev.dtype))
+
+
+class TestEigvalsh(object):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ w = np.linalg.eigvalsh(x)
+ assert_equal(w.dtype, get_real_dtype(dtype))
+
+ def test_invalid(self):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=np.float32)
+ assert_raises(ValueError, np.linalg.eigvalsh, x, UPLO="lrong")
+ assert_raises(ValueError, np.linalg.eigvalsh, x, "lower")
+ assert_raises(ValueError, np.linalg.eigvalsh, x, "upper")
+
+ def test_UPLO(self):
+ Klo = np.array([[0, 0], [1, 0]], dtype=np.double)
+ Kup = np.array([[0, 1], [0, 0]], dtype=np.double)
+ tgt = np.array([-1, 1], dtype=np.double)
+ rtol = get_rtol(np.double)
+
+ # Check default is 'L'
+ w = np.linalg.eigvalsh(Klo)
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'L'
+ w = np.linalg.eigvalsh(Klo, UPLO='L')
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'l'
+ w = np.linalg.eigvalsh(Klo, UPLO='l')
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'U'
+ w = np.linalg.eigvalsh(Kup, UPLO='U')
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'u'
+ w = np.linalg.eigvalsh(Kup, UPLO='u')
+ assert_allclose(w, tgt, rtol=rtol)
+
+ def test_0_size(self):
+ # Check that all kinds of 0-sized arrays work
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+ res = linalg.eigvalsh(a)
+ assert_(res.dtype.type is np.float64)
+ assert_equal((0, 1), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(res, np.ndarray))
+
+ a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+ res = linalg.eigvalsh(a)
+ assert_(res.dtype.type is np.float32)
+ assert_equal((0,), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(res, np.ndarray))
+
+
+class TestEighCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+ def do(self, a, b, tags):
+ # note that eigenvalue arrays returned by eig must be sorted since
+ # their order isn't guaranteed.
+ ev, evc = linalg.eigh(a)
+ evalues, evectors = linalg.eig(a)
+ evalues.sort(axis=-1)
+ assert_almost_equal(ev, evalues)
+
+ assert_allclose(dot_generalized(a, evc),
+ np.asarray(ev)[..., None, :] * np.asarray(evc),
+ rtol=get_rtol(ev.dtype))
+
+ ev2, evc2 = linalg.eigh(a, 'U')
+ assert_almost_equal(ev2, evalues)
+
+ assert_allclose(dot_generalized(a, evc2),
+ np.asarray(ev2)[..., None, :] * np.asarray(evc2),
+ rtol=get_rtol(ev.dtype), err_msg=repr(a))
+
+
+class TestEigh(object):
+ @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+ def test_types(self, dtype):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+ w, v = np.linalg.eigh(x)
+ assert_equal(w.dtype, get_real_dtype(dtype))
+ assert_equal(v.dtype, dtype)
+
+ def test_invalid(self):
+ x = np.array([[1, 0.5], [0.5, 1]], dtype=np.float32)
+ assert_raises(ValueError, np.linalg.eigh, x, UPLO="lrong")
+ assert_raises(ValueError, np.linalg.eigh, x, "lower")
+ assert_raises(ValueError, np.linalg.eigh, x, "upper")
+
+ def test_UPLO(self):
+ Klo = np.array([[0, 0], [1, 0]], dtype=np.double)
+ Kup = np.array([[0, 1], [0, 0]], dtype=np.double)
+ tgt = np.array([-1, 1], dtype=np.double)
+ rtol = get_rtol(np.double)
+
+ # Check default is 'L'
+ w, v = np.linalg.eigh(Klo)
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'L'
+ w, v = np.linalg.eigh(Klo, UPLO='L')
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'l'
+ w, v = np.linalg.eigh(Klo, UPLO='l')
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'U'
+ w, v = np.linalg.eigh(Kup, UPLO='U')
+ assert_allclose(w, tgt, rtol=rtol)
+ # Check 'u'
+ w, v = np.linalg.eigh(Kup, UPLO='u')
+ assert_allclose(w, tgt, rtol=rtol)
+
+ def test_0_size(self):
+ # Check that all kinds of 0-sized arrays work
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+ res, res_v = linalg.eigh(a)
+ assert_(res_v.dtype.type is np.float64)
+ assert_(res.dtype.type is np.float64)
+ assert_equal(a.shape, res_v.shape)
+ assert_equal((0, 1), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(a, np.ndarray))
+
+ a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+ res, res_v = linalg.eigh(a)
+ assert_(res_v.dtype.type is np.complex64)
+ assert_(res.dtype.type is np.float32)
+ assert_equal(a.shape, res_v.shape)
+ assert_equal((0,), res.shape)
+ # This is just for documentation, it might make sense to change:
+ assert_(isinstance(a, np.ndarray))
+
+
+class _TestNormBase(object):
+ dt = None
+ dec = None
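+ # Subclasses set dt (the dtype under test) and dec (decimal precision for
+ # comparisons); see _TestNormDoubleBase, _TestNormSingleBase and
+ # _TestNormInt64Base below.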
+
+
+class _TestNormGeneral(_TestNormBase):
+
+ def test_empty(self):
+ assert_equal(norm([]), 0.0)
+ assert_equal(norm(array([], dtype=self.dt)), 0.0)
+ assert_equal(norm(atleast_2d(array([], dtype=self.dt))), 0.0)
+
+ def test_vector_return_type(self):
+ a = np.array([1, 0, 1])
+
+ exact_types = np.typecodes['AllInteger']
+ inexact_types = np.typecodes['AllFloat']
+
+ all_types = exact_types + inexact_types
+
+ for each_inexact_types in all_types:
+ at = a.astype(each_inexact_types)
+
+ an = norm(at, -np.inf)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 0.0)
+
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "divide by zero encountered")
+ an = norm(at, -1)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 0.0)
+
+ an = norm(at, 0)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 2)
+
+ an = norm(at, 1)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 2.0)
+
+ an = norm(at, 2)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, an.dtype.type(2.0)**an.dtype.type(1.0/2.0))
+
+ an = norm(at, 4)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, an.dtype.type(2.0)**an.dtype.type(1.0/4.0))
+
+ an = norm(at, np.inf)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 1.0)
+
+ def test_vector(self):
+ a = [1, 2, 3, 4]
+ b = [-1, -2, -3, -4]
+ c = [-1, 2, -3, 4]
+
+ def _test(v):
+ np.testing.assert_almost_equal(norm(v), 30 ** 0.5,
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, inf), 4.0,
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, -inf), 1.0,
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, 1), 10.0,
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, -1), 12.0 / 25,
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, 2), 30 ** 0.5,
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, -2), ((205. / 144) ** -0.5),
+ decimal=self.dec)
+ np.testing.assert_almost_equal(norm(v, 0), 4,
+ decimal=self.dec)
+
+ for v in (a, b, c,):
+ _test(v)
+
+ for v in (array(a, dtype=self.dt), array(b, dtype=self.dt),
+ array(c, dtype=self.dt)):
+ _test(v)
+
+ def test_axis(self):
+ # Vector norms.
+ # Compare the use of `axis` with computing the norm of each row
+ # or column separately.
+ A = array([[1, 2, 3], [4, 5, 6]], dtype=self.dt)
+ for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]:
+ expected0 = [norm(A[:, k], ord=order) for k in range(A.shape[1])]
+ assert_almost_equal(norm(A, ord=order, axis=0), expected0)
+ expected1 = [norm(A[k, :], ord=order) for k in range(A.shape[0])]
+ assert_almost_equal(norm(A, ord=order, axis=1), expected1)
+
+ # Matrix norms.
+ B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+ nd = B.ndim
+ for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro']:
+ for axis in itertools.combinations(range(-nd, nd), 2):
+ row_axis, col_axis = axis
+ if row_axis < 0:
+ row_axis += nd
+ if col_axis < 0:
+ col_axis += nd
+ if row_axis == col_axis:
+ assert_raises(ValueError, norm, B, ord=order, axis=axis)
+ else:
+ n = norm(B, ord=order, axis=axis)
+
+ # The logic using k_index only works for nd = 3.
+ # This has to be changed if nd is increased.
+ k_index = nd - (row_axis + col_axis)
+ if row_axis < col_axis:
+ expected = [norm(B[:].take(k, axis=k_index), ord=order)
+ for k in range(B.shape[k_index])]
+ else:
+ expected = [norm(B[:].take(k, axis=k_index).T, ord=order)
+ for k in range(B.shape[k_index])]
+ assert_almost_equal(n, expected)
+
+ def test_keepdims(self):
+ A = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+
+ allclose_err = 'order {0}, axis = {1}'
+ shape_err = 'Shape mismatch found {0}, expected {1}, order={2}, axis={3}'
+
+ # check the order=None, axis=None case
+ expected = norm(A, ord=None, axis=None)
+ found = norm(A, ord=None, axis=None, keepdims=True)
+ assert_allclose(np.squeeze(found), expected,
+ err_msg=allclose_err.format(None, None))
+ expected_shape = (1, 1, 1)
+ assert_(found.shape == expected_shape,
+ shape_err.format(found.shape, expected_shape, None, None))
+
+ # Vector norms.
+ for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]:
+ for k in range(A.ndim):
+ expected = norm(A, ord=order, axis=k)
+ found = norm(A, ord=order, axis=k, keepdims=True)
+ assert_allclose(np.squeeze(found), expected,
+ err_msg=allclose_err.format(order, k))
+ expected_shape = list(A.shape)
+ expected_shape[k] = 1
+ expected_shape = tuple(expected_shape)
+ assert_(found.shape == expected_shape,
+ shape_err.format(found.shape, expected_shape, order, k))
+
+ # Matrix norms.
+ for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro', 'nuc']:
+ for k in itertools.permutations(range(A.ndim), 2):
+ expected = norm(A, ord=order, axis=k)
+ found = norm(A, ord=order, axis=k, keepdims=True)
+ assert_allclose(np.squeeze(found), expected,
+ err_msg=allclose_err.format(order, k))
+ expected_shape = list(A.shape)
+ expected_shape[k[0]] = 1
+ expected_shape[k[1]] = 1
+ expected_shape = tuple(expected_shape)
+ assert_(found.shape == expected_shape,
+ shape_err.format(found.shape, expected_shape, order, k))
+
+
+class _TestNorm2D(_TestNormBase):
+ # Define the part for 2d arrays separately, so we can subclass this
+ # and run the tests using np.matrix in matrixlib.tests.test_matrix_linalg.
+ array = np.array
+
+ def test_matrix_empty(self):
+ assert_equal(norm(self.array([[]], dtype=self.dt)), 0.0)
+
+ def test_matrix_return_type(self):
+ a = self.array([[1, 0, 1], [0, 1, 1]])
+
+ exact_types = np.typecodes['AllInteger']
+
+ # float32, complex64, float64, complex128 types are the only types
+ # allowed by `linalg`, which performs the matrix operations used
+ # within `norm`.
+ inexact_types = 'fdFD'
+
+ all_types = exact_types + inexact_types
+
+ for each_inexact_types in all_types:
+ at = a.astype(each_inexact_types)
+
+ an = norm(at, -np.inf)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 2.0)
+
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "divide by zero encountered")
+ an = norm(at, -1)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 1.0)
+
+ an = norm(at, 1)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 2.0)
+
+ an = norm(at, 2)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 3.0**(1.0/2.0))
+
+ an = norm(at, -2)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 1.0)
+
+ an = norm(at, np.inf)
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 2.0)
+
+ an = norm(at, 'fro')
+ assert_(issubclass(an.dtype.type, np.floating))
+ assert_almost_equal(an, 2.0)
+
+ an = norm(at, 'nuc')
+ assert_(issubclass(an.dtype.type, np.floating))
+ # Lower bar needed to support low precision floats.
+ # They end up being off by 1 in the 7th place.
+ np.testing.assert_almost_equal(an, 2.7320508075688772, decimal=6)
+
+ def test_matrix_2x2(self):
+ A = self.array([[1, 3], [5, 7]], dtype=self.dt)
+ assert_almost_equal(norm(A), 84 ** 0.5)
+ assert_almost_equal(norm(A, 'fro'), 84 ** 0.5)
+ assert_almost_equal(norm(A, 'nuc'), 10.0)
+ assert_almost_equal(norm(A, inf), 12.0)
+ assert_almost_equal(norm(A, -inf), 4.0)
+ assert_almost_equal(norm(A, 1), 10.0)
+ assert_almost_equal(norm(A, -1), 6.0)
+ assert_almost_equal(norm(A, 2), 9.1231056256176615)
+ assert_almost_equal(norm(A, -2), 0.87689437438234041)
+
+ assert_raises(ValueError, norm, A, 'nofro')
+ assert_raises(ValueError, norm, A, -3)
+ assert_raises(ValueError, norm, A, 0)
+
+ def test_matrix_3x3(self):
+ # This test has been added because the 2x2 example
+ # happened to have equal nuclear norm and induced 1-norm.
+ # The 1/10 scaling factor accommodates the absolute tolerance
+ # used in assert_almost_equal.
+ A = (1 / 10) * \
+ self.array([[1, 2, 3], [6, 0, 5], [3, 2, 1]], dtype=self.dt)
+ assert_almost_equal(norm(A), (1 / 10) * 89 ** 0.5)
+ assert_almost_equal(norm(A, 'fro'), (1 / 10) * 89 ** 0.5)
+ assert_almost_equal(norm(A, 'nuc'), 1.3366836911774836)
+ assert_almost_equal(norm(A, inf), 1.1)
+ assert_almost_equal(norm(A, -inf), 0.6)
+ assert_almost_equal(norm(A, 1), 1.0)
+ assert_almost_equal(norm(A, -1), 0.4)
+ assert_almost_equal(norm(A, 2), 0.88722940323461277)
+ assert_almost_equal(norm(A, -2), 0.19456584790481812)
+
+ def test_bad_args(self):
+ # Check that bad arguments raise the appropriate exceptions.
+
+ A = self.array([[1, 2, 3], [4, 5, 6]], dtype=self.dt)
+ B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+
+ # Using `axis=<integer>` or passing in a 1-D array implies vector
+ # norms are being computed, so also using `ord='fro'`
+ # or `ord='nuc'` raises a ValueError.
+ assert_raises(ValueError, norm, A, 'fro', 0)
+ assert_raises(ValueError, norm, A, 'nuc', 0)
+ assert_raises(ValueError, norm, [3, 4], 'fro', None)
+ assert_raises(ValueError, norm, [3, 4], 'nuc', None)
+
+ # Similarly, norm should raise an exception when ord is any finite
+ # number other than 1, 2, -1 or -2 when computing matrix norms.
+ for order in [0, 3]:
+ assert_raises(ValueError, norm, A, order, None)
+ assert_raises(ValueError, norm, A, order, (0, 1))
+ assert_raises(ValueError, norm, B, order, (1, 2))
+
+ # Invalid axis
+ assert_raises(np.AxisError, norm, B, None, 3)
+ assert_raises(np.AxisError, norm, B, None, (2, 3))
+ assert_raises(ValueError, norm, B, None, (0, 1, 2))
+
+
+class _TestNorm(_TestNorm2D, _TestNormGeneral):
+ pass
+
+
+class TestNorm_NonSystematic(object):
+
+ def test_longdouble_norm(self):
+ # Non-regression test: p-norm of longdouble would previously raise
+ # UnboundLocalError.
+ x = np.arange(10, dtype=np.longdouble)
+ old_assert_almost_equal(norm(x, ord=3), 12.65, decimal=2)
+
+ def test_intmin(self):
+ # Non-regression test: p-norm of signed integer would previously do
+ # float cast and abs in the wrong order.
+ x = np.array([-2 ** 31], dtype=np.int32)
+ old_assert_almost_equal(norm(x, ord=3), 2 ** 31, decimal=5)
+
+ def test_complex_high_ord(self):
+ # gh-4156
+ d = np.empty((2,), dtype=np.clongdouble)
+ d[0] = 6 + 7j
+ d[1] = -6 + 7j
+ res = 11.615898132184
+ old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=10)
+ d = d.astype(np.complex128)
+ old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=9)
+ d = d.astype(np.complex64)
+ old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=5)
+
+
+# Separate definitions so we can use them for matrix tests.
+class _TestNormDoubleBase(_TestNormBase):
+ dt = np.double
+ dec = 12
+
+
+class _TestNormSingleBase(_TestNormBase):
+ dt = np.float32
+ dec = 6
+
+
+class _TestNormInt64Base(_TestNormBase):
+ dt = np.int64
+ dec = 12
+
+
+class TestNormDouble(_TestNorm, _TestNormDoubleBase):
+ pass
+
+
+class TestNormSingle(_TestNorm, _TestNormSingleBase):
+ pass
+
+
+class TestNormInt64(_TestNorm, _TestNormInt64Base):
+ pass
+
+
+class TestMatrixRank(object):
+
+ def test_matrix_rank(self):
+ # Full rank matrix
+ assert_equal(4, matrix_rank(np.eye(4)))
+ # rank deficient matrix
+ I = np.eye(4)
+ I[-1, -1] = 0.
+ assert_equal(matrix_rank(I), 3)
+ # All zeros - zero rank
+ assert_equal(matrix_rank(np.zeros((4, 4))), 0)
+ # 1 dimension - rank 1 unless all 0
+ assert_equal(matrix_rank([1, 0, 0, 0]), 1)
+ assert_equal(matrix_rank(np.zeros((4,))), 0)
+ # accepts array-like
+ assert_equal(matrix_rank([1]), 1)
+ # greater than 2 dimensions treated as stacked matrices
+ ms = np.array([I, np.eye(4), np.zeros((4,4))])
+ assert_equal(matrix_rank(ms), np.array([3, 4, 0]))
+ # works on scalar
+ assert_equal(matrix_rank(1), 1)
+
+ def test_symmetric_rank(self):
+ assert_equal(4, matrix_rank(np.eye(4), hermitian=True))
+ assert_equal(1, matrix_rank(np.ones((4, 4)), hermitian=True))
+ assert_equal(0, matrix_rank(np.zeros((4, 4)), hermitian=True))
+ # rank deficient matrix
+ I = np.eye(4)
+ I[-1, -1] = 0.
+ assert_equal(3, matrix_rank(I, hermitian=True))
+ # manually supplied tolerance
+ I[-1, -1] = 1e-8
+ assert_equal(4, matrix_rank(I, hermitian=True, tol=0.99e-8))
+ assert_equal(3, matrix_rank(I, hermitian=True, tol=1.01e-8))
+
+
+def test_reduced_rank():
+ # Test matrices with reduced rank
+ rng = np.random.RandomState(20120714)
+ for i in range(100):
+ # Make a rank deficient matrix
+ X = rng.normal(size=(40, 10))
+ X[:, 0] = X[:, 1] + X[:, 2]
+ # Assert that matrix_rank detected deficiency
+ assert_equal(matrix_rank(X), 9)
+ X[:, 3] = X[:, 4] + X[:, 5]
+ assert_equal(matrix_rank(X), 8)
+
+
+class TestQR(object):
+ # Define the array class here, so these tests can also be run on matrices elsewhere.
+ array = np.array
+
+ def check_qr(self, a):
+ # This test expects the argument `a` to be an ndarray or
+ # a subclass of an ndarray of inexact type.
+ a_type = type(a)
+ a_dtype = a.dtype
+ m, n = a.shape
+ k = min(m, n)
+
+ # mode == 'complete'
+ q, r = linalg.qr(a, mode='complete')
+ assert_(q.dtype == a_dtype)
+ assert_(r.dtype == a_dtype)
+ assert_(isinstance(q, a_type))
+ assert_(isinstance(r, a_type))
+ assert_(q.shape == (m, m))
+ assert_(r.shape == (m, n))
+ assert_almost_equal(dot(q, r), a)
+ assert_almost_equal(dot(q.T.conj(), q), np.eye(m))
+ assert_almost_equal(np.triu(r), r)
+
+ # mode == 'reduced'
+ q1, r1 = linalg.qr(a, mode='reduced')
+ assert_(q1.dtype == a_dtype)
+ assert_(r1.dtype == a_dtype)
+ assert_(isinstance(q1, a_type))
+ assert_(isinstance(r1, a_type))
+ assert_(q1.shape == (m, k))
+ assert_(r1.shape == (k, n))
+ assert_almost_equal(dot(q1, r1), a)
+ assert_almost_equal(dot(q1.T.conj(), q1), np.eye(k))
+ assert_almost_equal(np.triu(r1), r1)
+
+ # mode == 'r'
+ r2 = linalg.qr(a, mode='r')
+ assert_(r2.dtype == a_dtype)
+ assert_(isinstance(r2, a_type))
+ assert_almost_equal(r2, r1)
+
+
+ @pytest.mark.parametrize(["m", "n"], [
+ (3, 0),
+ (0, 3),
+ (0, 0)
+ ])
+ def test_qr_empty(self, m, n):
+ k = min(m, n)
+ a = np.empty((m, n))
+
+ self.check_qr(a)
+
+ h, tau = np.linalg.qr(a, mode='raw')
+ assert_equal(h.dtype, np.double)
+ assert_equal(tau.dtype, np.double)
+ assert_equal(h.shape, (n, m))
+ assert_equal(tau.shape, (k,))
+
+ def test_mode_raw(self):
+ # The factorization is not unique and varies between libraries,
+ # so it is not possible to check against known values. Functional
+ # testing is a possibility, but awaits the exposure of more
+ # of the functions in lapack_lite. Consequently, this test is
+ # very limited in scope. Note that the results are in FORTRAN
+ # order, hence the h arrays are transposed.
+ a = self.array([[1, 2], [3, 4], [5, 6]], dtype=np.double)
+
+ # Test double
+ h, tau = linalg.qr(a, mode='raw')
+ assert_(h.dtype == np.double)
+ assert_(tau.dtype == np.double)
+ assert_(h.shape == (2, 3))
+ assert_(tau.shape == (2,))
+
+ h, tau = linalg.qr(a.T, mode='raw')
+ assert_(h.dtype == np.double)
+ assert_(tau.dtype == np.double)
+ assert_(h.shape == (3, 2))
+ assert_(tau.shape == (2,))
+
+ def test_mode_all_but_economic(self):
+ a = self.array([[1, 2], [3, 4]])
+ b = self.array([[1, 2], [3, 4], [5, 6]])
+ for dt in "fd":
+ m1 = a.astype(dt)
+ m2 = b.astype(dt)
+ self.check_qr(m1)
+ self.check_qr(m2)
+ self.check_qr(m2.T)
+
+ for dt in "fd":
+ m1 = 1 + 1j * a.astype(dt)
+ m2 = 1 + 1j * b.astype(dt)
+ self.check_qr(m1)
+ self.check_qr(m2)
+ self.check_qr(m2.T)
+
+
+class TestCholesky(object):
+ # TODO: are there no other tests for cholesky?
+
+ def test_basic_property(self):
+ # Check A = L L^H
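+ # Build a Hermitian positive-definite matrix as a^H a (conjugate-transposing
+ # the last two axes), then check that the Cholesky factor reproduces it.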
+ shapes = [(1, 1), (2, 2), (3, 3), (50, 50), (3, 10, 10)]
+ dtypes = (np.float32, np.float64, np.complex64, np.complex128)
+
+ for shape, dtype in itertools.product(shapes, dtypes):
+ np.random.seed(1)
+ a = np.random.randn(*shape)
+ if np.issubdtype(dtype, np.complexfloating):
+ a = a + 1j*np.random.randn(*shape)
+
+ t = list(range(len(shape)))
+ t[-2:] = -1, -2
+
+ a = np.matmul(a.transpose(t).conj(), a)
+ a = np.asarray(a, dtype=dtype)
+
+ c = np.linalg.cholesky(a)
+
+ b = np.matmul(c, c.transpose(t).conj())
+ assert_allclose(b, a,
+ err_msg="{} {}\n{}\n{}".format(shape, dtype, a, c),
+ atol=500 * a.shape[0] * np.finfo(dtype).eps)
+
+ def test_0_size(self):
+ class ArraySubclass(np.ndarray):
+ pass
+ a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+ res = linalg.cholesky(a)
+ assert_equal(a.shape, res.shape)
+ assert_(res.dtype.type is np.float64)
+ # for documentation purposes:
+ assert_(isinstance(res, np.ndarray))
+
+ a = np.zeros((1, 0, 0), dtype=np.complex64).view(ArraySubclass)
+ res = linalg.cholesky(a)
+ assert_equal(a.shape, res.shape)
+ assert_(res.dtype.type is np.complex64)
+ assert_(isinstance(res, np.ndarray))
+
+
+def test_byteorder_check():
+ # Byte order check should pass for native order
+ if sys.byteorder == 'little':
+ native = '<'
+ else:
+ native = '>'
+
+ for dtt in (np.float32, np.float64):
+ arr = np.eye(4, dtype=dtt)
+ n_arr = arr.newbyteorder(native)
+ sw_arr = arr.newbyteorder('S').byteswap()
+ assert_equal(arr.dtype.byteorder, '=')
+ for routine in (linalg.inv, linalg.det, linalg.pinv):
+ # Normal call
+ res = routine(arr)
+ # Native but not '='
+ assert_array_equal(res, routine(n_arr))
+ # Swapped
+ assert_array_equal(res, routine(sw_arr))
+
+
+def test_generalized_raise_multiloop():
+ # It should raise an error even if the error doesn't occur in the
+ # last iteration of the ufunc inner loop
+
+ invertible = np.array([[1, 2], [3, 4]])
+ non_invertible = np.array([[1, 1], [1, 1]])
+
+ x = np.zeros([4, 4, 2, 2])[1::2]
+ x[...] = invertible
+ x[0, 0] = non_invertible
+
+ assert_raises(np.linalg.LinAlgError, np.linalg.inv, x)
+
+
+def test_xerbla_override():
+ # Check that our xerbla has been successfully linked in. If it is not,
+ # the default xerbla routine is called, which prints a message to stdout
+ # and may or may not abort the process, depending on the LAPACK package.
+
+ XERBLA_OK = 255
+
+ try:
+ pid = os.fork()
+ except (OSError, AttributeError):
+ # fork failed, or not running on POSIX
+ pytest.skip("Not POSIX or fork failed.")
+
+ if pid == 0:
+ # child; close i/o file handles
+ os.close(1)
+ os.close(0)
+ # Avoid producing core files.
+ import resource
+ resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
+ # These calls may abort.
+ try:
+ np.linalg.lapack_lite.xerbla()
+ except ValueError:
+ pass
+ except Exception:
+ os._exit(os.EX_CONFIG)
+
+ try:
+ a = np.array([[1.]])
+ np.linalg.lapack_lite.dorgqr(
+ 1, 1, 1, a,
+ 0, # <- invalid value
+ a, a, 0, 0)
+ except ValueError as e:
+ if "DORGQR parameter number 5" in str(e):
+ # success, reuse error code to mark success as
+ # FORTRAN STOP returns as success.
+ os._exit(XERBLA_OK)
+
+ # Did not abort, but our xerbla was not linked in.
+ os._exit(os.EX_CONFIG)
+ else:
+ # parent
+ pid, status = os.wait()
+ if os.WEXITSTATUS(status) != XERBLA_OK:
+ pytest.skip('Numpy xerbla not linked in.')
+
+
+def test_sdot_bug_8577():
+ # Regression test that loading certain other libraries does not
+ # lead to wrong results in float32 linear algebra.
+ #
+ # There's a bug gh-8577 on OSX that can trigger this, and perhaps
+ # there are also other situations in which it occurs.
+ #
+ # Do the check in a separate process.
+
+ bad_libs = ['PyQt5.QtWidgets', 'IPython']
+
+ template = textwrap.dedent("""
+ import sys
+ {before}
+ try:
+ import {bad_lib}
+ except ImportError:
+ sys.exit(0)
+ {after}
+ x = np.ones(2, dtype=np.float32)
+ sys.exit(0 if np.allclose(x.dot(x), 2.0) else 1)
+ """)
+
+ for bad_lib in bad_libs:
+ code = template.format(before="import numpy as np", after="",
+ bad_lib=bad_lib)
+ subprocess.check_call([sys.executable, "-c", code])
+
+ # Swapped import order
+ code = template.format(after="import numpy as np", before="",
+ bad_lib=bad_lib)
+ subprocess.check_call([sys.executable, "-c", code])
+
+
+class TestMultiDot(object):
+
+ def test_basic_function_with_three_arguments(self):
+ # multi_dot with three arguments uses a fast hand-coded algorithm to
+ # determine the optimal order. Therefore test it separately.
+ A = np.random.random((6, 2))
+ B = np.random.random((2, 6))
+ C = np.random.random((6, 2))
+
+ assert_almost_equal(multi_dot([A, B, C]), A.dot(B).dot(C))
+ assert_almost_equal(multi_dot([A, B, C]), np.dot(A, np.dot(B, C)))
+
+ def test_basic_function_with_two_arguments(self):
+ # separate code path with two arguments
+ A = np.random.random((6, 2))
+ B = np.random.random((2, 6))
+
+ assert_almost_equal(multi_dot([A, B]), A.dot(B))
+ assert_almost_equal(multi_dot([A, B]), np.dot(A, B))
+
+ def test_basic_function_with_dynamic_programing_optimization(self):
+ # multi_dot with four or more arguments uses the dynamic programming
+ # optimization and therefore deserves a separate test.
+ A = np.random.random((6, 2))
+ B = np.random.random((2, 6))
+ C = np.random.random((6, 2))
+ D = np.random.random((2, 1))
+ assert_almost_equal(multi_dot([A, B, C, D]), A.dot(B).dot(C).dot(D))
+
+ def test_vector_as_first_argument(self):
+ # The first argument can be 1-D
+ A1d = np.random.random(2) # 1-D
+ B = np.random.random((2, 6))
+ C = np.random.random((6, 2))
+ D = np.random.random((2, 2))
+
+ # the result should be 1-D
+ assert_equal(multi_dot([A1d, B, C, D]).shape, (2,))
+
+ def test_vector_as_last_argument(self):
+ # The last argument can be 1-D
+ A = np.random.random((6, 2))
+ B = np.random.random((2, 6))
+ C = np.random.random((6, 2))
+ D1d = np.random.random(2) # 1-D
+
+ # the result should be 1-D
+ assert_equal(multi_dot([A, B, C, D1d]).shape, (6,))
+
+ def test_vector_as_first_and_last_argument(self):
+ # The first and last arguments can be 1-D
+ A1d = np.random.random(2) # 1-D
+ B = np.random.random((2, 6))
+ C = np.random.random((6, 2))
+ D1d = np.random.random(2) # 1-D
+
+ # the result should be a scalar
+ assert_equal(multi_dot([A1d, B, C, D1d]).shape, ())
+
+ def test_dynamic_programming_logic(self):
+ # Test for the dynamic programming part
+ # This test is directly taken from Cormen page 376.
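+        # (For reference, the expected costs m and split points s below follow
+        # the classic matrix-chain recurrence
+        # m[i, j] = min over i <= k < j of m[i, k] + m[k+1, j] + p[i-1]*p[k]*p[j],
+        # where p holds the chain dimensions 30, 35, 15, 5, 10, 20, 25.)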
+ arrays = [np.random.random((30, 35)),
+ np.random.random((35, 15)),
+ np.random.random((15, 5)),
+ np.random.random((5, 10)),
+ np.random.random((10, 20)),
+ np.random.random((20, 25))]
+ m_expected = np.array([[0., 15750., 7875., 9375., 11875., 15125.],
+ [0., 0., 2625., 4375., 7125., 10500.],
+ [0., 0., 0., 750., 2500., 5375.],
+ [0., 0., 0., 0., 1000., 3500.],
+ [0., 0., 0., 0., 0., 5000.],
+ [0., 0., 0., 0., 0., 0.]])
+ s_expected = np.array([[0, 1, 1, 3, 3, 3],
+ [0, 0, 2, 3, 3, 3],
+ [0, 0, 0, 3, 3, 3],
+ [0, 0, 0, 0, 4, 5],
+ [0, 0, 0, 0, 0, 5],
+ [0, 0, 0, 0, 0, 0]], dtype=int)
+ s_expected -= 1 # Cormen uses 1-based index, python does not.
+
+ s, m = _multi_dot_matrix_chain_order(arrays, return_costs=True)
+
+ # Only the upper triangular part (without the diagonal) is interesting.
+ assert_almost_equal(np.triu(s[:-1, 1:]),
+ np.triu(s_expected[:-1, 1:]))
+ assert_almost_equal(np.triu(m), np.triu(m_expected))
+
+ def test_too_few_input_arrays(self):
+ assert_raises(ValueError, multi_dot, [])
+ assert_raises(ValueError, multi_dot, [np.random.random((3, 3))])
+
+
+class TestTensorinv(object):
+
+ @pytest.mark.parametrize("arr, ind", [
+ (np.ones((4, 6, 8, 2)), 2),
+ (np.ones((3, 3, 2)), 1),
+ ])
+ def test_non_square_handling(self, arr, ind):
+ with assert_raises(LinAlgError):
+ linalg.tensorinv(arr, ind=ind)
+
+ @pytest.mark.parametrize("shape, ind", [
+ # examples from docstring
+ ((4, 6, 8, 3), 2),
+ ((24, 8, 3), 1),
+ ])
+ def test_tensorinv_shape(self, shape, ind):
+ a = np.eye(24)
+ a.shape = shape
+ ainv = linalg.tensorinv(a=a, ind=ind)
+ expected = a.shape[ind:] + a.shape[:ind]
+ actual = ainv.shape
+ assert_equal(actual, expected)
+
+ @pytest.mark.parametrize("ind", [
+ 0, -2,
+ ])
+ def test_tensorinv_ind_limit(self, ind):
+ a = np.eye(24)
+ a.shape = (4, 6, 8, 3)
+ with assert_raises(ValueError):
+ linalg.tensorinv(a=a, ind=ind)
+
+ def test_tensorinv_result(self):
+ # mimic a docstring example
+ a = np.eye(24)
+ a.shape = (24, 8, 3)
+ ainv = linalg.tensorinv(a, ind=1)
+ b = np.ones(24)
+ assert_allclose(np.tensordot(ainv, b, 1), np.linalg.tensorsolve(a, b))
diff --git a/contrib/python/numpy/py2/numpy/linalg/tests/test_regression.py b/contrib/python/numpy/py2/numpy/linalg/tests/test_regression.py
new file mode 100644
index 00000000000..bd3a45872cb
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/tests/test_regression.py
@@ -0,0 +1,150 @@
+""" Test functions for linalg module
+"""
+from __future__ import division, absolute_import, print_function
+
+import warnings
+
+import numpy as np
+from numpy import linalg, arange, float64, array, dot, transpose
+from numpy.testing import (
+ assert_, assert_raises, assert_equal, assert_array_equal,
+ assert_array_almost_equal, assert_array_less
+)
+
+
+class TestRegression(object):
+
+ def test_eig_build(self):
+ # Ticket #652
+ rva = array([1.03221168e+02 + 0.j,
+ -1.91843603e+01 + 0.j,
+ -6.04004526e-01 + 15.84422474j,
+ -6.04004526e-01 - 15.84422474j,
+ -1.13692929e+01 + 0.j,
+ -6.57612485e-01 + 10.41755503j,
+ -6.57612485e-01 - 10.41755503j,
+ 1.82126812e+01 + 0.j,
+ 1.06011014e+01 + 0.j,
+ 7.80732773e+00 + 0.j,
+ -7.65390898e-01 + 0.j,
+ 1.51971555e-15 + 0.j,
+ -1.51308713e-15 + 0.j])
+ a = arange(13 * 13, dtype=float64)
+ a.shape = (13, 13)
+ a = a % 17
+ va, ve = linalg.eig(a)
+ va.sort()
+ rva.sort()
+ assert_array_almost_equal(va, rva)
+
+ def test_eigh_build(self):
+ # Ticket 662.
+ rvals = [68.60568999, 89.57756725, 106.67185574]
+
+ cov = array([[77.70273908, 3.51489954, 15.64602427],
+ [3.51489954, 88.97013878, -1.07431931],
+ [15.64602427, -1.07431931, 98.18223512]])
+
+ vals, vecs = linalg.eigh(cov)
+ assert_array_almost_equal(vals, rvals)
+
+ def test_svd_build(self):
+ # Ticket 627.
+ a = array([[0., 1.], [1., 1.], [2., 1.], [3., 1.]])
+ m, n = a.shape
+ u, s, vh = linalg.svd(a)
+
+ b = dot(transpose(u[:, n:]), a)
+
+ assert_array_almost_equal(b, np.zeros((2, 2)))
+
+ def test_norm_vector_badarg(self):
+        # Regression for #786: Frobenius norm for vectors raises
+ # TypeError.
+ assert_raises(ValueError, linalg.norm, array([1., 2., 3.]), 'fro')
+
+ def test_lapack_endian(self):
+ # For bug #1482
+ a = array([[5.7998084, -2.1825367],
+ [-2.1825367, 9.85910595]], dtype='>f8')
+ b = array(a, dtype='<f8')
+
+ ap = linalg.cholesky(a)
+ bp = linalg.cholesky(b)
+ assert_array_equal(ap, bp)
+
+ def test_large_svd_32bit(self):
+ # See gh-4442, 64bit would require very large/slow matrices.
+ x = np.eye(1000, 66)
+ np.linalg.svd(x)
+
+ def test_svd_no_uv(self):
+ # gh-4733
+ for shape in (3, 4), (4, 4), (4, 3):
+ for t in float, complex:
+ a = np.ones(shape, dtype=t)
+ w = linalg.svd(a, compute_uv=False)
+ c = np.count_nonzero(np.absolute(w) > 0.5)
+ assert_equal(c, 1)
+ assert_equal(np.linalg.matrix_rank(a), 1)
+ assert_array_less(1, np.linalg.norm(a, ord=2))
+
+ def test_norm_object_array(self):
+ # gh-7575
+ testvector = np.array([np.array([0, 1]), 0, 0], dtype=object)
+
+ norm = linalg.norm(testvector)
+ assert_array_equal(norm, [0, 1])
+ assert_(norm.dtype == np.dtype('float64'))
+
+ norm = linalg.norm(testvector, ord=1)
+ assert_array_equal(norm, [0, 1])
+ assert_(norm.dtype != np.dtype('float64'))
+
+ norm = linalg.norm(testvector, ord=2)
+ assert_array_equal(norm, [0, 1])
+ assert_(norm.dtype == np.dtype('float64'))
+
+ assert_raises(ValueError, linalg.norm, testvector, ord='fro')
+ assert_raises(ValueError, linalg.norm, testvector, ord='nuc')
+ assert_raises(ValueError, linalg.norm, testvector, ord=np.inf)
+ assert_raises(ValueError, linalg.norm, testvector, ord=-np.inf)
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", DeprecationWarning)
+ assert_raises((AttributeError, DeprecationWarning),
+ linalg.norm, testvector, ord=0)
+ assert_raises(ValueError, linalg.norm, testvector, ord=-1)
+ assert_raises(ValueError, linalg.norm, testvector, ord=-2)
+
+ testmatrix = np.array([[np.array([0, 1]), 0, 0],
+ [0, 0, 0]], dtype=object)
+
+ norm = linalg.norm(testmatrix)
+ assert_array_equal(norm, [0, 1])
+ assert_(norm.dtype == np.dtype('float64'))
+
+ norm = linalg.norm(testmatrix, ord='fro')
+ assert_array_equal(norm, [0, 1])
+ assert_(norm.dtype == np.dtype('float64'))
+
+ assert_raises(TypeError, linalg.norm, testmatrix, ord='nuc')
+ assert_raises(ValueError, linalg.norm, testmatrix, ord=np.inf)
+ assert_raises(ValueError, linalg.norm, testmatrix, ord=-np.inf)
+ assert_raises(ValueError, linalg.norm, testmatrix, ord=0)
+ assert_raises(ValueError, linalg.norm, testmatrix, ord=1)
+ assert_raises(ValueError, linalg.norm, testmatrix, ord=-1)
+ assert_raises(TypeError, linalg.norm, testmatrix, ord=2)
+ assert_raises(TypeError, linalg.norm, testmatrix, ord=-2)
+ assert_raises(ValueError, linalg.norm, testmatrix, ord=3)
+
+ def test_lstsq_complex_larger_rhs(self):
+ # gh-9891
+ size = 20
+ n_rhs = 70
+ G = np.random.randn(size, size) + 1j * np.random.randn(size, size)
+ u = np.random.randn(size, n_rhs) + 1j * np.random.randn(size, n_rhs)
+ b = G.dot(u)
+ # This should work without segmentation fault.
+ u_lstsq, res, rank, sv = linalg.lstsq(G, b, rcond=None)
+ # check results just in case
+ assert_array_almost_equal(u_lstsq, u)
diff --git a/contrib/python/numpy/py2/numpy/linalg/umath_linalg.c.src b/contrib/python/numpy/py2/numpy/linalg/umath_linalg.c.src
new file mode 100644
index 00000000000..9fc68a7aa93
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/linalg/umath_linalg.c.src
@@ -0,0 +1,3688 @@
+/* -*- c -*- */
+
+/*
+ *****************************************************************************
+ ** INCLUDES **
+ *****************************************************************************
+ */
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_config.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+
+
+static const char* umath_linalg_version_string = "0.1.5";
+
+/*
+ ****************************************************************************
+ * Debugging support *
+ ****************************************************************************
+ */
+#define TRACE_TXT(...) do { fprintf (stderr, __VA_ARGS__); } while (0)
+#define STACK_TRACE do {} while (0)
+#define TRACE\
+ do { \
+ fprintf (stderr, \
+ "%s:%d:%s\n", \
+ __FILE__, \
+ __LINE__, \
+ __FUNCTION__); \
+ STACK_TRACE; \
+ } while (0)
+
+#if 0
+#include <execinfo.h>
+void
+dbg_stack_trace()
+{
+ void *trace[32];
+ size_t size;
+
+ size = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
+ backtrace_symbols_fd(trace, size, 1);
+}
+
+#undef STACK_TRACE
+#define STACK_TRACE do { dbg_stack_trace(); } while (0)
+#endif
+
+/*
+ *****************************************************************************
+ * BLAS/LAPACK calling macros *
+ *****************************************************************************
+ */
+
+#ifdef NO_APPEND_FORTRAN
+# define FNAME(x) x
+#else
+# define FNAME(x) x##_
+#endif
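+
+/*
+ * For example, FNAME(dgesv) expands to dgesv_ (the usual trailing-underscore
+ * Fortran name mangling) unless NO_APPEND_FORTRAN is defined, in which case
+ * it expands to plain dgesv.
+ */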
+
+typedef struct { float r, i; } f2c_complex;
+typedef struct { double r, i; } f2c_doublecomplex;
+/* typedef long int (*L_fp)(); */
+
+extern int
+FNAME(sgeev)(char *jobvl, char *jobvr, int *n,
+ float a[], int *lda, float wr[], float wi[],
+ float vl[], int *ldvl, float vr[], int *ldvr,
+ float work[], int lwork[],
+ int *info);
+extern int
+FNAME(dgeev)(char *jobvl, char *jobvr, int *n,
+ double a[], int *lda, double wr[], double wi[],
+ double vl[], int *ldvl, double vr[], int *ldvr,
+ double work[], int lwork[],
+ int *info);
+extern int
+FNAME(cgeev)(char *jobvl, char *jobvr, int *n,
+ f2c_doublecomplex a[], int *lda,
+ f2c_doublecomplex w[],
+ f2c_doublecomplex vl[], int *ldvl,
+ f2c_doublecomplex vr[], int *ldvr,
+ f2c_doublecomplex work[], int *lwork,
+ double rwork[],
+ int *info);
+extern int
+FNAME(zgeev)(char *jobvl, char *jobvr, int *n,
+ f2c_doublecomplex a[], int *lda,
+ f2c_doublecomplex w[],
+ f2c_doublecomplex vl[], int *ldvl,
+ f2c_doublecomplex vr[], int *ldvr,
+ f2c_doublecomplex work[], int *lwork,
+ double rwork[],
+ int *info);
+
+extern int
+FNAME(ssyevd)(char *jobz, char *uplo, int *n,
+ float a[], int *lda, float w[], float work[],
+ int *lwork, int iwork[], int *liwork,
+ int *info);
+extern int
+FNAME(dsyevd)(char *jobz, char *uplo, int *n,
+ double a[], int *lda, double w[], double work[],
+ int *lwork, int iwork[], int *liwork,
+ int *info);
+extern int
+FNAME(cheevd)(char *jobz, char *uplo, int *n,
+ f2c_complex a[], int *lda,
+ float w[], f2c_complex work[],
+ int *lwork, float rwork[], int *lrwork, int iwork[],
+ int *liwork,
+ int *info);
+extern int
+FNAME(zheevd)(char *jobz, char *uplo, int *n,
+ f2c_doublecomplex a[], int *lda,
+ double w[], f2c_doublecomplex work[],
+ int *lwork, double rwork[], int *lrwork, int iwork[],
+ int *liwork,
+ int *info);
+
+extern int
+FNAME(sgelsd)(int *m, int *n, int *nrhs,
+ float a[], int *lda, float b[], int *ldb,
+ float s[], float *rcond, int *rank,
+ float work[], int *lwork, int iwork[],
+ int *info);
+extern int
+FNAME(dgelsd)(int *m, int *n, int *nrhs,
+ double a[], int *lda, double b[], int *ldb,
+ double s[], double *rcond, int *rank,
+ double work[], int *lwork, int iwork[],
+ int *info);
+extern int
+FNAME(cgelsd)(int *m, int *n, int *nrhs,
+ f2c_complex a[], int *lda,
+ f2c_complex b[], int *ldb,
+ float s[], float *rcond, int *rank,
+ f2c_complex work[], int *lwork,
+ float rwork[], int iwork[],
+ int *info);
+extern int
+FNAME(zgelsd)(int *m, int *n, int *nrhs,
+ f2c_doublecomplex a[], int *lda,
+ f2c_doublecomplex b[], int *ldb,
+ double s[], double *rcond, int *rank,
+ f2c_doublecomplex work[], int *lwork,
+ double rwork[], int iwork[],
+ int *info);
+
+extern int
+FNAME(sgesv)(int *n, int *nrhs,
+ float a[], int *lda,
+ int ipiv[],
+ float b[], int *ldb,
+ int *info);
+extern int
+FNAME(dgesv)(int *n, int *nrhs,
+ double a[], int *lda,
+ int ipiv[],
+ double b[], int *ldb,
+ int *info);
+extern int
+FNAME(cgesv)(int *n, int *nrhs,
+ f2c_complex a[], int *lda,
+ int ipiv[],
+ f2c_complex b[], int *ldb,
+ int *info);
+extern int
+FNAME(zgesv)(int *n, int *nrhs,
+ f2c_doublecomplex a[], int *lda,
+ int ipiv[],
+ f2c_doublecomplex b[], int *ldb,
+ int *info);
+
+extern int
+FNAME(sgetrf)(int *m, int *n,
+ float a[], int *lda,
+ int ipiv[],
+ int *info);
+extern int
+FNAME(dgetrf)(int *m, int *n,
+ double a[], int *lda,
+ int ipiv[],
+ int *info);
+extern int
+FNAME(cgetrf)(int *m, int *n,
+ f2c_complex a[], int *lda,
+ int ipiv[],
+ int *info);
+extern int
+FNAME(zgetrf)(int *m, int *n,
+ f2c_doublecomplex a[], int *lda,
+ int ipiv[],
+ int *info);
+
+extern int
+FNAME(spotrf)(char *uplo, int *n,
+ float a[], int *lda,
+ int *info);
+extern int
+FNAME(dpotrf)(char *uplo, int *n,
+ double a[], int *lda,
+ int *info);
+extern int
+FNAME(cpotrf)(char *uplo, int *n,
+ f2c_complex a[], int *lda,
+ int *info);
+extern int
+FNAME(zpotrf)(char *uplo, int *n,
+ f2c_doublecomplex a[], int *lda,
+ int *info);
+
+extern int
+FNAME(sgesdd)(char *jobz, int *m, int *n,
+ float a[], int *lda, float s[], float u[],
+ int *ldu, float vt[], int *ldvt, float work[],
+ int *lwork, int iwork[], int *info);
+extern int
+FNAME(dgesdd)(char *jobz, int *m, int *n,
+ double a[], int *lda, double s[], double u[],
+ int *ldu, double vt[], int *ldvt, double work[],
+ int *lwork, int iwork[], int *info);
+extern int
+FNAME(cgesdd)(char *jobz, int *m, int *n,
+ f2c_complex a[], int *lda,
+ float s[], f2c_complex u[], int *ldu,
+ f2c_complex vt[], int *ldvt,
+ f2c_complex work[], int *lwork,
+ float rwork[], int iwork[], int *info);
+extern int
+FNAME(zgesdd)(char *jobz, int *m, int *n,
+ f2c_doublecomplex a[], int *lda,
+ double s[], f2c_doublecomplex u[], int *ldu,
+ f2c_doublecomplex vt[], int *ldvt,
+ f2c_doublecomplex work[], int *lwork,
+ double rwork[], int iwork[], int *info);
+
+extern int
+FNAME(spotrs)(char *uplo, int *n, int *nrhs,
+ float a[], int *lda,
+ float b[], int *ldb,
+ int *info);
+extern int
+FNAME(dpotrs)(char *uplo, int *n, int *nrhs,
+ double a[], int *lda,
+ double b[], int *ldb,
+ int *info);
+extern int
+FNAME(cpotrs)(char *uplo, int *n, int *nrhs,
+ f2c_complex a[], int *lda,
+ f2c_complex b[], int *ldb,
+ int *info);
+extern int
+FNAME(zpotrs)(char *uplo, int *n, int *nrhs,
+ f2c_doublecomplex a[], int *lda,
+ f2c_doublecomplex b[], int *ldb,
+ int *info);
+
+extern int
+FNAME(spotri)(char *uplo, int *n,
+ float a[], int *lda,
+ int *info);
+extern int
+FNAME(dpotri)(char *uplo, int *n,
+ double a[], int *lda,
+ int *info);
+extern int
+FNAME(cpotri)(char *uplo, int *n,
+ f2c_complex a[], int *lda,
+ int *info);
+extern int
+FNAME(zpotri)(char *uplo, int *n,
+ f2c_doublecomplex a[], int *lda,
+ int *info);
+
+extern int
+FNAME(scopy)(int *n,
+ float *sx, int *incx,
+ float *sy, int *incy);
+extern int
+FNAME(dcopy)(int *n,
+ double *sx, int *incx,
+ double *sy, int *incy);
+extern int
+FNAME(ccopy)(int *n,
+ f2c_complex *sx, int *incx,
+ f2c_complex *sy, int *incy);
+extern int
+FNAME(zcopy)(int *n,
+ f2c_doublecomplex *sx, int *incx,
+ f2c_doublecomplex *sy, int *incy);
+
+extern float
+FNAME(sdot)(int *n,
+ float *sx, int *incx,
+ float *sy, int *incy);
+extern double
+FNAME(ddot)(int *n,
+ double *sx, int *incx,
+ double *sy, int *incy);
+extern void
+FNAME(cdotu)(f2c_complex *ret, int *n,
+ f2c_complex *sx, int *incx,
+ f2c_complex *sy, int *incy);
+extern void
+FNAME(zdotu)(f2c_doublecomplex *ret, int *n,
+ f2c_doublecomplex *sx, int *incx,
+ f2c_doublecomplex *sy, int *incy);
+extern void
+FNAME(cdotc)(f2c_complex *ret, int *n,
+ f2c_complex *sx, int *incx,
+ f2c_complex *sy, int *incy);
+extern void
+FNAME(zdotc)(f2c_doublecomplex *ret, int *n,
+ f2c_doublecomplex *sx, int *incx,
+ f2c_doublecomplex *sy, int *incy);
+
+extern int
+FNAME(sgemm)(char *transa, char *transb,
+ int *m, int *n, int *k,
+ float *alpha,
+ float *a, int *lda,
+ float *b, int *ldb,
+ float *beta,
+ float *c, int *ldc);
+extern int
+FNAME(dgemm)(char *transa, char *transb,
+ int *m, int *n, int *k,
+ double *alpha,
+ double *a, int *lda,
+ double *b, int *ldb,
+ double *beta,
+ double *c, int *ldc);
+extern int
+FNAME(cgemm)(char *transa, char *transb,
+ int *m, int *n, int *k,
+ f2c_complex *alpha,
+ f2c_complex *a, int *lda,
+ f2c_complex *b, int *ldb,
+ f2c_complex *beta,
+ f2c_complex *c, int *ldc);
+extern int
+FNAME(zgemm)(char *transa, char *transb,
+ int *m, int *n, int *k,
+ f2c_doublecomplex *alpha,
+ f2c_doublecomplex *a, int *lda,
+ f2c_doublecomplex *b, int *ldb,
+ f2c_doublecomplex *beta,
+ f2c_doublecomplex *c, int *ldc);
+
+
+#define LAPACK_T(FUNC) \
+ TRACE_TXT("Calling LAPACK ( " # FUNC " )\n"); \
+ FNAME(FUNC)
+
+#define BLAS(FUNC) \
+ FNAME(FUNC)
+
+#define LAPACK(FUNC) \
+ FNAME(FUNC)
+
+typedef int fortran_int;
+typedef float fortran_real;
+typedef double fortran_doublereal;
+typedef f2c_complex fortran_complex;
+typedef f2c_doublecomplex fortran_doublecomplex;
+
+
+/*
+ *****************************************************************************
+ ** Some handy functions **
+ *****************************************************************************
+ */
+
+static NPY_INLINE int
+get_fp_invalid_and_clear(void)
+{
+ int status;
+ status = npy_clear_floatstatus_barrier((char*)&status);
+ return !!(status & NPY_FPE_INVALID);
+}
+
+static NPY_INLINE void
+set_fp_invalid_or_clear(int error_occurred)
+{
+ if (error_occurred) {
+ npy_set_floatstatus_invalid();
+ }
+ else {
+ npy_clear_floatstatus_barrier((char*)&error_occurred);
+ }
+}
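+
+/*
+ * Note (illustrative): the gufunc loop bodies below report failures by
+ * raising the floating-point "invalid" status via set_fp_invalid_or_clear();
+ * the Python wrappers in numpy.linalg are expected to turn that status into
+ * a LinAlgError through the ufunc error-handling machinery.
+ */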
+
+/*
+ *****************************************************************************
+ ** Some handy constants **
+ *****************************************************************************
+ */
+
+#define UMATH_LINALG_MODULE_NAME "_umath_linalg"
+
+typedef union {
+ fortran_complex f;
+ npy_cfloat npy;
+ float array[2];
+} COMPLEX_t;
+
+typedef union {
+ fortran_doublecomplex f;
+ npy_cdouble npy;
+ double array[2];
+} DOUBLECOMPLEX_t;
+
+static float s_one;
+static float s_zero;
+static float s_minus_one;
+static float s_ninf;
+static float s_nan;
+static double d_one;
+static double d_zero;
+static double d_minus_one;
+static double d_ninf;
+static double d_nan;
+static COMPLEX_t c_one;
+static COMPLEX_t c_zero;
+static COMPLEX_t c_minus_one;
+static COMPLEX_t c_ninf;
+static COMPLEX_t c_nan;
+static DOUBLECOMPLEX_t z_one;
+static DOUBLECOMPLEX_t z_zero;
+static DOUBLECOMPLEX_t z_minus_one;
+static DOUBLECOMPLEX_t z_ninf;
+static DOUBLECOMPLEX_t z_nan;
+
+static void init_constants(void)
+{
+ /*
+ this is needed as NPY_INFINITY and NPY_NAN macros
+ can't be used as initializers. I prefer to just set
+ all the constants the same way.
+ */
+ s_one = 1.0f;
+ s_zero = 0.0f;
+ s_minus_one = -1.0f;
+ s_ninf = -NPY_INFINITYF;
+ s_nan = NPY_NANF;
+
+ d_one = 1.0;
+ d_zero = 0.0;
+ d_minus_one = -1.0;
+ d_ninf = -NPY_INFINITY;
+ d_nan = NPY_NAN;
+
+ c_one.array[0] = 1.0f;
+ c_one.array[1] = 0.0f;
+ c_zero.array[0] = 0.0f;
+ c_zero.array[1] = 0.0f;
+ c_minus_one.array[0] = -1.0f;
+ c_minus_one.array[1] = 0.0f;
+ c_ninf.array[0] = -NPY_INFINITYF;
+ c_ninf.array[1] = 0.0f;
+ c_nan.array[0] = NPY_NANF;
+ c_nan.array[1] = NPY_NANF;
+
+ z_one.array[0] = 1.0;
+ z_one.array[1] = 0.0;
+ z_zero.array[0] = 0.0;
+ z_zero.array[1] = 0.0;
+ z_minus_one.array[0] = -1.0;
+ z_minus_one.array[1] = 0.0;
+ z_ninf.array[0] = -NPY_INFINITY;
+ z_ninf.array[1] = 0.0;
+ z_nan.array[0] = NPY_NAN;
+ z_nan.array[1] = NPY_NAN;
+}
+
+/*
+ *****************************************************************************
+ ** Structs used for data rearrangement **
+ *****************************************************************************
+ */
+
+
+/*
+ * this struct contains information about how to linearize a matrix in a local
+ * buffer so that it can be used by blas functions. All strides are specified
+ * in bytes and are converted to elements later in type specific functions.
+ *
+ * rows: number of rows in the matrix
+ * columns: number of columns in the matrix
+ * row_strides: the number of bytes between consecutive rows.
+ * column_strides: the number of bytes between consecutive columns.
+ * output_lead_dim: BLAS/LAPACK-side leading dimension, in elements
+ */
+typedef struct linearize_data_struct
+{
+ npy_intp rows;
+ npy_intp columns;
+ npy_intp row_strides;
+ npy_intp column_strides;
+ npy_intp output_lead_dim;
+} LINEARIZE_DATA_t;
+
+static NPY_INLINE void
+init_linearize_data_ex(LINEARIZE_DATA_t *lin_data,
+ npy_intp rows,
+ npy_intp columns,
+ npy_intp row_strides,
+ npy_intp column_strides,
+ npy_intp output_lead_dim)
+{
+ lin_data->rows = rows;
+ lin_data->columns = columns;
+ lin_data->row_strides = row_strides;
+ lin_data->column_strides = column_strides;
+ lin_data->output_lead_dim = output_lead_dim;
+}
+
+static NPY_INLINE void
+init_linearize_data(LINEARIZE_DATA_t *lin_data,
+ npy_intp rows,
+ npy_intp columns,
+ npy_intp row_strides,
+ npy_intp column_strides)
+{
+ init_linearize_data_ex(
+ lin_data, rows, columns, row_strides, column_strides, columns);
+}
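+
+/*
+ * Illustrative note: the linearize_*_matrix() helpers below copy `rows` rows
+ * of `columns` elements each, spaced `column_strides` bytes apart in the
+ * source, into a contiguous buffer whose leading dimension is
+ * `output_lead_dim`; callers pass the C-order steps swapped, so the buffer
+ * comes out in the column-major layout that LAPACK expects.
+ */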
+
+static NPY_INLINE void
+dump_ufunc_object(PyUFuncObject* ufunc)
+{
+    TRACE_TXT("\n\n%s '%s' (%d input(s), %d output(s), %d specialization(s)).\n",
+ ufunc->core_enabled? "generalized ufunc" : "scalar ufunc",
+ ufunc->name, ufunc->nin, ufunc->nout, ufunc->ntypes);
+ if (ufunc->core_enabled) {
+ int arg;
+ int dim;
+ TRACE_TXT("\t%s (%d dimension(s) detected).\n",
+ ufunc->core_signature, ufunc->core_num_dim_ix);
+
+ for (arg = 0; arg < ufunc->nargs; arg++){
+ int * arg_dim_ix = ufunc->core_dim_ixs + ufunc->core_offsets[arg];
+ TRACE_TXT("\t\targ %d (%s) has %d dimension(s): (",
+ arg, arg < ufunc->nin? "INPUT" : "OUTPUT",
+ ufunc->core_num_dims[arg]);
+ for (dim = 0; dim < ufunc->core_num_dims[arg]; dim ++) {
+ TRACE_TXT(" %d", arg_dim_ix[dim]);
+ }
+ TRACE_TXT(" )\n");
+ }
+ }
+}
+
+static NPY_INLINE void
+dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params)
+{
+ TRACE_TXT("\n\t%s rows: %zd columns: %zd"\
+ "\n\t\trow_strides: %td column_strides: %td"\
+ "\n", name, params->rows, params->columns,
+ params->row_strides, params->column_strides);
+}
+
+static NPY_INLINE void
+print_FLOAT(npy_float s)
+{
+ TRACE_TXT(" %8.4f", s);
+}
+static NPY_INLINE void
+print_DOUBLE(npy_double d)
+{
+ TRACE_TXT(" %10.6f", d);
+}
+static NPY_INLINE void
+print_CFLOAT(npy_cfloat c)
+{
+ float* c_parts = (float*)&c;
+ TRACE_TXT("(%8.4f, %8.4fj)", c_parts[0], c_parts[1]);
+}
+static NPY_INLINE void
+print_CDOUBLE(npy_cdouble z)
+{
+ double* z_parts = (double*)&z;
+ TRACE_TXT("(%8.4f, %8.4fj)", z_parts[0], z_parts[1]);
+}
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ */
+static NPY_INLINE void
+dump_@TYPE@_matrix(const char* name,
+ size_t rows, size_t columns,
+ const @typ@* ptr)
+{
+ size_t i, j;
+
+ TRACE_TXT("\n%s %p (%zd, %zd)\n", name, ptr, rows, columns);
+ for (i = 0; i < rows; i++)
+ {
+ TRACE_TXT("| ");
+ for (j = 0; j < columns; j++)
+ {
+ print_@TYPE@(ptr[j*rows + i]);
+ TRACE_TXT(", ");
+ }
+ TRACE_TXT(" |\n");
+ }
+}
+/**end repeat**/
+
+
+/*
+ *****************************************************************************
+ ** Basics **
+ *****************************************************************************
+ */
+
+static NPY_INLINE fortran_int
+fortran_int_min(fortran_int x, fortran_int y) {
+ return x < y ? x : y;
+}
+
+static NPY_INLINE fortran_int
+fortran_int_max(fortran_int x, fortran_int y) {
+ return x > y ? x : y;
+}
+
+#define INIT_OUTER_LOOP_1 \
+ npy_intp dN = *dimensions++;\
+ npy_intp N_;\
+ npy_intp s0 = *steps++;
+
+#define INIT_OUTER_LOOP_2 \
+ INIT_OUTER_LOOP_1\
+ npy_intp s1 = *steps++;
+
+#define INIT_OUTER_LOOP_3 \
+ INIT_OUTER_LOOP_2\
+ npy_intp s2 = *steps++;
+
+#define INIT_OUTER_LOOP_4 \
+ INIT_OUTER_LOOP_3\
+ npy_intp s3 = *steps++;
+
+#define INIT_OUTER_LOOP_5 \
+ INIT_OUTER_LOOP_4\
+ npy_intp s4 = *steps++;
+
+#define INIT_OUTER_LOOP_6 \
+ INIT_OUTER_LOOP_5\
+ npy_intp s5 = *steps++;
+
+#define INIT_OUTER_LOOP_7 \
+ INIT_OUTER_LOOP_6\
+ npy_intp s6 = *steps++;
+
+#define BEGIN_OUTER_LOOP_2 \
+ for (N_ = 0;\
+ N_ < dN;\
+ N_++, args[0] += s0,\
+ args[1] += s1) {
+
+#define BEGIN_OUTER_LOOP_3 \
+ for (N_ = 0;\
+ N_ < dN;\
+ N_++, args[0] += s0,\
+ args[1] += s1,\
+ args[2] += s2) {
+
+#define BEGIN_OUTER_LOOP_4 \
+ for (N_ = 0;\
+ N_ < dN;\
+ N_++, args[0] += s0,\
+ args[1] += s1,\
+ args[2] += s2,\
+ args[3] += s3) {
+
+#define BEGIN_OUTER_LOOP_5 \
+ for (N_ = 0;\
+ N_ < dN;\
+ N_++, args[0] += s0,\
+ args[1] += s1,\
+ args[2] += s2,\
+ args[3] += s3,\
+ args[4] += s4) {
+
+#define BEGIN_OUTER_LOOP_6 \
+ for (N_ = 0;\
+ N_ < dN;\
+ N_++, args[0] += s0,\
+ args[1] += s1,\
+ args[2] += s2,\
+ args[3] += s3,\
+ args[4] += s4,\
+ args[5] += s5) {
+
+#define BEGIN_OUTER_LOOP_7 \
+ for (N_ = 0;\
+ N_ < dN;\
+ N_++, args[0] += s0,\
+ args[1] += s1,\
+ args[2] += s2,\
+ args[3] += s3,\
+ args[4] += s4,\
+ args[5] += s5,\
+ args[6] += s6) {
+
+#define END_OUTER_LOOP }
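+
+/*
+ * Usage sketch: a gufunc loop body (e.g. the _det or _solve loops below)
+ * starts with INIT_OUTER_LOOP_n, which consumes the outer dimension and the
+ * n per-operand outer strides, and then wraps its per-element work in
+ * BEGIN_OUTER_LOOP_n ... END_OUTER_LOOP, which advances args[0..n-1] by
+ * those strides on every iteration.
+ */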
+
+static NPY_INLINE void
+update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
+{
+ size_t i;
+ for (i = 0; i < count; ++i) {
+ bases[i] += offsets[i];
+ }
+}
+
+
+/* disable -Wmaybe-uninitialized as there is some code that generates false
+ positives with this warning
+*/
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+
+/*
+ *****************************************************************************
+ ** HELPER FUNCS **
+ *****************************************************************************
+ */
+
+ /* rearranging of 2D matrices using blas */
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
+ #copy = scopy, dcopy, ccopy, zcopy#
+ #nan = s_nan, d_nan, c_nan, z_nan#
+ #zero = s_zero, d_zero, c_zero, z_zero#
+ */
+static NPY_INLINE void *
+linearize_@TYPE@_matrix(void *dst_in,
+ void *src_in,
+ const LINEARIZE_DATA_t* data)
+{
+ @typ@ *src = (@typ@ *) src_in;
+ @typ@ *dst = (@typ@ *) dst_in;
+
+ if (dst) {
+ int i, j;
+ @typ@* rv = dst;
+ fortran_int columns = (fortran_int)data->columns;
+ fortran_int column_strides =
+ (fortran_int)(data->column_strides/sizeof(@typ@));
+ fortran_int one = 1;
+ for (i = 0; i < data->rows; i++) {
+ if (column_strides > 0) {
+ FNAME(@copy@)(&columns,
+ (void*)src, &column_strides,
+ (void*)dst, &one);
+ }
+ else if (column_strides < 0) {
+ FNAME(@copy@)(&columns,
+ (void*)((@typ@*)src + (columns-1)*column_strides),
+ &column_strides,
+ (void*)dst, &one);
+ }
+ else {
+ /*
+ * Zero stride has undefined behavior in some BLAS
+ * implementations (e.g. OSX Accelerate), so do it
+ * manually
+ */
+ for (j = 0; j < columns; ++j) {
+ memcpy((@typ@*)dst + j, (@typ@*)src, sizeof(@typ@));
+ }
+ }
+ src += data->row_strides/sizeof(@typ@);
+ dst += data->output_lead_dim;
+ }
+ return rv;
+ } else {
+ return src;
+ }
+}
+
+static NPY_INLINE void *
+delinearize_@TYPE@_matrix(void *dst_in,
+ void *src_in,
+ const LINEARIZE_DATA_t* data)
+{
+ @typ@ *src = (@typ@ *) src_in;
+ @typ@ *dst = (@typ@ *) dst_in;
+
+ if (src) {
+ int i;
+ @typ@ *rv = src;
+ fortran_int columns = (fortran_int)data->columns;
+ fortran_int column_strides =
+ (fortran_int)(data->column_strides/sizeof(@typ@));
+ fortran_int one = 1;
+ for (i = 0; i < data->rows; i++) {
+ if (column_strides > 0) {
+ FNAME(@copy@)(&columns,
+ (void*)src, &one,
+ (void*)dst, &column_strides);
+ }
+ else if (column_strides < 0) {
+ FNAME(@copy@)(&columns,
+ (void*)src, &one,
+ (void*)((@typ@*)dst + (columns-1)*column_strides),
+ &column_strides);
+ }
+ else {
+ /*
+ * Zero stride has undefined behavior in some BLAS
+ * implementations (e.g. OSX Accelerate), so do it
+ * manually
+ */
+ if (columns > 0) {
+ memcpy((@typ@*)dst,
+ (@typ@*)src + (columns-1),
+ sizeof(@typ@));
+ }
+ }
+ src += data->output_lead_dim;
+ dst += data->row_strides/sizeof(@typ@);
+ }
+
+ return rv;
+ } else {
+ return src;
+ }
+}
+
+static NPY_INLINE void
+nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
+{
+ @typ@ *dst = (@typ@ *) dst_in;
+
+ int i, j;
+ for (i = 0; i < data->rows; i++) {
+ @typ@ *cp = dst;
+ ptrdiff_t cs = data->column_strides/sizeof(@typ@);
+ for (j = 0; j < data->columns; ++j) {
+ *cp = @nan@;
+ cp += cs;
+ }
+ dst += data->row_strides/sizeof(@typ@);
+ }
+}
+
+static NPY_INLINE void
+zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
+{
+ @typ@ *dst = (@typ@ *) dst_in;
+
+ int i, j;
+ for (i = 0; i < data->rows; i++) {
+ @typ@ *cp = dst;
+ ptrdiff_t cs = data->column_strides/sizeof(@typ@);
+ for (j = 0; j < data->columns; ++j) {
+ *cp = @zero@;
+ cp += cs;
+ }
+ dst += data->row_strides/sizeof(@typ@);
+ }
+}
+
+/**end repeat**/
+
+ /* identity square matrix generation */
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
+ #cblas_type = s, d, c, z#
+ */
+static NPY_INLINE void
+identity_@TYPE@_matrix(void *ptr, size_t n)
+{
+ size_t i;
+ @typ@ *matrix = (@typ@*) ptr;
+ /* in IEEE floating point, zeroes are represented as bitwise 0 */
+ memset(matrix, 0, n*n*sizeof(@typ@));
+
+ for (i = 0; i < n; ++i)
+ {
+ *matrix = @cblas_type@_one;
+ matrix += n+1;
+ }
+}
+/**end repeat**/
+
+ /* lower/upper triangular matrix (in place) */
+/**begin repeat
+
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
+ #cblas_type = s, d, c, z#
+ */
+
+static NPY_INLINE void
+triu_@TYPE@_matrix(void *ptr, size_t n)
+{
+ size_t i, j;
+ @typ@ *matrix = (@typ@*)ptr;
+ matrix += n;
+ for (i = 1; i < n; ++i) {
+ for (j = 0; j < i; ++j) {
+ matrix[j] = @cblas_type@_zero;
+ }
+ matrix += n;
+ }
+}
+/**end repeat**/
+
+
+/* -------------------------------------------------------------------------- */
+ /* Determinants */
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE#
+ #typ = npy_float, npy_double#
+ #log_func = npy_logf, npy_log#
+ #exp_func = npy_expf, npy_exp#
+ #zero = 0.0f, 0.0#
+*/
+
+static NPY_INLINE void
+@TYPE@_slogdet_from_factored_diagonal(@typ@* src,
+ fortran_int m,
+ @typ@ *sign,
+ @typ@ *logdet)
+{
+ @typ@ acc_sign = *sign;
+ @typ@ acc_logdet = @zero@;
+ int i;
+ for (i = 0; i < m; i++) {
+ @typ@ abs_element = *src;
+ if (abs_element < @zero@) {
+ acc_sign = -acc_sign;
+ abs_element = -abs_element;
+ }
+
+ acc_logdet += @log_func@(abs_element);
+ src += m+1;
+ }
+
+ *sign = acc_sign;
+ *logdet = acc_logdet;
+}
+
+static NPY_INLINE @typ@
+@TYPE@_det_from_slogdet(@typ@ sign, @typ@ logdet)
+{
+ @typ@ result = sign * @exp_func@(logdet);
+ return result;
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = CFLOAT, CDOUBLE#
+ #typ = npy_cfloat, npy_cdouble#
+ #basetyp = npy_float, npy_double#
+ #abs_func = npy_cabsf, npy_cabs#
+ #log_func = npy_logf, npy_log#
+ #exp_func = npy_expf, npy_exp#
+ #zero = 0.0f, 0.0#
+*/
+#define RE(COMPLEX) (((@basetyp@*)(&COMPLEX))[0])
+#define IM(COMPLEX) (((@basetyp@*)(&COMPLEX))[1])
+
+static NPY_INLINE @typ@
+@TYPE@_mult(@typ@ op1, @typ@ op2)
+{
+ @typ@ rv;
+
+ RE(rv) = RE(op1)*RE(op2) - IM(op1)*IM(op2);
+ IM(rv) = RE(op1)*IM(op2) + IM(op1)*RE(op2);
+
+ return rv;
+}
+
+
+static NPY_INLINE void
+@TYPE@_slogdet_from_factored_diagonal(@typ@* src,
+ fortran_int m,
+ @typ@ *sign,
+ @basetyp@ *logdet)
+{
+ int i;
+ @typ@ sign_acc = *sign;
+ @basetyp@ logdet_acc = @zero@;
+
+ for (i = 0; i < m; i++)
+ {
+ @basetyp@ abs_element = @abs_func@(*src);
+ @typ@ sign_element;
+ RE(sign_element) = RE(*src) / abs_element;
+ IM(sign_element) = IM(*src) / abs_element;
+
+ sign_acc = @TYPE@_mult(sign_acc, sign_element);
+ logdet_acc += @log_func@(abs_element);
+ src += m + 1;
+ }
+
+ *sign = sign_acc;
+ *logdet = logdet_acc;
+}
+
+static NPY_INLINE @typ@
+@TYPE@_det_from_slogdet(@typ@ sign, @basetyp@ logdet)
+{
+ @typ@ tmp;
+ RE(tmp) = @exp_func@(logdet);
+ IM(tmp) = @zero@;
+ return @TYPE@_mult(sign, tmp);
+}
+#undef RE
+#undef IM
+/**end repeat**/
+
+
+/* As in the linalg package, the determinant is computed via LU factorization
+ * using LAPACK.
+ * slogdet computes the sign and log(abs(determinant)).
+ * det computes sign * exp(logdet).
+ */
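+/*
+ * Worked example (illustrative): if getrf leaves the diagonal (2, -3) and
+ * performed one row interchange, the sign is (+1) * (-1) * (-1) = +1 and
+ * logdet = log(2) + log(3), so det = sign * exp(logdet) = 6.
+ */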
+/**begin repeat
+
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ #basetyp = npy_float, npy_double, npy_float, npy_double#
+ #cblas_type = s, d, c, z#
+*/
+
+static NPY_INLINE void
+@TYPE@_slogdet_single_element(fortran_int m,
+ void* src,
+ fortran_int* pivots,
+ @typ@ *sign,
+ @basetyp@ *logdet)
+{
+ fortran_int info = 0;
+ fortran_int lda = fortran_int_max(m, 1);
+ int i;
+ /* note: done in place */
+ LAPACK(@cblas_type@getrf)(&m, &m, (void *)src, &lda, pivots, &info);
+
+ if (info == 0) {
+ int change_sign = 0;
+ /* note: fortran uses 1 based indexing */
+ for (i = 0; i < m; i++)
+ {
+ change_sign += (pivots[i] != (i+1));
+ }
+
+ memcpy(sign,
+ (change_sign % 2)?
+ &@cblas_type@_minus_one :
+ &@cblas_type@_one
+ , sizeof(*sign));
+ @TYPE@_slogdet_from_factored_diagonal(src, m, sign, logdet);
+ } else {
+ /*
+ if getrf fails, use 0 as sign and -inf as logdet
+ */
+ memcpy(sign, &@cblas_type@_zero, sizeof(*sign));
+ memcpy(logdet, &@cblas_type@_ninf, sizeof(*logdet));
+ }
+}
+
+static void
+@TYPE@_slogdet(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ fortran_int m;
+ npy_uint8 *tmp_buff = NULL;
+ size_t matrix_size;
+ size_t pivot_size;
+ size_t safe_m;
+ /* notes:
+     *   the matrix always needs to be copied, as the LAPACK factorization
+     *       is done in place
+     *   the matrix needs to be in column-major order, as expected by the
+     *       LAPACK (FORTRAN) code
+     *   always a square matrix
+     *   memory needs to be allocated for both the matrix buffer and the
+     *       pivot buffer
+ */
+ INIT_OUTER_LOOP_3
+ m = (fortran_int) dimensions[0];
+ safe_m = m;
+ matrix_size = safe_m * safe_m * sizeof(@typ@);
+ pivot_size = safe_m * sizeof(fortran_int);
+ tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
+
+ if (tmp_buff) {
+ LINEARIZE_DATA_t lin_data;
+ /* swapped steps to get matrix in FORTRAN order */
+ init_linearize_data(&lin_data, m, m, steps[1], steps[0]);
+ BEGIN_OUTER_LOOP_3
+ linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data);
+ @TYPE@_slogdet_single_element(m,
+ (void*)tmp_buff,
+ (fortran_int*)(tmp_buff+matrix_size),
+ (@typ@*)args[1],
+ (@basetyp@*)args[2]);
+ END_OUTER_LOOP
+
+ free(tmp_buff);
+ }
+}
+
+static void
+@TYPE@_det(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ fortran_int m;
+ npy_uint8 *tmp_buff;
+ size_t matrix_size;
+ size_t pivot_size;
+ size_t safe_m;
+ /* notes:
+     *   the matrix always needs to be copied, as the LAPACK factorization
+     *       is done in place
+     *   the matrix needs to be in column-major order, as expected by the
+     *       LAPACK (FORTRAN) code
+     *   always a square matrix
+     *   memory needs to be allocated for both the matrix buffer and the
+     *       pivot buffer
+ */
+ INIT_OUTER_LOOP_2
+ m = (fortran_int) dimensions[0];
+ safe_m = m;
+ matrix_size = safe_m * safe_m * sizeof(@typ@);
+ pivot_size = safe_m * sizeof(fortran_int);
+ tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
+
+ if (tmp_buff) {
+ LINEARIZE_DATA_t lin_data;
+ @typ@ sign;
+ @basetyp@ logdet;
+ /* swapped steps to get matrix in FORTRAN order */
+ init_linearize_data(&lin_data, m, m, steps[1], steps[0]);
+
+ BEGIN_OUTER_LOOP_2
+ linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data);
+ @TYPE@_slogdet_single_element(m,
+ (void*)tmp_buff,
+ (fortran_int*)(tmp_buff + matrix_size),
+ &sign,
+ &logdet);
+ *(@typ@ *)args[1] = @TYPE@_det_from_slogdet(sign, logdet);
+ END_OUTER_LOOP
+
+ free(tmp_buff);
+ }
+}
+/**end repeat**/
+
+
+/* -------------------------------------------------------------------------- */
+ /* Eigh family */
+
+typedef struct eigh_params_struct {
+ void *A; /* matrix */
+ void *W; /* eigenvalue vector */
+ void *WORK; /* main work buffer */
+ void *RWORK; /* secondary work buffer (for complex versions) */
+ void *IWORK;
+ fortran_int N;
+ fortran_int LWORK;
+ fortran_int LRWORK;
+ fortran_int LIWORK;
+ char JOBZ;
+ char UPLO;
+ fortran_int LDA;
+} EIGH_PARAMS_t;
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE#
+ #typ = npy_float, npy_double#
+ #ftyp = fortran_real, fortran_doublereal#
+ #lapack_func = ssyevd, dsyevd#
+*/
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(EIGH_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->JOBZ, &params->UPLO, &params->N,
+ params->A, &params->LDA, params->W,
+ params->WORK, &params->LWORK,
+ params->IWORK, &params->LIWORK,
+ &rv);
+ return rv;
+}
+
+/*
+ * Initialize the parameters to use in the LAPACK function _syevd.
+ * Handles buffer allocation.
+ */
+static NPY_INLINE int
+init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
+ fortran_int N)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ fortran_int lwork;
+ fortran_int liwork;
+ npy_uint8 *a, *w, *work, *iwork;
+ size_t safe_N = N;
+ size_t alloc_size = safe_N * (safe_N + 1) * sizeof(@typ@);
+ fortran_int lda = fortran_int_max(N, 1);
+
+ mem_buff = malloc(alloc_size);
+
+ if (!mem_buff) {
+ goto error;
+ }
+ a = mem_buff;
+ w = mem_buff + safe_N * safe_N * sizeof(@typ@);
+
+ params->A = a;
+ params->W = w;
+ params->RWORK = NULL; /* unused */
+ params->N = N;
+ params->LRWORK = 0; /* unused */
+ params->JOBZ = JOBZ;
+ params->UPLO = UPLO;
+ params->LDA = lda;
+
+ /* Work size query */
+ {
+ @typ@ query_work_size;
+ fortran_int query_iwork_size;
+
+ params->LWORK = -1;
+ params->LIWORK = -1;
+ params->WORK = &query_work_size;
+ params->IWORK = &query_iwork_size;
+
+ if (call_@lapack_func@(params) != 0) {
+ goto error;
+ }
+
+ lwork = (fortran_int)query_work_size;
+ liwork = query_iwork_size;
+ }
+
+ mem_buff2 = malloc(lwork*sizeof(@typ@) + liwork*sizeof(fortran_int));
+ if (!mem_buff2) {
+ goto error;
+ }
+
+ work = mem_buff2;
+ iwork = mem_buff2 + lwork*sizeof(@typ@);
+
+ params->LWORK = lwork;
+ params->WORK = work;
+ params->LIWORK = liwork;
+ params->IWORK = iwork;
+
+ return 1;
+
+ error:
+ /* something failed */
+ memset(params, 0, sizeof(*params));
+ free(mem_buff2);
+ free(mem_buff);
+
+ return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = CFLOAT, CDOUBLE#
+ #typ = npy_cfloat, npy_cdouble#
+ #basetyp = npy_float, npy_double#
+ #ftyp = fortran_complex, fortran_doublecomplex#
+ #fbasetyp = fortran_real, fortran_doublereal#
+ #lapack_func = cheevd, zheevd#
+*/
+static NPY_INLINE fortran_int
+call_@lapack_func@(EIGH_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->JOBZ, &params->UPLO, &params->N,
+ params->A, &params->LDA, params->W,
+ params->WORK, &params->LWORK,
+ params->RWORK, &params->LRWORK,
+ params->IWORK, &params->LIWORK,
+ &rv);
+ return rv;
+}
+
+/*
+ * Initialize the parameters to use in the LAPACK function _heevd.
+ * Handles buffer allocation.
+ */
+static NPY_INLINE int
+init_@lapack_func@(EIGH_PARAMS_t *params,
+ char JOBZ,
+ char UPLO,
+ fortran_int N)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ fortran_int lwork;
+ fortran_int lrwork;
+ fortran_int liwork;
+ npy_uint8 *a, *w, *work, *rwork, *iwork;
+ size_t safe_N = N;
+ fortran_int lda = fortran_int_max(N, 1);
+
+ mem_buff = malloc(safe_N * safe_N * sizeof(@typ@) +
+ safe_N * sizeof(@basetyp@));
+ if (!mem_buff) {
+ goto error;
+ }
+ a = mem_buff;
+ w = mem_buff + safe_N * safe_N * sizeof(@typ@);
+
+ params->A = a;
+ params->W = w;
+ params->N = N;
+ params->JOBZ = JOBZ;
+ params->UPLO = UPLO;
+ params->LDA = lda;
+
+ /* Work size query */
+ {
+ @ftyp@ query_work_size;
+ @fbasetyp@ query_rwork_size;
+ fortran_int query_iwork_size;
+
+ params->LWORK = -1;
+ params->LRWORK = -1;
+ params->LIWORK = -1;
+ params->WORK = &query_work_size;
+ params->RWORK = &query_rwork_size;
+ params->IWORK = &query_iwork_size;
+
+ if (call_@lapack_func@(params) != 0) {
+ goto error;
+ }
+
+ lwork = (fortran_int)*(@fbasetyp@*)&query_work_size;
+ lrwork = (fortran_int)query_rwork_size;
+ liwork = query_iwork_size;
+ }
+
+ mem_buff2 = malloc(lwork*sizeof(@typ@) +
+ lrwork*sizeof(@basetyp@) +
+ liwork*sizeof(fortran_int));
+ if (!mem_buff2) {
+ goto error;
+ }
+
+ work = mem_buff2;
+ rwork = work + lwork*sizeof(@typ@);
+ iwork = rwork + lrwork*sizeof(@basetyp@);
+
+ params->WORK = work;
+ params->RWORK = rwork;
+ params->IWORK = iwork;
+ params->LWORK = lwork;
+ params->LRWORK = lrwork;
+ params->LIWORK = liwork;
+
+ return 1;
+
+ /* something failed */
+error:
+ memset(params, 0, sizeof(*params));
+ free(mem_buff2);
+ free(mem_buff);
+
+ return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #BASETYPE = FLOAT, DOUBLE, FLOAT, DOUBLE#
+ #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ #basetyp = npy_float, npy_double, npy_float, npy_double#
+ #lapack_func = ssyevd, dsyevd, cheevd, zheevd#
+**/
+/*
+ * (M, M)->(M,)(M, M)
+ * dimensions[1] -> M
+ * args[0] -> A[in]
+ * args[1] -> W
+ * args[2] -> A[out]
+ */
+
+static NPY_INLINE void
+release_@lapack_func@(EIGH_PARAMS_t *params)
+{
+ /* allocated memory in A and WORK */
+ free(params->A);
+ free(params->WORK);
+ memset(params, 0, sizeof(*params));
+}
+
+
+static NPY_INLINE void
+@TYPE@_eigh_wrapper(char JOBZ,
+ char UPLO,
+ char**args,
+ npy_intp* dimensions,
+ npy_intp* steps)
+{
+ ptrdiff_t outer_steps[3];
+ size_t iter;
+ size_t outer_dim = *dimensions++;
+ size_t op_count = (JOBZ=='N')?2:3;
+ EIGH_PARAMS_t eigh_params;
+ int error_occurred = get_fp_invalid_and_clear();
+
+ for (iter = 0; iter < op_count; ++iter) {
+ outer_steps[iter] = (ptrdiff_t) steps[iter];
+ }
+ steps += op_count;
+
+ if (init_@lapack_func@(&eigh_params,
+ JOBZ,
+ UPLO,
+ (fortran_int)dimensions[0])) {
+ LINEARIZE_DATA_t matrix_in_ld;
+ LINEARIZE_DATA_t eigenvectors_out_ld;
+ LINEARIZE_DATA_t eigenvalues_out_ld;
+
+ init_linearize_data(&matrix_in_ld,
+ eigh_params.N, eigh_params.N,
+ steps[1], steps[0]);
+ init_linearize_data(&eigenvalues_out_ld,
+ 1, eigh_params.N,
+ 0, steps[2]);
+ if ('V' == eigh_params.JOBZ) {
+ init_linearize_data(&eigenvectors_out_ld,
+ eigh_params.N, eigh_params.N,
+ steps[4], steps[3]);
+ }
+
+ for (iter = 0; iter < outer_dim; ++iter) {
+ int not_ok;
+ /* copy the matrix in */
+ linearize_@TYPE@_matrix(eigh_params.A, args[0], &matrix_in_ld);
+ not_ok = call_@lapack_func@(&eigh_params);
+ if (!not_ok) {
+ /* lapack ok, copy result out */
+ delinearize_@BASETYPE@_matrix(args[1],
+ eigh_params.W,
+ &eigenvalues_out_ld);
+
+ if ('V' == eigh_params.JOBZ) {
+ delinearize_@TYPE@_matrix(args[2],
+ eigh_params.A,
+ &eigenvectors_out_ld);
+ }
+ } else {
+ /* lapack fail, set result to nan */
+ error_occurred = 1;
+ nan_@BASETYPE@_matrix(args[1], &eigenvalues_out_ld);
+ if ('V' == eigh_params.JOBZ) {
+ nan_@TYPE@_matrix(args[2], &eigenvectors_out_ld);
+ }
+ }
+ update_pointers((npy_uint8**)args, outer_steps, op_count);
+ }
+
+ release_@lapack_func@(&eigh_params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ */
+static void
+@TYPE@_eighlo(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_eigh_wrapper('V', 'L', args, dimensions, steps);
+}
+
+static void
+@TYPE@_eighup(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void* NPY_UNUSED(func))
+{
+ @TYPE@_eigh_wrapper('V', 'U', args, dimensions, steps);
+}
+
+static void
+@TYPE@_eigvalshlo(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void* NPY_UNUSED(func))
+{
+ @TYPE@_eigh_wrapper('N', 'L', args, dimensions, steps);
+}
+
+static void
+@TYPE@_eigvalshup(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void* NPY_UNUSED(func))
+{
+ @TYPE@_eigh_wrapper('N', 'U', args, dimensions, steps);
+}
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+ /* Solve family (includes inv) */
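+
+/*
+ * Note (illustrative): the inv loops below are implemented as a solve against
+ * the identity matrix -- params.B is filled with I and gesv then overwrites
+ * it with A^-1.
+ */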
+
+typedef struct gesv_params_struct
+{
+ void *A; /* A is (N, N) of base type */
+ void *B; /* B is (N, NRHS) of base type */
+ fortran_int * IPIV; /* IPIV is (N) */
+
+ fortran_int N;
+ fortran_int NRHS;
+ fortran_int LDA;
+ fortran_int LDB;
+} GESV_PARAMS_t;
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ #ftyp = fortran_real, fortran_doublereal,
+ fortran_complex, fortran_doublecomplex#
+ #lapack_func = sgesv, dgesv, cgesv, zgesv#
+*/
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(GESV_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->N, &params->NRHS,
+ params->A, &params->LDA,
+ params->IPIV,
+ params->B, &params->LDB,
+ &rv);
+ return rv;
+}
+
+/*
+ * Initialize the parameters to use in the LAPACK function _gesv.
+ * Handles buffer allocation.
+ */
+static NPY_INLINE int
+init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *a, *b, *ipiv;
+ size_t safe_N = N;
+ size_t safe_NRHS = NRHS;
+ fortran_int ld = fortran_int_max(N, 1);
+ mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@) +
+ safe_N * safe_NRHS*sizeof(@ftyp@) +
+ safe_N * sizeof(fortran_int));
+ if (!mem_buff) {
+ goto error;
+ }
+ a = mem_buff;
+ b = a + safe_N * safe_N * sizeof(@ftyp@);
+ ipiv = b + safe_N * safe_NRHS * sizeof(@ftyp@);
+
+ params->A = a;
+ params->B = b;
+ params->IPIV = (fortran_int*)ipiv;
+ params->N = N;
+ params->NRHS = NRHS;
+ params->LDA = ld;
+ params->LDB = ld;
+
+ return 1;
+ error:
+ free(mem_buff);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+static NPY_INLINE void
+release_@lapack_func@(GESV_PARAMS_t *params)
+{
+ /* memory block base is in A */
+ free(params->A);
+ memset(params, 0, sizeof(*params));
+}
+
+static void
+@TYPE@_solve(char **args, npy_intp *dimensions, npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ GESV_PARAMS_t params;
+ fortran_int n, nrhs;
+ int error_occurred = get_fp_invalid_and_clear();
+ INIT_OUTER_LOOP_3
+
+ n = (fortran_int)dimensions[0];
+ nrhs = (fortran_int)dimensions[1];
+ if (init_@lapack_func@(&params, n, nrhs)) {
+ LINEARIZE_DATA_t a_in, b_in, r_out;
+
+ init_linearize_data(&a_in, n, n, steps[1], steps[0]);
+ init_linearize_data(&b_in, nrhs, n, steps[3], steps[2]);
+ init_linearize_data(&r_out, nrhs, n, steps[5], steps[4]);
+
+ BEGIN_OUTER_LOOP_3
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.B, args[1], &b_in);
+            not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[2], params.B, &r_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[2], &r_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+static void
+@TYPE@_solve1(char **args, npy_intp *dimensions, npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ GESV_PARAMS_t params;
+ int error_occurred = get_fp_invalid_and_clear();
+ fortran_int n;
+ INIT_OUTER_LOOP_3
+
+ n = (fortran_int)dimensions[0];
+ if (init_@lapack_func@(&params, n, 1)) {
+ LINEARIZE_DATA_t a_in, b_in, r_out;
+ init_linearize_data(&a_in, n, n, steps[1], steps[0]);
+ init_linearize_data(&b_in, 1, n, 1, steps[2]);
+ init_linearize_data(&r_out, 1, n, 1, steps[3]);
+
+ BEGIN_OUTER_LOOP_3
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.B, args[1], &b_in);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[2], params.B, &r_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[2], &r_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+static void
+@TYPE@_inv(char **args, npy_intp *dimensions, npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ GESV_PARAMS_t params;
+ fortran_int n;
+ int error_occurred = get_fp_invalid_and_clear();
+ INIT_OUTER_LOOP_2
+
+ n = (fortran_int)dimensions[0];
+ if (init_@lapack_func@(&params, n, n)) {
+ LINEARIZE_DATA_t a_in, r_out;
+ init_linearize_data(&a_in, n, n, steps[1], steps[0]);
+ init_linearize_data(&r_out, n, n, steps[3], steps[2]);
+
+ BEGIN_OUTER_LOOP_2
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ identity_@TYPE@_matrix(params.B, n);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[1], params.B, &r_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[1], &r_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+
+/* -------------------------------------------------------------------------- */
+ /* Cholesky decomposition */
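+
+/*
+ * Note (illustrative): cholesky_lo calls potrf with UPLO='L' on the
+ * column-major copy and then zeroes the untouched triangle (triu_*_matrix)
+ * before copying the factor back out, so the returned factor is exactly
+ * lower triangular.
+ */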
+
+typedef struct potr_params_struct
+{
+ void *A;
+ fortran_int N;
+ fortran_int LDA;
+ char UPLO;
+} POTR_PARAMS_t;
+
+/**begin repeat
+
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #ftyp = fortran_real, fortran_doublereal,
+ fortran_complex, fortran_doublecomplex#
+ #lapack_func = spotrf, dpotrf, cpotrf, zpotrf#
+ */
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(POTR_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->UPLO,
+ &params->N, params->A, &params->LDA,
+ &rv);
+ return rv;
+}
+
+static NPY_INLINE int
+init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *a;
+ size_t safe_N = N;
+ fortran_int lda = fortran_int_max(N, 1);
+
+ mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@));
+ if (!mem_buff) {
+ goto error;
+ }
+
+ a = mem_buff;
+
+ params->A = a;
+ params->N = N;
+ params->LDA = lda;
+ params->UPLO = UPLO;
+
+ return 1;
+ error:
+ free(mem_buff);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+static NPY_INLINE void
+release_@lapack_func@(POTR_PARAMS_t *params)
+{
+ /* memory block base in A */
+ free(params->A);
+ memset(params, 0, sizeof(*params));
+}
+
+static void
+@TYPE@_cholesky(char uplo, char **args, npy_intp *dimensions, npy_intp *steps)
+{
+ POTR_PARAMS_t params;
+ int error_occurred = get_fp_invalid_and_clear();
+ fortran_int n;
+ INIT_OUTER_LOOP_2
+
+ assert(uplo == 'L');
+
+ n = (fortran_int)dimensions[0];
+ if (init_@lapack_func@(&params, uplo, n)) {
+ LINEARIZE_DATA_t a_in, r_out;
+ init_linearize_data(&a_in, n, n, steps[1], steps[0]);
+ init_linearize_data(&r_out, n, n, steps[3], steps[2]);
+ BEGIN_OUTER_LOOP_2
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ triu_@TYPE@_matrix(params.A, params.N);
+ delinearize_@TYPE@_matrix(args[1], params.A, &r_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[1], &r_out);
+ }
+ END_OUTER_LOOP
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+static void
+@TYPE@_cholesky_lo(char **args, npy_intp *dimensions, npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_cholesky('L', args, dimensions, steps);
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+ /* eig family */
+
+typedef struct geev_params_struct {
+ void *A;
+ void *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/
+ void *WI;
+ void *VLR; /* REAL VL buffers for _geev where _ is s, d */
+    void *VRR; /* REAL VR buffers for _geev where _ is s, d */
+ void *WORK;
+ void *W; /* final w */
+ void *VL; /* final vl */
+ void *VR; /* final vr */
+
+ fortran_int N;
+ fortran_int LDA;
+ fortran_int LDVL;
+ fortran_int LDVR;
+ fortran_int LWORK;
+
+ char JOBVL;
+ char JOBVR;
+} GEEV_PARAMS_t;
+
+static NPY_INLINE void
+dump_geev_params(const char *name, GEEV_PARAMS_t* params)
+{
+ TRACE_TXT("\n%s\n"
+
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+ "\t%10s: %p\n"\
+
+ "\t%10s: %d\n"\
+ "\t%10s: %d\n"\
+ "\t%10s: %d\n"\
+ "\t%10s: %d\n"\
+ "\t%10s: %d\n"\
+
+ "\t%10s: %c\n"\
+ "\t%10s: %c\n",
+
+ name,
+
+ "A", params->A,
+ "WR", params->WR,
+ "WI", params->WI,
+ "VLR", params->VLR,
+ "VRR", params->VRR,
+ "WORK", params->WORK,
+ "W", params->W,
+ "VL", params->VL,
+ "VR", params->VR,
+
+ "N", (int)params->N,
+ "LDA", (int)params->LDA,
+ "LDVL", (int)params->LDVL,
+ "LDVR", (int)params->LDVR,
+ "LWORK", (int)params->LWORK,
+
+ "JOBVL", params->JOBVL,
+ "JOBVR", params->JOBVR);
+}
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE#
+ #CTYPE = CFLOAT, CDOUBLE#
+ #typ = float, double#
+ #complextyp = COMPLEX_t, DOUBLECOMPLEX_t#
+ #lapack_func = sgeev, dgeev#
+ #zero = 0.0f, 0.0#
+*/
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(GEEV_PARAMS_t* params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->JOBVL, &params->JOBVR,
+ &params->N, params->A, &params->LDA,
+ params->WR, params->WI,
+ params->VLR, &params->LDVL,
+ params->VRR, &params->LDVR,
+ params->WORK, &params->LWORK,
+ &rv);
+ return rv;
+}
+
+static NPY_INLINE int
+init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr;
+ size_t safe_n = n;
+ size_t a_size = safe_n * safe_n * sizeof(@typ@);
+ size_t wr_size = safe_n * sizeof(@typ@);
+ size_t wi_size = safe_n * sizeof(@typ@);
+ size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(@typ@) : 0;
+ size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(@typ@) : 0;
+ size_t w_size = wr_size*2;
+ size_t vl_size = vlr_size*2;
+ size_t vr_size = vrr_size*2;
+ size_t work_count = 0;
+ fortran_int ld = fortran_int_max(n, 1);
+
+ /* allocate data for known sizes (all but work) */
+ mem_buff = malloc(a_size + wr_size + wi_size +
+ vlr_size + vrr_size +
+ w_size + vl_size + vr_size);
+ if (!mem_buff) {
+ goto error;
+ }
+
+ a = mem_buff;
+ wr = a + a_size;
+ wi = wr + wr_size;
+ vlr = wi + wi_size;
+ vrr = vlr + vlr_size;
+ w = vrr + vrr_size;
+ vl = w + w_size;
+ vr = vl + vl_size;
+
+ params->A = a;
+ params->WR = wr;
+ params->WI = wi;
+ params->VLR = vlr;
+ params->VRR = vrr;
+ params->W = w;
+ params->VL = vl;
+ params->VR = vr;
+ params->N = n;
+ params->LDA = ld;
+ params->LDVL = ld;
+ params->LDVR = ld;
+ params->JOBVL = jobvl;
+ params->JOBVR = jobvr;
+
+ /* Work size query */
+ {
+ @typ@ work_size_query;
+
+ params->LWORK = -1;
+ params->WORK = &work_size_query;
+
+ if (call_@lapack_func@(params) != 0) {
+ goto error;
+ }
+
+ work_count = (size_t)work_size_query;
+ }
+
+ mem_buff2 = malloc(work_count*sizeof(@typ@));
+ if (!mem_buff2) {
+ goto error;
+ }
+ work = mem_buff2;
+
+ params->LWORK = (fortran_int)work_count;
+ params->WORK = work;
+
+ return 1;
+ error:
+ free(mem_buff2);
+ free(mem_buff);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+static NPY_INLINE void
+mk_@TYPE@_complex_array_from_real(@complextyp@ *c, const @typ@ *re, size_t n)
+{
+ size_t iter;
+ for (iter = 0; iter < n; ++iter) {
+ c[iter].array[0] = re[iter];
+ c[iter].array[1] = @zero@;
+ }
+}
+
+static NPY_INLINE void
+mk_@TYPE@_complex_array(@complextyp@ *c,
+ const @typ@ *re,
+ const @typ@ *im,
+ size_t n)
+{
+ size_t iter;
+ for (iter = 0; iter < n; ++iter) {
+ c[iter].array[0] = re[iter];
+ c[iter].array[1] = im[iter];
+ }
+}
+
+static NPY_INLINE void
+mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c,
+ const @typ@ *r,
+ size_t n)
+{
+ size_t iter;
+ for (iter = 0; iter < n; ++iter) {
+ @typ@ re = r[iter];
+ @typ@ im = r[iter+n];
+ c[iter].array[0] = re;
+ c[iter].array[1] = im;
+ c[iter+n].array[0] = re;
+ c[iter+n].array[1] = -im;
+ }
+}
+
+/*
+ * make the complex eigenvectors from the real arrays produced by sgeev/dgeev.
+ * c is the array where the results will be left.
+ * r is the source array of reals produced by sgeev/dgeev
+ * i is the eigenvalue imaginary part produced by sgeev/dgeev
+ * n is the order of the matrix (n by n)
+ */
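+/*
+ * Concretely (standard LAPACK *geev layout): when eigenvalues k and k+1 form
+ * a complex-conjugate pair, column k of the real eigenvector output holds the
+ * real part and column k+1 the imaginary part; the two complex eigenvectors
+ * reconstructed here are Re(v) + i*Im(v) and Re(v) - i*Im(v).
+ */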
+static NPY_INLINE void
+mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c,
+ const @typ@ *r,
+ const @typ@ *i,
+ size_t n)
+{
+ size_t iter = 0;
+ while (iter < n)
+ {
+ if (i[iter] == @zero@) {
+ /* eigenvalue was real, eigenvectors as well... */
+ mk_@TYPE@_complex_array_from_real(c, r, n);
+ c += n;
+ r += n;
+            iter++;
+ } else {
+ /* eigenvalue was complex, generate a pair of eigenvectors */
+ mk_@TYPE@_complex_array_conjugate_pair(c, r, n);
+ c += 2*n;
+ r += 2*n;
+ iter += 2;
+ }
+ }
+}
+
+
+static NPY_INLINE void
+process_@lapack_func@_results(GEEV_PARAMS_t *params)
+{
+ /* REAL versions of geev need the results to be translated
+ * into complex versions. This is the way to deal with imaginary
+ * results. In our gufuncs we will always return complex arrays!
+ */
+ mk_@TYPE@_complex_array(params->W, params->WR, params->WI, params->N);
+
+ /* handle the eigenvectors */
+ if ('V' == params->JOBVL) {
+ mk_@lapack_func@_complex_eigenvectors(params->VL, params->VLR,
+ params->WI, params->N);
+ }
+ if ('V' == params->JOBVR) {
+ mk_@lapack_func@_complex_eigenvectors(params->VR, params->VRR,
+ params->WI, params->N);
+ }
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = CFLOAT, CDOUBLE#
+ #typ = COMPLEX_t, DOUBLECOMPLEX_t#
+ #ftyp = fortran_complex, fortran_doublecomplex#
+ #realtyp = float, double#
+ #lapack_func = cgeev, zgeev#
+ */
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(GEEV_PARAMS_t* params)
+{
+ fortran_int rv;
+
+ LAPACK(@lapack_func@)(&params->JOBVL, &params->JOBVR,
+ &params->N, params->A, &params->LDA,
+ params->W,
+ params->VL, &params->LDVL,
+ params->VR, &params->LDVR,
+ params->WORK, &params->LWORK,
+ params->WR, /* actually RWORK */
+ &rv);
+ return rv;
+}
+
+static NPY_INLINE int
+init_@lapack_func@(GEEV_PARAMS_t* params,
+ char jobvl,
+ char jobvr,
+ fortran_int n)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *w, *vl, *vr, *work, *rwork;
+ size_t safe_n = n;
+ size_t a_size = safe_n * safe_n * sizeof(@ftyp@);
+ size_t w_size = safe_n * sizeof(@ftyp@);
+ size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0;
+ size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0;
+ size_t rwork_size = 2 * safe_n * sizeof(@realtyp@);
+ size_t work_count = 0;
+ size_t total_size = a_size + w_size + vl_size + vr_size + rwork_size;
+ fortran_int ld = fortran_int_max(n, 1);
+
+ mem_buff = malloc(total_size);
+ if (!mem_buff) {
+ goto error;
+ }
+
+ a = mem_buff;
+ w = a + a_size;
+ vl = w + w_size;
+ vr = vl + vl_size;
+ rwork = vr + vr_size;
+
+ params->A = a;
+ params->WR = rwork;
+ params->WI = NULL;
+ params->VLR = NULL;
+ params->VRR = NULL;
+ params->VL = vl;
+ params->VR = vr;
+ params->W = w;
+ params->N = n;
+ params->LDA = ld;
+ params->LDVL = ld;
+ params->LDVR = ld;
+ params->JOBVL = jobvl;
+ params->JOBVR = jobvr;
+
+ /* Work size query */
+ {
+ @typ@ work_size_query;
+
+ params->LWORK = -1;
+ params->WORK = &work_size_query;
+
+ if (call_@lapack_func@(params) != 0) {
+ goto error;
+ }
+
+ work_count = (size_t) work_size_query.array[0];
+ /* Fix a bug in lapack 3.0.0 */
+ if(work_count == 0) work_count = 1;
+ }
+
+ mem_buff2 = malloc(work_count*sizeof(@ftyp@));
+ if (!mem_buff2) {
+ goto error;
+ }
+
+ work = mem_buff2;
+
+ params->LWORK = (fortran_int)work_count;
+ params->WORK = work;
+
+ return 1;
+ error:
+ free(mem_buff2);
+ free(mem_buff);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+
+static NPY_INLINE void
+process_@lapack_func@_results(GEEV_PARAMS_t *NPY_UNUSED(params))
+{
+ /* nothing to do here, complex versions are ready to copy out */
+}
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CDOUBLE#
+ #COMPLEXTYPE = CFLOAT, CDOUBLE, CDOUBLE#
+ #ftype = fortran_real, fortran_doublereal, fortran_doublecomplex#
+ #lapack_func = sgeev, dgeev, zgeev#
+ */
+
+static NPY_INLINE void
+release_@lapack_func@(GEEV_PARAMS_t *params)
+{
+ free(params->WORK);
+ free(params->A);
+ memset(params, 0, sizeof(*params));
+}
+
+static NPY_INLINE void
+@TYPE@_eig_wrapper(char JOBVL,
+ char JOBVR,
+ char**args,
+ npy_intp* dimensions,
+ npy_intp* steps)
+{
+ ptrdiff_t outer_steps[4];
+ size_t iter;
+ size_t outer_dim = *dimensions++;
+ size_t op_count = 2;
+ int error_occurred = get_fp_invalid_and_clear();
+ GEEV_PARAMS_t geev_params;
+
+ assert(JOBVL == 'N');
+
+ STACK_TRACE;
+ op_count += 'V'==JOBVL?1:0;
+ op_count += 'V'==JOBVR?1:0;
+
+ for (iter = 0; iter < op_count; ++iter) {
+ outer_steps[iter] = (ptrdiff_t) steps[iter];
+ }
+ steps += op_count;
+
+ if (init_@lapack_func@(&geev_params,
+ JOBVL, JOBVR,
+ (fortran_int)dimensions[0])) {
+ LINEARIZE_DATA_t a_in;
+ LINEARIZE_DATA_t w_out;
+ LINEARIZE_DATA_t vl_out;
+ LINEARIZE_DATA_t vr_out;
+
+ init_linearize_data(&a_in,
+ geev_params.N, geev_params.N,
+ steps[1], steps[0]);
+ steps += 2;
+ init_linearize_data(&w_out,
+ 1, geev_params.N,
+ 0, steps[0]);
+ steps += 1;
+ if ('V' == geev_params.JOBVL) {
+ init_linearize_data(&vl_out,
+ geev_params.N, geev_params.N,
+ steps[1], steps[0]);
+ steps += 2;
+ }
+ if ('V' == geev_params.JOBVR) {
+ init_linearize_data(&vr_out,
+ geev_params.N, geev_params.N,
+ steps[1], steps[0]);
+ }
+
+ for (iter = 0; iter < outer_dim; ++iter) {
+ int not_ok;
+ char **arg_iter = args;
+ /* copy the matrix in */
+ linearize_@TYPE@_matrix(geev_params.A, *arg_iter++, &a_in);
+ not_ok = call_@lapack_func@(&geev_params);
+
+ if (!not_ok) {
+ process_@lapack_func@_results(&geev_params);
+ delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
+ geev_params.W,
+ &w_out);
+
+ if ('V' == geev_params.JOBVL) {
+ delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
+ geev_params.VL,
+ &vl_out);
+ }
+ if ('V' == geev_params.JOBVR) {
+ delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
+ geev_params.VR,
+ &vr_out);
+ }
+ } else {
+ /* geev failed */
+ error_occurred = 1;
+ nan_@COMPLEXTYPE@_matrix(*arg_iter++, &w_out);
+ if ('V' == geev_params.JOBVL) {
+ nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vl_out);
+ }
+ if ('V' == geev_params.JOBVR) {
+ nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vr_out);
+ }
+ }
+ update_pointers((npy_uint8**)args, outer_steps, op_count);
+ }
+
+ release_@lapack_func@(&geev_params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+static void
+@TYPE@_eig(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_eig_wrapper('N', 'V', args, dimensions, steps);
+}
+
+static void
+@TYPE@_eigvals(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_eig_wrapper('N', 'N', args, dimensions, steps);
+}
+
+/**end repeat**/
+
+
+/* -------------------------------------------------------------------------- */
+ /* singular value decomposition */
+
+typedef struct gesdd_params_struct
+{
+ void *A;
+ void *S;
+ void *U;
+ void *VT;
+ void *WORK;
+ void *RWORK;
+ void *IWORK;
+
+ fortran_int M;
+ fortran_int N;
+ fortran_int LDA;
+ fortran_int LDU;
+ fortran_int LDVT;
+ fortran_int LWORK;
+ char JOBZ;
+} GESDD_PARAMS_t;
+
+
+static NPY_INLINE void
+dump_gesdd_params(const char *name,
+ GESDD_PARAMS_t *params)
+{
+ TRACE_TXT("\n%s:\n"\
+
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+
+ "%14s: %15c'%c'\n",
+
+ name,
+
+ "A", params->A,
+ "S", params->S,
+ "U", params->U,
+ "VT", params->VT,
+ "WORK", params->WORK,
+ "RWORK", params->RWORK,
+ "IWORK", params->IWORK,
+
+ "M", (int)params->M,
+ "N", (int)params->N,
+ "LDA", (int)params->LDA,
+ "LDU", (int)params->LDU,
+ "LDVT", (int)params->LDVT,
+ "LWORK", (int)params->LWORK,
+
+ "JOBZ", ' ', params->JOBZ);
+}
+
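+/*
+ * Helper mapping the LAPACK JOBZ mode of *gesdd to the requested sizes of
+ * U and VT: 'N' computes no singular vectors, 'S' the economy-size factors
+ * (min(m, n) columns of U / rows of VT), and 'A' the full square factors.
+ */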
+static NPY_INLINE int
+compute_urows_vtcolumns(char jobz,
+ fortran_int m, fortran_int n,
+ fortran_int *urows, fortran_int *vtcolumns)
+{
+ fortran_int min_m_n = fortran_int_min(m, n);
+ switch(jobz)
+ {
+ case 'N':
+ *urows = 0;
+ *vtcolumns = 0;
+ break;
+ case 'A':
+ *urows = m;
+ *vtcolumns = n;
+ break;
+ case 'S':
+ {
+ *urows = min_m_n;
+ *vtcolumns = min_m_n;
+ }
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE#
+ #lapack_func = sgesdd, dgesdd#
+ #ftyp = fortran_real, fortran_doublereal#
+ */
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(GESDD_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->JOBZ, &params->M, &params->N,
+ params->A, &params->LDA,
+ params->S,
+ params->U, &params->LDU,
+ params->VT, &params->LDVT,
+ params->WORK, &params->LWORK,
+ params->IWORK,
+ &rv);
+ return rv;
+}
+
+static NPY_INLINE int
+init_@lapack_func@(GESDD_PARAMS_t *params,
+ char jobz,
+ fortran_int m,
+ fortran_int n)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *s, *u, *vt, *work, *iwork;
+ size_t safe_m = m;
+ size_t safe_n = n;
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ fortran_int min_m_n = fortran_int_min(m, n);
+ size_t safe_min_m_n = min_m_n;
+ size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+ fortran_int u_row_count, vt_column_count;
+ size_t safe_u_row_count, safe_vt_column_count;
+ size_t u_size, vt_size;
+ fortran_int work_count;
+ size_t work_size;
+ size_t iwork_size = 8 * safe_min_m_n * sizeof(fortran_int);
+ fortran_int ld = fortran_int_max(m, 1);
+
+ if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count)) {
+ goto error;
+ }
+
+ safe_u_row_count = u_row_count;
+ safe_vt_column_count = vt_column_count;
+
+ u_size = safe_u_row_count * safe_m * sizeof(@ftyp@);
+ vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@);
+
+ mem_buff = malloc(a_size + s_size + u_size + vt_size + iwork_size);
+
+ if (!mem_buff) {
+ goto error;
+ }
+
+ a = mem_buff;
+ s = a + a_size;
+ u = s + s_size;
+ vt = u + u_size;
+ iwork = vt + vt_size;
+
+ /* fix vt_column_count so that it is a valid lapack parameter (0 is not) */
+ vt_column_count = fortran_int_max(1, vt_column_count);
+
+ params->M = m;
+ params->N = n;
+ params->A = a;
+ params->S = s;
+ params->U = u;
+ params->VT = vt;
+ params->RWORK = NULL;
+ params->IWORK = iwork;
+ params->M = m;
+ params->N = n;
+ params->LDA = ld;
+ params->LDU = ld;
+ params->LDVT = vt_column_count;
+ params->JOBZ = jobz;
+
+ /* Work size query */
+ {
+ @ftyp@ work_size_query;
+
+ params->LWORK = -1;
+ params->WORK = &work_size_query;
+
+ if (call_@lapack_func@(params) != 0) {
+ goto error;
+ }
+
+ work_count = (fortran_int)work_size_query;
+ /* Fix a bug in lapack 3.0.0 */
+ if(work_count == 0) work_count = 1;
+ work_size = (size_t)work_count * sizeof(@ftyp@);
+ }
+
+ mem_buff2 = malloc(work_size);
+ if (!mem_buff2) {
+ goto error;
+ }
+
+ work = mem_buff2;
+
+ params->LWORK = work_count;
+ params->WORK = work;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE = CFLOAT, CDOUBLE#
+ #ftyp = fortran_complex, fortran_doublecomplex#
+ #frealtyp = fortran_real, fortran_doublereal#
+ #typ = COMPLEX_t, DOUBLECOMPLEX_t#
+ #lapack_func = cgesdd, zgesdd#
+ */
+
+static NPY_INLINE fortran_int
+call_@lapack_func@(GESDD_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->JOBZ, &params->M, &params->N,
+ params->A, &params->LDA,
+ params->S,
+ params->U, &params->LDU,
+ params->VT, &params->LDVT,
+ params->WORK, &params->LWORK,
+ params->RWORK,
+ params->IWORK,
+ &rv);
+ return rv;
+}
+
+static NPY_INLINE int
+init_@lapack_func@(GESDD_PARAMS_t *params,
+ char jobz,
+ fortran_int m,
+ fortran_int n)
+{
+ npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL;
+    npy_uint8 *a, *s, *u, *vt, *work, *rwork, *iwork;
+ size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size;
+ size_t safe_u_row_count, safe_vt_column_count;
+ fortran_int u_row_count, vt_column_count, work_count;
+ size_t safe_m = m;
+ size_t safe_n = n;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ size_t safe_min_m_n = min_m_n;
+ fortran_int ld = fortran_int_max(m, 1);
+
+ if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count)) {
+ goto error;
+ }
+
+ safe_u_row_count = u_row_count;
+ safe_vt_column_count = vt_column_count;
+
+ a_size = safe_m * safe_n * sizeof(@ftyp@);
+ s_size = safe_min_m_n * sizeof(@frealtyp@);
+ u_size = safe_u_row_count * safe_m * sizeof(@ftyp@);
+ vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@);
+ rwork_size = 'N'==jobz?
+ (7 * safe_min_m_n) :
+ (5*safe_min_m_n * safe_min_m_n + 5*safe_min_m_n);
+ rwork_size *= sizeof(@ftyp@);
+ iwork_size = 8 * safe_min_m_n* sizeof(fortran_int);
+
+ mem_buff = malloc(a_size +
+ s_size +
+ u_size +
+ vt_size +
+ rwork_size +
+ iwork_size);
+ if (!mem_buff) {
+ goto error;
+ }
+
+ a = mem_buff;
+ s = a + a_size;
+ u = s + s_size;
+ vt = u + u_size;
+ rwork = vt + vt_size;
+ iwork = rwork + rwork_size;
+
+ /* fix vt_column_count so that it is a valid lapack parameter (0 is not) */
+ vt_column_count = fortran_int_max(1, vt_column_count);
+
+ params->A = a;
+ params->S = s;
+ params->U = u;
+ params->VT = vt;
+ params->RWORK = rwork;
+ params->IWORK = iwork;
+ params->M = m;
+ params->N = n;
+ params->LDA = ld;
+ params->LDU = ld;
+ params->LDVT = vt_column_count;
+ params->JOBZ = jobz;
+
+ /* Work size query */
+ {
+ @ftyp@ work_size_query;
+
+ params->LWORK = -1;
+ params->WORK = &work_size_query;
+
+ if (call_@lapack_func@(params) != 0) {
+ goto error;
+ }
+
+ work_count = (fortran_int)((@typ@*)&work_size_query)->array[0];
+ /* Fix a bug in lapack 3.0.0 */
+ if(work_count == 0) work_count = 1;
+ work_size = (size_t)work_count * sizeof(@ftyp@);
+ }
+
+ mem_buff2 = malloc(work_size);
+ if (!mem_buff2) {
+ goto error;
+ }
+
+ work = mem_buff2;
+
+ params->LWORK = work_count;
+ params->WORK = work;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff2);
+ free(mem_buff);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ #REALTYPE = FLOAT, DOUBLE, FLOAT, DOUBLE#
+ #lapack_func = sgesdd, dgesdd, cgesdd, zgesdd#
+ */
+static NPY_INLINE void
+release_@lapack_func@(GESDD_PARAMS_t* params)
+{
+ /* A and WORK contain allocated blocks */
+ free(params->A);
+ free(params->WORK);
+ memset(params, 0, sizeof(*params));
+}
+
+static NPY_INLINE void
+@TYPE@_svd_wrapper(char JOBZ,
+ char **args,
+ npy_intp* dimensions,
+ npy_intp* steps)
+{
+ ptrdiff_t outer_steps[4];
+ int error_occurred = get_fp_invalid_and_clear();
+ size_t iter;
+ size_t outer_dim = *dimensions++;
+ size_t op_count = (JOBZ=='N')?2:4;
+ GESDD_PARAMS_t params;
+
+ for (iter = 0; iter < op_count; ++iter) {
+ outer_steps[iter] = (ptrdiff_t) steps[iter];
+ }
+ steps += op_count;
+
+ if (init_@lapack_func@(&params,
+ JOBZ,
+ (fortran_int)dimensions[0],
+ (fortran_int)dimensions[1])) {
+ LINEARIZE_DATA_t a_in, u_out, s_out, v_out;
+ fortran_int min_m_n = params.M < params.N ? params.M : params.N;
+
+ init_linearize_data(&a_in, params.N, params.M, steps[1], steps[0]);
+ if ('N' == params.JOBZ) {
+ /* only the singular values are wanted */
+ init_linearize_data(&s_out, 1, min_m_n, 0, steps[2]);
+ } else {
+ fortran_int u_columns, v_rows;
+ if ('S' == params.JOBZ) {
+ u_columns = min_m_n;
+ v_rows = min_m_n;
+ } else { /* JOBZ == 'A' */
+ u_columns = params.M;
+ v_rows = params.N;
+ }
+ init_linearize_data(&u_out,
+ u_columns, params.M,
+ steps[3], steps[2]);
+ init_linearize_data(&s_out,
+ 1, min_m_n,
+ 0, steps[4]);
+ init_linearize_data(&v_out,
+ params.N, v_rows,
+ steps[6], steps[5]);
+ }
+
+ for (iter = 0; iter < outer_dim; ++iter) {
+ int not_ok;
+ /* copy the matrix in */
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ if ('N' == params.JOBZ) {
+ delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out);
+ } else {
+ if ('A' == params.JOBZ && min_m_n == 0) {
+ /* Lapack has betrayed us and left these uninitialized,
+ * so produce an identity matrix for whichever of u
+ * and v is not empty.
+ */
+ identity_@TYPE@_matrix(params.U, params.M);
+ identity_@TYPE@_matrix(params.VT, params.N);
+ }
+
+ delinearize_@TYPE@_matrix(args[1], params.U, &u_out);
+ delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out);
+ delinearize_@TYPE@_matrix(args[3], params.VT, &v_out);
+ }
+ } else {
+ error_occurred = 1;
+ if ('N' == params.JOBZ) {
+ nan_@REALTYPE@_matrix(args[1], &s_out);
+ } else {
+ nan_@TYPE@_matrix(args[1], &u_out);
+ nan_@REALTYPE@_matrix(args[2], &s_out);
+ nan_@TYPE@_matrix(args[3], &v_out);
+ }
+ }
+ update_pointers((npy_uint8**)args, outer_steps, op_count);
+ }
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+/**end repeat**/
+
+
+/* svd gufunc entry points */
+/**begin repeat
+ #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ */
+static void
+@TYPE@_svd_N(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_svd_wrapper('N', args, dimensions, steps);
+}
+
+static void
+@TYPE@_svd_S(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_svd_wrapper('S', args, dimensions, steps);
+}
+
+static void
+@TYPE@_svd_A(char **args,
+ npy_intp *dimensions,
+ npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ @TYPE@_svd_wrapper('A', args, dimensions, steps);
+}
+
+/**end repeat**/
+
+
+/* -------------------------------------------------------------------------- */
+ /* least squares */
+
+typedef struct gelsd_params_struct
+{
+ fortran_int M;
+ fortran_int N;
+ fortran_int NRHS;
+ void *A;
+ fortran_int LDA;
+ void *B;
+ fortran_int LDB;
+ void *S;
+ void *RCOND;
+ fortran_int RANK;
+ void *WORK;
+ fortran_int LWORK;
+ void *RWORK;
+ void *IWORK;
+} GELSD_PARAMS_t;
+
+
+static inline void
+dump_gelsd_params(const char *name,
+ GELSD_PARAMS_t *params)
+{
+ TRACE_TXT("\n%s:\n"\
+
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+
+ "%14s: %18p\n",
+
+ name,
+
+ "A", params->A,
+ "B", params->B,
+ "S", params->S,
+ "WORK", params->WORK,
+ "RWORK", params->RWORK,
+ "IWORK", params->IWORK,
+
+ "M", (int)params->M,
+ "N", (int)params->N,
+ "NRHS", (int)params->NRHS,
+ "LDA", (int)params->LDA,
+ "LDB", (int)params->LDB,
+ "LWORK", (int)params->LWORK,
+ "RANK", (int)params->RANK,
+
+ "RCOND", params->RCOND);
+}
+
+
+/**begin repeat
+ #TYPE=FLOAT,DOUBLE#
+ #lapack_func=sgelsd,dgelsd#
+ #ftyp=fortran_real,fortran_doublereal#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+ params->A, &params->LDA,
+ params->B, &params->LDB,
+ params->S,
+ params->RCOND, &params->RANK,
+ params->WORK, &params->LWORK,
+ params->IWORK,
+ &rv);
+ return rv;
+}
+
+static inline int
+init_@lapack_func@(GELSD_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n,
+ fortran_int nrhs)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *b, *s, *work, *iwork;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ fortran_int max_m_n = fortran_int_max(m, n);
+ size_t safe_min_m_n = min_m_n;
+ size_t safe_max_m_n = max_m_n;
+ size_t safe_m = m;
+ size_t safe_n = n;
+ size_t safe_nrhs = nrhs;
+
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+ size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+
+ fortran_int work_count;
+ size_t work_size;
+ size_t iwork_size;
+ fortran_int lda = fortran_int_max(1, m);
+ fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
+
+ mem_buff = malloc(a_size + b_size + s_size);
+
+ if (!mem_buff)
+ goto error;
+
+ a = mem_buff;
+ b = a + a_size;
+ s = b + b_size;
+
+
+ params->M = m;
+ params->N = n;
+ params->NRHS = nrhs;
+ params->A = a;
+ params->B = b;
+ params->S = s;
+ params->LDA = lda;
+ params->LDB = ldb;
+
+ {
+ /* compute optimal work size */
+ @ftyp@ work_size_query;
+ fortran_int iwork_size_query;
+
+ params->WORK = &work_size_query;
+ params->IWORK = &iwork_size_query;
+ params->RWORK = NULL;
+ params->LWORK = -1;
+
+ if (call_@lapack_func@(params) != 0)
+ goto error;
+
+ work_count = (fortran_int)work_size_query;
+
+ work_size = (size_t) work_size_query * sizeof(@ftyp@);
+ iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
+ }
+
+ mem_buff2 = malloc(work_size + iwork_size);
+ if (!mem_buff2)
+ goto error;
+
+ work = mem_buff2;
+ iwork = work + work_size;
+
+ params->WORK = work;
+ params->RWORK = NULL;
+ params->IWORK = iwork;
+ params->LWORK = work_count;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE=CFLOAT,CDOUBLE#
+ #ftyp=fortran_complex,fortran_doublecomplex#
+ #frealtyp=fortran_real,fortran_doublereal#
+ #typ=COMPLEX_t,DOUBLECOMPLEX_t#
+ #lapack_func=cgelsd,zgelsd#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+ params->A, &params->LDA,
+ params->B, &params->LDB,
+ params->S,
+ params->RCOND, &params->RANK,
+ params->WORK, &params->LWORK,
+ params->RWORK, params->IWORK,
+ &rv);
+ return rv;
+}
+
+static inline int
+init_@lapack_func@(GELSD_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n,
+ fortran_int nrhs)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ fortran_int max_m_n = fortran_int_max(m, n);
+ size_t safe_min_m_n = min_m_n;
+ size_t safe_max_m_n = max_m_n;
+ size_t safe_m = m;
+ size_t safe_n = n;
+ size_t safe_nrhs = nrhs;
+
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+ size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
+
+ fortran_int work_count;
+ size_t work_size, rwork_size, iwork_size;
+ fortran_int lda = fortran_int_max(1, m);
+ fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
+
+ mem_buff = malloc(a_size + b_size + s_size);
+
+ if (!mem_buff)
+ goto error;
+
+ a = mem_buff;
+ b = a + a_size;
+ s = b + b_size;
+
+
+ params->M = m;
+ params->N = n;
+ params->NRHS = nrhs;
+ params->A = a;
+ params->B = b;
+ params->S = s;
+ params->LDA = lda;
+ params->LDB = ldb;
+
+ {
+ /* compute optimal work size */
+ @ftyp@ work_size_query;
+ @frealtyp@ rwork_size_query;
+ fortran_int iwork_size_query;
+
+ params->WORK = &work_size_query;
+ params->IWORK = &iwork_size_query;
+ params->RWORK = &rwork_size_query;
+ params->LWORK = -1;
+
+ if (call_@lapack_func@(params) != 0)
+ goto error;
+
+ work_count = (fortran_int)work_size_query.r;
+
+ work_size = (size_t )work_size_query.r * sizeof(@ftyp@);
+ rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
+ iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
+ }
+
+ mem_buff2 = malloc(work_size + rwork_size + iwork_size);
+ if (!mem_buff2)
+ goto error;
+
+ work = mem_buff2;
+ rwork = work + work_size;
+ iwork = rwork + rwork_size;
+
+ params->WORK = work;
+ params->RWORK = rwork;
+ params->IWORK = iwork;
+ params->LWORK = work_count;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+ #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+ #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+ #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
+ #dot_func=sdot,ddot,cdotc,zdotc#
+ #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ #basetyp = npy_float, npy_double, npy_float, npy_double#
+ #ftyp = fortran_real, fortran_doublereal,
+ fortran_complex, fortran_doublecomplex#
+ #cmplx = 0, 0, 1, 1#
+ */
+static inline void
+release_@lapack_func@(GELSD_PARAMS_t* params)
+{
+ /* A and WORK contain allocated blocks */
+ free(params->A);
+ free(params->WORK);
+ memset(params, 0, sizeof(*params));
+}
+
+/** Compute the squared l2 norm of a contiguous vector */
+static @basetyp@
+@TYPE@_abs2(@typ@ *p, npy_intp n) {
+ npy_intp i;
+ @basetyp@ res = 0;
+ for (i = 0; i < n; i++) {
+ @typ@ el = p[i];
+#if @cmplx@
+ res += el.real*el.real + el.imag*el.imag;
+#else
+ res += el*el;
+#endif
+ }
+ return res;
+}
+
+static void
+@TYPE@_lstsq(char **args, npy_intp *dimensions, npy_intp *steps,
+ void *NPY_UNUSED(func))
+{
+ GELSD_PARAMS_t params;
+ int error_occurred = get_fp_invalid_and_clear();
+ fortran_int n, m, nrhs;
+ fortran_int excess;
+
+ INIT_OUTER_LOOP_7
+
+ m = (fortran_int)dimensions[0];
+ n = (fortran_int)dimensions[1];
+ nrhs = (fortran_int)dimensions[2];
+ excess = m - n;
+
+ if (init_@lapack_func@(&params, m, n, nrhs)) {
+ LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
+
+ init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+ init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
+ init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
+ init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
+ init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
+
+ BEGIN_OUTER_LOOP_7
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.B, args[1], &b_in);
+ params.RCOND = args[2];
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
+ *(npy_int*) args[5] = params.RANK;
+ delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
+
+ /* Note that linalg.lstsq discards this when excess == 0 */
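+                /* On exit from *gelsd the leading n rows of B hold the
+                 * solution; if m > n and the matrix has full rank, the
+                 * remaining m - n entries of each column are the residual
+                 * components whose squared norms are accumulated below. */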
+ if (excess >= 0 && params.RANK == n) {
+ /* Compute the residuals as the square sum of each column */
+ int i;
+ char *resid = args[4];
+ @ftyp@ *components = (@ftyp@ *)params.B + n;
+ for (i = 0; i < nrhs; i++) {
+ @ftyp@ *vector = components + i*m;
+ /* Numpy and fortran floating types are the same size,
+ * so this cast is safe */
+ @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
+ memcpy(
+ resid + i*r_out.column_strides,
+ &abs2, sizeof(abs2));
+ }
+ }
+ else {
+ /* Note that this is always discarded by linalg.lstsq */
+ nan_@REALTYPE@_matrix(args[4], &r_out);
+ }
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[3], &x_out);
+ nan_@REALTYPE@_matrix(args[4], &r_out);
+ *(npy_int*) args[5] = -1;
+ nan_@REALTYPE@_matrix(args[6], &s_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+#pragma GCC diagnostic pop
+
+/* -------------------------------------------------------------------------- */
+ /* gufunc registration */
+
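+/* All-NULL "data" array handed to PyUFunc_FromFuncAndDataAndSignature below;
+   the inner-loop kernels ignore their extra data pointer. */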
+static void *array_of_nulls[] = {
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL
+};
+
+#define FUNC_ARRAY_NAME(NAME) NAME ## _funcs
+
+#define GUFUNC_FUNC_ARRAY_REAL(NAME) \
+ static PyUFuncGenericFunction \
+ FUNC_ARRAY_NAME(NAME)[] = { \
+ FLOAT_ ## NAME, \
+ DOUBLE_ ## NAME \
+ }
+
+#define GUFUNC_FUNC_ARRAY_REAL_COMPLEX(NAME) \
+ static PyUFuncGenericFunction \
+ FUNC_ARRAY_NAME(NAME)[] = { \
+ FLOAT_ ## NAME, \
+ DOUBLE_ ## NAME, \
+ CFLOAT_ ## NAME, \
+ CDOUBLE_ ## NAME \
+ }
+
+/* There are problems with eig in complex single precision.
+ * That kernel is disabled
+ */
+#define GUFUNC_FUNC_ARRAY_EIG(NAME) \
+ static PyUFuncGenericFunction \
+ FUNC_ARRAY_NAME(NAME)[] = { \
+ FLOAT_ ## NAME, \
+ DOUBLE_ ## NAME, \
+ CDOUBLE_ ## NAME \
+ }
+
+
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighlo);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighup);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshlo);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshup);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve1);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(inv);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
+GUFUNC_FUNC_ARRAY_EIG(eig);
+GUFUNC_FUNC_ARRAY_EIG(eigvals);
+
+static char equal_2_types[] = {
+ NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_CFLOAT,
+ NPY_CDOUBLE, NPY_CDOUBLE
+};
+
+static char equal_3_types[] = {
+ NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE
+};
+
+/* second result is logdet, that will always be a REAL */
+static char slogdet_types[] = {
+ NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE
+};
+
+static char eigh_types[] = {
+ NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT,
+ NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
+};
+
+static char eighvals_types[] = {
+ NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_FLOAT,
+ NPY_CDOUBLE, NPY_DOUBLE
+};
+
+static char eig_types[] = {
+ NPY_FLOAT, NPY_CFLOAT, NPY_CFLOAT,
+ NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE
+};
+
+static char eigvals_types[] = {
+ NPY_FLOAT, NPY_CFLOAT,
+ NPY_DOUBLE, NPY_CDOUBLE,
+ NPY_CDOUBLE, NPY_CDOUBLE
+};
+
+static char svd_1_1_types[] = {
+ NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_FLOAT,
+ NPY_CDOUBLE, NPY_DOUBLE
+};
+
+static char svd_1_3_types[] = {
+ NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
+};
+
+/* A, b, rcond, x, resid, rank, s, */
+static char lstsq_types[] = {
+ NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
+ NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
+};
+
+typedef struct gufunc_descriptor_struct {
+ char *name;
+ char *signature;
+ char *doc;
+ int ntypes;
+ int nin;
+ int nout;
+ PyUFuncGenericFunction *funcs;
+ char *types;
+} GUFUNC_DESCRIPTOR_t;
+
+GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
+ {
+ "slogdet",
+ "(m,m)->(),()",
+ "slogdet on the last two dimensions and broadcast on the rest. \n"\
+ "Results in two arrays, one with sign and the other with log of the"\
+ " determinants. \n"\
+ " \"(m,m)->(),()\" \n",
+ 4, 1, 2,
+ FUNC_ARRAY_NAME(slogdet),
+ slogdet_types
+ },
+ {
+ "det",
+ "(m,m)->()",
+ "det of the last two dimensions and broadcast on the rest. \n"\
+ " \"(m,m)->()\" \n",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(det),
+ equal_2_types
+ },
+ {
+ "eigh_lo",
+ "(m,m)->(m),(m,m)",
+        "eigh on the last two dimensions and broadcast to the rest, using"\
+        " the lower triangle. \n"\
+        "Results in a vector of eigenvalues and a matrix with the"\
+        " eigenvectors. \n"\
+ " \"(m,m)->(m),(m,m)\" \n",
+ 4, 1, 2,
+ FUNC_ARRAY_NAME(eighlo),
+ eigh_types
+ },
+ {
+ "eigh_up",
+ "(m,m)->(m),(m,m)",
+        "eigh on the last two dimensions and broadcast to the rest, using"\
+        " the upper triangle. \n"\
+ "Results in a vector of eigenvalues and a matrix with the"\
+ " eigenvectors. \n"\
+ " \"(m,m)->(m),(m,m)\" \n",
+ 4, 1, 2,
+ FUNC_ARRAY_NAME(eighup),
+ eigh_types
+ },
+ {
+ "eigvalsh_lo",
+ "(m,m)->(m)",
+        "eigvalsh on the last two dimensions and broadcast to the rest,"\
+        " using the lower triangle. \n"\
+        "Results in a vector of eigenvalues. \n"\
+ " \"(m,m)->(m)\" \n",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(eigvalshlo),
+ eighvals_types
+ },
+ {
+ "eigvalsh_up",
+ "(m,m)->(m)",
+        "eigvalsh on the last two dimensions and broadcast to the rest,"\
+        " using the upper triangle. \n"\
+        "Results in a vector of eigenvalues. \n"\
+ " \"(m,m)->(m)\" \n",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(eigvalshup),
+ eighvals_types
+ },
+ {
+ "solve",
+ "(m,m),(m,n)->(m,n)",
+ "solve the system a x = b, on the last two dimensions, broadcast"\
+ " to the rest. \n"\
+        "Results in matrices with the solutions. \n"\
+ " \"(m,m),(m,n)->(m,n)\" \n",
+ 4, 2, 1,
+ FUNC_ARRAY_NAME(solve),
+ equal_3_types
+ },
+ {
+ "solve1",
+ "(m,m),(m)->(m)",
+ "solve the system a x = b, for b being a vector, broadcast in"\
+ " the outer dimensions. \n"\
+ "Results in vectors with the solutions. \n"\
+ " \"(m,m),(m)->(m)\" \n",
+ 4, 2, 1,
+ FUNC_ARRAY_NAME(solve1),
+ equal_3_types
+ },
+ {
+ "inv",
+ "(m, m)->(m, m)",
+ "compute the inverse of the last two dimensions and broadcast"\
+ " to the rest. \n"\
+ "Results in the inverse matrices. \n"\
+ " \"(m,m)->(m,m)\" \n",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(inv),
+ equal_2_types
+ },
+ {
+ "cholesky_lo",
+ "(m,m)->(m,m)",
+ "cholesky decomposition of hermitian positive-definite matrices. \n"\
+ "Broadcast to all outer dimensions. \n"\
+ " \"(m,m)->(m,m)\" \n",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(cholesky_lo),
+ equal_2_types
+ },
+ {
+ "svd_m",
+ "(m,n)->(m)",
+ "svd when n>=m. ",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(svd_N),
+ svd_1_1_types
+ },
+ {
+ "svd_n",
+ "(m,n)->(n)",
+ "svd when n<=m",
+ 4, 1, 1,
+ FUNC_ARRAY_NAME(svd_N),
+ svd_1_1_types
+ },
+ {
+ "svd_m_s",
+ "(m,n)->(m,m),(m),(m,n)",
+ "svd when m<=n",
+ 4, 1, 3,
+ FUNC_ARRAY_NAME(svd_S),
+ svd_1_3_types
+ },
+ {
+ "svd_n_s",
+ "(m,n)->(m,n),(n),(n,n)",
+ "svd when m>=n",
+ 4, 1, 3,
+ FUNC_ARRAY_NAME(svd_S),
+ svd_1_3_types
+ },
+ {
+ "svd_m_f",
+ "(m,n)->(m,m),(m),(n,n)",
+ "svd when m<=n",
+ 4, 1, 3,
+ FUNC_ARRAY_NAME(svd_A),
+ svd_1_3_types
+ },
+ {
+ "svd_n_f",
+ "(m,n)->(m,m),(n),(n,n)",
+ "svd when m>=n",
+ 4, 1, 3,
+ FUNC_ARRAY_NAME(svd_A),
+ svd_1_3_types
+ },
+ {
+ "eig",
+ "(m,m)->(m),(m,m)",
+        "eig on the last two dimensions and broadcast to the rest. \n"\
+ "Results in a vector with the eigenvalues and a matrix with the"\
+ " eigenvectors. \n"\
+ " \"(m,m)->(m),(m,m)\" \n",
+ 3, 1, 2,
+ FUNC_ARRAY_NAME(eig),
+ eig_types
+ },
+ {
+ "eigvals",
+ "(m,m)->(m)",
+        "eigvals on the last two dimensions and broadcast to the rest. \n"\
+ "Results in a vector of eigenvalues. \n",
+ 3, 1, 1,
+ FUNC_ARRAY_NAME(eigvals),
+ eigvals_types
+ },
+ {
+ "lstsq_m",
+ "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
+ "least squares on the last two dimensions and broadcast to the rest. \n"\
+ "For m <= n. \n",
+ 4, 3, 4,
+ FUNC_ARRAY_NAME(lstsq),
+ lstsq_types
+ },
+ {
+ "lstsq_n",
+ "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)",
+ "least squares on the last two dimensions and broadcast to the rest. \n"\
+ "For m >= n, meaning that residuals are produced. \n",
+ 4, 3, 4,
+ FUNC_ARRAY_NAME(lstsq),
+ lstsq_types
+ }
+};
+
+static void
+addUfuncs(PyObject *dictionary) {
+ PyObject *f;
+ int i;
+ const int gufunc_count = sizeof(gufunc_descriptors)/
+ sizeof(gufunc_descriptors[0]);
+ for (i = 0; i < gufunc_count; i++) {
+ GUFUNC_DESCRIPTOR_t* d = &gufunc_descriptors[i];
+ f = PyUFunc_FromFuncAndDataAndSignature(d->funcs,
+ array_of_nulls,
+ d->types,
+ d->ntypes,
+ d->nin,
+ d->nout,
+ PyUFunc_None,
+ d->name,
+ d->doc,
+ 0,
+ d->signature);
+ PyDict_SetItemString(dictionary, d->name, f);
+#if 0
+ dump_ufunc_object((PyUFuncObject*) f);
+#endif
+ Py_DECREF(f);
+ }
+}
+
+
+
+/* -------------------------------------------------------------------------- */
+ /* Module initialization stuff */
+
+static PyMethodDef UMath_LinAlgMethods[] = {
+ {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+#if defined(NPY_PY3K)
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ UMATH_LINALG_MODULE_NAME,
+ NULL,
+ -1,
+ UMath_LinAlgMethods,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+#endif
+
+#if defined(NPY_PY3K)
+#define RETVAL(x) x
+PyObject *PyInit__umath_linalg(void)
+#else
+#define RETVAL(x)
+PyMODINIT_FUNC
+init_umath_linalg(void)
+#endif
+{
+ PyObject *m;
+ PyObject *d;
+ PyObject *version;
+
+ init_constants();
+#if defined(NPY_PY3K)
+ m = PyModule_Create(&moduledef);
+#else
+ m = Py_InitModule(UMATH_LINALG_MODULE_NAME, UMath_LinAlgMethods);
+#endif
+ if (m == NULL) {
+ return RETVAL(NULL);
+ }
+
+ import_array();
+ import_ufunc();
+
+ d = PyModule_GetDict(m);
+
+ version = PyString_FromString(umath_linalg_version_string);
+ PyDict_SetItemString(d, "__version__", version);
+ Py_DECREF(version);
+
+ /* Load the ufunc operators into the module's namespace */
+ addUfuncs(d);
+
+ if (PyErr_Occurred()) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _umath_linalg module.");
+ return RETVAL(NULL);
+ }
+
+ return RETVAL(m);
+}
diff --git a/contrib/python/numpy/py2/numpy/ma/setup.py b/contrib/python/numpy/py2/numpy/ma/setup.py
new file mode 100644
index 00000000000..d1d6c89b513
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/setup.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+from __future__ import division, print_function
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('ma', parent_package, top_path)
+ config.add_data_dir('tests')
+ return config
+
+if __name__ == "__main__":
+ from numpy.distutils.core import setup
+ config = configuration(top_path='').todict()
+ setup(**config)
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/__init__.py b/contrib/python/numpy/py2/numpy/ma/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_core.py b/contrib/python/numpy/py2/numpy/ma/tests/test_core.py
new file mode 100644
index 00000000000..e0dbf1b1af7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_core.py
@@ -0,0 +1,5205 @@
+# pylint: disable-msg=W0400,W0511,W0611,W0612,W0614,R0201,E1102
+"""Tests suite for MaskedArray & subclassing.
+
+:author: Pierre Gerard-Marchant
+:contact: pierregm_at_uga_dot_edu
+"""
+from __future__ import division, absolute_import, print_function
+
+__author__ = "Pierre GF Gerard-Marchant"
+
+import sys
+import warnings
+import operator
+import itertools
+import textwrap
+import pytest
+
+from functools import reduce
+
+
+import numpy as np
+import numpy.ma.core
+import numpy.core.fromnumeric as fromnumeric
+import numpy.core.umath as umath
+from numpy.testing import (
+ assert_raises, assert_warns, suppress_warnings
+ )
+from numpy import ndarray
+from numpy.compat import asbytes
+from numpy.ma.testutils import (
+ assert_, assert_array_equal, assert_equal, assert_almost_equal,
+ assert_equal_records, fail_if_equal, assert_not_equal,
+ assert_mask_equal
+ )
+from numpy.ma.core import (
+ MAError, MaskError, MaskType, MaskedArray, abs, absolute, add, all,
+ allclose, allequal, alltrue, angle, anom, arange, arccos, arccosh, arctan2,
+ arcsin, arctan, argsort, array, asarray, choose, concatenate,
+ conjugate, cos, cosh, count, default_fill_value, diag, divide, empty,
+ empty_like, equal, exp, flatten_mask, filled, fix_invalid,
+ flatten_structured_array, fromflex, getmask, getmaskarray, greater,
+ greater_equal, identity, inner, isMaskedArray, less, less_equal, log,
+ log10, make_mask, make_mask_descr, mask_or, masked, masked_array,
+ masked_equal, masked_greater, masked_greater_equal, masked_inside,
+ masked_less, masked_less_equal, masked_not_equal, masked_outside,
+ masked_print_option, masked_values, masked_where, max, maximum,
+ maximum_fill_value, min, minimum, minimum_fill_value, mod, multiply,
+ mvoid, nomask, not_equal, ones, outer, power, product, put, putmask,
+ ravel, repeat, reshape, resize, shape, sin, sinh, sometrue, sort, sqrt,
+ subtract, sum, take, tan, tanh, transpose, where, zeros,
+ )
+from numpy.core.numeric import pickle
+
+pi = np.pi
+
+
+suppress_copy_mask_on_assignment = suppress_warnings()
+suppress_copy_mask_on_assignment.filter(
+ numpy.ma.core.MaskedArrayFutureWarning,
+ "setting an item on a masked array which has a shared mask will not copy")
+
+
+# For parametrized numeric testing
+num_dts = [np.dtype(dt_) for dt_ in '?bhilqBHILQefdgFD']
+num_ids = [dt_.char for dt_ in num_dts]
+
+
+class TestMaskedArray(object):
+ # Base test class for MaskedArrays.
+
+ def setup(self):
+ # Base data definition.
+ x = np.array([1., 1., 1., -2., pi/2.0, 4., 5., -10., 10., 1., 2., 3.])
+ y = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.])
+ a10 = 10.
+ m1 = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ m2 = [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1]
+ xm = masked_array(x, mask=m1)
+ ym = masked_array(y, mask=m2)
+ z = np.array([-.5, 0., .5, .8])
+ zm = masked_array(z, mask=[0, 1, 0, 0])
+ xf = np.where(m1, 1e+20, x)
+ xm.set_fill_value(1e+20)
+ self.d = (x, y, a10, m1, m2, xm, ym, z, zm, xf)
+
+ def test_basicattributes(self):
+ # Tests some basic array attributes.
+ a = array([1, 3, 2])
+ b = array([1, 3, 2], mask=[1, 0, 1])
+ assert_equal(a.ndim, 1)
+ assert_equal(b.ndim, 1)
+ assert_equal(a.size, 3)
+ assert_equal(b.size, 3)
+ assert_equal(a.shape, (3,))
+ assert_equal(b.shape, (3,))
+
+ def test_basic0d(self):
+ # Checks masking a scalar
+ x = masked_array(0)
+ assert_equal(str(x), '0')
+ x = masked_array(0, mask=True)
+ assert_equal(str(x), str(masked_print_option))
+ x = masked_array(0, mask=False)
+ assert_equal(str(x), '0')
+ x = array(0, mask=1)
+ assert_(x.filled().dtype is x._data.dtype)
+
+ def test_basic1d(self):
+ # Test of basic array creation and properties in 1 dimension.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ assert_(not isMaskedArray(x))
+ assert_(isMaskedArray(xm))
+ assert_((xm - ym).filled(0).any())
+ fail_if_equal(xm.mask.astype(int), ym.mask.astype(int))
+ s = x.shape
+ assert_equal(np.shape(xm), s)
+ assert_equal(xm.shape, s)
+ assert_equal(xm.dtype, x.dtype)
+ assert_equal(zm.dtype, z.dtype)
+ assert_equal(xm.size, reduce(lambda x, y:x * y, s))
+ assert_equal(count(xm), len(m1) - reduce(lambda x, y:x + y, m1))
+ assert_array_equal(xm, xf)
+ assert_array_equal(filled(xm, 1.e20), xf)
+ assert_array_equal(x, xm)
+
+ def test_basic2d(self):
+ # Test of basic array creation and properties in 2 dimensions.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ for s in [(4, 3), (6, 2)]:
+ x.shape = s
+ y.shape = s
+ xm.shape = s
+ ym.shape = s
+ xf.shape = s
+
+ assert_(not isMaskedArray(x))
+ assert_(isMaskedArray(xm))
+ assert_equal(shape(xm), s)
+ assert_equal(xm.shape, s)
+ assert_equal(xm.size, reduce(lambda x, y:x * y, s))
+ assert_equal(count(xm), len(m1) - reduce(lambda x, y:x + y, m1))
+ assert_equal(xm, xf)
+ assert_equal(filled(xm, 1.e20), xf)
+ assert_equal(x, xm)
+
+ def test_concatenate_basic(self):
+ # Tests concatenations.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ # basic concatenation
+ assert_equal(np.concatenate((x, y)), concatenate((xm, ym)))
+ assert_equal(np.concatenate((x, y)), concatenate((x, y)))
+ assert_equal(np.concatenate((x, y)), concatenate((xm, y)))
+ assert_equal(np.concatenate((x, y, x)), concatenate((x, ym, x)))
+
+ def test_concatenate_alongaxis(self):
+ # Tests concatenations.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ # Concatenation along an axis
+ s = (3, 4)
+ x.shape = y.shape = xm.shape = ym.shape = s
+ assert_equal(xm.mask, np.reshape(m1, s))
+ assert_equal(ym.mask, np.reshape(m2, s))
+ xmym = concatenate((xm, ym), 1)
+ assert_equal(np.concatenate((x, y), 1), xmym)
+ assert_equal(np.concatenate((xm.mask, ym.mask), 1), xmym._mask)
+
+ x = zeros(2)
+ y = array(ones(2), mask=[False, True])
+ z = concatenate((x, y))
+ assert_array_equal(z, [0, 0, 1, 1])
+ assert_array_equal(z.mask, [False, False, False, True])
+ z = concatenate((y, x))
+ assert_array_equal(z, [1, 1, 0, 0])
+ assert_array_equal(z.mask, [False, True, False, False])
+
+ def test_concatenate_flexible(self):
+ # Tests the concatenation on flexible arrays.
+ data = masked_array(list(zip(np.random.rand(10),
+ np.arange(10))),
+ dtype=[('a', float), ('b', int)])
+
+ test = concatenate([data[:5], data[5:]])
+ assert_equal_records(test, data)
+
+ def test_creation_ndmin(self):
+ # Check the use of ndmin
+ x = array([1, 2, 3], mask=[1, 0, 0], ndmin=2)
+ assert_equal(x.shape, (1, 3))
+ assert_equal(x._data, [[1, 2, 3]])
+ assert_equal(x._mask, [[1, 0, 0]])
+
+ def test_creation_ndmin_from_maskedarray(self):
+ # Make sure we're not losing the original mask w/ ndmin
+ x = array([1, 2, 3])
+ x[-1] = masked
+ xx = array(x, ndmin=2, dtype=float)
+ assert_equal(x.shape, x._mask.shape)
+ assert_equal(xx.shape, xx._mask.shape)
+
+ def test_creation_maskcreation(self):
+ # Tests how masks are initialized at the creation of Maskedarrays.
+ data = arange(24, dtype=float)
+ data[[3, 6, 15]] = masked
+ dma_1 = MaskedArray(data)
+ assert_equal(dma_1.mask, data.mask)
+ dma_2 = MaskedArray(dma_1)
+ assert_equal(dma_2.mask, dma_1.mask)
+ dma_3 = MaskedArray(dma_1, mask=[1, 0, 0, 0] * 6)
+ fail_if_equal(dma_3.mask, dma_1.mask)
+
+ x = array([1, 2, 3], mask=True)
+ assert_equal(x._mask, [True, True, True])
+ x = array([1, 2, 3], mask=False)
+ assert_equal(x._mask, [False, False, False])
+ y = array([1, 2, 3], mask=x._mask, copy=False)
+ assert_(np.may_share_memory(x.mask, y.mask))
+ y = array([1, 2, 3], mask=x._mask, copy=True)
+ assert_(not np.may_share_memory(x.mask, y.mask))
+
+ def test_creation_with_list_of_maskedarrays(self):
+ # Tests creating a masked array from a list of masked arrays.
+ x = array(np.arange(5), mask=[1, 0, 0, 0, 0])
+ data = array((x, x[::-1]))
+ assert_equal(data, [[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]])
+ assert_equal(data._mask, [[1, 0, 0, 0, 0], [0, 0, 0, 0, 1]])
+
+ x.mask = nomask
+ data = array((x, x[::-1]))
+ assert_equal(data, [[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]])
+ assert_(data.mask is nomask)
+
+ def test_creation_from_ndarray_with_padding(self):
+ x = np.array([('A', 0)], dtype={'names':['f0','f1'],
+ 'formats':['S4','i8'],
+ 'offsets':[0,8]})
+ array(x) # used to fail due to 'V' padding field in x.dtype.descr
+
+ def test_asarray(self):
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ xm.fill_value = -9999
+ xm._hardmask = True
+ xmm = asarray(xm)
+ assert_equal(xmm._data, xm._data)
+ assert_equal(xmm._mask, xm._mask)
+ assert_equal(xmm.fill_value, xm.fill_value)
+ assert_equal(xmm._hardmask, xm._hardmask)
+
+ def test_asarray_default_order(self):
+ # See Issue #6646
+ m = np.eye(3).T
+ assert_(not m.flags.c_contiguous)
+
+ new_m = asarray(m)
+ assert_(new_m.flags.c_contiguous)
+
+ def test_asarray_enforce_order(self):
+ # See Issue #6646
+ m = np.eye(3).T
+ assert_(not m.flags.c_contiguous)
+
+ new_m = asarray(m, order='C')
+ assert_(new_m.flags.c_contiguous)
+
+ def test_fix_invalid(self):
+ # Checks fix_invalid.
+ with np.errstate(invalid='ignore'):
+ data = masked_array([np.nan, 0., 1.], mask=[0, 0, 1])
+ data_fixed = fix_invalid(data)
+ assert_equal(data_fixed._data, [data.fill_value, 0., 1.])
+ assert_equal(data_fixed._mask, [1., 0., 1.])
+
+ def test_maskedelement(self):
+ # Test of masked element
+ x = arange(6)
+ x[1] = masked
+ assert_(str(masked) == '--')
+ assert_(x[1] is masked)
+ assert_equal(filled(x[1], 0), 0)
+
+ def test_set_element_as_object(self):
+ # Tests setting elements with object
+ a = empty(1, dtype=object)
+ x = (1, 2, 3, 4, 5)
+ a[0] = x
+ assert_equal(a[0], x)
+ assert_(a[0] is x)
+
+ import datetime
+ dt = datetime.datetime.now()
+ a[0] = dt
+ assert_(a[0] is dt)
+
+ def test_indexing(self):
+ # Tests conversions and indexing
+ x1 = np.array([1, 2, 4, 3])
+ x2 = array(x1, mask=[1, 0, 0, 0])
+ x3 = array(x1, mask=[0, 1, 0, 1])
+ x4 = array(x1)
+ # test conversion to strings
+ str(x2) # raises?
+ repr(x2) # raises?
+ assert_equal(np.sort(x1), sort(x2, endwith=False))
+ # tests of indexing
+ assert_(type(x2[1]) is type(x1[1]))
+ assert_(x1[1] == x2[1])
+ assert_(x2[0] is masked)
+ assert_equal(x1[2], x2[2])
+ assert_equal(x1[2:5], x2[2:5])
+ assert_equal(x1[:], x2[:])
+ assert_equal(x1[1:], x3[1:])
+ x1[2] = 9
+ x2[2] = 9
+ assert_equal(x1, x2)
+ x1[1:3] = 99
+ x2[1:3] = 99
+ assert_equal(x1, x2)
+ x2[1] = masked
+ assert_equal(x1, x2)
+ x2[1:3] = masked
+ assert_equal(x1, x2)
+ x2[:] = x1
+ x2[1] = masked
+ assert_(allequal(getmask(x2), array([0, 1, 0, 0])))
+ x3[:] = masked_array([1, 2, 3, 4], [0, 1, 1, 0])
+ assert_(allequal(getmask(x3), array([0, 1, 1, 0])))
+ x4[:] = masked_array([1, 2, 3, 4], [0, 1, 1, 0])
+ assert_(allequal(getmask(x4), array([0, 1, 1, 0])))
+ assert_(allequal(x4, array([1, 2, 3, 4])))
+ x1 = np.arange(5) * 1.0
+ x2 = masked_values(x1, 3.0)
+ assert_equal(x1, x2)
+ assert_(allequal(array([0, 0, 0, 1, 0], MaskType), x2.mask))
+ assert_equal(3.0, x2.fill_value)
+ x1 = array([1, 'hello', 2, 3], object)
+ x2 = np.array([1, 'hello', 2, 3], object)
+ s1 = x1[1]
+ s2 = x2[1]
+ assert_equal(type(s2), str)
+ assert_equal(type(s1), str)
+ assert_equal(s1, s2)
+ assert_(x1[1:1].shape == (0,))
+
+ @suppress_copy_mask_on_assignment
+ def test_copy(self):
+ # Tests of some subtle points of copying and sizing.
+ n = [0, 0, 1, 0, 0]
+ m = make_mask(n)
+ m2 = make_mask(m)
+ assert_(m is m2)
+ m3 = make_mask(m, copy=1)
+ assert_(m is not m3)
+
+ x1 = np.arange(5)
+ y1 = array(x1, mask=m)
+ assert_equal(y1._data.__array_interface__, x1.__array_interface__)
+ assert_(allequal(x1, y1.data))
+ assert_equal(y1._mask.__array_interface__, m.__array_interface__)
+
+ y1a = array(y1)
+ # Default for masked array is not to copy; see gh-10318.
+ assert_(y1a._data.__array_interface__ ==
+ y1._data.__array_interface__)
+ assert_(y1a._mask.__array_interface__ ==
+ y1._mask.__array_interface__)
+
+ y2 = array(x1, mask=m3)
+ assert_(y2._data.__array_interface__ == x1.__array_interface__)
+ assert_(y2._mask.__array_interface__ == m3.__array_interface__)
+ assert_(y2[2] is masked)
+ y2[2] = 9
+ assert_(y2[2] is not masked)
+ assert_(y2._mask.__array_interface__ == m3.__array_interface__)
+ assert_(allequal(y2.mask, 0))
+
+ y2a = array(x1, mask=m, copy=1)
+ assert_(y2a._data.__array_interface__ != x1.__array_interface__)
+ #assert_( y2a.mask is not m)
+ assert_(y2a._mask.__array_interface__ != m.__array_interface__)
+ assert_(y2a[2] is masked)
+ y2a[2] = 9
+ assert_(y2a[2] is not masked)
+ #assert_( y2a.mask is not m)
+ assert_(y2a._mask.__array_interface__ != m.__array_interface__)
+ assert_(allequal(y2a.mask, 0))
+
+ y3 = array(x1 * 1.0, mask=m)
+ assert_(filled(y3).dtype is (x1 * 1.0).dtype)
+
+ x4 = arange(4)
+ x4[2] = masked
+ y4 = resize(x4, (8,))
+ assert_equal(concatenate([x4, x4]), y4)
+ assert_equal(getmask(y4), [0, 0, 1, 0, 0, 0, 1, 0])
+ y5 = repeat(x4, (2, 2, 2, 2), axis=0)
+ assert_equal(y5, [0, 0, 1, 1, 2, 2, 3, 3])
+ y6 = repeat(x4, 2, axis=0)
+ assert_equal(y5, y6)
+ y7 = x4.repeat((2, 2, 2, 2), axis=0)
+ assert_equal(y5, y7)
+ y8 = x4.repeat(2, 0)
+ assert_equal(y5, y8)
+
+ y9 = x4.copy()
+ assert_equal(y9._data, x4._data)
+ assert_equal(y9._mask, x4._mask)
+
+ x = masked_array([1, 2, 3], mask=[0, 1, 0])
+ # Copy is False by default
+ y = masked_array(x)
+ assert_equal(y._data.ctypes.data, x._data.ctypes.data)
+ assert_equal(y._mask.ctypes.data, x._mask.ctypes.data)
+ y = masked_array(x, copy=True)
+ assert_not_equal(y._data.ctypes.data, x._data.ctypes.data)
+ assert_not_equal(y._mask.ctypes.data, x._mask.ctypes.data)
+
+ def test_copy_0d(self):
+ # gh-9430
+ x = np.ma.array(43, mask=True)
+ xc = x.copy()
+ assert_equal(xc.mask, True)
+
+ def test_copy_on_python_builtins(self):
+ # Tests copy works on python builtins (issue#8019)
+ assert_(isMaskedArray(np.ma.copy([1,2,3])))
+ assert_(isMaskedArray(np.ma.copy((1,2,3))))
+
+ def test_copy_immutable(self):
+ # Tests that the copy method is immutable, GitHub issue #5247
+ a = np.ma.array([1, 2, 3])
+ b = np.ma.array([4, 5, 6])
+ a_copy_method = a.copy
+ b.copy
+ assert_equal(a_copy_method(), [1, 2, 3])
+
+ def test_deepcopy(self):
+ from copy import deepcopy
+ a = array([0, 1, 2], mask=[False, True, False])
+ copied = deepcopy(a)
+ assert_equal(copied.mask, a.mask)
+ assert_not_equal(id(a._mask), id(copied._mask))
+
+ copied[1] = 1
+ assert_equal(copied.mask, [0, 0, 0])
+ assert_equal(a.mask, [0, 1, 0])
+
+ copied = deepcopy(a)
+ assert_equal(copied.mask, a.mask)
+ copied.mask[1] = False
+ assert_equal(copied.mask, [0, 0, 0])
+ assert_equal(a.mask, [0, 1, 0])
+
+ def test_str_repr(self):
+ a = array([0, 1, 2], mask=[False, True, False])
+ assert_equal(str(a), '[0 -- 2]')
+ assert_equal(
+ repr(a),
+ textwrap.dedent('''\
+ masked_array(data=[0, --, 2],
+ mask=[False, True, False],
+ fill_value=999999)''')
+ )
+
+ # arrays with a continuation
+ a = np.ma.arange(2000)
+ a[1:50] = np.ma.masked
+ assert_equal(
+ repr(a),
+ textwrap.dedent('''\
+ masked_array(data=[0, --, --, ..., 1997, 1998, 1999],
+ mask=[False, True, True, ..., False, False, False],
+ fill_value=999999)''')
+ )
+
+ # line-wrapped 1d arrays are correctly aligned
+ a = np.ma.arange(20)
+ assert_equal(
+ repr(a),
+ textwrap.dedent('''\
+ masked_array(data=[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19],
+ mask=False,
+ fill_value=999999)''')
+ )
+
+ # 2d arrays cause wrapping
+ a = array([[1, 2, 3], [4, 5, 6]], dtype=np.int8)
+ a[1,1] = np.ma.masked
+ assert_equal(
+ repr(a),
+ textwrap.dedent('''\
+ masked_array(
+ data=[[1, 2, 3],
+ [4, --, 6]],
+ mask=[[False, False, False],
+ [False, True, False]],
+ fill_value=999999,
+ dtype=int8)''')
+ )
+
+        # but not if they're a row vector
+ assert_equal(
+ repr(a[:1]),
+ textwrap.dedent('''\
+ masked_array(data=[[1, 2, 3]],
+ mask=[[False, False, False]],
+ fill_value=999999,
+ dtype=int8)''')
+ )
+
+ # dtype=int is implied, so not shown
+ assert_equal(
+ repr(a.astype(int)),
+ textwrap.dedent('''\
+ masked_array(
+ data=[[1, 2, 3],
+ [4, --, 6]],
+ mask=[[False, False, False],
+ [False, True, False]],
+ fill_value=999999)''')
+ )
+
+ def test_str_repr_legacy(self):
+ oldopts = np.get_printoptions()
+ np.set_printoptions(legacy='1.13')
+ try:
+ a = array([0, 1, 2], mask=[False, True, False])
+ assert_equal(str(a), '[0 -- 2]')
+ assert_equal(repr(a), 'masked_array(data = [0 -- 2],\n'
+ ' mask = [False True False],\n'
+ ' fill_value = 999999)\n')
+
+ a = np.ma.arange(2000)
+ a[1:50] = np.ma.masked
+ assert_equal(
+ repr(a),
+ 'masked_array(data = [0 -- -- ..., 1997 1998 1999],\n'
+ ' mask = [False True True ..., False False False],\n'
+ ' fill_value = 999999)\n'
+ )
+ finally:
+ np.set_printoptions(**oldopts)
+
+ def test_0d_unicode(self):
+ u = u'caf\xe9'
+ utype = type(u)
+
+ arr_nomask = np.ma.array(u)
+ arr_masked = np.ma.array(u, mask=True)
+
+ assert_equal(utype(arr_nomask), u)
+ assert_equal(utype(arr_masked), u'--')
+
+ def test_pickling(self):
+ # Tests pickling
+ for dtype in (int, float, str, object):
+ a = arange(10).astype(dtype)
+ a.fill_value = 999
+
+ masks = ([0, 0, 0, 1, 0, 1, 0, 1, 0, 1], # partially masked
+ True, # Fully masked
+ False) # Fully unmasked
+
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ for mask in masks:
+ a.mask = mask
+ a_pickled = pickle.loads(pickle.dumps(a, protocol=proto))
+ assert_equal(a_pickled._mask, a._mask)
+ assert_equal(a_pickled._data, a._data)
+ if dtype in (object, int):
+ assert_equal(a_pickled.fill_value, 999)
+ else:
+ assert_equal(a_pickled.fill_value, dtype(999))
+ assert_array_equal(a_pickled.mask, mask)
+
+ def test_pickling_subbaseclass(self):
+ # Test pickling w/ a subclass of ndarray
+ x = np.array([(1.0, 2), (3.0, 4)],
+ dtype=[('x', float), ('y', int)]).view(np.recarray)
+ a = masked_array(x, mask=[(True, False), (False, True)])
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ a_pickled = pickle.loads(pickle.dumps(a, protocol=proto))
+ assert_equal(a_pickled._mask, a._mask)
+ assert_equal(a_pickled, a)
+ assert_(isinstance(a_pickled._data, np.recarray))
+
+ def test_pickling_maskedconstant(self):
+ # Test pickling MaskedConstant
+ mc = np.ma.masked
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ mc_pickled = pickle.loads(pickle.dumps(mc, protocol=proto))
+ assert_equal(mc_pickled._baseclass, mc._baseclass)
+ assert_equal(mc_pickled._mask, mc._mask)
+ assert_equal(mc_pickled._data, mc._data)
+
+ def test_pickling_wstructured(self):
+ # Tests pickling w/ structured array
+ a = array([(1, 1.), (2, 2.)], mask=[(0, 0), (0, 1)],
+ dtype=[('a', int), ('b', float)])
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ a_pickled = pickle.loads(pickle.dumps(a, protocol=proto))
+ assert_equal(a_pickled._mask, a._mask)
+ assert_equal(a_pickled, a)
+
+ def test_pickling_keepalignment(self):
+ # Tests pickling w/ F_CONTIGUOUS arrays
+ a = arange(10)
+ a.shape = (-1, 2)
+ b = a.T
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ test = pickle.loads(pickle.dumps(b, protocol=proto))
+ assert_equal(test, b)
+
+ def test_single_element_subscript(self):
+ # Tests single element subscripts of Maskedarrays.
+ a = array([1, 3, 2])
+ b = array([1, 3, 2], mask=[1, 0, 1])
+ assert_equal(a[0].shape, ())
+ assert_equal(b[0].shape, ())
+ assert_equal(b[1].shape, ())
+
+ def test_topython(self):
+ # Tests some communication issues with Python.
+ assert_equal(1, int(array(1)))
+ assert_equal(1.0, float(array(1)))
+ assert_equal(1, int(array([[[1]]])))
+ assert_equal(1.0, float(array([[1]])))
+ assert_raises(TypeError, float, array([1, 1]))
+
+ with suppress_warnings() as sup:
+ sup.filter(UserWarning, 'Warning: converting a masked element')
+ assert_(np.isnan(float(array([1], mask=[1]))))
+
+ a = array([1, 2, 3], mask=[1, 0, 0])
+ assert_raises(TypeError, lambda: float(a))
+ assert_equal(float(a[-1]), 3.)
+ assert_(np.isnan(float(a[0])))
+ assert_raises(TypeError, int, a)
+ assert_equal(int(a[-1]), 3)
+ assert_raises(MAError, lambda: int(a[0]))
+
+ def test_oddfeatures_1(self):
+ # Test of other odd features
+ x = arange(20)
+ x = x.reshape(4, 5)
+ x.flat[5] = 12
+ assert_(x[1, 0] == 12)
+ z = x + 10j * x
+ assert_equal(z.real, x)
+ assert_equal(z.imag, 10 * x)
+ assert_equal((z * conjugate(z)).real, 101 * x * x)
+ z.imag[...] = 0.0
+
+ x = arange(10)
+ x[3] = masked
+ assert_(str(x[3]) == str(masked))
+ c = x >= 8
+ assert_(count(where(c, masked, masked)) == 0)
+ assert_(shape(where(c, masked, masked)) == c.shape)
+
+ z = masked_where(c, x)
+ assert_(z.dtype is x.dtype)
+ assert_(z[3] is masked)
+ assert_(z[4] is not masked)
+ assert_(z[7] is not masked)
+ assert_(z[8] is masked)
+ assert_(z[9] is masked)
+ assert_equal(x, z)
+
+ def test_oddfeatures_2(self):
+ # Tests some more features.
+ x = array([1., 2., 3., 4., 5.])
+ c = array([1, 1, 1, 0, 0])
+ x[2] = masked
+ z = where(c, x, -x)
+ assert_equal(z, [1., 2., 0., -4., -5])
+ c[0] = masked
+ z = where(c, x, -x)
+ assert_equal(z, [1., 2., 0., -4., -5])
+ assert_(z[0] is masked)
+ assert_(z[1] is not masked)
+ assert_(z[2] is masked)
+
+ @suppress_copy_mask_on_assignment
+ def test_oddfeatures_3(self):
+ # Tests some generic features
+ atest = array([10], mask=True)
+ btest = array([20])
+ idx = atest.mask
+ atest[idx] = btest[idx]
+ assert_equal(atest, [20])
+
+ def test_filled_with_object_dtype(self):
+ a = np.ma.masked_all(1, dtype='O')
+ assert_equal(a.filled('x')[0], 'x')
+
+ def test_filled_with_flexible_dtype(self):
+ # Test filled w/ flexible dtype
+ flexi = array([(1, 1, 1)],
+ dtype=[('i', int), ('s', '|S8'), ('f', float)])
+ flexi[0] = masked
+ assert_equal(flexi.filled(),
+ np.array([(default_fill_value(0),
+ default_fill_value('0'),
+ default_fill_value(0.),)], dtype=flexi.dtype))
+ flexi[0] = masked
+ assert_equal(flexi.filled(1),
+ np.array([(1, '1', 1.)], dtype=flexi.dtype))
+
+ def test_filled_with_mvoid(self):
+ # Test filled w/ mvoid
+ ndtype = [('a', int), ('b', float)]
+ a = mvoid((1, 2.), mask=[(0, 1)], dtype=ndtype)
+ # Filled using default
+ test = a.filled()
+ assert_equal(tuple(test), (1, default_fill_value(1.)))
+ # Explicit fill_value
+ test = a.filled((-1, -1))
+ assert_equal(tuple(test), (1, -1))
+ # Using predefined filling values
+ a.fill_value = (-999, -999)
+ assert_equal(tuple(a.filled()), (1, -999))
+
+ def test_filled_with_nested_dtype(self):
+ # Test filled w/ nested dtype
+ ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])]
+ a = array([(1, (1, 1)), (2, (2, 2))],
+ mask=[(0, (1, 0)), (0, (0, 1))], dtype=ndtype)
+ test = a.filled(0)
+ control = np.array([(1, (0, 1)), (2, (2, 0))], dtype=ndtype)
+ assert_equal(test, control)
+
+ test = a['B'].filled(0)
+ control = np.array([(0, 1), (2, 0)], dtype=a['B'].dtype)
+ assert_equal(test, control)
+
+ # test if mask gets set correctly (see #6760)
+ Z = numpy.ma.zeros(2, numpy.dtype([("A", "(2,2)i1,(2,2)i1", (2,2))]))
+ assert_equal(Z.data.dtype, numpy.dtype([('A', [('f0', 'i1', (2, 2)),
+ ('f1', 'i1', (2, 2))], (2, 2))]))
+ assert_equal(Z.mask.dtype, numpy.dtype([('A', [('f0', '?', (2, 2)),
+ ('f1', '?', (2, 2))], (2, 2))]))
+
+ def test_filled_with_f_order(self):
+ # Test filled w/ F-contiguous array
+ a = array(np.array([(0, 1, 2), (4, 5, 6)], order='F'),
+ mask=np.array([(0, 0, 1), (1, 0, 0)], order='F'),
+ order='F') # this is currently ignored
+ assert_(a.flags['F_CONTIGUOUS'])
+ assert_(a.filled(0).flags['F_CONTIGUOUS'])
+
+ def test_optinfo_propagation(self):
+ # Checks that _optinfo dictionary isn't back-propagated
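+ # _optinfo is carried over to copies, but mutating the copy's dict
+ # must not change the original array's _optinfo.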
+ x = array([1, 2, 3, ], dtype=float)
+ x._optinfo['info'] = '???'
+ y = x.copy()
+ assert_equal(y._optinfo['info'], '???')
+ y._optinfo['info'] = '!!!'
+ assert_equal(x._optinfo['info'], '???')
+
+ def test_optinfo_forward_propagation(self):
+ a = array([1,2,2,4])
+ a._optinfo["key"] = "value"
+ assert_equal(a._optinfo["key"], (a == 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a != 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a > 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a >= 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a <= 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a + 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a - 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a * 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], (a / 2)._optinfo["key"])
+ assert_equal(a._optinfo["key"], a[:2]._optinfo["key"])
+ assert_equal(a._optinfo["key"], a[[0,0,2]]._optinfo["key"])
+ assert_equal(a._optinfo["key"], np.exp(a)._optinfo["key"])
+ assert_equal(a._optinfo["key"], np.abs(a)._optinfo["key"])
+ assert_equal(a._optinfo["key"], array(a, copy=True)._optinfo["key"])
+ assert_equal(a._optinfo["key"], np.zeros_like(a)._optinfo["key"])
+
+ def test_fancy_printoptions(self):
+ # Test printing a masked array w/ fancy dtype.
+ fancydtype = np.dtype([('x', int), ('y', [('t', int), ('s', float)])])
+ test = array([(1, (2, 3.0)), (4, (5, 6.0))],
+ mask=[(1, (0, 1)), (0, (1, 0))],
+ dtype=fancydtype)
+ control = "[(--, (2, --)) (4, (--, 6.0))]"
+ assert_equal(str(test), control)
+
+ # Test 0-d array with multi-dimensional dtype
+ t_2d0 = masked_array(data = (0, [[0.0, 0.0, 0.0],
+ [0.0, 0.0, 0.0]],
+ 0.0),
+ mask = (False, [[True, False, True],
+ [False, False, True]],
+ False),
+ dtype = "int, (2,3)float, float")
+ control = "(0, [[--, 0.0, --], [0.0, 0.0, --]], 0.0)"
+ assert_equal(str(t_2d0), control)
+
+ def test_flatten_structured_array(self):
+ # Test flatten_structured_array on arrays
+ # On ndarray
+ ndtype = [('a', int), ('b', float)]
+ a = np.array([(1, 1), (2, 2)], dtype=ndtype)
+ test = flatten_structured_array(a)
+ control = np.array([[1., 1.], [2., 2.]], dtype=float)
+ assert_equal(test, control)
+ assert_equal(test.dtype, control.dtype)
+ # On masked_array
+ a = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype)
+ test = flatten_structured_array(a)
+ control = array([[1., 1.], [2., 2.]],
+ mask=[[0, 1], [1, 0]], dtype=float)
+ assert_equal(test, control)
+ assert_equal(test.dtype, control.dtype)
+ assert_equal(test.mask, control.mask)
+ # On masked array with nested structure
+ ndtype = [('a', int), ('b', [('ba', int), ('bb', float)])]
+ a = array([(1, (1, 1.1)), (2, (2, 2.2))],
+ mask=[(0, (1, 0)), (1, (0, 1))], dtype=ndtype)
+ test = flatten_structured_array(a)
+ control = array([[1., 1., 1.1], [2., 2., 2.2]],
+ mask=[[0, 1, 0], [1, 0, 1]], dtype=float)
+ assert_equal(test, control)
+ assert_equal(test.dtype, control.dtype)
+ assert_equal(test.mask, control.mask)
+ # Keeping the initial shape
+ ndtype = [('a', int), ('b', float)]
+ a = np.array([[(1, 1), ], [(2, 2), ]], dtype=ndtype)
+ test = flatten_structured_array(a)
+ control = np.array([[[1., 1.], ], [[2., 2.], ]], dtype=float)
+ assert_equal(test, control)
+ assert_equal(test.dtype, control.dtype)
+
+ def test_void0d(self):
+ # Test creating an mvoid object
+ ndtype = [('a', int), ('b', int)]
+ a = np.array([(1, 2,)], dtype=ndtype)[0]
+ f = mvoid(a)
+ assert_(isinstance(f, mvoid))
+
+ a = masked_array([(1, 2)], mask=[(1, 0)], dtype=ndtype)[0]
+ assert_(isinstance(a, mvoid))
+
+ a = masked_array([(1, 2), (1, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype)
+ f = mvoid(a._data[0], a._mask[0])
+ assert_(isinstance(f, mvoid))
+
+ def test_mvoid_getitem(self):
+ # Test mvoid.__getitem__
+ ndtype = [('a', int), ('b', int)]
+ a = masked_array([(1, 2,), (3, 4)], mask=[(0, 0), (1, 0)],
+ dtype=ndtype)
+ # w/o mask
+ f = a[0]
+ assert_(isinstance(f, mvoid))
+ assert_equal((f[0], f['a']), (1, 1))
+ assert_equal(f['b'], 2)
+ # w/ mask
+ f = a[1]
+ assert_(isinstance(f, mvoid))
+ assert_(f[0] is masked)
+ assert_(f['a'] is masked)
+ assert_equal(f[1], 4)
+
+ # exotic dtype
+ A = masked_array(data=[([0,1],)],
+ mask=[([True, False],)],
+ dtype=[("A", ">i2", (2,))])
+ assert_equal(A[0]["A"], A["A"][0])
+ assert_equal(A[0]["A"], masked_array(data=[0, 1],
+ mask=[True, False], dtype=">i2"))
+
+ def test_mvoid_iter(self):
+ # Test iteration on __getitem__
+ ndtype = [('a', int), ('b', int)]
+ a = masked_array([(1, 2,), (3, 4)], mask=[(0, 0), (1, 0)],
+ dtype=ndtype)
+ # w/o mask
+ assert_equal(list(a[0]), [1, 2])
+ # w/ mask
+ assert_equal(list(a[1]), [masked, 4])
+
+ def test_mvoid_print(self):
+ # Test printing an mvoid
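+ # masked_print_option controls the placeholder text used for masked
+ # entries; the original display is restored in the finally block.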
+ mx = array([(1, 1), (2, 2)], dtype=[('a', int), ('b', int)])
+ assert_equal(str(mx[0]), "(1, 1)")
+ mx['b'][0] = masked
+ ini_display = masked_print_option._display
+ masked_print_option.set_display("-X-")
+ try:
+ assert_equal(str(mx[0]), "(1, -X-)")
+ assert_equal(repr(mx[0]), "(1, -X-)")
+ finally:
+ masked_print_option.set_display(ini_display)
+
+ # also check if there are object datatypes (see gh-7493)
+ mx = array([(1,), (2,)], dtype=[('a', 'O')])
+ assert_equal(str(mx[0]), "(1,)")
+
+ def test_mvoid_multidim_print(self):
+
+ # regression test for gh-6019
+ t_ma = masked_array(data = [([1, 2, 3],)],
+ mask = [([False, True, False],)],
+ fill_value = ([999999, 999999, 999999],),
+ dtype = [('a', '<i4', (3,))])
+ assert_(str(t_ma[0]) == "([1, --, 3],)")
+ assert_(repr(t_ma[0]) == "([1, --, 3],)")
+
+ # additional tests with structured arrays
+
+ t_2d = masked_array(data = [([[1, 2], [3,4]],)],
+ mask = [([[False, True], [True, False]],)],
+ dtype = [('a', '<i4', (2,2))])
+ assert_(str(t_2d[0]) == "([[1, --], [--, 4]],)")
+ assert_(repr(t_2d[0]) == "([[1, --], [--, 4]],)")
+
+ t_0d = masked_array(data = [(1,2)],
+ mask = [(True,False)],
+ dtype = [('a', '<i4'), ('b', '<i4')])
+ assert_(str(t_0d[0]) == "(--, 2)")
+ assert_(repr(t_0d[0]) == "(--, 2)")
+
+ t_2d = masked_array(data = [([[1, 2], [3,4]], 1)],
+ mask = [([[False, True], [True, False]], False)],
+ dtype = [('a', '<i4', (2,2)), ('b', float)])
+ assert_(str(t_2d[0]) == "([[1, --], [--, 4]], 1.0)")
+ assert_(repr(t_2d[0]) == "([[1, --], [--, 4]], 1.0)")
+
+ t_ne = masked_array(data=[(1, (1, 1))],
+ mask=[(True, (True, False))],
+ dtype = [('a', '<i4'), ('b', 'i4,i4')])
+ assert_(str(t_ne[0]) == "(--, (--, 1))")
+ assert_(repr(t_ne[0]) == "(--, (--, 1))")
+
+ def test_object_with_array(self):
+ mx1 = masked_array([1.], mask=[True])
+ mx2 = masked_array([1., 2.])
+ mx = masked_array([mx1, mx2], mask=[False, True])
+ assert_(mx[0] is mx1)
+ assert_(mx[1] is not mx2)
+ assert_(np.all(mx[1].data == mx2.data))
+ assert_(np.all(mx[1].mask))
+ # check that we return a view.
+ mx[1].data[0] = 0.
+ assert_(mx2[0] == 0.)
+
+
+class TestMaskedArrayArithmetic(object):
+ # Test class for arithmetic operations on MaskedArrays.
+
+ def setup(self):
+ # Base data definition.
+ x = np.array([1., 1., 1., -2., pi/2.0, 4., 5., -10., 10., 1., 2., 3.])
+ y = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.])
+ a10 = 10.
+ m1 = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ m2 = [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1]
+ xm = masked_array(x, mask=m1)
+ ym = masked_array(y, mask=m2)
+ z = np.array([-.5, 0., .5, .8])
+ zm = masked_array(z, mask=[0, 1, 0, 0])
+ xf = np.where(m1, 1e+20, x)
+ xm.set_fill_value(1e+20)
+ self.d = (x, y, a10, m1, m2, xm, ym, z, zm, xf)
+ self.err_status = np.geterr()
+ np.seterr(divide='ignore', invalid='ignore')
+
+ def teardown(self):
+ np.seterr(**self.err_status)
+
+ def test_basic_arithmetic(self):
+ # Test of basic arithmetic.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ a2d = array([[1, 2], [0, 4]])
+ a2dm = masked_array(a2d, [[0, 0], [1, 0]])
+ assert_equal(a2d * a2d, a2d * a2dm)
+ assert_equal(a2d + a2d, a2d + a2dm)
+ assert_equal(a2d - a2d, a2d - a2dm)
+ for s in [(12,), (4, 3), (2, 6)]:
+ x = x.reshape(s)
+ y = y.reshape(s)
+ xm = xm.reshape(s)
+ ym = ym.reshape(s)
+ xf = xf.reshape(s)
+ assert_equal(-x, -xm)
+ assert_equal(x + y, xm + ym)
+ assert_equal(x - y, xm - ym)
+ assert_equal(x * y, xm * ym)
+ assert_equal(x / y, xm / ym)
+ assert_equal(a10 + y, a10 + ym)
+ assert_equal(a10 - y, a10 - ym)
+ assert_equal(a10 * y, a10 * ym)
+ assert_equal(a10 / y, a10 / ym)
+ assert_equal(x + a10, xm + a10)
+ assert_equal(x - a10, xm - a10)
+ assert_equal(x * a10, xm * a10)
+ assert_equal(x / a10, xm / a10)
+ assert_equal(x ** 2, xm ** 2)
+ assert_equal(abs(x) ** 2.5, abs(xm) ** 2.5)
+ assert_equal(x ** y, xm ** ym)
+ assert_equal(np.add(x, y), add(xm, ym))
+ assert_equal(np.subtract(x, y), subtract(xm, ym))
+ assert_equal(np.multiply(x, y), multiply(xm, ym))
+ assert_equal(np.divide(x, y), divide(xm, ym))
+
+ def test_divide_on_different_shapes(self):
+ x = arange(6, dtype=float)
+ x.shape = (2, 3)
+ y = arange(3, dtype=float)
+
+ z = x / y
+ assert_equal(z, [[-1., 1., 1.], [-1., 4., 2.5]])
+ assert_equal(z.mask, [[1, 0, 0], [1, 0, 0]])
+
+ z = x / y[None,:]
+ assert_equal(z, [[-1., 1., 1.], [-1., 4., 2.5]])
+ assert_equal(z.mask, [[1, 0, 0], [1, 0, 0]])
+
+ y = arange(2, dtype=float)
+ z = x / y[:, None]
+ assert_equal(z, [[-1., -1., -1.], [3., 4., 5.]])
+ assert_equal(z.mask, [[1, 1, 1], [0, 0, 0]])
+
+ def test_mixed_arithmetic(self):
+ # Tests mixed arithmetic.
+ na = np.array([1])
+ ma = array([1])
+ assert_(isinstance(na + ma, MaskedArray))
+ assert_(isinstance(ma + na, MaskedArray))
+
+ def test_limits_arithmetic(self):
+ tiny = np.finfo(float).tiny
+ a = array([tiny, 1. / tiny, 0.])
+ assert_equal(getmaskarray(a / 2), [0, 0, 0])
+ assert_equal(getmaskarray(2 / a), [1, 0, 1])
+
+ def test_masked_singleton_arithmetic(self):
+ # Tests some scalar arithmetic on MaskedArrays.
+ # Masked singleton should remain masked no matter what
+ xm = array(0, mask=1)
+ assert_((1 / array(0)).mask)
+ assert_((1 + xm).mask)
+ assert_((-xm).mask)
+ assert_(maximum(xm, xm).mask)
+ assert_(minimum(xm, xm).mask)
+
+ def test_masked_singleton_equality(self):
+ # Tests (in)equality on masked singleton
+ a = array([1, 2, 3], mask=[1, 1, 0])
+ assert_((a[0] == 0) is masked)
+ assert_((a[0] != 0) is masked)
+ assert_equal((a[-1] == 0), False)
+ assert_equal((a[-1] != 0), True)
+
+ def test_arithmetic_with_masked_singleton(self):
+ # Checks that there's no collapsing to masked
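+ # An array combined with the masked singleton must give a fully masked
+ # array of the same shape; only a scalar element collapses to masked.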
+ x = masked_array([1, 2])
+ y = x * masked
+ assert_equal(y.shape, x.shape)
+ assert_equal(y._mask, [True, True])
+ y = x[0] * masked
+ assert_(y is masked)
+ y = x + masked
+ assert_equal(y.shape, x.shape)
+ assert_equal(y._mask, [True, True])
+
+ def test_arithmetic_with_masked_singleton_on_1d_singleton(self):
+ # Check that we're not losing the shape of a singleton
+ x = masked_array([1, ])
+ y = x + masked
+ assert_equal(y.shape, x.shape)
+ assert_equal(y.mask, [True, ])
+
+ def test_scalar_arithmetic(self):
+ x = array(0, mask=0)
+ assert_equal(x.filled().ctypes.data, x.ctypes.data)
+ # Make sure we don't lose the shape in some circumstances
+ xm = array((0, 0)) / 0.
+ assert_equal(xm.shape, (2,))
+ assert_equal(xm.mask, [1, 1])
+
+ def test_basic_ufuncs(self):
+ # Test various functions such as sin, cos.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ assert_equal(np.cos(x), cos(xm))
+ assert_equal(np.cosh(x), cosh(xm))
+ assert_equal(np.sin(x), sin(xm))
+ assert_equal(np.sinh(x), sinh(xm))
+ assert_equal(np.tan(x), tan(xm))
+ assert_equal(np.tanh(x), tanh(xm))
+ assert_equal(np.sqrt(abs(x)), sqrt(xm))
+ assert_equal(np.log(abs(x)), log(xm))
+ assert_equal(np.log10(abs(x)), log10(xm))
+ assert_equal(np.exp(x), exp(xm))
+ assert_equal(np.arcsin(z), arcsin(zm))
+ assert_equal(np.arccos(z), arccos(zm))
+ assert_equal(np.arctan(z), arctan(zm))
+ assert_equal(np.arctan2(x, y), arctan2(xm, ym))
+ assert_equal(np.absolute(x), absolute(xm))
+ assert_equal(np.angle(x + 1j*y), angle(xm + 1j*ym))
+ assert_equal(np.angle(x + 1j*y, deg=True), angle(xm + 1j*ym, deg=True))
+ assert_equal(np.equal(x, y), equal(xm, ym))
+ assert_equal(np.not_equal(x, y), not_equal(xm, ym))
+ assert_equal(np.less(x, y), less(xm, ym))
+ assert_equal(np.greater(x, y), greater(xm, ym))
+ assert_equal(np.less_equal(x, y), less_equal(xm, ym))
+ assert_equal(np.greater_equal(x, y), greater_equal(xm, ym))
+ assert_equal(np.conjugate(x), conjugate(xm))
+
+ def test_count_func(self):
+ # Tests count
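+ # count() returns the number of unmasked entries: a 0-d np.intp scalar
+ # for a full reduction, and a plain ndarray of counts (with nomask)
+ # when an axis is given.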
+ assert_equal(1, count(1))
+ assert_equal(0, count(array(1, mask=[1])))
+
+ ott = array([0., 1., 2., 3.], mask=[1, 0, 0, 0])
+ res = count(ott)
+ assert_(res.dtype.type is np.intp)
+ assert_equal(3, res)
+
+ ott = ott.reshape((2, 2))
+ res = count(ott)
+ assert_(res.dtype.type is np.intp)
+ assert_equal(3, res)
+ res = count(ott, 0)
+ assert_(isinstance(res, ndarray))
+ assert_equal([1, 2], res)
+ assert_(getmask(res) is nomask)
+
+ ott = array([0., 1., 2., 3.])
+ res = count(ott, 0)
+ assert_(isinstance(res, ndarray))
+ assert_(res.dtype.type is np.intp)
+ assert_raises(np.AxisError, ott.count, axis=1)
+
+ def test_count_on_python_builtins(self):
+ # Tests that count works on Python builtins (issue #8019)
+ assert_equal(3, count([1,2,3]))
+ assert_equal(2, count((1,2)))
+
+ def test_minmax_func(self):
+ # Tests minimum and maximum.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ # max doesn't work if shaped
+ xr = np.ravel(x)
+ xmr = ravel(xm)
+ # following are true because of careful selection of data
+ assert_equal(max(xr), maximum.reduce(xmr))
+ assert_equal(min(xr), minimum.reduce(xmr))
+
+ assert_equal(minimum([1, 2, 3], [4, 0, 9]), [1, 0, 3])
+ assert_equal(maximum([1, 2, 3], [4, 0, 9]), [4, 2, 9])
+ x = arange(5)
+ y = arange(5) - 2
+ x[3] = masked
+ y[0] = masked
+ assert_equal(minimum(x, y), where(less(x, y), x, y))
+ assert_equal(maximum(x, y), where(greater(x, y), x, y))
+ assert_(minimum.reduce(x) == 0)
+ assert_(maximum.reduce(x) == 4)
+
+ x = arange(4).reshape(2, 2)
+ x[-1, -1] = masked
+ assert_equal(maximum.reduce(x, axis=None), 2)
+
+ def test_minimummaximum_func(self):
+ a = np.ones((2, 2))
+ aminimum = minimum(a, a)
+ assert_(isinstance(aminimum, MaskedArray))
+ assert_equal(aminimum, np.minimum(a, a))
+
+ aminimum = minimum.outer(a, a)
+ assert_(isinstance(aminimum, MaskedArray))
+ assert_equal(aminimum, np.minimum.outer(a, a))
+
+ amaximum = maximum(a, a)
+ assert_(isinstance(amaximum, MaskedArray))
+ assert_equal(amaximum, np.maximum(a, a))
+
+ amaximum = maximum.outer(a, a)
+ assert_(isinstance(amaximum, MaskedArray))
+ assert_equal(amaximum, np.maximum.outer(a, a))
+
+ def test_minmax_reduce(self):
+ # Test np.maximum.reduce on an array w/ a full False mask
+ a = array([1, 2, 3], mask=[False, False, False])
+ b = np.maximum.reduce(a)
+ assert_equal(b, 3)
+
+ def test_minmax_funcs_with_output(self):
+ # Tests the min/max functions with explicit outputs
+ mask = np.random.rand(12).round()
+ xm = array(np.random.uniform(0, 10, 12), mask=mask)
+ xm.shape = (3, 4)
+ for funcname in ('min', 'max'):
+ # Initialize
+ npfunc = getattr(np, funcname)
+ mafunc = getattr(numpy.ma.core, funcname)
+ # Use the np version
+ nout = np.empty((4,), dtype=int)
+ try:
+ result = npfunc(xm, axis=0, out=nout)
+ except MaskError:
+ pass
+ nout = np.empty((4,), dtype=float)
+ result = npfunc(xm, axis=0, out=nout)
+ assert_(result is nout)
+ # Use the ma version
+ nout.fill(-999)
+ result = mafunc(xm, axis=0, out=nout)
+ assert_(result is nout)
+
+ def test_minmax_methods(self):
+ # Additional tests on max/min
+ (_, _, _, _, _, xm, _, _, _, _) = self.d
+ xm.shape = (xm.size,)
+ assert_equal(xm.max(), 10)
+ assert_(xm[0].max() is masked)
+ assert_(xm[0].max(0) is masked)
+ assert_(xm[0].max(-1) is masked)
+ assert_equal(xm.min(), -10.)
+ assert_(xm[0].min() is masked)
+ assert_(xm[0].min(0) is masked)
+ assert_(xm[0].min(-1) is masked)
+ assert_equal(xm.ptp(), 20.)
+ assert_(xm[0].ptp() is masked)
+ assert_(xm[0].ptp(0) is masked)
+ assert_(xm[0].ptp(-1) is masked)
+
+ x = array([1, 2, 3], mask=True)
+ assert_(x.min() is masked)
+ assert_(x.max() is masked)
+ assert_(x.ptp() is masked)
+
+ def test_addsumprod(self):
+ # Tests add, sum, product.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ assert_equal(np.add.reduce(x), add.reduce(x))
+ assert_equal(np.add.accumulate(x), add.accumulate(x))
+ assert_equal(4, sum(array(4), axis=0))
+ assert_equal(4, sum(array(4), axis=0))
+ assert_equal(np.sum(x, axis=0), sum(x, axis=0))
+ assert_equal(np.sum(filled(xm, 0), axis=0), sum(xm, axis=0))
+ assert_equal(np.sum(x, 0), sum(x, 0))
+ assert_equal(np.product(x, axis=0), product(x, axis=0))
+ assert_equal(np.product(x, 0), product(x, 0))
+ assert_equal(np.product(filled(xm, 1), axis=0), product(xm, axis=0))
+ s = (3, 4)
+ x.shape = y.shape = xm.shape = ym.shape = s
+ if len(s) > 1:
+ assert_equal(np.concatenate((x, y), 1), concatenate((xm, ym), 1))
+ assert_equal(np.add.reduce(x, 1), add.reduce(x, 1))
+ assert_equal(np.sum(x, 1), sum(x, 1))
+ assert_equal(np.product(x, 1), product(x, 1))
+
+ def test_binops_d2D(self):
+ # Test binary operations on 2D data
+ a = array([[1.], [2.], [3.]], mask=[[False], [True], [True]])
+ b = array([[2., 3.], [4., 5.], [6., 7.]])
+
+ test = a * b
+ control = array([[2., 3.], [2., 2.], [3., 3.]],
+ mask=[[0, 0], [1, 1], [1, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ test = b * a
+ control = array([[2., 3.], [4., 5.], [6., 7.]],
+ mask=[[0, 0], [1, 1], [1, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ a = array([[1.], [2.], [3.]])
+ b = array([[2., 3.], [4., 5.], [6., 7.]],
+ mask=[[0, 0], [0, 0], [0, 1]])
+ test = a * b
+ control = array([[2, 3], [8, 10], [18, 3]],
+ mask=[[0, 0], [0, 0], [0, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ test = b * a
+ control = array([[2, 3], [8, 10], [18, 7]],
+ mask=[[0, 0], [0, 0], [0, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ def test_domained_binops_d2D(self):
+ # Test domained binary operations on 2D data
+ a = array([[1.], [2.], [3.]], mask=[[False], [True], [True]])
+ b = array([[2., 3.], [4., 5.], [6., 7.]])
+
+ test = a / b
+ control = array([[1. / 2., 1. / 3.], [2., 2.], [3., 3.]],
+ mask=[[0, 0], [1, 1], [1, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ test = b / a
+ control = array([[2. / 1., 3. / 1.], [4., 5.], [6., 7.]],
+ mask=[[0, 0], [1, 1], [1, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ a = array([[1.], [2.], [3.]])
+ b = array([[2., 3.], [4., 5.], [6., 7.]],
+ mask=[[0, 0], [0, 0], [0, 1]])
+ test = a / b
+ control = array([[1. / 2, 1. / 3], [2. / 4, 2. / 5], [3. / 6, 3]],
+ mask=[[0, 0], [0, 0], [0, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ test = b / a
+ control = array([[2 / 1., 3 / 1.], [4 / 2., 5 / 2.], [6 / 3., 7]],
+ mask=[[0, 0], [0, 0], [0, 1]])
+ assert_equal(test, control)
+ assert_equal(test.data, control.data)
+ assert_equal(test.mask, control.mask)
+
+ def test_noshrinking(self):
+ # Check that we don't shrink a mask when not wanted
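+ # With shrink=False an all-False mask must stay a full boolean array
+ # through binary, in-place and domained operations instead of
+ # collapsing to nomask.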
+ # Binary operations
+ a = masked_array([1., 2., 3.], mask=[False, False, False],
+ shrink=False)
+ b = a + 1
+ assert_equal(b.mask, [0, 0, 0])
+ # In place binary operation
+ a += 1
+ assert_equal(a.mask, [0, 0, 0])
+ # Domained binary operation
+ b = a / 1.
+ assert_equal(b.mask, [0, 0, 0])
+ # In place binary operation
+ a /= 1.
+ assert_equal(a.mask, [0, 0, 0])
+
+ def test_ufunc_nomask(self):
+ # check the case where ufuncs should set the mask to False
+ m = np.ma.array([1])
+ # check we don't get array([False], dtype=bool)
+ assert_equal(np.true_divide(m, 5).mask.shape, ())
+
+ def test_noshrink_on_creation(self):
+ # Check that the mask is not shrunk on array creation when not wanted
+ a = np.ma.masked_values([1., 2.5, 3.1], 1.5, shrink=False)
+ assert_equal(a.mask, [0, 0, 0])
+
+ def test_mod(self):
+ # Tests mod
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
+ assert_equal(mod(x, y), mod(xm, ym))
+ test = mod(ym, xm)
+ assert_equal(test, np.mod(ym, xm))
+ assert_equal(test.mask, mask_or(xm.mask, ym.mask))
+ test = mod(xm, ym)
+ assert_equal(test, np.mod(xm, ym))
+ assert_equal(test.mask, mask_or(mask_or(xm.mask, ym.mask), (ym == 0)))
+
+ def test_TakeTransposeInnerOuter(self):
+ # Test of take, transpose, inner, outer products
+ x = arange(24)
+ y = np.arange(24)
+ x[5:6] = masked
+ x = x.reshape(2, 3, 4)
+ y = y.reshape(2, 3, 4)
+ assert_equal(np.transpose(y, (2, 0, 1)), transpose(x, (2, 0, 1)))
+ assert_equal(np.take(y, (2, 0, 1), 1), take(x, (2, 0, 1), 1))
+ assert_equal(np.inner(filled(x, 0), filled(y, 0)),
+ inner(x, y))
+ assert_equal(np.outer(filled(x, 0), filled(y, 0)),
+ outer(x, y))
+ y = array(['abc', 1, 'def', 2, 3], object)
+ y[2] = masked
+ t = take(y, [0, 3, 4])
+ assert_(t[0] == 'abc')
+ assert_(t[1] == 2)
+ assert_(t[2] == 3)
+
+ def test_imag_real(self):
+ # Check complex
+ xx = array([1 + 10j, 20 + 2j], mask=[1, 0])
+ assert_equal(xx.imag, [10, 2])
+ assert_equal(xx.imag.filled(), [1e+20, 2])
+ assert_equal(xx.imag.dtype, xx._data.imag.dtype)
+ assert_equal(xx.real, [1, 20])
+ assert_equal(xx.real.filled(), [1e+20, 20])
+ assert_equal(xx.real.dtype, xx._data.real.dtype)
+
+ def test_methods_with_output(self):
+ xm = array(np.random.uniform(0, 10, 12)).reshape(3, 4)
+ xm[:, 0] = xm[0] = xm[-1, -1] = masked
+
+ funclist = ('sum', 'prod', 'var', 'std', 'max', 'min', 'ptp', 'mean',)
+
+ for funcname in funclist:
+ npfunc = getattr(np, funcname)
+ xmmeth = getattr(xm, funcname)
+ # A ndarray as explicit input
+ output = np.empty(4, dtype=float)
+ output.fill(-9999)
+ result = npfunc(xm, axis=0, out=output)
+ # ... the result should be the given output
+ assert_(result is output)
+ assert_equal(result, xmmeth(axis=0, out=output))
+
+ output = empty(4, dtype=int)
+ result = xmmeth(axis=0, out=output)
+ assert_(result is output)
+ assert_(output[0] is masked)
+
+ def test_eq_on_structured(self):
+ # Test the equality of structured arrays
+ ndtype = [('A', int), ('B', int)]
+ a = array([(1, 1), (2, 2)], mask=[(0, 1), (0, 0)], dtype=ndtype)
+
+ test = (a == a)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [False, False])
+ assert_(test.fill_value == True)
+
+ test = (a == a[0])
+ assert_equal(test.data, [True, False])
+ assert_equal(test.mask, [False, False])
+ assert_(test.fill_value == True)
+
+ b = array([(1, 1), (2, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype)
+ test = (a == b)
+ assert_equal(test.data, [False, True])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ test = (a[0] == b)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ b = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype)
+ test = (a == b)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [False, False])
+ assert_(test.fill_value == True)
+
+ # complicated dtype, 2-dimensional array.
+ ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])]
+ a = array([[(1, (1, 1)), (2, (2, 2))],
+ [(3, (3, 3)), (4, (4, 4))]],
+ mask=[[(0, (1, 0)), (0, (0, 1))],
+ [(1, (0, 0)), (1, (1, 1))]], dtype=ndtype)
+ test = (a[0, 0] == a)
+ assert_equal(test.data, [[True, False], [False, False]])
+ assert_equal(test.mask, [[False, False], [False, True]])
+ assert_(test.fill_value == True)
+
+ def test_ne_on_structured(self):
+ # Test the inequality of structured arrays
+ ndtype = [('A', int), ('B', int)]
+ a = array([(1, 1), (2, 2)], mask=[(0, 1), (0, 0)], dtype=ndtype)
+
+ test = (a != a)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [False, False])
+ assert_(test.fill_value == True)
+
+ test = (a != a[0])
+ assert_equal(test.data, [False, True])
+ assert_equal(test.mask, [False, False])
+ assert_(test.fill_value == True)
+
+ b = array([(1, 1), (2, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype)
+ test = (a != b)
+ assert_equal(test.data, [True, False])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ test = (a[0] != b)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ b = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype)
+ test = (a != b)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [False, False])
+ assert_(test.fill_value == True)
+
+ # complicated dtype, 2-dimensional array.
+ ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])]
+ a = array([[(1, (1, 1)), (2, (2, 2))],
+ [(3, (3, 3)), (4, (4, 4))]],
+ mask=[[(0, (1, 0)), (0, (0, 1))],
+ [(1, (0, 0)), (1, (1, 1))]], dtype=ndtype)
+ test = (a[0, 0] != a)
+ assert_equal(test.data, [[False, True], [True, True]])
+ assert_equal(test.mask, [[False, False], [False, True]])
+ assert_(test.fill_value == True)
+
+ def test_eq_ne_structured_extra(self):
+ # ensure simple examples are symmetric and make sense.
+ # from https://github.com/numpy/numpy/pull/8590#discussion_r101126465
+ dt = np.dtype('i4,i4')
+ for m1 in (mvoid((1, 2), mask=(0, 0), dtype=dt),
+ mvoid((1, 2), mask=(0, 1), dtype=dt),
+ mvoid((1, 2), mask=(1, 0), dtype=dt),
+ mvoid((1, 2), mask=(1, 1), dtype=dt)):
+ ma1 = m1.view(MaskedArray)
+ r1 = ma1.view('2i4')
+ for m2 in (np.array((1, 1), dtype=dt),
+ mvoid((1, 1), dtype=dt),
+ mvoid((1, 0), mask=(0, 1), dtype=dt),
+ mvoid((3, 2), mask=(0, 1), dtype=dt)):
+ ma2 = m2.view(MaskedArray)
+ r2 = ma2.view('2i4')
+ eq_expected = (r1 == r2).all()
+ assert_equal(m1 == m2, eq_expected)
+ assert_equal(m2 == m1, eq_expected)
+ assert_equal(ma1 == m2, eq_expected)
+ assert_equal(m1 == ma2, eq_expected)
+ assert_equal(ma1 == ma2, eq_expected)
+ # Also check it is the same if we do it element by element.
+ el_by_el = [m1[name] == m2[name] for name in dt.names]
+ assert_equal(array(el_by_el, dtype=bool).all(), eq_expected)
+ ne_expected = (r1 != r2).any()
+ assert_equal(m1 != m2, ne_expected)
+ assert_equal(m2 != m1, ne_expected)
+ assert_equal(ma1 != m2, ne_expected)
+ assert_equal(m1 != ma2, ne_expected)
+ assert_equal(ma1 != ma2, ne_expected)
+ el_by_el = [m1[name] != m2[name] for name in dt.names]
+ assert_equal(array(el_by_el, dtype=bool).any(), ne_expected)
+
+ @pytest.mark.parametrize('dt', ['S', 'U'])
+ @pytest.mark.parametrize('fill', [None, 'A'])
+ def test_eq_for_strings(self, dt, fill):
+ # Test the equality of string arrays
+ a = array(['a', 'b'], dtype=dt, mask=[0, 1], fill_value=fill)
+
+ test = (a == a)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ test = (a == a[0])
+ assert_equal(test.data, [True, False])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ b = array(['a', 'b'], dtype=dt, mask=[1, 0], fill_value=fill)
+ test = (a == b)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [True, True])
+ assert_(test.fill_value == True)
+
+ # test = (a[0] == b) # doesn't work in Python2
+ test = (b == a[0])
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ @pytest.mark.parametrize('dt', ['S', 'U'])
+ @pytest.mark.parametrize('fill', [None, 'A'])
+ def test_ne_for_strings(self, dt, fill):
+ # Test the inequality of string arrays
+ a = array(['a', 'b'], dtype=dt, mask=[0, 1], fill_value=fill)
+
+ test = (a != a)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ test = (a != a[0])
+ assert_equal(test.data, [False, True])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ b = array(['a', 'b'], dtype=dt, mask=[1, 0], fill_value=fill)
+ test = (a != b)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [True, True])
+ assert_(test.fill_value == True)
+
+ # test = (a[0] != b) # doesn't work in Python2
+ test = (b != a[0])
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ @pytest.mark.parametrize('dt1', num_dts, ids=num_ids)
+ @pytest.mark.parametrize('dt2', num_dts, ids=num_ids)
+ @pytest.mark.parametrize('fill', [None, 1])
+ def test_eq_for_numeric(self, dt1, dt2, fill):
+ # Test the equality of numeric arrays
+ a = array([0, 1], dtype=dt1, mask=[0, 1], fill_value=fill)
+
+ test = (a == a)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ test = (a == a[0])
+ assert_equal(test.data, [True, False])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ b = array([0, 1], dtype=dt2, mask=[1, 0], fill_value=fill)
+ test = (a == b)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [True, True])
+ assert_(test.fill_value == True)
+
+ # test = (a[0] == b) # doesn't work in Python2
+ test = (b == a[0])
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ @pytest.mark.parametrize('dt1', num_dts, ids=num_ids)
+ @pytest.mark.parametrize('dt2', num_dts, ids=num_ids)
+ @pytest.mark.parametrize('fill', [None, 1])
+ def test_ne_for_numeric(self, dt1, dt2, fill):
+ # Test the inequality of numeric arrays
+ a = array([0, 1], dtype=dt1, mask=[0, 1], fill_value=fill)
+
+ test = (a != a)
+ assert_equal(test.data, [False, False])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ test = (a != a[0])
+ assert_equal(test.data, [False, True])
+ assert_equal(test.mask, [False, True])
+ assert_(test.fill_value == True)
+
+ b = array([0, 1], dtype=dt2, mask=[1, 0], fill_value=fill)
+ test = (a != b)
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [True, True])
+ assert_(test.fill_value == True)
+
+ # test = (a[0] != b) # doesn't work in Python2
+ test = (b != a[0])
+ assert_equal(test.data, [True, True])
+ assert_equal(test.mask, [True, False])
+ assert_(test.fill_value == True)
+
+ def test_eq_with_None(self):
+ # Really, comparisons with None should not be done, but check them
+ # anyway. Note that pep8 will flag these tests.
+ # Deprecation is in place for arrays, and when it happens this
+ # test will fail (and have to be changed accordingly).
+
+ # With partial mask
+ with suppress_warnings() as sup:
+ sup.filter(FutureWarning, "Comparison to `None`")
+ a = array([None, 1], mask=[0, 1])
+ assert_equal(a == None, array([True, False], mask=[0, 1]))
+ assert_equal(a.data == None, [True, False])
+ assert_equal(a != None, array([False, True], mask=[0, 1]))
+ # With nomask
+ a = array([None, 1], mask=False)
+ assert_equal(a == None, [True, False])
+ assert_equal(a != None, [False, True])
+ # With complete mask
+ a = array([None, 2], mask=True)
+ assert_equal(a == None, array([False, True], mask=True))
+ assert_equal(a != None, array([True, False], mask=True))
+ # Fully masked, even comparison to None should return "masked"
+ a = masked
+ assert_equal(a == None, masked)
+
+ def test_eq_with_scalar(self):
+ a = array(1)
+ assert_equal(a == 1, True)
+ assert_equal(a == 0, False)
+ assert_equal(a != 1, False)
+ assert_equal(a != 0, True)
+ b = array(1, mask=True)
+ assert_equal(b == 0, masked)
+ assert_equal(b == 1, masked)
+ assert_equal(b != 0, masked)
+ assert_equal(b != 1, masked)
+
+ def test_eq_different_dimensions(self):
+ m1 = array([1, 1], mask=[0, 1])
+ # test comparison with both masked and regular arrays.
+ for m2 in (array([[0, 1], [1, 2]]),
+ np.array([[0, 1], [1, 2]])):
+ test = (m1 == m2)
+ assert_equal(test.data, [[False, False],
+ [True, False]])
+ assert_equal(test.mask, [[False, True],
+ [False, True]])
+
+ def test_numpyarithmetics(self):
+ # Check that the mask is not back-propagated when using numpy functions
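+ # log of a non-positive entry is masked in the result, but that domain
+ # mask must not be written back into the input array's mask.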
+ a = masked_array([-1, 0, 1, 2, 3], mask=[0, 0, 0, 0, 1])
+ control = masked_array([np.nan, np.nan, 0, np.log(2), -1],
+ mask=[1, 1, 0, 0, 1])
+
+ test = log(a)
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ assert_equal(a.mask, [0, 0, 0, 0, 1])
+
+ test = np.log(a)
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ assert_equal(a.mask, [0, 0, 0, 0, 1])
+
+
+class TestMaskedArrayAttributes(object):
+
+ def test_keepmask(self):
+ # Tests the keep mask flag
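+ # keep_mask=True (the default) ORs the new mask with the mask already
+ # carried by the input; keep_mask=False uses only the new mask.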
+ x = masked_array([1, 2, 3], mask=[1, 0, 0])
+ mx = masked_array(x)
+ assert_equal(mx.mask, x.mask)
+ mx = masked_array(x, mask=[0, 1, 0], keep_mask=False)
+ assert_equal(mx.mask, [0, 1, 0])
+ mx = masked_array(x, mask=[0, 1, 0], keep_mask=True)
+ assert_equal(mx.mask, [1, 1, 0])
+ # keep_mask defaults to True
+ mx = masked_array(x, mask=[0, 1, 0])
+ assert_equal(mx.mask, [1, 1, 0])
+
+ def test_hardmask(self):
+ # Test hard_mask
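+ # With hard_mask=True, assignments to masked entries are discarded;
+ # with a soft mask the same assignment unmasks and overwrites the
+ # entry.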
+ d = arange(5)
+ n = [0, 0, 0, 1, 1]
+ m = make_mask(n)
+ xh = array(d, mask=m, hard_mask=True)
+ # We need to copy, to avoid updating d in xh !
+ xs = array(d, mask=m, hard_mask=False, copy=True)
+ xh[[1, 4]] = [10, 40]
+ xs[[1, 4]] = [10, 40]
+ assert_equal(xh._data, [0, 10, 2, 3, 4])
+ assert_equal(xs._data, [0, 10, 2, 3, 40])
+ assert_equal(xs.mask, [0, 0, 0, 1, 0])
+ assert_(xh._hardmask)
+ assert_(not xs._hardmask)
+ xh[1:4] = [10, 20, 30]
+ xs[1:4] = [10, 20, 30]
+ assert_equal(xh._data, [0, 10, 20, 3, 4])
+ assert_equal(xs._data, [0, 10, 20, 30, 40])
+ assert_equal(xs.mask, nomask)
+ xh[0] = masked
+ xs[0] = masked
+ assert_equal(xh.mask, [1, 0, 0, 1, 1])
+ assert_equal(xs.mask, [1, 0, 0, 0, 0])
+ xh[:] = 1
+ xs[:] = 1
+ assert_equal(xh._data, [0, 1, 1, 3, 4])
+ assert_equal(xs._data, [1, 1, 1, 1, 1])
+ assert_equal(xh.mask, [1, 0, 0, 1, 1])
+ assert_equal(xs.mask, nomask)
+ # Switch to soft mask
+ xh.soften_mask()
+ xh[:] = arange(5)
+ assert_equal(xh._data, [0, 1, 2, 3, 4])
+ assert_equal(xh.mask, nomask)
+ # Switch back to hard mask
+ xh.harden_mask()
+ xh[xh < 3] = masked
+ assert_equal(xh._data, [0, 1, 2, 3, 4])
+ assert_equal(xh._mask, [1, 1, 1, 0, 0])
+ xh[filled(xh > 1, False)] = 5
+ assert_equal(xh._data, [0, 1, 2, 5, 5])
+ assert_equal(xh._mask, [1, 1, 1, 0, 0])
+
+ xh = array([[1, 2], [3, 4]], mask=[[1, 0], [0, 0]], hard_mask=True)
+ xh[0] = 0
+ assert_equal(xh._data, [[1, 0], [3, 4]])
+ assert_equal(xh._mask, [[1, 0], [0, 0]])
+ xh[-1, -1] = 5
+ assert_equal(xh._data, [[1, 0], [3, 5]])
+ assert_equal(xh._mask, [[1, 0], [0, 0]])
+ xh[filled(xh < 5, False)] = 2
+ assert_equal(xh._data, [[1, 2], [2, 5]])
+ assert_equal(xh._mask, [[1, 0], [0, 0]])
+
+ def test_hardmask_again(self):
+ # Another test of hardmask
+ d = arange(5)
+ n = [0, 0, 0, 1, 1]
+ m = make_mask(n)
+ xh = array(d, mask=m, hard_mask=True)
+ xh[4:5] = 999
+ xh[0:1] = 999
+ assert_equal(xh._data, [999, 1, 2, 3, 4])
+
+ def test_hardmask_oncemore_yay(self):
+ # OK, yet another test of hardmask
+ # Make sure that harden_mask/soften_mask/unshare_mask returns self
+ a = array([1, 2, 3], mask=[1, 0, 0])
+ b = a.harden_mask()
+ assert_equal(a, b)
+ b[0] = 0
+ assert_equal(a, b)
+ assert_equal(b, array([1, 2, 3], mask=[1, 0, 0]))
+ a = b.soften_mask()
+ a[0] = 0
+ assert_equal(a, b)
+ assert_equal(b, array([0, 2, 3], mask=[0, 0, 0]))
+
+ def test_smallmask(self):
+ # Checks the behaviour of _smallmask
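+ # With _smallmask (the default), a mask that becomes all-False is
+ # collapsed back to nomask; setting _smallmask = False keeps the full
+ # boolean mask array.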
+ a = arange(10)
+ a[1] = masked
+ a[1] = 1
+ assert_equal(a._mask, nomask)
+ a = arange(10)
+ a._smallmask = False
+ a[1] = masked
+ a[1] = 1
+ assert_equal(a._mask, zeros(10))
+
+ def test_shrink_mask(self):
+ # Tests .shrink_mask()
+ a = array([1, 2, 3], mask=[0, 0, 0])
+ b = a.shrink_mask()
+ assert_equal(a, b)
+ assert_equal(a.mask, nomask)
+
+ # Mask cannot be shrunk on structured types, so this is a no-op
+ a = np.ma.array([(1, 2.0)], [('a', int), ('b', float)])
+ b = a.copy()
+ a.shrink_mask()
+ assert_equal(a.mask, b.mask)
+
+ def test_flat(self):
+ # Test that flat can return all types of items [#4585, #4615]
+ # test 2-D record array
+ # ... on structured array w/ masked records
+ x = array([[(1, 1.1, 'one'), (2, 2.2, 'two'), (3, 3.3, 'thr')],
+ [(4, 4.4, 'fou'), (5, 5.5, 'fiv'), (6, 6.6, 'six')]],
+ dtype=[('a', int), ('b', float), ('c', '|S8')])
+ x['a'][0, 1] = masked
+ x['b'][1, 0] = masked
+ x['c'][0, 2] = masked
+ x[-1, -1] = masked
+ xflat = x.flat
+ assert_equal(xflat[0], x[0, 0])
+ assert_equal(xflat[1], x[0, 1])
+ assert_equal(xflat[2], x[0, 2])
+ assert_equal(xflat[:3], x[0])
+ assert_equal(xflat[3], x[1, 0])
+ assert_equal(xflat[4], x[1, 1])
+ assert_equal(xflat[5], x[1, 2])
+ assert_equal(xflat[3:], x[1])
+ assert_equal(xflat[-1], x[-1, -1])
+ i = 0
+ j = 0
+ for xf in xflat:
+ assert_equal(xf, x[j, i])
+ i += 1
+ if i >= x.shape[-1]:
+ i = 0
+ j += 1
+
+ def test_assign_dtype(self):
+ # check that the mask's dtype is updated when dtype is changed
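+ # Assigning a new dtype must rebuild the mask's dtype to match; a
+ # dtype change that would desynchronise the shapes of data and mask
+ # raises ValueError.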
+ a = np.zeros(4, dtype='f4,i4')
+
+ m = np.ma.array(a)
+ m.dtype = np.dtype('f4')
+ repr(m) # raises?
+ assert_equal(m.dtype, np.dtype('f4'))
+
+ # check that dtype changes that change shape of mask too much
+ # are not allowed
+ def assign():
+ m = np.ma.array(a)
+ m.dtype = np.dtype('f8')
+ assert_raises(ValueError, assign)
+
+ b = a.view(dtype='f4', type=np.ma.MaskedArray) # raises?
+ assert_equal(b.dtype, np.dtype('f4'))
+
+ # check that nomask is preserved
+ a = np.zeros(4, dtype='f4')
+ m = np.ma.array(a)
+ m.dtype = np.dtype('f4,i4')
+ assert_equal(m.dtype, np.dtype('f4,i4'))
+ assert_equal(m._mask, np.ma.nomask)
+
+
+class TestFillingValues(object):
+
+ def test_check_on_scalar(self):
+ # Test _check_fill_value set to valid and invalid values
+ _check_fill_value = np.ma.core._check_fill_value
+
+ fval = _check_fill_value(0, int)
+ assert_equal(fval, 0)
+ fval = _check_fill_value(None, int)
+ assert_equal(fval, default_fill_value(0))
+
+ fval = _check_fill_value(0, "|S3")
+ assert_equal(fval, b"0")
+ fval = _check_fill_value(None, "|S3")
+ assert_equal(fval, default_fill_value(b"camelot!"))
+ assert_raises(TypeError, _check_fill_value, 1e+20, int)
+ assert_raises(TypeError, _check_fill_value, 'stuff', int)
+
+ def test_check_on_fields(self):
+ # Tests _check_fill_value with records
+ _check_fill_value = np.ma.core._check_fill_value
+ ndtype = [('a', int), ('b', float), ('c', "|S3")]
+ # A check on a list should return a single record
+ fval = _check_fill_value([-999, -12345678.9, "???"], ndtype)
+ assert_(isinstance(fval, ndarray))
+ assert_equal(fval.item(), [-999, -12345678.9, b"???"])
+ # A check on None should output the defaults
+ fval = _check_fill_value(None, ndtype)
+ assert_(isinstance(fval, ndarray))
+ assert_equal(fval.item(), [default_fill_value(0),
+ default_fill_value(0.),
+ asbytes(default_fill_value("0"))])
+ #.....Using a structured type as fill_value should work
+ fill_val = np.array((-999, -12345678.9, "???"), dtype=ndtype)
+ fval = _check_fill_value(fill_val, ndtype)
+ assert_(isinstance(fval, ndarray))
+ assert_equal(fval.item(), [-999, -12345678.9, b"???"])
+
+ #.....Using a flexible type w/ a different type shouldn't matter
+ # BEHAVIOR in 1.5 and earlier, and 1.13 and later: match structured
+ # types by position
+ fill_val = np.array((-999, -12345678.9, "???"),
+ dtype=[("A", int), ("B", float), ("C", "|S3")])
+ fval = _check_fill_value(fill_val, ndtype)
+ assert_(isinstance(fval, ndarray))
+ assert_equal(fval.item(), [-999, -12345678.9, b"???"])
+
+ #.....Using an object-array shouldn't matter either
+ fill_val = np.ndarray(shape=(1,), dtype=object)
+ fill_val[0] = (-999, -12345678.9, b"???")
+ fval = _check_fill_value(fill_val, object)
+ assert_(isinstance(fval, ndarray))
+ assert_equal(fval.item(), [-999, -12345678.9, b"???"])
+ # NOTE: This test was never run properly as "fill_value" rather than
+ # "fill_val" was assigned. Written properly, it fails.
+ #fill_val = np.array((-999, -12345678.9, "???"))
+ #fval = _check_fill_value(fill_val, ndtype)
+ #assert_(isinstance(fval, ndarray))
+ #assert_equal(fval.item(), [-999, -12345678.9, b"???"])
+ #.....One-field-only flexible type should work as well
+ ndtype = [("a", int)]
+ fval = _check_fill_value(-999999999, ndtype)
+ assert_(isinstance(fval, ndarray))
+ assert_equal(fval.item(), (-999999999,))
+
+ def test_fillvalue_conversion(self):
+ # Tests the behavior of fill_value during conversion
+ # Attach a tailored _optinfo comment to make sure special attributes
+ # are properly carried through the conversion
+ a = array([b'3', b'4', b'5'])
+ a._optinfo.update({'comment':"updated!"})
+
+ b = array(a, dtype=int)
+ assert_equal(b._data, [3, 4, 5])
+ assert_equal(b.fill_value, default_fill_value(0))
+
+ b = array(a, dtype=float)
+ assert_equal(b._data, [3, 4, 5])
+ assert_equal(b.fill_value, default_fill_value(0.))
+
+ b = a.astype(int)
+ assert_equal(b._data, [3, 4, 5])
+ assert_equal(b.fill_value, default_fill_value(0))
+ assert_equal(b._optinfo['comment'], "updated!")
+
+ b = a.astype([('a', '|S3')])
+ assert_equal(b['a']._data, a._data)
+ assert_equal(b['a'].fill_value, a.fill_value)
+
+ def test_default_fill_value(self):
+ # check all calling conventions
+ f1 = default_fill_value(1.)
+ f2 = default_fill_value(np.array(1.))
+ f3 = default_fill_value(np.array(1.).dtype)
+ assert_equal(f1, f2)
+ assert_equal(f1, f3)
+
+ def test_default_fill_value_structured(self):
+ fields = array([(1, 1, 1)],
+ dtype=[('i', int), ('s', '|S8'), ('f', float)])
+
+ f1 = default_fill_value(fields)
+ f2 = default_fill_value(fields.dtype)
+ expected = np.array((default_fill_value(0),
+ default_fill_value('0'),
+ default_fill_value(0.)), dtype=fields.dtype)
+ assert_equal(f1, expected)
+ assert_equal(f2, expected)
+
+ def test_default_fill_value_void(self):
+ dt = np.dtype([('v', 'V7')])
+ f = default_fill_value(dt)
+ assert_equal(f['v'], np.array(default_fill_value(dt['v']), dt['v']))
+
+ def test_fillvalue(self):
+ # Yet more fun with the fill_value
+ data = masked_array([1, 2, 3], fill_value=-999)
+ series = data[[0, 2, 1]]
+ assert_equal(series._fill_value, data._fill_value)
+
+ mtype = [('f', float), ('s', '|S3')]
+ x = array([(1, 'a'), (2, 'b'), (pi, 'pi')], dtype=mtype)
+ x.fill_value = 999
+ assert_equal(x.fill_value.item(), [999., b'999'])
+ assert_equal(x['f'].fill_value, 999)
+ assert_equal(x['s'].fill_value, b'999')
+
+ x.fill_value = (9, '???')
+ assert_equal(x.fill_value.item(), (9, b'???'))
+ assert_equal(x['f'].fill_value, 9)
+ assert_equal(x['s'].fill_value, b'???')
+
+ x = array([1, 2, 3.1])
+ x.fill_value = 999
+ assert_equal(np.asarray(x.fill_value).dtype, float)
+ assert_equal(x.fill_value, 999.)
+ assert_equal(x._fill_value, np.array(999.))
+
+ def test_subarray_fillvalue(self):
+ # gh-10483 test multi-field index fill value
+ fields = array([(1, 1, 1)],
+ dtype=[('i', int), ('s', '|S8'), ('f', float)])
+ with suppress_warnings() as sup:
+ sup.filter(FutureWarning, "Numpy has detected")
+ subfields = fields[['i', 'f']]
+ assert_equal(tuple(subfields.fill_value), (999999, 1.e+20))
+ # test comparison does not raise:
+ subfields[1:] == subfields[:-1]
+
+ def test_fillvalue_exotic_dtype(self):
+ # Tests yet more exotic flexible dtypes
+ _check_fill_value = np.ma.core._check_fill_value
+ ndtype = [('i', int), ('s', '|S8'), ('f', float)]
+ control = np.array((default_fill_value(0),
+ default_fill_value('0'),
+ default_fill_value(0.),),
+ dtype=ndtype)
+ assert_equal(_check_fill_value(None, ndtype), control)
+ # The shape shouldn't matter
+ ndtype = [('f0', float, (2, 2))]
+ control = np.array((default_fill_value(0.),),
+ dtype=[('f0', float)]).astype(ndtype)
+ assert_equal(_check_fill_value(None, ndtype), control)
+ control = np.array((0,), dtype=[('f0', float)]).astype(ndtype)
+ assert_equal(_check_fill_value(0, ndtype), control)
+
+ ndtype = np.dtype("int, (2,3)float, float")
+ control = np.array((default_fill_value(0),
+ default_fill_value(0.),
+ default_fill_value(0.),),
+ dtype="int, float, float").astype(ndtype)
+ test = _check_fill_value(None, ndtype)
+ assert_equal(test, control)
+ control = np.array((0, 0, 0), dtype="int, float, float").astype(ndtype)
+ assert_equal(_check_fill_value(0, ndtype), control)
+ # but when indexing, fill value should become scalar not tuple
+ # See issue #6723
+ M = masked_array(control)
+ assert_equal(M["f1"].fill_value.ndim, 0)
+
+ def test_fillvalue_datetime_timedelta(self):
+ # Test default fillvalue for datetime64 and timedelta64 types.
+ # See issue #4476, this would return '?' which would cause errors
+ # elsewhere
+
+ for timecode in ("as", "fs", "ps", "ns", "us", "ms", "s", "m",
+ "h", "D", "W", "M", "Y"):
+ control = numpy.datetime64("NaT", timecode)
+ test = default_fill_value(numpy.dtype("<M8[" + timecode + "]"))
+ np.testing.assert_equal(test, control)
+
+ control = numpy.timedelta64("NaT", timecode)
+ test = default_fill_value(numpy.dtype("<m8[" + timecode + "]"))
+ np.testing.assert_equal(test, control)
+
+ def test_extremum_fill_value(self):
+ # Tests extremum fill values for flexible type.
+ a = array([(1, (2, 3)), (4, (5, 6))],
+ dtype=[('A', int), ('B', [('BA', int), ('BB', int)])])
+ test = a.fill_value
+ assert_equal(test.dtype, a.dtype)
+ assert_equal(test['A'], default_fill_value(a['A']))
+ assert_equal(test['B']['BA'], default_fill_value(a['B']['BA']))
+ assert_equal(test['B']['BB'], default_fill_value(a['B']['BB']))
+
+ test = minimum_fill_value(a)
+ assert_equal(test.dtype, a.dtype)
+ assert_equal(test[0], minimum_fill_value(a['A']))
+ assert_equal(test[1][0], minimum_fill_value(a['B']['BA']))
+ assert_equal(test[1][1], minimum_fill_value(a['B']['BB']))
+ assert_equal(test[1], minimum_fill_value(a['B']))
+
+ test = maximum_fill_value(a)
+ assert_equal(test.dtype, a.dtype)
+ assert_equal(test[0], maximum_fill_value(a['A']))
+ assert_equal(test[1][0], maximum_fill_value(a['B']['BA']))
+ assert_equal(test[1][1], maximum_fill_value(a['B']['BB']))
+ assert_equal(test[1], maximum_fill_value(a['B']))
+
+ def test_extremum_fill_value_subdtype(self):
+ a = array(([2, 3, 4],), dtype=[('value', np.int8, 3)])
+
+ test = minimum_fill_value(a)
+ assert_equal(test.dtype, a.dtype)
+ assert_equal(test[0], np.full(3, minimum_fill_value(a['value'])))
+
+ test = maximum_fill_value(a)
+ assert_equal(test.dtype, a.dtype)
+ assert_equal(test[0], np.full(3, maximum_fill_value(a['value'])))
+
+ def test_fillvalue_individual_fields(self):
+ # Test setting fill_value on individual fields
+ ndtype = [('a', int), ('b', int)]
+ # Explicit fill_value
+ a = array(list(zip([1, 2, 3], [4, 5, 6])),
+ fill_value=(-999, -999), dtype=ndtype)
+ aa = a['a']
+ aa.set_fill_value(10)
+ assert_equal(aa._fill_value, np.array(10))
+ assert_equal(tuple(a.fill_value), (10, -999))
+ a.fill_value['b'] = -10
+ assert_equal(tuple(a.fill_value), (10, -10))
+ # Implicit fill_value
+ t = array(list(zip([1, 2, 3], [4, 5, 6])), dtype=ndtype)
+ tt = t['a']
+ tt.set_fill_value(10)
+ assert_equal(tt._fill_value, np.array(10))
+ assert_equal(tuple(t.fill_value), (10, default_fill_value(0)))
+
+ def test_fillvalue_implicit_structured_array(self):
+ # Check that fill_value is always defined for structured arrays
+ ndtype = ('b', float)
+ adtype = ('a', float)
+ a = array([(1.,), (2.,)], mask=[(False,), (False,)],
+ fill_value=(np.nan,), dtype=np.dtype([adtype]))
+ b = empty(a.shape, dtype=[adtype, ndtype])
+ b['a'] = a['a']
+ b['a'].set_fill_value(a['a'].fill_value)
+ f = b._fill_value[()]
+ assert_(np.isnan(f[0]))
+ assert_equal(f[-1], default_fill_value(1.))
+
+ def test_fillvalue_as_arguments(self):
+ # Test adding a fill_value parameter to empty/ones/zeros
+ a = empty(3, fill_value=999.)
+ assert_equal(a.fill_value, 999.)
+
+ a = ones(3, fill_value=999., dtype=float)
+ assert_equal(a.fill_value, 999.)
+
+ a = zeros(3, fill_value=0., dtype=complex)
+ assert_equal(a.fill_value, 0.)
+
+ a = identity(3, fill_value=0., dtype=complex)
+ assert_equal(a.fill_value, 0.)
+
+ def test_shape_argument(self):
+ # Test that shape can be provided as an argument
+ # GH issue 6106
+ a = empty(shape=(3, ))
+ assert_equal(a.shape, (3, ))
+
+ a = ones(shape=(3, ), dtype=float)
+ assert_equal(a.shape, (3, ))
+
+ a = zeros(shape=(3, ), dtype=complex)
+ assert_equal(a.shape, (3, ))
+
+ def test_fillvalue_in_view(self):
+ # Test the behavior of fill_value in view
+
+ # Create initial masked array
+ x = array([1, 2, 3], fill_value=1, dtype=np.int64)
+
+ # Check that fill_value is preserved by default
+ y = x.view()
+ assert_(y.fill_value == 1)
+
+ # Check that fill_value is preserved if dtype is specified and the
+ # dtype is an ndarray sub-class and has a _fill_value attribute
+ y = x.view(MaskedArray)
+ assert_(y.fill_value == 1)
+
+ # Check that fill_value is preserved if type is specified and the
+ # dtype is an ndarray sub-class and has a _fill_value attribute (by
+ # default, the first argument is dtype, not type)
+ y = x.view(type=MaskedArray)
+ assert_(y.fill_value == 1)
+
+ # Check that code does not crash if passed an ndarray sub-class that
+ # does not have a _fill_value attribute
+ y = x.view(np.ndarray)
+ y = x.view(type=np.ndarray)
+
+ # Check that fill_value can be overridden with view
+ y = x.view(MaskedArray, fill_value=2)
+ assert_(y.fill_value == 2)
+
+ # Check that fill_value can be overridden with view (using type=)
+ y = x.view(type=MaskedArray, fill_value=2)
+ assert_(y.fill_value == 2)
+
+ # Check that fill_value gets reset if passed a dtype but not a
+ # fill_value. This is because even though in some cases one can safely
+ # cast the fill_value, e.g. if taking an int64 view of an int32 array,
+ # in other cases, this cannot be done (e.g. int32 view of an int64
+ # array with a large fill_value).
+ y = x.view(dtype=np.int32)
+ assert_(y.fill_value == 999999)
+
+ def test_fillvalue_bytes_or_str(self):
+ # Test whether fill values work as expected for structured dtypes
+ # containing bytes or str. See issue #7259.
+ a = empty(shape=(3, ), dtype="(2)3S,(2)3U")
+ assert_equal(a["f0"].fill_value, default_fill_value(b"spam"))
+ assert_equal(a["f1"].fill_value, default_fill_value("eggs"))
+
+
+class TestUfuncs(object):
+ # Test class for the application of ufuncs on MaskedArrays.
+
+ def setup(self):
+ # Base data definition.
+ self.d = (array([1.0, 0, -1, pi / 2] * 2, mask=[0, 1] + [0] * 6),
+ array([1.0, 0, -1, pi / 2] * 2, mask=[1, 0] + [0] * 6),)
+ self.err_status = np.geterr()
+ np.seterr(divide='ignore', invalid='ignore')
+
+ def teardown(self):
+ np.seterr(**self.err_status)
+
+ def test_testUfuncRegression(self):
+ # Tests new ufuncs on MaskedArrays.
+ for f in ['sqrt', 'log', 'log10', 'exp', 'conjugate',
+ 'sin', 'cos', 'tan',
+ 'arcsin', 'arccos', 'arctan',
+ 'sinh', 'cosh', 'tanh',
+ 'arcsinh',
+ 'arccosh',
+ 'arctanh',
+ 'absolute', 'fabs', 'negative',
+ 'floor', 'ceil',
+ 'logical_not',
+ 'add', 'subtract', 'multiply',
+ 'divide', 'true_divide', 'floor_divide',
+ 'remainder', 'fmod', 'hypot', 'arctan2',
+ 'equal', 'not_equal', 'less_equal', 'greater_equal',
+ 'less', 'greater',
+ 'logical_and', 'logical_or', 'logical_xor',
+ ]:
+ try:
+ uf = getattr(umath, f)
+ except AttributeError:
+ uf = getattr(fromnumeric, f)
+ mf = getattr(numpy.ma.core, f)
+ args = self.d[:uf.nin]
+ ur = uf(*args)
+ mr = mf(*args)
+ assert_equal(ur.filled(0), mr.filled(0), f)
+ assert_mask_equal(ur.mask, mr.mask, err_msg=f)
+
+ def test_reduce(self):
+ # Tests reduce on MaskedArrays.
+ a = self.d[0]
+ assert_(not alltrue(a, axis=0))
+ assert_(sometrue(a, axis=0))
+ assert_equal(sum(a[:3], axis=0), 0)
+ assert_equal(product(a, axis=0), 0)
+ assert_equal(add.reduce(a), pi)
+
+ def test_minmax(self):
+ # Tests extrema on MaskedArrays.
+ a = arange(1, 13).reshape(3, 4)
+ amask = masked_where(a < 5, a)
+ assert_equal(amask.max(), a.max())
+ assert_equal(amask.min(), 5)
+ assert_equal(amask.max(0), a.max(0))
+ assert_equal(amask.min(0), [5, 6, 7, 8])
+ assert_(amask.max(1)[0].mask)
+ assert_(amask.min(1)[0].mask)
+
+ def test_ndarray_mask(self):
+ # Check that the mask of the result is an ndarray (not a MaskedArray...)
+ a = masked_array([-1, 0, 1, 2, 3], mask=[0, 0, 0, 0, 1])
+ test = np.sqrt(a)
+ control = masked_array([-1, 0, 1, np.sqrt(2), -1],
+ mask=[1, 0, 0, 0, 1])
+ assert_equal(test, control)
+ assert_equal(test.mask, control.mask)
+ assert_(not isinstance(test.mask, MaskedArray))
+
+ def test_treatment_of_NotImplemented(self):
+ # Check that NotImplemented is returned at appropriate places
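+ # Binary ops must defer to operands with a higher __array_priority__,
+ # returning NotImplemented so that the other operand's reflected
+ # method (__rmul__, __rtruediv__, ...) is used instead.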
+
+ a = masked_array([1., 2.], mask=[1, 0])
+ assert_raises(TypeError, operator.mul, a, "abc")
+ assert_raises(TypeError, operator.truediv, a, "abc")
+
+ class MyClass(object):
+ __array_priority__ = a.__array_priority__ + 1
+
+ def __mul__(self, other):
+ return "My mul"
+
+ def __rmul__(self, other):
+ return "My rmul"
+
+ me = MyClass()
+ assert_(me * a == "My mul")
+ assert_(a * me == "My rmul")
+
+ # and that __array_priority__ is respected
+ class MyClass2(object):
+ __array_priority__ = 100
+
+ def __mul__(self, other):
+ return "Me2mul"
+
+ def __rmul__(self, other):
+ return "Me2rmul"
+
+ def __rdiv__(self, other):
+ return "Me2rdiv"
+
+ __rtruediv__ = __rdiv__
+
+ me_too = MyClass2()
+ assert_(a.__mul__(me_too) is NotImplemented)
+ assert_(all(multiply.outer(a, me_too) == "Me2rmul"))
+ assert_(a.__truediv__(me_too) is NotImplemented)
+ assert_(me_too * a == "Me2mul")
+ assert_(a * me_too == "Me2rmul")
+ assert_(a / me_too == "Me2rdiv")
+
+ def test_no_masked_nan_warnings(self):
+ # check that a NaN in a masked position does not
+ # cause ufunc warnings
+
+ m = np.ma.array([0.5, np.nan], mask=[0,1])
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings("error")
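+ # Any warning raised here becomes an error, so a NaN leaking past the mask would fail the test.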
+
+ # test unary and binary ufuncs
+ exp(m)
+ add(m, 1)
+ m > 0
+
+ # test different unary domains
+ sqrt(m)
+ log(m)
+ tan(m)
+ arcsin(m)
+ arccos(m)
+ arccosh(m)
+
+ # test binary domains
+ divide(m, 2)
+
+ # also check that allclose uses ma ufuncs, to avoid warning
+ allclose(m, 0.5)
+
+class TestMaskedArrayInPlaceArithmetics(object):
+ # Test MaskedArray in-place arithmetic.
+
+ def setup(self):
+ x = arange(10)
+ y = arange(10)
+ xm = arange(10)
+ xm[2] = masked
+ self.intdata = (x, y, xm)
+ self.floatdata = (x.astype(float), y.astype(float), xm.astype(float))
+ self.othertypes = np.typecodes['AllInteger'] + np.typecodes['AllFloat']
+ self.othertypes = [np.dtype(_).type for _ in self.othertypes]
+ self.uint8data = (
+ x.astype(np.uint8),
+ y.astype(np.uint8),
+ xm.astype(np.uint8)
+ )
+
+ def test_inplace_addition_scalar(self):
+ # Test of inplace additions
+ (x, y, xm) = self.intdata
+ xm[2] = masked
+ x += 1
+ assert_equal(x, y + 1)
+ xm += 1
+ assert_equal(xm, y + 1)
+
+ (x, _, xm) = self.floatdata
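+ # Record the data buffer address to verify that += below operates in place without reallocating.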
+ id1 = x.data.ctypes.data
+ x += 1.
+ assert_(id1 == x.data.ctypes.data)
+ assert_equal(x, y + 1.)
+
+ def test_inplace_addition_array(self):
+ # Test of inplace additions
+ (x, y, xm) = self.intdata
+ m = xm.mask
+ a = arange(10, dtype=np.int16)
+ a[-1] = masked
+ x += a
+ xm += a
+ assert_equal(x, y + a)
+ assert_equal(xm, y + a)
+ assert_equal(xm.mask, mask_or(m, a.mask))
+
+ def test_inplace_subtraction_scalar(self):
+ # Test of inplace subtractions
+ (x, y, xm) = self.intdata
+ x -= 1
+ assert_equal(x, y - 1)
+ xm -= 1
+ assert_equal(xm, y - 1)
+
+ def test_inplace_subtraction_array(self):
+ # Test of inplace subtractions
+ (x, y, xm) = self.floatdata
+ m = xm.mask
+ a = arange(10, dtype=float)
+ a[-1] = masked
+ x -= a
+ xm -= a
+ assert_equal(x, y - a)
+ assert_equal(xm, y - a)
+ assert_equal(xm.mask, mask_or(m, a.mask))
+
+ def test_inplace_multiplication_scalar(self):
+ # Test of inplace multiplication
+ (x, y, xm) = self.floatdata
+ x *= 2.0
+ assert_equal(x, y * 2)
+ xm *= 2.0
+ assert_equal(xm, y * 2)
+
+ def test_inplace_multiplication_array(self):
+ # Test of inplace multiplication
+ (x, y, xm) = self.floatdata
+ m = xm.mask
+ a = arange(10, dtype=float)
+ a[-1] = masked
+ x *= a
+ xm *= a
+ assert_equal(x, y * a)
+ assert_equal(xm, y * a)
+ assert_equal(xm.mask, mask_or(m, a.mask))
+
+ def test_inplace_division_scalar_int(self):
+ # Test of inplace division
+ (x, y, xm) = self.intdata
+ x = arange(10) * 2
+ xm = arange(10) * 2
+ xm[2] = masked
+ x //= 2
+ assert_equal(x, y)
+ xm //= 2
+ assert_equal(xm, y)
+
+ def test_inplace_division_scalar_float(self):
+ # Test of inplace division
+ (x, y, xm) = self.floatdata
+ x /= 2.0
+ assert_equal(x, y / 2.0)
+ xm /= arange(10)
+ assert_equal(xm, ones((10,)))
+
+ def test_inplace_division_array_float(self):
+ # Test of inplace division
+ (x, y, xm) = self.floatdata
+ m = xm.mask
+ a = arange(10, dtype=float)
+ a[-1] = masked
+ x /= a
+ xm /= a
+ assert_equal(x, y / a)
+ assert_equal(xm, y / a)
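+ # a[0] == 0, so the zero-division entry must be masked in addition to the original and operand masks.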
+ assert_equal(xm.mask, mask_or(mask_or(m, a.mask), (a == 0)))
+
+ def test_inplace_division_misc(self):
+
+ x = [1., 1., 1., -2., pi / 2., 4., 5., -10., 10., 1., 2., 3.]
+ y = [5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.]
+ m1 = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ m2 = [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1]
+ xm = masked_array(x, mask=m1)
+ ym = masked_array(y, mask=m2)
+
+ z = xm / ym
+ assert_equal(z._mask, [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1])
+ assert_equal(z._data,
+ [1., 1., 1., -1., -pi / 2., 4., 5., 1., 1., 1., 2., 3.])
+
+ xm = xm.copy()
+ xm /= ym
+ assert_equal(xm._mask, [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1])
+ assert_equal(z._data,
+ [1., 1., 1., -1., -pi / 2., 4., 5., 1., 1., 1., 2., 3.])
+
+ def test_datafriendly_add(self):
+ # Test keeping data w/ (inplace) addition
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ # Test add w/ scalar
+ xx = x + 1
+ assert_equal(xx.data, [2, 3, 3])
+ assert_equal(xx.mask, [0, 0, 1])
+ # Test iadd w/ scalar
+ x += 1
+ assert_equal(x.data, [2, 3, 3])
+ assert_equal(x.mask, [0, 0, 1])
+ # Test add w/ array
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ xx = x + array([1, 2, 3], mask=[1, 0, 0])
+ assert_equal(xx.data, [1, 4, 3])
+ assert_equal(xx.mask, [1, 0, 1])
+ # Test iadd w/ array
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ x += array([1, 2, 3], mask=[1, 0, 0])
+ assert_equal(x.data, [1, 4, 3])
+ assert_equal(x.mask, [1, 0, 1])
+
+ def test_datafriendly_sub(self):
+ # Test keeping data w/ (inplace) subtraction
+ # Test sub w/ scalar
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ xx = x - 1
+ assert_equal(xx.data, [0, 1, 3])
+ assert_equal(xx.mask, [0, 0, 1])
+ # Test isub w/ scalar
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ x -= 1
+ assert_equal(x.data, [0, 1, 3])
+ assert_equal(x.mask, [0, 0, 1])
+ # Test sub w/ array
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ xx = x - array([1, 2, 3], mask=[1, 0, 0])
+ assert_equal(xx.data, [1, 0, 3])
+ assert_equal(xx.mask, [1, 0, 1])
+ # Test isub w/ array
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ x -= array([1, 2, 3], mask=[1, 0, 0])
+ assert_equal(x.data, [1, 0, 3])
+ assert_equal(x.mask, [1, 0, 1])
+
+ def test_datafriendly_mul(self):
+ # Test keeping data w/ (inplace) multiplication
+ # Test mul w/ scalar
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ xx = x * 2
+ assert_equal(xx.data, [2, 4, 3])
+ assert_equal(xx.mask, [0, 0, 1])
+ # Test imul w/ scalar
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ x *= 2
+ assert_equal(x.data, [2, 4, 3])
+ assert_equal(x.mask, [0, 0, 1])
+ # Test mul w/ array
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ xx = x * array([10, 20, 30], mask=[1, 0, 0])
+ assert_equal(xx.data, [1, 40, 3])
+ assert_equal(xx.mask, [1, 0, 1])
+ # Test imul w/ array
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ x *= array([10, 20, 30], mask=[1, 0, 0])
+ assert_equal(x.data, [1, 40, 3])
+ assert_equal(x.mask, [1, 0, 1])
+
+ def test_datafriendly_div(self):
+ # Test keeping data w/ (inplace) division
+ # Test div on scalar
+ x = array([1, 2, 3], mask=[0, 0, 1])
+ xx = x / 2.
+ assert_equal(xx.data, [1 / 2., 2 / 2., 3])
+ assert_equal(xx.mask, [0, 0, 1])
+ # Test idiv on scalar
+ x = array([1., 2., 3.], mask=[0, 0, 1])
+ x /= 2.
+ assert_equal(x.data, [1 / 2., 2 / 2., 3])
+ assert_equal(x.mask, [0, 0, 1])
+ # Test div on array
+ x = array([1., 2., 3.], mask=[0, 0, 1])
+ xx = x / array([10., 20., 30.], mask=[1, 0, 0])
+ assert_equal(xx.data, [1., 2. / 20., 3.])
+ assert_equal(xx.mask, [1, 0, 1])
+ # Test idiv on array
+ x = array([1., 2., 3.], mask=[0, 0, 1])
+ x /= array([10., 20., 30.], mask=[1, 0, 0])
+ assert_equal(x.data, [1., 2 / 20., 3.])
+ assert_equal(x.mask, [1, 0, 1])
+
+ def test_datafriendly_pow(self):
+ # Test keeping data w/ (inplace) power
+ # Test pow on scalar
+ x = array([1., 2., 3.], mask=[0, 0, 1])
+ xx = x ** 2.5
+ assert_equal(xx.data, [1., 2. ** 2.5, 3.])
+ assert_equal(xx.mask, [0, 0, 1])
+ # Test ipow on scalar
+ x **= 2.5
+ assert_equal(x.data, [1., 2. ** 2.5, 3])
+ assert_equal(x.mask, [0, 0, 1])
+
+ def test_datafriendly_add_arrays(self):
+ a = array([[1, 1], [3, 3]])
+ b = array([1, 1], mask=[0, 0])
+ a += b
+ assert_equal(a, [[2, 2], [4, 4]])
+ if a.mask is not nomask:
+ assert_equal(a.mask, [[0, 0], [0, 0]])
+
+ a = array([[1, 1], [3, 3]])
+ b = array([1, 1], mask=[0, 1])
+ a += b
+ assert_equal(a, [[2, 2], [4, 4]])
+ assert_equal(a.mask, [[0, 1], [0, 1]])
+
+ def test_datafriendly_sub_arrays(self):
+ a = array([[1, 1], [3, 3]])
+ b = array([1, 1], mask=[0, 0])
+ a -= b
+ assert_equal(a, [[0, 0], [2, 2]])
+ if a.mask is not nomask:
+ assert_equal(a.mask, [[0, 0], [0, 0]])
+
+ a = array([[1, 1], [3, 3]])
+ b = array([1, 1], mask=[0, 1])
+ a -= b
+ assert_equal(a, [[0, 0], [2, 2]])
+ assert_equal(a.mask, [[0, 1], [0, 1]])
+
+ def test_datafriendly_mul_arrays(self):
+ a = array([[1, 1], [3, 3]])
+ b = array([1, 1], mask=[0, 0])
+ a *= b
+ assert_equal(a, [[1, 1], [3, 3]])
+ if a.mask is not nomask:
+ assert_equal(a.mask, [[0, 0], [0, 0]])
+
+ a = array([[1, 1], [3, 3]])
+ b = array([1, 1], mask=[0, 1])
+ a *= b
+ assert_equal(a, [[1, 1], [3, 3]])
+ assert_equal(a.mask, [[0, 1], [0, 1]])
+
+ def test_inplace_addition_scalar_type(self):
+ # Test of inplace additions
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ xm[2] = masked
+ x += t(1)
+ assert_equal(x, y + t(1))
+ xm += t(1)
+ assert_equal(xm, y + t(1))
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_addition_array_type(self):
+ # Test of inplace additions
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ m = xm.mask
+ a = arange(10, dtype=t)
+ a[-1] = masked
+ x += a
+ xm += a
+ assert_equal(x, y + a)
+ assert_equal(xm, y + a)
+ assert_equal(xm.mask, mask_or(m, a.mask))
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_subtraction_scalar_type(self):
+ # Test of inplace subtractions
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ x -= t(1)
+ assert_equal(x, y - t(1))
+ xm -= t(1)
+ assert_equal(xm, y - t(1))
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_subtraction_array_type(self):
+ # Test of inplace subtractions
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ m = xm.mask
+ a = arange(10, dtype=t)
+ a[-1] = masked
+ x -= a
+ xm -= a
+ assert_equal(x, y - a)
+ assert_equal(xm, y - a)
+ assert_equal(xm.mask, mask_or(m, a.mask))
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_multiplication_scalar_type(self):
+ # Test of inplace multiplication
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ x *= t(2)
+ assert_equal(x, y * t(2))
+ xm *= t(2)
+ assert_equal(xm, y * t(2))
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_multiplication_array_type(self):
+ # Test of inplace multiplication
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ m = xm.mask
+ a = arange(10, dtype=t)
+ a[-1] = masked
+ x *= a
+ xm *= a
+ assert_equal(x, y * a)
+ assert_equal(xm, y * a)
+ assert_equal(xm.mask, mask_or(m, a.mask))
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_floor_division_scalar_type(self):
+ # Test of inplace division
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ x = arange(10, dtype=t) * t(2)
+ xm = arange(10, dtype=t) * t(2)
+ xm[2] = masked
+ x //= t(2)
+ xm //= t(2)
+ assert_equal(x, y)
+ assert_equal(xm, y)
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_floor_division_array_type(self):
+ # Test of inplace division
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ m = xm.mask
+ a = arange(10, dtype=t)
+ a[-1] = masked
+ x //= a
+ xm //= a
+ assert_equal(x, y // a)
+ assert_equal(xm, y // a)
+ assert_equal(
+ xm.mask,
+ mask_or(mask_or(m, a.mask), (a == t(0)))
+ )
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+ def test_inplace_division_scalar_type(self):
+ # Test of inplace division
+ for t in self.othertypes:
+ with suppress_warnings() as sup:
+ sup.record(UserWarning)
+
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ x = arange(10, dtype=t) * t(2)
+ xm = arange(10, dtype=t) * t(2)
+ xm[2] = masked
+
+ # May get a DeprecationWarning or a TypeError.
+ #
+ # This is a consequence of the fact that this is true division,
+ # which requires casting to float for the calculation and casting
+ # back to the original type; it is only raised for integer types.
+ # Whether it is an error or a warning depends only on how stringent
+ # the casting rules are.
+ #
+ # Either way it is handled the same below.
+ try:
+ x /= t(2)
+ assert_equal(x, y)
+ except (DeprecationWarning, TypeError) as e:
+ warnings.warn(str(e), stacklevel=1)
+ try:
+ xm /= t(2)
+ assert_equal(xm, y)
+ except (DeprecationWarning, TypeError) as e:
+ warnings.warn(str(e), stacklevel=1)
+
+ if issubclass(t, np.integer):
+ assert_equal(len(sup.log), 2, "Failed on type=%s." % t)
+ else:
+ assert_equal(len(sup.log), 0, "Failed on type=%s." % t)
+
+ def test_inplace_division_array_type(self):
+ # Test of inplace division
+ for t in self.othertypes:
+ with suppress_warnings() as sup:
+ sup.record(UserWarning)
+ (x, y, xm) = (_.astype(t) for _ in self.uint8data)
+ m = xm.mask
+ a = arange(10, dtype=t)
+ a[-1] = masked
+
+ # May get a DeprecationWarning or a TypeError.
+ #
+ # This is a consequence of the fact that this is true division,
+ # which requires casting to float for the calculation and casting
+ # back to the original type; it is only raised for integer types.
+ # Whether it is an error or a warning depends only on how stringent
+ # the casting rules are.
+ #
+ # Either way it is handled the same below.
+ try:
+ x /= a
+ assert_equal(x, y / a)
+ except (DeprecationWarning, TypeError) as e:
+ warnings.warn(str(e), stacklevel=1)
+ try:
+ xm /= a
+ assert_equal(xm, y / a)
+ assert_equal(
+ xm.mask,
+ mask_or(mask_or(m, a.mask), (a == t(0)))
+ )
+ except (DeprecationWarning, TypeError) as e:
+ warnings.warn(str(e), stacklevel=1)
+
+ if issubclass(t, np.integer):
+ assert_equal(len(sup.log), 2, "Failed on type=%s." % t)
+ else:
+ assert_equal(len(sup.log), 0, "Failed on type=%s." % t)
+
+ def test_inplace_pow_type(self):
+ # Test keeping data w/ (inplace) power
+ for t in self.othertypes:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always")
+ # Test pow on scalar
+ x = array([1, 2, 3], mask=[0, 0, 1], dtype=t)
+ xx = x ** t(2)
+ xx_r = array([1, 2 ** 2, 3], mask=[0, 0, 1], dtype=t)
+ assert_equal(xx.data, xx_r.data)
+ assert_equal(xx.mask, xx_r.mask)
+ # Test ipow on scalar
+ x **= t(2)
+ assert_equal(x.data, xx_r.data)
+ assert_equal(x.mask, xx_r.mask)
+
+ assert_equal(len(w), 0, "Failed on type=%s." % t)
+
+
+class TestMaskedArrayMethods(object):
+ # Test class for miscellaneous MaskedArray methods.
+ def setup(self):
+ # Base data definition.
+ x = np.array([8.375, 7.545, 8.828, 8.5, 1.757, 5.928,
+ 8.43, 7.78, 9.865, 5.878, 8.979, 4.732,
+ 3.012, 6.022, 5.095, 3.116, 5.238, 3.957,
+ 6.04, 9.63, 7.712, 3.382, 4.489, 6.479,
+ 7.189, 9.645, 5.395, 4.961, 9.894, 2.893,
+ 7.357, 9.828, 6.272, 3.758, 6.693, 0.993])
+ X = x.reshape(6, 6)
+ XX = x.reshape(3, 2, 2, 3)
+
+ m = np.array([0, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 0, 1, 0])
+ mx = array(data=x, mask=m)
+ mX = array(data=X, mask=m.reshape(X.shape))
+ mXX = array(data=XX, mask=m.reshape(XX.shape))
+
+ m2 = np.array([1, 1, 0, 1, 0, 0,
+ 1, 1, 1, 1, 0, 1,
+ 0, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 1, 0,
+ 0, 0, 1, 0, 1, 1])
+ m2x = array(data=x, mask=m2)
+ m2X = array(data=X, mask=m2.reshape(X.shape))
+ m2XX = array(data=XX, mask=m2.reshape(XX.shape))
+ self.d = (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX)
+
+ def test_generic_methods(self):
+ # Tests some MaskedArray methods.
+ a = array([1, 3, 2])
+ assert_equal(a.any(), a._data.any())
+ assert_equal(a.all(), a._data.all())
+ assert_equal(a.argmax(), a._data.argmax())
+ assert_equal(a.argmin(), a._data.argmin())
+ assert_equal(a.choose(0, 1, 2, 3, 4), a._data.choose(0, 1, 2, 3, 4))
+ assert_equal(a.compress([1, 0, 1]), a._data.compress([1, 0, 1]))
+ assert_equal(a.conj(), a._data.conj())
+ assert_equal(a.conjugate(), a._data.conjugate())
+
+ m = array([[1, 2], [3, 4]])
+ assert_equal(m.diagonal(), m._data.diagonal())
+ assert_equal(a.sum(), a._data.sum())
+ assert_equal(a.take([1, 2]), a._data.take([1, 2]))
+ assert_equal(m.transpose(), m._data.transpose())
+
+ def test_allclose(self):
+ # Tests allclose on arrays
+ a = np.random.rand(10)
+ b = a + np.random.rand(10) * 1e-8
+ assert_(allclose(a, b))
+ # Test allclose w/ infs
+ a[0] = np.inf
+ assert_(not allclose(a, b))
+ b[0] = np.inf
+ assert_(allclose(a, b))
+ # Test allclose w/ masked
+ a = masked_array(a)
+ a[-1] = masked
+ assert_(allclose(a, b, masked_equal=True))
+ assert_(not allclose(a, b, masked_equal=False))
+ # Test comparison w/ scalar
+ a *= 1e-8
+ a[0] = 0
+ assert_(allclose(a, 0, masked_equal=True))
+
+ # Test that the function works for MIN_INT integer typed arrays
+ a = masked_array([np.iinfo(np.int_).min], dtype=np.int_)
+ assert_(allclose(a, a))
+
+ def test_allany(self):
+ # Checks the any/all methods/functions.
+ x = np.array([[0.13, 0.26, 0.90],
+ [0.28, 0.33, 0.63],
+ [0.31, 0.87, 0.70]])
+ m = np.array([[True, False, False],
+ [False, False, False],
+ [True, True, False]], dtype=np.bool_)
+ mx = masked_array(x, mask=m)
+ mxbig = (mx > 0.5)
+ mxsmall = (mx < 0.5)
+
+ assert_(not mxbig.all())
+ assert_(mxbig.any())
+ assert_equal(mxbig.all(0), [False, False, True])
+ assert_equal(mxbig.all(1), [False, False, True])
+ assert_equal(mxbig.any(0), [False, False, True])
+ assert_equal(mxbig.any(1), [True, True, True])
+
+ assert_(not mxsmall.all())
+ assert_(mxsmall.any())
+ assert_equal(mxsmall.all(0), [True, True, False])
+ assert_equal(mxsmall.all(1), [False, False, False])
+ assert_equal(mxsmall.any(0), [True, True, False])
+ assert_equal(mxsmall.any(1), [True, True, False])
+
+ def test_allany_oddities(self):
+ # Some fun with all and any
+ store = empty((), dtype=bool)
+ full = array([1, 2, 3], mask=True)
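+ # With every element masked, all()/any() return the masked singleton unless an output array is supplied.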
+
+ assert_(full.all() is masked)
+ full.all(out=store)
+ assert_(store)
+ assert_equal(store._mask, True)
+ assert_(store is not masked)
+
+ store = empty((), dtype=bool)
+ assert_(full.any() is masked)
+ full.any(out=store)
+ assert_(not store)
+ assert_equal(store._mask, True)
+ assert_(store is not masked)
+
+ def test_argmax_argmin(self):
+ # Tests argmin & argmax on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
+
+ assert_equal(mx.argmin(), 35)
+ assert_equal(mX.argmin(), 35)
+ assert_equal(m2x.argmin(), 4)
+ assert_equal(m2X.argmin(), 4)
+ assert_equal(mx.argmax(), 28)
+ assert_equal(mX.argmax(), 28)
+ assert_equal(m2x.argmax(), 31)
+ assert_equal(m2X.argmax(), 31)
+
+ assert_equal(mX.argmin(0), [2, 2, 2, 5, 0, 5])
+ assert_equal(m2X.argmin(0), [2, 2, 4, 5, 0, 4])
+ assert_equal(mX.argmax(0), [0, 5, 0, 5, 4, 0])
+ assert_equal(m2X.argmax(0), [5, 5, 0, 5, 1, 0])
+
+ assert_equal(mX.argmin(1), [4, 1, 0, 0, 5, 5, ])
+ assert_equal(m2X.argmin(1), [4, 4, 0, 0, 5, 3])
+ assert_equal(mX.argmax(1), [2, 4, 1, 1, 4, 1])
+ assert_equal(m2X.argmax(1), [2, 4, 1, 1, 1, 1])
+
+ def test_clip(self):
+ # Tests clip on MaskedArrays.
+ x = np.array([8.375, 7.545, 8.828, 8.5, 1.757, 5.928,
+ 8.43, 7.78, 9.865, 5.878, 8.979, 4.732,
+ 3.012, 6.022, 5.095, 3.116, 5.238, 3.957,
+ 6.04, 9.63, 7.712, 3.382, 4.489, 6.479,
+ 7.189, 9.645, 5.395, 4.961, 9.894, 2.893,
+ 7.357, 9.828, 6.272, 3.758, 6.693, 0.993])
+ m = np.array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0])
+ mx = array(x, mask=m)
+ clipped = mx.clip(2, 8)
+ assert_equal(clipped.mask, mx.mask)
+ assert_equal(clipped._data, x.clip(2, 8))
+ assert_equal(clipped._data, mx._data.clip(2, 8))
+
+ def test_compress(self):
+ # test compress
+ a = masked_array([1., 2., 3., 4., 5.], fill_value=9999)
+ condition = (a > 1.5) & (a < 3.5)
+ assert_equal(a.compress(condition), [2., 3.])
+
+ a[[2, 3]] = masked
+ b = a.compress(condition)
+ assert_equal(b._data, [2., 3.])
+ assert_equal(b._mask, [0, 1])
+ assert_equal(b.fill_value, 9999)
+ assert_equal(b, a[condition])
+
+ condition = (a < 4.)
+ b = a.compress(condition)
+ assert_equal(b._data, [1., 2., 3.])
+ assert_equal(b._mask, [0, 0, 1])
+ assert_equal(b.fill_value, 9999)
+ assert_equal(b, a[condition])
+
+ a = masked_array([[10, 20, 30], [40, 50, 60]],
+ mask=[[0, 0, 1], [1, 0, 0]])
+ b = a.compress(a.ravel() >= 22)
+ assert_equal(b._data, [30, 40, 50, 60])
+ assert_equal(b._mask, [1, 1, 0, 0])
+
+ x = np.array([3, 1, 2])
+ b = a.compress(x >= 2, axis=1)
+ assert_equal(b._data, [[10, 30], [40, 60]])
+ assert_equal(b._mask, [[0, 1], [1, 0]])
+
+ def test_compressed(self):
+ # Tests compressed
+ a = array([1, 2, 3, 4], mask=[0, 0, 0, 0])
+ b = a.compressed()
+ assert_equal(b, a)
+ a[0] = masked
+ b = a.compressed()
+ assert_equal(b, [2, 3, 4])
+
+ def test_empty(self):
+ # Tests empty/like
+ datatype = [('a', int), ('b', float), ('c', '|S8')]
+ a = masked_array([(1, 1.1, '1.1'), (2, 2.2, '2.2'), (3, 3.3, '3.3')],
+ dtype=datatype)
+ assert_equal(len(a.fill_value.item()), len(datatype))
+
+ b = empty_like(a)
+ assert_equal(b.shape, a.shape)
+ assert_equal(b.fill_value, a.fill_value)
+
+ b = empty(len(a), dtype=datatype)
+ assert_equal(b.shape, a.shape)
+ assert_equal(b.fill_value, a.fill_value)
+
+ # check empty_like mask handling
+ a = masked_array([1, 2, 3], mask=[False, True, False])
+ b = empty_like(a)
+ assert_(not np.may_share_memory(a.mask, b.mask))
+ b = a.view(masked_array)
+ assert_(np.may_share_memory(a.mask, b.mask))
+
+ @suppress_copy_mask_on_assignment
+ def test_put(self):
+ # Tests put.
+ d = arange(5)
+ n = [0, 0, 0, 1, 1]
+ m = make_mask(n)
+ x = array(d, mask=m)
+ assert_(x[3] is masked)
+ assert_(x[4] is masked)
+ x[[1, 4]] = [10, 40]
+ assert_(x[3] is masked)
+ assert_(x[4] is not masked)
+ assert_equal(x, [0, 10, 2, -1, 40])
+
+ x = masked_array(arange(10), mask=[1, 0, 0, 0, 0] * 2)
+ i = [0, 2, 4, 6]
+ x.put(i, [6, 4, 2, 0])
+ assert_equal(x, asarray([6, 1, 4, 3, 2, 5, 0, 7, 8, 9, ]))
+ assert_equal(x.mask, [0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
+ x.put(i, masked_array([0, 2, 4, 6], [1, 0, 1, 0]))
+ assert_array_equal(x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ])
+ assert_equal(x.mask, [1, 0, 0, 0, 1, 1, 0, 0, 0, 0])
+
+ x = masked_array(arange(10), mask=[1, 0, 0, 0, 0] * 2)
+ put(x, i, [6, 4, 2, 0])
+ assert_equal(x, asarray([6, 1, 4, 3, 2, 5, 0, 7, 8, 9, ]))
+ assert_equal(x.mask, [0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
+ put(x, i, masked_array([0, 2, 4, 6], [1, 0, 1, 0]))
+ assert_array_equal(x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ])
+ assert_equal(x.mask, [1, 0, 0, 0, 1, 1, 0, 0, 0, 0])
+
+ def test_put_nomask(self):
+ # GitHub issue 6425
+ x = zeros(10)
+ z = array([3., -1.], mask=[False, True])
+
+ x.put([1, 2], z)
+ assert_(x[0] is not masked)
+ assert_equal(x[0], 0)
+ assert_(x[1] is not masked)
+ assert_equal(x[1], 3)
+ assert_(x[2] is masked)
+ assert_(x[3] is not masked)
+ assert_equal(x[3], 0)
+
+ def test_put_hardmask(self):
+ # Tests put on hardmask
+ d = arange(5)
+ n = [0, 0, 0, 1, 1]
+ m = make_mask(n)
+ xh = array(d + 1, mask=m, hard_mask=True, copy=True)
+ xh.put([4, 2, 0, 1, 3], [1, 2, 3, 4, 5])
+ assert_equal(xh._data, [3, 4, 2, 4, 5])
+
+ def test_putmask(self):
+ x = arange(6) + 1
+ mx = array(x, mask=[0, 0, 0, 1, 1, 1])
+ mask = [0, 0, 1, 0, 0, 1]
+ # w/o mask, w/o masked values
+ xx = x.copy()
+ putmask(xx, mask, 99)
+ assert_equal(xx, [1, 2, 99, 4, 5, 99])
+ # w/ mask, w/o masked values
+ mxx = mx.copy()
+ putmask(mxx, mask, 99)
+ assert_equal(mxx._data, [1, 2, 99, 4, 5, 99])
+ assert_equal(mxx._mask, [0, 0, 0, 1, 1, 0])
+ # w/o mask, w/ masked values
+ values = array([10, 20, 30, 40, 50, 60], mask=[1, 1, 1, 0, 0, 0])
+ xx = x.copy()
+ putmask(xx, mask, values)
+ assert_equal(xx._data, [1, 2, 30, 4, 5, 60])
+ assert_equal(xx._mask, [0, 0, 1, 0, 0, 0])
+ # w/ mask, w/ masked values
+ mxx = mx.copy()
+ putmask(mxx, mask, values)
+ assert_equal(mxx._data, [1, 2, 30, 4, 5, 60])
+ assert_equal(mxx._mask, [0, 0, 1, 1, 1, 0])
+ # w/ mask, w/ masked values + hardmask
+ mxx = mx.copy()
+ mxx.harden_mask()
+ putmask(mxx, mask, values)
+ assert_equal(mxx, [1, 2, 30, 4, 5, 60])
+
+ def test_ravel(self):
+ # Tests ravel
+ a = array([[1, 2, 3, 4, 5]], mask=[[0, 1, 0, 0, 0]])
+ aravel = a.ravel()
+ assert_equal(aravel._mask.shape, aravel.shape)
+ a = array([0, 0], mask=[1, 1])
+ aravel = a.ravel()
+ assert_equal(aravel._mask.shape, a.shape)
+ # Checks that small_mask is preserved
+ a = array([1, 2, 3, 4], mask=[0, 0, 0, 0], shrink=False)
+ assert_equal(a.ravel()._mask, [0, 0, 0, 0])
+ # Test that the fill_value is preserved
+ a.fill_value = -99
+ a.shape = (2, 2)
+ ar = a.ravel()
+ assert_equal(ar._mask, [0, 0, 0, 0])
+ assert_equal(ar._data, [1, 2, 3, 4])
+ assert_equal(ar.fill_value, -99)
+ # Test index ordering
+ assert_equal(a.ravel(order='C'), [1, 2, 3, 4])
+ assert_equal(a.ravel(order='F'), [1, 3, 2, 4])
+
+ def test_reshape(self):
+ # Tests reshape
+ x = arange(4)
+ x[0] = masked
+ y = x.reshape(2, 2)
+ assert_equal(y.shape, (2, 2,))
+ assert_equal(y._mask.shape, (2, 2,))
+ assert_equal(x.shape, (4,))
+ assert_equal(x._mask.shape, (4,))
+
+ def test_sort(self):
+ # Test sort
+ x = array([1, 4, 2, 3], mask=[0, 1, 0, 0], dtype=np.uint8)
+
+ sortedx = sort(x)
+ assert_equal(sortedx._data, [1, 2, 3, 4])
+ assert_equal(sortedx._mask, [0, 0, 0, 1])
+
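+ # endwith=False places masked entries at the front instead of the end.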
+ sortedx = sort(x, endwith=False)
+ assert_equal(sortedx._data, [4, 1, 2, 3])
+ assert_equal(sortedx._mask, [1, 0, 0, 0])
+
+ x.sort()
+ assert_equal(x._data, [1, 2, 3, 4])
+ assert_equal(x._mask, [0, 0, 0, 1])
+
+ x = array([1, 4, 2, 3], mask=[0, 1, 0, 0], dtype=np.uint8)
+ x.sort(endwith=False)
+ assert_equal(x._data, [4, 1, 2, 3])
+ assert_equal(x._mask, [1, 0, 0, 0])
+
+ x = [1, 4, 2, 3]
+ sortedx = sort(x)
+ assert_(not isinstance(sortedx, MaskedArray))
+
+ x = array([0, 1, -1, -2, 2], mask=nomask, dtype=np.int8)
+ sortedx = sort(x, endwith=False)
+ assert_equal(sortedx._data, [-2, -1, 0, 1, 2])
+ x = array([0, 1, -1, -2, 2], mask=[0, 1, 0, 0, 1], dtype=np.int8)
+ sortedx = sort(x, endwith=False)
+ assert_equal(sortedx._data, [1, 2, -2, -1, 0])
+ assert_equal(sortedx._mask, [1, 1, 0, 0, 0])
+
+ def test_stable_sort(self):
+ x = array([1, 2, 3, 1, 2, 3], dtype=np.uint8)
+ expected = array([0, 3, 1, 4, 2, 5])
+ computed = argsort(x, kind='stable')
+ assert_equal(computed, expected)
+
+ def test_argsort_matches_sort(self):
+ x = array([1, 4, 2, 3], mask=[0, 1, 0, 0], dtype=np.uint8)
+
+ for kwargs in [dict(),
+ dict(endwith=True),
+ dict(endwith=False),
+ dict(fill_value=2),
+ dict(fill_value=2, endwith=True),
+ dict(fill_value=2, endwith=False)]:
+ sortedx = sort(x, **kwargs)
+ argsortedx = x[argsort(x, **kwargs)]
+ assert_equal(sortedx._data, argsortedx._data)
+ assert_equal(sortedx._mask, argsortedx._mask)
+
+ def test_sort_2d(self):
+ # Check sort of 2D array.
+ # 2D array w/o mask
+ a = masked_array([[8, 4, 1], [2, 0, 9]])
+ a.sort(0)
+ assert_equal(a, [[2, 0, 1], [8, 4, 9]])
+ a = masked_array([[8, 4, 1], [2, 0, 9]])
+ a.sort(1)
+ assert_equal(a, [[1, 4, 8], [0, 2, 9]])
+ # 2D array w/mask
+ a = masked_array([[8, 4, 1], [2, 0, 9]], mask=[[1, 0, 0], [0, 0, 1]])
+ a.sort(0)
+ assert_equal(a, [[2, 0, 1], [8, 4, 9]])
+ assert_equal(a._mask, [[0, 0, 0], [1, 0, 1]])
+ a = masked_array([[8, 4, 1], [2, 0, 9]], mask=[[1, 0, 0], [0, 0, 1]])
+ a.sort(1)
+ assert_equal(a, [[1, 4, 8], [0, 2, 9]])
+ assert_equal(a._mask, [[0, 0, 1], [0, 0, 1]])
+ # 3D
+ a = masked_array([[[7, 8, 9], [4, 5, 6], [1, 2, 3]],
+ [[1, 2, 3], [7, 8, 9], [4, 5, 6]],
+ [[7, 8, 9], [1, 2, 3], [4, 5, 6]],
+ [[4, 5, 6], [1, 2, 3], [7, 8, 9]]])
+ a[a % 4 == 0] = masked
+ am = a.copy()
+ an = a.filled(99)
+ am.sort(0)
+ an.sort(0)
+ assert_equal(am, an)
+ am = a.copy()
+ an = a.filled(99)
+ am.sort(1)
+ an.sort(1)
+ assert_equal(am, an)
+ am = a.copy()
+ an = a.filled(99)
+ am.sort(2)
+ an.sort(2)
+ assert_equal(am, an)
+
+ def test_sort_flexible(self):
+ # Test sort on structured dtype.
+ a = array(
+ data=[(3, 3), (3, 2), (2, 2), (2, 1), (1, 0), (1, 1), (1, 2)],
+ mask=[(0, 0), (0, 1), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0)],
+ dtype=[('A', int), ('B', int)])
+ mask_last = array(
+ data=[(1, 1), (1, 2), (2, 1), (2, 2), (3, 3), (3, 2), (1, 0)],
+ mask=[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 0)],
+ dtype=[('A', int), ('B', int)])
+ mask_first = array(
+ data=[(1, 0), (1, 1), (1, 2), (2, 1), (2, 2), (3, 2), (3, 3)],
+ mask=[(1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0)],
+ dtype=[('A', int), ('B', int)])
+
+ test = sort(a)
+ assert_equal(test, mask_last)
+ assert_equal(test.mask, mask_last.mask)
+
+ test = sort(a, endwith=False)
+ assert_equal(test, mask_first)
+ assert_equal(test.mask, mask_first.mask)
+
+ # Test sort on dtype with subarray (gh-8069)
+ # Just check that the sort does not error; structured array subarrays
+ # are treated as byte strings, which leads to behavior that differs
+ # depending on endianness and `endwith`.
+ dt = np.dtype([('v', int, 2)])
+ a = a.view(dt)
+ test = sort(a)
+ test = sort(a, endwith=False)
+
+ def test_argsort(self):
+ # Test argsort
+ a = array([1, 5, 2, 4, 3], mask=[1, 0, 0, 1, 0])
+ assert_equal(np.argsort(a), argsort(a))
+
+ def test_squeeze(self):
+ # Check squeeze
+ data = masked_array([[1, 2, 3]])
+ assert_equal(data.squeeze(), [1, 2, 3])
+ data = masked_array([[1, 2, 3]], mask=[[1, 1, 1]])
+ assert_equal(data.squeeze(), [1, 2, 3])
+ assert_equal(data.squeeze()._mask, [1, 1, 1])
+
+ # normal ndarrays return a view
+ arr = np.array([[1]])
+ arr_sq = arr.squeeze()
+ assert_equal(arr_sq, 1)
+ arr_sq[...] = 2
+ assert_equal(arr[0,0], 2)
+
+ # so maskedarrays should too
+ m_arr = masked_array([[1]], mask=True)
+ m_arr_sq = m_arr.squeeze()
+ assert_(m_arr_sq is not np.ma.masked)
+ assert_equal(m_arr_sq.mask, True)
+ m_arr_sq[...] = 2
+ assert_equal(m_arr[0,0], 2)
+
+ def test_swapaxes(self):
+ # Tests swapaxes on MaskedArrays.
+ x = np.array([8.375, 7.545, 8.828, 8.5, 1.757, 5.928,
+ 8.43, 7.78, 9.865, 5.878, 8.979, 4.732,
+ 3.012, 6.022, 5.095, 3.116, 5.238, 3.957,
+ 6.04, 9.63, 7.712, 3.382, 4.489, 6.479,
+ 7.189, 9.645, 5.395, 4.961, 9.894, 2.893,
+ 7.357, 9.828, 6.272, 3.758, 6.693, 0.993])
+ m = np.array([0, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 0, 1, 0])
+ mX = array(x, mask=m).reshape(6, 6)
+ mXX = mX.reshape(3, 2, 2, 3)
+
+ mXswapped = mX.swapaxes(0, 1)
+ assert_equal(mXswapped[-1], mX[:, -1])
+
+ mXXswapped = mXX.swapaxes(0, 2)
+ assert_equal(mXXswapped.shape, (2, 2, 3, 3))
+
+ def test_take(self):
+ # Tests take
+ x = masked_array([10, 20, 30, 40], [0, 1, 0, 1])
+ assert_equal(x.take([0, 0, 3]), masked_array([10, 10, 40], [0, 0, 1]))
+ assert_equal(x.take([0, 0, 3]), x[[0, 0, 3]])
+ assert_equal(x.take([[0, 1], [0, 1]]),
+ masked_array([[10, 20], [10, 20]], [[0, 1], [0, 1]]))
+
+ # assert_equal crashes when passed np.ma.masked
+ assert_(x[1] is np.ma.masked)
+ assert_(x.take(1) is np.ma.masked)
+
+ x = array([[10, 20, 30], [40, 50, 60]], mask=[[0, 0, 1], [1, 0, 0, ]])
+ assert_equal(x.take([0, 2], axis=1),
+ array([[10, 30], [40, 60]], mask=[[0, 1], [1, 0]]))
+ assert_equal(take(x, [0, 2], axis=1),
+ array([[10, 30], [40, 60]], mask=[[0, 1], [1, 0]]))
+
+ def test_take_masked_indices(self):
+ # Test take w/ masked indices
+ a = np.array((40, 18, 37, 9, 22))
+ indices = np.arange(3)[None,:] + np.arange(5)[:, None]
+ mindices = array(indices, mask=(indices >= len(a)))
+ # No mask
+ test = take(a, mindices, mode='clip')
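+ # mode='clip' clamps out-of-range indices to the last valid position instead of masking them.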
+ ctrl = array([[40, 18, 37],
+ [18, 37, 9],
+ [37, 9, 22],
+ [9, 22, 22],
+ [22, 22, 22]])
+ assert_equal(test, ctrl)
+ # Masked indices
+ test = take(a, mindices)
+ ctrl = array([[40, 18, 37],
+ [18, 37, 9],
+ [37, 9, 22],
+ [9, 22, 40],
+ [22, 40, 40]])
+ ctrl[3, 2] = ctrl[4, 1] = ctrl[4, 2] = masked
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, ctrl.mask)
+ # Masked input + masked indices
+ a = array((40, 18, 37, 9, 22), mask=(0, 1, 0, 0, 0))
+ test = take(a, mindices)
+ ctrl[0, 1] = ctrl[1, 0] = masked
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, ctrl.mask)
+
+ def test_tolist(self):
+ # Tests tolist.
+ # ... on 1D
+ x = array(np.arange(12))
+ x[[1, -2]] = masked
+ xlist = x.tolist()
+ assert_(xlist[1] is None)
+ assert_(xlist[-2] is None)
+ # ... on 2D
+ x.shape = (3, 4)
+ xlist = x.tolist()
+ ctrl = [[0, None, 2, 3], [4, 5, 6, 7], [8, 9, None, 11]]
+ assert_equal(xlist[0], [0, None, 2, 3])
+ assert_equal(xlist[1], [4, 5, 6, 7])
+ assert_equal(xlist[2], [8, 9, None, 11])
+ assert_equal(xlist, ctrl)
+ # ... on structured array w/ masked records
+ x = array(list(zip([1, 2, 3],
+ [1.1, 2.2, 3.3],
+ ['one', 'two', 'thr'])),
+ dtype=[('a', int), ('b', float), ('c', '|S8')])
+ x[-1] = masked
+ assert_equal(x.tolist(),
+ [(1, 1.1, b'one'),
+ (2, 2.2, b'two'),
+ (None, None, None)])
+ # ... on structured array w/ masked fields
+ a = array([(1, 2,), (3, 4)], mask=[(0, 1), (0, 0)],
+ dtype=[('a', int), ('b', int)])
+ test = a.tolist()
+ assert_equal(test, [[1, None], [3, 4]])
+ # ... on mvoid
+ a = a[0]
+ test = a.tolist()
+ assert_equal(test, [1, None])
+
+ def test_tolist_specialcase(self):
+ # Test mvoid.tolist: make sure we return a standard Python object
+ a = array([(0, 1), (2, 3)], dtype=[('a', int), ('b', int)])
+ # w/o mask: each entry is a np.void whose elements are standard Python
+ for entry in a:
+ for item in entry.tolist():
+ assert_(not isinstance(item, np.generic))
+ # w/ mask: each entry is a ma.void whose elements should be
+ # standard Python
+ a.mask[0] = (0, 1)
+ for entry in a:
+ for item in entry.tolist():
+ assert_(not isinstance(item, np.generic))
+
+ def test_toflex(self):
+ # Test the conversion to records
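+ # toflex() packs the data and mask into one structured array with '_data' and '_mask' fields.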
+ data = arange(10)
+ record = data.toflex()
+ assert_equal(record['_data'], data._data)
+ assert_equal(record['_mask'], data._mask)
+
+ data[[0, 1, 2, -1]] = masked
+ record = data.toflex()
+ assert_equal(record['_data'], data._data)
+ assert_equal(record['_mask'], data._mask)
+
+ ndtype = [('i', int), ('s', '|S3'), ('f', float)]
+ data = array([(i, s, f) for (i, s, f) in zip(np.arange(10),
+ 'ABCDEFGHIJKLM',
+ np.random.rand(10))],
+ dtype=ndtype)
+ data[[0, 1, 2, -1]] = masked
+ record = data.toflex()
+ assert_equal(record['_data'], data._data)
+ assert_equal(record['_mask'], data._mask)
+
+ ndtype = np.dtype("int, (2,3)float, float")
+ data = array([(i, f, ff) for (i, f, ff) in zip(np.arange(10),
+ np.random.rand(10),
+ np.random.rand(10))],
+ dtype=ndtype)
+ data[[0, 1, 2, -1]] = masked
+ record = data.toflex()
+ assert_equal_records(record['_data'], data._data)
+ assert_equal_records(record['_mask'], data._mask)
+
+ def test_fromflex(self):
+ # Test the reconstruction of a masked_array from a record
+ a = array([1, 2, 3])
+ test = fromflex(a.toflex())
+ assert_equal(test, a)
+ assert_equal(test.mask, a.mask)
+
+ a = array([1, 2, 3], mask=[0, 0, 1])
+ test = fromflex(a.toflex())
+ assert_equal(test, a)
+ assert_equal(test.mask, a.mask)
+
+ a = array([(1, 1.), (2, 2.), (3, 3.)], mask=[(1, 0), (0, 0), (0, 1)],
+ dtype=[('A', int), ('B', float)])
+ test = fromflex(a.toflex())
+ assert_equal(test, a)
+ assert_equal(test.data, a.data)
+
+ def test_arraymethod(self):
+ # Test a _arraymethod w/ n argument
+ marray = masked_array([[1, 2, 3, 4, 5]], mask=[0, 0, 1, 0, 0])
+ control = masked_array([[1], [2], [3], [4], [5]],
+ mask=[0, 0, 1, 0, 0])
+ assert_equal(marray.T, control)
+ assert_equal(marray.transpose(), control)
+
+ assert_equal(MaskedArray.cumsum(marray.T, 0), control.cumsum(0))
+
+ def test_arraymethod_0d(self):
+ # gh-9430
+ x = np.ma.array(42, mask=True)
+ assert_equal(x.T.mask, x.mask)
+ assert_equal(x.T.data, x.data)
+
+ def test_transpose_view(self):
+ x = np.ma.array([[1, 2, 3], [4, 5, 6]])
+ x[0,1] = np.ma.masked
+ xt = x.T
+
+ xt[1,0] = 10
+ xt[0,1] = np.ma.masked
+
+ assert_equal(x.data, xt.T.data)
+ assert_equal(x.mask, xt.T.mask)
+
+ def test_diagonal_view(self):
+ x = np.ma.zeros((3,3))
+ x[0,0] = 10
+ x[1,1] = np.ma.masked
+ x[2,2] = 20
+ xd = x.diagonal()
+ x[1,1] = 15
+ assert_equal(xd.mask, x.diagonal().mask)
+ assert_equal(xd.data, x.diagonal().data)
+
+
+class TestMaskedArrayMathMethods(object):
+
+ def setup(self):
+ # Base data definition.
+ x = np.array([8.375, 7.545, 8.828, 8.5, 1.757, 5.928,
+ 8.43, 7.78, 9.865, 5.878, 8.979, 4.732,
+ 3.012, 6.022, 5.095, 3.116, 5.238, 3.957,
+ 6.04, 9.63, 7.712, 3.382, 4.489, 6.479,
+ 7.189, 9.645, 5.395, 4.961, 9.894, 2.893,
+ 7.357, 9.828, 6.272, 3.758, 6.693, 0.993])
+ X = x.reshape(6, 6)
+ XX = x.reshape(3, 2, 2, 3)
+
+ m = np.array([0, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 0, 1, 0])
+ mx = array(data=x, mask=m)
+ mX = array(data=X, mask=m.reshape(X.shape))
+ mXX = array(data=XX, mask=m.reshape(XX.shape))
+
+ m2 = np.array([1, 1, 0, 1, 0, 0,
+ 1, 1, 1, 1, 0, 1,
+ 0, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 1, 0,
+ 0, 0, 1, 0, 1, 1])
+ m2x = array(data=x, mask=m2)
+ m2X = array(data=X, mask=m2.reshape(X.shape))
+ m2XX = array(data=XX, mask=m2.reshape(XX.shape))
+ self.d = (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX)
+
+ def test_cumsumprod(self):
+ # Tests cumsum & cumprod on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
+ mXcp = mX.cumsum(0)
+ assert_equal(mXcp._data, mX.filled(0).cumsum(0))
+ mXcp = mX.cumsum(1)
+ assert_equal(mXcp._data, mX.filled(0).cumsum(1))
+
+ mXcp = mX.cumprod(0)
+ assert_equal(mXcp._data, mX.filled(1).cumprod(0))
+ mXcp = mX.cumprod(1)
+ assert_equal(mXcp._data, mX.filled(1).cumprod(1))
+
+ def test_cumsumprod_with_output(self):
+ # Tests cumsum/cumprod w/ output
+ xm = array(np.random.uniform(0, 10, 12)).reshape(3, 4)
+ xm[:, 0] = xm[0] = xm[-1, -1] = masked
+
+ for funcname in ('cumsum', 'cumprod'):
+ npfunc = getattr(np, funcname)
+ xmmeth = getattr(xm, funcname)
+
+ # A ndarray as explicit input
+ output = np.empty((3, 4), dtype=float)
+ output.fill(-9999)
+ result = npfunc(xm, axis=0, out=output)
+ # ... the result should be the given output
+ assert_(result is output)
+ assert_equal(result, xmmeth(axis=0, out=output))
+
+ output = empty((3, 4), dtype=int)
+ result = xmmeth(axis=0, out=output)
+ assert_(result is output)
+
+ def test_ptp(self):
+ # Tests ptp on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
+ (n, m) = X.shape
+ assert_equal(mx.ptp(), mx.compressed().ptp())
+ rows = np.zeros(n, float)
+ cols = np.zeros(m, float)
+ for k in range(m):
+ cols[k] = mX[:, k].compressed().ptp()
+ for k in range(n):
+ rows[k] = mX[k].compressed().ptp()
+ assert_equal(mX.ptp(0), cols)
+ assert_equal(mX.ptp(1), rows)
+
+ def test_add_object(self):
+ x = masked_array(['a', 'b'], mask=[1, 0], dtype=object)
+ y = x + 'x'
+ assert_equal(y[1], 'bx')
+ assert_(y.mask[0])
+
+ def test_sum_object(self):
+ # Test sum on object dtype
+ a = masked_array([1, 2, 3], mask=[1, 0, 0], dtype=object)
+ assert_equal(a.sum(), 5)
+ a = masked_array([[1, 2, 3], [4, 5, 6]], dtype=object)
+ assert_equal(a.sum(axis=0), [5, 7, 9])
+
+ def test_prod_object(self):
+ # Test prod on object dtype
+ a = masked_array([1, 2, 3], mask=[1, 0, 0], dtype=object)
+ assert_equal(a.prod(), 2 * 3)
+ a = masked_array([[1, 2, 3], [4, 5, 6]], dtype=object)
+ assert_equal(a.prod(axis=0), [4, 10, 18])
+
+ def test_meananom_object(self):
+ # Test mean/anom on object dtype
+ a = masked_array([1, 2, 3], dtype=object)
+ assert_equal(a.mean(), 2)
+ assert_equal(a.anom(), [-1, 0, 1])
+
+ def test_trace(self):
+ # Tests trace on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
+ mXdiag = mX.diagonal()
+ assert_equal(mX.trace(), mX.diagonal().compressed().sum())
+ assert_almost_equal(mX.trace(),
+ X.trace() - sum(mXdiag.mask * X.diagonal(),
+ axis=0))
+ assert_equal(np.trace(mX), mX.trace())
+
+ # gh-5560
+ arr = np.arange(2*4*4).reshape(2,4,4)
+ m_arr = np.ma.masked_array(arr, False)
+ assert_equal(arr.trace(axis1=1, axis2=2), m_arr.trace(axis1=1, axis2=2))
+
+ def test_dot(self):
+ # Tests dot on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
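+ # ma.dot fills masked entries with 0 before multiplying, so the comparisons use filled(0) data.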
+ fx = mx.filled(0)
+ r = mx.dot(mx)
+ assert_almost_equal(r.filled(0), fx.dot(fx))
+ assert_(r.mask is nomask)
+
+ fX = mX.filled(0)
+ r = mX.dot(mX)
+ assert_almost_equal(r.filled(0), fX.dot(fX))
+ assert_(r.mask[1,3])
+ r1 = empty_like(r)
+ mX.dot(mX, out=r1)
+ assert_almost_equal(r, r1)
+
+ mYY = mXX.swapaxes(-1, -2)
+ fXX, fYY = mXX.filled(0), mYY.filled(0)
+ r = mXX.dot(mYY)
+ assert_almost_equal(r.filled(0), fXX.dot(fYY))
+ r1 = empty_like(r)
+ mXX.dot(mYY, out=r1)
+ assert_almost_equal(r, r1)
+
+ def test_dot_shape_mismatch(self):
+ # regression test
+ x = masked_array([[1,2],[3,4]], mask=[[0,1],[0,0]])
+ y = masked_array([[1,2],[3,4]], mask=[[0,1],[0,0]])
+ z = masked_array([[0,1],[3,3]])
+ x.dot(y, out=z)
+ assert_almost_equal(z.filled(0), [[1, 0], [15, 16]])
+ assert_almost_equal(z.mask, [[0, 1], [0, 0]])
+
+ def test_varmean_nomask(self):
+ # gh-5769
+ foo = array([1,2,3,4], dtype='f8')
+ bar = array([1,2,3,4], dtype='f8')
+ assert_equal(type(foo.mean()), np.float64)
+ assert_equal(type(foo.var()), np.float64)
+ assert((foo.mean() == bar.mean()) is np.bool_(True))
+
+ # check array type is preserved and out works
+ foo = array(np.arange(16).reshape((4,4)), dtype='f8')
+ bar = empty(4, dtype='f4')
+ assert_equal(type(foo.mean(axis=1)), MaskedArray)
+ assert_equal(type(foo.var(axis=1)), MaskedArray)
+ assert_(foo.mean(axis=1, out=bar) is bar)
+ assert_(foo.var(axis=1, out=bar) is bar)
+
+ def test_varstd(self):
+ # Tests var & std on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
+ assert_almost_equal(mX.var(axis=None), mX.compressed().var())
+ assert_almost_equal(mX.std(axis=None), mX.compressed().std())
+ assert_almost_equal(mX.std(axis=None, ddof=1),
+ mX.compressed().std(ddof=1))
+ assert_almost_equal(mX.var(axis=None, ddof=1),
+ mX.compressed().var(ddof=1))
+ assert_equal(mXX.var(axis=3).shape, XX.var(axis=3).shape)
+ assert_equal(mX.var().shape, X.var().shape)
+ (mXvar0, mXvar1) = (mX.var(axis=0), mX.var(axis=1))
+ assert_almost_equal(mX.var(axis=None, ddof=2),
+ mX.compressed().var(ddof=2))
+ assert_almost_equal(mX.std(axis=None, ddof=2),
+ mX.compressed().std(ddof=2))
+ for k in range(6):
+ assert_almost_equal(mXvar1[k], mX[k].compressed().var())
+ assert_almost_equal(mXvar0[k], mX[:, k].compressed().var())
+ assert_almost_equal(np.sqrt(mXvar0[k]),
+ mX[:, k].compressed().std())
+
+ @pytest.mark.skipif(sys.platform=='win32' and sys.version_info < (3, 6),
+ reason='Fails on Python < 3.6 on Windows, gh-9671')
+ @suppress_copy_mask_on_assignment
+ def test_varstd_specialcases(self):
+ # Test a special case for var
+ nout = np.array(-1, dtype=float)
+ mout = array(-1, dtype=float)
+
+ x = array(arange(10), mask=True)
+ for methodname in ('var', 'std'):
+ method = getattr(x, methodname)
+ assert_(method() is masked)
+ assert_(method(0) is masked)
+ assert_(method(-1) is masked)
+ # Using a masked array as explicit output
+ method(out=mout)
+ assert_(mout is not masked)
+ assert_equal(mout.mask, True)
+ # Using a ndarray as explicit output
+ method(out=nout)
+ assert_(np.isnan(nout))
+
+ x = array(arange(10), mask=True)
+ x[-1] = 9
+ for methodname in ('var', 'std'):
+ method = getattr(x, methodname)
+ assert_(method(ddof=1) is masked)
+ assert_(method(0, ddof=1) is masked)
+ assert_(method(-1, ddof=1) is masked)
+ # Using a masked array as explicit output
+ method(out=mout, ddof=1)
+ assert_(mout is not masked)
+ assert_equal(mout.mask, True)
+ # Using a ndarray as explicit output
+ method(out=nout, ddof=1)
+ assert_(np.isnan(nout))
+
+ def test_varstd_ddof(self):
+ a = array([[1, 1, 0], [1, 1, 0]], mask=[[0, 0, 1], [0, 0, 1]])
+ test = a.std(axis=0, ddof=0)
+ assert_equal(test.filled(0), [0, 0, 0])
+ assert_equal(test.mask, [0, 0, 1])
+ test = a.std(axis=0, ddof=1)
+ assert_equal(test.filled(0), [0, 0, 0])
+ assert_equal(test.mask, [0, 0, 1])
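+ # Each column has only two unmasked values, so ddof=2 leaves no degrees of freedom and the result is fully masked.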
+ test = a.std(axis=0, ddof=2)
+ assert_equal(test.filled(0), [0, 0, 0])
+ assert_equal(test.mask, [1, 1, 1])
+
+ def test_diag(self):
+ # Test diag
+ x = arange(9).reshape((3, 3))
+ x[1, 1] = masked
+ out = np.diag(x)
+ assert_equal(out, [0, 4, 8])
+ out = diag(x)
+ assert_equal(out, [0, 4, 8])
+ assert_equal(out.mask, [0, 1, 0])
+ out = diag(out)
+ control = array([[0, 0, 0], [0, 4, 0], [0, 0, 8]],
+ mask=[[0, 0, 0], [0, 1, 0], [0, 0, 0]])
+ assert_equal(out, control)
+
+ def test_axis_methods_nomask(self):
+ # Test the combination nomask & methods w/ axis
+ a = array([[1, 2, 3], [4, 5, 6]])
+
+ assert_equal(a.sum(0), [5, 7, 9])
+ assert_equal(a.sum(-1), [6, 15])
+ assert_equal(a.sum(1), [6, 15])
+
+ assert_equal(a.prod(0), [4, 10, 18])
+ assert_equal(a.prod(-1), [6, 120])
+ assert_equal(a.prod(1), [6, 120])
+
+ assert_equal(a.min(0), [1, 2, 3])
+ assert_equal(a.min(-1), [1, 4])
+ assert_equal(a.min(1), [1, 4])
+
+ assert_equal(a.max(0), [4, 5, 6])
+ assert_equal(a.max(-1), [3, 6])
+ assert_equal(a.max(1), [3, 6])
+
+
+class TestMaskedArrayMathMethodsComplex(object):
+ # Test class for MaskedArray math methods on complex data.
+ def setup(self):
+ # Base data definition.
+ x = np.array([8.375j, 7.545j, 8.828j, 8.5j, 1.757j, 5.928,
+ 8.43, 7.78, 9.865, 5.878, 8.979, 4.732,
+ 3.012, 6.022, 5.095, 3.116, 5.238, 3.957,
+ 6.04, 9.63, 7.712, 3.382, 4.489, 6.479j,
+ 7.189j, 9.645, 5.395, 4.961, 9.894, 2.893,
+ 7.357, 9.828, 6.272, 3.758, 6.693, 0.993j])
+ X = x.reshape(6, 6)
+ XX = x.reshape(3, 2, 2, 3)
+
+ m = np.array([0, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 0, 1, 0])
+ mx = array(data=x, mask=m)
+ mX = array(data=X, mask=m.reshape(X.shape))
+ mXX = array(data=XX, mask=m.reshape(XX.shape))
+
+ m2 = np.array([1, 1, 0, 1, 0, 0,
+ 1, 1, 1, 1, 0, 1,
+ 0, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 1, 0,
+ 0, 0, 1, 0, 1, 1])
+ m2x = array(data=x, mask=m2)
+ m2X = array(data=X, mask=m2.reshape(X.shape))
+ m2XX = array(data=XX, mask=m2.reshape(XX.shape))
+ self.d = (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX)
+
+ def test_varstd(self):
+ # Tests var & std on MaskedArrays.
+ (x, X, XX, m, mx, mX, mXX, m2x, m2X, m2XX) = self.d
+ assert_almost_equal(mX.var(axis=None), mX.compressed().var())
+ assert_almost_equal(mX.std(axis=None), mX.compressed().std())
+ assert_equal(mXX.var(axis=3).shape, XX.var(axis=3).shape)
+ assert_equal(mX.var().shape, X.var().shape)
+ (mXvar0, mXvar1) = (mX.var(axis=0), mX.var(axis=1))
+ assert_almost_equal(mX.var(axis=None, ddof=2),
+ mX.compressed().var(ddof=2))
+ assert_almost_equal(mX.std(axis=None, ddof=2),
+ mX.compressed().std(ddof=2))
+ for k in range(6):
+ assert_almost_equal(mXvar1[k], mX[k].compressed().var())
+ assert_almost_equal(mXvar0[k], mX[:, k].compressed().var())
+ assert_almost_equal(np.sqrt(mXvar0[k]),
+ mX[:, k].compressed().std())
+
+
+class TestMaskedArrayFunctions(object):
+ # Test class for miscellaneous functions.
+
+ def setup(self):
+ x = np.array([1., 1., 1., -2., pi/2.0, 4., 5., -10., 10., 1., 2., 3.])
+ y = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.])
+ m1 = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ m2 = [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1]
+ xm = masked_array(x, mask=m1)
+ ym = masked_array(y, mask=m2)
+ xm.set_fill_value(1e+20)
+ self.info = (xm, ym)
+
+ def test_masked_where_bool(self):
+ x = [1, 2]
+ y = masked_where(False, x)
+ assert_equal(y, [1, 2])
+ assert_equal(y[1], 2)
+
+ def test_masked_equal_wlist(self):
+ x = [1, 2, 3]
+ mx = masked_equal(x, 3)
+ assert_equal(mx, x)
+ assert_equal(mx._mask, [0, 0, 1])
+ mx = masked_not_equal(x, 3)
+ assert_equal(mx, x)
+ assert_equal(mx._mask, [1, 1, 0])
+
+ def test_masked_equal_fill_value(self):
+ x = [1, 2, 3]
+ mx = masked_equal(x, 3)
+ assert_equal(mx._mask, [0, 0, 1])
+ assert_equal(mx.fill_value, 3)
+
+ def test_masked_where_condition(self):
+ # Tests masking functions.
+ x = array([1., 2., 3., 4., 5.])
+ x[2] = masked
+ assert_equal(masked_where(greater(x, 2), x), masked_greater(x, 2))
+ assert_equal(masked_where(greater_equal(x, 2), x),
+ masked_greater_equal(x, 2))
+ assert_equal(masked_where(less(x, 2), x), masked_less(x, 2))
+ assert_equal(masked_where(less_equal(x, 2), x),
+ masked_less_equal(x, 2))
+ assert_equal(masked_where(not_equal(x, 2), x), masked_not_equal(x, 2))
+ assert_equal(masked_where(equal(x, 2), x), masked_equal(x, 2))
+ assert_equal(masked_where(not_equal(x, 2), x), masked_not_equal(x, 2))
+ assert_equal(masked_where([1, 1, 0, 0, 0], [1, 2, 3, 4, 5]),
+ [99, 99, 3, 4, 5])
+
+ def test_masked_where_oddities(self):
+ # Tests some generic features.
+ atest = ones((10, 10, 10), dtype=float)
+ btest = zeros(atest.shape, MaskType)
+ ctest = masked_where(btest, atest)
+ assert_equal(atest, ctest)
+
+ def test_masked_where_shape_constraint(self):
+ a = arange(10)
+ with assert_raises(IndexError):
+ masked_equal(1, a)
+ test = masked_equal(a, 1)
+ assert_equal(test.mask, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0])
+
+ def test_masked_where_structured(self):
+ # test that masked_where on a structured array sets a structured
+ # mask (see issue #2972)
+ a = np.zeros(10, dtype=[("A", "<f2"), ("B", "<f4")])
+ am = np.ma.masked_where(a["A"] < 5, a)
+ assert_equal(am.mask.dtype.names, am.dtype.names)
+ assert_equal(am["A"],
+ np.ma.masked_array(np.zeros(10), np.ones(10)))
+
+ def test_masked_where_mismatch(self):
+ # gh-4520
+ x = np.arange(10)
+ y = np.arange(5)
+ assert_raises(IndexError, np.ma.masked_where, y > 6, x)
+
+ def test_masked_otherfunctions(self):
+ assert_equal(masked_inside(list(range(5)), 1, 3),
+ [0, 199, 199, 199, 4])
+ assert_equal(masked_outside(list(range(5)), 1, 3), [199, 1, 2, 3, 199])
+ assert_equal(masked_inside(array(list(range(5)),
+ mask=[1, 0, 0, 0, 0]), 1, 3).mask,
+ [1, 1, 1, 1, 0])
+ assert_equal(masked_outside(array(list(range(5)),
+ mask=[0, 1, 0, 0, 0]), 1, 3).mask,
+ [1, 1, 0, 0, 1])
+ assert_equal(masked_equal(array(list(range(5)),
+ mask=[1, 0, 0, 0, 0]), 2).mask,
+ [1, 0, 1, 0, 0])
+ assert_equal(masked_not_equal(array([2, 2, 1, 2, 1],
+ mask=[1, 0, 0, 0, 0]), 2).mask,
+ [1, 0, 1, 0, 1])
+
+ def test_round(self):
+ a = array([1.23456, 2.34567, 3.45678, 4.56789, 5.67890],
+ mask=[0, 1, 0, 0, 0])
+ assert_equal(a.round(), [1., 2., 3., 5., 6.])
+ assert_equal(a.round(1), [1.2, 2.3, 3.5, 4.6, 5.7])
+ assert_equal(a.round(3), [1.235, 2.346, 3.457, 4.568, 5.679])
+ b = empty_like(a)
+ a.round(out=b)
+ assert_equal(b, [1., 2., 3., 5., 6.])
+
+ x = array([1., 2., 3., 4., 5.])
+ c = array([1, 1, 1, 0, 0])
+ x[2] = masked
+ z = where(c, x, -x)
+ assert_equal(z, [1., 2., 0., -4., -5])
+ c[0] = masked
+ z = where(c, x, -x)
+ assert_equal(z, [1., 2., 0., -4., -5])
+ assert_(z[0] is masked)
+ assert_(z[1] is not masked)
+ assert_(z[2] is masked)
+
+ def test_round_with_output(self):
+ # Testing round with an explicit output
+
+ xm = array(np.random.uniform(0, 10, 12)).reshape(3, 4)
+ xm[:, 0] = xm[0] = xm[-1, -1] = masked
+
+ # A ndarray as explicit input
+ output = np.empty((3, 4), dtype=float)
+ output.fill(-9999)
+ result = np.round(xm, decimals=2, out=output)
+ # ... the result should be the given output
+ assert_(result is output)
+ assert_equal(result, xm.round(decimals=2, out=output))
+
+ output = empty((3, 4), dtype=float)
+ result = xm.round(decimals=2, out=output)
+ assert_(result is output)
+
+ def test_round_with_scalar(self):
+ # Testing round with scalar/zero dimension input
+ # GH issue 2244
+ a = array(1.1, mask=[False])
+ assert_equal(a.round(), 1)
+
+ a = array(1.1, mask=[True])
+ assert_(a.round() is masked)
+
+ a = array(1.1, mask=[False])
+ output = np.empty(1, dtype=float)
+ output.fill(-9999)
+ a.round(out=output)
+ assert_equal(output, 1)
+
+ a = array(1.1, mask=[False])
+ output = array(-9999., mask=[True])
+ a.round(out=output)
+ assert_equal(output[()], 1)
+
+ a = array(1.1, mask=[True])
+ output = array(-9999., mask=[False])
+ a.round(out=output)
+ assert_(output[()] is masked)
+
+ def test_identity(self):
+ a = identity(5)
+ assert_(isinstance(a, MaskedArray))
+ assert_equal(a, np.identity(5))
+
+ def test_power(self):
+ x = -1.1
+ assert_almost_equal(power(x, 2.), 1.21)
+ assert_(power(x, masked) is masked)
+ x = array([-1.1, -1.1, 1.1, 1.1, 0.])
+ b = array([0.5, 2., 0.5, 2., -1.], mask=[0, 0, 0, 0, 1])
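+ # (-1.1) ** 0.5 is invalid for real-valued power, so that entry is masked along with the masked exponent.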
+ y = power(x, b)
+ assert_almost_equal(y, [0, 1.21, 1.04880884817, 1.21, 0.])
+ assert_equal(y._mask, [1, 0, 0, 0, 1])
+ b.mask = nomask
+ y = power(x, b)
+ assert_equal(y._mask, [1, 0, 0, 0, 1])
+ z = x ** b
+ assert_equal(z._mask, y._mask)
+ assert_almost_equal(z, y)
+ assert_almost_equal(z._data, y._data)
+ x **= b
+ assert_equal(x._mask, y._mask)
+ assert_almost_equal(x, y)
+ assert_almost_equal(x._data, y._data)
+
+ def test_power_with_broadcasting(self):
+ # Test power w/ broadcasting
+ a2 = np.array([[1., 2., 3.], [4., 5., 6.]])
+ a2m = array(a2, mask=[[1, 0, 0], [0, 0, 1]])
+ b1 = np.array([2, 4, 3])
+ b2 = np.array([b1, b1])
+ b2m = array(b2, mask=[[0, 1, 0], [0, 1, 0]])
+
+ ctrl = array([[1 ** 2, 2 ** 4, 3 ** 3], [4 ** 2, 5 ** 4, 6 ** 3]],
+ mask=[[1, 1, 0], [0, 1, 1]])
+ # No broadcasting, base & exp w/ mask
+ test = a2m ** b2m
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, ctrl.mask)
+ # No broadcasting, base w/ mask, exp w/o mask
+ test = a2m ** b2
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, a2m.mask)
+ # No broadcasting, base w/o mask, exp w/ mask
+ test = a2 ** b2m
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, b2m.mask)
+
+ ctrl = array([[2 ** 2, 4 ** 4, 3 ** 3], [2 ** 2, 4 ** 4, 3 ** 3]],
+ mask=[[0, 1, 0], [0, 1, 0]])
+ test = b1 ** b2m
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, ctrl.mask)
+ test = b2m ** b1
+ assert_equal(test, ctrl)
+ assert_equal(test.mask, ctrl.mask)
+
+ def test_where(self):
+ # Test the where function
+ x = np.array([1., 1., 1., -2., pi/2.0, 4., 5., -10., 10., 1., 2., 3.])
+ y = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.])
+ m1 = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ m2 = [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1]
+ xm = masked_array(x, mask=m1)
+ ym = masked_array(y, mask=m2)
+ xm.set_fill_value(1e+20)
+
+ d = where(xm > 2, xm, -9)
+ assert_equal(d, [-9., -9., -9., -9., -9., 4.,
+ -9., -9., 10., -9., -9., 3.])
+ assert_equal(d._mask, xm._mask)
+ d = where(xm > 2, -9, ym)
+ assert_equal(d, [5., 0., 3., 2., -1., -9.,
+ -9., -10., -9., 1., 0., -9.])
+ assert_equal(d._mask, [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])
+ d = where(xm > 2, xm, masked)
+ assert_equal(d, [-9., -9., -9., -9., -9., 4.,
+ -9., -9., 10., -9., -9., 3.])
+ tmp = xm._mask.copy()
+ tmp[(xm <= 2).filled(True)] = True
+ assert_equal(d._mask, tmp)
+
+ ixm = xm.astype(int)
+ d = where(ixm > 2, ixm, masked)
+ assert_equal(d, [-9, -9, -9, -9, -9, 4, -9, -9, 10, -9, -9, 3])
+ assert_equal(d.dtype, ixm.dtype)
+
+ def test_where_object(self):
+ a = np.array(None)
+ b = masked_array(None)
+ r = b.copy()
+ assert_equal(np.ma.where(True, a, a), r)
+ assert_equal(np.ma.where(True, b, b), r)
+
+ def test_where_with_masked_choice(self):
+ x = arange(10)
+ x[3] = masked
+ c = x >= 8
+ # Set False to masked
+ z = where(c, x, masked)
+ assert_(z.dtype is x.dtype)
+ assert_(z[3] is masked)
+ assert_(z[4] is masked)
+ assert_(z[7] is masked)
+ assert_(z[8] is not masked)
+ assert_(z[9] is not masked)
+ assert_equal(x, z)
+ # Set True to masked
+ z = where(c, masked, x)
+ assert_(z.dtype is x.dtype)
+ assert_(z[3] is masked)
+ assert_(z[4] is not masked)
+ assert_(z[7] is not masked)
+ assert_(z[8] is masked)
+ assert_(z[9] is masked)
+
+ def test_where_with_masked_condition(self):
+ x = array([1., 2., 3., 4., 5.])
+ c = array([1, 1, 1, 0, 0])
+ x[2] = masked
+ z = where(c, x, -x)
+ assert_equal(z, [1., 2., 0., -4., -5])
+ c[0] = masked
+ z = where(c, x, -x)
+ assert_equal(z, [1., 2., 0., -4., -5])
+ assert_(z[0] is masked)
+ assert_(z[1] is not masked)
+ assert_(z[2] is masked)
+
+ x = arange(1, 6)
+ x[-1] = masked
+ y = arange(1, 6) * 10
+ y[2] = masked
+ c = array([1, 1, 1, 0, 0], mask=[1, 0, 0, 0, 0])
+ cm = c.filled(1)
+ z = where(c, x, y)
+ zm = where(cm, x, y)
+ assert_equal(z, zm)
+ assert_(getmask(zm) is nomask)
+ assert_equal(zm, [1, 2, 3, 40, 50])
+ z = where(c, masked, 1)
+ assert_equal(z, [99, 99, 99, 1, 1])
+ z = where(c, 1, masked)
+ assert_equal(z, [99, 1, 1, 99, 99])
+
+ def test_where_type(self):
+ # Test the type conservation with where
+ x = np.arange(4, dtype=np.int32)
+ y = np.arange(4, dtype=np.float32) * 2.2
+ test = where(x > 1.5, y, x).dtype
+ control = np.find_common_type([np.int32, np.float32], [])
+ assert_equal(test, control)
+
+ def test_where_broadcast(self):
+ # Issue 8599
+ x = np.arange(9).reshape(3, 3)
+ y = np.zeros(3)
+ core = np.where([1, 0, 1], x, y)
+ ma = where([1, 0, 1], x, y)
+
+ assert_equal(core, ma)
+ assert_equal(core.dtype, ma.dtype)
+
+ def test_where_structured(self):
+ # Issue 8600
+ dt = np.dtype([('a', int), ('b', int)])
+ x = np.array([(1, 2), (3, 4), (5, 6)], dtype=dt)
+ y = np.array((10, 20), dtype=dt)
+ core = np.where([0, 1, 1], x, y)
+        ma = where([0, 1, 1], x, y)
+
+ assert_equal(core, ma)
+ assert_equal(core.dtype, ma.dtype)
+
+ def test_where_structured_masked(self):
+ dt = np.dtype([('a', int), ('b', int)])
+ x = np.array([(1, 2), (3, 4), (5, 6)], dtype=dt)
+
+ ma = where([0, 1, 1], x, masked)
+ expected = masked_where([1, 0, 0], x)
+
+ assert_equal(ma.dtype, expected.dtype)
+ assert_equal(ma, expected)
+ assert_equal(ma.mask, expected.mask)
+
+ def test_choose(self):
+ # Test choose
+ choices = [[0, 1, 2, 3], [10, 11, 12, 13],
+ [20, 21, 22, 23], [30, 31, 32, 33]]
+ chosen = choose([2, 3, 1, 0], choices)
+ assert_equal(chosen, array([20, 31, 12, 3]))
+ chosen = choose([2, 4, 1, 0], choices, mode='clip')
+ assert_equal(chosen, array([20, 31, 12, 3]))
+ chosen = choose([2, 4, 1, 0], choices, mode='wrap')
+ assert_equal(chosen, array([20, 1, 12, 3]))
+ # Check with some masked indices
+ indices_ = array([2, 4, 1, 0], mask=[1, 0, 0, 1])
+ chosen = choose(indices_, choices, mode='wrap')
+ assert_equal(chosen, array([99, 1, 12, 99]))
+ assert_equal(chosen.mask, [1, 0, 0, 1])
+ # Check with some masked choices
+ choices = array(choices, mask=[[0, 0, 0, 1], [1, 1, 0, 1],
+ [1, 0, 0, 0], [0, 0, 0, 0]])
+ indices_ = [2, 3, 1, 0]
+ chosen = choose(indices_, choices, mode='wrap')
+ assert_equal(chosen, array([20, 31, 12, 3]))
+ assert_equal(chosen.mask, [1, 0, 0, 1])
+
+ def test_choose_with_out(self):
+ # Test choose with an explicit out keyword
+ choices = [[0, 1, 2, 3], [10, 11, 12, 13],
+ [20, 21, 22, 23], [30, 31, 32, 33]]
+ store = empty(4, dtype=int)
+ chosen = choose([2, 3, 1, 0], choices, out=store)
+ assert_equal(store, array([20, 31, 12, 3]))
+ assert_(store is chosen)
+ # Check with some masked indices + out
+ store = empty(4, dtype=int)
+ indices_ = array([2, 3, 1, 0], mask=[1, 0, 0, 1])
+ chosen = choose(indices_, choices, mode='wrap', out=store)
+ assert_equal(store, array([99, 31, 12, 99]))
+ assert_equal(store.mask, [1, 0, 0, 1])
+        # Check with some masked choices + out in an ndarray
+ choices = array(choices, mask=[[0, 0, 0, 1], [1, 1, 0, 1],
+ [1, 0, 0, 0], [0, 0, 0, 0]])
+ indices_ = [2, 3, 1, 0]
+ store = empty(4, dtype=int).view(ndarray)
+ chosen = choose(indices_, choices, mode='wrap', out=store)
+ assert_equal(store, array([999999, 31, 12, 999999]))
+
+ def test_reshape(self):
+ a = arange(10)
+ a[0] = masked
+ # Try the default
+ b = a.reshape((5, 2))
+ assert_equal(b.shape, (5, 2))
+ assert_(b.flags['C'])
+        # Try w/ arguments passed separately instead of as a tuple
+ b = a.reshape(5, 2)
+ assert_equal(b.shape, (5, 2))
+ assert_(b.flags['C'])
+ # Try w/ order
+ b = a.reshape((5, 2), order='F')
+ assert_equal(b.shape, (5, 2))
+ assert_(b.flags['F'])
+ # Try w/ order
+ b = a.reshape(5, 2, order='F')
+ assert_equal(b.shape, (5, 2))
+ assert_(b.flags['F'])
+
+ c = np.reshape(a, (2, 5))
+ assert_(isinstance(c, MaskedArray))
+ assert_equal(c.shape, (2, 5))
+ assert_(c[0, 0] is masked)
+ assert_(c.flags['C'])
+
+ def test_make_mask_descr(self):
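+        # make_mask_descr maps a data dtype to the matching mask dtype: every
+        # field (recursively, for nested dtypes) is converted to bool.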
+ # Flexible
+ ntype = [('a', float), ('b', float)]
+ test = make_mask_descr(ntype)
+ assert_equal(test, [('a', bool), ('b', bool)])
+ assert_(test is make_mask_descr(test))
+
+ # Standard w/ shape
+ ntype = (float, 2)
+ test = make_mask_descr(ntype)
+ assert_equal(test, (bool, 2))
+ assert_(test is make_mask_descr(test))
+
+        # Standard w/o shape
+ ntype = float
+ test = make_mask_descr(ntype)
+ assert_equal(test, np.dtype(bool))
+ assert_(test is make_mask_descr(test))
+
+ # Nested
+ ntype = [('a', float), ('b', [('ba', float), ('bb', float)])]
+ test = make_mask_descr(ntype)
+ control = np.dtype([('a', 'b1'), ('b', [('ba', 'b1'), ('bb', 'b1')])])
+ assert_equal(test, control)
+ assert_(test is make_mask_descr(test))
+
+        # Named + shape
+ ntype = [('a', (float, 2))]
+ test = make_mask_descr(ntype)
+ assert_equal(test, np.dtype([('a', (bool, 2))]))
+ assert_(test is make_mask_descr(test))
+
+ # 2 names
+ ntype = [(('A', 'a'), float)]
+ test = make_mask_descr(ntype)
+ assert_equal(test, np.dtype([(('A', 'a'), bool)]))
+ assert_(test is make_mask_descr(test))
+
+ # nested boolean types should preserve identity
+ base_type = np.dtype([('a', int, 3)])
+ base_mtype = make_mask_descr(base_type)
+ sub_type = np.dtype([('a', int), ('b', base_mtype)])
+ test = make_mask_descr(sub_type)
+ assert_equal(test, np.dtype([('a', bool), ('b', [('a', bool, 3)])]))
+ assert_(test.fields['b'][0] is base_mtype)
+
+ def test_make_mask(self):
+ # Test make_mask
+ # w/ a list as an input
+ mask = [0, 1]
+ test = make_mask(mask)
+ assert_equal(test.dtype, MaskType)
+ assert_equal(test, [0, 1])
+ # w/ a ndarray as an input
+ mask = np.array([0, 1], dtype=bool)
+ test = make_mask(mask)
+ assert_equal(test.dtype, MaskType)
+ assert_equal(test, [0, 1])
+ # w/ a flexible-type ndarray as an input - use default
+ mdtype = [('a', bool), ('b', bool)]
+ mask = np.array([(0, 0), (0, 1)], dtype=mdtype)
+ test = make_mask(mask)
+ assert_equal(test.dtype, MaskType)
+ assert_equal(test, [1, 1])
+ # w/ a flexible-type ndarray as an input - use input dtype
+ mdtype = [('a', bool), ('b', bool)]
+ mask = np.array([(0, 0), (0, 1)], dtype=mdtype)
+ test = make_mask(mask, dtype=mask.dtype)
+ assert_equal(test.dtype, mdtype)
+ assert_equal(test, mask)
+ # w/ a flexible-type ndarray as an input - use input dtype
+ mdtype = [('a', float), ('b', float)]
+ bdtype = [('a', bool), ('b', bool)]
+ mask = np.array([(0, 0), (0, 1)], dtype=mdtype)
+ test = make_mask(mask, dtype=mask.dtype)
+ assert_equal(test.dtype, bdtype)
+ assert_equal(test, np.array([(0, 0), (0, 1)], dtype=bdtype))
+ # Ensure this also works for void
+ mask = np.array((False, True), dtype='?,?')[()]
+ assert_(isinstance(mask, np.void))
+ test = make_mask(mask, dtype=mask.dtype)
+ assert_equal(test, mask)
+ assert_(test is not mask)
+ mask = np.array((0, 1), dtype='i4,i4')[()]
+ test2 = make_mask(mask, dtype=mask.dtype)
+ assert_equal(test2, test)
+ # test that nomask is returned when m is nomask.
+ bools = [True, False]
+ dtypes = [MaskType, float]
+ msgformat = 'copy=%s, shrink=%s, dtype=%s'
+ for cpy, shr, dt in itertools.product(bools, bools, dtypes):
+ res = make_mask(nomask, copy=cpy, shrink=shr, dtype=dt)
+ assert_(res is nomask, msgformat % (cpy, shr, dt))
+
+ def test_mask_or(self):
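+        # mask_or combines two masks with logical_or; nomask/False operands
+        # short-circuit to the other mask.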
+ # Initialize
+ mtype = [('a', bool), ('b', bool)]
+ mask = np.array([(0, 0), (0, 1), (1, 0), (0, 0)], dtype=mtype)
+ # Test using nomask as input
+ test = mask_or(mask, nomask)
+ assert_equal(test, mask)
+ test = mask_or(nomask, mask)
+ assert_equal(test, mask)
+ # Using False as input
+ test = mask_or(mask, False)
+ assert_equal(test, mask)
+ # Using another array w / the same dtype
+ other = np.array([(0, 1), (0, 1), (0, 1), (0, 1)], dtype=mtype)
+ test = mask_or(mask, other)
+ control = np.array([(0, 1), (0, 1), (1, 1), (0, 1)], dtype=mtype)
+ assert_equal(test, control)
+ # Using another array w / a different dtype
+ othertype = [('A', bool), ('B', bool)]
+ other = np.array([(0, 1), (0, 1), (0, 1), (0, 1)], dtype=othertype)
+ try:
+ test = mask_or(mask, other)
+ except ValueError:
+ pass
+ # Using nested arrays
+ dtype = [('a', bool), ('b', [('ba', bool), ('bb', bool)])]
+ amask = np.array([(0, (1, 0)), (0, (1, 0))], dtype=dtype)
+ bmask = np.array([(1, (0, 1)), (0, (0, 0))], dtype=dtype)
+ cntrl = np.array([(1, (1, 1)), (0, (1, 0))], dtype=dtype)
+ assert_equal(mask_or(amask, bmask), cntrl)
+
+ def test_flatten_mask(self):
+ # Tests flatten mask
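+        # flatten_mask collapses a (possibly nested) structured mask into a
+        # flat boolean array.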
+ # Standard dtype
+ mask = np.array([0, 0, 1], dtype=bool)
+ assert_equal(flatten_mask(mask), mask)
+ # Flexible dtype
+ mask = np.array([(0, 0), (0, 1)], dtype=[('a', bool), ('b', bool)])
+ test = flatten_mask(mask)
+ control = np.array([0, 0, 0, 1], dtype=bool)
+ assert_equal(test, control)
+
+ mdtype = [('a', bool), ('b', [('ba', bool), ('bb', bool)])]
+ data = [(0, (0, 0)), (0, (0, 1))]
+ mask = np.array(data, dtype=mdtype)
+ test = flatten_mask(mask)
+ control = np.array([0, 0, 0, 0, 0, 1], dtype=bool)
+ assert_equal(test, control)
+
+ def test_on_ndarray(self):
+ # Test functions on ndarrays
+ a = np.array([1, 2, 3, 4])
+ m = array(a, mask=False)
+ test = anom(a)
+ assert_equal(test, m.anom())
+ test = reshape(a, (2, 2))
+ assert_equal(test, m.reshape(2, 2))
+
+ def test_compress(self):
+ # Test compress function on ndarray and masked array
+ # Address Github #2495.
+ arr = np.arange(8)
+ arr.shape = 4, 2
+ cond = np.array([True, False, True, True])
+ control = arr[[0, 2, 3]]
+ test = np.ma.compress(cond, arr, axis=0)
+ assert_equal(test, control)
+ marr = np.ma.array(arr)
+ test = np.ma.compress(cond, marr, axis=0)
+ assert_equal(test, control)
+
+ def test_compressed(self):
+ # Test ma.compressed function.
+ # Address gh-4026
+ a = np.ma.array([1, 2])
+ test = np.ma.compressed(a)
+ assert_(type(test) is np.ndarray)
+
+ # Test case when input data is ndarray subclass
+ class A(np.ndarray):
+ pass
+
+ a = np.ma.array(A(shape=0))
+ test = np.ma.compressed(a)
+ assert_(type(test) is A)
+
+        # Test that compressed flattens
+ test = np.ma.compressed([[1],[2]])
+ assert_equal(test.ndim, 1)
+ test = np.ma.compressed([[[[[1]]]]])
+ assert_equal(test.ndim, 1)
+
+ # Test case when input is MaskedArray subclass
+ class M(MaskedArray):
+ pass
+
+ test = np.ma.compressed(M(shape=(0,1,2)))
+ assert_equal(test.ndim, 1)
+
+ # with .compressed() overridden
+ class M(MaskedArray):
+ def compressed(self):
+ return 42
+
+ test = np.ma.compressed(M(shape=(0,1,2)))
+ assert_equal(test, 42)
+
+ def test_convolve(self):
+ a = masked_equal(np.arange(5), 2)
+ b = np.array([1, 1])
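+        # By default the mask propagates: any output whose window touches the
+        # masked a[2] is masked.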
+ test = np.ma.convolve(a, b)
+ assert_equal(test, masked_equal([0, 1, -1, -1, 7, 4], -1))
+
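+        # With propagate_mask=False, masked entries are excluded from the sums
+        # and an output is masked only if no unmasked value contributed to it.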
+ test = np.ma.convolve(a, b, propagate_mask=False)
+ assert_equal(test, masked_equal([0, 1, 1, 3, 7, 4], -1))
+
+ test = np.ma.convolve([1, 1], [1, 1, 1])
+ assert_equal(test, masked_equal([1, 2, 2, 1], -1))
+
+ a = [1, 1]
+ b = masked_equal([1, -1, -1, 1], -1)
+ test = np.ma.convolve(a, b, propagate_mask=False)
+ assert_equal(test, masked_equal([1, 1, -1, 1, 1], -1))
+ test = np.ma.convolve(a, b, propagate_mask=True)
+ assert_equal(test, masked_equal([-1, -1, -1, -1, -1], -1))
+
+
+class TestMaskedFields(object):
+
+ def setup(self):
+ ilist = [1, 2, 3, 4, 5]
+ flist = [1.1, 2.2, 3.3, 4.4, 5.5]
+ slist = ['one', 'two', 'three', 'four', 'five']
+ ddtype = [('a', int), ('b', float), ('c', '|S8')]
+ mdtype = [('a', bool), ('b', bool), ('c', bool)]
+ mask = [0, 1, 0, 0, 1]
+ base = array(list(zip(ilist, flist, slist)), mask=mask, dtype=ddtype)
+ self.data = dict(base=base, mask=mask, ddtype=ddtype, mdtype=mdtype)
+
+ def test_set_records_masks(self):
+ base = self.data['base']
+ mdtype = self.data['mdtype']
+ # Set w/ nomask or masked
+ base.mask = nomask
+ assert_equal_records(base._mask, np.zeros(base.shape, dtype=mdtype))
+ base.mask = masked
+ assert_equal_records(base._mask, np.ones(base.shape, dtype=mdtype))
+ # Set w/ simple boolean
+ base.mask = False
+ assert_equal_records(base._mask, np.zeros(base.shape, dtype=mdtype))
+ base.mask = True
+ assert_equal_records(base._mask, np.ones(base.shape, dtype=mdtype))
+ # Set w/ list
+ base.mask = [0, 0, 0, 1, 1]
+ assert_equal_records(base._mask,
+ np.array([(x, x, x) for x in [0, 0, 0, 1, 1]],
+ dtype=mdtype))
+
+ def test_set_record_element(self):
+        # Check setting an element of a record
+ base = self.data['base']
+ (base_a, base_b, base_c) = (base['a'], base['b'], base['c'])
+ base[0] = (pi, pi, 'pi')
+
+ assert_equal(base_a.dtype, int)
+ assert_equal(base_a._data, [3, 2, 3, 4, 5])
+
+ assert_equal(base_b.dtype, float)
+ assert_equal(base_b._data, [pi, 2.2, 3.3, 4.4, 5.5])
+
+ assert_equal(base_c.dtype, '|S8')
+ assert_equal(base_c._data,
+ [b'pi', b'two', b'three', b'four', b'five'])
+
+ def test_set_record_slice(self):
+ base = self.data['base']
+ (base_a, base_b, base_c) = (base['a'], base['b'], base['c'])
+ base[:3] = (pi, pi, 'pi')
+
+ assert_equal(base_a.dtype, int)
+ assert_equal(base_a._data, [3, 3, 3, 4, 5])
+
+ assert_equal(base_b.dtype, float)
+ assert_equal(base_b._data, [pi, pi, pi, 4.4, 5.5])
+
+ assert_equal(base_c.dtype, '|S8')
+ assert_equal(base_c._data,
+ [b'pi', b'pi', b'pi', b'four', b'five'])
+
+ def test_mask_element(self):
+ "Check record access"
+ base = self.data['base']
+ base[0] = masked
+
+ for n in ('a', 'b', 'c'):
+ assert_equal(base[n].mask, [1, 1, 0, 0, 1])
+ assert_equal(base[n]._data, base._data[n])
+
+ def test_getmaskarray(self):
+ # Test getmaskarray on flexible dtype
+ ndtype = [('a', int), ('b', float)]
+ test = empty(3, dtype=ndtype)
+ assert_equal(getmaskarray(test),
+ np.array([(0, 0), (0, 0), (0, 0)],
+ dtype=[('a', '|b1'), ('b', '|b1')]))
+ test[:] = masked
+ assert_equal(getmaskarray(test),
+ np.array([(1, 1), (1, 1), (1, 1)],
+ dtype=[('a', '|b1'), ('b', '|b1')]))
+
+ def test_view(self):
+ # Test view w/ flexible dtype
+ iterator = list(zip(np.arange(10), np.random.rand(10)))
+ data = np.array(iterator)
+ a = array(iterator, dtype=[('a', float), ('b', float)])
+ a.mask[0] = (1, 0)
+ controlmask = np.array([1] + 19 * [0], dtype=bool)
+ # Transform globally to simple dtype
+ test = a.view(float)
+ assert_equal(test, data.ravel())
+ assert_equal(test.mask, controlmask)
+        # Transform globally to a subdtype
+ test = a.view((float, 2))
+ assert_equal(test, data)
+ assert_equal(test.mask, controlmask.reshape(-1, 2))
+
+ def test_getitem(self):
+ ndtype = [('a', float), ('b', float)]
+ a = array(list(zip(np.random.rand(10), np.arange(10))), dtype=ndtype)
+ a.mask = np.array(list(zip([0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+ [1, 0, 0, 0, 0, 0, 0, 0, 1, 0])),
+ dtype=[('a', bool), ('b', bool)])
+
+ def _test_index(i):
+ assert_equal(type(a[i]), mvoid)
+ assert_equal_records(a[i]._data, a._data[i])
+ assert_equal_records(a[i]._mask, a._mask[i])
+
+ assert_equal(type(a[i, ...]), MaskedArray)
+ assert_equal_records(a[i,...]._data, a._data[i,...])
+ assert_equal_records(a[i,...]._mask, a._mask[i,...])
+
+ _test_index(1) # No mask
+ _test_index(0) # One element masked
+        _test_index(-2)  # All elements masked
+
+ def test_setitem(self):
+ # Issue 4866: check that one can set individual items in [record][col]
+ # and [col][record] order
+ ndtype = np.dtype([('a', float), ('b', int)])
+ ma = np.ma.MaskedArray([(1.0, 1), (2.0, 2)], dtype=ndtype)
+ ma['a'][1] = 3.0
+ assert_equal(ma['a'], np.array([1.0, 3.0]))
+ ma[1]['a'] = 4.0
+ assert_equal(ma['a'], np.array([1.0, 4.0]))
+ # Issue 2403
+ mdtype = np.dtype([('a', bool), ('b', bool)])
+ # soft mask
+ control = np.array([(False, True), (True, True)], dtype=mdtype)
+ a = np.ma.masked_all((2,), dtype=ndtype)
+ a['a'][0] = 2
+ assert_equal(a.mask, control)
+ a = np.ma.masked_all((2,), dtype=ndtype)
+ a[0]['a'] = 2
+ assert_equal(a.mask, control)
+ # hard mask
+ control = np.array([(True, True), (True, True)], dtype=mdtype)
+ a = np.ma.masked_all((2,), dtype=ndtype)
+ a.harden_mask()
+ a['a'][0] = 2
+ assert_equal(a.mask, control)
+ a = np.ma.masked_all((2,), dtype=ndtype)
+ a.harden_mask()
+ a[0]['a'] = 2
+ assert_equal(a.mask, control)
+
+ def test_setitem_scalar(self):
+ # 8510
+ mask_0d = np.ma.masked_array(1, mask=True)
+ arr = np.ma.arange(3)
+ arr[0] = mask_0d
+ assert_array_equal(arr.mask, [True, False, False])
+
+ def test_element_len(self):
+ # check that len() works for mvoid (Github issue #576)
+ for rec in self.data['base']:
+ assert_equal(len(rec), len(self.data['ddtype']))
+
+
+class TestMaskedObjectArray(object):
+
+ def test_getitem(self):
+ arr = np.ma.array([None, None])
+ for dt in [float, object]:
+ a0 = np.eye(2).astype(dt)
+ a1 = np.eye(3).astype(dt)
+ arr[0] = a0
+ arr[1] = a1
+
+ assert_(arr[0] is a0)
+ assert_(arr[1] is a1)
+ assert_(isinstance(arr[0,...], MaskedArray))
+ assert_(isinstance(arr[1,...], MaskedArray))
+ assert_(arr[0,...][()] is a0)
+ assert_(arr[1,...][()] is a1)
+
+ arr[0] = np.ma.masked
+
+ assert_(arr[1] is a1)
+ assert_(isinstance(arr[0,...], MaskedArray))
+ assert_(isinstance(arr[1,...], MaskedArray))
+ assert_equal(arr[0,...].mask, True)
+ assert_(arr[1,...][()] is a1)
+
+ # gh-5962 - object arrays of arrays do something special
+ assert_equal(arr[0].data, a0)
+ assert_equal(arr[0].mask, True)
+ assert_equal(arr[0,...][()].data, a0)
+ assert_equal(arr[0,...][()].mask, True)
+
+ def test_nested_ma(self):
+
+ arr = np.ma.array([None, None])
+ # set the first object to be an unmasked masked constant. A little fiddly
+ arr[0,...] = np.array([np.ma.masked], object)[0,...]
+
+ # check the above line did what we were aiming for
+ assert_(arr.data[0] is np.ma.masked)
+
+ # test that getitem returned the value by identity
+ assert_(arr[0] is np.ma.masked)
+
+ # now mask the masked value!
+ arr[0] = np.ma.masked
+ assert_(arr[0] is np.ma.masked)
+
+
+class TestMaskedView(object):
+
+ def setup(self):
+ iterator = list(zip(np.arange(10), np.random.rand(10)))
+ data = np.array(iterator)
+ a = array(iterator, dtype=[('a', float), ('b', float)])
+ a.mask[0] = (1, 0)
+ controlmask = np.array([1] + 19 * [0], dtype=bool)
+ self.data = (data, a, controlmask)
+
+ def test_view_to_nothing(self):
+ (data, a, controlmask) = self.data
+ test = a.view()
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test._data, a._data)
+ assert_equal(test._mask, a._mask)
+
+ def test_view_to_type(self):
+ (data, a, controlmask) = self.data
+ test = a.view(np.ndarray)
+ assert_(not isinstance(test, MaskedArray))
+ assert_equal(test, a._data)
+ assert_equal_records(test, data.view(a.dtype).squeeze())
+
+ def test_view_to_simple_dtype(self):
+ (data, a, controlmask) = self.data
+ # View globally
+ test = a.view(float)
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test, data.ravel())
+ assert_equal(test.mask, controlmask)
+
+ def test_view_to_flexible_dtype(self):
+ (data, a, controlmask) = self.data
+
+ test = a.view([('A', float), ('B', float)])
+ assert_equal(test.mask.dtype.names, ('A', 'B'))
+ assert_equal(test['A'], a['a'])
+ assert_equal(test['B'], a['b'])
+
+ test = a[0].view([('A', float), ('B', float)])
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test.mask.dtype.names, ('A', 'B'))
+ assert_equal(test['A'], a['a'][0])
+ assert_equal(test['B'], a['b'][0])
+
+ test = a[-1].view([('A', float), ('B', float)])
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test.dtype.names, ('A', 'B'))
+ assert_equal(test['A'], a['a'][-1])
+ assert_equal(test['B'], a['b'][-1])
+
+ def test_view_to_subdtype(self):
+ (data, a, controlmask) = self.data
+ # View globally
+ test = a.view((float, 2))
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test, data)
+ assert_equal(test.mask, controlmask.reshape(-1, 2))
+ # View on 1 masked element
+ test = a[0].view((float, 2))
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test, data[0])
+ assert_equal(test.mask, (1, 0))
+ # View on 1 unmasked element
+ test = a[-1].view((float, 2))
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test, data[-1])
+
+ def test_view_to_dtype_and_type(self):
+ (data, a, controlmask) = self.data
+
+ test = a.view((float, 2), np.recarray)
+ assert_equal(test, data)
+ assert_(isinstance(test, np.recarray))
+ assert_(not isinstance(test, MaskedArray))
+
+
+class TestOptionalArgs(object):
+ def test_ndarrayfuncs(self):
+ # test axis arg behaves the same as ndarray (including multiple axes)
+
+ d = np.arange(24.0).reshape((2,3,4))
+ m = np.zeros(24, dtype=bool).reshape((2,3,4))
+ # mask out last element of last dimension
+ m[:,:,-1] = True
+ a = np.ma.array(d, mask=m)
+
+ def testaxis(f, a, d):
+ numpy_f = numpy.__getattribute__(f)
+ ma_f = np.ma.__getattribute__(f)
+
+ # test axis arg
+ assert_equal(ma_f(a, axis=1)[...,:-1], numpy_f(d[...,:-1], axis=1))
+ assert_equal(ma_f(a, axis=(0,1))[...,:-1],
+ numpy_f(d[...,:-1], axis=(0,1)))
+
+ def testkeepdims(f, a, d):
+ numpy_f = numpy.__getattribute__(f)
+ ma_f = np.ma.__getattribute__(f)
+
+ # test keepdims arg
+ assert_equal(ma_f(a, keepdims=True).shape,
+ numpy_f(d, keepdims=True).shape)
+ assert_equal(ma_f(a, keepdims=False).shape,
+ numpy_f(d, keepdims=False).shape)
+
+ # test both at once
+ assert_equal(ma_f(a, axis=1, keepdims=True)[...,:-1],
+ numpy_f(d[...,:-1], axis=1, keepdims=True))
+ assert_equal(ma_f(a, axis=(0,1), keepdims=True)[...,:-1],
+ numpy_f(d[...,:-1], axis=(0,1), keepdims=True))
+
+ for f in ['sum', 'prod', 'mean', 'var', 'std']:
+ testaxis(f, a, d)
+ testkeepdims(f, a, d)
+
+ for f in ['min', 'max']:
+ testaxis(f, a, d)
+
+ d = (np.arange(24).reshape((2,3,4))%2 == 0)
+ a = np.ma.array(d, mask=m)
+ for f in ['all', 'any']:
+ testaxis(f, a, d)
+ testkeepdims(f, a, d)
+
+ def test_count(self):
+ # test np.ma.count specially
+
+ d = np.arange(24.0).reshape((2,3,4))
+ m = np.zeros(24, dtype=bool).reshape((2,3,4))
+ m[:,0,:] = True
+ a = np.ma.array(d, mask=m)
+
+ assert_equal(count(a), 16)
+ assert_equal(count(a, axis=1), 2*ones((2,4)))
+ assert_equal(count(a, axis=(0,1)), 4*ones((4,)))
+ assert_equal(count(a, keepdims=True), 16*ones((1,1,1)))
+ assert_equal(count(a, axis=1, keepdims=True), 2*ones((2,1,4)))
+ assert_equal(count(a, axis=(0,1), keepdims=True), 4*ones((1,1,4)))
+ assert_equal(count(a, axis=-2), 2*ones((2,4)))
+ assert_raises(ValueError, count, a, axis=(1,1))
+ assert_raises(np.AxisError, count, a, axis=3)
+
+ # check the 'nomask' path
+ a = np.ma.array(d, mask=nomask)
+
+ assert_equal(count(a), 24)
+ assert_equal(count(a, axis=1), 3*ones((2,4)))
+ assert_equal(count(a, axis=(0,1)), 6*ones((4,)))
+ assert_equal(count(a, keepdims=True), 24*ones((1,1,1)))
+ assert_equal(np.ndim(count(a, keepdims=True)), 3)
+ assert_equal(count(a, axis=1, keepdims=True), 3*ones((2,1,4)))
+ assert_equal(count(a, axis=(0,1), keepdims=True), 6*ones((1,1,4)))
+ assert_equal(count(a, axis=-2), 3*ones((2,4)))
+ assert_raises(ValueError, count, a, axis=(1,1))
+ assert_raises(np.AxisError, count, a, axis=3)
+
+ # check the 'masked' singleton
+ assert_equal(count(np.ma.masked), 0)
+
+ # check 0-d arrays do not allow axis > 0
+ assert_raises(np.AxisError, count, np.ma.array(1), axis=1)
+
+
+class TestMaskedConstant(object):
+ def _do_add_test(self, add):
+ # sanity check
+ assert_(add(np.ma.masked, 1) is np.ma.masked)
+
+ # now try with a vector
+ vector = np.array([1, 2, 3])
+ result = add(np.ma.masked, vector)
+
+ # lots of things could go wrong here
+ assert_(result is not np.ma.masked)
+ assert_(not isinstance(result, np.ma.core.MaskedConstant))
+ assert_equal(result.shape, vector.shape)
+ assert_equal(np.ma.getmask(result), np.ones(vector.shape, dtype=bool))
+
+ def test_ufunc(self):
+ self._do_add_test(np.add)
+
+ def test_operator(self):
+ self._do_add_test(lambda a, b: a + b)
+
+ def test_ctor(self):
+ m = np.ma.array(np.ma.masked)
+
+ # most importantly, we do not want to create a new MaskedConstant
+ # instance
+ assert_(not isinstance(m, np.ma.core.MaskedConstant))
+ assert_(m is not np.ma.masked)
+
+ def test_repr(self):
+ # copies should not exist, but if they do, it should be obvious that
+ # something is wrong
+ assert_equal(repr(np.ma.masked), 'masked')
+
+ # create a new instance in a weird way
+ masked2 = np.ma.MaskedArray.__new__(np.ma.core.MaskedConstant)
+ assert_not_equal(repr(masked2), 'masked')
+
+ def test_pickle(self):
+ from io import BytesIO
+
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ with BytesIO() as f:
+ pickle.dump(np.ma.masked, f, protocol=proto)
+ f.seek(0)
+ res = pickle.load(f)
+ assert_(res is np.ma.masked)
+
+ def test_copy(self):
+ # gh-9328
+ # copy is a no-op, like it is with np.True_
+ assert_equal(
+ np.ma.masked.copy() is np.ma.masked,
+ np.True_.copy() is np.True_)
+
+ def test__copy(self):
+ import copy
+ assert_(
+ copy.copy(np.ma.masked) is np.ma.masked)
+
+ def test_deepcopy(self):
+ import copy
+ assert_(
+ copy.deepcopy(np.ma.masked) is np.ma.masked)
+
+ def test_immutable(self):
+ orig = np.ma.masked
+ assert_raises(np.ma.core.MaskError, operator.setitem, orig, (), 1)
+        assert_raises(ValueError, operator.setitem, orig.data, (), 1)
+ assert_raises(ValueError, operator.setitem, orig.mask, (), False)
+
+ view = np.ma.masked.view(np.ma.MaskedArray)
+ assert_raises(ValueError, operator.setitem, view, (), 1)
+ assert_raises(ValueError, operator.setitem, view.data, (), 1)
+ assert_raises(ValueError, operator.setitem, view.mask, (), False)
+
+ def test_coercion_int(self):
+ a_i = np.zeros((), int)
+ assert_raises(MaskError, operator.setitem, a_i, (), np.ma.masked)
+ assert_raises(MaskError, int, np.ma.masked)
+
+ @pytest.mark.skipif(sys.version_info.major == 3,
+ reason="long doesn't exist in Python 3")
+ def test_coercion_long(self):
+ assert_raises(MaskError, long, np.ma.masked)
+
+ def test_coercion_float(self):
+ a_f = np.zeros((), float)
+ assert_warns(UserWarning, operator.setitem, a_f, (), np.ma.masked)
+ assert_(np.isnan(a_f[()]))
+
+ @pytest.mark.xfail(reason="See gh-9750")
+ def test_coercion_unicode(self):
+ a_u = np.zeros((), 'U10')
+ a_u[()] = np.ma.masked
+ assert_equal(a_u[()], u'--')
+
+ @pytest.mark.xfail(reason="See gh-9750")
+ def test_coercion_bytes(self):
+ a_b = np.zeros((), 'S10')
+ a_b[()] = np.ma.masked
+ assert_equal(a_b[()], b'--')
+
+ def test_subclass(self):
+ # https://github.com/astropy/astropy/issues/6645
+ class Sub(type(np.ma.masked)): pass
+
+ a = Sub()
+ assert_(a is Sub())
+ assert_(a is not np.ma.masked)
+ assert_not_equal(repr(a), 'masked')
+
+ def test_attributes_readonly(self):
+ assert_raises(AttributeError, setattr, np.ma.masked, 'shape', (1,))
+ assert_raises(AttributeError, setattr, np.ma.masked, 'dtype', np.int64)
+
+
+class TestMaskedWhereAliases(object):
+
+ # TODO: Test masked_object, masked_equal, ...
+
+ def test_masked_values(self):
+ res = masked_values(np.array([-32768.0]), np.int16(-32768))
+ assert_equal(res.mask, [True])
+
+ res = masked_values(np.inf, np.inf)
+ assert_equal(res.mask, True)
+
+ res = np.ma.masked_values(np.inf, -np.inf)
+ assert_equal(res.mask, False)
+
+ res = np.ma.masked_values([1, 2, 3, 4], 5, shrink=True)
+ assert_(res.mask is np.ma.nomask)
+
+ res = np.ma.masked_values([1, 2, 3, 4], 5, shrink=False)
+ assert_equal(res.mask, [False] * 4)
+
+
+def test_masked_array():
+ a = np.ma.array([0, 1, 2, 3], mask=[0, 0, 1, 0])
+ assert_equal(np.argwhere(a), [[1], [3]])
+
+def test_append_masked_array():
+ a = np.ma.masked_equal([1,2,3], value=2)
+ b = np.ma.masked_equal([4,3,2], value=2)
+
+ result = np.ma.append(a, b)
+ expected_data = [1, 2, 3, 4, 3, 2]
+ expected_mask = [False, True, False, False, False, True]
+ assert_array_equal(result.data, expected_data)
+ assert_array_equal(result.mask, expected_mask)
+
+ a = np.ma.masked_all((2,2))
+ b = np.ma.ones((3,1))
+
+ result = np.ma.append(a, b)
+ expected_data = [1] * 3
+ expected_mask = [True] * 4 + [False] * 3
+    assert_array_equal(result.data[-3:], expected_data)
+ assert_array_equal(result.mask, expected_mask)
+
+ result = np.ma.append(a, b, axis=None)
+    assert_array_equal(result.data[-3:], expected_data)
+ assert_array_equal(result.mask, expected_mask)
+
+
+def test_append_masked_array_along_axis():
+ a = np.ma.masked_equal([1,2,3], value=2)
+ b = np.ma.masked_values([[4, 5, 6], [7, 8, 9]], 7)
+
+ # When `axis` is specified, `values` must have the correct shape.
+ assert_raises(ValueError, np.ma.append, a, b, axis=0)
+
+ result = np.ma.append(a[np.newaxis,:], b, axis=0)
+ expected = np.ma.arange(1, 10)
+ expected[[1, 6]] = np.ma.masked
+ expected = expected.reshape((3,3))
+ assert_array_equal(result.data, expected.data)
+ assert_array_equal(result.mask, expected.mask)
+
+
+def test_default_fill_value_complex():
+ # regression test for Python 3, where 'unicode' was not defined
+ assert_(default_fill_value(1 + 1j) == 1.e20 + 0.0j)
+
+
+def test_ufunc_with_output():
+ # check that giving an output argument always returns that output.
+ # Regression test for gh-8416.
+ x = array([1., 2., 3.], mask=[0, 0, 1])
+ y = np.add(x, 1., out=x)
+ assert_(y is x)
+
+
+def test_ufunc_with_out_varied():
+ """ Test that masked arrays are immune to gh-10459 """
+    # the mask already set on the output should not affect the result, no matter how `out` is passed
+ a = array([ 1, 2, 3], mask=[1, 0, 0])
+ b = array([10, 20, 30], mask=[1, 0, 0])
+ out = array([ 0, 0, 0], mask=[0, 0, 1])
+ expected = array([11, 22, 33], mask=[1, 0, 0])
+
+ out_pos = out.copy()
+ res_pos = np.add(a, b, out_pos)
+
+ out_kw = out.copy()
+ res_kw = np.add(a, b, out=out_kw)
+
+ out_tup = out.copy()
+ res_tup = np.add(a, b, out=(out_tup,))
+
+ assert_equal(res_kw.mask, expected.mask)
+ assert_equal(res_kw.data, expected.data)
+ assert_equal(res_tup.mask, expected.mask)
+ assert_equal(res_tup.data, expected.data)
+ assert_equal(res_pos.mask, expected.mask)
+ assert_equal(res_pos.data, expected.data)
+
+
+def test_astype_mask_ordering():
+ descr = [('v', int, 3), ('x', [('y', float)])]
+ x = array([
+ [([1, 2, 3], (1.0,)), ([1, 2, 3], (2.0,))],
+ [([1, 2, 3], (3.0,)), ([1, 2, 3], (4.0,))]], dtype=descr)
+ x[0]['v'][0] = np.ma.masked
+
+ x_a = x.astype(descr)
+ assert x_a.dtype.names == np.dtype(descr).names
+ assert x_a.mask.dtype.names == np.dtype(descr).names
+ assert_equal(x, x_a)
+
+ assert_(x is x.astype(x.dtype, copy=False))
+ assert_equal(type(x.astype(x.dtype, subok=False)), np.ndarray)
+
+ x_f = x.astype(x.dtype, order='F')
+ assert_(x_f.flags.f_contiguous)
+ assert_(x_f.mask.flags.f_contiguous)
+
+ # Also test the same indirectly, via np.array
+ x_a2 = np.array(x, dtype=descr, subok=True)
+ assert x_a2.dtype.names == np.dtype(descr).names
+ assert x_a2.mask.dtype.names == np.dtype(descr).names
+ assert_equal(x, x_a2)
+
+ assert_(x is np.array(x, dtype=descr, copy=False, subok=True))
+
+ x_f2 = np.array(x, dtype=x.dtype, order='F', subok=True)
+ assert_(x_f2.flags.f_contiguous)
+ assert_(x_f2.mask.flags.f_contiguous)
+
+
[email protected]('dt1', num_dts, ids=num_ids)
[email protected]('dt2', num_dts, ids=num_ids)
[email protected]('ignore::numpy.ComplexWarning')
+def test_astype_basic(dt1, dt2):
+ # See gh-12070
+ src = np.ma.array(ones(3, dt1), fill_value=1)
+ dst = src.astype(dt2)
+
+ assert_(src.fill_value == 1)
+ assert_(src.dtype == dt1)
+ assert_(src.fill_value.dtype == dt1)
+
+ assert_(dst.fill_value == 1)
+ assert_(dst.dtype == dt2)
+ assert_(dst.fill_value.dtype == dt2)
+
+ assert_equal(src, dst)
+
+
+def test_fieldless_void():
+ dt = np.dtype([]) # a void dtype with no fields
+ x = np.empty(4, dt)
+
+ # these arrays contain no values, so there's little to test - but this
+ # shouldn't crash
+ mx = np.ma.array(x)
+ assert_equal(mx.dtype, x.dtype)
+ assert_equal(mx.shape, x.shape)
+
+ mx = np.ma.array(x, mask=x)
+ assert_equal(mx.dtype, x.dtype)
+ assert_equal(mx.shape, x.shape)
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_deprecations.py b/contrib/python/numpy/py2/numpy/ma/tests/test_deprecations.py
new file mode 100644
index 00000000000..72cc29aa046
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_deprecations.py
@@ -0,0 +1,70 @@
+"""Test deprecation and future warnings.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_warns
+from numpy.ma.testutils import assert_equal
+from numpy.ma.core import MaskedArrayFutureWarning
+
+class TestArgsort(object):
+ """ gh-8701 """
+ def _test_base(self, argsort, cls):
+ arr_0d = np.array(1).view(cls)
+ argsort(arr_0d)
+
+ arr_1d = np.array([1, 2, 3]).view(cls)
+ argsort(arr_1d)
+
+ # argsort has a bad default for >1d arrays
+ arr_2d = np.array([[1, 2], [3, 4]]).view(cls)
+ result = assert_warns(
+ np.ma.core.MaskedArrayFutureWarning, argsort, arr_2d)
+ assert_equal(result, argsort(arr_2d, axis=None))
+
+ # should be no warnings for explicitly specifying it
+ argsort(arr_2d, axis=None)
+ argsort(arr_2d, axis=-1)
+
+ def test_function_ndarray(self):
+ return self._test_base(np.ma.argsort, np.ndarray)
+
+ def test_function_maskedarray(self):
+ return self._test_base(np.ma.argsort, np.ma.MaskedArray)
+
+ def test_method(self):
+ return self._test_base(np.ma.MaskedArray.argsort, np.ma.MaskedArray)
+
+
+class TestMinimumMaximum(object):
+ def test_minimum(self):
+ assert_warns(DeprecationWarning, np.ma.minimum, np.ma.array([1, 2]))
+
+ def test_maximum(self):
+ assert_warns(DeprecationWarning, np.ma.maximum, np.ma.array([1, 2]))
+
+ def test_axis_default(self):
+ # NumPy 1.13, 2017-05-06
+
+ data1d = np.ma.arange(6)
+ data2d = data1d.reshape(2, 3)
+
+ ma_min = np.ma.minimum.reduce
+ ma_max = np.ma.maximum.reduce
+
+ # check that the default axis is still None, but warns on 2d arrays
+ result = assert_warns(MaskedArrayFutureWarning, ma_max, data2d)
+ assert_equal(result, ma_max(data2d, axis=None))
+
+ result = assert_warns(MaskedArrayFutureWarning, ma_min, data2d)
+ assert_equal(result, ma_min(data2d, axis=None))
+
+ # no warnings on 1d, as both new and old defaults are equivalent
+ result = ma_min(data1d)
+ assert_equal(result, ma_min(data1d, axis=None))
+ assert_equal(result, ma_min(data1d, axis=0))
+
+ result = ma_max(data1d)
+ assert_equal(result, ma_max(data1d, axis=None))
+ assert_equal(result, ma_max(data1d, axis=0))
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_extras.py b/contrib/python/numpy/py2/numpy/ma/tests/test_extras.py
new file mode 100644
index 00000000000..5243cf714a0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_extras.py
@@ -0,0 +1,1678 @@
+# pylint: disable-msg=W0611, W0612, W0511
+"""Tests suite for MaskedArray.
+Adapted from the original test_ma by Pierre Gerard-Marchant
+
+:author: Pierre Gerard-Marchant
+:contact: pierregm_at_uga_dot_edu
+:version: $Id: test_extras.py 3473 2007-10-29 15:18:13Z jarrod.millman $
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import warnings
+import itertools
+
+import numpy as np
+from numpy.testing import (
+ assert_warns, suppress_warnings
+ )
+from numpy.ma.testutils import (
+ assert_, assert_array_equal, assert_equal, assert_almost_equal
+ )
+from numpy.ma.core import (
+ array, arange, masked, MaskedArray, masked_array, getmaskarray, shape,
+ nomask, ones, zeros, count
+ )
+from numpy.ma.extras import (
+ atleast_1d, atleast_2d, atleast_3d, mr_, dot, polyfit, cov, corrcoef,
+ median, average, unique, setxor1d, setdiff1d, union1d, intersect1d, in1d,
+ ediff1d, apply_over_axes, apply_along_axis, compress_nd, compress_rowcols,
+ mask_rowcols, clump_masked, clump_unmasked, flatnotmasked_contiguous,
+ notmasked_contiguous, notmasked_edges, masked_all, masked_all_like, isin,
+ diagflat, stack, vstack
+ )
+
+
+class TestGeneric(object):
+ #
+ def test_masked_all(self):
+ # Tests masked_all
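+        # masked_all(shape, dtype) returns an array of the given shape and
+        # dtype with every element masked.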
+ # Standard dtype
+ test = masked_all((2,), dtype=float)
+ control = array([1, 1], mask=[1, 1], dtype=float)
+ assert_equal(test, control)
+ # Flexible dtype
+ dt = np.dtype({'names': ['a', 'b'], 'formats': ['f', 'f']})
+ test = masked_all((2,), dtype=dt)
+ control = array([(0, 0), (0, 0)], mask=[(1, 1), (1, 1)], dtype=dt)
+ assert_equal(test, control)
+ test = masked_all((2, 2), dtype=dt)
+ control = array([[(0, 0), (0, 0)], [(0, 0), (0, 0)]],
+ mask=[[(1, 1), (1, 1)], [(1, 1), (1, 1)]],
+ dtype=dt)
+ assert_equal(test, control)
+ # Nested dtype
+ dt = np.dtype([('a', 'f'), ('b', [('ba', 'f'), ('bb', 'f')])])
+ test = masked_all((2,), dtype=dt)
+ control = array([(1, (1, 1)), (1, (1, 1))],
+ mask=[(1, (1, 1)), (1, (1, 1))], dtype=dt)
+ assert_equal(test, control)
+ test = masked_all((2,), dtype=dt)
+ control = array([(1, (1, 1)), (1, (1, 1))],
+ mask=[(1, (1, 1)), (1, (1, 1))], dtype=dt)
+ assert_equal(test, control)
+ test = masked_all((1, 1), dtype=dt)
+ control = array([[(1, (1, 1))]], mask=[[(1, (1, 1))]], dtype=dt)
+ assert_equal(test, control)
+
+ def test_masked_all_like(self):
+ # Tests masked_all
+ # Standard dtype
+ base = array([1, 2], dtype=float)
+ test = masked_all_like(base)
+ control = array([1, 1], mask=[1, 1], dtype=float)
+ assert_equal(test, control)
+ # Flexible dtype
+ dt = np.dtype({'names': ['a', 'b'], 'formats': ['f', 'f']})
+ base = array([(0, 0), (0, 0)], mask=[(1, 1), (1, 1)], dtype=dt)
+ test = masked_all_like(base)
+ control = array([(10, 10), (10, 10)], mask=[(1, 1), (1, 1)], dtype=dt)
+ assert_equal(test, control)
+ # Nested dtype
+ dt = np.dtype([('a', 'f'), ('b', [('ba', 'f'), ('bb', 'f')])])
+ control = array([(1, (1, 1)), (1, (1, 1))],
+ mask=[(1, (1, 1)), (1, (1, 1))], dtype=dt)
+ test = masked_all_like(control)
+ assert_equal(test, control)
+
+ def check_clump(self, f):
+ for i in range(1, 7):
+ for j in range(2**i):
+ k = np.arange(i, dtype=int)
+ ja = np.full(i, j, dtype=int)
+ a = masked_array(2**k)
+ a.mask = (ja & (2**k)) != 0
+ s = 0
+ for sl in f(a):
+ s += a.data[sl].sum()
+ if f == clump_unmasked:
+ assert_equal(a.compressed().sum(), s)
+ else:
+ a.mask = ~a.mask
+ assert_equal(a.compressed().sum(), s)
+
+ def test_clump_masked(self):
+ # Test clump_masked
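+        # clump_masked returns a list of slices, one per contiguous run of
+        # masked values.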
+ a = masked_array(np.arange(10))
+ a[[0, 1, 2, 6, 8, 9]] = masked
+ #
+ test = clump_masked(a)
+ control = [slice(0, 3), slice(6, 7), slice(8, 10)]
+ assert_equal(test, control)
+
+ self.check_clump(clump_masked)
+
+ def test_clump_unmasked(self):
+ # Test clump_unmasked
+ a = masked_array(np.arange(10))
+ a[[0, 1, 2, 6, 8, 9]] = masked
+ test = clump_unmasked(a)
+ control = [slice(3, 6), slice(7, 8), ]
+ assert_equal(test, control)
+
+ self.check_clump(clump_unmasked)
+
+ def test_flatnotmasked_contiguous(self):
+ # Test flatnotmasked_contiguous
+ a = arange(10)
+ # No mask
+ test = flatnotmasked_contiguous(a)
+ assert_equal(test, [slice(0, a.size)])
+ # mask of all false
+ a.mask = np.zeros(10, dtype=bool)
+        test = flatnotmasked_contiguous(a)
+        assert_equal(test, [slice(0, a.size)])
+ # Some mask
+ a[(a < 3) | (a > 8) | (a == 5)] = masked
+ test = flatnotmasked_contiguous(a)
+ assert_equal(test, [slice(3, 5), slice(6, 9)])
+ #
+ a[:] = masked
+ test = flatnotmasked_contiguous(a)
+ assert_equal(test, [])
+
+
+class TestAverage(object):
+    # Several tests of average. Why so many? Good point...
+ def test_testAverage1(self):
+ # Test of average.
+ ott = array([0., 1., 2., 3.], mask=[True, False, False, False])
+ assert_equal(2.0, average(ott, axis=0))
+ assert_equal(2.0, average(ott, weights=[1., 1., 2., 1.]))
+ result, wts = average(ott, weights=[1., 1., 2., 1.], returned=1)
+ assert_equal(2.0, result)
+ assert_(wts == 4.0)
+ ott[:] = masked
+ assert_equal(average(ott, axis=0).mask, [True])
+ ott = array([0., 1., 2., 3.], mask=[True, False, False, False])
+ ott = ott.reshape(2, 2)
+ ott[:, 1] = masked
+ assert_equal(average(ott, axis=0), [2.0, 0.0])
+ assert_equal(average(ott, axis=1).mask[0], [True])
+ assert_equal([2., 0.], average(ott, axis=0))
+ result, wts = average(ott, axis=0, returned=1)
+ assert_equal(wts, [1., 0.])
+
+ def test_testAverage2(self):
+ # More tests of average.
+ w1 = [0, 1, 1, 1, 1, 0]
+ w2 = [[0, 1, 1, 1, 1, 0], [1, 0, 0, 0, 0, 1]]
+ x = arange(6, dtype=np.float_)
+ assert_equal(average(x, axis=0), 2.5)
+ assert_equal(average(x, axis=0, weights=w1), 2.5)
+ y = array([arange(6, dtype=np.float_), 2.0 * arange(6)])
+ assert_equal(average(y, None), np.add.reduce(np.arange(6)) * 3. / 12.)
+ assert_equal(average(y, axis=0), np.arange(6) * 3. / 2.)
+ assert_equal(average(y, axis=1),
+ [average(x, axis=0), average(x, axis=0) * 2.0])
+ assert_equal(average(y, None, weights=w2), 20. / 6.)
+ assert_equal(average(y, axis=0, weights=w2),
+ [0., 1., 2., 3., 4., 10.])
+ assert_equal(average(y, axis=1),
+ [average(x, axis=0), average(x, axis=0) * 2.0])
+ m1 = zeros(6)
+ m2 = [0, 0, 1, 1, 0, 0]
+ m3 = [[0, 0, 1, 1, 0, 0], [0, 1, 1, 1, 1, 0]]
+ m4 = ones(6)
+ m5 = [0, 1, 1, 1, 1, 1]
+ assert_equal(average(masked_array(x, m1), axis=0), 2.5)
+ assert_equal(average(masked_array(x, m2), axis=0), 2.5)
+ assert_equal(average(masked_array(x, m4), axis=0).mask, [True])
+ assert_equal(average(masked_array(x, m5), axis=0), 0.0)
+ assert_equal(count(average(masked_array(x, m4), axis=0)), 0)
+ z = masked_array(y, m3)
+ assert_equal(average(z, None), 20. / 6.)
+ assert_equal(average(z, axis=0), [0., 1., 99., 99., 4.0, 7.5])
+ assert_equal(average(z, axis=1), [2.5, 5.0])
+ assert_equal(average(z, axis=0, weights=w2),
+ [0., 1., 99., 99., 4.0, 10.0])
+
+ def test_testAverage3(self):
+ # Yet more tests of average!
+ a = arange(6)
+ b = arange(6) * 3
+ r1, w1 = average([[a, b], [b, a]], axis=1, returned=1)
+ assert_equal(shape(r1), shape(w1))
+ assert_equal(r1.shape, w1.shape)
+ r2, w2 = average(ones((2, 2, 3)), axis=0, weights=[3, 1], returned=1)
+ assert_equal(shape(w2), shape(r2))
+ r2, w2 = average(ones((2, 2, 3)), returned=1)
+ assert_equal(shape(w2), shape(r2))
+ r2, w2 = average(ones((2, 2, 3)), weights=ones((2, 2, 3)), returned=1)
+ assert_equal(shape(w2), shape(r2))
+ a2d = array([[1, 2], [0, 4]], float)
+ a2dm = masked_array(a2d, [[False, False], [True, False]])
+ a2da = average(a2d, axis=0)
+ assert_equal(a2da, [0.5, 3.0])
+ a2dma = average(a2dm, axis=0)
+ assert_equal(a2dma, [1.0, 3.0])
+ a2dma = average(a2dm, axis=None)
+ assert_equal(a2dma, 7. / 3.)
+ a2dma = average(a2dm, axis=1)
+ assert_equal(a2dma, [1.5, 4.0])
+
+ def test_onintegers_with_mask(self):
+ # Test average on integers with mask
+ a = average(array([1, 2]))
+ assert_equal(a, 1.5)
+ a = average(array([1, 2, 3, 4], mask=[False, False, True, True]))
+ assert_equal(a, 1.5)
+
+ def test_complex(self):
+ # Test with complex data.
+ # (Regression test for https://github.com/numpy/numpy/issues/2684)
+ mask = np.array([[0, 0, 0, 1, 0],
+ [0, 1, 0, 0, 0]], dtype=bool)
+ a = masked_array([[0, 1+2j, 3+4j, 5+6j, 7+8j],
+ [9j, 0+1j, 2+3j, 4+5j, 7+7j]],
+ mask=mask)
+
+ av = average(a)
+ expected = np.average(a.compressed())
+ assert_almost_equal(av.real, expected.real)
+ assert_almost_equal(av.imag, expected.imag)
+
+ av0 = average(a, axis=0)
+ expected0 = average(a.real, axis=0) + average(a.imag, axis=0)*1j
+ assert_almost_equal(av0.real, expected0.real)
+ assert_almost_equal(av0.imag, expected0.imag)
+
+ av1 = average(a, axis=1)
+ expected1 = average(a.real, axis=1) + average(a.imag, axis=1)*1j
+ assert_almost_equal(av1.real, expected1.real)
+ assert_almost_equal(av1.imag, expected1.imag)
+
+ # Test with the 'weights' argument.
+ wts = np.array([[0.5, 1.0, 2.0, 1.0, 0.5],
+ [1.0, 1.0, 1.0, 1.0, 1.0]])
+ wav = average(a, weights=wts)
+ expected = np.average(a.compressed(), weights=wts[~mask])
+ assert_almost_equal(wav.real, expected.real)
+ assert_almost_equal(wav.imag, expected.imag)
+
+ wav0 = average(a, weights=wts, axis=0)
+ expected0 = (average(a.real, weights=wts, axis=0) +
+ average(a.imag, weights=wts, axis=0)*1j)
+ assert_almost_equal(wav0.real, expected0.real)
+ assert_almost_equal(wav0.imag, expected0.imag)
+
+ wav1 = average(a, weights=wts, axis=1)
+ expected1 = (average(a.real, weights=wts, axis=1) +
+ average(a.imag, weights=wts, axis=1)*1j)
+ assert_almost_equal(wav1.real, expected1.real)
+ assert_almost_equal(wav1.imag, expected1.imag)
+
+
+class TestConcatenator(object):
+ # Tests for mr_, the equivalent of r_ for masked arrays.
+
+ def test_1d(self):
+ # Tests mr_ on 1D arrays.
+ assert_array_equal(mr_[1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6]))
+ b = ones(5)
+ m = [1, 0, 0, 0, 0]
+ d = masked_array(b, mask=m)
+ c = mr_[d, 0, 0, d]
+ assert_(isinstance(c, MaskedArray))
+ assert_array_equal(c, [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])
+ assert_array_equal(c.mask, mr_[m, 0, 0, m])
+
+ def test_2d(self):
+ # Tests mr_ on 2D arrays.
+ a_1 = np.random.rand(5, 5)
+ a_2 = np.random.rand(5, 5)
+ m_1 = np.round_(np.random.rand(5, 5), 0)
+ m_2 = np.round_(np.random.rand(5, 5), 0)
+ b_1 = masked_array(a_1, mask=m_1)
+ b_2 = masked_array(a_2, mask=m_2)
+ # append columns
+ d = mr_['1', b_1, b_2]
+ assert_(d.shape == (5, 10))
+ assert_array_equal(d[:, :5], b_1)
+ assert_array_equal(d[:, 5:], b_2)
+ assert_array_equal(d.mask, np.r_['1', m_1, m_2])
+ d = mr_[b_1, b_2]
+ assert_(d.shape == (10, 5))
+ assert_array_equal(d[:5,:], b_1)
+ assert_array_equal(d[5:,:], b_2)
+ assert_array_equal(d.mask, np.r_[m_1, m_2])
+
+ def test_masked_constant(self):
+ actual = mr_[np.ma.masked, 1]
+ assert_equal(actual.mask, [True, False])
+ assert_equal(actual.data[1], 1)
+
+ actual = mr_[[1, 2], np.ma.masked]
+ assert_equal(actual.mask, [False, False, True])
+ assert_equal(actual.data[:2], [1, 2])
+
+
+class TestNotMasked(object):
+ # Tests notmasked_edges and notmasked_contiguous.
+
+ def test_edges(self):
+        # Tests notmasked_edges
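+        # notmasked_edges returns the indices of the first and last unmasked
+        # entries, either flat (axis=None) or along the given axis.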
+ data = masked_array(np.arange(25).reshape(5, 5),
+ mask=[[0, 0, 1, 0, 0],
+ [0, 0, 0, 1, 1],
+ [1, 1, 0, 0, 0],
+ [0, 0, 0, 0, 0],
+ [1, 1, 1, 0, 0]],)
+ test = notmasked_edges(data, None)
+ assert_equal(test, [0, 24])
+ test = notmasked_edges(data, 0)
+ assert_equal(test[0], [(0, 0, 1, 0, 0), (0, 1, 2, 3, 4)])
+ assert_equal(test[1], [(3, 3, 3, 4, 4), (0, 1, 2, 3, 4)])
+ test = notmasked_edges(data, 1)
+ assert_equal(test[0], [(0, 1, 2, 3, 4), (0, 0, 2, 0, 3)])
+ assert_equal(test[1], [(0, 1, 2, 3, 4), (4, 2, 4, 4, 4)])
+ #
+ test = notmasked_edges(data.data, None)
+ assert_equal(test, [0, 24])
+ test = notmasked_edges(data.data, 0)
+ assert_equal(test[0], [(0, 0, 0, 0, 0), (0, 1, 2, 3, 4)])
+ assert_equal(test[1], [(4, 4, 4, 4, 4), (0, 1, 2, 3, 4)])
+ test = notmasked_edges(data.data, -1)
+ assert_equal(test[0], [(0, 1, 2, 3, 4), (0, 0, 0, 0, 0)])
+ assert_equal(test[1], [(0, 1, 2, 3, 4), (4, 4, 4, 4, 4)])
+ #
+ data[-2] = masked
+ test = notmasked_edges(data, 0)
+ assert_equal(test[0], [(0, 0, 1, 0, 0), (0, 1, 2, 3, 4)])
+ assert_equal(test[1], [(1, 1, 2, 4, 4), (0, 1, 2, 3, 4)])
+ test = notmasked_edges(data, -1)
+ assert_equal(test[0], [(0, 1, 2, 4), (0, 0, 2, 3)])
+ assert_equal(test[1], [(0, 1, 2, 4), (4, 2, 4, 4)])
+
+ def test_contiguous(self):
+ # Tests notmasked_contiguous
+ a = masked_array(np.arange(24).reshape(3, 8),
+ mask=[[0, 0, 0, 0, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 1],
+ [0, 0, 0, 0, 0, 0, 1, 0]])
+ tmp = notmasked_contiguous(a, None)
+ assert_equal(tmp, [
+ slice(0, 4, None),
+ slice(16, 22, None),
+ slice(23, 24, None)
+ ])
+
+ tmp = notmasked_contiguous(a, 0)
+ assert_equal(tmp, [
+ [slice(0, 1, None), slice(2, 3, None)],
+ [slice(0, 1, None), slice(2, 3, None)],
+ [slice(0, 1, None), slice(2, 3, None)],
+ [slice(0, 1, None), slice(2, 3, None)],
+ [slice(2, 3, None)],
+ [slice(2, 3, None)],
+ [],
+ [slice(2, 3, None)]
+ ])
+ #
+ tmp = notmasked_contiguous(a, 1)
+ assert_equal(tmp, [
+ [slice(0, 4, None)],
+ [],
+ [slice(0, 6, None), slice(7, 8, None)]
+ ])
+
+
+class TestCompressFunctions(object):
+
+ def test_compress_nd(self):
+ # Tests compress_nd
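+        # compress_nd suppresses, along the given axes (all axes by default),
+        # every slice that contains a masked value.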
+ x = np.array(list(range(3*4*5))).reshape(3, 4, 5)
+ m = np.zeros((3,4,5)).astype(bool)
+ m[1,1,1] = True
+ x = array(x, mask=m)
+
+ # axis=None
+ a = compress_nd(x)
+ assert_equal(a, [[[ 0, 2, 3, 4],
+ [10, 12, 13, 14],
+ [15, 17, 18, 19]],
+ [[40, 42, 43, 44],
+ [50, 52, 53, 54],
+ [55, 57, 58, 59]]])
+
+ # axis=0
+ a = compress_nd(x, 0)
+ assert_equal(a, [[[ 0, 1, 2, 3, 4],
+ [ 5, 6, 7, 8, 9],
+ [10, 11, 12, 13, 14],
+ [15, 16, 17, 18, 19]],
+ [[40, 41, 42, 43, 44],
+ [45, 46, 47, 48, 49],
+ [50, 51, 52, 53, 54],
+ [55, 56, 57, 58, 59]]])
+
+ # axis=1
+ a = compress_nd(x, 1)
+ assert_equal(a, [[[ 0, 1, 2, 3, 4],
+ [10, 11, 12, 13, 14],
+ [15, 16, 17, 18, 19]],
+ [[20, 21, 22, 23, 24],
+ [30, 31, 32, 33, 34],
+ [35, 36, 37, 38, 39]],
+ [[40, 41, 42, 43, 44],
+ [50, 51, 52, 53, 54],
+ [55, 56, 57, 58, 59]]])
+
+ a2 = compress_nd(x, (1,))
+ a3 = compress_nd(x, -2)
+ a4 = compress_nd(x, (-2,))
+ assert_equal(a, a2)
+ assert_equal(a, a3)
+ assert_equal(a, a4)
+
+ # axis=2
+ a = compress_nd(x, 2)
+ assert_equal(a, [[[ 0, 2, 3, 4],
+ [ 5, 7, 8, 9],
+ [10, 12, 13, 14],
+ [15, 17, 18, 19]],
+ [[20, 22, 23, 24],
+ [25, 27, 28, 29],
+ [30, 32, 33, 34],
+ [35, 37, 38, 39]],
+ [[40, 42, 43, 44],
+ [45, 47, 48, 49],
+ [50, 52, 53, 54],
+ [55, 57, 58, 59]]])
+
+ a2 = compress_nd(x, (2,))
+ a3 = compress_nd(x, -1)
+ a4 = compress_nd(x, (-1,))
+ assert_equal(a, a2)
+ assert_equal(a, a3)
+ assert_equal(a, a4)
+
+ # axis=(0, 1)
+ a = compress_nd(x, (0, 1))
+ assert_equal(a, [[[ 0, 1, 2, 3, 4],
+ [10, 11, 12, 13, 14],
+ [15, 16, 17, 18, 19]],
+ [[40, 41, 42, 43, 44],
+ [50, 51, 52, 53, 54],
+ [55, 56, 57, 58, 59]]])
+ a2 = compress_nd(x, (0, -2))
+ assert_equal(a, a2)
+
+ # axis=(1, 2)
+ a = compress_nd(x, (1, 2))
+ assert_equal(a, [[[ 0, 2, 3, 4],
+ [10, 12, 13, 14],
+ [15, 17, 18, 19]],
+ [[20, 22, 23, 24],
+ [30, 32, 33, 34],
+ [35, 37, 38, 39]],
+ [[40, 42, 43, 44],
+ [50, 52, 53, 54],
+ [55, 57, 58, 59]]])
+
+ a2 = compress_nd(x, (-2, 2))
+ a3 = compress_nd(x, (1, -1))
+ a4 = compress_nd(x, (-2, -1))
+ assert_equal(a, a2)
+ assert_equal(a, a3)
+ assert_equal(a, a4)
+
+ # axis=(0, 2)
+ a = compress_nd(x, (0, 2))
+ assert_equal(a, [[[ 0, 2, 3, 4],
+ [ 5, 7, 8, 9],
+ [10, 12, 13, 14],
+ [15, 17, 18, 19]],
+ [[40, 42, 43, 44],
+ [45, 47, 48, 49],
+ [50, 52, 53, 54],
+ [55, 57, 58, 59]]])
+
+ a2 = compress_nd(x, (0, -1))
+ assert_equal(a, a2)
+
+ def test_compress_rowcols(self):
+ # Tests compress_rowcols
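+        # compress_rowcols drops the rows (axis=0), columns (axis=1), or both
+        # (default) that contain masked values.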
+ x = array(np.arange(9).reshape(3, 3),
+ mask=[[1, 0, 0], [0, 0, 0], [0, 0, 0]])
+ assert_equal(compress_rowcols(x), [[4, 5], [7, 8]])
+ assert_equal(compress_rowcols(x, 0), [[3, 4, 5], [6, 7, 8]])
+ assert_equal(compress_rowcols(x, 1), [[1, 2], [4, 5], [7, 8]])
+ x = array(x._data, mask=[[0, 0, 0], [0, 1, 0], [0, 0, 0]])
+ assert_equal(compress_rowcols(x), [[0, 2], [6, 8]])
+ assert_equal(compress_rowcols(x, 0), [[0, 1, 2], [6, 7, 8]])
+ assert_equal(compress_rowcols(x, 1), [[0, 2], [3, 5], [6, 8]])
+ x = array(x._data, mask=[[1, 0, 0], [0, 1, 0], [0, 0, 0]])
+ assert_equal(compress_rowcols(x), [[8]])
+ assert_equal(compress_rowcols(x, 0), [[6, 7, 8]])
+ assert_equal(compress_rowcols(x, 1,), [[2], [5], [8]])
+ x = array(x._data, mask=[[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+ assert_equal(compress_rowcols(x).size, 0)
+ assert_equal(compress_rowcols(x, 0).size, 0)
+ assert_equal(compress_rowcols(x, 1).size, 0)
+
+ def test_mask_rowcols(self):
+ # Tests mask_rowcols.
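+        # mask_rowcols masks the whole row and/or column of every masked
+        # element instead of dropping them.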
+ x = array(np.arange(9).reshape(3, 3),
+ mask=[[1, 0, 0], [0, 0, 0], [0, 0, 0]])
+ assert_equal(mask_rowcols(x).mask,
+ [[1, 1, 1], [1, 0, 0], [1, 0, 0]])
+ assert_equal(mask_rowcols(x, 0).mask,
+ [[1, 1, 1], [0, 0, 0], [0, 0, 0]])
+ assert_equal(mask_rowcols(x, 1).mask,
+ [[1, 0, 0], [1, 0, 0], [1, 0, 0]])
+ x = array(x._data, mask=[[0, 0, 0], [0, 1, 0], [0, 0, 0]])
+ assert_equal(mask_rowcols(x).mask,
+ [[0, 1, 0], [1, 1, 1], [0, 1, 0]])
+ assert_equal(mask_rowcols(x, 0).mask,
+ [[0, 0, 0], [1, 1, 1], [0, 0, 0]])
+ assert_equal(mask_rowcols(x, 1).mask,
+ [[0, 1, 0], [0, 1, 0], [0, 1, 0]])
+ x = array(x._data, mask=[[1, 0, 0], [0, 1, 0], [0, 0, 0]])
+ assert_equal(mask_rowcols(x).mask,
+ [[1, 1, 1], [1, 1, 1], [1, 1, 0]])
+ assert_equal(mask_rowcols(x, 0).mask,
+ [[1, 1, 1], [1, 1, 1], [0, 0, 0]])
+ assert_equal(mask_rowcols(x, 1,).mask,
+ [[1, 1, 0], [1, 1, 0], [1, 1, 0]])
+ x = array(x._data, mask=[[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+ assert_(mask_rowcols(x).all() is masked)
+ assert_(mask_rowcols(x, 0).all() is masked)
+ assert_(mask_rowcols(x, 1).all() is masked)
+ assert_(mask_rowcols(x).mask.all())
+ assert_(mask_rowcols(x, 0).mask.all())
+ assert_(mask_rowcols(x, 1).mask.all())
+
+ def test_dot(self):
+ # Tests dot product
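+        # With strict=True the mask propagates (a masked value anywhere in the
+        # row/column masks the corresponding result entry); with strict=False
+        # masked values are filled with 0 before the dot product.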
+ n = np.arange(1, 7)
+ #
+ m = [1, 0, 0, 0, 0, 0]
+ a = masked_array(n, mask=m).reshape(2, 3)
+ b = masked_array(n, mask=m).reshape(3, 2)
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[1, 1], [1, 0]])
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask, [[1, 1, 1], [1, 0, 0], [1, 0, 0]])
+ c = dot(a, b, strict=False)
+ assert_equal(c, np.dot(a.filled(0), b.filled(0)))
+ c = dot(b, a, strict=False)
+ assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+ #
+ m = [0, 0, 0, 0, 0, 1]
+ a = masked_array(n, mask=m).reshape(2, 3)
+ b = masked_array(n, mask=m).reshape(3, 2)
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[0, 1], [1, 1]])
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask, [[0, 0, 1], [0, 0, 1], [1, 1, 1]])
+ c = dot(a, b, strict=False)
+ assert_equal(c, np.dot(a.filled(0), b.filled(0)))
+ assert_equal(c, dot(a, b))
+ c = dot(b, a, strict=False)
+ assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+ #
+ m = [0, 0, 0, 0, 0, 0]
+ a = masked_array(n, mask=m).reshape(2, 3)
+ b = masked_array(n, mask=m).reshape(3, 2)
+ c = dot(a, b)
+ assert_equal(c.mask, nomask)
+ c = dot(b, a)
+ assert_equal(c.mask, nomask)
+ #
+ a = masked_array(n, mask=[1, 0, 0, 0, 0, 0]).reshape(2, 3)
+ b = masked_array(n, mask=[0, 0, 0, 0, 0, 0]).reshape(3, 2)
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[1, 1], [0, 0]])
+ c = dot(a, b, strict=False)
+ assert_equal(c, np.dot(a.filled(0), b.filled(0)))
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask, [[1, 0, 0], [1, 0, 0], [1, 0, 0]])
+ c = dot(b, a, strict=False)
+ assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+ #
+ a = masked_array(n, mask=[0, 0, 0, 0, 0, 1]).reshape(2, 3)
+ b = masked_array(n, mask=[0, 0, 0, 0, 0, 0]).reshape(3, 2)
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[0, 0], [1, 1]])
+ c = dot(a, b)
+ assert_equal(c, np.dot(a.filled(0), b.filled(0)))
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask, [[0, 0, 1], [0, 0, 1], [0, 0, 1]])
+ c = dot(b, a, strict=False)
+ assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+ #
+ a = masked_array(n, mask=[0, 0, 0, 0, 0, 1]).reshape(2, 3)
+ b = masked_array(n, mask=[0, 0, 1, 0, 0, 0]).reshape(3, 2)
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[1, 0], [1, 1]])
+ c = dot(a, b, strict=False)
+ assert_equal(c, np.dot(a.filled(0), b.filled(0)))
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask, [[0, 0, 1], [1, 1, 1], [0, 0, 1]])
+ c = dot(b, a, strict=False)
+ assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+
+ def test_dot_returns_maskedarray(self):
+ # See gh-6611
+ a = np.eye(3)
+ b = array(a)
+ assert_(type(dot(a, a)) is MaskedArray)
+ assert_(type(dot(a, b)) is MaskedArray)
+ assert_(type(dot(b, a)) is MaskedArray)
+ assert_(type(dot(b, b)) is MaskedArray)
+
+ def test_dot_out(self):
+ a = array(np.eye(3))
+ out = array(np.zeros((3, 3)))
+ res = dot(a, a, out=out)
+ assert_(res is out)
+ assert_equal(a, res)
+
+
+class TestApplyAlongAxis(object):
+ # Tests 2D functions
+ def test_3d(self):
+ a = arange(12.).reshape(2, 2, 3)
+
+ def myfunc(b):
+ return b[1]
+
+ xa = apply_along_axis(myfunc, 2, a)
+ assert_equal(xa, [[1, 4], [7, 10]])
+
+ # Tests kwargs functions
+ def test_3d_kwargs(self):
+ a = arange(12).reshape(2, 2, 3)
+
+ def myfunc(b, offset=0):
+ return b[1+offset]
+
+ xa = apply_along_axis(myfunc, 2, a, offset=1)
+ assert_equal(xa, [[2, 5], [8, 11]])
+
+
+class TestApplyOverAxes(object):
+ # Tests apply_over_axes
+ def test_basic(self):
+ a = arange(24).reshape(2, 3, 4)
+ test = apply_over_axes(np.sum, a, [0, 2])
+ ctrl = np.array([[[60], [92], [124]]])
+ assert_equal(test, ctrl)
+ a[(a % 2).astype(bool)] = masked
+ test = apply_over_axes(np.sum, a, [0, 2])
+ ctrl = np.array([[[28], [44], [60]]])
+ assert_equal(test, ctrl)
+
+
+class TestMedian(object):
+ def test_pytype(self):
+ r = np.ma.median([[np.inf, np.inf], [np.inf, np.inf]], axis=-1)
+ assert_equal(r, np.inf)
+
+ def test_inf(self):
+        # test that the even-length median path (which averages two middle
+        # values) handles inf and fully masked input
+ r = np.ma.median(np.ma.masked_array([[np.inf, np.inf],
+ [np.inf, np.inf]]), axis=-1)
+ assert_equal(r, np.inf)
+ r = np.ma.median(np.ma.masked_array([[np.inf, np.inf],
+ [np.inf, np.inf]]), axis=None)
+ assert_equal(r, np.inf)
+ # all masked
+ r = np.ma.median(np.ma.masked_array([[np.inf, np.inf],
+ [np.inf, np.inf]], mask=True),
+ axis=-1)
+ assert_equal(r.mask, True)
+ r = np.ma.median(np.ma.masked_array([[np.inf, np.inf],
+ [np.inf, np.inf]], mask=True),
+ axis=None)
+ assert_equal(r.mask, True)
+
+ def test_non_masked(self):
+ x = np.arange(9)
+ assert_equal(np.ma.median(x), 4.)
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ x = range(8)
+ assert_equal(np.ma.median(x), 3.5)
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ x = 5
+ assert_equal(np.ma.median(x), 5.)
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ # integer
+ x = np.arange(9 * 8).reshape(9, 8)
+ assert_equal(np.ma.median(x, axis=0), np.median(x, axis=0))
+ assert_equal(np.ma.median(x, axis=1), np.median(x, axis=1))
+ assert_(np.ma.median(x, axis=1) is not MaskedArray)
+ # float
+ x = np.arange(9 * 8.).reshape(9, 8)
+ assert_equal(np.ma.median(x, axis=0), np.median(x, axis=0))
+ assert_equal(np.ma.median(x, axis=1), np.median(x, axis=1))
+ assert_(np.ma.median(x, axis=1) is not MaskedArray)
+
+ def test_docstring_examples(self):
+ "test the examples given in the docstring of ma.median"
+ x = array(np.arange(8), mask=[0]*4 + [1]*4)
+ assert_equal(np.ma.median(x), 1.5)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ x = array(np.arange(10).reshape(2, 5), mask=[0]*6 + [1]*4)
+ assert_equal(np.ma.median(x), 2.5)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ ma_x = np.ma.median(x, axis=-1, overwrite_input=True)
+ assert_equal(ma_x, [2., 5.])
+ assert_equal(ma_x.shape, (2,), "shape mismatch")
+ assert_(type(ma_x) is MaskedArray)
+
+ def test_axis_argument_errors(self):
+ msg = "mask = %s, ndim = %s, axis = %s, overwrite_input = %s"
+ for ndmin in range(5):
+ for mask in [False, True]:
+ x = array(1, ndmin=ndmin, mask=mask)
+
+ # Valid axis values should not raise exception
+ args = itertools.product(range(-ndmin, ndmin), [False, True])
+ for axis, over in args:
+ try:
+ np.ma.median(x, axis=axis, overwrite_input=over)
+ except Exception:
+ raise AssertionError(msg % (mask, ndmin, axis, over))
+
+ # Invalid axis values should raise exception
+ args = itertools.product([-(ndmin + 1), ndmin], [False, True])
+ for axis, over in args:
+ try:
+ np.ma.median(x, axis=axis, overwrite_input=over)
+ except np.AxisError:
+ pass
+ else:
+ raise AssertionError(msg % (mask, ndmin, axis, over))
+
+ def test_masked_0d(self):
+ # Check values
+ x = array(1, mask=False)
+ assert_equal(np.ma.median(x), 1)
+ x = array(1, mask=True)
+ assert_equal(np.ma.median(x), np.ma.masked)
+
+ def test_masked_1d(self):
+ x = array(np.arange(5), mask=True)
+ assert_equal(np.ma.median(x), np.ma.masked)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is np.ma.core.MaskedConstant)
+ x = array(np.arange(5), mask=False)
+ assert_equal(np.ma.median(x), 2.)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ x = array(np.arange(5), mask=[0,1,0,0,0])
+ assert_equal(np.ma.median(x), 2.5)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ x = array(np.arange(5), mask=[0,1,1,1,1])
+ assert_equal(np.ma.median(x), 0.)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ # integer
+ x = array(np.arange(5), mask=[0,1,1,0,0])
+ assert_equal(np.ma.median(x), 3.)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ # float
+ x = array(np.arange(5.), mask=[0,1,1,0,0])
+ assert_equal(np.ma.median(x), 3.)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ # integer
+ x = array(np.arange(6), mask=[0,1,1,1,1,0])
+ assert_equal(np.ma.median(x), 2.5)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ # float
+ x = array(np.arange(6.), mask=[0,1,1,1,1,0])
+ assert_equal(np.ma.median(x), 2.5)
+ assert_equal(np.ma.median(x).shape, (), "shape mismatch")
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+
+ def test_1d_shape_consistency(self):
+ assert_equal(np.ma.median(array([1,2,3],mask=[0,0,0])).shape,
+ np.ma.median(array([1,2,3],mask=[0,1,0])).shape )
+
+ def test_2d(self):
+ # Tests median w/ 2D
+ (n, p) = (101, 30)
+ x = masked_array(np.linspace(-1., 1., n),)
+ x[:10] = x[-10:] = masked
+ z = masked_array(np.empty((n, p), dtype=float))
+ z[:, 0] = x[:]
+ idx = np.arange(len(x))
+ for i in range(1, p):
+ np.random.shuffle(idx)
+ z[:, i] = x[idx]
+ assert_equal(median(z[:, 0]), 0)
+ assert_equal(median(z), 0)
+ assert_equal(median(z, axis=0), np.zeros(p))
+ assert_equal(median(z.T, axis=1), np.zeros(p))
+
+ def test_2d_waxis(self):
+ # Tests median w/ 2D arrays and different axis.
+ x = masked_array(np.arange(30).reshape(10, 3))
+ x[:3] = x[-3:] = masked
+ assert_equal(median(x), 14.5)
+ assert_(type(np.ma.median(x)) is not MaskedArray)
+ assert_equal(median(x, axis=0), [13.5, 14.5, 15.5])
+ assert_(type(np.ma.median(x, axis=0)) is MaskedArray)
+ assert_equal(median(x, axis=1), [0, 0, 0, 10, 13, 16, 19, 0, 0, 0])
+ assert_(type(np.ma.median(x, axis=1)) is MaskedArray)
+ assert_equal(median(x, axis=1).mask, [1, 1, 1, 0, 0, 0, 0, 1, 1, 1])
+
+ def test_3d(self):
+ # Tests median w/ 3D
+ x = np.ma.arange(24).reshape(3, 4, 2)
+ x[x % 3 == 0] = masked
+ assert_equal(median(x, 0), [[12, 9], [6, 15], [12, 9], [18, 15]])
+ x.shape = (4, 3, 2)
+ assert_equal(median(x, 0), [[99, 10], [11, 99], [13, 14]])
+ x = np.ma.arange(24).reshape(4, 3, 2)
+ x[x % 5 == 0] = masked
+ assert_equal(median(x, 0), [[12, 10], [8, 9], [16, 17]])
+
+ def test_neg_axis(self):
+ x = masked_array(np.arange(30).reshape(10, 3))
+ x[:3] = x[-3:] = masked
+ assert_equal(median(x, axis=-1), median(x, axis=1))
+
+ def test_out_1d(self):
+        # integer and float dtypes, even and odd lengths
+ for v in (30, 30., 31, 31.):
+ x = masked_array(np.arange(v))
+ x[:3] = x[-3:] = masked
+ out = masked_array(np.ones(()))
+ r = median(x, out=out)
+ if v == 30:
+ assert_equal(out, 14.5)
+ else:
+ assert_equal(out, 15.)
+ assert_(r is out)
+ assert_(type(r) is MaskedArray)
+
+ def test_out(self):
+        # integer and float dtypes, even and odd lengths along the axis
+ for v in (40, 40., 30, 30.):
+ x = masked_array(np.arange(v).reshape(10, -1))
+ x[:3] = x[-3:] = masked
+ out = masked_array(np.ones(10))
+ r = median(x, axis=1, out=out)
+ if v == 30:
+ e = masked_array([0.]*3 + [10, 13, 16, 19] + [0.]*3,
+ mask=[True] * 3 + [False] * 4 + [True] * 3)
+ else:
+ e = masked_array([0.]*3 + [13.5, 17.5, 21.5, 25.5] + [0.]*3,
+ mask=[True]*3 + [False]*4 + [True]*3)
+ assert_equal(r, e)
+ assert_(r is out)
+ assert_(type(r) is MaskedArray)
+
+ def test_single_non_masked_value_on_axis(self):
+ data = [[1., 0.],
+ [0., 3.],
+ [0., 0.]]
+ masked_arr = np.ma.masked_equal(data, 0)
+ expected = [1., 3.]
+ assert_array_equal(np.ma.median(masked_arr, axis=0),
+ expected)
+
+ def test_nan(self):
+ with suppress_warnings() as w:
+ w.record(RuntimeWarning)
+ for mask in (False, np.zeros(6, dtype=bool)):
+ dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
+ dm.mask = mask
+
+ # scalar result
+ r = np.ma.median(dm, axis=None)
+ assert_(np.isscalar(r))
+ assert_array_equal(r, np.nan)
+ r = np.ma.median(dm.ravel(), axis=0)
+ assert_(np.isscalar(r))
+ assert_array_equal(r, np.nan)
+
+ r = np.ma.median(dm, axis=0)
+ assert_equal(type(r), MaskedArray)
+ assert_array_equal(r, [1, np.nan, 3])
+ r = np.ma.median(dm, axis=1)
+ assert_equal(type(r), MaskedArray)
+ assert_array_equal(r, [np.nan, 2])
+ r = np.ma.median(dm, axis=-1)
+ assert_equal(type(r), MaskedArray)
+ assert_array_equal(r, [np.nan, 2])
+
+ dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
+ dm[:, 2] = np.ma.masked
+ assert_array_equal(np.ma.median(dm, axis=None), np.nan)
+ assert_array_equal(np.ma.median(dm, axis=0), [1, np.nan, 3])
+ assert_array_equal(np.ma.median(dm, axis=1), [np.nan, 1.5])
+ assert_equal([x.category is RuntimeWarning for x in w.log],
+ [True]*13)
+
+ def test_out_nan(self):
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ o = np.ma.masked_array(np.zeros((4,)))
+ d = np.ma.masked_array(np.ones((3, 4)))
+ d[2, 1] = np.nan
+ d[2, 2] = np.ma.masked
+ assert_equal(np.ma.median(d, 0, out=o), o)
+ o = np.ma.masked_array(np.zeros((3,)))
+ assert_equal(np.ma.median(d, 1, out=o), o)
+ o = np.ma.masked_array(np.zeros(()))
+ assert_equal(np.ma.median(d, out=o), o)
+
+ def test_nan_behavior(self):
+ a = np.ma.masked_array(np.arange(24, dtype=float))
+ a[::3] = np.ma.masked
+ a[2] = np.nan
+ with suppress_warnings() as w:
+ w.record(RuntimeWarning)
+ assert_array_equal(np.ma.median(a), np.nan)
+ assert_array_equal(np.ma.median(a, axis=0), np.nan)
+ assert_(w.log[0].category is RuntimeWarning)
+ assert_(w.log[1].category is RuntimeWarning)
+
+ a = np.ma.masked_array(np.arange(24, dtype=float).reshape(2, 3, 4))
+ a.mask = np.arange(a.size) % 2 == 1
+ aorig = a.copy()
+ a[1, 2, 3] = np.nan
+ a[1, 1, 2] = np.nan
+
+ # no axis
+ with suppress_warnings() as w:
+ w.record(RuntimeWarning)
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_array_equal(np.ma.median(a), np.nan)
+ assert_(np.isscalar(np.ma.median(a)))
+ assert_(w.log[0].category is RuntimeWarning)
+
+ # axis0
+ b = np.ma.median(aorig, axis=0)
+ b[2, 3] = np.nan
+ b[1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.ma.median(a, 0), b)
+ assert_equal(len(w), 1)
+
+ # axis1
+ b = np.ma.median(aorig, axis=1)
+ b[1, 3] = np.nan
+ b[1, 2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.ma.median(a, 1), b)
+ assert_equal(len(w), 1)
+
+ # axis02
+ b = np.ma.median(aorig, axis=(0, 2))
+ b[1] = np.nan
+ b[2] = np.nan
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.ma.median(a, (0, 2)), b)
+ assert_equal(len(w), 1)
+
+ def test_ambigous_fill(self):
+ # 255 is max value, used as filler for sort
+ a = np.array([[3, 3, 255], [3, 3, 255]], dtype=np.uint8)
+ a = np.ma.masked_array(a, mask=a == 3)
+ assert_array_equal(np.ma.median(a, axis=1), 255)
+ assert_array_equal(np.ma.median(a, axis=1).mask, False)
+ assert_array_equal(np.ma.median(a, axis=0), a[0])
+ assert_array_equal(np.ma.median(a), 255)
+
+ def test_special(self):
+ for inf in [np.inf, -np.inf]:
+ a = np.array([[inf, np.nan], [np.nan, np.nan]])
+ a = np.ma.masked_array(a, mask=np.isnan(a))
+ assert_equal(np.ma.median(a, axis=0), [inf, np.nan])
+ assert_equal(np.ma.median(a, axis=1), [inf, np.nan])
+ assert_equal(np.ma.median(a), inf)
+
+ a = np.array([[np.nan, np.nan, inf], [np.nan, np.nan, inf]])
+ a = np.ma.masked_array(a, mask=np.isnan(a))
+ assert_array_equal(np.ma.median(a, axis=1), inf)
+ assert_array_equal(np.ma.median(a, axis=1).mask, False)
+ assert_array_equal(np.ma.median(a, axis=0), a[0])
+ assert_array_equal(np.ma.median(a), inf)
+
+ # no mask
+ a = np.array([[inf, inf], [inf, inf]])
+ assert_equal(np.ma.median(a), inf)
+ assert_equal(np.ma.median(a, axis=0), inf)
+ assert_equal(np.ma.median(a, axis=1), inf)
+
+ a = np.array([[inf, 7, -inf, -9],
+ [-10, np.nan, np.nan, 5],
+ [4, np.nan, np.nan, inf]],
+ dtype=np.float32)
+ a = np.ma.masked_array(a, mask=np.isnan(a))
+ if inf > 0:
+ assert_equal(np.ma.median(a, axis=0), [4., 7., -inf, 5.])
+ assert_equal(np.ma.median(a), 4.5)
+ else:
+ assert_equal(np.ma.median(a, axis=0), [-10., 7., -inf, -9.])
+ assert_equal(np.ma.median(a), -2.5)
+ assert_equal(np.ma.median(a, axis=1), [-1., -2.5, inf])
+
+ for i in range(0, 10):
+ for j in range(1, 10):
+ a = np.array([([np.nan] * i) + ([inf] * j)] * 2)
+ a = np.ma.masked_array(a, mask=np.isnan(a))
+ assert_equal(np.ma.median(a), inf)
+ assert_equal(np.ma.median(a, axis=1), inf)
+ assert_equal(np.ma.median(a, axis=0),
+ ([np.nan] * i) + [inf] * j)
+
+ def test_empty(self):
+ # empty arrays
+ a = np.ma.masked_array(np.array([], dtype=float))
+ with suppress_warnings() as w:
+ w.record(RuntimeWarning)
+ assert_array_equal(np.ma.median(a), np.nan)
+ assert_(w.log[0].category is RuntimeWarning)
+
+ # multiple dimensions
+ a = np.ma.masked_array(np.array([], dtype=float, ndmin=3))
+ # no axis
+ with suppress_warnings() as w:
+ w.record(RuntimeWarning)
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_array_equal(np.ma.median(a), np.nan)
+ assert_(w.log[0].category is RuntimeWarning)
+
+ # axis 0 and 1
+ b = np.ma.masked_array(np.array([], dtype=float, ndmin=2))
+ assert_equal(np.ma.median(a, axis=0), b)
+ assert_equal(np.ma.median(a, axis=1), b)
+
+ # axis 2
+ b = np.ma.masked_array(np.array(np.nan, dtype=float, ndmin=2))
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ assert_equal(np.ma.median(a, axis=2), b)
+ assert_(w[0].category is RuntimeWarning)
+
+ def test_object(self):
+ o = np.ma.masked_array(np.arange(7.))
+ assert_(type(np.ma.median(o.astype(object))), float)
+ o[2] = np.nan
+ assert_(type(np.ma.median(o.astype(object))), float)
+
+
+class TestCov(object):
+
+ def setup(self):
+ self.data = array(np.random.rand(12))
+
+ def test_1d_without_missing(self):
+ # Test cov on 1D variable w/o missing values
+ x = self.data
+ assert_almost_equal(np.cov(x), cov(x))
+ assert_almost_equal(np.cov(x, rowvar=False), cov(x, rowvar=False))
+ assert_almost_equal(np.cov(x, rowvar=False, bias=True),
+ cov(x, rowvar=False, bias=True))
+
+ def test_2d_without_missing(self):
+        # Test cov on one 2D variable w/o missing values
+ x = self.data.reshape(3, 4)
+ assert_almost_equal(np.cov(x), cov(x))
+ assert_almost_equal(np.cov(x, rowvar=False), cov(x, rowvar=False))
+ assert_almost_equal(np.cov(x, rowvar=False, bias=True),
+ cov(x, rowvar=False, bias=True))
+
+ def test_1d_with_missing(self):
+        # Test cov on one 1D variable w/ missing values
+ x = self.data
+ x[-1] = masked
+ x -= x.mean()
+ nx = x.compressed()
+ assert_almost_equal(np.cov(nx), cov(x))
+ assert_almost_equal(np.cov(nx, rowvar=False), cov(x, rowvar=False))
+ assert_almost_equal(np.cov(nx, rowvar=False, bias=True),
+ cov(x, rowvar=False, bias=True))
+ #
+ try:
+ cov(x, allow_masked=False)
+ except ValueError:
+ pass
+ #
+ # 2 1D variables w/ missing values
+ nx = x[1:-1]
+ assert_almost_equal(np.cov(nx, nx[::-1]), cov(x, x[::-1]))
+ assert_almost_equal(np.cov(nx, nx[::-1], rowvar=False),
+ cov(x, x[::-1], rowvar=False))
+ assert_almost_equal(np.cov(nx, nx[::-1], rowvar=False, bias=True),
+ cov(x, x[::-1], rowvar=False, bias=True))
+
+ def test_2d_with_missing(self):
+ # Test cov on 2D variable w/ missing value
+ x = self.data
+ x[-1] = masked
+ x = x.reshape(3, 4)
+ valid = np.logical_not(getmaskarray(x)).astype(int)
+ frac = np.dot(valid, valid.T)
+ xf = (x - x.mean(1)[:, None]).filled(0)
+ assert_almost_equal(cov(x),
+ np.cov(xf) * (x.shape[1] - 1) / (frac - 1.))
+ assert_almost_equal(cov(x, bias=True),
+ np.cov(xf, bias=True) * x.shape[1] / frac)
+ frac = np.dot(valid.T, valid)
+ xf = (x - x.mean(0)).filled(0)
+ assert_almost_equal(cov(x, rowvar=False),
+ (np.cov(xf, rowvar=False) *
+ (x.shape[0] - 1) / (frac - 1.)))
+ assert_almost_equal(cov(x, rowvar=False, bias=True),
+ (np.cov(xf, rowvar=False, bias=True) *
+ x.shape[0] / frac))
+
+
+class TestCorrcoef(object):
+
+ def setup(self):
+ self.data = array(np.random.rand(12))
+ self.data2 = array(np.random.rand(12))
+
+ def test_ddof(self):
+ # ddof raises DeprecationWarning
+ x, y = self.data, self.data2
+ expected = np.corrcoef(x)
+ expected2 = np.corrcoef(x, y)
+ with suppress_warnings() as sup:
+ warnings.simplefilter("always")
+ assert_warns(DeprecationWarning, corrcoef, x, ddof=-1)
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ # ddof has no or negligible effect on the function
+ assert_almost_equal(np.corrcoef(x, ddof=0), corrcoef(x, ddof=0))
+ assert_almost_equal(corrcoef(x, ddof=-1), expected)
+ assert_almost_equal(corrcoef(x, y, ddof=-1), expected2)
+ assert_almost_equal(corrcoef(x, ddof=3), expected)
+ assert_almost_equal(corrcoef(x, y, ddof=3), expected2)
+
+ def test_bias(self):
+ x, y = self.data, self.data2
+ expected = np.corrcoef(x)
+ # bias raises DeprecationWarning
+ with suppress_warnings() as sup:
+ warnings.simplefilter("always")
+ assert_warns(DeprecationWarning, corrcoef, x, y, True, False)
+ assert_warns(DeprecationWarning, corrcoef, x, y, True, True)
+ assert_warns(DeprecationWarning, corrcoef, x, bias=False)
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ # bias has no or negligible effect on the function
+ assert_almost_equal(corrcoef(x, bias=1), expected)
+
+ def test_1d_without_missing(self):
+        # Test corrcoef on a 1D variable w/o missing values
+ x = self.data
+ assert_almost_equal(np.corrcoef(x), corrcoef(x))
+ assert_almost_equal(np.corrcoef(x, rowvar=False),
+ corrcoef(x, rowvar=False))
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ assert_almost_equal(np.corrcoef(x, rowvar=False, bias=True),
+ corrcoef(x, rowvar=False, bias=True))
+
+ def test_2d_without_missing(self):
+        # Test corrcoef on one 2D variable w/o missing values
+ x = self.data.reshape(3, 4)
+ assert_almost_equal(np.corrcoef(x), corrcoef(x))
+ assert_almost_equal(np.corrcoef(x, rowvar=False),
+ corrcoef(x, rowvar=False))
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ assert_almost_equal(np.corrcoef(x, rowvar=False, bias=True),
+ corrcoef(x, rowvar=False, bias=True))
+
+ def test_1d_with_missing(self):
+        # Test corrcoef on one 1D variable w/ missing values
+ x = self.data
+ x[-1] = masked
+ x -= x.mean()
+ nx = x.compressed()
+ assert_almost_equal(np.corrcoef(nx), corrcoef(x))
+ assert_almost_equal(np.corrcoef(nx, rowvar=False),
+ corrcoef(x, rowvar=False))
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ assert_almost_equal(np.corrcoef(nx, rowvar=False, bias=True),
+ corrcoef(x, rowvar=False, bias=True))
+ try:
+ corrcoef(x, allow_masked=False)
+ except ValueError:
+ pass
+ # 2 1D variables w/ missing values
+ nx = x[1:-1]
+ assert_almost_equal(np.corrcoef(nx, nx[::-1]), corrcoef(x, x[::-1]))
+ assert_almost_equal(np.corrcoef(nx, nx[::-1], rowvar=False),
+ corrcoef(x, x[::-1], rowvar=False))
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ # ddof and bias have no or negligible effect on the function
+ assert_almost_equal(np.corrcoef(nx, nx[::-1]),
+ corrcoef(x, x[::-1], bias=1))
+ assert_almost_equal(np.corrcoef(nx, nx[::-1]),
+ corrcoef(x, x[::-1], ddof=2))
+
+ def test_2d_with_missing(self):
+ # Test corrcoef on 2D variable w/ missing value
+ x = self.data
+ x[-1] = masked
+ x = x.reshape(3, 4)
+
+ test = corrcoef(x)
+ control = np.corrcoef(x)
+ assert_almost_equal(test[:-1, :-1], control[:-1, :-1])
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ # ddof and bias have no or negligible effect on the function
+ assert_almost_equal(corrcoef(x, ddof=-2)[:-1, :-1],
+ control[:-1, :-1])
+ assert_almost_equal(corrcoef(x, ddof=3)[:-1, :-1],
+ control[:-1, :-1])
+ assert_almost_equal(corrcoef(x, bias=1)[:-1, :-1],
+ control[:-1, :-1])
+
+
+class TestPolynomial(object):
+ #
+ def test_polyfit(self):
+ # Tests polyfit
+ # On ndarrays
+ x = np.random.rand(10)
+ y = np.random.rand(20).reshape(-1, 2)
+ assert_almost_equal(polyfit(x, y, 3), np.polyfit(x, y, 3))
+        # On 1D masked arrays
+ x = x.view(MaskedArray)
+ x[0] = masked
+ y = y.view(MaskedArray)
+ y[0, 0] = y[-1, -1] = masked
+ #
+ (C, R, K, S, D) = polyfit(x, y[:, 0], 3, full=True)
+ (c, r, k, s, d) = np.polyfit(x[1:], y[1:, 0].compressed(), 3,
+ full=True)
+ for (a, a_) in zip((C, R, K, S, D), (c, r, k, s, d)):
+ assert_almost_equal(a, a_)
+ #
+ (C, R, K, S, D) = polyfit(x, y[:, -1], 3, full=True)
+ (c, r, k, s, d) = np.polyfit(x[1:-1], y[1:-1, -1], 3, full=True)
+ for (a, a_) in zip((C, R, K, S, D), (c, r, k, s, d)):
+ assert_almost_equal(a, a_)
+ #
+ (C, R, K, S, D) = polyfit(x, y, 3, full=True)
+ (c, r, k, s, d) = np.polyfit(x[1:-1], y[1:-1,:], 3, full=True)
+ for (a, a_) in zip((C, R, K, S, D), (c, r, k, s, d)):
+ assert_almost_equal(a, a_)
+ #
+ w = np.random.rand(10) + 1
+ wo = w.copy()
+ xs = x[1:-1]
+ ys = y[1:-1]
+ ws = w[1:-1]
+ (C, R, K, S, D) = polyfit(x, y, 3, full=True, w=w)
+ (c, r, k, s, d) = np.polyfit(xs, ys, 3, full=True, w=ws)
+ assert_equal(w, wo)
+ for (a, a_) in zip((C, R, K, S, D), (c, r, k, s, d)):
+ assert_almost_equal(a, a_)
+
+ def test_polyfit_with_masked_NaNs(self):
+ x = np.random.rand(10)
+ y = np.random.rand(20).reshape(-1, 2)
+
+ x[0] = np.nan
+ y[-1,-1] = np.nan
+ x = x.view(MaskedArray)
+ y = y.view(MaskedArray)
+ x[0] = masked
+ y[-1,-1] = masked
+
+ (C, R, K, S, D) = polyfit(x, y, 3, full=True)
+ (c, r, k, s, d) = np.polyfit(x[1:-1], y[1:-1,:], 3, full=True)
+ for (a, a_) in zip((C, R, K, S, D), (c, r, k, s, d)):
+ assert_almost_equal(a, a_)
+
+
+class TestArraySetOps(object):
+
+ def test_unique_onlist(self):
+ # Test unique on list
+ data = [1, 1, 1, 2, 2, 3]
+ test = unique(data, return_index=True, return_inverse=True)
+ assert_(isinstance(test[0], MaskedArray))
+ assert_equal(test[0], masked_array([1, 2, 3], mask=[0, 0, 0]))
+ assert_equal(test[1], [0, 3, 5])
+ assert_equal(test[2], [0, 0, 0, 1, 1, 2])
+
+ def test_unique_onmaskedarray(self):
+ # Test unique on masked data w/use_mask=True
+ data = masked_array([1, 1, 1, 2, 2, 3], mask=[0, 0, 1, 0, 1, 0])
+ test = unique(data, return_index=True, return_inverse=True)
+ assert_equal(test[0], masked_array([1, 2, 3, -1], mask=[0, 0, 0, 1]))
+ assert_equal(test[1], [0, 3, 5, 2])
+ assert_equal(test[2], [0, 0, 3, 1, 3, 2])
+ #
+ data.fill_value = 3
+ data = masked_array(data=[1, 1, 1, 2, 2, 3],
+ mask=[0, 0, 1, 0, 1, 0], fill_value=3)
+ test = unique(data, return_index=True, return_inverse=True)
+ assert_equal(test[0], masked_array([1, 2, 3, -1], mask=[0, 0, 0, 1]))
+ assert_equal(test[1], [0, 3, 5, 2])
+ assert_equal(test[2], [0, 0, 3, 1, 3, 2])
+
+ def test_unique_allmasked(self):
+ # Test all masked
+ data = masked_array([1, 1, 1], mask=True)
+ test = unique(data, return_index=True, return_inverse=True)
+ assert_equal(test[0], masked_array([1, ], mask=[True]))
+ assert_equal(test[1], [0])
+ assert_equal(test[2], [0, 0, 0])
+ #
+ # Test masked
+ data = masked
+ test = unique(data, return_index=True, return_inverse=True)
+ assert_equal(test[0], masked_array(masked))
+ assert_equal(test[1], [0])
+ assert_equal(test[2], [0])
+
+ def test_ediff1d(self):
+        # Tests ediff1d
+ x = masked_array(np.arange(5), mask=[1, 0, 0, 0, 1])
+ control = array([1, 1, 1, 4], mask=[1, 0, 0, 1])
+ test = ediff1d(x)
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+
+ def test_ediff1d_tobegin(self):
+ # Test ediff1d w/ to_begin
+ x = masked_array(np.arange(5), mask=[1, 0, 0, 0, 1])
+ test = ediff1d(x, to_begin=masked)
+ control = array([0, 1, 1, 1, 4], mask=[1, 1, 0, 0, 1])
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+ #
+ test = ediff1d(x, to_begin=[1, 2, 3])
+ control = array([1, 2, 3, 1, 1, 1, 4], mask=[0, 0, 0, 1, 0, 0, 1])
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+
+ def test_ediff1d_toend(self):
+ # Test ediff1d w/ to_end
+ x = masked_array(np.arange(5), mask=[1, 0, 0, 0, 1])
+ test = ediff1d(x, to_end=masked)
+ control = array([1, 1, 1, 4, 0], mask=[1, 0, 0, 1, 1])
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+ #
+ test = ediff1d(x, to_end=[1, 2, 3])
+ control = array([1, 1, 1, 4, 1, 2, 3], mask=[1, 0, 0, 1, 0, 0, 0])
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+
+ def test_ediff1d_tobegin_toend(self):
+ # Test ediff1d w/ to_begin and to_end
+ x = masked_array(np.arange(5), mask=[1, 0, 0, 0, 1])
+ test = ediff1d(x, to_end=masked, to_begin=masked)
+ control = array([0, 1, 1, 1, 4, 0], mask=[1, 1, 0, 0, 1, 1])
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+ #
+ test = ediff1d(x, to_end=[1, 2, 3], to_begin=masked)
+ control = array([0, 1, 1, 1, 4, 1, 2, 3],
+ mask=[1, 1, 0, 0, 1, 0, 0, 0])
+ assert_equal(test, control)
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+
+ def test_ediff1d_ndarray(self):
+ # Test ediff1d w/ a ndarray
+ x = np.arange(5)
+ test = ediff1d(x)
+ control = array([1, 1, 1, 1], mask=[0, 0, 0, 0])
+ assert_equal(test, control)
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+ #
+ test = ediff1d(x, to_end=masked, to_begin=masked)
+ control = array([0, 1, 1, 1, 1, 0], mask=[1, 0, 0, 0, 0, 1])
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test.filled(0), control.filled(0))
+ assert_equal(test.mask, control.mask)
+
+ def test_intersect1d(self):
+ # Test intersect1d
+ x = array([1, 3, 3, 3], mask=[0, 0, 0, 1])
+ y = array([3, 1, 1, 1], mask=[0, 0, 0, 1])
+ test = intersect1d(x, y)
+ control = array([1, 3, -1], mask=[0, 0, 1])
+ assert_equal(test, control)
+
+ def test_setxor1d(self):
+ # Test setxor1d
+ a = array([1, 2, 5, 7, -1], mask=[0, 0, 0, 0, 1])
+ b = array([1, 2, 3, 4, 5, -1], mask=[0, 0, 0, 0, 0, 1])
+ test = setxor1d(a, b)
+ assert_equal(test, array([3, 4, 7]))
+ #
+ a = array([1, 2, 5, 7, -1], mask=[0, 0, 0, 0, 1])
+ b = [1, 2, 3, 4, 5]
+ test = setxor1d(a, b)
+ assert_equal(test, array([3, 4, 7, -1], mask=[0, 0, 0, 1]))
+ #
+ a = array([1, 2, 3])
+ b = array([6, 5, 4])
+ test = setxor1d(a, b)
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test, [1, 2, 3, 4, 5, 6])
+ #
+ a = array([1, 8, 2, 3], mask=[0, 1, 0, 0])
+ b = array([6, 5, 4, 8], mask=[0, 0, 0, 1])
+ test = setxor1d(a, b)
+ assert_(isinstance(test, MaskedArray))
+ assert_equal(test, [1, 2, 3, 4, 5, 6])
+ #
+ assert_array_equal([], setxor1d([], []))
+
+ def test_isin(self):
+ # the tests for in1d cover most of isin's behavior
+ # if in1d is removed, would need to change those tests to test
+ # isin instead.
+ a = np.arange(24).reshape([2, 3, 4])
+ mask = np.zeros([2, 3, 4])
+ mask[1, 2, 0] = 1
+ a = array(a, mask=mask)
+ b = array(data=[0, 10, 20, 30, 1, 3, 11, 22, 33],
+ mask=[0, 1, 0, 1, 0, 1, 0, 1, 0])
+ ec = zeros((2, 3, 4), dtype=bool)
+ ec[0, 0, 0] = True
+ ec[0, 0, 1] = True
+ ec[0, 2, 3] = True
+ c = isin(a, b)
+ assert_(isinstance(c, MaskedArray))
+ assert_array_equal(c, ec)
+        # compare results of np.isin to ma.isin
+ d = np.isin(a, b[~b.mask]) & ~a.mask
+ assert_array_equal(c, d)
+
+ def test_in1d(self):
+ # Test in1d
+ a = array([1, 2, 5, 7, -1], mask=[0, 0, 0, 0, 1])
+ b = array([1, 2, 3, 4, 5, -1], mask=[0, 0, 0, 0, 0, 1])
+ test = in1d(a, b)
+ assert_equal(test, [True, True, True, False, True])
+ #
+ a = array([5, 5, 2, 1, -1], mask=[0, 0, 0, 0, 1])
+ b = array([1, 5, -1], mask=[0, 0, 1])
+ test = in1d(a, b)
+ assert_equal(test, [True, True, False, True, True])
+ #
+ assert_array_equal([], in1d([], []))
+
+ def test_in1d_invert(self):
+ # Test in1d's invert parameter
+ a = array([1, 2, 5, 7, -1], mask=[0, 0, 0, 0, 1])
+ b = array([1, 2, 3, 4, 5, -1], mask=[0, 0, 0, 0, 0, 1])
+ assert_equal(np.invert(in1d(a, b)), in1d(a, b, invert=True))
+
+ a = array([5, 5, 2, 1, -1], mask=[0, 0, 0, 0, 1])
+ b = array([1, 5, -1], mask=[0, 0, 1])
+ assert_equal(np.invert(in1d(a, b)), in1d(a, b, invert=True))
+
+ assert_array_equal([], in1d([], [], invert=True))
+
+ def test_union1d(self):
+ # Test union1d
+ a = array([1, 2, 5, 7, 5, -1], mask=[0, 0, 0, 0, 0, 1])
+ b = array([1, 2, 3, 4, 5, -1], mask=[0, 0, 0, 0, 0, 1])
+ test = union1d(a, b)
+ control = array([1, 2, 3, 4, 5, 7, -1], mask=[0, 0, 0, 0, 0, 0, 1])
+ assert_equal(test, control)
+
+ # Tests gh-10340, arguments to union1d should be
+ # flattened if they are not already 1D
+ x = array([[0, 1, 2], [3, 4, 5]], mask=[[0, 0, 0], [0, 0, 1]])
+ y = array([0, 1, 2, 3, 4], mask=[0, 0, 0, 0, 1])
+ ez = array([0, 1, 2, 3, 4, 5], mask=[0, 0, 0, 0, 0, 1])
+ z = union1d(x, y)
+ assert_equal(z, ez)
+ #
+ assert_array_equal([], union1d([], []))
+
+ def test_setdiff1d(self):
+ # Test setdiff1d
+ a = array([6, 5, 4, 7, 7, 1, 2, 1], mask=[0, 0, 0, 0, 0, 0, 0, 1])
+ b = array([2, 4, 3, 3, 2, 1, 5])
+ test = setdiff1d(a, b)
+ assert_equal(test, array([6, 7, -1], mask=[0, 0, 1]))
+ #
+ a = arange(10)
+ b = arange(8)
+ assert_equal(setdiff1d(a, b), array([8, 9]))
+ a = array([], np.uint32, mask=[])
+ assert_equal(setdiff1d(a, []).dtype, np.uint32)
+
+ def test_setdiff1d_char_array(self):
+        # Test setdiff1d on character arrays
+ a = np.array(['a', 'b', 'c'])
+ b = np.array(['a', 'b', 's'])
+ assert_array_equal(setdiff1d(a, b), np.array(['c']))
+
+
+class TestShapeBase(object):
+
+ def test_atleast_2d(self):
+ # Test atleast_2d
+ a = masked_array([0, 1, 2], mask=[0, 1, 0])
+ b = atleast_2d(a)
+ assert_equal(b.shape, (1, 3))
+ assert_equal(b.mask.shape, b.data.shape)
+ assert_equal(a.shape, (3,))
+ assert_equal(a.mask.shape, a.data.shape)
+ assert_equal(b.mask.shape, b.data.shape)
+
+ def test_shape_scalar(self):
+        # the atleast and diagflat functions should work with scalars
+ # GitHub issue #3367
+ # Additionally, the atleast functions should accept multiple scalars
+ # correctly
+ b = atleast_1d(1.0)
+ assert_equal(b.shape, (1,))
+ assert_equal(b.mask.shape, b.shape)
+ assert_equal(b.data.shape, b.shape)
+
+ b = atleast_1d(1.0, 2.0)
+ for a in b:
+ assert_equal(a.shape, (1,))
+ assert_equal(a.mask.shape, a.shape)
+ assert_equal(a.data.shape, a.shape)
+
+ b = atleast_2d(1.0)
+ assert_equal(b.shape, (1, 1))
+ assert_equal(b.mask.shape, b.shape)
+ assert_equal(b.data.shape, b.shape)
+
+ b = atleast_2d(1.0, 2.0)
+ for a in b:
+ assert_equal(a.shape, (1, 1))
+ assert_equal(a.mask.shape, a.shape)
+ assert_equal(a.data.shape, a.shape)
+
+ b = atleast_3d(1.0)
+ assert_equal(b.shape, (1, 1, 1))
+ assert_equal(b.mask.shape, b.shape)
+ assert_equal(b.data.shape, b.shape)
+
+ b = atleast_3d(1.0, 2.0)
+ for a in b:
+ assert_equal(a.shape, (1, 1, 1))
+ assert_equal(a.mask.shape, a.shape)
+ assert_equal(a.data.shape, a.shape)
+
+
+ b = diagflat(1.0)
+ assert_equal(b.shape, (1, 1))
+ assert_equal(b.mask.shape, b.data.shape)
+
+
+class TestStack(object):
+
+ def test_stack_1d(self):
+ a = masked_array([0, 1, 2], mask=[0, 1, 0])
+ b = masked_array([9, 8, 7], mask=[1, 0, 0])
+
+ c = stack([a, b], axis=0)
+ assert_equal(c.shape, (2, 3))
+ assert_array_equal(a.mask, c[0].mask)
+ assert_array_equal(b.mask, c[1].mask)
+
+ d = vstack([a, b])
+ assert_array_equal(c.data, d.data)
+ assert_array_equal(c.mask, d.mask)
+
+ c = stack([a, b], axis=1)
+ assert_equal(c.shape, (3, 2))
+ assert_array_equal(a.mask, c[:, 0].mask)
+ assert_array_equal(b.mask, c[:, 1].mask)
+
+ def test_stack_masks(self):
+ a = masked_array([0, 1, 2], mask=True)
+ b = masked_array([9, 8, 7], mask=False)
+
+ c = stack([a, b], axis=0)
+ assert_equal(c.shape, (2, 3))
+ assert_array_equal(a.mask, c[0].mask)
+ assert_array_equal(b.mask, c[1].mask)
+
+ d = vstack([a, b])
+ assert_array_equal(c.data, d.data)
+ assert_array_equal(c.mask, d.mask)
+
+ c = stack([a, b], axis=1)
+ assert_equal(c.shape, (3, 2))
+ assert_array_equal(a.mask, c[:, 0].mask)
+ assert_array_equal(b.mask, c[:, 1].mask)
+
+ def test_stack_nd(self):
+ # 2D
+ shp = (3, 2)
+ d1 = np.random.randint(0, 10, shp)
+ d2 = np.random.randint(0, 10, shp)
+ m1 = np.random.randint(0, 2, shp).astype(bool)
+ m2 = np.random.randint(0, 2, shp).astype(bool)
+ a1 = masked_array(d1, mask=m1)
+ a2 = masked_array(d2, mask=m2)
+
+ c = stack([a1, a2], axis=0)
+ c_shp = (2,) + shp
+ assert_equal(c.shape, c_shp)
+ assert_array_equal(a1.mask, c[0].mask)
+ assert_array_equal(a2.mask, c[1].mask)
+
+ c = stack([a1, a2], axis=-1)
+ c_shp = shp + (2,)
+ assert_equal(c.shape, c_shp)
+ assert_array_equal(a1.mask, c[..., 0].mask)
+ assert_array_equal(a2.mask, c[..., 1].mask)
+
+ # 4D
+ shp = (3, 2, 4, 5,)
+ d1 = np.random.randint(0, 10, shp)
+ d2 = np.random.randint(0, 10, shp)
+ m1 = np.random.randint(0, 2, shp).astype(bool)
+ m2 = np.random.randint(0, 2, shp).astype(bool)
+ a1 = masked_array(d1, mask=m1)
+ a2 = masked_array(d2, mask=m2)
+
+ c = stack([a1, a2], axis=0)
+ c_shp = (2,) + shp
+ assert_equal(c.shape, c_shp)
+ assert_array_equal(a1.mask, c[0].mask)
+ assert_array_equal(a2.mask, c[1].mask)
+
+ c = stack([a1, a2], axis=-1)
+ c_shp = shp + (2,)
+ assert_equal(c.shape, c_shp)
+ assert_array_equal(a1.mask, c[..., 0].mask)
+ assert_array_equal(a2.mask, c[..., 1].mask)
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_mrecords.py b/contrib/python/numpy/py2/numpy/ma/tests/test_mrecords.py
new file mode 100644
index 00000000000..dbbf1c8a1e8
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_mrecords.py
@@ -0,0 +1,495 @@
+# pylint: disable-msg=W0611, W0612, W0511,R0201
+"""Test suite for mrecords.
+
+:author: Pierre Gerard-Marchant
+:contact: pierregm_at_uga_dot_edu
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+import numpy.ma as ma
+from numpy import recarray
+from numpy.ma import masked, nomask
+from numpy.testing import temppath
+from numpy.core.records import (
+ fromrecords as recfromrecords, fromarrays as recfromarrays
+ )
+from numpy.ma.mrecords import (
+ MaskedRecords, mrecarray, fromarrays, fromtextfile, fromrecords,
+ addfield
+ )
+from numpy.ma.testutils import (
+ assert_, assert_equal,
+ assert_equal_records,
+ )
+from numpy.core.numeric import pickle
+
+
+class TestMRecords(object):
+
+ ilist = [1, 2, 3, 4, 5]
+ flist = [1.1, 2.2, 3.3, 4.4, 5.5]
+ slist = [b'one', b'two', b'three', b'four', b'five']
+ ddtype = [('a', int), ('b', float), ('c', '|S8')]
+ mask = [0, 1, 0, 0, 1]
+ base = ma.array(list(zip(ilist, flist, slist)), mask=mask, dtype=ddtype)
+
+ def test_byview(self):
+ # Test creation by view
+ base = self.base
+ mbase = base.view(mrecarray)
+ assert_equal(mbase.recordmask, base.recordmask)
+ assert_equal_records(mbase._mask, base._mask)
+ assert_(isinstance(mbase._data, recarray))
+ assert_equal_records(mbase._data, base._data.view(recarray))
+ for field in ('a', 'b', 'c'):
+ assert_equal(base[field], mbase[field])
+ assert_equal_records(mbase.view(mrecarray), mbase)
+
+ def test_get(self):
+ # Tests fields retrieval
+ base = self.base.copy()
+ mbase = base.view(mrecarray)
+ # As fields..........
+ for field in ('a', 'b', 'c'):
+ assert_equal(getattr(mbase, field), mbase[field])
+ assert_equal(base[field], mbase[field])
+ # as elements .......
+ mbase_first = mbase[0]
+ assert_(isinstance(mbase_first, mrecarray))
+ assert_equal(mbase_first.dtype, mbase.dtype)
+ assert_equal(mbase_first.tolist(), (1, 1.1, b'one'))
+ # Used to be mask, now it's recordmask
+ assert_equal(mbase_first.recordmask, nomask)
+ assert_equal(mbase_first._mask.item(), (False, False, False))
+ assert_equal(mbase_first['a'], mbase['a'][0])
+ mbase_last = mbase[-1]
+ assert_(isinstance(mbase_last, mrecarray))
+ assert_equal(mbase_last.dtype, mbase.dtype)
+ assert_equal(mbase_last.tolist(), (None, None, None))
+ # Used to be mask, now it's recordmask
+ assert_equal(mbase_last.recordmask, True)
+ assert_equal(mbase_last._mask.item(), (True, True, True))
+ assert_equal(mbase_last['a'], mbase['a'][-1])
+ assert_((mbase_last['a'] is masked))
+ # as slice ..........
+ mbase_sl = mbase[:2]
+ assert_(isinstance(mbase_sl, mrecarray))
+ assert_equal(mbase_sl.dtype, mbase.dtype)
+ # Used to be mask, now it's recordmask
+ assert_equal(mbase_sl.recordmask, [0, 1])
+ assert_equal_records(mbase_sl.mask,
+ np.array([(False, False, False),
+ (True, True, True)],
+ dtype=mbase._mask.dtype))
+ assert_equal_records(mbase_sl, base[:2].view(mrecarray))
+ for field in ('a', 'b', 'c'):
+ assert_equal(getattr(mbase_sl, field), base[:2][field])
+
+ def test_set_fields(self):
+ # Tests setting fields.
+ base = self.base.copy()
+ mbase = base.view(mrecarray)
+ mbase = mbase.copy()
+ mbase.fill_value = (999999, 1e20, 'N/A')
+        # Change the data; the mask should be preserved
+ mbase.a._data[:] = 5
+ assert_equal(mbase['a']._data, [5, 5, 5, 5, 5])
+ assert_equal(mbase['a']._mask, [0, 1, 0, 0, 1])
+ # Change the elements, and the mask will follow
+ mbase.a = 1
+ assert_equal(mbase['a']._data, [1]*5)
+ assert_equal(ma.getmaskarray(mbase['a']), [0]*5)
+        # Used to be _mask, now it's recordmask
+ assert_equal(mbase.recordmask, [False]*5)
+ assert_equal(mbase._mask.tolist(),
+ np.array([(0, 0, 0),
+ (0, 1, 1),
+ (0, 0, 0),
+ (0, 0, 0),
+ (0, 1, 1)],
+ dtype=bool))
+ # Set a field to mask ........................
+ mbase.c = masked
+        # Used to be mask, and now it's still mask!
+ assert_equal(mbase.c.mask, [1]*5)
+ assert_equal(mbase.c.recordmask, [1]*5)
+ assert_equal(ma.getmaskarray(mbase['c']), [1]*5)
+ assert_equal(ma.getdata(mbase['c']), [b'N/A']*5)
+ assert_equal(mbase._mask.tolist(),
+ np.array([(0, 0, 1),
+ (0, 1, 1),
+ (0, 0, 1),
+ (0, 0, 1),
+ (0, 1, 1)],
+ dtype=bool))
+ # Set fields by slices .......................
+ mbase = base.view(mrecarray).copy()
+ mbase.a[3:] = 5
+ assert_equal(mbase.a, [1, 2, 3, 5, 5])
+ assert_equal(mbase.a._mask, [0, 1, 0, 0, 0])
+ mbase.b[3:] = masked
+ assert_equal(mbase.b, base['b'])
+ assert_equal(mbase.b._mask, [0, 1, 0, 1, 1])
+ # Set fields globally..........................
+ ndtype = [('alpha', '|S1'), ('num', int)]
+ data = ma.array([('a', 1), ('b', 2), ('c', 3)], dtype=ndtype)
+ rdata = data.view(MaskedRecords)
+ val = ma.array([10, 20, 30], mask=[1, 0, 0])
+
+ rdata['num'] = val
+ assert_equal(rdata.num, val)
+ assert_equal(rdata.num.mask, [1, 0, 0])
+
+ def test_set_fields_mask(self):
+ # Tests setting the mask of a field.
+ base = self.base.copy()
+        # This one already has a mask....
+ mbase = base.view(mrecarray)
+ mbase['a'][-2] = masked
+ assert_equal(mbase.a, [1, 2, 3, 4, 5])
+ assert_equal(mbase.a._mask, [0, 1, 0, 1, 1])
+        # This one does not have a mask yet
+ mbase = fromarrays([np.arange(5), np.random.rand(5)],
+ dtype=[('a', int), ('b', float)])
+ mbase['a'][-2] = masked
+ assert_equal(mbase.a, [0, 1, 2, 3, 4])
+ assert_equal(mbase.a._mask, [0, 0, 0, 1, 0])
+
+ def test_set_mask(self):
+ base = self.base.copy()
+ mbase = base.view(mrecarray)
+ # Set the mask to True .......................
+ mbase.mask = masked
+ assert_equal(ma.getmaskarray(mbase['b']), [1]*5)
+ assert_equal(mbase['a']._mask, mbase['b']._mask)
+ assert_equal(mbase['a']._mask, mbase['c']._mask)
+ assert_equal(mbase._mask.tolist(),
+ np.array([(1, 1, 1)]*5, dtype=bool))
+ # Delete the mask ............................
+ mbase.mask = nomask
+ assert_equal(ma.getmaskarray(mbase['c']), [0]*5)
+ assert_equal(mbase._mask.tolist(),
+ np.array([(0, 0, 0)]*5, dtype=bool))
+
+ def test_set_mask_fromarray(self):
+ base = self.base.copy()
+ mbase = base.view(mrecarray)
+ # Sets the mask w/ an array
+ mbase.mask = [1, 0, 0, 0, 1]
+ assert_equal(mbase.a.mask, [1, 0, 0, 0, 1])
+ assert_equal(mbase.b.mask, [1, 0, 0, 0, 1])
+ assert_equal(mbase.c.mask, [1, 0, 0, 0, 1])
+ # Yay, once more !
+ mbase.mask = [0, 0, 0, 0, 1]
+ assert_equal(mbase.a.mask, [0, 0, 0, 0, 1])
+ assert_equal(mbase.b.mask, [0, 0, 0, 0, 1])
+ assert_equal(mbase.c.mask, [0, 0, 0, 0, 1])
+
+ def test_set_mask_fromfields(self):
+ mbase = self.base.copy().view(mrecarray)
+
+ nmask = np.array(
+ [(0, 1, 0), (0, 1, 0), (1, 0, 1), (1, 0, 1), (0, 0, 0)],
+ dtype=[('a', bool), ('b', bool), ('c', bool)])
+ mbase.mask = nmask
+ assert_equal(mbase.a.mask, [0, 0, 1, 1, 0])
+ assert_equal(mbase.b.mask, [1, 1, 0, 0, 0])
+ assert_equal(mbase.c.mask, [0, 0, 1, 1, 0])
+ # Reinitialize and redo
+ mbase.mask = False
+ mbase.fieldmask = nmask
+ assert_equal(mbase.a.mask, [0, 0, 1, 1, 0])
+ assert_equal(mbase.b.mask, [1, 1, 0, 0, 0])
+ assert_equal(mbase.c.mask, [0, 0, 1, 1, 0])
+
+ def test_set_elements(self):
+ base = self.base.copy()
+ # Set an element to mask .....................
+ mbase = base.view(mrecarray).copy()
+ mbase[-2] = masked
+ assert_equal(
+ mbase._mask.tolist(),
+ np.array([(0, 0, 0), (1, 1, 1), (0, 0, 0), (1, 1, 1), (1, 1, 1)],
+ dtype=bool))
+ # Used to be mask, now it's recordmask!
+ assert_equal(mbase.recordmask, [0, 1, 0, 1, 1])
+ # Set slices .................................
+ mbase = base.view(mrecarray).copy()
+ mbase[:2] = (5, 5, 5)
+ assert_equal(mbase.a._data, [5, 5, 3, 4, 5])
+ assert_equal(mbase.a._mask, [0, 0, 0, 0, 1])
+ assert_equal(mbase.b._data, [5., 5., 3.3, 4.4, 5.5])
+ assert_equal(mbase.b._mask, [0, 0, 0, 0, 1])
+ assert_equal(mbase.c._data,
+ [b'5', b'5', b'three', b'four', b'five'])
+ assert_equal(mbase.b._mask, [0, 0, 0, 0, 1])
+
+ mbase = base.view(mrecarray).copy()
+ mbase[:2] = masked
+ assert_equal(mbase.a._data, [1, 2, 3, 4, 5])
+ assert_equal(mbase.a._mask, [1, 1, 0, 0, 1])
+ assert_equal(mbase.b._data, [1.1, 2.2, 3.3, 4.4, 5.5])
+ assert_equal(mbase.b._mask, [1, 1, 0, 0, 1])
+ assert_equal(mbase.c._data,
+ [b'one', b'two', b'three', b'four', b'five'])
+ assert_equal(mbase.b._mask, [1, 1, 0, 0, 1])
+
+ def test_setslices_hardmask(self):
+ # Tests setting slices w/ hardmask.
+ base = self.base.copy()
+ mbase = base.view(mrecarray)
+ mbase.harden_mask()
+ try:
+ mbase[-2:] = (5, 5, 5)
+ assert_equal(mbase.a._data, [1, 2, 3, 5, 5])
+ assert_equal(mbase.b._data, [1.1, 2.2, 3.3, 5, 5.5])
+ assert_equal(mbase.c._data,
+ [b'one', b'two', b'three', b'5', b'five'])
+ assert_equal(mbase.a._mask, [0, 1, 0, 0, 1])
+ assert_equal(mbase.b._mask, mbase.a._mask)
+ assert_equal(mbase.b._mask, mbase.c._mask)
+ except NotImplementedError:
+ # OK, not implemented yet...
+ pass
+ except AssertionError:
+ raise
+ else:
+ raise Exception("Flexible hard masks should be supported !")
+ # Not using a tuple should crash
+ try:
+ mbase[-2:] = 3
+ except (NotImplementedError, TypeError):
+ pass
+ else:
+ raise TypeError("Should have expected a readable buffer object!")
+
+ def test_hardmask(self):
+ # Test hardmask
+ base = self.base.copy()
+ mbase = base.view(mrecarray)
+ mbase.harden_mask()
+ assert_(mbase._hardmask)
+ mbase.mask = nomask
+ assert_equal_records(mbase._mask, base._mask)
+ mbase.soften_mask()
+ assert_(not mbase._hardmask)
+ mbase.mask = nomask
+ # So, the mask of a field is no longer set to nomask...
+ assert_equal_records(mbase._mask,
+ ma.make_mask_none(base.shape, base.dtype))
+ assert_(ma.make_mask(mbase['b']._mask) is nomask)
+ assert_equal(mbase['a']._mask, mbase['b']._mask)
+
+ def test_pickling(self):
+ # Test pickling
+ base = self.base.copy()
+ mrec = base.view(mrecarray)
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ _ = pickle.dumps(mrec, protocol=proto)
+ mrec_ = pickle.loads(_)
+ assert_equal(mrec_.dtype, mrec.dtype)
+ assert_equal_records(mrec_._data, mrec._data)
+ assert_equal(mrec_._mask, mrec._mask)
+ assert_equal_records(mrec_._mask, mrec._mask)
+
+ def test_filled(self):
+ # Test filling the array
+ _a = ma.array([1, 2, 3], mask=[0, 0, 1], dtype=int)
+ _b = ma.array([1.1, 2.2, 3.3], mask=[0, 0, 1], dtype=float)
+ _c = ma.array(['one', 'two', 'three'], mask=[0, 0, 1], dtype='|S8')
+ ddtype = [('a', int), ('b', float), ('c', '|S8')]
+ mrec = fromarrays([_a, _b, _c], dtype=ddtype,
+ fill_value=(99999, 99999., 'N/A'))
+ mrecfilled = mrec.filled()
+ assert_equal(mrecfilled['a'], np.array((1, 2, 99999), dtype=int))
+ assert_equal(mrecfilled['b'], np.array((1.1, 2.2, 99999.),
+ dtype=float))
+ assert_equal(mrecfilled['c'], np.array(('one', 'two', 'N/A'),
+ dtype='|S8'))
+
+ def test_tolist(self):
+ # Test tolist.
+ _a = ma.array([1, 2, 3], mask=[0, 0, 1], dtype=int)
+ _b = ma.array([1.1, 2.2, 3.3], mask=[0, 0, 1], dtype=float)
+ _c = ma.array(['one', 'two', 'three'], mask=[1, 0, 0], dtype='|S8')
+ ddtype = [('a', int), ('b', float), ('c', '|S8')]
+ mrec = fromarrays([_a, _b, _c], dtype=ddtype,
+ fill_value=(99999, 99999., 'N/A'))
+
+ assert_equal(mrec.tolist(),
+ [(1, 1.1, None), (2, 2.2, b'two'),
+ (None, None, b'three')])
+
+ def test_withnames(self):
+ # Test the creation w/ format and names
+ x = mrecarray(1, formats=float, names='base')
+ x[0]['base'] = 10
+ assert_equal(x['base'][0], 10)
+
+ def test_exotic_formats(self):
+ # Test that 'exotic' formats are processed properly
+ easy = mrecarray(1, dtype=[('i', int), ('s', '|S8'), ('f', float)])
+ easy[0] = masked
+ assert_equal(easy.filled(1).item(), (1, b'1', 1.))
+
+ solo = mrecarray(1, dtype=[('f0', '<f8', (2, 2))])
+ solo[0] = masked
+ assert_equal(solo.filled(1).item(),
+ np.array((1,), dtype=solo.dtype).item())
+
+ mult = mrecarray(2, dtype="i4, (2,3)float, float")
+ mult[0] = masked
+ mult[1] = (1, 1, 1)
+ mult.filled(0)
+ assert_equal_records(mult.filled(0),
+ np.array([(0, 0, 0), (1, 1, 1)],
+ dtype=mult.dtype))
+
+
+class TestView(object):
+
+ def setup(self):
+ (a, b) = (np.arange(10), np.random.rand(10))
+ ndtype = [('a', float), ('b', float)]
+ arr = np.array(list(zip(a, b)), dtype=ndtype)
+
+ mrec = fromarrays([a, b], dtype=ndtype, fill_value=(-9., -99.))
+ mrec.mask[3] = (False, True)
+ self.data = (mrec, a, b, arr)
+
+ def test_view_by_itself(self):
+ (mrec, a, b, arr) = self.data
+ test = mrec.view()
+ assert_(isinstance(test, MaskedRecords))
+ assert_equal_records(test, mrec)
+ assert_equal_records(test._mask, mrec._mask)
+
+ def test_view_simple_dtype(self):
+ (mrec, a, b, arr) = self.data
+ ntype = (float, 2)
+ test = mrec.view(ntype)
+ assert_(isinstance(test, ma.MaskedArray))
+ assert_equal(test, np.array(list(zip(a, b)), dtype=float))
+ assert_(test[3, 1] is ma.masked)
+
+ def test_view_flexible_type(self):
+ (mrec, a, b, arr) = self.data
+ alttype = [('A', float), ('B', float)]
+ test = mrec.view(alttype)
+ assert_(isinstance(test, MaskedRecords))
+ assert_equal_records(test, arr.view(alttype))
+ assert_(test['B'][3] is masked)
+ assert_equal(test.dtype, np.dtype(alttype))
+ assert_(test._fill_value is None)
+
+
+##############################################################################
+class TestMRecordsImport(object):
+
+ _a = ma.array([1, 2, 3], mask=[0, 0, 1], dtype=int)
+ _b = ma.array([1.1, 2.2, 3.3], mask=[0, 0, 1], dtype=float)
+ _c = ma.array([b'one', b'two', b'three'],
+ mask=[0, 0, 1], dtype='|S8')
+ ddtype = [('a', int), ('b', float), ('c', '|S8')]
+ mrec = fromarrays([_a, _b, _c], dtype=ddtype,
+ fill_value=(b'99999', b'99999.',
+ b'N/A'))
+ nrec = recfromarrays((_a._data, _b._data, _c._data), dtype=ddtype)
+ data = (mrec, nrec, ddtype)
+
+ def test_fromarrays(self):
+ _a = ma.array([1, 2, 3], mask=[0, 0, 1], dtype=int)
+ _b = ma.array([1.1, 2.2, 3.3], mask=[0, 0, 1], dtype=float)
+ _c = ma.array(['one', 'two', 'three'], mask=[0, 0, 1], dtype='|S8')
+ (mrec, nrec, _) = self.data
+ for (f, l) in zip(('a', 'b', 'c'), (_a, _b, _c)):
+ assert_equal(getattr(mrec, f)._mask, l._mask)
+ # One record only
+ _x = ma.array([1, 1.1, 'one'], mask=[1, 0, 0],)
+ assert_equal_records(fromarrays(_x, dtype=mrec.dtype), mrec[0])
+
+ def test_fromrecords(self):
+ # Test construction from records.
+ (mrec, nrec, ddtype) = self.data
+ #......
+ palist = [(1, 'abc', 3.7000002861022949, 0),
+ (2, 'xy', 6.6999998092651367, 1),
+ (0, ' ', 0.40000000596046448, 0)]
+ pa = recfromrecords(palist, names='c1, c2, c3, c4')
+ mpa = fromrecords(palist, names='c1, c2, c3, c4')
+ assert_equal_records(pa, mpa)
+ #.....
+ _mrec = fromrecords(nrec)
+ assert_equal(_mrec.dtype, mrec.dtype)
+ for field in _mrec.dtype.names:
+ assert_equal(getattr(_mrec, field), getattr(mrec._data, field))
+
+ _mrec = fromrecords(nrec.tolist(), names='c1,c2,c3')
+ assert_equal(_mrec.dtype, [('c1', int), ('c2', float), ('c3', '|S5')])
+ for (f, n) in zip(('c1', 'c2', 'c3'), ('a', 'b', 'c')):
+ assert_equal(getattr(_mrec, f), getattr(mrec._data, n))
+
+ _mrec = fromrecords(mrec)
+ assert_equal(_mrec.dtype, mrec.dtype)
+ assert_equal_records(_mrec._data, mrec.filled())
+ assert_equal_records(_mrec._mask, mrec._mask)
+
+ def test_fromrecords_wmask(self):
+ # Tests construction from records w/ mask.
+ (mrec, nrec, ddtype) = self.data
+
+ _mrec = fromrecords(nrec.tolist(), dtype=ddtype, mask=[0, 1, 0,])
+ assert_equal_records(_mrec._data, mrec._data)
+ assert_equal(_mrec._mask.tolist(), [(0, 0, 0), (1, 1, 1), (0, 0, 0)])
+
+ _mrec = fromrecords(nrec.tolist(), dtype=ddtype, mask=True)
+ assert_equal_records(_mrec._data, mrec._data)
+ assert_equal(_mrec._mask.tolist(), [(1, 1, 1), (1, 1, 1), (1, 1, 1)])
+
+ _mrec = fromrecords(nrec.tolist(), dtype=ddtype, mask=mrec._mask)
+ assert_equal_records(_mrec._data, mrec._data)
+ assert_equal(_mrec._mask.tolist(), mrec._mask.tolist())
+
+ _mrec = fromrecords(nrec.tolist(), dtype=ddtype,
+ mask=mrec._mask.tolist())
+ assert_equal_records(_mrec._data, mrec._data)
+ assert_equal(_mrec._mask.tolist(), mrec._mask.tolist())
+
+ def test_fromtextfile(self):
+ # Tests reading from a text file.
+ fcontent = (
+"""#
+'One (S)','Two (I)','Three (F)','Four (M)','Five (-)','Six (C)'
+'strings',1,1.0,'mixed column',,1
+'with embedded "double quotes"',2,2.0,1.0,,1
+'strings',3,3.0E5,3,,1
+'strings',4,-1e-10,,,1
+""")
+ with temppath() as path:
+ with open(path, 'w') as f:
+ f.write(fcontent)
+ mrectxt = fromtextfile(path, delimitor=',', varnames='ABCDEFG')
+ assert_(isinstance(mrectxt, MaskedRecords))
+ assert_equal(mrectxt.F, [1, 1, 1, 1])
+ assert_equal(mrectxt.E._mask, [1, 1, 1, 1])
+ assert_equal(mrectxt.C, [1, 2, 3.e+5, -1e-10])
+
+ def test_addfield(self):
+ # Tests addfield
+ (mrec, nrec, ddtype) = self.data
+ (d, m) = ([100, 200, 300], [1, 0, 0])
+ mrec = addfield(mrec, ma.array(d, mask=m))
+ assert_equal(mrec.f3, d)
+ assert_equal(mrec.f3._mask, m)
+
+
+def test_record_array_with_object_field():
+ # Trac #1839
+ y = ma.masked_array(
+ [(1, '2'), (3, '4')],
+ mask=[(0, 0), (0, 1)],
+ dtype=[('a', int), ('b', object)])
+ # getting an item used to fail
+ y[1]
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_old_ma.py b/contrib/python/numpy/py2/numpy/ma/tests/test_old_ma.py
new file mode 100644
index 00000000000..2978be22cab
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_old_ma.py
@@ -0,0 +1,860 @@
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.core.umath as umath
+import numpy.core.fromnumeric as fromnumeric
+from numpy.testing import (
+ assert_, assert_raises, assert_equal,
+ )
+from numpy.ma import (
+ MaskType, MaskedArray, absolute, add, all, allclose, allequal, alltrue,
+ arange, arccos, arcsin, arctan, arctan2, array, average, choose,
+ concatenate, conjugate, cos, cosh, count, divide, equal, exp, filled,
+ getmask, greater, greater_equal, inner, isMaskedArray, less,
+ less_equal, log, log10, make_mask, masked, masked_array, masked_equal,
+ masked_greater, masked_greater_equal, masked_inside, masked_less,
+ masked_less_equal, masked_not_equal, masked_outside,
+ masked_print_option, masked_values, masked_where, maximum, minimum,
+ multiply, nomask, nonzero, not_equal, ones, outer, product, put, ravel,
+ repeat, resize, shape, sin, sinh, sometrue, sort, sqrt, subtract, sum,
+ take, tan, tanh, transpose, where, zeros,
+ )
+from numpy.core.numeric import pickle
+
+pi = np.pi
+
+
+def eq(v, w, msg=''):
+ result = allclose(v, w)
+ if not result:
+ print("Not eq:%s\n%s\n----%s" % (msg, str(v), str(w)))
+ return result
+
+
+class TestMa(object):
+
+ def setup(self):
+ x = np.array([1., 1., 1., -2., pi/2.0, 4., 5., -10., 10., 1., 2., 3.])
+ y = np.array([5., 0., 3., 2., -1., -4., 0., -10., 10., 1., 0., 3.])
+ a10 = 10.
+ m1 = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ m2 = [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1]
+ xm = array(x, mask=m1)
+ ym = array(y, mask=m2)
+ z = np.array([-.5, 0., .5, .8])
+ zm = array(z, mask=[0, 1, 0, 0])
+ xf = np.where(m1, 1e+20, x)
+ s = x.shape
+ xm.set_fill_value(1e+20)
+ self.d = (x, y, a10, m1, m2, xm, ym, z, zm, xf, s)
+
+ def test_testBasic1d(self):
+ # Test of basic array creation and properties in 1 dimension.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ assert_(not isMaskedArray(x))
+ assert_(isMaskedArray(xm))
+ assert_equal(shape(xm), s)
+ assert_equal(xm.shape, s)
+ assert_equal(xm.dtype, x.dtype)
+ assert_equal(xm.size, reduce(lambda x, y:x * y, s))
+ assert_equal(count(xm), len(m1) - reduce(lambda x, y:x + y, m1))
+ assert_(eq(xm, xf))
+ assert_(eq(filled(xm, 1.e20), xf))
+ assert_(eq(x, xm))
+
+ def test_testBasic2d(self):
+ # Test of basic array creation and properties in 2 dimensions.
+ for s in [(4, 3), (6, 2)]:
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ x.shape = s
+ y.shape = s
+ xm.shape = s
+ ym.shape = s
+ xf.shape = s
+
+ assert_(not isMaskedArray(x))
+ assert_(isMaskedArray(xm))
+ assert_equal(shape(xm), s)
+ assert_equal(xm.shape, s)
+ assert_equal(xm.size, reduce(lambda x, y:x * y, s))
+ assert_equal(count(xm),
+ len(m1) - reduce(lambda x, y:x + y, m1))
+ assert_(eq(xm, xf))
+ assert_(eq(filled(xm, 1.e20), xf))
+ assert_(eq(x, xm))
+ self.setup()
+
+ def test_testArithmetic(self):
+ # Test of basic arithmetic.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ a2d = array([[1, 2], [0, 4]])
+ a2dm = masked_array(a2d, [[0, 0], [1, 0]])
+ assert_(eq(a2d * a2d, a2d * a2dm))
+ assert_(eq(a2d + a2d, a2d + a2dm))
+ assert_(eq(a2d - a2d, a2d - a2dm))
+ for s in [(12,), (4, 3), (2, 6)]:
+ x = x.reshape(s)
+ y = y.reshape(s)
+ xm = xm.reshape(s)
+ ym = ym.reshape(s)
+ xf = xf.reshape(s)
+ assert_(eq(-x, -xm))
+ assert_(eq(x + y, xm + ym))
+ assert_(eq(x - y, xm - ym))
+ assert_(eq(x * y, xm * ym))
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_(eq(x / y, xm / ym))
+ assert_(eq(a10 + y, a10 + ym))
+ assert_(eq(a10 - y, a10 - ym))
+ assert_(eq(a10 * y, a10 * ym))
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_(eq(a10 / y, a10 / ym))
+ assert_(eq(x + a10, xm + a10))
+ assert_(eq(x - a10, xm - a10))
+ assert_(eq(x * a10, xm * a10))
+ assert_(eq(x / a10, xm / a10))
+ assert_(eq(x ** 2, xm ** 2))
+ assert_(eq(abs(x) ** 2.5, abs(xm) ** 2.5))
+ assert_(eq(x ** y, xm ** ym))
+ assert_(eq(np.add(x, y), add(xm, ym)))
+ assert_(eq(np.subtract(x, y), subtract(xm, ym)))
+ assert_(eq(np.multiply(x, y), multiply(xm, ym)))
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_(eq(np.divide(x, y), divide(xm, ym)))
+
+ def test_testMixedArithmetic(self):
+ na = np.array([1])
+ ma = array([1])
+ assert_(isinstance(na + ma, MaskedArray))
+ assert_(isinstance(ma + na, MaskedArray))
+
+ def test_testUfuncs1(self):
+ # Test various functions such as sin, cos.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ assert_(eq(np.cos(x), cos(xm)))
+ assert_(eq(np.cosh(x), cosh(xm)))
+ assert_(eq(np.sin(x), sin(xm)))
+ assert_(eq(np.sinh(x), sinh(xm)))
+ assert_(eq(np.tan(x), tan(xm)))
+ assert_(eq(np.tanh(x), tanh(xm)))
+ with np.errstate(divide='ignore', invalid='ignore'):
+ assert_(eq(np.sqrt(abs(x)), sqrt(xm)))
+ assert_(eq(np.log(abs(x)), log(xm)))
+ assert_(eq(np.log10(abs(x)), log10(xm)))
+ assert_(eq(np.exp(x), exp(xm)))
+ assert_(eq(np.arcsin(z), arcsin(zm)))
+ assert_(eq(np.arccos(z), arccos(zm)))
+ assert_(eq(np.arctan(z), arctan(zm)))
+ assert_(eq(np.arctan2(x, y), arctan2(xm, ym)))
+ assert_(eq(np.absolute(x), absolute(xm)))
+ assert_(eq(np.equal(x, y), equal(xm, ym)))
+ assert_(eq(np.not_equal(x, y), not_equal(xm, ym)))
+ assert_(eq(np.less(x, y), less(xm, ym)))
+ assert_(eq(np.greater(x, y), greater(xm, ym)))
+ assert_(eq(np.less_equal(x, y), less_equal(xm, ym)))
+ assert_(eq(np.greater_equal(x, y), greater_equal(xm, ym)))
+ assert_(eq(np.conjugate(x), conjugate(xm)))
+ assert_(eq(np.concatenate((x, y)), concatenate((xm, ym))))
+ assert_(eq(np.concatenate((x, y)), concatenate((x, y))))
+ assert_(eq(np.concatenate((x, y)), concatenate((xm, y))))
+ assert_(eq(np.concatenate((x, y, x)), concatenate((x, ym, x))))
+
+ def test_xtestCount(self):
+ # Test count
+ ott = array([0., 1., 2., 3.], mask=[1, 0, 0, 0])
+ assert_(count(ott).dtype.type is np.intp)
+ assert_equal(3, count(ott))
+ assert_equal(1, count(1))
+ assert_(eq(0, array(1, mask=[1])))
+ ott = ott.reshape((2, 2))
+ assert_(count(ott).dtype.type is np.intp)
+ assert_(isinstance(count(ott, 0), np.ndarray))
+ assert_(count(ott).dtype.type is np.intp)
+ assert_(eq(3, count(ott)))
+ assert_(getmask(count(ott, 0)) is nomask)
+ assert_(eq([1, 2], count(ott, 0)))
+
+ def test_testMinMax(self):
+ # Test minimum and maximum.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ xr = np.ravel(x) # max doesn't work if shaped
+ xmr = ravel(xm)
+
+ # true because of careful selection of data
+ assert_(eq(max(xr), maximum.reduce(xmr)))
+ assert_(eq(min(xr), minimum.reduce(xmr)))
+
+ def test_testAddSumProd(self):
+ # Test add, sum, product.
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ assert_(eq(np.add.reduce(x), add.reduce(x)))
+ assert_(eq(np.add.accumulate(x), add.accumulate(x)))
+ assert_(eq(4, sum(array(4), axis=0)))
+ assert_(eq(4, sum(array(4), axis=0)))
+ assert_(eq(np.sum(x, axis=0), sum(x, axis=0)))
+ assert_(eq(np.sum(filled(xm, 0), axis=0), sum(xm, axis=0)))
+ assert_(eq(np.sum(x, 0), sum(x, 0)))
+ assert_(eq(np.product(x, axis=0), product(x, axis=0)))
+ assert_(eq(np.product(x, 0), product(x, 0)))
+ assert_(eq(np.product(filled(xm, 1), axis=0),
+ product(xm, axis=0)))
+ if len(s) > 1:
+ assert_(eq(np.concatenate((x, y), 1),
+ concatenate((xm, ym), 1)))
+ assert_(eq(np.add.reduce(x, 1), add.reduce(x, 1)))
+ assert_(eq(np.sum(x, 1), sum(x, 1)))
+ assert_(eq(np.product(x, 1), product(x, 1)))
+
+ def test_testCI(self):
+ # Test of conversions and indexing
+ x1 = np.array([1, 2, 4, 3])
+ x2 = array(x1, mask=[1, 0, 0, 0])
+ x3 = array(x1, mask=[0, 1, 0, 1])
+ x4 = array(x1)
+ # test conversion to strings
+ str(x2) # raises?
+ repr(x2) # raises?
+ assert_(eq(np.sort(x1), sort(x2, fill_value=0)))
+ # tests of indexing
+ assert_(type(x2[1]) is type(x1[1]))
+ assert_(x1[1] == x2[1])
+ assert_(x2[0] is masked)
+ assert_(eq(x1[2], x2[2]))
+ assert_(eq(x1[2:5], x2[2:5]))
+ assert_(eq(x1[:], x2[:]))
+ assert_(eq(x1[1:], x3[1:]))
+ x1[2] = 9
+ x2[2] = 9
+ assert_(eq(x1, x2))
+ x1[1:3] = 99
+ x2[1:3] = 99
+ assert_(eq(x1, x2))
+ x2[1] = masked
+ assert_(eq(x1, x2))
+ x2[1:3] = masked
+ assert_(eq(x1, x2))
+ x2[:] = x1
+ x2[1] = masked
+ assert_(allequal(getmask(x2), array([0, 1, 0, 0])))
+ x3[:] = masked_array([1, 2, 3, 4], [0, 1, 1, 0])
+ assert_(allequal(getmask(x3), array([0, 1, 1, 0])))
+ x4[:] = masked_array([1, 2, 3, 4], [0, 1, 1, 0])
+ assert_(allequal(getmask(x4), array([0, 1, 1, 0])))
+ assert_(allequal(x4, array([1, 2, 3, 4])))
+ x1 = np.arange(5) * 1.0
+ x2 = masked_values(x1, 3.0)
+ assert_(eq(x1, x2))
+ assert_(allequal(array([0, 0, 0, 1, 0], MaskType), x2.mask))
+ assert_(eq(3.0, x2.fill_value))
+ x1 = array([1, 'hello', 2, 3], object)
+ x2 = np.array([1, 'hello', 2, 3], object)
+ s1 = x1[1]
+ s2 = x2[1]
+ assert_equal(type(s2), str)
+ assert_equal(type(s1), str)
+ assert_equal(s1, s2)
+ assert_(x1[1:1].shape == (0,))
+
+ def test_testCopySize(self):
+ # Tests of some subtle points of copying and sizing.
+ n = [0, 0, 1, 0, 0]
+ m = make_mask(n)
+ m2 = make_mask(m)
+ assert_(m is m2)
+ m3 = make_mask(m, copy=1)
+ assert_(m is not m3)
+
+ x1 = np.arange(5)
+ y1 = array(x1, mask=m)
+ assert_(y1._data is not x1)
+ assert_(allequal(x1, y1._data))
+ assert_(y1.mask is m)
+
+ y1a = array(y1, copy=0)
+ # For copy=False, one might expect that the array would just
+ # be passed on, i.e., that it would be "is" instead of "==".
+ # See gh-4043 for discussion.
+ assert_(y1a._mask.__array_interface__ ==
+ y1._mask.__array_interface__)
+
+ y2 = array(x1, mask=m3, copy=0)
+ assert_(y2.mask is m3)
+ assert_(y2[2] is masked)
+ y2[2] = 9
+ assert_(y2[2] is not masked)
+ assert_(y2.mask is m3)
+ assert_(allequal(y2.mask, 0))
+
+ y2a = array(x1, mask=m, copy=1)
+ assert_(y2a.mask is not m)
+ assert_(y2a[2] is masked)
+ y2a[2] = 9
+ assert_(y2a[2] is not masked)
+ assert_(y2a.mask is not m)
+ assert_(allequal(y2a.mask, 0))
+
+ y3 = array(x1 * 1.0, mask=m)
+ assert_(filled(y3).dtype is (x1 * 1.0).dtype)
+
+ x4 = arange(4)
+ x4[2] = masked
+ y4 = resize(x4, (8,))
+ assert_(eq(concatenate([x4, x4]), y4))
+ assert_(eq(getmask(y4), [0, 0, 1, 0, 0, 0, 1, 0]))
+ y5 = repeat(x4, (2, 2, 2, 2), axis=0)
+ assert_(eq(y5, [0, 0, 1, 1, 2, 2, 3, 3]))
+ y6 = repeat(x4, 2, axis=0)
+ assert_(eq(y5, y6))
+
+ def test_testPut(self):
+ # Test of put
+ d = arange(5)
+ n = [0, 0, 0, 1, 1]
+ m = make_mask(n)
+ m2 = m.copy()
+ x = array(d, mask=m)
+ assert_(x[3] is masked)
+ assert_(x[4] is masked)
+ x[[1, 4]] = [10, 40]
+ assert_(x.mask is m)
+ assert_(x[3] is masked)
+ assert_(x[4] is not masked)
+ assert_(eq(x, [0, 10, 2, -1, 40]))
+
+ x = array(d, mask=m2, copy=True)
+ x.put([0, 1, 2], [-1, 100, 200])
+ assert_(x.mask is not m2)
+ assert_(x[3] is masked)
+ assert_(x[4] is masked)
+ assert_(eq(x, [-1, 100, 200, 0, 0]))
+
+ def test_testPut2(self):
+ # Test of put
+ d = arange(5)
+ x = array(d, mask=[0, 0, 0, 0, 0])
+ z = array([10, 40], mask=[1, 0])
+ assert_(x[2] is not masked)
+ assert_(x[3] is not masked)
+ x[2:4] = z
+ assert_(x[2] is masked)
+ assert_(x[3] is not masked)
+ assert_(eq(x, [0, 1, 10, 40, 4]))
+
+ d = arange(5)
+ x = array(d, mask=[0, 0, 0, 0, 0])
+ y = x[2:4]
+ z = array([10, 40], mask=[1, 0])
+ assert_(x[2] is not masked)
+ assert_(x[3] is not masked)
+ y[:] = z
+ assert_(y[0] is masked)
+ assert_(y[1] is not masked)
+ assert_(eq(y, [10, 40]))
+ assert_(x[2] is masked)
+ assert_(x[3] is not masked)
+ assert_(eq(x, [0, 1, 10, 40, 4]))
+
+ def test_testMaPut(self):
+ (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
+ m = [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1]
+ i = np.nonzero(m)[0]
+ put(ym, i, zm)
+ assert_(all(take(ym, i, axis=0) == zm))
+
+ def test_testOddFeatures(self):
+ # Test of other odd features
+ x = arange(20)
+ x = x.reshape(4, 5)
+ x.flat[5] = 12
+ assert_(x[1, 0] == 12)
+ z = x + 10j * x
+ assert_(eq(z.real, x))
+ assert_(eq(z.imag, 10 * x))
+ assert_(eq((z * conjugate(z)).real, 101 * x * x))
+ z.imag[...] = 0.0
+
+ x = arange(10)
+ x[3] = masked
+ assert_(str(x[3]) == str(masked))
+ c = x >= 8
+ assert_(count(where(c, masked, masked)) == 0)
+ assert_(shape(where(c, masked, masked)) == c.shape)
+ z = where(c, x, masked)
+ assert_(z.dtype is x.dtype)
+ assert_(z[3] is masked)
+ assert_(z[4] is masked)
+ assert_(z[7] is masked)
+ assert_(z[8] is not masked)
+ assert_(z[9] is not masked)
+ assert_(eq(x, z))
+ z = where(c, masked, x)
+ assert_(z.dtype is x.dtype)
+ assert_(z[3] is masked)
+ assert_(z[4] is not masked)
+ assert_(z[7] is not masked)
+ assert_(z[8] is masked)
+ assert_(z[9] is masked)
+ z = masked_where(c, x)
+ assert_(z.dtype is x.dtype)
+ assert_(z[3] is masked)
+ assert_(z[4] is not masked)
+ assert_(z[7] is not masked)
+ assert_(z[8] is masked)
+ assert_(z[9] is masked)
+ assert_(eq(x, z))
+ x = array([1., 2., 3., 4., 5.])
+ c = array([1, 1, 1, 0, 0])
+ x[2] = masked
+ z = where(c, x, -x)
+ assert_(eq(z, [1., 2., 0., -4., -5]))
+ c[0] = masked
+ z = where(c, x, -x)
+ assert_(eq(z, [1., 2., 0., -4., -5]))
+ assert_(z[0] is masked)
+ assert_(z[1] is not masked)
+ assert_(z[2] is masked)
+ assert_(eq(masked_where(greater(x, 2), x), masked_greater(x, 2)))
+ assert_(eq(masked_where(greater_equal(x, 2), x),
+ masked_greater_equal(x, 2)))
+ assert_(eq(masked_where(less(x, 2), x), masked_less(x, 2)))
+ assert_(eq(masked_where(less_equal(x, 2), x), masked_less_equal(x, 2)))
+ assert_(eq(masked_where(not_equal(x, 2), x), masked_not_equal(x, 2)))
+ assert_(eq(masked_where(equal(x, 2), x), masked_equal(x, 2)))
+ assert_(eq(masked_where(not_equal(x, 2), x), masked_not_equal(x, 2)))
+ assert_(eq(masked_inside(list(range(5)), 1, 3), [0, 199, 199, 199, 4]))
+ assert_(eq(masked_outside(list(range(5)), 1, 3), [199, 1, 2, 3, 199]))
+ assert_(eq(masked_inside(array(list(range(5)),
+ mask=[1, 0, 0, 0, 0]), 1, 3).mask,
+ [1, 1, 1, 1, 0]))
+ assert_(eq(masked_outside(array(list(range(5)),
+ mask=[0, 1, 0, 0, 0]), 1, 3).mask,
+ [1, 1, 0, 0, 1]))
+ assert_(eq(masked_equal(array(list(range(5)),
+ mask=[1, 0, 0, 0, 0]), 2).mask,
+ [1, 0, 1, 0, 0]))
+ assert_(eq(masked_not_equal(array([2, 2, 1, 2, 1],
+ mask=[1, 0, 0, 0, 0]), 2).mask,
+ [1, 0, 1, 0, 1]))
+ assert_(eq(masked_where([1, 1, 0, 0, 0], [1, 2, 3, 4, 5]),
+ [99, 99, 3, 4, 5]))
+ atest = ones((10, 10, 10), dtype=np.float32)
+ btest = zeros(atest.shape, MaskType)
+ ctest = masked_where(btest, atest)
+ assert_(eq(atest, ctest))
+ z = choose(c, (-x, x))
+ assert_(eq(z, [1., 2., 0., -4., -5]))
+ assert_(z[0] is masked)
+ assert_(z[1] is not masked)
+ assert_(z[2] is masked)
+ x = arange(6)
+ x[5] = masked
+ y = arange(6) * 10
+ y[2] = masked
+ c = array([1, 1, 1, 0, 0, 0], mask=[1, 0, 0, 0, 0, 0])
+ cm = c.filled(1)
+ z = where(c, x, y)
+ zm = where(cm, x, y)
+ assert_(eq(z, zm))
+ assert_(getmask(zm) is nomask)
+ assert_(eq(zm, [0, 1, 2, 30, 40, 50]))
+ z = where(c, masked, 1)
+ assert_(eq(z, [99, 99, 99, 1, 1, 1]))
+ z = where(c, 1, masked)
+ assert_(eq(z, [99, 1, 1, 99, 99, 99]))
+
+ def test_testMinMax2(self):
+ # Test of minimum, maximum.
+ assert_(eq(minimum([1, 2, 3], [4, 0, 9]), [1, 0, 3]))
+ assert_(eq(maximum([1, 2, 3], [4, 0, 9]), [4, 2, 9]))
+ x = arange(5)
+ y = arange(5) - 2
+ x[3] = masked
+ y[0] = masked
+ assert_(eq(minimum(x, y), where(less(x, y), x, y)))
+ assert_(eq(maximum(x, y), where(greater(x, y), x, y)))
+ assert_(minimum.reduce(x) == 0)
+ assert_(maximum.reduce(x) == 4)
+
+ def test_testTakeTransposeInnerOuter(self):
+ # Test of take, transpose, inner, outer products
+ x = arange(24)
+ y = np.arange(24)
+ x[5:6] = masked
+ x = x.reshape(2, 3, 4)
+ y = y.reshape(2, 3, 4)
+ assert_(eq(np.transpose(y, (2, 0, 1)), transpose(x, (2, 0, 1))))
+ assert_(eq(np.take(y, (2, 0, 1), 1), take(x, (2, 0, 1), 1)))
+ assert_(eq(np.inner(filled(x, 0), filled(y, 0)),
+ inner(x, y)))
+ assert_(eq(np.outer(filled(x, 0), filled(y, 0)),
+ outer(x, y)))
+ y = array(['abc', 1, 'def', 2, 3], object)
+ y[2] = masked
+ t = take(y, [0, 3, 4])
+ assert_(t[0] == 'abc')
+ assert_(t[1] == 2)
+ assert_(t[2] == 3)
+
+ def test_testInplace(self):
+ # Test of inplace operations and rich comparisons
+ y = arange(10)
+
+ x = arange(10)
+ xm = arange(10)
+ xm[2] = masked
+ x += 1
+ assert_(eq(x, y + 1))
+ xm += 1
+ assert_(eq(x, y + 1))
+
+ x = arange(10)
+ xm = arange(10)
+ xm[2] = masked
+ x -= 1
+ assert_(eq(x, y - 1))
+ xm -= 1
+ assert_(eq(xm, y - 1))
+
+ x = arange(10) * 1.0
+ xm = arange(10) * 1.0
+ xm[2] = masked
+ x *= 2.0
+ assert_(eq(x, y * 2))
+ xm *= 2.0
+ assert_(eq(xm, y * 2))
+
+ x = arange(10) * 2
+ xm = arange(10)
+ xm[2] = masked
+ x //= 2
+ assert_(eq(x, y))
+ xm //= 2
+ assert_(eq(x, y))
+
+ x = arange(10) * 1.0
+ xm = arange(10) * 1.0
+ xm[2] = masked
+ x /= 2.0
+ assert_(eq(x, y / 2.0))
+ xm /= arange(10)
+ assert_(eq(xm, ones((10,))))
+
+ x = arange(10).astype(np.float32)
+ xm = arange(10)
+ xm[2] = masked
+ x += 1.
+ assert_(eq(x, y + 1.))
+
+ def test_testPickle(self):
+ # Test of pickling
+ x = arange(12)
+ x[4:10:2] = masked
+ x = x.reshape(4, 3)
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ s = pickle.dumps(x, protocol=proto)
+ y = pickle.loads(s)
+ assert_(eq(x, y))
+
+ def test_testMasked(self):
+ # Test of masked element
+ xx = arange(6)
+ xx[1] = masked
+ assert_(str(masked) == '--')
+ assert_(xx[1] is masked)
+ assert_equal(filled(xx[1], 0), 0)
+
+ def test_testAverage1(self):
+ # Test of average.
+ ott = array([0., 1., 2., 3.], mask=[1, 0, 0, 0])
+ assert_(eq(2.0, average(ott, axis=0)))
+ assert_(eq(2.0, average(ott, weights=[1., 1., 2., 1.])))
+ result, wts = average(ott, weights=[1., 1., 2., 1.], returned=1)
+ assert_(eq(2.0, result))
+ assert_(wts == 4.0)
+ ott[:] = masked
+ assert_(average(ott, axis=0) is masked)
+ ott = array([0., 1., 2., 3.], mask=[1, 0, 0, 0])
+ ott = ott.reshape(2, 2)
+ ott[:, 1] = masked
+ assert_(eq(average(ott, axis=0), [2.0, 0.0]))
+ assert_(average(ott, axis=1)[0] is masked)
+ assert_(eq([2., 0.], average(ott, axis=0)))
+ result, wts = average(ott, axis=0, returned=1)
+ assert_(eq(wts, [1., 0.]))
+
+ def test_testAverage2(self):
+ # More tests of average.
+ w1 = [0, 1, 1, 1, 1, 0]
+ w2 = [[0, 1, 1, 1, 1, 0], [1, 0, 0, 0, 0, 1]]
+ x = arange(6)
+ assert_(allclose(average(x, axis=0), 2.5))
+ assert_(allclose(average(x, axis=0, weights=w1), 2.5))
+ y = array([arange(6), 2.0 * arange(6)])
+ assert_(allclose(average(y, None),
+ np.add.reduce(np.arange(6)) * 3. / 12.))
+ assert_(allclose(average(y, axis=0), np.arange(6) * 3. / 2.))
+ assert_(allclose(average(y, axis=1),
+ [average(x, axis=0), average(x, axis=0)*2.0]))
+ assert_(allclose(average(y, None, weights=w2), 20. / 6.))
+ assert_(allclose(average(y, axis=0, weights=w2),
+ [0., 1., 2., 3., 4., 10.]))
+ assert_(allclose(average(y, axis=1),
+ [average(x, axis=0), average(x, axis=0)*2.0]))
+ m1 = zeros(6)
+ m2 = [0, 0, 1, 1, 0, 0]
+ m3 = [[0, 0, 1, 1, 0, 0], [0, 1, 1, 1, 1, 0]]
+ m4 = ones(6)
+ m5 = [0, 1, 1, 1, 1, 1]
+ assert_(allclose(average(masked_array(x, m1), axis=0), 2.5))
+ assert_(allclose(average(masked_array(x, m2), axis=0), 2.5))
+ assert_(average(masked_array(x, m4), axis=0) is masked)
+ assert_equal(average(masked_array(x, m5), axis=0), 0.0)
+ assert_equal(count(average(masked_array(x, m4), axis=0)), 0)
+ z = masked_array(y, m3)
+ assert_(allclose(average(z, None), 20. / 6.))
+ assert_(allclose(average(z, axis=0),
+ [0., 1., 99., 99., 4.0, 7.5]))
+ assert_(allclose(average(z, axis=1), [2.5, 5.0]))
+ assert_(allclose(average(z, axis=0, weights=w2),
+ [0., 1., 99., 99., 4.0, 10.0]))
+
+ a = arange(6)
+ b = arange(6) * 3
+ r1, w1 = average([[a, b], [b, a]], axis=1, returned=1)
+ assert_equal(shape(r1), shape(w1))
+ assert_equal(r1.shape, w1.shape)
+ r2, w2 = average(ones((2, 2, 3)), axis=0, weights=[3, 1], returned=1)
+ assert_equal(shape(w2), shape(r2))
+ r2, w2 = average(ones((2, 2, 3)), returned=1)
+ assert_equal(shape(w2), shape(r2))
+ r2, w2 = average(ones((2, 2, 3)), weights=ones((2, 2, 3)), returned=1)
+ assert_(shape(w2) == shape(r2))
+ a2d = array([[1, 2], [0, 4]], float)
+ a2dm = masked_array(a2d, [[0, 0], [1, 0]])
+ a2da = average(a2d, axis=0)
+ assert_(eq(a2da, [0.5, 3.0]))
+ a2dma = average(a2dm, axis=0)
+ assert_(eq(a2dma, [1.0, 3.0]))
+ a2dma = average(a2dm, axis=None)
+ assert_(eq(a2dma, 7. / 3.))
+ a2dma = average(a2dm, axis=1)
+ assert_(eq(a2dma, [1.5, 4.0]))
+
+ def test_testToPython(self):
+ assert_equal(1, int(array(1)))
+ assert_equal(1.0, float(array(1)))
+ assert_equal(1, int(array([[[1]]])))
+ assert_equal(1.0, float(array([[1]])))
+ assert_raises(TypeError, float, array([1, 1]))
+ assert_raises(ValueError, bool, array([0, 1]))
+ assert_raises(ValueError, bool, array([0, 0], mask=[0, 1]))
+
+ def test_testScalarArithmetic(self):
+ xm = array(0, mask=1)
+ # TODO FIXME: Find out why the following raises a warning in r8247
+ with np.errstate(divide='ignore'):
+ assert_((1 / array(0)).mask)
+ assert_((1 + xm).mask)
+ assert_((-xm).mask)
+ assert_((-xm).mask)
+ assert_(maximum(xm, xm).mask)
+ assert_(minimum(xm, xm).mask)
+ assert_(xm.filled().dtype is xm._data.dtype)
+ x = array(0, mask=0)
+ assert_(x.filled() == x._data)
+ assert_equal(str(xm), str(masked_print_option))
+
+ def test_testArrayMethods(self):
+ a = array([1, 3, 2])
+ assert_(eq(a.any(), a._data.any()))
+ assert_(eq(a.all(), a._data.all()))
+ assert_(eq(a.argmax(), a._data.argmax()))
+ assert_(eq(a.argmin(), a._data.argmin()))
+ assert_(eq(a.choose(0, 1, 2, 3, 4),
+ a._data.choose(0, 1, 2, 3, 4)))
+ assert_(eq(a.compress([1, 0, 1]), a._data.compress([1, 0, 1])))
+ assert_(eq(a.conj(), a._data.conj()))
+ assert_(eq(a.conjugate(), a._data.conjugate()))
+ m = array([[1, 2], [3, 4]])
+ assert_(eq(m.diagonal(), m._data.diagonal()))
+ assert_(eq(a.sum(), a._data.sum()))
+ assert_(eq(a.take([1, 2]), a._data.take([1, 2])))
+ assert_(eq(m.transpose(), m._data.transpose()))
+
+ def test_testArrayAttributes(self):
+ a = array([1, 3, 2])
+ assert_equal(a.ndim, 1)
+
+ def test_testAPI(self):
+ assert_(not [m for m in dir(np.ndarray)
+ if m not in dir(MaskedArray) and
+ not m.startswith('_')])
+
+ def test_testSingleElementSubscript(self):
+ a = array([1, 3, 2])
+ b = array([1, 3, 2], mask=[1, 0, 1])
+ assert_equal(a[0].shape, ())
+ assert_equal(b[0].shape, ())
+ assert_equal(b[1].shape, ())
+
+
+class TestUfuncs(object):
+ def setup(self):
+ self.d = (array([1.0, 0, -1, pi / 2] * 2, mask=[0, 1] + [0] * 6),
+ array([1.0, 0, -1, pi / 2] * 2, mask=[1, 0] + [0] * 6),)
+
+ def test_testUfuncRegression(self):
+ f_invalid_ignore = [
+ 'sqrt', 'arctanh', 'arcsin', 'arccos',
+ 'arccosh', 'arctanh', 'log', 'log10', 'divide',
+ 'true_divide', 'floor_divide', 'remainder', 'fmod']
+ for f in ['sqrt', 'log', 'log10', 'exp', 'conjugate',
+ 'sin', 'cos', 'tan',
+ 'arcsin', 'arccos', 'arctan',
+ 'sinh', 'cosh', 'tanh',
+ 'arcsinh',
+ 'arccosh',
+ 'arctanh',
+ 'absolute', 'fabs', 'negative',
+ 'floor', 'ceil',
+ 'logical_not',
+ 'add', 'subtract', 'multiply',
+ 'divide', 'true_divide', 'floor_divide',
+ 'remainder', 'fmod', 'hypot', 'arctan2',
+ 'equal', 'not_equal', 'less_equal', 'greater_equal',
+ 'less', 'greater',
+ 'logical_and', 'logical_or', 'logical_xor']:
+ try:
+ uf = getattr(umath, f)
+ except AttributeError:
+ uf = getattr(fromnumeric, f)
+ mf = getattr(np.ma, f)
+ args = self.d[:uf.nin]
+ with np.errstate():
+ if f in f_invalid_ignore:
+ np.seterr(invalid='ignore')
+ if f in ['arctanh', 'log', 'log10']:
+ np.seterr(divide='ignore')
+ ur = uf(*args)
+ mr = mf(*args)
+ assert_(eq(ur.filled(0), mr.filled(0), f))
+ assert_(eqmask(ur.mask, mr.mask))
+
+ def test_reduce(self):
+ a = self.d[0]
+ assert_(not alltrue(a, axis=0))
+ assert_(sometrue(a, axis=0))
+ assert_equal(sum(a[:3], axis=0), 0)
+ assert_equal(product(a, axis=0), 0)
+
+ def test_minmax(self):
+ a = arange(1, 13).reshape(3, 4)
+ amask = masked_where(a < 5, a)
+ assert_equal(amask.max(), a.max())
+ assert_equal(amask.min(), 5)
+ assert_((amask.max(0) == a.max(0)).all())
+ assert_((amask.min(0) == [5, 6, 7, 8]).all())
+ assert_(amask.max(1)[0].mask)
+ assert_(amask.min(1)[0].mask)
+
+ def test_nonzero(self):
+ for t in "?bhilqpBHILQPfdgFDGO":
+ x = array([1, 0, 2, 0], mask=[0, 0, 1, 1])
+ assert_(eq(nonzero(x), [0]))
+
+
+class TestArrayMethods(object):
+
+ def setup(self):
+ x = np.array([8.375, 7.545, 8.828, 8.5, 1.757, 5.928,
+ 8.43, 7.78, 9.865, 5.878, 8.979, 4.732,
+ 3.012, 6.022, 5.095, 3.116, 5.238, 3.957,
+ 6.04, 9.63, 7.712, 3.382, 4.489, 6.479,
+ 7.189, 9.645, 5.395, 4.961, 9.894, 2.893,
+ 7.357, 9.828, 6.272, 3.758, 6.693, 0.993])
+ X = x.reshape(6, 6)
+ XX = x.reshape(3, 2, 2, 3)
+
+ m = np.array([0, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1,
+ 0, 0, 0, 1, 0, 1,
+ 0, 0, 0, 1, 1, 1,
+ 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 0, 1, 0])
+ mx = array(data=x, mask=m)
+ mX = array(data=X, mask=m.reshape(X.shape))
+ mXX = array(data=XX, mask=m.reshape(XX.shape))
+
+ self.d = (x, X, XX, m, mx, mX, mXX)
+
+ def test_trace(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ mXdiag = mX.diagonal()
+ assert_equal(mX.trace(), mX.diagonal().compressed().sum())
+ assert_(eq(mX.trace(),
+ X.trace() - sum(mXdiag.mask * X.diagonal(),
+ axis=0)))
+
+ def test_clip(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ clipped = mx.clip(2, 8)
+ assert_(eq(clipped.mask, mx.mask))
+ assert_(eq(clipped._data, x.clip(2, 8)))
+ assert_(eq(clipped._data, mx._data.clip(2, 8)))
+
+ def test_ptp(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ (n, m) = X.shape
+ assert_equal(mx.ptp(), mx.compressed().ptp())
+ rows = np.zeros(n, np.float_)
+ cols = np.zeros(m, np.float_)
+ for k in range(m):
+ cols[k] = mX[:, k].compressed().ptp()
+ for k in range(n):
+ rows[k] = mX[k].compressed().ptp()
+ assert_(eq(mX.ptp(0), cols))
+ assert_(eq(mX.ptp(1), rows))
+
+ def test_swapaxes(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ mXswapped = mX.swapaxes(0, 1)
+ assert_(eq(mXswapped[-1], mX[:, -1]))
+ mXXswapped = mXX.swapaxes(0, 2)
+ assert_equal(mXXswapped.shape, (2, 2, 3, 3))
+
+ def test_cumprod(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ mXcp = mX.cumprod(0)
+ assert_(eq(mXcp._data, mX.filled(1).cumprod(0)))
+ mXcp = mX.cumprod(1)
+ assert_(eq(mXcp._data, mX.filled(1).cumprod(1)))
+
+ def test_cumsum(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ mXcp = mX.cumsum(0)
+ assert_(eq(mXcp._data, mX.filled(0).cumsum(0)))
+ mXcp = mX.cumsum(1)
+ assert_(eq(mXcp._data, mX.filled(0).cumsum(1)))
+
+ def test_varstd(self):
+ (x, X, XX, m, mx, mX, mXX,) = self.d
+ assert_(eq(mX.var(axis=None), mX.compressed().var()))
+ assert_(eq(mX.std(axis=None), mX.compressed().std()))
+ assert_(eq(mXX.var(axis=3).shape, XX.var(axis=3).shape))
+ assert_(eq(mX.var().shape, X.var().shape))
+ (mXvar0, mXvar1) = (mX.var(axis=0), mX.var(axis=1))
+ for k in range(6):
+ assert_(eq(mXvar1[k], mX[k].compressed().var()))
+ assert_(eq(mXvar0[k], mX[:, k].compressed().var()))
+ assert_(eq(np.sqrt(mXvar0[k]),
+ mX[:, k].compressed().std()))
+
+
+def eqmask(m1, m2):
+ if m1 is nomask:
+ return m2 is nomask
+ if m2 is nomask:
+ return m1 is nomask
+ return (m1 == m2).all()
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_regression.py b/contrib/python/numpy/py2/numpy/ma/tests/test_regression.py
new file mode 100644
index 00000000000..54f1bda7db7
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_regression.py
@@ -0,0 +1,89 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_array_equal, assert_allclose, suppress_warnings
+ )
+
+
+class TestRegression(object):
+ def test_masked_array_create(self):
+ # Ticket #17
+ x = np.ma.masked_array([0, 1, 2, 3, 0, 4, 5, 6],
+ mask=[0, 0, 0, 1, 1, 1, 0, 0])
+ assert_array_equal(np.ma.nonzero(x), [[1, 2, 6, 7]])
+
+ def test_masked_array(self):
+ # Ticket #61
+ np.ma.array(1, mask=[1])
+
+ def test_mem_masked_where(self):
+ # Ticket #62
+ from numpy.ma import masked_where, MaskType
+ a = np.zeros((1, 1))
+ b = np.zeros(a.shape, MaskType)
+ c = masked_where(b, a)
+ a-c
+
+ def test_masked_array_multiply(self):
+ # Ticket #254
+ a = np.ma.zeros((4, 1))
+ a[2, 0] = np.ma.masked
+ b = np.zeros((4, 2))
+ a*b
+ b*a
+
+ def test_masked_array_repeat(self):
+ # Ticket #271
+ np.ma.array([1], mask=False).repeat(10)
+
+ def test_masked_array_repr_unicode(self):
+ # Ticket #1256
+ repr(np.ma.array(u"Unicode"))
+
+ def test_atleast_2d(self):
+ # Ticket #1559
+ a = np.ma.masked_array([0.0, 1.2, 3.5], mask=[False, True, False])
+ b = np.atleast_2d(a)
+ assert_(a.mask.ndim == 1)
+ assert_(b.mask.ndim == 2)
+
+ def test_set_fill_value_unicode_py3(self):
+ # Ticket #2733
+ a = np.ma.masked_array(['a', 'b', 'c'], mask=[1, 0, 0])
+ a.fill_value = 'X'
+ assert_(a.fill_value == 'X')
+
+ def test_var_sets_maskedarray_scalar(self):
+ # Issue gh-2757
+ a = np.ma.array(np.arange(5), mask=True)
+ mout = np.ma.array(-1, dtype=float)
+ a.var(out=mout)
+ assert_(mout._data == 0)
+
+ def test_ddof_corrcoef(self):
+ # See gh-3336
+ x = np.ma.masked_equal([1, 2, 3, 4, 5], 4)
+ y = np.array([2, 2.5, 3.1, 3, 5])
+ # this test can be removed after deprecation.
+ with suppress_warnings() as sup:
+ sup.filter(DeprecationWarning, "bias and ddof have no effect")
+ r0 = np.ma.corrcoef(x, y, ddof=0)
+ r1 = np.ma.corrcoef(x, y, ddof=1)
+ # ddof should not have an effect (it gets cancelled out)
+ assert_allclose(r0.data, r1.data)
+
+ def test_mask_not_backmangled(self):
+ # See gh-10314. Test case taken from gh-3140.
+ a = np.ma.MaskedArray([1., 2.], mask=[False, False])
+ assert_(a.mask.shape == (2,))
+ b = np.tile(a, (2, 1))
+ # Check that the above no longer changes a.shape to (1, 2)
+ assert_(a.mask.shape == (2,))
+ assert_(b.shape == (2, 2))
+ assert_(b.mask.shape == (2, 2))
+
+ def test_empty_list_on_structured(self):
+ # See gh-12464. Indexing with empty list should give empty result.
+ ma = np.ma.MaskedArray([(1, 1.), (2, 2.), (3, 3.)], dtype='i4,f4')
+ assert_array_equal(ma[[]], ma[:0])
diff --git a/contrib/python/numpy/py2/numpy/ma/tests/test_subclassing.py b/contrib/python/numpy/py2/numpy/ma/tests/test_subclassing.py
new file mode 100644
index 00000000000..f8ab52bb9e1
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/ma/tests/test_subclassing.py
@@ -0,0 +1,351 @@
+# pylint: disable-msg=W0611, W0612, W0511,R0201
+"""Tests suite for MaskedArray & subclassing.
+
+:author: Pierre Gerard-Marchant
+:contact: pierregm_at_uga_dot_edu
+:version: $Id: test_subclassing.py 3473 2007-10-29 15:18:13Z jarrod.millman $
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_, assert_raises
+from numpy.ma.testutils import assert_equal
+from numpy.ma.core import (
+ array, arange, masked, MaskedArray, masked_array, log, add, hypot,
+ divide, asarray, asanyarray, nomask
+ )
+# from numpy.ma.core import (
+
+def assert_startswith(a, b):
+ # produces a better error message than assert_(a.startswith(b))
+ assert_equal(a[:len(b)], b)
+
+class SubArray(np.ndarray):
+ # Defines a generic np.ndarray subclass that stores some metadata
+ # in the dictionary `info`.
+ def __new__(cls,arr,info={}):
+ x = np.asanyarray(arr).view(cls)
+ x.info = info.copy()
+ return x
+
+ def __array_finalize__(self, obj):
+ if callable(getattr(super(SubArray, self),
+ '__array_finalize__', None)):
+ super(SubArray, self).__array_finalize__(obj)
+ self.info = getattr(obj, 'info', {}).copy()
+ return
+
+ def __add__(self, other):
+ result = super(SubArray, self).__add__(other)
+ result.info['added'] = result.info.get('added', 0) + 1
+ return result
+
+ def __iadd__(self, other):
+ result = super(SubArray, self).__iadd__(other)
+ result.info['iadded'] = result.info.get('iadded', 0) + 1
+ return result
+
+
+subarray = SubArray
+
+
+class SubMaskedArray(MaskedArray):
+ """Pure subclass of MaskedArray, keeping some info on subclass."""
+ def __new__(cls, info=None, **kwargs):
+ obj = super(SubMaskedArray, cls).__new__(cls, **kwargs)
+ obj._optinfo['info'] = info
+ return obj
+
+
+class MSubArray(SubArray, MaskedArray):
+
+ def __new__(cls, data, info={}, mask=nomask):
+ subarr = SubArray(data, info)
+ _data = MaskedArray.__new__(cls, data=subarr, mask=mask)
+ _data.info = subarr.info
+ return _data
+
+ def _get_series(self):
+ _view = self.view(MaskedArray)
+ _view._sharedmask = False
+ return _view
+ _series = property(fget=_get_series)
+
+msubarray = MSubArray
+
+
+# Also a subclass that overrides __str__, __repr__ and __setitem__, disallowing
+# setting to non-class values (and thus np.ma.core.masked_print_option)
+# and overrides __array_wrap__, updating the info dict, to check that this
+# doesn't get destroyed by MaskedArray._update_from. But this one also needs
+# its own iterator...
+class CSAIterator(object):
+ """
+ Flat iterator object that uses its own setter/getter
+ (works around ndarray.flat not propagating subclass setters/getters
+ see https://github.com/numpy/numpy/issues/4564)
+ roughly following MaskedIterator
+ """
+ def __init__(self, a):
+ self._original = a
+ self._dataiter = a.view(np.ndarray).flat
+
+ def __iter__(self):
+ return self
+
+ def __getitem__(self, indx):
+ out = self._dataiter.__getitem__(indx)
+ if not isinstance(out, np.ndarray):
+ out = out.__array__()
+ out = out.view(type(self._original))
+ return out
+
+ def __setitem__(self, index, value):
+ self._dataiter[index] = self._original._validate_input(value)
+
+ def __next__(self):
+ return next(self._dataiter).__array__().view(type(self._original))
+
+ next = __next__
+
+
+class ComplicatedSubArray(SubArray):
+
+ def __str__(self):
+ return 'myprefix {0} mypostfix'.format(self.view(SubArray))
+
+ def __repr__(self):
+ # Return a repr that does not start with 'name('
+ return '<{0} {1}>'.format(self.__class__.__name__, self)
+
+ def _validate_input(self, value):
+ if not isinstance(value, ComplicatedSubArray):
+ raise ValueError("Can only set to MySubArray values")
+ return value
+
+ def __setitem__(self, item, value):
+ # validation ensures direct assignment with ndarray or
+ # masked_print_option will fail
+ super(ComplicatedSubArray, self).__setitem__(
+ item, self._validate_input(value))
+
+ def __getitem__(self, item):
+ # ensure getter returns our own class also for scalars
+ value = super(ComplicatedSubArray, self).__getitem__(item)
+ if not isinstance(value, np.ndarray): # scalar
+ value = value.__array__().view(ComplicatedSubArray)
+ return value
+
+ @property
+ def flat(self):
+ return CSAIterator(self)
+
+ @flat.setter
+ def flat(self, value):
+ y = self.ravel()
+ y[:] = value
+
+ def __array_wrap__(self, obj, context=None):
+ obj = super(ComplicatedSubArray, self).__array_wrap__(obj, context)
+ if context is not None and context[0] is np.multiply:
+ obj.info['multiplied'] = obj.info.get('multiplied', 0) + 1
+
+ return obj
+
+
+class TestSubclassing(object):
+ # Test suite for masked subclasses of ndarray.
+
+ def setup(self):
+ x = np.arange(5, dtype='float')
+ mx = msubarray(x, mask=[0, 1, 0, 0, 0])
+ self.data = (x, mx)
+
+ def test_data_subclassing(self):
+ # Tests whether the subclass is kept.
+ x = np.arange(5)
+ m = [0, 0, 1, 0, 0]
+ xsub = SubArray(x)
+ xmsub = masked_array(xsub, mask=m)
+ assert_(isinstance(xmsub, MaskedArray))
+ assert_equal(xmsub._data, xsub)
+ assert_(isinstance(xmsub._data, SubArray))
+
+ def test_maskedarray_subclassing(self):
+ # Tests subclassing MaskedArray
+ (x, mx) = self.data
+ assert_(isinstance(mx._data, subarray))
+
+ def test_masked_unary_operations(self):
+ # Tests masked_unary_operation
+ (x, mx) = self.data
+ with np.errstate(divide='ignore'):
+ assert_(isinstance(log(mx), msubarray))
+ assert_equal(log(x), np.log(x))
+
+ def test_masked_binary_operations(self):
+ # Tests masked_binary_operation
+ (x, mx) = self.data
+ # Result should be a msubarray
+ assert_(isinstance(add(mx, mx), msubarray))
+ assert_(isinstance(add(mx, x), msubarray))
+ # Result should work
+ assert_equal(add(mx, x), mx+x)
+ assert_(isinstance(add(mx, mx)._data, subarray))
+ assert_(isinstance(add.outer(mx, mx), msubarray))
+ assert_(isinstance(hypot(mx, mx), msubarray))
+ assert_(isinstance(hypot(mx, x), msubarray))
+
+ def test_masked_binary_operations2(self):
+ # Tests domained_masked_binary_operation
+ (x, mx) = self.data
+ xmx = masked_array(mx.data.__array__(), mask=mx.mask)
+ assert_(isinstance(divide(mx, mx), msubarray))
+ assert_(isinstance(divide(mx, x), msubarray))
+ assert_equal(divide(mx, mx), divide(xmx, xmx))
+
+ def test_attributepropagation(self):
+ x = array(arange(5), mask=[0]+[1]*4)
+ my = masked_array(subarray(x))
+ ym = msubarray(x)
+ #
+ z = (my+1)
+ assert_(isinstance(z, MaskedArray))
+ assert_(not isinstance(z, MSubArray))
+ assert_(isinstance(z._data, SubArray))
+ assert_equal(z._data.info, {})
+ #
+ z = (ym+1)
+ assert_(isinstance(z, MaskedArray))
+ assert_(isinstance(z, MSubArray))
+ assert_(isinstance(z._data, SubArray))
+ assert_(z._data.info['added'] > 0)
+ # Test that inplace methods from data get used (gh-4617)
+ ym += 1
+ assert_(isinstance(ym, MaskedArray))
+ assert_(isinstance(ym, MSubArray))
+ assert_(isinstance(ym._data, SubArray))
+ assert_(ym._data.info['iadded'] > 0)
+ #
+ ym._set_mask([1, 0, 0, 0, 1])
+ assert_equal(ym._mask, [1, 0, 0, 0, 1])
+ ym._series._set_mask([0, 0, 0, 0, 1])
+ assert_equal(ym._mask, [0, 0, 0, 0, 1])
+ #
+ xsub = subarray(x, info={'name':'x'})
+ mxsub = masked_array(xsub)
+ assert_(hasattr(mxsub, 'info'))
+ assert_equal(mxsub.info, xsub.info)
+
+ def test_subclasspreservation(self):
+ # Checks that masked_array(...,subok=True) preserves the class.
+ x = np.arange(5)
+ m = [0, 0, 1, 0, 0]
+ xinfo = [(i, j) for (i, j) in zip(x, m)]
+ xsub = MSubArray(x, mask=m, info={'xsub':xinfo})
+ #
+ mxsub = masked_array(xsub, subok=False)
+ assert_(not isinstance(mxsub, MSubArray))
+ assert_(isinstance(mxsub, MaskedArray))
+ assert_equal(mxsub._mask, m)
+ #
+ mxsub = asarray(xsub)
+ assert_(not isinstance(mxsub, MSubArray))
+ assert_(isinstance(mxsub, MaskedArray))
+ assert_equal(mxsub._mask, m)
+ #
+ mxsub = masked_array(xsub, subok=True)
+ assert_(isinstance(mxsub, MSubArray))
+ assert_equal(mxsub.info, xsub.info)
+ assert_equal(mxsub._mask, xsub._mask)
+ #
+ mxsub = asanyarray(xsub)
+ assert_(isinstance(mxsub, MSubArray))
+ assert_equal(mxsub.info, xsub.info)
+ assert_equal(mxsub._mask, m)
+
+ def test_subclass_items(self):
+ """test that getter and setter go via baseclass"""
+ x = np.arange(5)
+ xcsub = ComplicatedSubArray(x)
+ mxcsub = masked_array(xcsub, mask=[True, False, True, False, False])
+ # getter should return a ComplicatedSubArray, even for single item
+ # first check we wrote ComplicatedSubArray correctly
+ assert_(isinstance(xcsub[1], ComplicatedSubArray))
+ assert_(isinstance(xcsub[1,...], ComplicatedSubArray))
+ assert_(isinstance(xcsub[1:4], ComplicatedSubArray))
+
+ # now check that it propagates inside the MaskedArray
+ assert_(isinstance(mxcsub[1], ComplicatedSubArray))
+ assert_(isinstance(mxcsub[1,...].data, ComplicatedSubArray))
+ assert_(mxcsub[0] is masked)
+ assert_(isinstance(mxcsub[0,...].data, ComplicatedSubArray))
+ assert_(isinstance(mxcsub[1:4].data, ComplicatedSubArray))
+
+ # also for flattened version (which goes via MaskedIterator)
+ assert_(isinstance(mxcsub.flat[1].data, ComplicatedSubArray))
+ assert_(mxcsub.flat[0] is masked)
+ assert_(isinstance(mxcsub.flat[1:4].base, ComplicatedSubArray))
+
+ # setter should only work with ComplicatedSubArray input
+ # first check we wrote ComplicatedSubArray correctly
+ assert_raises(ValueError, xcsub.__setitem__, 1, x[4])
+ # now check that it propagates inside the MaskedArray
+ assert_raises(ValueError, mxcsub.__setitem__, 1, x[4])
+ assert_raises(ValueError, mxcsub.__setitem__, slice(1, 4), x[1:4])
+ mxcsub[1] = xcsub[4]
+ mxcsub[1:4] = xcsub[1:4]
+ # also for flattened version (which goes via MaskedIterator)
+ assert_raises(ValueError, mxcsub.flat.__setitem__, 1, x[4])
+ assert_raises(ValueError, mxcsub.flat.__setitem__, slice(1, 4), x[1:4])
+ mxcsub.flat[1] = xcsub[4]
+ mxcsub.flat[1:4] = xcsub[1:4]
+
+ def test_subclass_nomask_items(self):
+ x = np.arange(5)
+ xcsub = ComplicatedSubArray(x)
+ mxcsub_nomask = masked_array(xcsub)
+
+ assert_(isinstance(mxcsub_nomask[1,...].data, ComplicatedSubArray))
+ assert_(isinstance(mxcsub_nomask[0,...].data, ComplicatedSubArray))
+
+ assert_(isinstance(mxcsub_nomask[1], ComplicatedSubArray))
+ assert_(isinstance(mxcsub_nomask[0], ComplicatedSubArray))
+
+ def test_subclass_repr(self):
+ """test that repr uses the name of the subclass
+ and 'array' for np.ndarray"""
+ x = np.arange(5)
+ mx = masked_array(x, mask=[True, False, True, False, False])
+ assert_startswith(repr(mx), 'masked_array')
+ xsub = SubArray(x)
+ mxsub = masked_array(xsub, mask=[True, False, True, False, False])
+ assert_startswith(repr(mxsub),
+ 'masked_{0}(data=[--, 1, --, 3, 4]'.format(SubArray.__name__))
+
+ def test_subclass_str(self):
+ """test str with subclass that has overridden str, setitem"""
+ # first without override
+ x = np.arange(5)
+ xsub = SubArray(x)
+ mxsub = masked_array(xsub, mask=[True, False, True, False, False])
+ assert_equal(str(mxsub), '[-- 1 -- 3 4]')
+
+ xcsub = ComplicatedSubArray(x)
+ assert_raises(ValueError, xcsub.__setitem__, 0,
+ np.ma.core.masked_print_option)
+ mxcsub = masked_array(xcsub, mask=[True, False, True, False, False])
+ assert_equal(str(mxcsub), 'myprefix [-- 1 -- 3 4] mypostfix')
+
+ def test_pure_subclass_info_preservation(self):
+ # Test that ufuncs and methods conserve extra information consistently;
+ # see gh-7122.
+ arr1 = SubMaskedArray('test', data=[1,2,3,4,5,6])
+ arr2 = SubMaskedArray(data=[0,1,2,3,4,5])
+ diff1 = np.subtract(arr1, arr2)
+ assert_('info' in diff1._optinfo)
+ assert_(diff1._optinfo['info'] == 'test')
+ diff2 = arr1 - arr2
+ assert_('info' in diff2._optinfo)
+ assert_(diff2._optinfo['info'] == 'test')
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/setup.py b/contrib/python/numpy/py2/numpy/matrixlib/setup.py
new file mode 100644
index 00000000000..d0981d6584b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/setup.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+from __future__ import division, print_function
+
+def configuration(parent_package='', top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('matrixlib', parent_package, top_path)
+ config.add_data_dir('tests')
+ return config
+
+if __name__ == "__main__":
+ from numpy.distutils.core import setup
+ config = configuration(top_path='').todict()
+ setup(**config)
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/__init__.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_defmatrix.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_defmatrix.py
new file mode 100644
index 00000000000..aa6e08d64de
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_defmatrix.py
@@ -0,0 +1,460 @@
+from __future__ import division, absolute_import, print_function
+
+try:
+ # Accessing collections abstract classes from collections
+ # has been deprecated since Python 3.3
+ import collections.abc as collections_abc
+except ImportError:
+ import collections as collections_abc
+
+import numpy as np
+from numpy import matrix, asmatrix, bmat
+from numpy.testing import (
+ assert_, assert_equal, assert_almost_equal, assert_array_equal,
+ assert_array_almost_equal, assert_raises
+ )
+from numpy.linalg import matrix_power
+from numpy.matrixlib import mat
+
+class TestCtor(object):
+ def test_basic(self):
+ A = np.array([[1, 2], [3, 4]])
+ mA = matrix(A)
+ assert_(np.all(mA.A == A))
+
+ B = bmat("A,A;A,A")
+ C = bmat([[A, A], [A, A]])
+ D = np.array([[1, 2, 1, 2],
+ [3, 4, 3, 4],
+ [1, 2, 1, 2],
+ [3, 4, 3, 4]])
+ assert_(np.all(B.A == D))
+ assert_(np.all(C.A == D))
+
+ E = np.array([[5, 6], [7, 8]])
+ AEresult = matrix([[1, 2, 5, 6], [3, 4, 7, 8]])
+ assert_(np.all(bmat([A, E]) == AEresult))
+
+ vec = np.arange(5)
+ mvec = matrix(vec)
+ assert_(mvec.shape == (1, 5))
+
+ def test_exceptions(self):
+ # Check for ValueError when called with invalid string data.
+ assert_raises(ValueError, matrix, "invalid")
+
+ def test_bmat_nondefault_str(self):
+ A = np.array([[1, 2], [3, 4]])
+ B = np.array([[5, 6], [7, 8]])
+ Aresult = np.array([[1, 2, 1, 2],
+ [3, 4, 3, 4],
+ [1, 2, 1, 2],
+ [3, 4, 3, 4]])
+ mixresult = np.array([[1, 2, 5, 6],
+ [3, 4, 7, 8],
+ [5, 6, 1, 2],
+ [7, 8, 3, 4]])
+ assert_(np.all(bmat("A,A;A,A") == Aresult))
+ assert_(np.all(bmat("A,A;A,A", ldict={'A':B}) == Aresult))
+ assert_raises(TypeError, bmat, "A,A;A,A", gdict={'A':B})
+ assert_(
+ np.all(bmat("A,A;A,A", ldict={'A':A}, gdict={'A':B}) == Aresult))
+ b2 = bmat("A,B;C,D", ldict={'A':A,'B':B}, gdict={'C':B,'D':A})
+ assert_(np.all(b2 == mixresult))
+
+
+class TestProperties(object):
+ def test_sum(self):
+ """Test whether matrix.sum(axis=1) preserves orientation.
+ Fails in NumPy <= 0.9.6.2127.
+ """
+ M = matrix([[1, 2, 0, 0],
+ [3, 4, 0, 0],
+ [1, 2, 1, 2],
+ [3, 4, 3, 4]])
+ sum0 = matrix([8, 12, 4, 6])
+ sum1 = matrix([3, 7, 6, 14]).T
+ sumall = 30
+ assert_array_equal(sum0, M.sum(axis=0))
+ assert_array_equal(sum1, M.sum(axis=1))
+ assert_equal(sumall, M.sum())
+
+ assert_array_equal(sum0, np.sum(M, axis=0))
+ assert_array_equal(sum1, np.sum(M, axis=1))
+ assert_equal(sumall, np.sum(M))
+
+ def test_prod(self):
+ x = matrix([[1, 2, 3], [4, 5, 6]])
+ assert_equal(x.prod(), 720)
+ assert_equal(x.prod(0), matrix([[4, 10, 18]]))
+ assert_equal(x.prod(1), matrix([[6], [120]]))
+
+ assert_equal(np.prod(x), 720)
+ assert_equal(np.prod(x, axis=0), matrix([[4, 10, 18]]))
+ assert_equal(np.prod(x, axis=1), matrix([[6], [120]]))
+
+ y = matrix([0, 1, 3])
+ assert_(y.prod() == 0)
+
+ def test_max(self):
+ x = matrix([[1, 2, 3], [4, 5, 6]])
+ assert_equal(x.max(), 6)
+ assert_equal(x.max(0), matrix([[4, 5, 6]]))
+ assert_equal(x.max(1), matrix([[3], [6]]))
+
+ assert_equal(np.max(x), 6)
+ assert_equal(np.max(x, axis=0), matrix([[4, 5, 6]]))
+ assert_equal(np.max(x, axis=1), matrix([[3], [6]]))
+
+ def test_min(self):
+ x = matrix([[1, 2, 3], [4, 5, 6]])
+ assert_equal(x.min(), 1)
+ assert_equal(x.min(0), matrix([[1, 2, 3]]))
+ assert_equal(x.min(1), matrix([[1], [4]]))
+
+ assert_equal(np.min(x), 1)
+ assert_equal(np.min(x, axis=0), matrix([[1, 2, 3]]))
+ assert_equal(np.min(x, axis=1), matrix([[1], [4]]))
+
+ def test_ptp(self):
+ x = np.arange(4).reshape((2, 2))
+ assert_(x.ptp() == 3)
+ assert_(np.all(x.ptp(0) == np.array([2, 2])))
+ assert_(np.all(x.ptp(1) == np.array([1, 1])))
+
+ def test_var(self):
+ x = np.arange(9).reshape((3, 3))
+ mx = x.view(np.matrix)
+ assert_equal(x.var(ddof=0), mx.var(ddof=0))
+ assert_equal(x.var(ddof=1), mx.var(ddof=1))
+
+ def test_basic(self):
+ import numpy.linalg as linalg
+
+ A = np.array([[1., 2.],
+ [3., 4.]])
+ mA = matrix(A)
+ assert_(np.allclose(linalg.inv(A), mA.I))
+ assert_(np.all(np.array(np.transpose(A) == mA.T)))
+ assert_(np.all(np.array(np.transpose(A) == mA.H)))
+ assert_(np.all(A == mA.A))
+
+ B = A + 2j*A
+ mB = matrix(B)
+ assert_(np.allclose(linalg.inv(B), mB.I))
+ assert_(np.all(np.array(np.transpose(B) == mB.T)))
+ assert_(np.all(np.array(np.transpose(B).conj() == mB.H)))
+
+ def test_pinv(self):
+ x = matrix(np.arange(6).reshape(2, 3))
+ xpinv = matrix([[-0.77777778, 0.27777778],
+ [-0.11111111, 0.11111111],
+ [ 0.55555556, -0.05555556]])
+ assert_almost_equal(x.I, xpinv)
+
+ def test_comparisons(self):
+ A = np.arange(100).reshape(10, 10)
+ mA = matrix(A)
+ mB = matrix(A) + 0.1
+ assert_(np.all(mB == A+0.1))
+ assert_(np.all(mB == matrix(A+0.1)))
+ assert_(not np.any(mB == matrix(A-0.1)))
+ assert_(np.all(mA < mB))
+ assert_(np.all(mA <= mB))
+ assert_(np.all(mA <= mA))
+ assert_(not np.any(mA < mA))
+
+ assert_(not np.any(mB < mA))
+ assert_(np.all(mB >= mA))
+ assert_(np.all(mB >= mB))
+ assert_(not np.any(mB > mB))
+
+ assert_(np.all(mA == mA))
+ assert_(not np.any(mA == mB))
+ assert_(np.all(mB != mA))
+
+ assert_(not np.all(abs(mA) > 0))
+ assert_(np.all(abs(mB > 0)))
+
+ def test_asmatrix(self):
+ A = np.arange(100).reshape(10, 10)
+ mA = asmatrix(A)
+ A[0, 0] = -10
+ assert_(A[0, 0] == mA[0, 0])
+
+ def test_noaxis(self):
+ A = matrix([[1, 0], [0, 1]])
+ assert_(A.sum() == matrix(2))
+ assert_(A.mean() == matrix(0.5))
+
+ def test_repr(self):
+ A = matrix([[1, 0], [0, 1]])
+ assert_(repr(A) == "matrix([[1, 0],\n [0, 1]])")
+
+ def test_make_bool_matrix_from_str(self):
+ A = matrix('True; True; False')
+ B = matrix([[True], [True], [False]])
+ assert_array_equal(A, B)
+
+class TestCasting(object):
+ def test_basic(self):
+ A = np.arange(100).reshape(10, 10)
+ mA = matrix(A)
+
+ mB = mA.copy()
+ O = np.ones((10, 10), np.float64) * 0.1
+ mB = mB + O
+ assert_(mB.dtype.type == np.float64)
+ assert_(np.all(mA != mB))
+ assert_(np.all(mB == mA+0.1))
+
+ mC = mA.copy()
+ O = np.ones((10, 10), np.complex128)
+ mC = mC * O
+ assert_(mC.dtype.type == np.complex128)
+ assert_(np.all(mA != mB))
+
+
+class TestAlgebra(object):
+ def test_basic(self):
+ import numpy.linalg as linalg
+
+ A = np.array([[1., 2.], [3., 4.]])
+ mA = matrix(A)
+
+ B = np.identity(2)
+ for i in range(6):
+ assert_(np.allclose((mA ** i).A, B))
+ B = np.dot(B, A)
+
+ Ainv = linalg.inv(A)
+ B = np.identity(2)
+ for i in range(6):
+ assert_(np.allclose((mA ** -i).A, B))
+ B = np.dot(B, Ainv)
+
+ assert_(np.allclose((mA * mA).A, np.dot(A, A)))
+ assert_(np.allclose((mA + mA).A, (A + A)))
+ assert_(np.allclose((3*mA).A, (3*A)))
+
+ mA2 = matrix(A)
+ mA2 *= 3
+ assert_(np.allclose(mA2.A, 3*A))
+
+ def test_pow(self):
+ """Test raising a matrix to an integer power works as expected."""
+ m = matrix("1. 2.; 3. 4.")
+ m2 = m.copy()
+ m2 **= 2
+ mi = m.copy()
+ mi **= -1
+ m4 = m2.copy()
+ m4 **= 2
+ assert_array_almost_equal(m2, m**2)
+ assert_array_almost_equal(m4, np.dot(m2, m2))
+ assert_array_almost_equal(np.dot(mi, m), np.eye(2))
+
+ def test_scalar_type_pow(self):
+ m = matrix([[1, 2], [3, 4]])
+ for scalar_t in [np.int8, np.uint8]:
+ two = scalar_t(2)
+ assert_array_almost_equal(m ** 2, m ** two)
+
+ def test_notimplemented(self):
+ '''Check that 'not implemented' operations produce a failure.'''
+ A = matrix([[1., 2.],
+ [3., 4.]])
+
+ # __rpow__
+ with assert_raises(TypeError):
+ 1.0**A
+
+ # __mul__ with something not a list, ndarray, tuple, or scalar
+ with assert_raises(TypeError):
+ A*object()
+
+
+class TestMatrixReturn(object):
+ def test_instance_methods(self):
+ a = matrix([1.0], dtype='f8')
+ methodargs = {
+ 'astype': ('intc',),
+ 'clip': (0.0, 1.0),
+ 'compress': ([1],),
+ 'repeat': (1,),
+ 'reshape': (1,),
+ 'swapaxes': (0, 0),
+ 'dot': np.array([1.0]),
+ }
+ excluded_methods = [
+ 'argmin', 'choose', 'dump', 'dumps', 'fill', 'getfield',
+ 'getA', 'getA1', 'item', 'nonzero', 'put', 'putmask', 'resize',
+ 'searchsorted', 'setflags', 'setfield', 'sort',
+ 'partition', 'argpartition',
+ 'take', 'tofile', 'tolist', 'tostring', 'tobytes', 'all', 'any',
+ 'sum', 'argmax', 'argmin', 'min', 'max', 'mean', 'var', 'ptp',
+ 'prod', 'std', 'ctypes', 'itemset',
+ ]
+ for attrib in dir(a):
+ if attrib.startswith('_') or attrib in excluded_methods:
+ continue
+ f = getattr(a, attrib)
+ if isinstance(f, collections_abc.Callable):
+ # reset contents of a
+ a.astype('f8')
+ a.fill(1.0)
+ if attrib in methodargs:
+ args = methodargs[attrib]
+ else:
+ args = ()
+ b = f(*args)
+ assert_(type(b) is matrix, "%s" % attrib)
+ assert_(type(a.real) is matrix)
+ assert_(type(a.imag) is matrix)
+ c, d = matrix([0.0]).nonzero()
+ assert_(type(c) is np.ndarray)
+ assert_(type(d) is np.ndarray)
+
+
+class TestIndexing(object):
+ def test_basic(self):
+ x = asmatrix(np.zeros((3, 2), float))
+ y = np.zeros((3, 1), float)
+ y[:, 0] = [0.8, 0.2, 0.3]
+ x[:, 1] = y > 0.5
+ assert_equal(x, [[0, 1], [0, 0], [0, 0]])
+
+
+class TestNewScalarIndexing(object):
+ a = matrix([[1, 2], [3, 4]])
+
+ def test_dimesions(self):
+ a = self.a
+ x = a[0]
+ assert_equal(x.ndim, 2)
+
+ def test_array_from_matrix_list(self):
+ a = self.a
+ x = np.array([a, a])
+ assert_equal(x.shape, [2, 2, 2])
+
+ def test_array_to_list(self):
+ a = self.a
+ assert_equal(a.tolist(), [[1, 2], [3, 4]])
+
+ def test_fancy_indexing(self):
+ a = self.a
+ x = a[1, [0, 1, 0]]
+ assert_(isinstance(x, matrix))
+ assert_equal(x, matrix([[3, 4, 3]]))
+ x = a[[1, 0]]
+ assert_(isinstance(x, matrix))
+ assert_equal(x, matrix([[3, 4], [1, 2]]))
+ x = a[[[1], [0]], [[1, 0], [0, 1]]]
+ assert_(isinstance(x, matrix))
+ assert_equal(x, matrix([[4, 3], [1, 2]]))
+
+ def test_matrix_element(self):
+ x = matrix([[1, 2, 3], [4, 5, 6]])
+ assert_equal(x[0][0], matrix([[1, 2, 3]]))
+ assert_equal(x[0][0].shape, (1, 3))
+ assert_equal(x[0].shape, (1, 3))
+ assert_equal(x[:, 0].shape, (2, 1))
+
+ x = matrix(0)
+ assert_equal(x[0, 0], 0)
+ assert_equal(x[0], 0)
+ assert_equal(x[:, 0].shape, x.shape)
+
+ def test_scalar_indexing(self):
+ x = asmatrix(np.zeros((3, 2), float))
+ assert_equal(x[0, 0], x[0][0])
+
+ def test_row_column_indexing(self):
+ x = asmatrix(np.eye(2))
+ assert_array_equal(x[0,:], [[1, 0]])
+ assert_array_equal(x[1,:], [[0, 1]])
+ assert_array_equal(x[:, 0], [[1], [0]])
+ assert_array_equal(x[:, 1], [[0], [1]])
+
+ def test_boolean_indexing(self):
+ A = np.arange(6)
+ A.shape = (3, 2)
+ x = asmatrix(A)
+ assert_array_equal(x[:, np.array([True, False])], x[:, 0])
+ assert_array_equal(x[np.array([True, False, False]),:], x[0,:])
+
+ def test_list_indexing(self):
+ A = np.arange(6)
+ A.shape = (3, 2)
+ x = asmatrix(A)
+ assert_array_equal(x[:, [1, 0]], x[:, ::-1])
+ assert_array_equal(x[[2, 1, 0],:], x[::-1,:])
+
+
+class TestPower(object):
+ def test_returntype(self):
+ a = np.array([[0, 1], [0, 0]])
+ assert_(type(matrix_power(a, 2)) is np.ndarray)
+ a = mat(a)
+ assert_(type(matrix_power(a, 2)) is matrix)
+
+ def test_list(self):
+ assert_array_equal(matrix_power([[0, 1], [0, 0]], 2), [[0, 0], [0, 0]])
+
+
+class TestShape(object):
+
+ a = np.array([[1], [2]])
+ m = matrix([[1], [2]])
+
+ def test_shape(self):
+ assert_equal(self.a.shape, (2, 1))
+ assert_equal(self.m.shape, (2, 1))
+
+ def test_numpy_ravel(self):
+ assert_equal(np.ravel(self.a).shape, (2,))
+ assert_equal(np.ravel(self.m).shape, (2,))
+
+ def test_member_ravel(self):
+ assert_equal(self.a.ravel().shape, (2,))
+ assert_equal(self.m.ravel().shape, (1, 2))
+
+ def test_member_flatten(self):
+ assert_equal(self.a.flatten().shape, (2,))
+ assert_equal(self.m.flatten().shape, (1, 2))
+
+ def test_numpy_ravel_order(self):
+ x = np.array([[1, 2, 3], [4, 5, 6]])
+ assert_equal(np.ravel(x), [1, 2, 3, 4, 5, 6])
+ assert_equal(np.ravel(x, order='F'), [1, 4, 2, 5, 3, 6])
+ assert_equal(np.ravel(x.T), [1, 4, 2, 5, 3, 6])
+ assert_equal(np.ravel(x.T, order='A'), [1, 2, 3, 4, 5, 6])
+ x = matrix([[1, 2, 3], [4, 5, 6]])
+ assert_equal(np.ravel(x), [1, 2, 3, 4, 5, 6])
+ assert_equal(np.ravel(x, order='F'), [1, 4, 2, 5, 3, 6])
+ assert_equal(np.ravel(x.T), [1, 4, 2, 5, 3, 6])
+ assert_equal(np.ravel(x.T, order='A'), [1, 2, 3, 4, 5, 6])
+
+ def test_matrix_ravel_order(self):
+ x = matrix([[1, 2, 3], [4, 5, 6]])
+ assert_equal(x.ravel(), [[1, 2, 3, 4, 5, 6]])
+ assert_equal(x.ravel(order='F'), [[1, 4, 2, 5, 3, 6]])
+ assert_equal(x.T.ravel(), [[1, 4, 2, 5, 3, 6]])
+ assert_equal(x.T.ravel(order='A'), [[1, 2, 3, 4, 5, 6]])
+
+ def test_array_memory_sharing(self):
+ assert_(np.may_share_memory(self.a, self.a.ravel()))
+ assert_(not np.may_share_memory(self.a, self.a.flatten()))
+
+ def test_matrix_memory_sharing(self):
+ assert_(np.may_share_memory(self.m, self.m.ravel()))
+ assert_(not np.may_share_memory(self.m, self.m.flatten()))
+
+ def test_expand_dims_matrix(self):
+ # matrices are always 2d - so expand_dims only makes sense when the
+ # type is changed away from matrix.
+ a = np.arange(10).reshape((2, 5)).view(np.matrix)
+ expanded = np.expand_dims(a, axis=1)
+ assert_equal(expanded.ndim, 3)
+ assert_(not isinstance(expanded, np.matrix))
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_interaction.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_interaction.py
new file mode 100644
index 00000000000..088ae3c6a66
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_interaction.py
@@ -0,0 +1,363 @@
+"""Tests of interaction of matrix with other parts of numpy.
+
+Note that tests with MaskedArray and linalg are done in separate files.
+"""
+from __future__ import division, absolute_import, print_function
+
+import pytest
+
+import textwrap
+import warnings
+
+import numpy as np
+from numpy.testing import (assert_, assert_equal, assert_raises,
+ assert_raises_regex, assert_array_equal,
+ assert_almost_equal, assert_array_almost_equal)
+
+
+def test_fancy_indexing():
+ # The matrix class messes with the shape. While this is always
+ # weird (getitem is not used, it does not have setitem nor knows
+ # about fancy indexing), this tests gh-3110
+ # 2018-04-29: moved here from core.tests.test_index.
+ m = np.matrix([[1, 2], [3, 4]])
+
+ assert_(isinstance(m[[0, 1, 0], :], np.matrix))
+
+ # gh-3110. Note the transpose currently because matrices do *not*
+ # support dimension fixing for fancy indexing correctly.
+ x = np.asmatrix(np.arange(50).reshape(5, 10))
+ assert_equal(x[:2, np.array(-1)], x[:2, -1].T)
+
+
+def test_polynomial_mapdomain():
+ # test that polynomial mapdomain preserves the matrix subtype.
+ # 2018-04-29: moved here from polynomial.tests.polyutils.
+ dom1 = [0, 4]
+ dom2 = [1, 3]
+ x = np.matrix([dom1, dom1])
+ res = np.polynomial.polyutils.mapdomain(x, dom1, dom2)
+ assert_(isinstance(res, np.matrix))
+
+
+def test_sort_matrix_none():
+ # 2018-04-29: moved here from core.tests.test_multiarray
+ a = np.matrix([[2, 1, 0]])
+ actual = np.sort(a, axis=None)
+ expected = np.matrix([[0, 1, 2]])
+ assert_equal(actual, expected)
+ assert_(type(expected) is np.matrix)
+
+
+def test_partition_matrix_none():
+ # gh-4301
+ # 2018-04-29: moved here from core.tests.test_multiarray
+ a = np.matrix([[2, 1, 0]])
+ actual = np.partition(a, 1, axis=None)
+ expected = np.matrix([[0, 1, 2]])
+ assert_equal(actual, expected)
+ assert_(type(expected) is np.matrix)
+
+
+def test_dot_scalar_and_matrix_of_objects():
+ # Ticket #2469
+ # 2018-04-29: moved here from core.tests.test_multiarray
+ arr = np.matrix([1, 2], dtype=object)
+ desired = np.matrix([[3, 6]], dtype=object)
+ assert_equal(np.dot(arr, 3), desired)
+ assert_equal(np.dot(3, arr), desired)
+
+
+def test_inner_scalar_and_matrix():
+ # 2018-04-29: moved here from core.tests.test_multiarray
+ for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?':
+ sca = np.array(3, dtype=dt)[()]
+ arr = np.matrix([[1, 2], [3, 4]], dtype=dt)
+ desired = np.matrix([[3, 6], [9, 12]], dtype=dt)
+ assert_equal(np.inner(arr, sca), desired)
+ assert_equal(np.inner(sca, arr), desired)
+
+
+def test_inner_scalar_and_matrix_of_objects():
+ # Ticket #4482
+ # 2018-04-29: moved here from core.tests.test_multiarray
+ arr = np.matrix([1, 2], dtype=object)
+ desired = np.matrix([[3, 6]], dtype=object)
+ assert_equal(np.inner(arr, 3), desired)
+ assert_equal(np.inner(3, arr), desired)
+
+
+def test_iter_allocate_output_subtype():
+ # Make sure that the subtype with priority wins
+ # 2018-04-29: moved here from core.tests.test_nditer, given the
+ # matrix specific shape test.
+
+ # matrix vs ndarray
+ a = np.matrix([[1, 2], [3, 4]])
+ b = np.arange(4).reshape(2, 2).T
+ i = np.nditer([a, b, None], [],
+ [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+ assert_(type(i.operands[2]) is np.matrix)
+ assert_(type(i.operands[2]) is not np.ndarray)
+ assert_equal(i.operands[2].shape, (2, 2))
+
+ # matrix always wants things to be 2D
+ b = np.arange(4).reshape(1, 2, 2)
+ assert_raises(RuntimeError, np.nditer, [a, b, None], [],
+ [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+ # but if subtypes are disabled, the result can still work
+ i = np.nditer([a, b, None], [],
+ [['readonly'], ['readonly'],
+ ['writeonly', 'allocate', 'no_subtype']])
+ assert_(type(i.operands[2]) is np.ndarray)
+ assert_(type(i.operands[2]) is not np.matrix)
+ assert_equal(i.operands[2].shape, (1, 2, 2))
+
+
+def like_function():
+ # 2018-04-29: moved here from core.tests.test_numeric
+ a = np.matrix([[1, 2], [3, 4]])
+ for like_function in np.zeros_like, np.ones_like, np.empty_like:
+ b = like_function(a)
+ assert_(type(b) is np.matrix)
+
+ c = like_function(a, subok=False)
+ assert_(type(c) is not np.matrix)
+
+
+def test_array_astype():
+ # 2018-04-29: copied here from core.tests.test_api
+ # subok=True passes through a matrix
+ a = np.matrix([[0, 1, 2], [3, 4, 5]], dtype='f4')
+ b = a.astype('f4', subok=True, copy=False)
+ assert_(a is b)
+
+ # subok=True is default, and creates a subtype on a cast
+ b = a.astype('i4', copy=False)
+ assert_equal(a, b)
+ assert_equal(type(b), np.matrix)
+
+ # subok=False never returns a matrix
+ b = a.astype('f4', subok=False, copy=False)
+ assert_equal(a, b)
+ assert_(not (a is b))
+ assert_(type(b) is not np.matrix)
+
+
+def test_stack():
+ # 2018-04-29: copied here from core.tests.test_shape_base
+ # check np.matrix cannot be stacked
+ m = np.matrix([[1, 2], [3, 4]])
+ assert_raises_regex(ValueError, 'shape too large to be a matrix',
+ np.stack, [m, m])
+
+
+def test_object_scalar_multiply():
+ # Tickets #2469 and #4482
+ # 2018-04-29: moved here from core.tests.test_ufunc
+ arr = np.matrix([1, 2], dtype=object)
+ desired = np.matrix([[3, 6]], dtype=object)
+ assert_equal(np.multiply(arr, 3), desired)
+ assert_equal(np.multiply(3, arr), desired)
+
+
+def test_nanfunctions_matrices():
+ # Check that it works and that type and
+ # shape are preserved
+ # 2018-04-29: moved here from core.tests.test_nanfunctions
+ mat = np.matrix(np.eye(3))
+ for f in [np.nanmin, np.nanmax]:
+ res = f(mat, axis=0)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (1, 3))
+ res = f(mat, axis=1)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (3, 1))
+ res = f(mat)
+ assert_(np.isscalar(res))
+ # check that rows of nan are dealt with for subclasses (#4628)
+ mat[1] = np.nan
+ for f in [np.nanmin, np.nanmax]:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(mat, axis=0)
+ assert_(isinstance(res, np.matrix))
+ assert_(not np.any(np.isnan(res)))
+ assert_(len(w) == 0)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(mat, axis=1)
+ assert_(isinstance(res, np.matrix))
+ assert_(np.isnan(res[1, 0]) and not np.isnan(res[0, 0])
+ and not np.isnan(res[2, 0]))
+ assert_(len(w) == 1, 'no warning raised')
+ assert_(issubclass(w[0].category, RuntimeWarning))
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ res = f(mat)
+ assert_(np.isscalar(res))
+ assert_(res != np.nan)
+ assert_(len(w) == 0)
+
+
+def test_nanfunctions_matrices_general():
+ # Check that it works and that type and
+ # shape are preserved
+ # 2018-04-29: moved here from core.tests.test_nanfunctions
+ mat = np.matrix(np.eye(3))
+ for f in (np.nanargmin, np.nanargmax, np.nansum, np.nanprod,
+ np.nanmean, np.nanvar, np.nanstd):
+ res = f(mat, axis=0)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (1, 3))
+ res = f(mat, axis=1)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (3, 1))
+ res = f(mat)
+ assert_(np.isscalar(res))
+
+ for f in np.nancumsum, np.nancumprod:
+ res = f(mat, axis=0)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (3, 3))
+ res = f(mat, axis=1)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (3, 3))
+ res = f(mat)
+ assert_(isinstance(res, np.matrix))
+ assert_(res.shape == (1, 3*3))
+
+
+def test_average_matrix():
+ # 2018-04-29: moved here from core.tests.test_function_base.
+ y = np.matrix(np.random.rand(5, 5))
+ assert_array_equal(y.mean(0), np.average(y, 0))
+
+ a = np.matrix([[1, 2], [3, 4]])
+ w = np.matrix([[1, 2], [3, 4]])
+
+ r = np.average(a, axis=0, weights=w)
+ assert_equal(type(r), np.matrix)
+ assert_equal(r, [[2.5, 10.0/3]])
+
+
+def test_trapz_matrix():
+ # Test to make sure matrices give the same answer as ndarrays
+ # 2018-04-29: moved here from core.tests.test_function_base.
+ x = np.linspace(0, 5)
+ y = x * x
+ r = np.trapz(y, x)
+ mx = np.matrix(x)
+ my = np.matrix(y)
+ mr = np.trapz(my, mx)
+ assert_almost_equal(mr, r)
+
+
+def test_ediff1d_matrix():
+ # 2018-04-29: moved here from core.tests.test_arraysetops.
+ assert(isinstance(np.ediff1d(np.matrix(1)), np.matrix))
+ assert(isinstance(np.ediff1d(np.matrix(1), to_begin=1), np.matrix))
+
+
+def test_apply_along_axis_matrix():
+ # this test is particularly malicious because matrix
+ # refuses to become 1d
+ # 2018-04-29: moved here from core.tests.test_shape_base.
+ def double(row):
+ return row * 2
+
+ m = np.matrix([[0, 1], [2, 3]])
+ expected = np.matrix([[0, 2], [4, 6]])
+
+ result = np.apply_along_axis(double, 0, m)
+ assert_(isinstance(result, np.matrix))
+ assert_array_equal(result, expected)
+
+ result = np.apply_along_axis(double, 1, m)
+ assert_(isinstance(result, np.matrix))
+ assert_array_equal(result, expected)
+
+
+def test_kron_matrix():
+ # 2018-04-29: moved here from core.tests.test_shape_base.
+ a = np.ones([2, 2])
+ m = np.asmatrix(a)
+ assert_equal(type(np.kron(a, a)), np.ndarray)
+ assert_equal(type(np.kron(m, m)), np.matrix)
+ assert_equal(type(np.kron(a, m)), np.matrix)
+ assert_equal(type(np.kron(m, a)), np.matrix)
+
+
+class TestConcatenatorMatrix(object):
+ # 2018-04-29: moved here from core.tests.test_index_tricks.
+ def test_matrix(self):
+ a = [1, 2]
+ b = [3, 4]
+
+ ab_r = np.r_['r', a, b]
+ ab_c = np.r_['c', a, b]
+
+ assert_equal(type(ab_r), np.matrix)
+ assert_equal(type(ab_c), np.matrix)
+
+ assert_equal(np.array(ab_r), [[1, 2, 3, 4]])
+ assert_equal(np.array(ab_c), [[1], [2], [3], [4]])
+
+ assert_raises(ValueError, lambda: np.r_['rc', a, b])
+
+ def test_matrix_scalar(self):
+ r = np.r_['r', [1, 2], 3]
+ assert_equal(type(r), np.matrix)
+ assert_equal(np.array(r), [[1, 2, 3]])
+
+ def test_matrix_builder(self):
+ a = np.array([1])
+ b = np.array([2])
+ c = np.array([3])
+ d = np.array([4])
+ actual = np.r_['a, b; c, d']
+ expected = np.bmat([[a, b], [c, d]])
+
+ assert_equal(actual, expected)
+ assert_equal(type(actual), type(expected))
+
+
+def test_array_equal_error_message_matrix():
+ # 2018-04-29: moved here from testing.tests.test_utils.
+ try:
+ assert_equal(np.array([1, 2]), np.matrix([1, 2]))
+ except AssertionError as e:
+ msg = str(e)
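+ # On Python 2 some platforms repr the shape entries as longs (e.g. "2L");
+ # normalize that spelling so the comparison below covers both variants.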
+ msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
+ msg_reference = textwrap.dedent("""\
+
+ Arrays are not equal
+
+ (shapes (2,), (1, 2) mismatch)
+ x: array([1, 2])
+ y: matrix([[1, 2]])""")
+ try:
+ assert_equal(msg, msg_reference)
+ except AssertionError:
+ assert_equal(msg2, msg_reference)
+ else:
+ raise AssertionError("Did not raise")
+
+
+def test_array_almost_equal_matrix():
+ # Matrix slicing keeps things 2-D, while array does not necessarily.
+ # See gh-8452.
+ # 2018-04-29: moved here from testing.tests.test_utils.
+ m1 = np.matrix([[1., 2.]])
+ m2 = np.matrix([[1., np.nan]])
+ m3 = np.matrix([[1., -np.inf]])
+ m4 = np.matrix([[np.nan, np.inf]])
+ m5 = np.matrix([[1., 2.], [np.nan, np.inf]])
+ for assert_func in assert_array_almost_equal, assert_almost_equal:
+ for m in m1, m2, m3, m4, m5:
+ assert_func(m, m)
+ a = np.array(m)
+ assert_func(a, m)
+ assert_func(m, a)
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_masked_matrix.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_masked_matrix.py
new file mode 100644
index 00000000000..52fd185773b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_masked_matrix.py
@@ -0,0 +1,231 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.ma.testutils import (assert_, assert_equal, assert_raises,
+ assert_array_equal)
+from numpy.ma.core import (masked_array, masked_values, masked, allequal,
+ MaskType, getmask, MaskedArray, nomask,
+ log, add, hypot, divide)
+from numpy.ma.extras import mr_
+from numpy.core.numeric import pickle
+
+
+class MMatrix(MaskedArray, np.matrix):
+
+ def __new__(cls, data, mask=nomask):
+ mat = np.matrix(data)
+ _data = MaskedArray.__new__(cls, data=mat, mask=mask)
+ return _data
+
+ def __array_finalize__(self, obj):
+ np.matrix.__array_finalize__(self, obj)
+ MaskedArray.__array_finalize__(self, obj)
+ return
+
+ def _get_series(self):
+ _view = self.view(MaskedArray)
+ _view._sharedmask = False
+ return _view
+ _series = property(fget=_get_series)
+
+
+class TestMaskedMatrix(object):
+ def test_matrix_indexing(self):
+ # Tests conversions and indexing
+ x1 = np.matrix([[1, 2, 3], [4, 3, 2]])
+ x2 = masked_array(x1, mask=[[1, 0, 0], [0, 1, 0]])
+ x3 = masked_array(x1, mask=[[0, 1, 0], [1, 0, 0]])
+ x4 = masked_array(x1)
+ # test conversion to strings
+ str(x2) # raises?
+ repr(x2) # raises?
+ # tests of indexing
+ assert_(type(x2[1, 0]) is type(x1[1, 0]))
+ assert_(x1[1, 0] == x2[1, 0])
+ assert_(x2[1, 1] is masked)
+ assert_equal(x1[0, 2], x2[0, 2])
+ assert_equal(x1[0, 1:], x2[0, 1:])
+ assert_equal(x1[:, 2], x2[:, 2])
+ assert_equal(x1[:], x2[:])
+ assert_equal(x1[1:], x3[1:])
+ x1[0, 2] = 9
+ x2[0, 2] = 9
+ assert_equal(x1, x2)
+ x1[0, 1:] = 99
+ x2[0, 1:] = 99
+ assert_equal(x1, x2)
+ x2[0, 1] = masked
+ assert_equal(x1, x2)
+ x2[0, 1:] = masked
+ assert_equal(x1, x2)
+ x2[0, :] = x1[0, :]
+ x2[0, 1] = masked
+ assert_(allequal(getmask(x2), np.array([[0, 1, 0], [0, 1, 0]])))
+ x3[1, :] = masked_array([1, 2, 3], [1, 1, 0])
+ assert_(allequal(getmask(x3)[1], masked_array([1, 1, 0])))
+ assert_(allequal(getmask(x3[1]), masked_array([1, 1, 0])))
+ x4[1, :] = masked_array([1, 2, 3], [1, 1, 0])
+ assert_(allequal(getmask(x4[1]), masked_array([1, 1, 0])))
+ assert_(allequal(x4[1], masked_array([1, 2, 3])))
+ x1 = np.matrix(np.arange(5) * 1.0)
+ x2 = masked_values(x1, 3.0)
+ assert_equal(x1, x2)
+ assert_(allequal(masked_array([0, 0, 0, 1, 0], dtype=MaskType),
+ x2.mask))
+ assert_equal(3.0, x2.fill_value)
+
+ def test_pickling_subbaseclass(self):
+ # Test pickling w/ a subclass of ndarray
+ a = masked_array(np.matrix(list(range(10))), mask=[1, 0, 1, 0, 0] * 2)
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ a_pickled = pickle.loads(pickle.dumps(a, protocol=proto))
+ assert_equal(a_pickled._mask, a._mask)
+ assert_equal(a_pickled, a)
+ assert_(isinstance(a_pickled._data, np.matrix))
+
+ def test_count_mean_with_matrix(self):
+ m = masked_array(np.matrix([[1, 2], [3, 4]]), mask=np.zeros((2, 2)))
+
+ assert_equal(m.count(axis=0).shape, (1, 2))
+ assert_equal(m.count(axis=1).shape, (2, 1))
+
+ # Make sure broadcasting inside mean and var work
+ assert_equal(m.mean(axis=0), [[2., 3.]])
+ assert_equal(m.mean(axis=1), [[1.5], [3.5]])
+
+ def test_flat(self):
+ # Test that flat can return items even for matrices [#4585, #4615]
+ # test simple access
+ test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1])
+ assert_equal(test.flat[1], 2)
+ assert_equal(test.flat[2], masked)
+ assert_(np.all(test.flat[0:2] == test[0, 0:2]))
+ # Test flat on masked_matrices
+ test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1])
+ test.flat = masked_array([3, 2, 1], mask=[1, 0, 0])
+ control = masked_array(np.matrix([[3, 2, 1]]), mask=[1, 0, 0])
+ assert_equal(test, control)
+ # Test setting
+ test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1])
+ testflat = test.flat
+ testflat[:] = testflat[[2, 1, 0]]
+ assert_equal(test, control)
+ testflat[0] = 9
+ # test that matrices keep the correct shape (#4615)
+ a = masked_array(np.matrix(np.eye(2)), mask=0)
+ b = a.flat
+ b01 = b[:2]
+ assert_equal(b01.data, np.array([[1., 0.]]))
+ assert_equal(b01.mask, np.array([[False, False]]))
+
+ def test_allany_onmatrices(self):
+ x = np.array([[0.13, 0.26, 0.90],
+ [0.28, 0.33, 0.63],
+ [0.31, 0.87, 0.70]])
+ X = np.matrix(x)
+ m = np.array([[True, False, False],
+ [False, False, False],
+ [True, True, False]], dtype=np.bool_)
+ mX = masked_array(X, mask=m)
+ mXbig = (mX > 0.5)
+ mXsmall = (mX < 0.5)
+
+ assert_(not mXbig.all())
+ assert_(mXbig.any())
+ assert_equal(mXbig.all(0), np.matrix([False, False, True]))
+ assert_equal(mXbig.all(1), np.matrix([False, False, True]).T)
+ assert_equal(mXbig.any(0), np.matrix([False, False, True]))
+ assert_equal(mXbig.any(1), np.matrix([True, True, True]).T)
+
+ assert_(not mXsmall.all())
+ assert_(mXsmall.any())
+ assert_equal(mXsmall.all(0), np.matrix([True, True, False]))
+ assert_equal(mXsmall.all(1), np.matrix([False, False, False]).T)
+ assert_equal(mXsmall.any(0), np.matrix([True, True, False]))
+ assert_equal(mXsmall.any(1), np.matrix([True, True, False]).T)
+
+ def test_compressed(self):
+ a = masked_array(np.matrix([1, 2, 3, 4]), mask=[0, 0, 0, 0])
+ b = a.compressed()
+ assert_equal(b, a)
+ assert_(isinstance(b, np.matrix))
+ a[0, 0] = masked
+ b = a.compressed()
+ assert_equal(b, [[2, 3, 4]])
+
+ def test_ravel(self):
+ a = masked_array(np.matrix([1, 2, 3, 4, 5]), mask=[[0, 1, 0, 0, 0]])
+ aravel = a.ravel()
+ assert_equal(aravel.shape, (1, 5))
+ assert_equal(aravel._mask.shape, a.shape)
+
+ def test_view(self):
+ # Test view w/ flexible dtype
+ iterator = list(zip(np.arange(10), np.random.rand(10)))
+ data = np.array(iterator)
+ a = masked_array(iterator, dtype=[('a', float), ('b', float)])
+ a.mask[0] = (1, 0)
+ test = a.view((float, 2), np.matrix)
+ assert_equal(test, data)
+ assert_(isinstance(test, np.matrix))
+ assert_(not isinstance(test, MaskedArray))
+
+
+class TestSubclassing(object):
+ # Test suite for masked subclasses of ndarray.
+
+ def setup(self):
+ x = np.arange(5, dtype='float')
+ mx = MMatrix(x, mask=[0, 1, 0, 0, 0])
+ self.data = (x, mx)
+
+ def test_maskedarray_subclassing(self):
+ # Tests subclassing MaskedArray
+ (x, mx) = self.data
+ assert_(isinstance(mx._data, np.matrix))
+
+ def test_masked_unary_operations(self):
+ # Tests masked_unary_operation
+ (x, mx) = self.data
+ with np.errstate(divide='ignore'):
+ assert_(isinstance(log(mx), MMatrix))
+ assert_equal(log(x), np.log(x))
+
+ def test_masked_binary_operations(self):
+ # Tests masked_binary_operation
+ (x, mx) = self.data
+ # Result should be a MMatrix
+ assert_(isinstance(add(mx, mx), MMatrix))
+ assert_(isinstance(add(mx, x), MMatrix))
+ # Result values should agree with the corresponding operator expression
+ assert_equal(add(mx, x), mx+x)
+ assert_(isinstance(add(mx, mx)._data, np.matrix))
+ assert_(isinstance(add.outer(mx, mx), MMatrix))
+ assert_(isinstance(hypot(mx, mx), MMatrix))
+ assert_(isinstance(hypot(mx, x), MMatrix))
+
+ def test_masked_binary_operations2(self):
+ # Tests domained_masked_binary_operation
+ (x, mx) = self.data
+ xmx = masked_array(mx.data.__array__(), mask=mx.mask)
+ assert_(isinstance(divide(mx, mx), MMatrix))
+ assert_(isinstance(divide(mx, x), MMatrix))
+ assert_equal(divide(mx, mx), divide(xmx, xmx))
+
+
+class TestConcatenator(object):
+ # Tests for mr_, the equivalent of r_ for masked arrays.
+
+ def test_matrix_builder(self):
+ assert_raises(np.ma.MAError, lambda: mr_['1, 2; 3, 4'])
+
+ def test_matrix(self):
+ # Test consistency with unmasked version. If we ever deprecate
+ # matrix, this test should either still pass, or both actual and
+ # expected should fail to be built.
+ actual = mr_['r', 1, 2, 3]
+ expected = np.ma.array(np.r_['r', 1, 2, 3])
+ assert_array_equal(actual, expected)
+
+ # outer type is masked array, inner type is matrix
+ assert_equal(type(actual), type(expected))
+ assert_equal(type(actual.data), type(expected.data))
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_matrix_linalg.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_matrix_linalg.py
new file mode 100644
index 00000000000..6fc733c2e91
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_matrix_linalg.py
@@ -0,0 +1,95 @@
+""" Test functions for linalg module using the matrix class."""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+
+from numpy.linalg.tests.test_linalg import (
+ LinalgCase, apply_tag, TestQR as _TestQR, LinalgTestCase,
+ _TestNorm2D, _TestNormDoubleBase, _TestNormSingleBase, _TestNormInt64Base,
+ SolveCases, InvCases, EigvalsCases, EigCases, SVDCases, CondCases,
+ PinvCases, DetCases, LstsqCases)
+
+
+CASES = []
+
+# square test cases
+CASES += apply_tag('square', [
+ LinalgCase("0x0_matrix",
+ np.empty((0, 0), dtype=np.double).view(np.matrix),
+ np.empty((0, 1), dtype=np.double).view(np.matrix),
+ tags={'size-0'}),
+ LinalgCase("matrix_b_only",
+ np.array([[1., 2.], [3., 4.]]),
+ np.matrix([2., 1.]).T),
+ LinalgCase("matrix_a_and_b",
+ np.matrix([[1., 2.], [3., 4.]]),
+ np.matrix([2., 1.]).T),
+])
+
+# hermitian test-cases
+CASES += apply_tag('hermitian', [
+ LinalgCase("hmatrix_a_and_b",
+ np.matrix([[1., 2.], [2., 1.]]),
+ None),
+])
+# No need to make generalized or strided cases for matrices.
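+# Each LinalgCase above bundles a test name, an 'a' operand and an optional
+# 'b' operand (plus tags); the shared *Cases mixins imported from
+# test_linalg run their checks over every case listed in TEST_CASES below.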
+
+
+class MatrixTestCase(LinalgTestCase):
+ TEST_CASES = CASES
+
+
+class TestSolveMatrix(SolveCases, MatrixTestCase):
+ pass
+
+
+class TestInvMatrix(InvCases, MatrixTestCase):
+ pass
+
+
+class TestEigvalsMatrix(EigvalsCases, MatrixTestCase):
+ pass
+
+
+class TestEigMatrix(EigCases, MatrixTestCase):
+ pass
+
+
+class TestSVDMatrix(SVDCases, MatrixTestCase):
+ pass
+
+
+class TestCondMatrix(CondCases, MatrixTestCase):
+ pass
+
+
+class TestPinvMatrix(PinvCases, MatrixTestCase):
+ pass
+
+
+class TestDetMatrix(DetCases, MatrixTestCase):
+ pass
+
+
+class TestLstsqMatrix(LstsqCases, MatrixTestCase):
+ pass
+
+
+class _TestNorm2DMatrix(_TestNorm2D):
+ array = np.matrix
+
+
+class TestNormDoubleMatrix(_TestNorm2DMatrix, _TestNormDoubleBase):
+ pass
+
+
+class TestNormSingleMatrix(_TestNorm2DMatrix, _TestNormSingleBase):
+ pass
+
+
+class TestNormInt64Matrix(_TestNorm2DMatrix, _TestNormInt64Base):
+ pass
+
+
+class TestQRMatrix(_TestQR):
+ array = np.matrix
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_multiarray.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_multiarray.py
new file mode 100644
index 00000000000..6d84bd4777a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_multiarray.py
@@ -0,0 +1,18 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_, assert_equal, assert_array_equal
+
+class TestView(object):
+ def test_type(self):
+ x = np.array([1, 2, 3])
+ assert_(isinstance(x.view(np.matrix), np.matrix))
+
+ def test_keywords(self):
+ x = np.array([(1, 2)], dtype=[('a', np.int8), ('b', np.int8)])
+ # We must be specific about the endianness here:
+ y = x.view(dtype='<i2', type=np.matrix)
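+ # the two int8 fields (1, 2) read as one little-endian int16: 0x0201 == 513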
+ assert_array_equal(y, [[513]])
+
+ assert_(isinstance(y, np.matrix))
+ assert_equal(y.dtype, np.dtype('<i2'))
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_numeric.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_numeric.py
new file mode 100644
index 00000000000..95e1c800177
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_numeric.py
@@ -0,0 +1,19 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_equal
+
+class TestDot(object):
+ def test_matscalar(self):
+ b1 = np.matrix(np.ones((3, 3), dtype=complex))
+ assert_equal(b1*1.0, b1)
+
+
+def test_diagonal():
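+ # matrix.diagonal() stays 2-D (a 1x2 matrix), while np.diagonal and np.diag
+ # on a matrix return plain 1-D ndarrays; the shape-sensitive asserts below
+ # pin down both behaviours.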
+ b1 = np.matrix([[1,2],[3,4]])
+ diag_b1 = np.matrix([[1, 4]])
+ array_b1 = np.array([1, 4])
+
+ assert_equal(b1.diagonal(), diag_b1)
+ assert_equal(np.diagonal(b1), array_b1)
+ assert_equal(np.diag(b1), array_b1)
diff --git a/contrib/python/numpy/py2/numpy/matrixlib/tests/test_regression.py b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_regression.py
new file mode 100644
index 00000000000..70e1472793f
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/matrixlib/tests/test_regression.py
@@ -0,0 +1,33 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from numpy.testing import assert_, assert_equal, assert_raises
+
+
+class TestRegression(object):
+ def test_kron_matrix(self):
+ # Ticket #71
+ x = np.matrix('[1 0; 1 0]')
+ assert_equal(type(np.kron(x, x)), type(x))
+
+ def test_matrix_properties(self):
+ # Ticket #125
+ a = np.matrix([1.0], dtype=float)
+ assert_(type(a.real) is np.matrix)
+ assert_(type(a.imag) is np.matrix)
+ c, d = np.matrix([0.0]).nonzero()
+ assert_(type(c) is np.ndarray)
+ assert_(type(d) is np.ndarray)
+
+ def test_matrix_multiply_by_1d_vector(self):
+ # Ticket #473
+ def mul():
+ np.mat(np.eye(2))*np.ones(2)
+
+ assert_raises(ValueError, mul)
+
+ def test_matrix_std_argmax(self):
+ # Ticket #83
+ x = np.asmatrix(np.random.uniform(0, 1, (3, 3)))
+ assert_equal(x.std().shape, ())
+ assert_equal(x.argmax().shape, ())
diff --git a/contrib/python/numpy/py2/numpy/polynomial/setup.py b/contrib/python/numpy/py2/numpy/polynomial/setup.py
new file mode 100644
index 00000000000..cb59ee1e56d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/setup.py
@@ -0,0 +1,11 @@
+from __future__ import division, print_function
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('polynomial', parent_package, top_path)
+ config.add_data_dir('tests')
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/__init__.py b/contrib/python/numpy/py2/numpy/polynomial/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_chebyshev.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_chebyshev.py
new file mode 100644
index 00000000000..7fb7492c620
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_chebyshev.py
@@ -0,0 +1,621 @@
+"""Tests for chebyshev module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.polynomial.chebyshev as cheb
+from numpy.polynomial.polynomial import polyval
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+
+def trim(x):
+ return cheb.chebtrim(x, tol=1e-6)
+
+T0 = [1]
+T1 = [0, 1]
+T2 = [-1, 0, 2]
+T3 = [0, -3, 0, 4]
+T4 = [1, 0, -8, 0, 8]
+T5 = [0, 5, 0, -20, 0, 16]
+T6 = [-1, 0, 18, 0, -48, 0, 32]
+T7 = [0, -7, 0, 56, 0, -112, 0, 64]
+T8 = [1, 0, -32, 0, 160, 0, -256, 0, 128]
+T9 = [0, 9, 0, -120, 0, 432, 0, -576, 0, 256]
+
+Tlist = [T0, T1, T2, T3, T4, T5, T6, T7, T8, T9]
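+# The lists above hold the power-basis coefficients of the Chebyshev
+# polynomials of the first kind, generated by T_0 = 1, T_1 = x and the
+# recurrence T_{n+1}(x) = 2*x*T_n(x) - T_{n-1}(x); e.g. T_2 = 2*x*T_1 - T_0
+# gives [-1, 0, 2].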
+
+
+class TestPrivate(object):
+
+ def test__cseries_to_zseries(self):
+ for i in range(5):
+ inp = np.array([2] + [1]*i, np.double)
+ tgt = np.array([.5]*i + [2] + [.5]*i, np.double)
+ res = cheb._cseries_to_zseries(inp)
+ assert_equal(res, tgt)
+
+ def test__zseries_to_cseries(self):
+ for i in range(5):
+ inp = np.array([.5]*i + [2] + [.5]*i, np.double)
+ tgt = np.array([2] + [1]*i, np.double)
+ res = cheb._zseries_to_cseries(inp)
+ assert_equal(res, tgt)
+
+
+class TestConstants(object):
+
+ def test_chebdomain(self):
+ assert_equal(cheb.chebdomain, [-1, 1])
+
+ def test_chebzero(self):
+ assert_equal(cheb.chebzero, [0])
+
+ def test_chebone(self):
+ assert_equal(cheb.chebone, [1])
+
+ def test_chebx(self):
+ assert_equal(cheb.chebx, [0, 1])
+
+
+class TestArithmetic(object):
+
+ def test_chebadd(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] += 1
+ res = cheb.chebadd([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_chebsub(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] -= 1
+ res = cheb.chebsub([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_chebmulx(self):
+ assert_equal(cheb.chebmulx([0]), [0])
+ assert_equal(cheb.chebmulx([1]), [0, 1])
+ for i in range(1, 5):
+ ser = [0]*i + [1]
+ tgt = [0]*(i - 1) + [.5, 0, .5]
+ assert_equal(cheb.chebmulx(ser), tgt)
+
+ def test_chebmul(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(i + j + 1)
+ tgt[i + j] += .5
+ tgt[abs(i - j)] += .5
+ res = cheb.chebmul([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_chebdiv(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ ci = [0]*i + [1]
+ cj = [0]*j + [1]
+ tgt = cheb.chebadd(ci, cj)
+ quo, rem = cheb.chebdiv(tgt, ci)
+ res = cheb.chebadd(cheb.chebmul(quo, ci), rem)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_chebpow(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ c = np.arange(i + 1)
+ tgt = reduce(cheb.chebmul, [c]*j, np.array([1]))
+ res = cheb.chebpow(c, j)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+
+class TestEvaluation(object):
+ # coefficients of 1 + 2*x + 3*x**2
+ c1d = np.array([2.5, 2., 1.5])
+ c2d = np.einsum('i,j->ij', c1d, c1d)
+ c3d = np.einsum('i,j,k->ijk', c1d, c1d, c1d)
+
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+ y = polyval(x, [1., 2., 3.])
+
+ def test_chebval(self):
+ #check empty input
+ assert_equal(cheb.chebval([], [1]).size, 0)
+
+ # check normal input
+ x = np.linspace(-1, 1)
+ y = [polyval(x, c) for c in Tlist]
+ for i in range(10):
+ msg = "At i=%d" % i
+ tgt = y[i]
+ res = cheb.chebval(x, [0]*i + [1])
+ assert_almost_equal(res, tgt, err_msg=msg)
+
+ #check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(cheb.chebval(x, [1]).shape, dims)
+ assert_equal(cheb.chebval(x, [1, 0]).shape, dims)
+ assert_equal(cheb.chebval(x, [1, 0, 0]).shape, dims)
+
+ def test_chebval2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, cheb.chebval2d, x1, x2[:2], self.c2d)
+
+ #test values
+ tgt = y1*y2
+ res = cheb.chebval2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = cheb.chebval2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3))
+
+ def test_chebval3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, cheb.chebval3d, x1, x2, x3[:2], self.c3d)
+
+ #test values
+ tgt = y1*y2*y3
+ res = cheb.chebval3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = cheb.chebval3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3))
+
+ def test_chebgrid2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j->ij', y1, y2)
+ res = cheb.chebgrid2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = cheb.chebgrid2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3)*2)
+
+ def test_chebgrid3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j,k->ijk', y1, y2, y3)
+ res = cheb.chebgrid3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = cheb.chebgrid3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3)*3)
+
+
+class TestIntegral(object):
+
+ def test_chebint(self):
+ # check exceptions
+ assert_raises(ValueError, cheb.chebint, [0], .5)
+ assert_raises(ValueError, cheb.chebint, [0], -1)
+ assert_raises(ValueError, cheb.chebint, [0], 1, [0, 0])
+ assert_raises(ValueError, cheb.chebint, [0], lbnd=[0])
+ assert_raises(ValueError, cheb.chebint, [0], scl=[0])
+ assert_raises(ValueError, cheb.chebint, [0], axis=.5)
+
+ # test integration of zero polynomial
+ for i in range(2, 5):
+ k = [0]*(i - 2) + [1]
+ res = cheb.chebint([0], m=i, k=k)
+ assert_almost_equal(res, [0, 1])
+
+ # check single integration with integration constant
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [1/scl]
+ chebpol = cheb.poly2cheb(pol)
+ chebint = cheb.chebint(chebpol, m=1, k=[i])
+ res = cheb.cheb2poly(chebint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check single integration with integration constant and lbnd
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ chebpol = cheb.poly2cheb(pol)
+ chebint = cheb.chebint(chebpol, m=1, k=[i], lbnd=-1)
+ assert_almost_equal(cheb.chebval(-1, chebint), i)
+
+ # check single integration with integration constant and scaling
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [2/scl]
+ chebpol = cheb.poly2cheb(pol)
+ chebint = cheb.chebint(chebpol, m=1, k=[i], scl=2)
+ res = cheb.cheb2poly(chebint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with default k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = cheb.chebint(tgt, m=1)
+ res = cheb.chebint(pol, m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with defined k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = cheb.chebint(tgt, m=1, k=[k])
+ res = cheb.chebint(pol, m=j, k=list(range(j)))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with lbnd
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = cheb.chebint(tgt, m=1, k=[k], lbnd=-1)
+ res = cheb.chebint(pol, m=j, k=list(range(j)), lbnd=-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = cheb.chebint(tgt, m=1, k=[k], scl=2)
+ res = cheb.chebint(pol, m=j, k=list(range(j)), scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_chebint_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([cheb.chebint(c) for c in c2d.T]).T
+ res = cheb.chebint(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([cheb.chebint(c) for c in c2d])
+ res = cheb.chebint(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([cheb.chebint(c, k=3) for c in c2d])
+ res = cheb.chebint(c2d, k=3, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestDerivative(object):
+
+ def test_chebder(self):
+ # check exceptions
+ assert_raises(ValueError, cheb.chebder, [0], .5)
+ assert_raises(ValueError, cheb.chebder, [0], -1)
+
+ # check that zeroth derivative does nothing
+ for i in range(5):
+ tgt = [0]*i + [1]
+ res = cheb.chebder(tgt, m=0)
+ assert_equal(trim(res), trim(tgt))
+
+ # check that derivation is the inverse of integration
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = cheb.chebder(cheb.chebint(tgt, m=j), m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check derivation with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = cheb.chebder(cheb.chebint(tgt, m=j, scl=2), m=j, scl=.5)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_chebder_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([cheb.chebder(c) for c in c2d.T]).T
+ res = cheb.chebder(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([cheb.chebder(c) for c in c2d])
+ res = cheb.chebder(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestVander(object):
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+
+ def test_chebvander(self):
+ # check for 1d x
+ x = np.arange(3)
+ v = cheb.chebvander(x, 3)
+ assert_(v.shape == (3, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], cheb.chebval(x, coef))
+
+ # check for 2d x
+ x = np.array([[1, 2], [3, 4], [5, 6]])
+ v = cheb.chebvander(x, 3)
+ assert_(v.shape == (3, 2, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], cheb.chebval(x, coef))
+
+ def test_chebvander2d(self):
+ # also tests chebval2d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3))
+ van = cheb.chebvander2d(x1, x2, [1, 2])
+ tgt = cheb.chebval2d(x1, x2, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = cheb.chebvander2d([x1], [x2], [1, 2])
+ assert_(van.shape == (1, 5, 6))
+
+ def test_chebvander3d(self):
+ # also tests chebval3d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3, 4))
+ van = cheb.chebvander3d(x1, x2, x3, [1, 2, 3])
+ tgt = cheb.chebval3d(x1, x2, x3, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = cheb.chebvander3d([x1], [x2], [x3], [1, 2, 3])
+ assert_(van.shape == (1, 5, 24))
+
+
+class TestFitting(object):
+
+ def test_chebfit(self):
+ def f(x):
+ return x*(x - 1)*(x - 2)
+
+ def f2(x):
+ return x**4 + x**2 + 1
+
+ # Test exceptions
+ assert_raises(ValueError, cheb.chebfit, [1], [1], -1)
+ assert_raises(TypeError, cheb.chebfit, [[1]], [1], 0)
+ assert_raises(TypeError, cheb.chebfit, [], [1], 0)
+ assert_raises(TypeError, cheb.chebfit, [1], [[[1]]], 0)
+ assert_raises(TypeError, cheb.chebfit, [1, 2], [1], 0)
+ assert_raises(TypeError, cheb.chebfit, [1], [1, 2], 0)
+ assert_raises(TypeError, cheb.chebfit, [1], [1], 0, w=[[1]])
+ assert_raises(TypeError, cheb.chebfit, [1], [1], 0, w=[1, 1])
+ assert_raises(ValueError, cheb.chebfit, [1], [1], [-1,])
+ assert_raises(ValueError, cheb.chebfit, [1], [1], [2, -1, 6])
+ assert_raises(TypeError, cheb.chebfit, [1], [1], [])
+
+ # Test fit
+ x = np.linspace(0, 2)
+ y = f(x)
+ #
+ coef3 = cheb.chebfit(x, y, 3)
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(cheb.chebval(x, coef3), y)
+ coef3 = cheb.chebfit(x, y, [0, 1, 2, 3])
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(cheb.chebval(x, coef3), y)
+ #
+ coef4 = cheb.chebfit(x, y, 4)
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(cheb.chebval(x, coef4), y)
+ coef4 = cheb.chebfit(x, y, [0, 1, 2, 3, 4])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(cheb.chebval(x, coef4), y)
+ # check things still work if deg is not in strictly increasing order
+ coef4 = cheb.chebfit(x, y, [2, 3, 4, 1, 0])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(cheb.chebval(x, coef4), y)
+ #
+ coef2d = cheb.chebfit(x, np.array([y, y]).T, 3)
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ coef2d = cheb.chebfit(x, np.array([y, y]).T, [0, 1, 2, 3])
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ # test weighting
+ w = np.zeros_like(x)
+ yw = y.copy()
+ w[1::2] = 1
+ y[0::2] = 0
+ wcoef3 = cheb.chebfit(x, yw, 3, w=w)
+ assert_almost_equal(wcoef3, coef3)
+ wcoef3 = cheb.chebfit(x, yw, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef3, coef3)
+ #
+ wcoef2d = cheb.chebfit(x, np.array([yw, yw]).T, 3, w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ wcoef2d = cheb.chebfit(x, np.array([yw, yw]).T, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ # test scaling with complex-valued x points whose squares
+ # sum to zero.
+ x = [1, 1j, -1, -1j]
+ assert_almost_equal(cheb.chebfit(x, x, 1), [0, 1])
+ assert_almost_equal(cheb.chebfit(x, x, [0, 1]), [0, 1])
+ # test fitting only even polynomials
+ x = np.linspace(-1, 1)
+ y = f2(x)
+ coef1 = cheb.chebfit(x, y, 4)
+ assert_almost_equal(cheb.chebval(x, coef1), y)
+ coef2 = cheb.chebfit(x, y, [0, 2, 4])
+ assert_almost_equal(cheb.chebval(x, coef2), y)
+ assert_almost_equal(coef1, coef2)
+
+
+class TestInterpolate(object):
+
+ def f(self, x):
+ return x * (x - 1) * (x - 2)
+
+ def test_raises(self):
+ assert_raises(ValueError, cheb.chebinterpolate, self.f, -1)
+ assert_raises(TypeError, cheb.chebinterpolate, self.f, 10.)
+
+ def test_dimensions(self):
+ for deg in range(1, 5):
+ assert_(cheb.chebinterpolate(self.f, deg).shape == (deg + 1,))
+
+ def test_approximation(self):
+
+ def powx(x, p):
+ return x**p
+
+ x = np.linspace(-1, 1, 10)
+ for deg in range(0, 10):
+ for p in range(0, deg + 1):
+ c = cheb.chebinterpolate(powx, deg, (p,))
+ assert_almost_equal(cheb.chebval(x, c), powx(x, p), decimal=12)
+
+
+class TestCompanion(object):
+
+ def test_raises(self):
+ assert_raises(ValueError, cheb.chebcompanion, [])
+ assert_raises(ValueError, cheb.chebcompanion, [1])
+
+ def test_dimensions(self):
+ for i in range(1, 5):
+ coef = [0]*i + [1]
+ assert_(cheb.chebcompanion(coef).shape == (i, i))
+
+ def test_linear_root(self):
+ assert_(cheb.chebcompanion([1, 2])[0, 0] == -.5)
+
+
+class TestGauss(object):
+
+ def test_100(self):
+ x, w = cheb.chebgauss(100)
+
+ # test orthogonality. Note that the results need to be normalized,
+ # otherwise the huge values that can arise from fast growing
+ # functions like Laguerre can be very confusing.
+ v = cheb.chebvander(x, 99)
+ vv = np.dot(v.T * w, v)
+ vd = 1/np.sqrt(vv.diagonal())
+ vv = vd[:, None] * vv * vd
+ assert_almost_equal(vv, np.eye(100))
+
+ # check that the quadrature of f(x) = 1 is correct: the weights must sum to
+ # pi, the integral of the Chebyshev weight 1/sqrt(1 - x**2) over [-1, 1]
+ tgt = np.pi
+ assert_almost_equal(w.sum(), tgt)
+
+
+class TestMisc(object):
+
+ def test_chebfromroots(self):
+ res = cheb.chebfromroots([])
+ assert_almost_equal(trim(res), [1])
+ for i in range(1, 5):
+ roots = np.cos(np.linspace(-np.pi, 0, 2*i + 1)[1::2])
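+ # the roots above are the Chebyshev nodes of T_i; T_i equals 2**(i-1)
+ # times the monic polynomial with those roots, hence the rescaling below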
+ tgt = [0]*i + [1]
+ res = cheb.chebfromroots(roots)*2**(i-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_chebroots(self):
+ assert_almost_equal(cheb.chebroots([1]), [])
+ assert_almost_equal(cheb.chebroots([1, 2]), [-.5])
+ for i in range(2, 5):
+ tgt = np.linspace(-1, 1, i)
+ res = cheb.chebroots(cheb.chebfromroots(tgt))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_chebtrim(self):
+ coef = [2, -1, 1, 0]
+
+ # Test exceptions
+ assert_raises(ValueError, cheb.chebtrim, coef, -1)
+
+ # Test results
+ assert_equal(cheb.chebtrim(coef), coef[:-1])
+ assert_equal(cheb.chebtrim(coef, 1), coef[:-3])
+ assert_equal(cheb.chebtrim(coef, 2), [0])
+
+ def test_chebline(self):
+ assert_equal(cheb.chebline(3, 4), [3, 4])
+
+ def test_cheb2poly(self):
+ for i in range(10):
+ assert_almost_equal(cheb.cheb2poly([0]*i + [1]), Tlist[i])
+
+ def test_poly2cheb(self):
+ for i in range(10):
+ assert_almost_equal(cheb.poly2cheb(Tlist[i]), [0]*i + [1])
+
+ def test_weight(self):
+ x = np.linspace(-1, 1, 11)[1:-1]
+ tgt = 1./(np.sqrt(1 + x) * np.sqrt(1 - x))
+ res = cheb.chebweight(x)
+ assert_almost_equal(res, tgt)
+
+ def test_chebpts1(self):
+ #test exceptions
+ assert_raises(ValueError, cheb.chebpts1, 1.5)
+ assert_raises(ValueError, cheb.chebpts1, 0)
+
+ #test points
+ tgt = [0]
+ assert_almost_equal(cheb.chebpts1(1), tgt)
+ tgt = [-0.70710678118654746, 0.70710678118654746]
+ assert_almost_equal(cheb.chebpts1(2), tgt)
+ tgt = [-0.86602540378443871, 0, 0.86602540378443871]
+ assert_almost_equal(cheb.chebpts1(3), tgt)
+ tgt = [-0.9238795325, -0.3826834323, 0.3826834323, 0.9238795325]
+ assert_almost_equal(cheb.chebpts1(4), tgt)
+
+ def test_chebpts2(self):
+ #test exceptions
+ assert_raises(ValueError, cheb.chebpts2, 1.5)
+ assert_raises(ValueError, cheb.chebpts2, 1)
+
+ #test points
+ tgt = [-1, 1]
+ assert_almost_equal(cheb.chebpts2(2), tgt)
+ tgt = [-1, 0, 1]
+ assert_almost_equal(cheb.chebpts2(3), tgt)
+ tgt = [-1, -0.5, .5, 1]
+ assert_almost_equal(cheb.chebpts2(4), tgt)
+ tgt = [-1.0, -0.707106781187, 0, 0.707106781187, 1.0]
+ assert_almost_equal(cheb.chebpts2(5), tgt)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_classes.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_classes.py
new file mode 100644
index 00000000000..15e24f92b04
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_classes.py
@@ -0,0 +1,642 @@
+"""Test inter-conversion of different polynomial classes.
+
+This tests the convert and cast methods of all the polynomial classes.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import operator as op
+from numbers import Number
+
+import pytest
+import numpy as np
+from numpy.polynomial import (
+ Polynomial, Legendre, Chebyshev, Laguerre, Hermite, HermiteE)
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+from numpy.compat import long
+
+
+#
+# fixtures
+#
+
+classes = (
+ Polynomial, Legendre, Chebyshev, Laguerre,
+ Hermite, HermiteE
+ )
+classids = tuple(cls.__name__ for cls in classes)
+
+@pytest.fixture(params=classes, ids=classids)
+def Poly(request):
+ return request.param
+
+#
+# helper functions
+#
+random = np.random.random
+
+
+def assert_poly_almost_equal(p1, p2, msg=""):
+ try:
+ assert_(np.all(p1.domain == p2.domain))
+ assert_(np.all(p1.window == p2.window))
+ assert_almost_equal(p1.coef, p2.coef)
+ except AssertionError:
+ msg = "Result: %s\nTarget: %s", (p1, p2)
+ raise AssertionError(msg)
+
+
+#
+# Test conversion methods that depend on combinations of two classes.
+#
+
+Poly1 = Poly
+Poly2 = Poly
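+# Binding the parametrized fixture under two extra names lets pytest resolve
+# Poly1 and Poly2 independently, so the two-argument tests below are run for
+# every ordered pair of polynomial classes.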
+
+
+def test_conversion(Poly1, Poly2):
+ x = np.linspace(0, 1, 10)
+ coef = random((3,))
+
+ d1 = Poly1.domain + random((2,))*.25
+ w1 = Poly1.window + random((2,))*.25
+ p1 = Poly1(coef, domain=d1, window=w1)
+
+ d2 = Poly2.domain + random((2,))*.25
+ w2 = Poly2.window + random((2,))*.25
+ p2 = p1.convert(kind=Poly2, domain=d2, window=w2)
+
+ assert_almost_equal(p2.domain, d2)
+ assert_almost_equal(p2.window, w2)
+ assert_almost_equal(p2(x), p1(x))
+
+
+def test_cast(Poly1, Poly2):
+ x = np.linspace(0, 1, 10)
+ coef = random((3,))
+
+ d1 = Poly1.domain + random((2,))*.25
+ w1 = Poly1.window + random((2,))*.25
+ p1 = Poly1(coef, domain=d1, window=w1)
+
+ d2 = Poly2.domain + random((2,))*.25
+ w2 = Poly2.window + random((2,))*.25
+ p2 = Poly2.cast(p1, domain=d2, window=w2)
+
+ assert_almost_equal(p2.domain, d2)
+ assert_almost_equal(p2.window, w2)
+ assert_almost_equal(p2(x), p1(x))
+
+
+#
+# test methods that depend on one class
+#
+
+
+def test_identity(Poly):
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ x = np.linspace(d[0], d[1], 11)
+ p = Poly.identity(domain=d, window=w)
+ assert_equal(p.domain, d)
+ assert_equal(p.window, w)
+ assert_almost_equal(p(x), x)
+
+
+def test_basis(Poly):
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ p = Poly.basis(5, domain=d, window=w)
+ assert_equal(p.domain, d)
+ assert_equal(p.window, w)
+ assert_equal(p.coef, [0]*5 + [1])
+
+
+def test_fromroots(Poly):
+ # check that requested roots are zeros of a polynomial
+ # of correct degree, domain, and window.
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ r = random((5,))
+ p1 = Poly.fromroots(r, domain=d, window=w)
+ assert_equal(p1.degree(), len(r))
+ assert_equal(p1.domain, d)
+ assert_equal(p1.window, w)
+ assert_almost_equal(p1(r), 0)
+
+ # check that polynomial is monic
+ pdom = Polynomial.domain
+ pwin = Polynomial.window
+ p2 = Polynomial.cast(p1, domain=pdom, window=pwin)
+ assert_almost_equal(p2.coef[-1], 1)
+
+
+def test_fit(Poly):
+
+ def f(x):
+ return x*(x - 1)*(x - 2)
+ x = np.linspace(0, 3)
+ y = f(x)
+
+ # check default value of domain and window
+ p = Poly.fit(x, y, 3)
+ assert_almost_equal(p.domain, [0, 3])
+ assert_almost_equal(p(x), y)
+ assert_equal(p.degree(), 3)
+
+ # check with given domains and window
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ p = Poly.fit(x, y, 3, domain=d, window=w)
+ assert_almost_equal(p(x), y)
+ assert_almost_equal(p.domain, d)
+ assert_almost_equal(p.window, w)
+ p = Poly.fit(x, y, [0, 1, 2, 3], domain=d, window=w)
+ assert_almost_equal(p(x), y)
+ assert_almost_equal(p.domain, d)
+ assert_almost_equal(p.window, w)
+
+ # check with class domain default
+ p = Poly.fit(x, y, 3, [])
+ assert_equal(p.domain, Poly.domain)
+ assert_equal(p.window, Poly.window)
+ p = Poly.fit(x, y, [0, 1, 2, 3], [])
+ assert_equal(p.domain, Poly.domain)
+ assert_equal(p.window, Poly.window)
+
+ # check that fit accepts weights.
+ w = np.zeros_like(x)
+ z = y + random(y.shape)*.25
+ w[::2] = 1
+ p1 = Poly.fit(x[::2], z[::2], 3)
+ p2 = Poly.fit(x, z, 3, w=w)
+ p3 = Poly.fit(x, z, [0, 1, 2, 3], w=w)
+ assert_almost_equal(p1(x), p2(x))
+ assert_almost_equal(p2(x), p3(x))
+
+
+def test_equal(Poly):
+ p1 = Poly([1, 2, 3], domain=[0, 1], window=[2, 3])
+ p2 = Poly([1, 1, 1], domain=[0, 1], window=[2, 3])
+ p3 = Poly([1, 2, 3], domain=[1, 2], window=[2, 3])
+ p4 = Poly([1, 2, 3], domain=[0, 1], window=[1, 2])
+ assert_(p1 == p1)
+ assert_(not p1 == p2)
+ assert_(not p1 == p3)
+ assert_(not p1 == p4)
+
+
+def test_not_equal(Poly):
+ p1 = Poly([1, 2, 3], domain=[0, 1], window=[2, 3])
+ p2 = Poly([1, 1, 1], domain=[0, 1], window=[2, 3])
+ p3 = Poly([1, 2, 3], domain=[1, 2], window=[2, 3])
+ p4 = Poly([1, 2, 3], domain=[0, 1], window=[1, 2])
+ assert_(not p1 != p1)
+ assert_(p1 != p2)
+ assert_(p1 != p3)
+ assert_(p1 != p4)
+
+
+def test_add(Poly):
+ # This checks commutation, not numerical correctness
+ c1 = list(random((4,)) + .5)
+ c2 = list(random((3,)) + .5)
+ p1 = Poly(c1)
+ p2 = Poly(c2)
+ p3 = p1 + p2
+ assert_poly_almost_equal(p2 + p1, p3)
+ assert_poly_almost_equal(p1 + c2, p3)
+ assert_poly_almost_equal(c2 + p1, p3)
+ assert_poly_almost_equal(p1 + tuple(c2), p3)
+ assert_poly_almost_equal(tuple(c2) + p1, p3)
+ assert_poly_almost_equal(p1 + np.array(c2), p3)
+ assert_poly_almost_equal(np.array(c2) + p1, p3)
+ assert_raises(TypeError, op.add, p1, Poly([0], domain=Poly.domain + 1))
+ assert_raises(TypeError, op.add, p1, Poly([0], window=Poly.window + 1))
+ if Poly is Polynomial:
+ assert_raises(TypeError, op.add, p1, Chebyshev([0]))
+ else:
+ assert_raises(TypeError, op.add, p1, Polynomial([0]))
+
+
+def test_sub(Poly):
+ # This checks commutation, not numerical correctness
+ c1 = list(random((4,)) + .5)
+ c2 = list(random((3,)) + .5)
+ p1 = Poly(c1)
+ p2 = Poly(c2)
+ p3 = p1 - p2
+ assert_poly_almost_equal(p2 - p1, -p3)
+ assert_poly_almost_equal(p1 - c2, p3)
+ assert_poly_almost_equal(c2 - p1, -p3)
+ assert_poly_almost_equal(p1 - tuple(c2), p3)
+ assert_poly_almost_equal(tuple(c2) - p1, -p3)
+ assert_poly_almost_equal(p1 - np.array(c2), p3)
+ assert_poly_almost_equal(np.array(c2) - p1, -p3)
+ assert_raises(TypeError, op.sub, p1, Poly([0], domain=Poly.domain + 1))
+ assert_raises(TypeError, op.sub, p1, Poly([0], window=Poly.window + 1))
+ if Poly is Polynomial:
+ assert_raises(TypeError, op.sub, p1, Chebyshev([0]))
+ else:
+ assert_raises(TypeError, op.sub, p1, Polynomial([0]))
+
+
+def test_mul(Poly):
+ c1 = list(random((4,)) + .5)
+ c2 = list(random((3,)) + .5)
+ p1 = Poly(c1)
+ p2 = Poly(c2)
+ p3 = p1 * p2
+ assert_poly_almost_equal(p2 * p1, p3)
+ assert_poly_almost_equal(p1 * c2, p3)
+ assert_poly_almost_equal(c2 * p1, p3)
+ assert_poly_almost_equal(p1 * tuple(c2), p3)
+ assert_poly_almost_equal(tuple(c2) * p1, p3)
+ assert_poly_almost_equal(p1 * np.array(c2), p3)
+ assert_poly_almost_equal(np.array(c2) * p1, p3)
+ assert_poly_almost_equal(p1 * 2, p1 * Poly([2]))
+ assert_poly_almost_equal(2 * p1, p1 * Poly([2]))
+ assert_raises(TypeError, op.mul, p1, Poly([0], domain=Poly.domain + 1))
+ assert_raises(TypeError, op.mul, p1, Poly([0], window=Poly.window + 1))
+ if Poly is Polynomial:
+ assert_raises(TypeError, op.mul, p1, Chebyshev([0]))
+ else:
+ assert_raises(TypeError, op.mul, p1, Polynomial([0]))
+
+
+def test_floordiv(Poly):
+ c1 = list(random((4,)) + .5)
+ c2 = list(random((3,)) + .5)
+ c3 = list(random((2,)) + .5)
+ p1 = Poly(c1)
+ p2 = Poly(c2)
+ p3 = Poly(c3)
+ p4 = p1 * p2 + p3
+ c4 = list(p4.coef)
+ assert_poly_almost_equal(p4 // p2, p1)
+ assert_poly_almost_equal(p4 // c2, p1)
+ assert_poly_almost_equal(c4 // p2, p1)
+ assert_poly_almost_equal(p4 // tuple(c2), p1)
+ assert_poly_almost_equal(tuple(c4) // p2, p1)
+ assert_poly_almost_equal(p4 // np.array(c2), p1)
+ assert_poly_almost_equal(np.array(c4) // p2, p1)
+ assert_poly_almost_equal(2 // p2, Poly([0]))
+ assert_poly_almost_equal(p2 // 2, 0.5*p2)
+ assert_raises(
+ TypeError, op.floordiv, p1, Poly([0], domain=Poly.domain + 1))
+ assert_raises(
+ TypeError, op.floordiv, p1, Poly([0], window=Poly.window + 1))
+ if Poly is Polynomial:
+ assert_raises(TypeError, op.floordiv, p1, Chebyshev([0]))
+ else:
+ assert_raises(TypeError, op.floordiv, p1, Polynomial([0]))
+
+
+def test_truediv(Poly):
+ # true division is valid only if the denominator is a Number and
+ # not a Python bool.
+ p1 = Poly([1,2,3])
+ p2 = p1 * 5
+
+ for stype in np.ScalarType:
+ if not issubclass(stype, Number) or issubclass(stype, bool):
+ continue
+ s = stype(5)
+ assert_poly_almost_equal(op.truediv(p2, s), p1)
+ assert_raises(TypeError, op.truediv, s, p2)
+ for stype in (int, long, float):
+ s = stype(5)
+ assert_poly_almost_equal(op.truediv(p2, s), p1)
+ assert_raises(TypeError, op.truediv, s, p2)
+ for stype in [complex]:
+ s = stype(5, 0)
+ assert_poly_almost_equal(op.truediv(p2, s), p1)
+ assert_raises(TypeError, op.truediv, s, p2)
+ for s in [tuple(), list(), dict(), bool(), np.array([1])]:
+ assert_raises(TypeError, op.truediv, p2, s)
+ assert_raises(TypeError, op.truediv, s, p2)
+ for ptype in classes:
+ assert_raises(TypeError, op.truediv, p2, ptype(1))
+
+
+def test_mod(Poly):
+ # This checks commutation, not numerical correctness
+ c1 = list(random((4,)) + .5)
+ c2 = list(random((3,)) + .5)
+ c3 = list(random((2,)) + .5)
+ p1 = Poly(c1)
+ p2 = Poly(c2)
+ p3 = Poly(c3)
+ p4 = p1 * p2 + p3
+ c4 = list(p4.coef)
+ assert_poly_almost_equal(p4 % p2, p3)
+ assert_poly_almost_equal(p4 % c2, p3)
+ assert_poly_almost_equal(c4 % p2, p3)
+ assert_poly_almost_equal(p4 % tuple(c2), p3)
+ assert_poly_almost_equal(tuple(c4) % p2, p3)
+ assert_poly_almost_equal(p4 % np.array(c2), p3)
+ assert_poly_almost_equal(np.array(c4) % p2, p3)
+ assert_poly_almost_equal(2 % p2, Poly([2]))
+ assert_poly_almost_equal(p2 % 2, Poly([0]))
+ assert_raises(TypeError, op.mod, p1, Poly([0], domain=Poly.domain + 1))
+ assert_raises(TypeError, op.mod, p1, Poly([0], window=Poly.window + 1))
+ if Poly is Polynomial:
+ assert_raises(TypeError, op.mod, p1, Chebyshev([0]))
+ else:
+ assert_raises(TypeError, op.mod, p1, Polynomial([0]))
+
+
+def test_divmod(Poly):
+ # This checks commutation, not numerical correctness
+ c1 = list(random((4,)) + .5)
+ c2 = list(random((3,)) + .5)
+ c3 = list(random((2,)) + .5)
+ p1 = Poly(c1)
+ p2 = Poly(c2)
+ p3 = Poly(c3)
+ p4 = p1 * p2 + p3
+ c4 = list(p4.coef)
+ quo, rem = divmod(p4, p2)
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(p4, c2)
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(c4, p2)
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(p4, tuple(c2))
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(tuple(c4), p2)
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(p4, np.array(c2))
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(np.array(c4), p2)
+ assert_poly_almost_equal(quo, p1)
+ assert_poly_almost_equal(rem, p3)
+ quo, rem = divmod(p2, 2)
+ assert_poly_almost_equal(quo, 0.5*p2)
+ assert_poly_almost_equal(rem, Poly([0]))
+ quo, rem = divmod(2, p2)
+ assert_poly_almost_equal(quo, Poly([0]))
+ assert_poly_almost_equal(rem, Poly([2]))
+ assert_raises(TypeError, divmod, p1, Poly([0], domain=Poly.domain + 1))
+ assert_raises(TypeError, divmod, p1, Poly([0], window=Poly.window + 1))
+ if Poly is Polynomial:
+ assert_raises(TypeError, divmod, p1, Chebyshev([0]))
+ else:
+ assert_raises(TypeError, divmod, p1, Polynomial([0]))
+
+
+def test_roots(Poly):
+ d = Poly.domain * 1.25 + .25
+ w = Poly.window
+ tgt = np.linspace(d[0], d[1], 5)
+ res = np.sort(Poly.fromroots(tgt, domain=d, window=w).roots())
+ assert_almost_equal(res, tgt)
+ # default domain and window
+ res = np.sort(Poly.fromroots(tgt).roots())
+ assert_almost_equal(res, tgt)
+
+
+def test_degree(Poly):
+ p = Poly.basis(5)
+ assert_equal(p.degree(), 5)
+
+
+def test_copy(Poly):
+ p1 = Poly.basis(5)
+ p2 = p1.copy()
+ assert_(p1 == p2)
+ assert_(p1 is not p2)
+ assert_(p1.coef is not p2.coef)
+ assert_(p1.domain is not p2.domain)
+ assert_(p1.window is not p2.window)
+
+
+def test_integ(Poly):
+ P = Polynomial
+ # Check defaults
+ p0 = Poly.cast(P([1*2, 2*3, 3*4]))
+ p1 = P.cast(p0.integ())
+ p2 = P.cast(p0.integ(2))
+ assert_poly_almost_equal(p1, P([0, 2, 3, 4]))
+ assert_poly_almost_equal(p2, P([0, 0, 1, 1, 1]))
+ # Check with k
+ p0 = Poly.cast(P([1*2, 2*3, 3*4]))
+ p1 = P.cast(p0.integ(k=1))
+ p2 = P.cast(p0.integ(2, k=[1, 1]))
+ assert_poly_almost_equal(p1, P([1, 2, 3, 4]))
+ assert_poly_almost_equal(p2, P([1, 1, 1, 1, 1]))
+ # Check with lbnd
+ p0 = Poly.cast(P([1*2, 2*3, 3*4]))
+ p1 = P.cast(p0.integ(lbnd=1))
+ p2 = P.cast(p0.integ(2, lbnd=1))
+ assert_poly_almost_equal(p1, P([-9, 2, 3, 4]))
+ assert_poly_almost_equal(p2, P([6, -9, 1, 1, 1]))
+ # Check scaling
+ d = 2*Poly.domain
+ p0 = Poly.cast(P([1*2, 2*3, 3*4]), domain=d)
+ p1 = P.cast(p0.integ())
+ p2 = P.cast(p0.integ(2))
+ assert_poly_almost_equal(p1, P([0, 2, 3, 4]))
+ assert_poly_almost_equal(p2, P([0, 0, 1, 1, 1]))
+
+
+def test_deriv(Poly):
+ # Check that the derivative is the inverse of integration. It is
+ # assumed that the integration has been checked elsewhere.
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ p1 = Poly([1, 2, 3], domain=d, window=w)
+ p2 = p1.integ(2, k=[1, 2])
+ p3 = p1.integ(1, k=[1])
+ assert_almost_equal(p2.deriv(1).coef, p3.coef)
+ assert_almost_equal(p2.deriv(2).coef, p1.coef)
+ # default domain and window
+ p1 = Poly([1, 2, 3])
+ p2 = p1.integ(2, k=[1, 2])
+ p3 = p1.integ(1, k=[1])
+ assert_almost_equal(p2.deriv(1).coef, p3.coef)
+ assert_almost_equal(p2.deriv(2).coef, p1.coef)
+
+
+def test_linspace(Poly):
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ p = Poly([1, 2, 3], domain=d, window=w)
+ # check default domain
+ xtgt = np.linspace(d[0], d[1], 20)
+ ytgt = p(xtgt)
+ xres, yres = p.linspace(20)
+ assert_almost_equal(xres, xtgt)
+ assert_almost_equal(yres, ytgt)
+ # check specified domain
+ xtgt = np.linspace(0, 2, 20)
+ ytgt = p(xtgt)
+ xres, yres = p.linspace(20, domain=[0, 2])
+ assert_almost_equal(xres, xtgt)
+ assert_almost_equal(yres, ytgt)
+
+
+def test_pow(Poly):
+ d = Poly.domain + random((2,))*.25
+ w = Poly.window + random((2,))*.25
+ tgt = Poly([1], domain=d, window=w)
+ tst = Poly([1, 2, 3], domain=d, window=w)
+ for i in range(5):
+ assert_poly_almost_equal(tst**i, tgt)
+ tgt = tgt * tst
+ # default domain and window
+ tgt = Poly([1])
+ tst = Poly([1, 2, 3])
+ for i in range(5):
+ assert_poly_almost_equal(tst**i, tgt)
+ tgt = tgt * tst
+ # check error for invalid powers
+ assert_raises(ValueError, op.pow, tgt, 1.5)
+ assert_raises(ValueError, op.pow, tgt, -1)
+
+
+def test_call(Poly):
+ P = Polynomial
+ d = Poly.domain
+ x = np.linspace(d[0], d[1], 11)
+
+ # Check defaults
+ p = Poly.cast(P([1, 2, 3]))
+ tgt = 1 + x*(2 + 3*x)
+ res = p(x)
+ assert_almost_equal(res, tgt)
+
+
+def test_cutdeg(Poly):
+ p = Poly([1, 2, 3])
+ assert_raises(ValueError, p.cutdeg, .5)
+ assert_raises(ValueError, p.cutdeg, -1)
+ assert_equal(len(p.cutdeg(3)), 3)
+ assert_equal(len(p.cutdeg(2)), 3)
+ assert_equal(len(p.cutdeg(1)), 2)
+ assert_equal(len(p.cutdeg(0)), 1)
+
+
+def test_truncate(Poly):
+ p = Poly([1, 2, 3])
+ assert_raises(ValueError, p.truncate, .5)
+ assert_raises(ValueError, p.truncate, 0)
+ assert_equal(len(p.truncate(4)), 3)
+ assert_equal(len(p.truncate(3)), 3)
+ assert_equal(len(p.truncate(2)), 2)
+ assert_equal(len(p.truncate(1)), 1)
+
+
+def test_trim(Poly):
+ c = [1, 1e-6, 1e-12, 0]
+ p = Poly(c)
+ assert_equal(p.trim().coef, c[:3])
+ assert_equal(p.trim(1e-10).coef, c[:2])
+ assert_equal(p.trim(1e-5).coef, c[:1])
+
+
+def test_mapparms(Poly):
+ # check with defaults. Should be identity.
+ d = Poly.domain
+ w = Poly.window
+ p = Poly([1], domain=d, window=w)
+ assert_almost_equal([0, 1], p.mapparms())
+ #
+ w = 2*d + 1
+ p = Poly([1], domain=d, window=w)
+ assert_almost_equal([1, 2], p.mapparms())
+
+
+def test_ufunc_override(Poly):
+ p = Poly([1, 2, 3])
+ x = np.ones(3)
+ assert_raises(TypeError, np.add, p, x)
+ assert_raises(TypeError, np.add, x, p)
+
+
+class TestLatexRepr(object):
+ """Test the latex repr used by ipython """
+
+ def as_latex(self, obj):
+ # right now we ignore the formatting of scalars in our tests, since
+ # it makes them too verbose. Ideally, the formatting of scalars will
+ # be fixed such that tests below continue to pass
+ obj._repr_latex_scalar = lambda x: str(x)
+ try:
+ return obj._repr_latex_()
+ finally:
+ del obj._repr_latex_scalar
+
+ def test_simple_polynomial(self):
+ # default input
+ p = Polynomial([1, 2, 3])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0 + 2.0\,x + 3.0\,x^{2}$')
+
+ # translated input
+ p = Polynomial([1, 2, 3], domain=[-2, 0])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0 + 2.0\,\left(1.0 + x\right) + 3.0\,\left(1.0 + x\right)^{2}$')
+
+ # scaled input
+ p = Polynomial([1, 2, 3], domain=[-0.5, 0.5])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0 + 2.0\,\left(2.0x\right) + 3.0\,\left(2.0x\right)^{2}$')
+
+ # affine input
+ p = Polynomial([1, 2, 3], domain=[-1, 0])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0 + 2.0\,\left(1.0 + 2.0x\right) + 3.0\,\left(1.0 + 2.0x\right)^{2}$')
+
+ def test_basis_func(self):
+ p = Chebyshev([1, 2, 3])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0\,{T}_{0}(x) + 2.0\,{T}_{1}(x) + 3.0\,{T}_{2}(x)$')
+ # affine input - check no surplus parens are added
+ p = Chebyshev([1, 2, 3], domain=[-1, 0])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0\,{T}_{0}(1.0 + 2.0x) + 2.0\,{T}_{1}(1.0 + 2.0x) + 3.0\,{T}_{2}(1.0 + 2.0x)$')
+
+ def test_multichar_basis_func(self):
+ p = HermiteE([1, 2, 3])
+ assert_equal(self.as_latex(p),
+ r'$x \mapsto 1.0\,{He}_{0}(x) + 2.0\,{He}_{1}(x) + 3.0\,{He}_{2}(x)$')
+
+
+#
+# Test class method that only exists for some classes
+#
+
+
+class TestInterpolate(object):
+
+ def f(self, x):
+ return x * (x - 1) * (x - 2)
+
+ def test_raises(self):
+ assert_raises(ValueError, Chebyshev.interpolate, self.f, -1)
+ assert_raises(TypeError, Chebyshev.interpolate, self.f, 10.)
+
+ def test_dimensions(self):
+ for deg in range(1, 5):
+ assert_(Chebyshev.interpolate(self.f, deg).degree() == deg)
+
+ def test_approximation(self):
+
+ def powx(x, p):
+ return x**p
+
+ x = np.linspace(0, 2, 10)
+ for deg in range(0, 10):
+ for t in range(0, deg + 1):
+ p = Chebyshev.interpolate(powx, deg, domain=[0, 2], args=(t,))
+ assert_almost_equal(p(x), powx(x, t), decimal=12)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite.py
new file mode 100644
index 00000000000..1287ef3fe17
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite.py
@@ -0,0 +1,557 @@
+"""Tests for hermite module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.polynomial.hermite as herm
+from numpy.polynomial.polynomial import polyval
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+H0 = np.array([1])
+H1 = np.array([0, 2])
+H2 = np.array([-2, 0, 4])
+H3 = np.array([0, -12, 0, 8])
+H4 = np.array([12, 0, -48, 0, 16])
+H5 = np.array([0, 120, 0, -160, 0, 32])
+H6 = np.array([-120, 0, 720, 0, -480, 0, 64])
+H7 = np.array([0, -1680, 0, 3360, 0, -1344, 0, 128])
+H8 = np.array([1680, 0, -13440, 0, 13440, 0, -3584, 0, 256])
+H9 = np.array([0, 30240, 0, -80640, 0, 48384, 0, -9216, 0, 512])
+
+Hlist = [H0, H1, H2, H3, H4, H5, H6, H7, H8, H9]
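+# The lists above hold the power-basis coefficients of the (physicists')
+# Hermite polynomials, generated by H_0 = 1, H_1 = 2*x and the recurrence
+# H_{n+1}(x) = 2*x*H_n(x) - 2*n*H_{n-1}(x); e.g. H_2 = 2*x*H_1 - 2*H_0
+# gives [-2, 0, 4].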
+
+
+def trim(x):
+ return herm.hermtrim(x, tol=1e-6)
+
+
+class TestConstants(object):
+
+ def test_hermdomain(self):
+ assert_equal(herm.hermdomain, [-1, 1])
+
+ def test_hermzero(self):
+ assert_equal(herm.hermzero, [0])
+
+ def test_hermone(self):
+ assert_equal(herm.hermone, [1])
+
+ def test_hermx(self):
+ assert_equal(herm.hermx, [0, .5])
+
+
+class TestArithmetic(object):
+ x = np.linspace(-3, 3, 100)
+
+ def test_hermadd(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] += 1
+ res = herm.hermadd([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_hermsub(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] -= 1
+ res = herm.hermsub([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_hermmulx(self):
+ assert_equal(herm.hermmulx([0]), [0])
+ assert_equal(herm.hermmulx([1]), [0, .5])
+ for i in range(1, 5):
+ ser = [0]*i + [1]
+ tgt = [0]*(i - 1) + [i, 0, .5]
+ assert_equal(herm.hermmulx(ser), tgt)
+
+ def test_hermmul(self):
+ # check values of result
+ for i in range(5):
+ pol1 = [0]*i + [1]
+ val1 = herm.hermval(self.x, pol1)
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ pol2 = [0]*j + [1]
+ val2 = herm.hermval(self.x, pol2)
+ pol3 = herm.hermmul(pol1, pol2)
+ val3 = herm.hermval(self.x, pol3)
+ assert_(len(pol3) == i + j + 1, msg)
+ assert_almost_equal(val3, val1*val2, err_msg=msg)
+
+ def test_hermdiv(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ ci = [0]*i + [1]
+ cj = [0]*j + [1]
+ tgt = herm.hermadd(ci, cj)
+ quo, rem = herm.hermdiv(tgt, ci)
+ res = herm.hermadd(herm.hermmul(quo, ci), rem)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_hermpow(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ c = np.arange(i + 1)
+ tgt = reduce(herm.hermmul, [c]*j, np.array([1]))
+ res = herm.hermpow(c, j)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+
+class TestEvaluation(object):
+ # coefficients of 1 + 2*x + 3*x**2
+ c1d = np.array([2.5, 1., .75])
+ c2d = np.einsum('i,j->ij', c1d, c1d)
+ c3d = np.einsum('i,j,k->ijk', c1d, c1d, c1d)
+
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+ y = polyval(x, [1., 2., 3.])
+
+ def test_hermval(self):
+ #check empty input
+ assert_equal(herm.hermval([], [1]).size, 0)
+
+        #check normal input
+ x = np.linspace(-1, 1)
+ y = [polyval(x, c) for c in Hlist]
+ for i in range(10):
+ msg = "At i=%d" % i
+ tgt = y[i]
+ res = herm.hermval(x, [0]*i + [1])
+ assert_almost_equal(res, tgt, err_msg=msg)
+
+ #check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(herm.hermval(x, [1]).shape, dims)
+ assert_equal(herm.hermval(x, [1, 0]).shape, dims)
+ assert_equal(herm.hermval(x, [1, 0, 0]).shape, dims)
+
+ def test_hermval2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, herm.hermval2d, x1, x2[:2], self.c2d)
+
+ #test values
+ tgt = y1*y2
+ res = herm.hermval2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herm.hermval2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3))
+
+ def test_hermval3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, herm.hermval3d, x1, x2, x3[:2], self.c3d)
+
+ #test values
+ tgt = y1*y2*y3
+ res = herm.hermval3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herm.hermval3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3))
+
+ def test_hermgrid2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j->ij', y1, y2)
+ res = herm.hermgrid2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herm.hermgrid2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3)*2)
+
+ def test_hermgrid3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j,k->ijk', y1, y2, y3)
+ res = herm.hermgrid3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herm.hermgrid3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3)*3)
+
+
+class TestIntegral(object):
+
+ def test_hermint(self):
+ # check exceptions
+ assert_raises(ValueError, herm.hermint, [0], .5)
+ assert_raises(ValueError, herm.hermint, [0], -1)
+ assert_raises(ValueError, herm.hermint, [0], 1, [0, 0])
+ assert_raises(ValueError, herm.hermint, [0], lbnd=[0])
+ assert_raises(ValueError, herm.hermint, [0], scl=[0])
+ assert_raises(ValueError, herm.hermint, [0], axis=.5)
+
+ # test integration of zero polynomial
+ for i in range(2, 5):
+ k = [0]*(i - 2) + [1]
+ res = herm.hermint([0], m=i, k=k)
+ assert_almost_equal(res, [0, .5])
+
+ # check single integration with integration constant
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [1/scl]
+ hermpol = herm.poly2herm(pol)
+ hermint = herm.hermint(hermpol, m=1, k=[i])
+ res = herm.herm2poly(hermint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check single integration with integration constant and lbnd
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ hermpol = herm.poly2herm(pol)
+ hermint = herm.hermint(hermpol, m=1, k=[i], lbnd=-1)
+ assert_almost_equal(herm.hermval(-1, hermint), i)
+
+ # check single integration with integration constant and scaling
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [2/scl]
+ hermpol = herm.poly2herm(pol)
+ hermint = herm.hermint(hermpol, m=1, k=[i], scl=2)
+ res = herm.herm2poly(hermint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with default k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herm.hermint(tgt, m=1)
+ res = herm.hermint(pol, m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with defined k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herm.hermint(tgt, m=1, k=[k])
+ res = herm.hermint(pol, m=j, k=list(range(j)))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with lbnd
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herm.hermint(tgt, m=1, k=[k], lbnd=-1)
+ res = herm.hermint(pol, m=j, k=list(range(j)), lbnd=-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herm.hermint(tgt, m=1, k=[k], scl=2)
+ res = herm.hermint(pol, m=j, k=list(range(j)), scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_hermint_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([herm.hermint(c) for c in c2d.T]).T
+ res = herm.hermint(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([herm.hermint(c) for c in c2d])
+ res = herm.hermint(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([herm.hermint(c, k=3) for c in c2d])
+ res = herm.hermint(c2d, k=3, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestDerivative(object):
+
+ def test_hermder(self):
+ # check exceptions
+ assert_raises(ValueError, herm.hermder, [0], .5)
+ assert_raises(ValueError, herm.hermder, [0], -1)
+
+ # check that zeroth derivative does nothing
+ for i in range(5):
+ tgt = [0]*i + [1]
+ res = herm.hermder(tgt, m=0)
+ assert_equal(trim(res), trim(tgt))
+
+ # check that derivation is the inverse of integration
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = herm.hermder(herm.hermint(tgt, m=j), m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check derivation with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = herm.hermder(herm.hermint(tgt, m=j, scl=2), m=j, scl=.5)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_hermder_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([herm.hermder(c) for c in c2d.T]).T
+ res = herm.hermder(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([herm.hermder(c) for c in c2d])
+ res = herm.hermder(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestVander(object):
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+
+ def test_hermvander(self):
+ # check for 1d x
+ x = np.arange(3)
+ v = herm.hermvander(x, 3)
+ assert_(v.shape == (3, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], herm.hermval(x, coef))
+
+ # check for 2d x
+ x = np.array([[1, 2], [3, 4], [5, 6]])
+ v = herm.hermvander(x, 3)
+ assert_(v.shape == (3, 2, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], herm.hermval(x, coef))
+
+ def test_hermvander2d(self):
+ # also tests hermval2d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3))
+ van = herm.hermvander2d(x1, x2, [1, 2])
+ tgt = herm.hermval2d(x1, x2, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = herm.hermvander2d([x1], [x2], [1, 2])
+ assert_(van.shape == (1, 5, 6))
+
+ def test_hermvander3d(self):
+ # also tests hermval3d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3, 4))
+ van = herm.hermvander3d(x1, x2, x3, [1, 2, 3])
+ tgt = herm.hermval3d(x1, x2, x3, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = herm.hermvander3d([x1], [x2], [x3], [1, 2, 3])
+ assert_(van.shape == (1, 5, 24))
+
+
+class TestFitting(object):
+
+ def test_hermfit(self):
+ def f(x):
+ return x*(x - 1)*(x - 2)
+
+ def f2(x):
+ return x**4 + x**2 + 1
+
+ # Test exceptions
+ assert_raises(ValueError, herm.hermfit, [1], [1], -1)
+ assert_raises(TypeError, herm.hermfit, [[1]], [1], 0)
+ assert_raises(TypeError, herm.hermfit, [], [1], 0)
+ assert_raises(TypeError, herm.hermfit, [1], [[[1]]], 0)
+ assert_raises(TypeError, herm.hermfit, [1, 2], [1], 0)
+ assert_raises(TypeError, herm.hermfit, [1], [1, 2], 0)
+ assert_raises(TypeError, herm.hermfit, [1], [1], 0, w=[[1]])
+ assert_raises(TypeError, herm.hermfit, [1], [1], 0, w=[1, 1])
+ assert_raises(ValueError, herm.hermfit, [1], [1], [-1,])
+ assert_raises(ValueError, herm.hermfit, [1], [1], [2, -1, 6])
+ assert_raises(TypeError, herm.hermfit, [1], [1], [])
+
+ # Test fit
+ x = np.linspace(0, 2)
+ y = f(x)
+ #
+ coef3 = herm.hermfit(x, y, 3)
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(herm.hermval(x, coef3), y)
+ coef3 = herm.hermfit(x, y, [0, 1, 2, 3])
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(herm.hermval(x, coef3), y)
+ #
+ coef4 = herm.hermfit(x, y, 4)
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(herm.hermval(x, coef4), y)
+ coef4 = herm.hermfit(x, y, [0, 1, 2, 3, 4])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(herm.hermval(x, coef4), y)
+        # check things still work if deg is not in strictly increasing order
+ coef4 = herm.hermfit(x, y, [2, 3, 4, 1, 0])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(herm.hermval(x, coef4), y)
+ #
+ coef2d = herm.hermfit(x, np.array([y, y]).T, 3)
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ coef2d = herm.hermfit(x, np.array([y, y]).T, [0, 1, 2, 3])
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ # test weighting
+ w = np.zeros_like(x)
+ yw = y.copy()
+ w[1::2] = 1
+ y[0::2] = 0
+ wcoef3 = herm.hermfit(x, yw, 3, w=w)
+ assert_almost_equal(wcoef3, coef3)
+ wcoef3 = herm.hermfit(x, yw, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef3, coef3)
+ #
+ wcoef2d = herm.hermfit(x, np.array([yw, yw]).T, 3, w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ wcoef2d = herm.hermfit(x, np.array([yw, yw]).T, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+        # test scaling with complex x values whose squares sum to zero
+ x = [1, 1j, -1, -1j]
+ assert_almost_equal(herm.hermfit(x, x, 1), [0, .5])
+ assert_almost_equal(herm.hermfit(x, x, [0, 1]), [0, .5])
+        # test fitting using only even Hermite polynomials
+ x = np.linspace(-1, 1)
+ y = f2(x)
+ coef1 = herm.hermfit(x, y, 4)
+ assert_almost_equal(herm.hermval(x, coef1), y)
+ coef2 = herm.hermfit(x, y, [0, 2, 4])
+ assert_almost_equal(herm.hermval(x, coef2), y)
+ assert_almost_equal(coef1, coef2)
+
+
+class TestCompanion(object):
+
+ def test_raises(self):
+ assert_raises(ValueError, herm.hermcompanion, [])
+ assert_raises(ValueError, herm.hermcompanion, [1])
+
+ def test_dimensions(self):
+ for i in range(1, 5):
+ coef = [0]*i + [1]
+ assert_(herm.hermcompanion(coef).shape == (i, i))
+
+ def test_linear_root(self):
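+        # the lone root of 1*H0 + 2*H1 = 1 + 4*x is x = -0.25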
+ assert_(herm.hermcompanion([1, 2])[0, 0] == -.25)
+
+
+class TestGauss(object):
+
+ def test_100(self):
+ x, w = herm.hermgauss(100)
+
+        # test orthogonality. Note that the results need to be normalized;
+        # otherwise the huge values that can arise from fast-growing
+        # functions like Laguerre can be very confusing.
+ v = herm.hermvander(x, 99)
+ vv = np.dot(v.T * w, v)
+ vd = 1/np.sqrt(vv.diagonal())
+ vv = vd[:, None] * vv * vd
+ assert_almost_equal(vv, np.eye(100))
+
+ # check that the integral of 1 is correct
+ tgt = np.sqrt(np.pi)
+ assert_almost_equal(w.sum(), tgt)
+
+
+class TestMisc(object):
+
+ def test_hermfromroots(self):
+ res = herm.hermfromroots([])
+ assert_almost_equal(trim(res), [1])
+ for i in range(1, 5):
+ roots = np.cos(np.linspace(-np.pi, 0, 2*i + 1)[1::2])
+ pol = herm.hermfromroots(roots)
+ res = herm.hermval(roots, pol)
+ tgt = 0
+ assert_(len(pol) == i + 1)
+ assert_almost_equal(herm.herm2poly(pol)[-1], 1)
+ assert_almost_equal(res, tgt)
+
+ def test_hermroots(self):
+ assert_almost_equal(herm.hermroots([1]), [])
+ assert_almost_equal(herm.hermroots([1, 1]), [-.5])
+ for i in range(2, 5):
+ tgt = np.linspace(-1, 1, i)
+ res = herm.hermroots(herm.hermfromroots(tgt))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_hermtrim(self):
+ coef = [2, -1, 1, 0]
+
+ # Test exceptions
+ assert_raises(ValueError, herm.hermtrim, coef, -1)
+
+ # Test results
+ assert_equal(herm.hermtrim(coef), coef[:-1])
+ assert_equal(herm.hermtrim(coef, 1), coef[:-3])
+ assert_equal(herm.hermtrim(coef, 2), [0])
+
+ def test_hermline(self):
+ assert_equal(herm.hermline(3, 4), [3, 2])
+
+ def test_herm2poly(self):
+ for i in range(10):
+ assert_almost_equal(herm.herm2poly([0]*i + [1]), Hlist[i])
+
+ def test_poly2herm(self):
+ for i in range(10):
+ assert_almost_equal(herm.poly2herm(Hlist[i]), [0]*i + [1])
+
+ def test_weight(self):
+ x = np.linspace(-5, 5, 11)
+ tgt = np.exp(-x**2)
+ res = herm.hermweight(x)
+ assert_almost_equal(res, tgt)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite_e.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite_e.py
new file mode 100644
index 00000000000..ccb44ad73ce
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_hermite_e.py
@@ -0,0 +1,558 @@
+"""Tests for hermite_e module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.polynomial.hermite_e as herme
+from numpy.polynomial.polynomial import polyval
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+He0 = np.array([1])
+He1 = np.array([0, 1])
+He2 = np.array([-1, 0, 1])
+He3 = np.array([0, -3, 0, 1])
+He4 = np.array([3, 0, -6, 0, 1])
+He5 = np.array([0, 15, 0, -10, 0, 1])
+He6 = np.array([-15, 0, 45, 0, -15, 0, 1])
+He7 = np.array([0, -105, 0, 105, 0, -21, 0, 1])
+He8 = np.array([105, 0, -420, 0, 210, 0, -28, 0, 1])
+He9 = np.array([0, 945, 0, -1260, 0, 378, 0, -36, 0, 1])
+
+Helist = [He0, He1, He2, He3, He4, He5, He6, He7, He8, He9]
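+# The rows above are the power-series coefficients of the probabilists'
+# Hermite polynomials.  A minimal sketch (not part of the upstream test file)
+# that would regenerate them from the recurrence He_{n+1} = x*He_n - n*He_{n-1}:
+#
+#     prev, cur = np.array([1]), np.array([0, 1])          # He0, He1
+#     for n in range(1, 9):
+#         prev, cur = cur, np.append([0], cur) - n*np.append(prev, [0, 0])
+#     # afterwards ``prev`` holds He8 and ``cur`` holds He9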
+
+
+def trim(x):
+ return herme.hermetrim(x, tol=1e-6)
+
+
+class TestConstants(object):
+
+ def test_hermedomain(self):
+ assert_equal(herme.hermedomain, [-1, 1])
+
+ def test_hermezero(self):
+ assert_equal(herme.hermezero, [0])
+
+ def test_hermeone(self):
+ assert_equal(herme.hermeone, [1])
+
+ def test_hermex(self):
+ assert_equal(herme.hermex, [0, 1])
+
+
+class TestArithmetic(object):
+ x = np.linspace(-3, 3, 100)
+
+ def test_hermeadd(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] += 1
+ res = herme.hermeadd([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_hermesub(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] -= 1
+ res = herme.hermesub([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_hermemulx(self):
+ assert_equal(herme.hermemulx([0]), [0])
+ assert_equal(herme.hermemulx([1]), [0, 1])
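+        # ``tgt`` below encodes the identity x*He_n = n*He_{n-1} + He_{n+1}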
+ for i in range(1, 5):
+ ser = [0]*i + [1]
+ tgt = [0]*(i - 1) + [i, 0, 1]
+ assert_equal(herme.hermemulx(ser), tgt)
+
+ def test_hermemul(self):
+ # check values of result
+ for i in range(5):
+ pol1 = [0]*i + [1]
+ val1 = herme.hermeval(self.x, pol1)
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ pol2 = [0]*j + [1]
+ val2 = herme.hermeval(self.x, pol2)
+ pol3 = herme.hermemul(pol1, pol2)
+ val3 = herme.hermeval(self.x, pol3)
+ assert_(len(pol3) == i + j + 1, msg)
+ assert_almost_equal(val3, val1*val2, err_msg=msg)
+
+ def test_hermediv(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ ci = [0]*i + [1]
+ cj = [0]*j + [1]
+ tgt = herme.hermeadd(ci, cj)
+ quo, rem = herme.hermediv(tgt, ci)
+ res = herme.hermeadd(herme.hermemul(quo, ci), rem)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_hermepow(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ c = np.arange(i + 1)
+ tgt = reduce(herme.hermemul, [c]*j, np.array([1]))
+ res = herme.hermepow(c, j)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+
+class TestEvaluation(object):
+    # coefficients of 1 + 2*x + 3*x**2 in the HermiteE basis
+ c1d = np.array([4., 2., 3.])
+ c2d = np.einsum('i,j->ij', c1d, c1d)
+ c3d = np.einsum('i,j,k->ijk', c1d, c1d, c1d)
+
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+ y = polyval(x, [1., 2., 3.])
+
+ def test_hermeval(self):
+ #check empty input
+ assert_equal(herme.hermeval([], [1]).size, 0)
+
+        #check normal input
+ x = np.linspace(-1, 1)
+ y = [polyval(x, c) for c in Helist]
+ for i in range(10):
+ msg = "At i=%d" % i
+ tgt = y[i]
+ res = herme.hermeval(x, [0]*i + [1])
+ assert_almost_equal(res, tgt, err_msg=msg)
+
+ #check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(herme.hermeval(x, [1]).shape, dims)
+ assert_equal(herme.hermeval(x, [1, 0]).shape, dims)
+ assert_equal(herme.hermeval(x, [1, 0, 0]).shape, dims)
+
+ def test_hermeval2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, herme.hermeval2d, x1, x2[:2], self.c2d)
+
+ #test values
+ tgt = y1*y2
+ res = herme.hermeval2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herme.hermeval2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3))
+
+ def test_hermeval3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, herme.hermeval3d, x1, x2, x3[:2], self.c3d)
+
+ #test values
+ tgt = y1*y2*y3
+ res = herme.hermeval3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herme.hermeval3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3))
+
+ def test_hermegrid2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j->ij', y1, y2)
+ res = herme.hermegrid2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herme.hermegrid2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3)*2)
+
+ def test_hermegrid3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j,k->ijk', y1, y2, y3)
+ res = herme.hermegrid3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = herme.hermegrid3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3)*3)
+
+
+class TestIntegral(object):
+
+ def test_hermeint(self):
+ # check exceptions
+ assert_raises(ValueError, herme.hermeint, [0], .5)
+ assert_raises(ValueError, herme.hermeint, [0], -1)
+ assert_raises(ValueError, herme.hermeint, [0], 1, [0, 0])
+ assert_raises(ValueError, herme.hermeint, [0], lbnd=[0])
+ assert_raises(ValueError, herme.hermeint, [0], scl=[0])
+ assert_raises(ValueError, herme.hermeint, [0], axis=.5)
+
+ # test integration of zero polynomial
+ for i in range(2, 5):
+ k = [0]*(i - 2) + [1]
+ res = herme.hermeint([0], m=i, k=k)
+ assert_almost_equal(res, [0, 1])
+
+ # check single integration with integration constant
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [1/scl]
+ hermepol = herme.poly2herme(pol)
+ hermeint = herme.hermeint(hermepol, m=1, k=[i])
+ res = herme.herme2poly(hermeint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check single integration with integration constant and lbnd
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ hermepol = herme.poly2herme(pol)
+ hermeint = herme.hermeint(hermepol, m=1, k=[i], lbnd=-1)
+ assert_almost_equal(herme.hermeval(-1, hermeint), i)
+
+ # check single integration with integration constant and scaling
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [2/scl]
+ hermepol = herme.poly2herme(pol)
+ hermeint = herme.hermeint(hermepol, m=1, k=[i], scl=2)
+ res = herme.herme2poly(hermeint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with default k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herme.hermeint(tgt, m=1)
+ res = herme.hermeint(pol, m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with defined k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herme.hermeint(tgt, m=1, k=[k])
+ res = herme.hermeint(pol, m=j, k=list(range(j)))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with lbnd
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herme.hermeint(tgt, m=1, k=[k], lbnd=-1)
+ res = herme.hermeint(pol, m=j, k=list(range(j)), lbnd=-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = herme.hermeint(tgt, m=1, k=[k], scl=2)
+ res = herme.hermeint(pol, m=j, k=list(range(j)), scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_hermeint_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([herme.hermeint(c) for c in c2d.T]).T
+ res = herme.hermeint(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([herme.hermeint(c) for c in c2d])
+ res = herme.hermeint(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([herme.hermeint(c, k=3) for c in c2d])
+ res = herme.hermeint(c2d, k=3, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestDerivative(object):
+
+ def test_hermeder(self):
+ # check exceptions
+ assert_raises(ValueError, herme.hermeder, [0], .5)
+ assert_raises(ValueError, herme.hermeder, [0], -1)
+
+ # check that zeroth derivative does nothing
+ for i in range(5):
+ tgt = [0]*i + [1]
+ res = herme.hermeder(tgt, m=0)
+ assert_equal(trim(res), trim(tgt))
+
+ # check that derivation is the inverse of integration
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = herme.hermeder(herme.hermeint(tgt, m=j), m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check derivation with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = herme.hermeder(
+ herme.hermeint(tgt, m=j, scl=2), m=j, scl=.5)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_hermeder_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([herme.hermeder(c) for c in c2d.T]).T
+ res = herme.hermeder(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([herme.hermeder(c) for c in c2d])
+ res = herme.hermeder(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestVander(object):
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+
+ def test_hermevander(self):
+ # check for 1d x
+ x = np.arange(3)
+ v = herme.hermevander(x, 3)
+ assert_(v.shape == (3, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], herme.hermeval(x, coef))
+
+ # check for 2d x
+ x = np.array([[1, 2], [3, 4], [5, 6]])
+ v = herme.hermevander(x, 3)
+ assert_(v.shape == (3, 2, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], herme.hermeval(x, coef))
+
+ def test_hermevander2d(self):
+ # also tests hermeval2d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3))
+ van = herme.hermevander2d(x1, x2, [1, 2])
+ tgt = herme.hermeval2d(x1, x2, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = herme.hermevander2d([x1], [x2], [1, 2])
+ assert_(van.shape == (1, 5, 6))
+
+ def test_hermevander3d(self):
+ # also tests hermeval3d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3, 4))
+ van = herme.hermevander3d(x1, x2, x3, [1, 2, 3])
+ tgt = herme.hermeval3d(x1, x2, x3, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = herme.hermevander3d([x1], [x2], [x3], [1, 2, 3])
+ assert_(van.shape == (1, 5, 24))
+
+
+class TestFitting(object):
+
+ def test_hermefit(self):
+ def f(x):
+ return x*(x - 1)*(x - 2)
+
+ def f2(x):
+ return x**4 + x**2 + 1
+
+ # Test exceptions
+ assert_raises(ValueError, herme.hermefit, [1], [1], -1)
+ assert_raises(TypeError, herme.hermefit, [[1]], [1], 0)
+ assert_raises(TypeError, herme.hermefit, [], [1], 0)
+ assert_raises(TypeError, herme.hermefit, [1], [[[1]]], 0)
+ assert_raises(TypeError, herme.hermefit, [1, 2], [1], 0)
+ assert_raises(TypeError, herme.hermefit, [1], [1, 2], 0)
+ assert_raises(TypeError, herme.hermefit, [1], [1], 0, w=[[1]])
+ assert_raises(TypeError, herme.hermefit, [1], [1], 0, w=[1, 1])
+ assert_raises(ValueError, herme.hermefit, [1], [1], [-1,])
+ assert_raises(ValueError, herme.hermefit, [1], [1], [2, -1, 6])
+ assert_raises(TypeError, herme.hermefit, [1], [1], [])
+
+ # Test fit
+ x = np.linspace(0, 2)
+ y = f(x)
+ #
+ coef3 = herme.hermefit(x, y, 3)
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(herme.hermeval(x, coef3), y)
+ coef3 = herme.hermefit(x, y, [0, 1, 2, 3])
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(herme.hermeval(x, coef3), y)
+ #
+ coef4 = herme.hermefit(x, y, 4)
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(herme.hermeval(x, coef4), y)
+ coef4 = herme.hermefit(x, y, [0, 1, 2, 3, 4])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(herme.hermeval(x, coef4), y)
+        # check things still work if deg is not in strictly increasing order
+ coef4 = herme.hermefit(x, y, [2, 3, 4, 1, 0])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(herme.hermeval(x, coef4), y)
+ #
+ coef2d = herme.hermefit(x, np.array([y, y]).T, 3)
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ coef2d = herme.hermefit(x, np.array([y, y]).T, [0, 1, 2, 3])
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ # test weighting
+ w = np.zeros_like(x)
+ yw = y.copy()
+ w[1::2] = 1
+ y[0::2] = 0
+ wcoef3 = herme.hermefit(x, yw, 3, w=w)
+ assert_almost_equal(wcoef3, coef3)
+ wcoef3 = herme.hermefit(x, yw, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef3, coef3)
+ #
+ wcoef2d = herme.hermefit(x, np.array([yw, yw]).T, 3, w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ wcoef2d = herme.hermefit(x, np.array([yw, yw]).T, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+        # test scaling with complex x values whose squares sum to zero
+ x = [1, 1j, -1, -1j]
+ assert_almost_equal(herme.hermefit(x, x, 1), [0, 1])
+ assert_almost_equal(herme.hermefit(x, x, [0, 1]), [0, 1])
+        # test fitting using only even HermiteE polynomials
+ x = np.linspace(-1, 1)
+ y = f2(x)
+ coef1 = herme.hermefit(x, y, 4)
+ assert_almost_equal(herme.hermeval(x, coef1), y)
+ coef2 = herme.hermefit(x, y, [0, 2, 4])
+ assert_almost_equal(herme.hermeval(x, coef2), y)
+ assert_almost_equal(coef1, coef2)
+
+
+class TestCompanion(object):
+
+ def test_raises(self):
+ assert_raises(ValueError, herme.hermecompanion, [])
+ assert_raises(ValueError, herme.hermecompanion, [1])
+
+ def test_dimensions(self):
+ for i in range(1, 5):
+ coef = [0]*i + [1]
+ assert_(herme.hermecompanion(coef).shape == (i, i))
+
+ def test_linear_root(self):
+ assert_(herme.hermecompanion([1, 2])[0, 0] == -.5)
+
+
+class TestGauss(object):
+
+ def test_100(self):
+ x, w = herme.hermegauss(100)
+
+        # test orthogonality. Note that the results need to be normalized;
+        # otherwise the huge values that can arise from fast-growing
+        # functions like Laguerre can be very confusing.
+ v = herme.hermevander(x, 99)
+ vv = np.dot(v.T * w, v)
+ vd = 1/np.sqrt(vv.diagonal())
+ vv = vd[:, None] * vv * vd
+ assert_almost_equal(vv, np.eye(100))
+
+ # check that the integral of 1 is correct
+ tgt = np.sqrt(2*np.pi)
+ assert_almost_equal(w.sum(), tgt)
+
+
+class TestMisc(object):
+
+ def test_hermefromroots(self):
+ res = herme.hermefromroots([])
+ assert_almost_equal(trim(res), [1])
+ for i in range(1, 5):
+ roots = np.cos(np.linspace(-np.pi, 0, 2*i + 1)[1::2])
+ pol = herme.hermefromroots(roots)
+ res = herme.hermeval(roots, pol)
+ tgt = 0
+ assert_(len(pol) == i + 1)
+ assert_almost_equal(herme.herme2poly(pol)[-1], 1)
+ assert_almost_equal(res, tgt)
+
+ def test_hermeroots(self):
+ assert_almost_equal(herme.hermeroots([1]), [])
+ assert_almost_equal(herme.hermeroots([1, 1]), [-1])
+ for i in range(2, 5):
+ tgt = np.linspace(-1, 1, i)
+ res = herme.hermeroots(herme.hermefromroots(tgt))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_hermetrim(self):
+ coef = [2, -1, 1, 0]
+
+ # Test exceptions
+ assert_raises(ValueError, herme.hermetrim, coef, -1)
+
+ # Test results
+ assert_equal(herme.hermetrim(coef), coef[:-1])
+ assert_equal(herme.hermetrim(coef, 1), coef[:-3])
+ assert_equal(herme.hermetrim(coef, 2), [0])
+
+ def test_hermeline(self):
+ assert_equal(herme.hermeline(3, 4), [3, 4])
+
+ def test_herme2poly(self):
+ for i in range(10):
+ assert_almost_equal(herme.herme2poly([0]*i + [1]), Helist[i])
+
+ def test_poly2herme(self):
+ for i in range(10):
+ assert_almost_equal(herme.poly2herme(Helist[i]), [0]*i + [1])
+
+ def test_weight(self):
+ x = np.linspace(-5, 5, 11)
+ tgt = np.exp(-.5*x**2)
+ res = herme.hermeweight(x)
+ assert_almost_equal(res, tgt)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_laguerre.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_laguerre.py
new file mode 100644
index 00000000000..3ababec5e9b
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_laguerre.py
@@ -0,0 +1,539 @@
+"""Tests for laguerre module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.polynomial.laguerre as lag
+from numpy.polynomial.polynomial import polyval
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+L0 = np.array([1])/1
+L1 = np.array([1, -1])/1
+L2 = np.array([2, -4, 1])/2
+L3 = np.array([6, -18, 9, -1])/6
+L4 = np.array([24, -96, 72, -16, 1])/24
+L5 = np.array([120, -600, 600, -200, 25, -1])/120
+L6 = np.array([720, -4320, 5400, -2400, 450, -36, 1])/720
+
+Llist = [L0, L1, L2, L3, L4, L5, L6]
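+# The rows above are the power-series coefficients of the Laguerre
+# polynomials.  A minimal sketch (not part of the upstream test file) that
+# would regenerate them from (n + 1)*L_{n+1} = (2*n + 1 - x)*L_n - n*L_{n-1}:
+#
+#     prev, cur = np.array([1.]), np.array([1., -1.])      # L0, L1
+#     for n in range(1, 6):
+#         prev, cur = cur, ((2*n + 1)*np.append(cur, [0]) - np.append([0], cur)
+#                           - n*np.append(prev, [0, 0]))/(n + 1)
+#     # afterwards ``prev`` holds L5 and ``cur`` holds L6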
+
+
+def trim(x):
+ return lag.lagtrim(x, tol=1e-6)
+
+
+class TestConstants(object):
+
+ def test_lagdomain(self):
+ assert_equal(lag.lagdomain, [0, 1])
+
+ def test_lagzero(self):
+ assert_equal(lag.lagzero, [0])
+
+ def test_lagone(self):
+ assert_equal(lag.lagone, [1])
+
+ def test_lagx(self):
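+        # L1(x) = 1 - x, so x written as a Laguerre series is L0 - L1, i.e. [1, -1]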
+ assert_equal(lag.lagx, [1, -1])
+
+
+class TestArithmetic(object):
+ x = np.linspace(-3, 3, 100)
+
+ def test_lagadd(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] += 1
+ res = lag.lagadd([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_lagsub(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] -= 1
+ res = lag.lagsub([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_lagmulx(self):
+ assert_equal(lag.lagmulx([0]), [0])
+ assert_equal(lag.lagmulx([1]), [1, -1])
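+        # ``tgt`` below encodes x*L_n = -n*L_{n-1} + (2*n + 1)*L_n - (n + 1)*L_{n+1}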
+ for i in range(1, 5):
+ ser = [0]*i + [1]
+ tgt = [0]*(i - 1) + [-i, 2*i + 1, -(i + 1)]
+ assert_almost_equal(lag.lagmulx(ser), tgt)
+
+ def test_lagmul(self):
+ # check values of result
+ for i in range(5):
+ pol1 = [0]*i + [1]
+ val1 = lag.lagval(self.x, pol1)
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ pol2 = [0]*j + [1]
+ val2 = lag.lagval(self.x, pol2)
+ pol3 = lag.lagmul(pol1, pol2)
+ val3 = lag.lagval(self.x, pol3)
+ assert_(len(pol3) == i + j + 1, msg)
+ assert_almost_equal(val3, val1*val2, err_msg=msg)
+
+ def test_lagdiv(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ ci = [0]*i + [1]
+ cj = [0]*j + [1]
+ tgt = lag.lagadd(ci, cj)
+ quo, rem = lag.lagdiv(tgt, ci)
+ res = lag.lagadd(lag.lagmul(quo, ci), rem)
+ assert_almost_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_lagpow(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ c = np.arange(i + 1)
+ tgt = reduce(lag.lagmul, [c]*j, np.array([1]))
+ res = lag.lagpow(c, j)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+
+class TestEvaluation(object):
+    # coefficients of 1 + 2*x + 3*x**2 in the Laguerre basis
+ c1d = np.array([9., -14., 6.])
+ c2d = np.einsum('i,j->ij', c1d, c1d)
+ c3d = np.einsum('i,j,k->ijk', c1d, c1d, c1d)
+
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+ y = polyval(x, [1., 2., 3.])
+
+ def test_lagval(self):
+ #check empty input
+ assert_equal(lag.lagval([], [1]).size, 0)
+
+        #check normal input
+ x = np.linspace(-1, 1)
+ y = [polyval(x, c) for c in Llist]
+ for i in range(7):
+ msg = "At i=%d" % i
+ tgt = y[i]
+ res = lag.lagval(x, [0]*i + [1])
+ assert_almost_equal(res, tgt, err_msg=msg)
+
+ #check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(lag.lagval(x, [1]).shape, dims)
+ assert_equal(lag.lagval(x, [1, 0]).shape, dims)
+ assert_equal(lag.lagval(x, [1, 0, 0]).shape, dims)
+
+ def test_lagval2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, lag.lagval2d, x1, x2[:2], self.c2d)
+
+ #test values
+ tgt = y1*y2
+ res = lag.lagval2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = lag.lagval2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3))
+
+ def test_lagval3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, lag.lagval3d, x1, x2, x3[:2], self.c3d)
+
+ #test values
+ tgt = y1*y2*y3
+ res = lag.lagval3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = lag.lagval3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3))
+
+ def test_laggrid2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j->ij', y1, y2)
+ res = lag.laggrid2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = lag.laggrid2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3)*2)
+
+ def test_laggrid3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j,k->ijk', y1, y2, y3)
+ res = lag.laggrid3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = lag.laggrid3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3)*3)
+
+
+class TestIntegral(object):
+
+ def test_lagint(self):
+ # check exceptions
+ assert_raises(ValueError, lag.lagint, [0], .5)
+ assert_raises(ValueError, lag.lagint, [0], -1)
+ assert_raises(ValueError, lag.lagint, [0], 1, [0, 0])
+ assert_raises(ValueError, lag.lagint, [0], lbnd=[0])
+ assert_raises(ValueError, lag.lagint, [0], scl=[0])
+ assert_raises(ValueError, lag.lagint, [0], axis=.5)
+
+ # test integration of zero polynomial
+ for i in range(2, 5):
+ k = [0]*(i - 2) + [1]
+ res = lag.lagint([0], m=i, k=k)
+ assert_almost_equal(res, [1, -1])
+
+ # check single integration with integration constant
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [1/scl]
+ lagpol = lag.poly2lag(pol)
+ lagint = lag.lagint(lagpol, m=1, k=[i])
+ res = lag.lag2poly(lagint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check single integration with integration constant and lbnd
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ lagpol = lag.poly2lag(pol)
+ lagint = lag.lagint(lagpol, m=1, k=[i], lbnd=-1)
+ assert_almost_equal(lag.lagval(-1, lagint), i)
+
+ # check single integration with integration constant and scaling
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [2/scl]
+ lagpol = lag.poly2lag(pol)
+ lagint = lag.lagint(lagpol, m=1, k=[i], scl=2)
+ res = lag.lag2poly(lagint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with default k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = lag.lagint(tgt, m=1)
+ res = lag.lagint(pol, m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with defined k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = lag.lagint(tgt, m=1, k=[k])
+ res = lag.lagint(pol, m=j, k=list(range(j)))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with lbnd
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = lag.lagint(tgt, m=1, k=[k], lbnd=-1)
+ res = lag.lagint(pol, m=j, k=list(range(j)), lbnd=-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = lag.lagint(tgt, m=1, k=[k], scl=2)
+ res = lag.lagint(pol, m=j, k=list(range(j)), scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_lagint_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([lag.lagint(c) for c in c2d.T]).T
+ res = lag.lagint(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([lag.lagint(c) for c in c2d])
+ res = lag.lagint(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([lag.lagint(c, k=3) for c in c2d])
+ res = lag.lagint(c2d, k=3, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestDerivative(object):
+
+ def test_lagder(self):
+ # check exceptions
+ assert_raises(ValueError, lag.lagder, [0], .5)
+ assert_raises(ValueError, lag.lagder, [0], -1)
+
+ # check that zeroth derivative does nothing
+ for i in range(5):
+ tgt = [0]*i + [1]
+ res = lag.lagder(tgt, m=0)
+ assert_equal(trim(res), trim(tgt))
+
+ # check that derivation is the inverse of integration
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = lag.lagder(lag.lagint(tgt, m=j), m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check derivation with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = lag.lagder(lag.lagint(tgt, m=j, scl=2), m=j, scl=.5)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_lagder_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([lag.lagder(c) for c in c2d.T]).T
+ res = lag.lagder(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([lag.lagder(c) for c in c2d])
+ res = lag.lagder(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestVander(object):
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+
+ def test_lagvander(self):
+ # check for 1d x
+ x = np.arange(3)
+ v = lag.lagvander(x, 3)
+ assert_(v.shape == (3, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], lag.lagval(x, coef))
+
+ # check for 2d x
+ x = np.array([[1, 2], [3, 4], [5, 6]])
+ v = lag.lagvander(x, 3)
+ assert_(v.shape == (3, 2, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], lag.lagval(x, coef))
+
+ def test_lagvander2d(self):
+ # also tests lagval2d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3))
+ van = lag.lagvander2d(x1, x2, [1, 2])
+ tgt = lag.lagval2d(x1, x2, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = lag.lagvander2d([x1], [x2], [1, 2])
+ assert_(van.shape == (1, 5, 6))
+
+ def test_lagvander3d(self):
+ # also tests lagval3d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3, 4))
+ van = lag.lagvander3d(x1, x2, x3, [1, 2, 3])
+ tgt = lag.lagval3d(x1, x2, x3, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = lag.lagvander3d([x1], [x2], [x3], [1, 2, 3])
+ assert_(van.shape == (1, 5, 24))
+
+
+class TestFitting(object):
+
+ def test_lagfit(self):
+ def f(x):
+ return x*(x - 1)*(x - 2)
+
+ # Test exceptions
+ assert_raises(ValueError, lag.lagfit, [1], [1], -1)
+ assert_raises(TypeError, lag.lagfit, [[1]], [1], 0)
+ assert_raises(TypeError, lag.lagfit, [], [1], 0)
+ assert_raises(TypeError, lag.lagfit, [1], [[[1]]], 0)
+ assert_raises(TypeError, lag.lagfit, [1, 2], [1], 0)
+ assert_raises(TypeError, lag.lagfit, [1], [1, 2], 0)
+ assert_raises(TypeError, lag.lagfit, [1], [1], 0, w=[[1]])
+ assert_raises(TypeError, lag.lagfit, [1], [1], 0, w=[1, 1])
+ assert_raises(ValueError, lag.lagfit, [1], [1], [-1,])
+ assert_raises(ValueError, lag.lagfit, [1], [1], [2, -1, 6])
+ assert_raises(TypeError, lag.lagfit, [1], [1], [])
+
+ # Test fit
+ x = np.linspace(0, 2)
+ y = f(x)
+ #
+ coef3 = lag.lagfit(x, y, 3)
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(lag.lagval(x, coef3), y)
+ coef3 = lag.lagfit(x, y, [0, 1, 2, 3])
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(lag.lagval(x, coef3), y)
+ #
+ coef4 = lag.lagfit(x, y, 4)
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(lag.lagval(x, coef4), y)
+ coef4 = lag.lagfit(x, y, [0, 1, 2, 3, 4])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(lag.lagval(x, coef4), y)
+ #
+ coef2d = lag.lagfit(x, np.array([y, y]).T, 3)
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ coef2d = lag.lagfit(x, np.array([y, y]).T, [0, 1, 2, 3])
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ # test weighting
+ w = np.zeros_like(x)
+ yw = y.copy()
+ w[1::2] = 1
+ y[0::2] = 0
+ wcoef3 = lag.lagfit(x, yw, 3, w=w)
+ assert_almost_equal(wcoef3, coef3)
+ wcoef3 = lag.lagfit(x, yw, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef3, coef3)
+ #
+ wcoef2d = lag.lagfit(x, np.array([yw, yw]).T, 3, w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ wcoef2d = lag.lagfit(x, np.array([yw, yw]).T, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+        # test scaling with complex x values whose squares sum to zero
+ x = [1, 1j, -1, -1j]
+ assert_almost_equal(lag.lagfit(x, x, 1), [1, -1])
+ assert_almost_equal(lag.lagfit(x, x, [0, 1]), [1, -1])
+
+
+class TestCompanion(object):
+
+ def test_raises(self):
+ assert_raises(ValueError, lag.lagcompanion, [])
+ assert_raises(ValueError, lag.lagcompanion, [1])
+
+ def test_dimensions(self):
+ for i in range(1, 5):
+ coef = [0]*i + [1]
+ assert_(lag.lagcompanion(coef).shape == (i, i))
+
+ def test_linear_root(self):
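+        # the lone root of 1*L0 + 2*L1 = 1 + 2*(1 - x) = 3 - 2*x is x = 1.5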
+ assert_(lag.lagcompanion([1, 2])[0, 0] == 1.5)
+
+
+class TestGauss(object):
+
+ def test_100(self):
+ x, w = lag.laggauss(100)
+
+        # test orthogonality. Note that the results need to be normalized;
+        # otherwise the huge values that can arise from fast-growing
+        # functions like Laguerre can be very confusing.
+ v = lag.lagvander(x, 99)
+ vv = np.dot(v.T * w, v)
+ vd = 1/np.sqrt(vv.diagonal())
+ vv = vd[:, None] * vv * vd
+ assert_almost_equal(vv, np.eye(100))
+
+ # check that the integral of 1 is correct
+ tgt = 1.0
+ assert_almost_equal(w.sum(), tgt)
+
+
+class TestMisc(object):
+
+ def test_lagfromroots(self):
+ res = lag.lagfromroots([])
+ assert_almost_equal(trim(res), [1])
+ for i in range(1, 5):
+ roots = np.cos(np.linspace(-np.pi, 0, 2*i + 1)[1::2])
+ pol = lag.lagfromroots(roots)
+ res = lag.lagval(roots, pol)
+ tgt = 0
+ assert_(len(pol) == i + 1)
+ assert_almost_equal(lag.lag2poly(pol)[-1], 1)
+ assert_almost_equal(res, tgt)
+
+ def test_lagroots(self):
+ assert_almost_equal(lag.lagroots([1]), [])
+ assert_almost_equal(lag.lagroots([0, 1]), [1])
+ for i in range(2, 5):
+ tgt = np.linspace(0, 3, i)
+ res = lag.lagroots(lag.lagfromroots(tgt))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_lagtrim(self):
+ coef = [2, -1, 1, 0]
+
+ # Test exceptions
+ assert_raises(ValueError, lag.lagtrim, coef, -1)
+
+ # Test results
+ assert_equal(lag.lagtrim(coef), coef[:-1])
+ assert_equal(lag.lagtrim(coef, 1), coef[:-3])
+ assert_equal(lag.lagtrim(coef, 2), [0])
+
+ def test_lagline(self):
+ assert_equal(lag.lagline(3, 4), [7, -4])
+
+ def test_lag2poly(self):
+ for i in range(7):
+ assert_almost_equal(lag.lag2poly([0]*i + [1]), Llist[i])
+
+ def test_poly2lag(self):
+ for i in range(7):
+ assert_almost_equal(lag.poly2lag(Llist[i]), [0]*i + [1])
+
+ def test_weight(self):
+ x = np.linspace(0, 10, 11)
+ tgt = np.exp(-x)
+ res = lag.lagweight(x)
+ assert_almost_equal(res, tgt)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_legendre.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_legendre.py
new file mode 100644
index 00000000000..a23086d5948
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_legendre.py
@@ -0,0 +1,558 @@
+"""Tests for legendre module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.polynomial.legendre as leg
+from numpy.polynomial.polynomial import polyval
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+L0 = np.array([1])
+L1 = np.array([0, 1])
+L2 = np.array([-1, 0, 3])/2
+L3 = np.array([0, -3, 0, 5])/2
+L4 = np.array([3, 0, -30, 0, 35])/8
+L5 = np.array([0, 15, 0, -70, 0, 63])/8
+L6 = np.array([-5, 0, 105, 0, -315, 0, 231])/16
+L7 = np.array([0, -35, 0, 315, 0, -693, 0, 429])/16
+L8 = np.array([35, 0, -1260, 0, 6930, 0, -12012, 0, 6435])/128
+L9 = np.array([0, 315, 0, -4620, 0, 18018, 0, -25740, 0, 12155])/128
+
+Llist = [L0, L1, L2, L3, L4, L5, L6, L7, L8, L9]
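+# The rows above are the power-series coefficients of the Legendre
+# polynomials.  A minimal sketch (not part of the upstream test file) that
+# would regenerate them from (n + 1)*P_{n+1} = (2*n + 1)*x*P_n - n*P_{n-1}:
+#
+#     prev, cur = np.array([1.]), np.array([0., 1.])       # L0, L1
+#     for n in range(1, 9):
+#         prev, cur = cur, ((2*n + 1)*np.append([0], cur)
+#                           - n*np.append(prev, [0, 0]))/(n + 1)
+#     # afterwards ``prev`` holds L8 and ``cur`` holds L9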
+
+
+def trim(x):
+ return leg.legtrim(x, tol=1e-6)
+
+
+class TestConstants(object):
+
+ def test_legdomain(self):
+ assert_equal(leg.legdomain, [-1, 1])
+
+ def test_legzero(self):
+ assert_equal(leg.legzero, [0])
+
+ def test_legone(self):
+ assert_equal(leg.legone, [1])
+
+ def test_legx(self):
+ assert_equal(leg.legx, [0, 1])
+
+
+class TestArithmetic(object):
+ x = np.linspace(-1, 1, 100)
+
+ def test_legadd(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] += 1
+ res = leg.legadd([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_legsub(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] -= 1
+ res = leg.legsub([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_legmulx(self):
+ assert_equal(leg.legmulx([0]), [0])
+ assert_equal(leg.legmulx([1]), [0, 1])
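+        # ``tgt`` below encodes x*P_n = (n*P_{n-1} + (n + 1)*P_{n+1})/(2*n + 1)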
+ for i in range(1, 5):
+ tmp = 2*i + 1
+ ser = [0]*i + [1]
+ tgt = [0]*(i - 1) + [i/tmp, 0, (i + 1)/tmp]
+ assert_equal(leg.legmulx(ser), tgt)
+
+ def test_legmul(self):
+ # check values of result
+ for i in range(5):
+ pol1 = [0]*i + [1]
+ val1 = leg.legval(self.x, pol1)
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ pol2 = [0]*j + [1]
+ val2 = leg.legval(self.x, pol2)
+ pol3 = leg.legmul(pol1, pol2)
+ val3 = leg.legval(self.x, pol3)
+ assert_(len(pol3) == i + j + 1, msg)
+ assert_almost_equal(val3, val1*val2, err_msg=msg)
+
+ def test_legdiv(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ ci = [0]*i + [1]
+ cj = [0]*j + [1]
+ tgt = leg.legadd(ci, cj)
+ quo, rem = leg.legdiv(tgt, ci)
+ res = leg.legadd(leg.legmul(quo, ci), rem)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_legpow(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ c = np.arange(i + 1)
+ tgt = reduce(leg.legmul, [c]*j, np.array([1]))
+ res = leg.legpow(c, j)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+
+class TestEvaluation(object):
+    # coefficients of 1 + 2*x + 3*x**2 in the Legendre basis
+ c1d = np.array([2., 2., 2.])
+ c2d = np.einsum('i,j->ij', c1d, c1d)
+ c3d = np.einsum('i,j,k->ijk', c1d, c1d, c1d)
+
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+ y = polyval(x, [1., 2., 3.])
+
+ def test_legval(self):
+ #check empty input
+ assert_equal(leg.legval([], [1]).size, 0)
+
+        #check normal input
+ x = np.linspace(-1, 1)
+ y = [polyval(x, c) for c in Llist]
+ for i in range(10):
+ msg = "At i=%d" % i
+ tgt = y[i]
+ res = leg.legval(x, [0]*i + [1])
+ assert_almost_equal(res, tgt, err_msg=msg)
+
+ #check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(leg.legval(x, [1]).shape, dims)
+ assert_equal(leg.legval(x, [1, 0]).shape, dims)
+ assert_equal(leg.legval(x, [1, 0, 0]).shape, dims)
+
+ def test_legval2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, leg.legval2d, x1, x2[:2], self.c2d)
+
+ #test values
+ tgt = y1*y2
+ res = leg.legval2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = leg.legval2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3))
+
+ def test_legval3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, leg.legval3d, x1, x2, x3[:2], self.c3d)
+
+ #test values
+ tgt = y1*y2*y3
+ res = leg.legval3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = leg.legval3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3))
+
+ def test_leggrid2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j->ij', y1, y2)
+ res = leg.leggrid2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = leg.leggrid2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3)*2)
+
+ def test_leggrid3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j,k->ijk', y1, y2, y3)
+ res = leg.leggrid3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = leg.leggrid3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3)*3)
+
+
+class TestIntegral(object):
+
+ def test_legint(self):
+ # check exceptions
+ assert_raises(ValueError, leg.legint, [0], .5)
+ assert_raises(ValueError, leg.legint, [0], -1)
+ assert_raises(ValueError, leg.legint, [0], 1, [0, 0])
+ assert_raises(ValueError, leg.legint, [0], lbnd=[0])
+ assert_raises(ValueError, leg.legint, [0], scl=[0])
+ assert_raises(ValueError, leg.legint, [0], axis=.5)
+
+ # test integration of zero polynomial
+ for i in range(2, 5):
+ k = [0]*(i - 2) + [1]
+ res = leg.legint([0], m=i, k=k)
+ assert_almost_equal(res, [0, 1])
+
+ # check single integration with integration constant
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [1/scl]
+ legpol = leg.poly2leg(pol)
+ legint = leg.legint(legpol, m=1, k=[i])
+ res = leg.leg2poly(legint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check single integration with integration constant and lbnd
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ legpol = leg.poly2leg(pol)
+ legint = leg.legint(legpol, m=1, k=[i], lbnd=-1)
+ assert_almost_equal(leg.legval(-1, legint), i)
+
+ # check single integration with integration constant and scaling
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [2/scl]
+ legpol = leg.poly2leg(pol)
+ legint = leg.legint(legpol, m=1, k=[i], scl=2)
+ res = leg.leg2poly(legint)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with default k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = leg.legint(tgt, m=1)
+ res = leg.legint(pol, m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with defined k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = leg.legint(tgt, m=1, k=[k])
+ res = leg.legint(pol, m=j, k=list(range(j)))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with lbnd
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = leg.legint(tgt, m=1, k=[k], lbnd=-1)
+ res = leg.legint(pol, m=j, k=list(range(j)), lbnd=-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = leg.legint(tgt, m=1, k=[k], scl=2)
+ res = leg.legint(pol, m=j, k=list(range(j)), scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_legint_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([leg.legint(c) for c in c2d.T]).T
+ res = leg.legint(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([leg.legint(c) for c in c2d])
+ res = leg.legint(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([leg.legint(c, k=3) for c in c2d])
+ res = leg.legint(c2d, k=3, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestDerivative(object):
+
+ def test_legder(self):
+ # check exceptions
+ assert_raises(ValueError, leg.legder, [0], .5)
+ assert_raises(ValueError, leg.legder, [0], -1)
+
+ # check that zeroth derivative does nothing
+ for i in range(5):
+ tgt = [0]*i + [1]
+ res = leg.legder(tgt, m=0)
+ assert_equal(trim(res), trim(tgt))
+
+ # check that derivation is the inverse of integration
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = leg.legder(leg.legint(tgt, m=j), m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check derivation with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = leg.legder(leg.legint(tgt, m=j, scl=2), m=j, scl=.5)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_legder_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([leg.legder(c) for c in c2d.T]).T
+ res = leg.legder(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([leg.legder(c) for c in c2d])
+ res = leg.legder(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestVander(object):
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+
+ def test_legvander(self):
+ # check for 1d x
+ x = np.arange(3)
+ v = leg.legvander(x, 3)
+ assert_(v.shape == (3, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], leg.legval(x, coef))
+
+ # check for 2d x
+ x = np.array([[1, 2], [3, 4], [5, 6]])
+ v = leg.legvander(x, 3)
+ assert_(v.shape == (3, 2, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], leg.legval(x, coef))
+
+ def test_legvander2d(self):
+        # also tests legval2d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3))
+ van = leg.legvander2d(x1, x2, [1, 2])
+ tgt = leg.legval2d(x1, x2, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = leg.legvander2d([x1], [x2], [1, 2])
+ assert_(van.shape == (1, 5, 6))
+
+ def test_legvander3d(self):
+        # also tests legval3d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3, 4))
+ van = leg.legvander3d(x1, x2, x3, [1, 2, 3])
+ tgt = leg.legval3d(x1, x2, x3, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = leg.legvander3d([x1], [x2], [x3], [1, 2, 3])
+ assert_(van.shape == (1, 5, 24))
+
+
+class TestFitting(object):
+
+ def test_legfit(self):
+ def f(x):
+ return x*(x - 1)*(x - 2)
+
+ def f2(x):
+ return x**4 + x**2 + 1
+
+ # Test exceptions
+ assert_raises(ValueError, leg.legfit, [1], [1], -1)
+ assert_raises(TypeError, leg.legfit, [[1]], [1], 0)
+ assert_raises(TypeError, leg.legfit, [], [1], 0)
+ assert_raises(TypeError, leg.legfit, [1], [[[1]]], 0)
+ assert_raises(TypeError, leg.legfit, [1, 2], [1], 0)
+ assert_raises(TypeError, leg.legfit, [1], [1, 2], 0)
+ assert_raises(TypeError, leg.legfit, [1], [1], 0, w=[[1]])
+ assert_raises(TypeError, leg.legfit, [1], [1], 0, w=[1, 1])
+ assert_raises(ValueError, leg.legfit, [1], [1], [-1,])
+ assert_raises(ValueError, leg.legfit, [1], [1], [2, -1, 6])
+ assert_raises(TypeError, leg.legfit, [1], [1], [])
+
+ # Test fit
+ x = np.linspace(0, 2)
+ y = f(x)
+ #
+ coef3 = leg.legfit(x, y, 3)
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(leg.legval(x, coef3), y)
+ coef3 = leg.legfit(x, y, [0, 1, 2, 3])
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(leg.legval(x, coef3), y)
+ #
+ coef4 = leg.legfit(x, y, 4)
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(leg.legval(x, coef4), y)
+ coef4 = leg.legfit(x, y, [0, 1, 2, 3, 4])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(leg.legval(x, coef4), y)
+        # check things still work if deg is not in strictly increasing order
+ coef4 = leg.legfit(x, y, [2, 3, 4, 1, 0])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(leg.legval(x, coef4), y)
+ #
+ coef2d = leg.legfit(x, np.array([y, y]).T, 3)
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ coef2d = leg.legfit(x, np.array([y, y]).T, [0, 1, 2, 3])
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ # test weighting
+ w = np.zeros_like(x)
+ yw = y.copy()
+ w[1::2] = 1
+ y[0::2] = 0
+ wcoef3 = leg.legfit(x, yw, 3, w=w)
+ assert_almost_equal(wcoef3, coef3)
+ wcoef3 = leg.legfit(x, yw, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef3, coef3)
+ #
+ wcoef2d = leg.legfit(x, np.array([yw, yw]).T, 3, w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ wcoef2d = leg.legfit(x, np.array([yw, yw]).T, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ # test scaling with complex x points whose squares sum to zero
+ # (1**2 + 1j**2 + (-1)**2 + (-1j)**2 == 0).
+ x = [1, 1j, -1, -1j]
+ assert_almost_equal(leg.legfit(x, x, 1), [0, 1])
+ assert_almost_equal(leg.legfit(x, x, [0, 1]), [0, 1])
+ # test fitting only even Legendre polynomials
+ x = np.linspace(-1, 1)
+ y = f2(x)
+ coef1 = leg.legfit(x, y, 4)
+ assert_almost_equal(leg.legval(x, coef1), y)
+ coef2 = leg.legfit(x, y, [0, 2, 4])
+ assert_almost_equal(leg.legval(x, coef2), y)
+ assert_almost_equal(coef1, coef2)
+
+
+class TestCompanion(object):
+
+ def test_raises(self):
+ assert_raises(ValueError, leg.legcompanion, [])
+ assert_raises(ValueError, leg.legcompanion, [1])
+
+ def test_dimensions(self):
+ for i in range(1, 5):
+ coef = [0]*i + [1]
+ assert_(leg.legcompanion(coef).shape == (i, i))
+
+ def test_linear_root(self):
+ assert_(leg.legcompanion([1, 2])[0, 0] == -.5)
+
+
+class TestGauss(object):
+
+ def test_100(self):
+ x, w = leg.leggauss(100)
+
+ # test orthogonality. Note that the results need to be normalized,
+ # otherwise the huge values that can arise from fast growing
+ # functions like Laguerre can be very confusing.
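+ # (With 100 Gauss-Legendre nodes the quadrature is exact for
+ # polynomials of degree <= 199, so np.dot(v.T * w, v) is the diagonal
+ # Gram matrix of the Legendre polynomials; normalizing its diagonal
+ # therefore gives the identity.)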
+ v = leg.legvander(x, 99)
+ vv = np.dot(v.T * w, v)
+ vd = 1/np.sqrt(vv.diagonal())
+ vv = vd[:, None] * vv * vd
+ assert_almost_equal(vv, np.eye(100))
+
+ # check that the integral of 1 is correct
+ tgt = 2.0
+ assert_almost_equal(w.sum(), tgt)
+
+
+class TestMisc(object):
+
+ def test_legfromroots(self):
+ res = leg.legfromroots([])
+ assert_almost_equal(trim(res), [1])
+ for i in range(1, 5):
+ roots = np.cos(np.linspace(-np.pi, 0, 2*i + 1)[1::2])
+ pol = leg.legfromroots(roots)
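+ # legfromroots returns the Legendre-series coefficients of the monic
+ # polynomial prod(x - r), so converting to the power basis should give
+ # a leading coefficient of 1 and evaluating at each root should give 0.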
+ res = leg.legval(roots, pol)
+ tgt = 0
+ assert_(len(pol) == i + 1)
+ assert_almost_equal(leg.leg2poly(pol)[-1], 1)
+ assert_almost_equal(res, tgt)
+
+ def test_legroots(self):
+ assert_almost_equal(leg.legroots([1]), [])
+ assert_almost_equal(leg.legroots([1, 2]), [-.5])
+ for i in range(2, 5):
+ tgt = np.linspace(-1, 1, i)
+ res = leg.legroots(leg.legfromroots(tgt))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_legtrim(self):
+ coef = [2, -1, 1, 0]
+
+ # Test exceptions
+ assert_raises(ValueError, leg.legtrim, coef, -1)
+
+ # Test results
+ assert_equal(leg.legtrim(coef), coef[:-1])
+ assert_equal(leg.legtrim(coef, 1), coef[:-3])
+ assert_equal(leg.legtrim(coef, 2), [0])
+
+ def test_legline(self):
+ assert_equal(leg.legline(3, 4), [3, 4])
+
+ def test_leg2poly(self):
+ for i in range(10):
+ assert_almost_equal(leg.leg2poly([0]*i + [1]), Llist[i])
+
+ def test_poly2leg(self):
+ for i in range(10):
+ assert_almost_equal(leg.poly2leg(Llist[i]), [0]*i + [1])
+
+ def test_weight(self):
+ x = np.linspace(-1, 1, 11)
+ tgt = 1.
+ res = leg.legweight(x)
+ assert_almost_equal(res, tgt)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_polynomial.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_polynomial.py
new file mode 100644
index 00000000000..0c93be278ff
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_polynomial.py
@@ -0,0 +1,578 @@
+"""Tests for polynomial module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+from functools import reduce
+
+import numpy as np
+import numpy.polynomial.polynomial as poly
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+
+def trim(x):
+ return poly.polytrim(x, tol=1e-6)
+
+T0 = [1]
+T1 = [0, 1]
+T2 = [-1, 0, 2]
+T3 = [0, -3, 0, 4]
+T4 = [1, 0, -8, 0, 8]
+T5 = [0, 5, 0, -20, 0, 16]
+T6 = [-1, 0, 18, 0, -48, 0, 32]
+T7 = [0, -7, 0, 56, 0, -112, 0, 64]
+T8 = [1, 0, -32, 0, 160, 0, -256, 0, 128]
+T9 = [0, 9, 0, -120, 0, 432, 0, -576, 0, 256]
+
+Tlist = [T0, T1, T2, T3, T4, T5, T6, T7, T8, T9]
+
+
+class TestConstants(object):
+
+ def test_polydomain(self):
+ assert_equal(poly.polydomain, [-1, 1])
+
+ def test_polyzero(self):
+ assert_equal(poly.polyzero, [0])
+
+ def test_polyone(self):
+ assert_equal(poly.polyone, [1])
+
+ def test_polyx(self):
+ assert_equal(poly.polyx, [0, 1])
+
+
+class TestArithmetic(object):
+
+ def test_polyadd(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] += 1
+ res = poly.polyadd([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_polysub(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(max(i, j) + 1)
+ tgt[i] += 1
+ tgt[j] -= 1
+ res = poly.polysub([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_polymulx(self):
+ assert_equal(poly.polymulx([0]), [0])
+ assert_equal(poly.polymulx([1]), [0, 1])
+ for i in range(1, 5):
+ ser = [0]*i + [1]
+ tgt = [0]*(i + 1) + [1]
+ assert_equal(poly.polymulx(ser), tgt)
+
+ def test_polymul(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ tgt = np.zeros(i + j + 1)
+ tgt[i + j] += 1
+ res = poly.polymul([0]*i + [1], [0]*j + [1])
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+ def test_polydiv(self):
+ # check zero division
+ assert_raises(ZeroDivisionError, poly.polydiv, [1], [0])
+
+ # check scalar division
+ quo, rem = poly.polydiv([2], [2])
+ assert_equal((quo, rem), (1, 0))
+ quo, rem = poly.polydiv([2, 2], [2])
+ assert_equal((quo, rem), ((1, 1), 0))
+
+ # check rest.
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ ci = [0]*i + [1, 2]
+ cj = [0]*j + [1, 2]
+ tgt = poly.polyadd(ci, cj)
+ quo, rem = poly.polydiv(tgt, ci)
+ res = poly.polyadd(poly.polymul(quo, ci), rem)
+ assert_equal(res, tgt, err_msg=msg)
+
+ def test_polypow(self):
+ for i in range(5):
+ for j in range(5):
+ msg = "At i=%d, j=%d" % (i, j)
+ c = np.arange(i + 1)
+ tgt = reduce(poly.polymul, [c]*j, np.array([1]))
+ res = poly.polypow(c, j)
+ assert_equal(trim(res), trim(tgt), err_msg=msg)
+
+
+class TestEvaluation(object):
+ # coefficients of 1 + 2*x + 3*x**2
+ c1d = np.array([1., 2., 3.])
+ c2d = np.einsum('i,j->ij', c1d, c1d)
+ c3d = np.einsum('i,j,k->ijk', c1d, c1d, c1d)
+
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+ y = poly.polyval(x, [1., 2., 3.])
+
+ def test_polyval(self):
+ #check empty input
+ assert_equal(poly.polyval([], [1]).size, 0)
+
+ #check normal input
+ x = np.linspace(-1, 1)
+ y = [x**i for i in range(5)]
+ for i in range(5):
+ tgt = y[i]
+ res = poly.polyval(x, [0]*i + [1])
+ assert_almost_equal(res, tgt)
+ tgt = x*(x**2 - 1)
+ res = poly.polyval(x, [0, -1, 0, 1])
+ assert_almost_equal(res, tgt)
+
+ #check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(poly.polyval(x, [1]).shape, dims)
+ assert_equal(poly.polyval(x, [1, 0]).shape, dims)
+ assert_equal(poly.polyval(x, [1, 0, 0]).shape, dims)
+
+ def test_polyvalfromroots(self):
+ # check exception for broadcasting x values over root array with
+ # too few dimensions
+ assert_raises(ValueError, poly.polyvalfromroots,
+ [1], [1], tensor=False)
+
+ # check empty input
+ assert_equal(poly.polyvalfromroots([], [1]).size, 0)
+ assert_(poly.polyvalfromroots([], [1]).shape == (0,))
+
+ # check empty input + multidimensional roots
+ assert_equal(poly.polyvalfromroots([], [[1] * 5]).size, 0)
+ assert_(poly.polyvalfromroots([], [[1] * 5]).shape == (5, 0))
+
+ # check scalar input
+ assert_equal(poly.polyvalfromroots(1, 1), 0)
+ assert_(poly.polyvalfromroots(1, np.ones((3, 3))).shape == (3,))
+
+ # check normal input
+ x = np.linspace(-1, 1)
+ y = [x**i for i in range(5)]
+ for i in range(1, 5):
+ tgt = y[i]
+ res = poly.polyvalfromroots(x, [0]*i)
+ assert_almost_equal(res, tgt)
+ tgt = x*(x - 1)*(x + 1)
+ res = poly.polyvalfromroots(x, [-1, 0, 1])
+ assert_almost_equal(res, tgt)
+
+ # check that shape is preserved
+ for i in range(3):
+ dims = [2]*i
+ x = np.zeros(dims)
+ assert_equal(poly.polyvalfromroots(x, [1]).shape, dims)
+ assert_equal(poly.polyvalfromroots(x, [1, 0]).shape, dims)
+ assert_equal(poly.polyvalfromroots(x, [1, 0, 0]).shape, dims)
+
+ # check compatibility with factorization
+ ptest = [15, 2, -16, -2, 1]
+ r = poly.polyroots(ptest)
+ x = np.linspace(-1, 1)
+ assert_almost_equal(poly.polyval(x, ptest),
+ poly.polyvalfromroots(x, r))
+
+ # check multidimensional arrays of roots and values
+ # check tensor=False
+ rshape = (3, 5)
+ x = np.arange(-3, 2)
+ r = np.random.randint(-5, 5, size=rshape)
+ res = poly.polyvalfromroots(x, r, tensor=False)
+ tgt = np.empty(r.shape[1:])
+ for ii in range(tgt.size):
+ tgt[ii] = poly.polyvalfromroots(x[ii], r[:, ii])
+ assert_equal(res, tgt)
+
+ # check tensor=True
+ x = np.vstack([x, 2*x])
+ res = poly.polyvalfromroots(x, r, tensor=True)
+ tgt = np.empty(r.shape[1:] + x.shape)
+ for ii in range(r.shape[1]):
+ for jj in range(x.shape[0]):
+ tgt[ii, jj, :] = poly.polyvalfromroots(x[jj], r[:, ii])
+ assert_equal(res, tgt)
+
+ def test_polyval2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, poly.polyval2d, x1, x2[:2], self.c2d)
+
+ #test values
+ tgt = y1*y2
+ res = poly.polyval2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = poly.polyval2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3))
+
+ def test_polyval3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test exceptions
+ assert_raises(ValueError, poly.polyval3d, x1, x2, x3[:2], self.c3d)
+
+ #test values
+ tgt = y1*y2*y3
+ res = poly.polyval3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = poly.polyval3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3))
+
+ def test_polygrid2d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j->ij', y1, y2)
+ res = poly.polygrid2d(x1, x2, self.c2d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = poly.polygrid2d(z, z, self.c2d)
+ assert_(res.shape == (2, 3)*2)
+
+ def test_polygrid3d(self):
+ x1, x2, x3 = self.x
+ y1, y2, y3 = self.y
+
+ #test values
+ tgt = np.einsum('i,j,k->ijk', y1, y2, y3)
+ res = poly.polygrid3d(x1, x2, x3, self.c3d)
+ assert_almost_equal(res, tgt)
+
+ #test shape
+ z = np.ones((2, 3))
+ res = poly.polygrid3d(z, z, z, self.c3d)
+ assert_(res.shape == (2, 3)*3)
+
+
+class TestIntegral(object):
+
+ def test_polyint(self):
+ # check exceptions
+ assert_raises(ValueError, poly.polyint, [0], .5)
+ assert_raises(ValueError, poly.polyint, [0], -1)
+ assert_raises(ValueError, poly.polyint, [0], 1, [0, 0])
+ assert_raises(ValueError, poly.polyint, [0], lbnd=[0])
+ assert_raises(ValueError, poly.polyint, [0], scl=[0])
+ assert_raises(ValueError, poly.polyint, [0], axis=.5)
+
+ # test integration of zero polynomial
+ for i in range(2, 5):
+ k = [0]*(i - 2) + [1]
+ res = poly.polyint([0], m=i, k=k)
+ assert_almost_equal(res, [0, 1])
+
+ # check single integration with integration constant
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [1/scl]
+ res = poly.polyint(pol, m=1, k=[i])
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check single integration with integration constant and lbnd
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ res = poly.polyint(pol, m=1, k=[i], lbnd=-1)
+ assert_almost_equal(poly.polyval(-1, res), i)
+
+ # check single integration with integration constant and scaling
+ for i in range(5):
+ scl = i + 1
+ pol = [0]*i + [1]
+ tgt = [i] + [0]*i + [2/scl]
+ res = poly.polyint(pol, m=1, k=[i], scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with default k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = poly.polyint(tgt, m=1)
+ res = poly.polyint(pol, m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with defined k
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = poly.polyint(tgt, m=1, k=[k])
+ res = poly.polyint(pol, m=j, k=list(range(j)))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with lbnd
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = poly.polyint(tgt, m=1, k=[k], lbnd=-1)
+ res = poly.polyint(pol, m=j, k=list(range(j)), lbnd=-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check multiple integrations with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ pol = [0]*i + [1]
+ tgt = pol[:]
+ for k in range(j):
+ tgt = poly.polyint(tgt, m=1, k=[k], scl=2)
+ res = poly.polyint(pol, m=j, k=list(range(j)), scl=2)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_polyint_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([poly.polyint(c) for c in c2d.T]).T
+ res = poly.polyint(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([poly.polyint(c) for c in c2d])
+ res = poly.polyint(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([poly.polyint(c, k=3) for c in c2d])
+ res = poly.polyint(c2d, k=3, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestDerivative(object):
+
+ def test_polyder(self):
+ # check exceptions
+ assert_raises(ValueError, poly.polyder, [0], .5)
+ assert_raises(ValueError, poly.polyder, [0], -1)
+
+ # check that zeroth derivative does nothing
+ for i in range(5):
+ tgt = [0]*i + [1]
+ res = poly.polyder(tgt, m=0)
+ assert_equal(trim(res), trim(tgt))
+
+ # check that derivation is the inverse of integration
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = poly.polyder(poly.polyint(tgt, m=j), m=j)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ # check derivation with scaling
+ for i in range(5):
+ for j in range(2, 5):
+ tgt = [0]*i + [1]
+ res = poly.polyder(poly.polyint(tgt, m=j, scl=2), m=j, scl=.5)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_polyder_axis(self):
+ # check that axis keyword works
+ c2d = np.random.random((3, 4))
+
+ tgt = np.vstack([poly.polyder(c) for c in c2d.T]).T
+ res = poly.polyder(c2d, axis=0)
+ assert_almost_equal(res, tgt)
+
+ tgt = np.vstack([poly.polyder(c) for c in c2d])
+ res = poly.polyder(c2d, axis=1)
+ assert_almost_equal(res, tgt)
+
+
+class TestVander(object):
+ # some random values in [-1, 1)
+ x = np.random.random((3, 5))*2 - 1
+
+ def test_polyvander(self):
+ # check for 1d x
+ x = np.arange(3)
+ v = poly.polyvander(x, 3)
+ assert_(v.shape == (3, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], poly.polyval(x, coef))
+
+ # check for 2d x
+ x = np.array([[1, 2], [3, 4], [5, 6]])
+ v = poly.polyvander(x, 3)
+ assert_(v.shape == (3, 2, 4))
+ for i in range(4):
+ coef = [0]*i + [1]
+ assert_almost_equal(v[..., i], poly.polyval(x, coef))
+
+ def test_polyvander2d(self):
+ # also tests polyval2d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3))
+ van = poly.polyvander2d(x1, x2, [1, 2])
+ tgt = poly.polyval2d(x1, x2, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = poly.polyvander2d([x1], [x2], [1, 2])
+ assert_(van.shape == (1, 5, 6))
+
+ def test_polyvander3d(self):
+ # also tests polyval3d for non-square coefficient array
+ x1, x2, x3 = self.x
+ c = np.random.random((2, 3, 4))
+ van = poly.polyvander3d(x1, x2, x3, [1, 2, 3])
+ tgt = poly.polyval3d(x1, x2, x3, c)
+ res = np.dot(van, c.flat)
+ assert_almost_equal(res, tgt)
+
+ # check shape
+ van = poly.polyvander3d([x1], [x2], [x3], [1, 2, 3])
+ assert_(van.shape == (1, 5, 24))
+
+
+class TestCompanion(object):
+
+ def test_raises(self):
+ assert_raises(ValueError, poly.polycompanion, [])
+ assert_raises(ValueError, poly.polycompanion, [1])
+
+ def test_dimensions(self):
+ for i in range(1, 5):
+ coef = [0]*i + [1]
+ assert_(poly.polycompanion(coef).shape == (i, i))
+
+ def test_linear_root(self):
+ assert_(poly.polycompanion([1, 2])[0, 0] == -.5)
+
+
+class TestMisc(object):
+
+ def test_polyfromroots(self):
+ res = poly.polyfromroots([])
+ assert_almost_equal(trim(res), [1])
+ for i in range(1, 5):
+ roots = np.cos(np.linspace(-np.pi, 0, 2*i + 1)[1::2])
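+ # these roots are the Chebyshev nodes of degree i, and T_i has leading
+ # coefficient 2**(i-1), so the monic polynomial built from the roots,
+ # scaled by 2**(i-1), should reproduce the power-basis coefficients in
+ # Tlist.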
+ tgt = Tlist[i]
+ res = poly.polyfromroots(roots)*2**(i-1)
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_polyroots(self):
+ assert_almost_equal(poly.polyroots([1]), [])
+ assert_almost_equal(poly.polyroots([1, 2]), [-.5])
+ for i in range(2, 5):
+ tgt = np.linspace(-1, 1, i)
+ res = poly.polyroots(poly.polyfromroots(tgt))
+ assert_almost_equal(trim(res), trim(tgt))
+
+ def test_polyfit(self):
+ def f(x):
+ return x*(x - 1)*(x - 2)
+
+ def f2(x):
+ return x**4 + x**2 + 1
+
+ # Test exceptions
+ assert_raises(ValueError, poly.polyfit, [1], [1], -1)
+ assert_raises(TypeError, poly.polyfit, [[1]], [1], 0)
+ assert_raises(TypeError, poly.polyfit, [], [1], 0)
+ assert_raises(TypeError, poly.polyfit, [1], [[[1]]], 0)
+ assert_raises(TypeError, poly.polyfit, [1, 2], [1], 0)
+ assert_raises(TypeError, poly.polyfit, [1], [1, 2], 0)
+ assert_raises(TypeError, poly.polyfit, [1], [1], 0, w=[[1]])
+ assert_raises(TypeError, poly.polyfit, [1], [1], 0, w=[1, 1])
+ assert_raises(ValueError, poly.polyfit, [1], [1], [-1,])
+ assert_raises(ValueError, poly.polyfit, [1], [1], [2, -1, 6])
+ assert_raises(TypeError, poly.polyfit, [1], [1], [])
+
+ # Test fit
+ x = np.linspace(0, 2)
+ y = f(x)
+ #
+ coef3 = poly.polyfit(x, y, 3)
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(poly.polyval(x, coef3), y)
+ coef3 = poly.polyfit(x, y, [0, 1, 2, 3])
+ assert_equal(len(coef3), 4)
+ assert_almost_equal(poly.polyval(x, coef3), y)
+ #
+ coef4 = poly.polyfit(x, y, 4)
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(poly.polyval(x, coef4), y)
+ coef4 = poly.polyfit(x, y, [0, 1, 2, 3, 4])
+ assert_equal(len(coef4), 5)
+ assert_almost_equal(poly.polyval(x, coef4), y)
+ #
+ coef2d = poly.polyfit(x, np.array([y, y]).T, 3)
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ coef2d = poly.polyfit(x, np.array([y, y]).T, [0, 1, 2, 3])
+ assert_almost_equal(coef2d, np.array([coef3, coef3]).T)
+ # test weighting
+ w = np.zeros_like(x)
+ yw = y.copy()
+ w[1::2] = 1
+ yw[0::2] = 0
+ wcoef3 = poly.polyfit(x, yw, 3, w=w)
+ assert_almost_equal(wcoef3, coef3)
+ wcoef3 = poly.polyfit(x, yw, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef3, coef3)
+ #
+ wcoef2d = poly.polyfit(x, np.array([yw, yw]).T, 3, w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ wcoef2d = poly.polyfit(x, np.array([yw, yw]).T, [0, 1, 2, 3], w=w)
+ assert_almost_equal(wcoef2d, np.array([coef3, coef3]).T)
+ # test scaling with complex x points whose squares sum to zero
+ # (1**2 + 1j**2 + (-1)**2 + (-1j)**2 == 0).
+ x = [1, 1j, -1, -1j]
+ assert_almost_equal(poly.polyfit(x, x, 1), [0, 1])
+ assert_almost_equal(poly.polyfit(x, x, [0, 1]), [0, 1])
+ # test fitting only even polynomial terms
+ x = np.linspace(-1, 1)
+ y = f2(x)
+ coef1 = poly.polyfit(x, y, 4)
+ assert_almost_equal(poly.polyval(x, coef1), y)
+ coef2 = poly.polyfit(x, y, [0, 2, 4])
+ assert_almost_equal(poly.polyval(x, coef2), y)
+ assert_almost_equal(coef1, coef2)
+
+ def test_polytrim(self):
+ coef = [2, -1, 1, 0]
+
+ # Test exceptions
+ assert_raises(ValueError, poly.polytrim, coef, -1)
+
+ # Test results
+ assert_equal(poly.polytrim(coef), coef[:-1])
+ assert_equal(poly.polytrim(coef, 1), coef[:-3])
+ assert_equal(poly.polytrim(coef, 2), [0])
+
+ def test_polyline(self):
+ assert_equal(poly.polyline(3, 4), [3, 4])
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_polyutils.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_polyutils.py
new file mode 100644
index 00000000000..801c558ccca
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_polyutils.py
@@ -0,0 +1,108 @@
+"""Tests for polyutils module.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+import numpy.polynomial.polyutils as pu
+from numpy.testing import (
+ assert_almost_equal, assert_raises, assert_equal, assert_,
+ )
+
+
+class TestMisc(object):
+
+ def test_trimseq(self):
+ for i in range(5):
+ tgt = [1]
+ res = pu.trimseq([1] + [0]*i)
+ assert_equal(res, tgt)
+
+ def test_as_series(self):
+ # check exceptions
+ assert_raises(ValueError, pu.as_series, [[]])
+ assert_raises(ValueError, pu.as_series, [[[1, 2]]])
+ assert_raises(ValueError, pu.as_series, [[1], ['a']])
+ # check common types
+ types = ['i', 'd', 'O']
+ for i in range(len(types)):
+ for j in range(i):
+ ci = np.ones(1, types[i])
+ cj = np.ones(1, types[j])
+ [resi, resj] = pu.as_series([ci, cj])
+ assert_(resi.dtype.char == resj.dtype.char)
+ assert_(resj.dtype.char == types[i])
+
+ def test_trimcoef(self):
+ coef = [2, -1, 1, 0]
+ # Test exceptions
+ assert_raises(ValueError, pu.trimcoef, coef, -1)
+ # Test results
+ assert_equal(pu.trimcoef(coef), coef[:-1])
+ assert_equal(pu.trimcoef(coef, 1), coef[:-3])
+ assert_equal(pu.trimcoef(coef, 2), [0])
+
+
+class TestDomain(object):
+
+ def test_getdomain(self):
+ # test for real values
+ x = [1, 10, 3, -1]
+ tgt = [-1, 10]
+ res = pu.getdomain(x)
+ assert_almost_equal(res, tgt)
+
+ # test for complex values
+ x = [1 + 1j, 1 - 1j, 0, 2]
+ tgt = [-1j, 2 + 1j]
+ res = pu.getdomain(x)
+ assert_almost_equal(res, tgt)
+
+ def test_mapdomain(self):
+ # test for real values
+ dom1 = [0, 4]
+ dom2 = [1, 3]
+ tgt = dom2
+ res = pu.mapdomain(dom1, dom1, dom2)
+ assert_almost_equal(res, tgt)
+
+ # test for complex values
+ dom1 = [0 - 1j, 2 + 1j]
+ dom2 = [-2, 2]
+ tgt = dom2
+ x = dom1
+ res = pu.mapdomain(x, dom1, dom2)
+ assert_almost_equal(res, tgt)
+
+ # test for multidimensional arrays
+ dom1 = [0, 4]
+ dom2 = [1, 3]
+ tgt = np.array([dom2, dom2])
+ x = np.array([dom1, dom1])
+ res = pu.mapdomain(x, dom1, dom2)
+ assert_almost_equal(res, tgt)
+
+ # test that subtypes are preserved.
+ class MyNDArray(np.ndarray):
+ pass
+
+ dom1 = [0, 4]
+ dom2 = [1, 3]
+ x = np.array([dom1, dom1]).view(MyNDArray)
+ res = pu.mapdomain(x, dom1, dom2)
+ assert_(isinstance(res, MyNDArray))
+
+ def test_mapparms(self):
+ # test for real values
+ dom1 = [0, 4]
+ dom2 = [1, 3]
+ tgt = [1, .5]
+ res = pu.mapparms(dom1, dom2)
+ assert_almost_equal(res, tgt)
+
+ # test for complex values
+ dom1 = [0 - 1j, 2 + 1j]
+ dom2 = [-2, 2]
+ tgt = [-1 + 1j, 1 - 1j]
+ res = pu.mapparms(dom1, dom2)
+ assert_almost_equal(res, tgt)
diff --git a/contrib/python/numpy/py2/numpy/polynomial/tests/test_printing.py b/contrib/python/numpy/py2/numpy/polynomial/tests/test_printing.py
new file mode 100644
index 00000000000..3f123640222
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/polynomial/tests/test_printing.py
@@ -0,0 +1,68 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy.polynomial as poly
+from numpy.testing import assert_equal
+
+
+class TestStr(object):
+ def test_polynomial_str(self):
+ res = str(poly.Polynomial([0, 1]))
+ tgt = 'poly([0. 1.])'
+ assert_equal(res, tgt)
+
+ def test_chebyshev_str(self):
+ res = str(poly.Chebyshev([0, 1]))
+ tgt = 'cheb([0. 1.])'
+ assert_equal(res, tgt)
+
+ def test_legendre_str(self):
+ res = str(poly.Legendre([0, 1]))
+ tgt = 'leg([0. 1.])'
+ assert_equal(res, tgt)
+
+ def test_hermite_str(self):
+ res = str(poly.Hermite([0, 1]))
+ tgt = 'herm([0. 1.])'
+ assert_equal(res, tgt)
+
+ def test_hermiteE_str(self):
+ res = str(poly.HermiteE([0, 1]))
+ tgt = 'herme([0. 1.])'
+ assert_equal(res, tgt)
+
+ def test_laguerre_str(self):
+ res = str(poly.Laguerre([0, 1]))
+ tgt = 'lag([0. 1.])'
+ assert_equal(res, tgt)
+
+
+class TestRepr(object):
+ def test_polynomial_str(self):
+ res = repr(poly.Polynomial([0, 1]))
+ tgt = 'Polynomial([0., 1.], domain=[-1, 1], window=[-1, 1])'
+ assert_equal(res, tgt)
+
+ def test_chebyshev_str(self):
+ res = repr(poly.Chebyshev([0, 1]))
+ tgt = 'Chebyshev([0., 1.], domain=[-1, 1], window=[-1, 1])'
+ assert_equal(res, tgt)
+
+ def test_legendre_repr(self):
+ res = repr(poly.Legendre([0, 1]))
+ tgt = 'Legendre([0., 1.], domain=[-1, 1], window=[-1, 1])'
+ assert_equal(res, tgt)
+
+ def test_hermite_repr(self):
+ res = repr(poly.Hermite([0, 1]))
+ tgt = 'Hermite([0., 1.], domain=[-1, 1], window=[-1, 1])'
+ assert_equal(res, tgt)
+
+ def test_hermiteE_repr(self):
+ res = repr(poly.HermiteE([0, 1]))
+ tgt = 'HermiteE([0., 1.], domain=[-1, 1], window=[-1, 1])'
+ assert_equal(res, tgt)
+
+ def test_laguerre_repr(self):
+ res = repr(poly.Laguerre([0, 1]))
+ tgt = 'Laguerre([0., 1.], domain=[0, 1], window=[0, 1])'
+ assert_equal(res, tgt)
diff --git a/contrib/python/numpy/py2/numpy/random/mtrand/generate_mtrand_c.py b/contrib/python/numpy/py2/numpy/random/mtrand/generate_mtrand_c.py
new file mode 100644
index 00000000000..ec935e6ddf0
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/random/mtrand/generate_mtrand_c.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+from __future__ import division, absolute_import, print_function
+
+import sys
+import re
+import os
+
+unused_internal_funcs = ['__Pyx_PrintItem',
+ '__Pyx_PrintNewline',
+ '__Pyx_ReRaise',
+ #'__Pyx_GetExcValue',
+ '__Pyx_ArgTypeTest',
+ '__Pyx_SetVtable',
+ '__Pyx_GetVtable',
+ '__Pyx_CreateClass']
+
+if __name__ == '__main__':
+ # Use cython here so that long docstrings are broken up.
+ # This is needed for some VC++ compilers.
+ os.system('cython mtrand.pyx')
+ mtrand_c = open('mtrand.c', 'r')
+ processed = open('mtrand_pp.c', 'w')
+ unused_funcs_str = '(' + '|'.join(unused_internal_funcs) + ')'
+ uifpat = re.compile(r'static \w+ \*?'+unused_funcs_str+r'.*/\*proto\*/')
+ linepat = re.compile(r'/\* ".*/mtrand.pyx":')
+ for linenum, line in enumerate(mtrand_c):
+ m = re.match(r'^(\s+arrayObject\w*\s*=\s*[(])[(]PyObject\s*[*][)]',
+ line)
+ if m:
+ line = '%s(PyArrayObject *)%s' % (m.group(1), line[m.end():])
+ m = uifpat.match(line)
+ if m:
+ line = ''
+ m = re.search(unused_funcs_str, line)
+ if m:
+ print("%s was declared unused, but is used at line %d" % (m.group(),
+ linenum+1), file=sys.stderr)
+ line = linepat.sub(r'/* "mtrand.pyx":', line)
+ processed.write(line)
+ mtrand_c.close()
+ processed.close()
+ os.rename('mtrand_pp.c', 'mtrand.c')
diff --git a/contrib/python/numpy/py2/numpy/random/mtrand/randint_helpers.pxi.in b/contrib/python/numpy/py2/numpy/random/mtrand/randint_helpers.pxi.in
new file mode 100644
index 00000000000..894a25167f5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/random/mtrand/randint_helpers.pxi.in
@@ -0,0 +1,77 @@
+"""
+Template for each `dtype` helper function in `np.random.randint`.
+"""
+
+{{py:
+
+dtypes = (
+ ('bool', 'bool', 'bool_'),
+ ('int8', 'uint8', 'int8'),
+ ('int16', 'uint16', 'int16'),
+ ('int32', 'uint32', 'int32'),
+ ('int64', 'uint64', 'int64'),
+ ('uint8', 'uint8', 'uint8'),
+ ('uint16', 'uint16', 'uint16'),
+ ('uint32', 'uint32', 'uint32'),
+ ('uint64', 'uint64', 'uint64'),
+)
+
+def get_dispatch(dtypes):
+ for npy_dt, npy_udt, np_dt in dtypes:
+ yield npy_dt, npy_udt, np_dt
+}}
+
+{{for npy_dt, npy_udt, np_dt in get_dispatch(dtypes)}}
+
+def _rand_{{npy_dt}}(npy_{{npy_dt}} low, npy_{{npy_dt}} high, size, rngstate):
+ """
+ _rand_{{npy_dt}}(low, high, size, rngstate)
+
+ Return random np.{{np_dt}} integers between ``low`` and ``high``, inclusive.
+
+ Return random integers from the "discrete uniform" distribution in the
+ closed interval [``low``, ``high``]. On entry the arguments are presumed
+ to have been validated for size and order for the np.{{np_dt}} type.
+
+ Parameters
+ ----------
+ low : int
+ Lowest (signed) integer to be drawn from the distribution.
+ high : int
+ Highest (signed) integer to be drawn from the distribution.
+ size : int or tuple of ints
+ Output shape. If the given shape is, e.g., ``(m, n, k)``, then
+ ``m * n * k`` samples are drawn. Default is None, in which case a
+ single value is returned.
+ rngstate : encapsulated pointer to rk_state
+ The specific type depends on the python version. In Python 2 it is
+ a PyCObject, in Python 3 a PyCapsule object.
+
+ Returns
+ -------
+ out : python integer or ndarray of np.{{np_dt}}
+ `size`-shaped array of random integers from the appropriate
+ distribution, or a single such random int if `size` not provided.
+
+ """
+ cdef npy_{{npy_udt}} off, rng, buf
+ cdef npy_{{npy_udt}} *out
+ cdef ndarray array "arrayObject"
+ cdef npy_intp cnt
+ cdef rk_state *state = <rk_state *>PyCapsule_GetPointer(rngstate, NULL)
+
+ off = <npy_{{npy_udt}}>(low)
+ rng = <npy_{{npy_udt}}>(high) - <npy_{{npy_udt}}>(low)
+
+ if size is None:
+ rk_random_{{npy_udt}}(off, rng, 1, &buf, state)
+ return np.{{np_dt}}(<npy_{{npy_dt}}>buf)
+ else:
+ array = <ndarray>np.empty(size, np.{{np_dt}})
+ cnt = PyArray_SIZE(array)
+ array_data = <npy_{{npy_udt}} *>PyArray_DATA(array)
+ with nogil:
+ rk_random_{{npy_udt}}(off, rng, cnt, array_data, state)
+ return array
+
+{{endfor}}
diff --git a/contrib/python/numpy/py2/numpy/random/setup.py b/contrib/python/numpy/py2/numpy/random/setup.py
new file mode 100644
index 00000000000..394a70ead37
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/random/setup.py
@@ -0,0 +1,63 @@
+from __future__ import division, print_function
+
+from os.path import join
+import sys
+from distutils.dep_util import newer
+from distutils.msvccompiler import get_build_version as get_msvc_build_version
+
+def needs_mingw_ftime_workaround():
+ # We need the mingw workaround for _ftime if the msvc runtime version is
+ # 7.1 or above and we build with mingw ...
+ # ... but we can't easily detect compiler version outside distutils command
+ # context, so we will need to detect in randomkit whether we build with gcc
+ msver = get_msvc_build_version()
+ if msver and msver >= 8:
+ return True
+
+ return False
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration, get_mathlibs
+ config = Configuration('random', parent_package, top_path)
+
+ def generate_libraries(ext, build_dir):
+ config_cmd = config.get_config_cmd()
+ libs = get_mathlibs()
+ if sys.platform == 'win32':
+ libs.append('Advapi32')
+ ext.libraries.extend(libs)
+ return None
+
+ # enable unix large file support on 32 bit systems
+ # (64 bit off_t, lseek -> lseek64 etc.)
+ if sys.platform[:3] == "aix":
+ defs = [('_LARGE_FILES', None)]
+ else:
+ defs = [('_FILE_OFFSET_BITS', '64'),
+ ('_LARGEFILE_SOURCE', '1'),
+ ('_LARGEFILE64_SOURCE', '1')]
+ if needs_mingw_ftime_workaround():
+ defs.append(("NPY_NEEDS_MINGW_TIME_WORKAROUND", None))
+
+ libs = []
+ # Configure mtrand
+ config.add_extension('mtrand',
+ sources=[join('mtrand', x) for x in
+ ['mtrand.c', 'randomkit.c', 'initarray.c',
+ 'distributions.c']]+[generate_libraries],
+ libraries=libs,
+ depends=[join('mtrand', '*.h'),
+ join('mtrand', '*.pyx'),
+ join('mtrand', '*.pxi'),],
+ define_macros=defs,
+ )
+
+ config.add_data_files(('.', join('mtrand', 'randomkit.h')))
+ config.add_data_dir('tests')
+
+ return config
+
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(configuration=configuration)
diff --git a/contrib/python/numpy/py2/numpy/random/tests/__init__.py b/contrib/python/numpy/py2/numpy/random/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/random/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/random/tests/test_random.py b/contrib/python/numpy/py2/numpy/random/tests/test_random.py
new file mode 100644
index 00000000000..4529b4fbd74
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/random/tests/test_random.py
@@ -0,0 +1,1663 @@
+from __future__ import division, absolute_import, print_function
+import warnings
+
+import numpy as np
+from numpy.testing import (
+ assert_, assert_raises, assert_equal, assert_warns,
+ assert_no_warnings, assert_array_equal, assert_array_almost_equal,
+ suppress_warnings
+ )
+from numpy import random
+import sys
+
+
+class TestSeed(object):
+ def test_scalar(self):
+ s = np.random.RandomState(0)
+ assert_equal(s.randint(1000), 684)
+ s = np.random.RandomState(4294967295)
+ assert_equal(s.randint(1000), 419)
+
+ def test_array(self):
+ s = np.random.RandomState(range(10))
+ assert_equal(s.randint(1000), 468)
+ s = np.random.RandomState(np.arange(10))
+ assert_equal(s.randint(1000), 468)
+ s = np.random.RandomState([0])
+ assert_equal(s.randint(1000), 973)
+ s = np.random.RandomState([4294967295])
+ assert_equal(s.randint(1000), 265)
+
+ def test_invalid_scalar(self):
+ # seed must be an unsigned 32 bit integer
+ assert_raises(TypeError, np.random.RandomState, -0.5)
+ assert_raises(ValueError, np.random.RandomState, -1)
+
+ def test_invalid_array(self):
+ # seed must be an unsigned 32 bit integer
+ assert_raises(TypeError, np.random.RandomState, [-0.5])
+ assert_raises(ValueError, np.random.RandomState, [-1])
+ assert_raises(ValueError, np.random.RandomState, [4294967296])
+ assert_raises(ValueError, np.random.RandomState, [1, 2, 4294967296])
+ assert_raises(ValueError, np.random.RandomState, [1, -2, 4294967296])
+
+ def test_invalid_array_shape(self):
+ # gh-9832
+ assert_raises(ValueError, np.random.RandomState, np.array([], dtype=np.int64))
+ assert_raises(ValueError, np.random.RandomState, [[1, 2, 3]])
+ assert_raises(ValueError, np.random.RandomState, [[1, 2, 3],
+ [4, 5, 6]])
+
+
+class TestBinomial(object):
+ def test_n_zero(self):
+ # Tests the corner case of n == 0 for the binomial distribution.
+ # binomial(0, p) should be zero for any p in [0, 1].
+ # This test addresses issue #3480.
+ zeros = np.zeros(2, dtype='int')
+ for p in [0, .5, 1]:
+ assert_(random.binomial(0, p) == 0)
+ assert_array_equal(random.binomial(zeros, p), zeros)
+
+ def test_p_is_nan(self):
+ # Issue #4571.
+ assert_raises(ValueError, random.binomial, 1, np.nan)
+
+
+class TestMultinomial(object):
+ def test_basic(self):
+ random.multinomial(100, [0.2, 0.8])
+
+ def test_zero_probability(self):
+ random.multinomial(100, [0.2, 0.8, 0.0, 0.0, 0.0])
+
+ def test_int_negative_interval(self):
+ assert_(-5 <= random.randint(-5, -1) < -1)
+ x = random.randint(-5, -1, 5)
+ assert_(np.all(-5 <= x))
+ assert_(np.all(x < -1))
+
+ def test_size(self):
+ # gh-3173
+ p = [0.5, 0.5]
+ assert_equal(np.random.multinomial(1, p, np.uint32(1)).shape, (1, 2))
+ assert_equal(np.random.multinomial(1, p, np.uint32(1)).shape, (1, 2))
+ assert_equal(np.random.multinomial(1, p, np.uint32(1)).shape, (1, 2))
+ assert_equal(np.random.multinomial(1, p, [2, 2]).shape, (2, 2, 2))
+ assert_equal(np.random.multinomial(1, p, (2, 2)).shape, (2, 2, 2))
+ assert_equal(np.random.multinomial(1, p, np.array((2, 2))).shape,
+ (2, 2, 2))
+
+ assert_raises(TypeError, np.random.multinomial, 1, p,
+ float(1))
+
+
+class TestSetState(object):
+ def setup(self):
+ self.seed = 1234567890
+ self.prng = random.RandomState(self.seed)
+ self.state = self.prng.get_state()
+
+ def test_basic(self):
+ old = self.prng.tomaxint(16)
+ self.prng.set_state(self.state)
+ new = self.prng.tomaxint(16)
+ assert_(np.all(old == new))
+
+ def test_gaussian_reset(self):
+ # Make sure the cached every-other-Gaussian is reset.
+ old = self.prng.standard_normal(size=3)
+ self.prng.set_state(self.state)
+ new = self.prng.standard_normal(size=3)
+ assert_(np.all(old == new))
+
+ def test_gaussian_reset_in_media_res(self):
+ # When the state is saved with a cached Gaussian, make sure the
+ # cached Gaussian is restored.
+
+ self.prng.standard_normal()
+ state = self.prng.get_state()
+ old = self.prng.standard_normal(size=3)
+ self.prng.set_state(state)
+ new = self.prng.standard_normal(size=3)
+ assert_(np.all(old == new))
+
+ def test_backwards_compatibility(self):
+ # Make sure we can accept old state tuples that do not have the
+ # cached Gaussian value.
+ old_state = self.state[:-2]
+ x1 = self.prng.standard_normal(size=16)
+ self.prng.set_state(old_state)
+ x2 = self.prng.standard_normal(size=16)
+ self.prng.set_state(self.state)
+ x3 = self.prng.standard_normal(size=16)
+ assert_(np.all(x1 == x2))
+ assert_(np.all(x1 == x3))
+
+ def test_negative_binomial(self):
+ # Ensure that the negative binomial results take floating point
+ # arguments without truncation.
+ self.prng.negative_binomial(0.5, 0.5)
+
+
+class TestRandint(object):
+
+ rfunc = np.random.randint
+
+ # valid integer/boolean types
+ itype = [np.bool_, np.int8, np.uint8, np.int16, np.uint16,
+ np.int32, np.uint32, np.int64, np.uint64]
+
+ def test_unsupported_type(self):
+ assert_raises(TypeError, self.rfunc, 1, dtype=float)
+
+ def test_bounds_checking(self):
+ for dt in self.itype:
+ lbnd = 0 if dt is np.bool_ else np.iinfo(dt).min
+ ubnd = 2 if dt is np.bool_ else np.iinfo(dt).max + 1
+ assert_raises(ValueError, self.rfunc, lbnd - 1, ubnd, dtype=dt)
+ assert_raises(ValueError, self.rfunc, lbnd, ubnd + 1, dtype=dt)
+ assert_raises(ValueError, self.rfunc, ubnd, lbnd, dtype=dt)
+ assert_raises(ValueError, self.rfunc, 1, 0, dtype=dt)
+
+ def test_rng_zero_and_extremes(self):
+ for dt in self.itype:
+ lbnd = 0 if dt is np.bool_ else np.iinfo(dt).min
+ ubnd = 2 if dt is np.bool_ else np.iinfo(dt).max + 1
+
+ tgt = ubnd - 1
+ assert_equal(self.rfunc(tgt, tgt + 1, size=1000, dtype=dt), tgt)
+
+ tgt = lbnd
+ assert_equal(self.rfunc(tgt, tgt + 1, size=1000, dtype=dt), tgt)
+
+ tgt = (lbnd + ubnd)//2
+ assert_equal(self.rfunc(tgt, tgt + 1, size=1000, dtype=dt), tgt)
+
+ def test_full_range(self):
+ # Test for ticket #1690
+
+ for dt in self.itype:
+ lbnd = 0 if dt is np.bool_ else np.iinfo(dt).min
+ ubnd = 2 if dt is np.bool_ else np.iinfo(dt).max + 1
+
+ try:
+ self.rfunc(lbnd, ubnd, dtype=dt)
+ except Exception as e:
+ raise AssertionError("No error should have been raised, "
+ "but one was with the following "
+ "message:\n\n%s" % str(e))
+
+ def test_in_bounds_fuzz(self):
+ # Don't use fixed seed
+ np.random.seed()
+
+ for dt in self.itype[1:]:
+ for ubnd in [4, 8, 16]:
+ vals = self.rfunc(2, ubnd, size=2**16, dtype=dt)
+ assert_(vals.max() < ubnd)
+ assert_(vals.min() >= 2)
+
+ vals = self.rfunc(0, 2, size=2**16, dtype=np.bool_)
+
+ assert_(vals.max() < 2)
+ assert_(vals.min() >= 0)
+
+ def test_repeatability(self):
+ import hashlib
+ # We use an md5 hash of generated sequences of 1000 samples
+ # in the range [0, 6) for all but bool, where the range
+ # is [0, 2). Hashes are computed for little-endian byte order.
+ tgt = {'bool': '7dd3170d7aa461d201a65f8bcf3944b0',
+ 'int16': '1b7741b80964bb190c50d541dca1cac1',
+ 'int32': '4dc9fcc2b395577ebb51793e58ed1a05',
+ 'int64': '17db902806f448331b5a758d7d2ee672',
+ 'int8': '27dd30c4e08a797063dffac2490b0be6',
+ 'uint16': '1b7741b80964bb190c50d541dca1cac1',
+ 'uint32': '4dc9fcc2b395577ebb51793e58ed1a05',
+ 'uint64': '17db902806f448331b5a758d7d2ee672',
+ 'uint8': '27dd30c4e08a797063dffac2490b0be6'}
+
+ for dt in self.itype[1:]:
+ np.random.seed(1234)
+
+ # view as little endian for hash
+ if sys.byteorder == 'little':
+ val = self.rfunc(0, 6, size=1000, dtype=dt)
+ else:
+ val = self.rfunc(0, 6, size=1000, dtype=dt).byteswap()
+
+ res = hashlib.md5(val.view(np.int8)).hexdigest()
+ assert_(tgt[np.dtype(dt).name] == res)
+
+ # bools do not depend on endianness
+ np.random.seed(1234)
+ val = self.rfunc(0, 2, size=1000, dtype=bool).view(np.int8)
+ res = hashlib.md5(val).hexdigest()
+ assert_(tgt[np.dtype(bool).name] == res)
+
+ def test_int64_uint64_corner_case(self):
+ # When stored in Numpy arrays, `lbnd` is cast to np.int64 and
+ # `ubnd` is cast to np.uint64. Checking whether `lbnd` >= `ubnd`
+ # used to be done solely via direct comparison, which is incorrect
+ # because when Numpy compares the two numbers it casts both to
+ # np.float64, as there is no integer type that can hold both
+ # np.int64 and np.uint64. However, `ubnd` is too large to be
+ # represented exactly in np.float64, causing it to be rounded down
+ # to np.iinfo(np.int64).max, so `lbnd` then equaled the new `ubnd`
+ # and a spurious ValueError was raised.
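+ # For illustration: float(2**63 - 1) == float(2**63), so after the
+ # cast to np.float64 the two bounds used below compare as equal even
+ # though they differ as integers.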
+
+ dt = np.int64
+ tgt = np.iinfo(np.int64).max
+ lbnd = np.int64(np.iinfo(np.int64).max)
+ ubnd = np.uint64(np.iinfo(np.int64).max + 1)
+
+ # None of these function calls should
+ # generate a ValueError now.
+ actual = np.random.randint(lbnd, ubnd, dtype=dt)
+ assert_equal(actual, tgt)
+
+ def test_respect_dtype_singleton(self):
+ # See gh-7203
+ for dt in self.itype:
+ lbnd = 0 if dt is np.bool_ else np.iinfo(dt).min
+ ubnd = 2 if dt is np.bool_ else np.iinfo(dt).max + 1
+
+ sample = self.rfunc(lbnd, ubnd, dtype=dt)
+ assert_equal(sample.dtype, np.dtype(dt))
+
+ for dt in (bool, int, np.long):
+ lbnd = 0 if dt is bool else np.iinfo(dt).min
+ ubnd = 2 if dt is bool else np.iinfo(dt).max + 1
+
+ # gh-7284: Ensure that we get Python data types
+ sample = self.rfunc(lbnd, ubnd, dtype=dt)
+ assert_(not hasattr(sample, 'dtype'))
+ assert_equal(type(sample), dt)
+
+
+class TestRandomDist(object):
+ # Make sure the random distribution returns the correct value for a
+ # given seed
+
+ def setup(self):
+ self.seed = 1234567890
+
+ def test_rand(self):
+ np.random.seed(self.seed)
+ actual = np.random.rand(3, 2)
+ desired = np.array([[0.61879477158567997, 0.59162362775974664],
+ [0.88868358904449662, 0.89165480011560816],
+ [0.4575674820298663, 0.7781880808593471]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_randn(self):
+ np.random.seed(self.seed)
+ actual = np.random.randn(3, 2)
+ desired = np.array([[1.34016345771863121, 1.73759122771936081],
+ [1.498988344300628, -0.2286433324536169],
+ [2.031033998682787, 2.17032494605655257]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_randint(self):
+ np.random.seed(self.seed)
+ actual = np.random.randint(-99, 99, size=(3, 2))
+ desired = np.array([[31, 3],
+ [-52, 41],
+ [-48, -66]])
+ assert_array_equal(actual, desired)
+
+ def test_random_integers(self):
+ np.random.seed(self.seed)
+ with suppress_warnings() as sup:
+ w = sup.record(DeprecationWarning)
+ actual = np.random.random_integers(-99, 99, size=(3, 2))
+ assert_(len(w) == 1)
+ desired = np.array([[31, 3],
+ [-52, 41],
+ [-48, -66]])
+ assert_array_equal(actual, desired)
+
+ def test_random_integers_max_int(self):
+ # Tests whether random_integers can generate the
+ # maximum allowed Python int that can be converted
+ # into a C long. Previous implementations of this
+ # method have thrown an OverflowError when attempting
+ # to generate this integer.
+ with suppress_warnings() as sup:
+ w = sup.record(DeprecationWarning)
+ actual = np.random.random_integers(np.iinfo('l').max,
+ np.iinfo('l').max)
+ assert_(len(w) == 1)
+
+ desired = np.iinfo('l').max
+ assert_equal(actual, desired)
+
+ def test_random_integers_deprecated(self):
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", DeprecationWarning)
+
+ # DeprecationWarning raised with high == None
+ assert_raises(DeprecationWarning,
+ np.random.random_integers,
+ np.iinfo('l').max)
+
+ # DeprecationWarning raised with high != None
+ assert_raises(DeprecationWarning,
+ np.random.random_integers,
+ np.iinfo('l').max, np.iinfo('l').max)
+
+ def test_random_sample(self):
+ np.random.seed(self.seed)
+ actual = np.random.random_sample((3, 2))
+ desired = np.array([[0.61879477158567997, 0.59162362775974664],
+ [0.88868358904449662, 0.89165480011560816],
+ [0.4575674820298663, 0.7781880808593471]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_choice_uniform_replace(self):
+ np.random.seed(self.seed)
+ actual = np.random.choice(4, 4)
+ desired = np.array([2, 3, 2, 3])
+ assert_array_equal(actual, desired)
+
+ def test_choice_nonuniform_replace(self):
+ np.random.seed(self.seed)
+ actual = np.random.choice(4, 4, p=[0.4, 0.4, 0.1, 0.1])
+ desired = np.array([1, 1, 2, 2])
+ assert_array_equal(actual, desired)
+
+ def test_choice_uniform_noreplace(self):
+ np.random.seed(self.seed)
+ actual = np.random.choice(4, 3, replace=False)
+ desired = np.array([0, 1, 3])
+ assert_array_equal(actual, desired)
+
+ def test_choice_nonuniform_noreplace(self):
+ np.random.seed(self.seed)
+ actual = np.random.choice(4, 3, replace=False,
+ p=[0.1, 0.3, 0.5, 0.1])
+ desired = np.array([2, 3, 1])
+ assert_array_equal(actual, desired)
+
+ def test_choice_noninteger(self):
+ np.random.seed(self.seed)
+ actual = np.random.choice(['a', 'b', 'c', 'd'], 4)
+ desired = np.array(['c', 'd', 'c', 'd'])
+ assert_array_equal(actual, desired)
+
+ def test_choice_exceptions(self):
+ sample = np.random.choice
+ assert_raises(ValueError, sample, -1, 3)
+ assert_raises(ValueError, sample, 3., 3)
+ assert_raises(ValueError, sample, [[1, 2], [3, 4]], 3)
+ assert_raises(ValueError, sample, [], 3)
+ assert_raises(ValueError, sample, [1, 2, 3, 4], 3,
+ p=[[0.25, 0.25], [0.25, 0.25]])
+ assert_raises(ValueError, sample, [1, 2], 3, p=[0.4, 0.4, 0.2])
+ assert_raises(ValueError, sample, [1, 2], 3, p=[1.1, -0.1])
+ assert_raises(ValueError, sample, [1, 2], 3, p=[0.4, 0.4])
+ assert_raises(ValueError, sample, [1, 2, 3], 4, replace=False)
+ # gh-13087
+ assert_raises(ValueError, sample, [1, 2, 3], -2, replace=False)
+ assert_raises(ValueError, sample, [1, 2, 3], (-1,), replace=False)
+ assert_raises(ValueError, sample, [1, 2, 3], (-1, 1), replace=False)
+ assert_raises(ValueError, sample, [1, 2, 3], 2,
+ replace=False, p=[1, 0, 0])
+
+ def test_choice_return_shape(self):
+ p = [0.1, 0.9]
+ # Check scalar
+ assert_(np.isscalar(np.random.choice(2, replace=True)))
+ assert_(np.isscalar(np.random.choice(2, replace=False)))
+ assert_(np.isscalar(np.random.choice(2, replace=True, p=p)))
+ assert_(np.isscalar(np.random.choice(2, replace=False, p=p)))
+ assert_(np.isscalar(np.random.choice([1, 2], replace=True)))
+ assert_(np.random.choice([None], replace=True) is None)
+ a = np.array([1, 2])
+ arr = np.empty(1, dtype=object)
+ arr[0] = a
+ assert_(np.random.choice(arr, replace=True) is a)
+
+ # Check 0-d array
+ s = tuple()
+ assert_(not np.isscalar(np.random.choice(2, s, replace=True)))
+ assert_(not np.isscalar(np.random.choice(2, s, replace=False)))
+ assert_(not np.isscalar(np.random.choice(2, s, replace=True, p=p)))
+ assert_(not np.isscalar(np.random.choice(2, s, replace=False, p=p)))
+ assert_(not np.isscalar(np.random.choice([1, 2], s, replace=True)))
+ assert_(np.random.choice([None], s, replace=True).ndim == 0)
+ a = np.array([1, 2])
+ arr = np.empty(1, dtype=object)
+ arr[0] = a
+ assert_(np.random.choice(arr, s, replace=True).item() is a)
+
+ # Check multi dimensional array
+ s = (2, 3)
+ p = [0.1, 0.1, 0.1, 0.1, 0.4, 0.2]
+ assert_equal(np.random.choice(6, s, replace=True).shape, s)
+ assert_equal(np.random.choice(6, s, replace=False).shape, s)
+ assert_equal(np.random.choice(6, s, replace=True, p=p).shape, s)
+ assert_equal(np.random.choice(6, s, replace=False, p=p).shape, s)
+ assert_equal(np.random.choice(np.arange(6), s, replace=True).shape, s)
+
+ # Check zero-size
+ assert_equal(np.random.randint(0, 0, size=(3, 0, 4)).shape, (3, 0, 4))
+ assert_equal(np.random.randint(0, -10, size=0).shape, (0,))
+ assert_equal(np.random.randint(10, 10, size=0).shape, (0,))
+ assert_equal(np.random.choice(0, size=0).shape, (0,))
+ assert_equal(np.random.choice([], size=(0,)).shape, (0,))
+ assert_equal(np.random.choice(['a', 'b'], size=(3, 0, 4)).shape, (3, 0, 4))
+ assert_raises(ValueError, np.random.choice, [], 10)
+
+ def test_choice_nan_probabilities(self):
+ a = np.array([42, 1, 2])
+ p = [None, None, None]
+ assert_raises(ValueError, np.random.choice, a, p=p)
+
+ def test_bytes(self):
+ np.random.seed(self.seed)
+ actual = np.random.bytes(10)
+ desired = b'\x82Ui\x9e\xff\x97+Wf\xa5'
+ assert_equal(actual, desired)
+
+ def test_shuffle(self):
+ # Test lists, arrays (of various dtypes), and multidimensional versions
+ # of both, c-contiguous or not:
+ for conv in [lambda x: np.array([]),
+ lambda x: x,
+ lambda x: np.asarray(x).astype(np.int8),
+ lambda x: np.asarray(x).astype(np.float32),
+ lambda x: np.asarray(x).astype(np.complex64),
+ lambda x: np.asarray(x).astype(object),
+ lambda x: [(i, i) for i in x],
+ lambda x: np.asarray([[i, i] for i in x]),
+ lambda x: np.vstack([x, x]).T,
+ # gh-11442
+ lambda x: (np.asarray([(i, i) for i in x],
+ [("a", int), ("b", int)])
+ .view(np.recarray)),
+ # gh-4270
+ lambda x: np.asarray([(i, i) for i in x],
+ [("a", object, 1),
+ ("b", np.int32, 1)])]:
+ np.random.seed(self.seed)
+ alist = conv([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
+ np.random.shuffle(alist)
+ actual = alist
+ desired = conv([0, 1, 9, 6, 2, 4, 5, 8, 7, 3])
+ assert_array_equal(actual, desired)
+
+ def test_shuffle_masked(self):
+ # gh-3263
+ a = np.ma.masked_values(np.reshape(range(20), (5, 4)) % 3 - 1, -1)
+ b = np.ma.masked_values(np.arange(20) % 3 - 1, -1)
+ a_orig = a.copy()
+ b_orig = b.copy()
+ for i in range(50):
+ np.random.shuffle(a)
+ assert_equal(
+ sorted(a.data[~a.mask]), sorted(a_orig.data[~a_orig.mask]))
+ np.random.shuffle(b)
+ assert_equal(
+ sorted(b.data[~b.mask]), sorted(b_orig.data[~b_orig.mask]))
+
+ def test_beta(self):
+ np.random.seed(self.seed)
+ actual = np.random.beta(.1, .9, size=(3, 2))
+ desired = np.array(
+ [[1.45341850513746058e-02, 5.31297615662868145e-04],
+ [1.85366619058432324e-06, 4.19214516800110563e-03],
+ [1.58405155108498093e-04, 1.26252891949397652e-04]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_binomial(self):
+ np.random.seed(self.seed)
+ actual = np.random.binomial(100.123, .456, size=(3, 2))
+ desired = np.array([[37, 43],
+ [42, 48],
+ [46, 45]])
+ assert_array_equal(actual, desired)
+
+ def test_chisquare(self):
+ np.random.seed(self.seed)
+ actual = np.random.chisquare(50, size=(3, 2))
+ desired = np.array([[63.87858175501090585, 68.68407748911370447],
+ [65.77116116901505904, 47.09686762438974483],
+ [72.3828403199695174, 74.18408615260374006]])
+ assert_array_almost_equal(actual, desired, decimal=13)
+
+ def test_dirichlet(self):
+ np.random.seed(self.seed)
+ alpha = np.array([51.72840233779265162, 39.74494232180943953])
+ actual = np.random.mtrand.dirichlet(alpha, size=(3, 2))
+ desired = np.array([[[0.54539444573611562, 0.45460555426388438],
+ [0.62345816822039413, 0.37654183177960598]],
+ [[0.55206000085785778, 0.44793999914214233],
+ [0.58964023305154301, 0.41035976694845688]],
+ [[0.59266909280647828, 0.40733090719352177],
+ [0.56974431743975207, 0.43025568256024799]]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_dirichlet_size(self):
+ # gh-3173
+ p = np.array([51.72840233779265162, 39.74494232180943953])
+ assert_equal(np.random.dirichlet(p, np.uint32(1)).shape, (1, 2))
+ assert_equal(np.random.dirichlet(p, np.uint32(1)).shape, (1, 2))
+ assert_equal(np.random.dirichlet(p, np.uint32(1)).shape, (1, 2))
+ assert_equal(np.random.dirichlet(p, [2, 2]).shape, (2, 2, 2))
+ assert_equal(np.random.dirichlet(p, (2, 2)).shape, (2, 2, 2))
+ assert_equal(np.random.dirichlet(p, np.array((2, 2))).shape, (2, 2, 2))
+
+ assert_raises(TypeError, np.random.dirichlet, p, float(1))
+
+ def test_dirichlet_bad_alpha(self):
+ # gh-2089
+ alpha = np.array([5.4e-01, -1.0e-16])
+ assert_raises(ValueError, np.random.mtrand.dirichlet, alpha)
+
+ def test_exponential(self):
+ np.random.seed(self.seed)
+ actual = np.random.exponential(1.1234, size=(3, 2))
+ desired = np.array([[1.08342649775011624, 1.00607889924557314],
+ [2.46628830085216721, 2.49668106809923884],
+ [0.68717433461363442, 1.69175666993575979]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_exponential_0(self):
+ assert_equal(np.random.exponential(scale=0), 0)
+ assert_raises(ValueError, np.random.exponential, scale=-0.)
+
+ def test_f(self):
+ np.random.seed(self.seed)
+ actual = np.random.f(12, 77, size=(3, 2))
+ desired = np.array([[1.21975394418575878, 1.75135759791559775],
+ [1.44803115017146489, 1.22108959480396262],
+ [1.02176975757740629, 1.34431827623300415]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_gamma(self):
+ np.random.seed(self.seed)
+ actual = np.random.gamma(5, 3, size=(3, 2))
+ desired = np.array([[24.60509188649287182, 28.54993563207210627],
+ [26.13476110204064184, 12.56988482927716078],
+ [31.71863275789960568, 33.30143302795922011]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_gamma_0(self):
+ assert_equal(np.random.gamma(shape=0, scale=0), 0)
+ assert_raises(ValueError, np.random.gamma, shape=-0., scale=-0.)
+
+ def test_geometric(self):
+ np.random.seed(self.seed)
+ actual = np.random.geometric(.123456789, size=(3, 2))
+ desired = np.array([[8, 7],
+ [17, 17],
+ [5, 12]])
+ assert_array_equal(actual, desired)
+
+ def test_gumbel(self):
+ np.random.seed(self.seed)
+ actual = np.random.gumbel(loc=.123456789, scale=2.0, size=(3, 2))
+ desired = np.array([[0.19591898743416816, 0.34405539668096674],
+ [-1.4492522252274278, -1.47374816298446865],
+ [1.10651090478803416, -0.69535848626236174]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_gumbel_0(self):
+ assert_equal(np.random.gumbel(scale=0), 0)
+ assert_raises(ValueError, np.random.gumbel, scale=-0.)
+
+ def test_hypergeometric(self):
+ np.random.seed(self.seed)
+ actual = np.random.hypergeometric(10.1, 5.5, 14, size=(3, 2))
+ desired = np.array([[10, 10],
+ [10, 10],
+ [9, 9]])
+ assert_array_equal(actual, desired)
+
+ # Test nbad = 0
+ actual = np.random.hypergeometric(5, 0, 3, size=4)
+ desired = np.array([3, 3, 3, 3])
+ assert_array_equal(actual, desired)
+
+ actual = np.random.hypergeometric(15, 0, 12, size=4)
+ desired = np.array([12, 12, 12, 12])
+ assert_array_equal(actual, desired)
+
+ # Test ngood = 0
+ actual = np.random.hypergeometric(0, 5, 3, size=4)
+ desired = np.array([0, 0, 0, 0])
+ assert_array_equal(actual, desired)
+
+ actual = np.random.hypergeometric(0, 15, 12, size=4)
+ desired = np.array([0, 0, 0, 0])
+ assert_array_equal(actual, desired)
+
+ def test_laplace(self):
+ np.random.seed(self.seed)
+ actual = np.random.laplace(loc=.123456789, scale=2.0, size=(3, 2))
+ desired = np.array([[0.66599721112760157, 0.52829452552221945],
+ [3.12791959514407125, 3.18202813572992005],
+ [-0.05391065675859356, 1.74901336242837324]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_laplace_0(self):
+ assert_equal(np.random.laplace(scale=0), 0)
+ assert_raises(ValueError, np.random.laplace, scale=-0.)
+
+ def test_logistic(self):
+ np.random.seed(self.seed)
+ actual = np.random.logistic(loc=.123456789, scale=2.0, size=(3, 2))
+ desired = np.array([[1.09232835305011444, 0.8648196662399954],
+ [4.27818590694950185, 4.33897006346929714],
+ [-0.21682183359214885, 2.63373365386060332]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_lognormal(self):
+ np.random.seed(self.seed)
+ actual = np.random.lognormal(mean=.123456789, sigma=2.0, size=(3, 2))
+ desired = np.array([[16.50698631688883822, 36.54846706092654784],
+ [22.67886599981281748, 0.71617561058995771],
+ [65.72798501792723869, 86.84341601437161273]])
+ assert_array_almost_equal(actual, desired, decimal=13)
+
+ def test_lognormal_0(self):
+ assert_equal(np.random.lognormal(sigma=0), 1)
+ assert_raises(ValueError, np.random.lognormal, sigma=-0.)
+
+ def test_logseries(self):
+ np.random.seed(self.seed)
+ actual = np.random.logseries(p=.923456789, size=(3, 2))
+ desired = np.array([[2, 2],
+ [6, 17],
+ [3, 6]])
+ assert_array_equal(actual, desired)
+
+ def test_multinomial(self):
+ np.random.seed(self.seed)
+ actual = np.random.multinomial(20, [1/6.]*6, size=(3, 2))
+ desired = np.array([[[4, 3, 5, 4, 2, 2],
+ [5, 2, 8, 2, 2, 1]],
+ [[3, 4, 3, 6, 0, 4],
+ [2, 1, 4, 3, 6, 4]],
+ [[4, 4, 2, 5, 2, 3],
+ [4, 3, 4, 2, 3, 4]]])
+ assert_array_equal(actual, desired)
+
+ def test_multivariate_normal(self):
+ np.random.seed(self.seed)
+ mean = (.123456789, 10)
+ cov = [[1, 0], [0, 1]]
+ size = (3, 2)
+ actual = np.random.multivariate_normal(mean, cov, size)
+ desired = np.array([[[1.463620246718631, 11.73759122771936 ],
+ [1.622445133300628, 9.771356667546383]],
+ [[2.154490787682787, 12.170324946056553],
+ [1.719909438201865, 9.230548443648306]],
+ [[0.689515026297799, 9.880729819607714],
+ [-0.023054015651998, 9.201096623542879]]])
+
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ # Check for default size, was raising deprecation warning
+ actual = np.random.multivariate_normal(mean, cov)
+ desired = np.array([0.895289569463708, 9.17180864067987])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ # Check that non positive-semidefinite covariance warns with
+ # RuntimeWarning
+ mean = [0, 0]
+ cov = [[1, 2], [2, 1]]
+ assert_warns(RuntimeWarning, np.random.multivariate_normal, mean, cov)
+
+ # and that it doesn't raise a RuntimeWarning with check_valid='ignore'
+ assert_no_warnings(np.random.multivariate_normal, mean, cov,
+ check_valid='ignore')
+
+ # and that it raises a ValueError with check_valid='raise'
+ assert_raises(ValueError, np.random.multivariate_normal, mean, cov,
+ check_valid='raise')
+
+ cov = np.array([[1, 0.1],[0.1, 1]], dtype=np.float32)
+ with suppress_warnings() as sup:
+ np.random.multivariate_normal(mean, cov)
+ w = sup.record(RuntimeWarning)
+ assert len(w) == 0
+
+ def test_negative_binomial(self):
+ np.random.seed(self.seed)
+ actual = np.random.negative_binomial(n=100, p=.12345, size=(3, 2))
+ desired = np.array([[848, 841],
+ [892, 611],
+ [779, 647]])
+ assert_array_equal(actual, desired)
+
+ def test_noncentral_chisquare(self):
+ np.random.seed(self.seed)
+ actual = np.random.noncentral_chisquare(df=5, nonc=5, size=(3, 2))
+ desired = np.array([[23.91905354498517511, 13.35324692733826346],
+ [31.22452661329736401, 16.60047399466177254],
+ [5.03461598262724586, 17.94973089023519464]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ actual = np.random.noncentral_chisquare(df=.5, nonc=.2, size=(3, 2))
+ desired = np.array([[1.47145377828516666, 0.15052899268012659],
+ [0.00943803056963588, 1.02647251615666169],
+ [0.332334982684171, 0.15451287602753125]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ np.random.seed(self.seed)
+ actual = np.random.noncentral_chisquare(df=5, nonc=0, size=(3, 2))
+ desired = np.array([[9.597154162763948, 11.725484450296079],
+ [10.413711048138335, 3.694475922923986],
+ [13.484222138963087, 14.377255424602957]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_noncentral_f(self):
+ np.random.seed(self.seed)
+ actual = np.random.noncentral_f(dfnum=5, dfden=2, nonc=1,
+ size=(3, 2))
+ desired = np.array([[1.40598099674926669, 0.34207973179285761],
+ [3.57715069265772545, 7.92632662577829805],
+ [0.43741599463544162, 1.1774208752428319]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_normal(self):
+ np.random.seed(self.seed)
+ actual = np.random.normal(loc=.123456789, scale=2.0, size=(3, 2))
+ desired = np.array([[2.80378370443726244, 3.59863924443872163],
+ [3.121433477601256, -0.33382987590723379],
+ [4.18552478636557357, 4.46410668111310471]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_normal_0(self):
+ assert_equal(np.random.normal(scale=0), 0)
+ assert_raises(ValueError, np.random.normal, scale=-0.)
+
+ def test_pareto(self):
+ np.random.seed(self.seed)
+ actual = np.random.pareto(a=.123456789, size=(3, 2))
+ desired = np.array(
+ [[2.46852460439034849e+03, 1.41286880810518346e+03],
+ [5.28287797029485181e+07, 6.57720981047328785e+07],
+ [1.40840323350391515e+02, 1.98390255135251704e+05]])
+ # For some reason on 32-bit x86 Ubuntu 12.10 the [1, 0] entry in this
+ # matrix differs by 24 nulps. Discussion:
+ # https://mail.python.org/pipermail/numpy-discussion/2012-September/063801.html
+ # Consensus is that this is probably some gcc quirk that affects
+ # rounding but not in any important way, so we just use a looser
+ # tolerance on this test:
+ np.testing.assert_array_almost_equal_nulp(actual, desired, nulp=30)
+
+ def test_poisson(self):
+ np.random.seed(self.seed)
+ actual = np.random.poisson(lam=.123456789, size=(3, 2))
+ desired = np.array([[0, 0],
+ [1, 0],
+ [0, 0]])
+ assert_array_equal(actual, desired)
+
+ def test_poisson_exceptions(self):
+ lambig = np.iinfo('l').max
+ lamneg = -1
+ assert_raises(ValueError, np.random.poisson, lamneg)
+ assert_raises(ValueError, np.random.poisson, [lamneg]*10)
+ assert_raises(ValueError, np.random.poisson, lambig)
+ assert_raises(ValueError, np.random.poisson, [lambig]*10)
+
+ def test_power(self):
+ np.random.seed(self.seed)
+ actual = np.random.power(a=.123456789, size=(3, 2))
+ desired = np.array([[0.02048932883240791, 0.01424192241128213],
+ [0.38446073748535298, 0.39499689943484395],
+ [0.00177699707563439, 0.13115505880863756]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_rayleigh(self):
+ np.random.seed(self.seed)
+ actual = np.random.rayleigh(scale=10, size=(3, 2))
+ desired = np.array([[13.8882496494248393, 13.383318339044731],
+ [20.95413364294492098, 21.08285015800712614],
+ [11.06066537006854311, 17.35468505778271009]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_rayleigh_0(self):
+ assert_equal(np.random.rayleigh(scale=0), 0)
+ assert_raises(ValueError, np.random.rayleigh, scale=-0.)
+
+ def test_standard_cauchy(self):
+ np.random.seed(self.seed)
+ actual = np.random.standard_cauchy(size=(3, 2))
+ desired = np.array([[0.77127660196445336, -6.55601161955910605],
+ [0.93582023391158309, -2.07479293013759447],
+ [-4.74601644297011926, 0.18338989290760804]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_standard_exponential(self):
+ np.random.seed(self.seed)
+ actual = np.random.standard_exponential(size=(3, 2))
+ desired = np.array([[0.96441739162374596, 0.89556604882105506],
+ [2.1953785836319808, 2.22243285392490542],
+ [0.6116915921431676, 1.50592546727413201]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_standard_gamma(self):
+ np.random.seed(self.seed)
+ actual = np.random.standard_gamma(shape=3, size=(3, 2))
+ desired = np.array([[5.50841531318455058, 6.62953470301903103],
+ [5.93988484943779227, 2.31044849402133989],
+ [7.54838614231317084, 8.012756093271868]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_standard_gamma_0(self):
+ assert_equal(np.random.standard_gamma(shape=0), 0)
+ assert_raises(ValueError, np.random.standard_gamma, shape=-0.)
+
+ def test_standard_normal(self):
+ np.random.seed(self.seed)
+ actual = np.random.standard_normal(size=(3, 2))
+ desired = np.array([[1.34016345771863121, 1.73759122771936081],
+ [1.498988344300628, -0.2286433324536169],
+ [2.031033998682787, 2.17032494605655257]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_standard_t(self):
+ np.random.seed(self.seed)
+ actual = np.random.standard_t(df=10, size=(3, 2))
+ desired = np.array([[0.97140611862659965, -0.08830486548450577],
+ [1.36311143689505321, -0.55317463909867071],
+ [-0.18473749069684214, 0.61181537341755321]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_triangular(self):
+ np.random.seed(self.seed)
+ actual = np.random.triangular(left=5.12, mode=10.23, right=20.34,
+ size=(3, 2))
+ desired = np.array([[12.68117178949215784, 12.4129206149193152],
+ [16.20131377335158263, 16.25692138747600524],
+ [11.20400690911820263, 14.4978144835829923]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_uniform(self):
+ np.random.seed(self.seed)
+ actual = np.random.uniform(low=1.23, high=10.54, size=(3, 2))
+ desired = np.array([[6.99097932346268003, 6.73801597444323974],
+ [9.50364421400426274, 9.53130618907631089],
+ [5.48995325769805476, 8.47493103280052118]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_uniform_range_bounds(self):
+ fmin = np.finfo('float').min
+ fmax = np.finfo('float').max
+
+ func = np.random.uniform
+ assert_raises(OverflowError, func, -np.inf, 0)
+ assert_raises(OverflowError, func, 0, np.inf)
+ assert_raises(OverflowError, func, fmin, fmax)
+ assert_raises(OverflowError, func, [-np.inf], [0])
+ assert_raises(OverflowError, func, [0], [np.inf])
+
+ # (fmax / 1e17) - fmin is within range, so this should not throw.
+ # Account for i386 extended precision, where DBL_MAX / 1e17 + DBL_MAX >
+ # DBL_MAX, by increasing fmin a bit.
+ np.random.uniform(low=np.nextafter(fmin, 1), high=fmax / 1e17)
+
+ def test_scalar_exception_propagation(self):
+ # Tests that exceptions are correctly propagated in distributions
+ # when called with objects that throw exceptions when converted to
+ # scalars.
+ #
+ # Regression test for gh-8865
+
+ class ThrowingFloat(np.ndarray):
+ def __float__(self):
+ raise TypeError
+
+ throwing_float = np.array(1.0).view(ThrowingFloat)
+ assert_raises(TypeError, np.random.uniform, throwing_float, throwing_float)
+
+ class ThrowingInteger(np.ndarray):
+ def __int__(self):
+ raise TypeError
+
+ throwing_int = np.array(1).view(ThrowingInteger)
+ assert_raises(TypeError, np.random.hypergeometric, throwing_int, 1, 1)
+
+ def test_vonmises(self):
+ np.random.seed(self.seed)
+ actual = np.random.vonmises(mu=1.23, kappa=1.54, size=(3, 2))
+ desired = np.array([[2.28567572673902042, 2.89163838442285037],
+ [0.38198375564286025, 2.57638023113890746],
+ [1.19153771588353052, 1.83509849681825354]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_vonmises_small(self):
+ # check infinite loop, gh-4720
+ np.random.seed(self.seed)
+ r = np.random.vonmises(mu=0., kappa=1.1e-8, size=10**6)
+ np.testing.assert_(np.isfinite(r).all())
+
+ def test_wald(self):
+ np.random.seed(self.seed)
+ actual = np.random.wald(mean=1.23, scale=1.54, size=(3, 2))
+ desired = np.array([[3.82935265715889983, 5.13125249184285526],
+ [0.35045403618358717, 1.50832396872003538],
+ [0.24124319895843183, 0.22031101461955038]])
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_weibull(self):
+ np.random.seed(self.seed)
+ actual = np.random.weibull(a=1.23, size=(3, 2))
+ desired = np.array([[0.97097342648766727, 0.91422896443565516],
+ [1.89517770034962929, 1.91414357960479564],
+ [0.67057783752390987, 1.39494046635066793]])
+ assert_array_almost_equal(actual, desired, decimal=15)
+
+ def test_weibull_0(self):
+ np.random.seed(self.seed)
+ assert_equal(np.random.weibull(a=0, size=12), np.zeros(12))
+ assert_raises(ValueError, np.random.weibull, a=-0.)
+
+ def test_zipf(self):
+ np.random.seed(self.seed)
+ actual = np.random.zipf(a=1.23, size=(3, 2))
+ desired = np.array([[66, 29],
+ [1, 1],
+ [3, 13]])
+ assert_array_equal(actual, desired)
+
+
+class TestBroadcast(object):
+ # tests that functions that broadcast behave
+ # correctly when presented with non-scalar arguments
+ def setup(self):
+ self.seed = 123456789
+
+ def setSeed(self):
+ np.random.seed(self.seed)
+
+ # TODO: Include test for randint once it can broadcast
+ # Can steal the test written in PR #6938
+
+ def test_uniform(self):
+ low = [0]
+ high = [1]
+ uniform = np.random.uniform
+ desired = np.array([0.53283302478975902,
+ 0.53413660089041659,
+ 0.50955303552646702])
+
+ self.setSeed()
+ actual = uniform(low * 3, high)
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ self.setSeed()
+ actual = uniform(low, high * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_normal(self):
+ loc = [0]
+ scale = [1]
+ bad_scale = [-1]
+ normal = np.random.normal
+ desired = np.array([2.2129019979039612,
+ 2.1283977976520019,
+ 1.8417114045748335])
+
+ self.setSeed()
+ actual = normal(loc * 3, scale)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, normal, loc * 3, bad_scale)
+
+ self.setSeed()
+ actual = normal(loc, scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, normal, loc, bad_scale * 3)
+
+ def test_beta(self):
+ a = [1]
+ b = [2]
+ bad_a = [-1]
+ bad_b = [-2]
+ beta = np.random.beta
+ desired = np.array([0.19843558305989056,
+ 0.075230336409423643,
+ 0.24976865978980844])
+
+ self.setSeed()
+ actual = beta(a * 3, b)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, beta, bad_a * 3, b)
+ assert_raises(ValueError, beta, a * 3, bad_b)
+
+ self.setSeed()
+ actual = beta(a, b * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, beta, bad_a, b * 3)
+ assert_raises(ValueError, beta, a, bad_b * 3)
+
+ def test_exponential(self):
+ scale = [1]
+ bad_scale = [-1]
+ exponential = np.random.exponential
+ desired = np.array([0.76106853658845242,
+ 0.76386282278691653,
+ 0.71243813125891797])
+
+ self.setSeed()
+ actual = exponential(scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, exponential, bad_scale * 3)
+
+ def test_standard_gamma(self):
+ shape = [1]
+ bad_shape = [-1]
+ std_gamma = np.random.standard_gamma
+ desired = np.array([0.76106853658845242,
+ 0.76386282278691653,
+ 0.71243813125891797])
+
+ self.setSeed()
+ actual = std_gamma(shape * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, std_gamma, bad_shape * 3)
+
+ def test_gamma(self):
+ shape = [1]
+ scale = [2]
+ bad_shape = [-1]
+ bad_scale = [-2]
+ gamma = np.random.gamma
+ desired = np.array([1.5221370731769048,
+ 1.5277256455738331,
+ 1.4248762625178359])
+
+ self.setSeed()
+ actual = gamma(shape * 3, scale)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, gamma, bad_shape * 3, scale)
+ assert_raises(ValueError, gamma, shape * 3, bad_scale)
+
+ self.setSeed()
+ actual = gamma(shape, scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, gamma, bad_shape, scale * 3)
+ assert_raises(ValueError, gamma, shape, bad_scale * 3)
+
+ def test_f(self):
+ dfnum = [1]
+ dfden = [2]
+ bad_dfnum = [-1]
+ bad_dfden = [-2]
+ f = np.random.f
+ desired = np.array([0.80038951638264799,
+ 0.86768719635363512,
+ 2.7251095168386801])
+
+ self.setSeed()
+ actual = f(dfnum * 3, dfden)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, f, bad_dfnum * 3, dfden)
+ assert_raises(ValueError, f, dfnum * 3, bad_dfden)
+
+ self.setSeed()
+ actual = f(dfnum, dfden * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, f, bad_dfnum, dfden * 3)
+ assert_raises(ValueError, f, dfnum, bad_dfden * 3)
+
+ def test_noncentral_f(self):
+ dfnum = [2]
+ dfden = [3]
+ nonc = [4]
+ bad_dfnum = [0]
+ bad_dfden = [-1]
+ bad_nonc = [-2]
+ nonc_f = np.random.noncentral_f
+ desired = np.array([9.1393943263705211,
+ 13.025456344595602,
+ 8.8018098359100545])
+
+ self.setSeed()
+ actual = nonc_f(dfnum * 3, dfden, nonc)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, nonc_f, bad_dfnum * 3, dfden, nonc)
+ assert_raises(ValueError, nonc_f, dfnum * 3, bad_dfden, nonc)
+ assert_raises(ValueError, nonc_f, dfnum * 3, dfden, bad_nonc)
+
+ self.setSeed()
+ actual = nonc_f(dfnum, dfden * 3, nonc)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, nonc_f, bad_dfnum, dfden * 3, nonc)
+ assert_raises(ValueError, nonc_f, dfnum, bad_dfden * 3, nonc)
+ assert_raises(ValueError, nonc_f, dfnum, dfden * 3, bad_nonc)
+
+ self.setSeed()
+ actual = nonc_f(dfnum, dfden, nonc * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, nonc_f, bad_dfnum, dfden, nonc * 3)
+ assert_raises(ValueError, nonc_f, dfnum, bad_dfden, nonc * 3)
+ assert_raises(ValueError, nonc_f, dfnum, dfden, bad_nonc * 3)
+
+ def test_noncentral_f_small_df(self):
+ self.setSeed()
+ desired = np.array([6.869638627492048, 0.785880199263955])
+ actual = np.random.noncentral_f(0.9, 0.9, 2, size=2)
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+ def test_chisquare(self):
+ df = [1]
+ bad_df = [-1]
+ chisquare = np.random.chisquare
+ desired = np.array([0.57022801133088286,
+ 0.51947702108840776,
+ 0.1320969254923558])
+
+ self.setSeed()
+ actual = chisquare(df * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, chisquare, bad_df * 3)
+
+ def test_noncentral_chisquare(self):
+ df = [1]
+ nonc = [2]
+ bad_df = [-1]
+ bad_nonc = [-2]
+ nonc_chi = np.random.noncentral_chisquare
+ desired = np.array([9.0015599467913763,
+ 4.5804135049718742,
+ 6.0872302432834564])
+
+ self.setSeed()
+ actual = nonc_chi(df * 3, nonc)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, nonc_chi, bad_df * 3, nonc)
+ assert_raises(ValueError, nonc_chi, df * 3, bad_nonc)
+
+ self.setSeed()
+ actual = nonc_chi(df, nonc * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, nonc_chi, bad_df, nonc * 3)
+ assert_raises(ValueError, nonc_chi, df, bad_nonc * 3)
+
+ def test_standard_t(self):
+ df = [1]
+ bad_df = [-1]
+ t = np.random.standard_t
+ desired = np.array([3.0702872575217643,
+ 5.8560725167361607,
+ 1.0274791436474273])
+
+ self.setSeed()
+ actual = t(df * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, t, bad_df * 3)
+
+ def test_vonmises(self):
+ mu = [2]
+ kappa = [1]
+ bad_kappa = [-1]
+ vonmises = np.random.vonmises
+ desired = np.array([2.9883443664201312,
+ -2.7064099483995943,
+ -1.8672476700665914])
+
+ self.setSeed()
+ actual = vonmises(mu * 3, kappa)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, vonmises, mu * 3, bad_kappa)
+
+ self.setSeed()
+ actual = vonmises(mu, kappa * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, vonmises, mu, bad_kappa * 3)
+
+ def test_pareto(self):
+ a = [1]
+ bad_a = [-1]
+ pareto = np.random.pareto
+ desired = np.array([1.1405622680198362,
+ 1.1465519762044529,
+ 1.0389564467453547])
+
+ self.setSeed()
+ actual = pareto(a * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, pareto, bad_a * 3)
+
+ def test_weibull(self):
+ a = [1]
+ bad_a = [-1]
+ weibull = np.random.weibull
+ desired = np.array([0.76106853658845242,
+ 0.76386282278691653,
+ 0.71243813125891797])
+
+ self.setSeed()
+ actual = weibull(a * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, weibull, bad_a * 3)
+
+ def test_power(self):
+ a = [1]
+ bad_a = [-1]
+ power = np.random.power
+ desired = np.array([0.53283302478975902,
+ 0.53413660089041659,
+ 0.50955303552646702])
+
+ self.setSeed()
+ actual = power(a * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, power, bad_a * 3)
+
+ def test_laplace(self):
+ loc = [0]
+ scale = [1]
+ bad_scale = [-1]
+ laplace = np.random.laplace
+ desired = np.array([0.067921356028507157,
+ 0.070715642226971326,
+ 0.019290950698972624])
+
+ self.setSeed()
+ actual = laplace(loc * 3, scale)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, laplace, loc * 3, bad_scale)
+
+ self.setSeed()
+ actual = laplace(loc, scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, laplace, loc, bad_scale * 3)
+
+ def test_gumbel(self):
+ loc = [0]
+ scale = [1]
+ bad_scale = [-1]
+ gumbel = np.random.gumbel
+ desired = np.array([0.2730318639556768,
+ 0.26936705726291116,
+ 0.33906220393037939])
+
+ self.setSeed()
+ actual = gumbel(loc * 3, scale)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, gumbel, loc * 3, bad_scale)
+
+ self.setSeed()
+ actual = gumbel(loc, scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, gumbel, loc, bad_scale * 3)
+
+ def test_logistic(self):
+ loc = [0]
+ scale = [1]
+ bad_scale = [-1]
+ logistic = np.random.logistic
+ desired = np.array([0.13152135837586171,
+ 0.13675915696285773,
+ 0.038216792802833396])
+
+ self.setSeed()
+ actual = logistic(loc * 3, scale)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, logistic, loc * 3, bad_scale)
+
+ self.setSeed()
+ actual = logistic(loc, scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, logistic, loc, bad_scale * 3)
+
+ def test_lognormal(self):
+ mean = [0]
+ sigma = [1]
+ bad_sigma = [-1]
+ lognormal = np.random.lognormal
+ desired = np.array([9.1422086044848427,
+ 8.4013952870126261,
+ 6.3073234116578671])
+
+ self.setSeed()
+ actual = lognormal(mean * 3, sigma)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, lognormal, mean * 3, bad_sigma)
+
+ self.setSeed()
+ actual = lognormal(mean, sigma * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, lognormal, mean, bad_sigma * 3)
+
+ def test_rayleigh(self):
+ scale = [1]
+ bad_scale = [-1]
+ rayleigh = np.random.rayleigh
+ desired = np.array([1.2337491937897689,
+ 1.2360119924878694,
+ 1.1936818095781789])
+
+ self.setSeed()
+ actual = rayleigh(scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, rayleigh, bad_scale * 3)
+
+ def test_wald(self):
+ mean = [0.5]
+ scale = [1]
+ bad_mean = [0]
+ bad_scale = [-2]
+ wald = np.random.wald
+ desired = np.array([0.11873681120271318,
+ 0.12450084820795027,
+ 0.9096122728408238])
+
+ self.setSeed()
+ actual = wald(mean * 3, scale)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, wald, bad_mean * 3, scale)
+ assert_raises(ValueError, wald, mean * 3, bad_scale)
+
+ self.setSeed()
+ actual = wald(mean, scale * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, wald, bad_mean, scale * 3)
+ assert_raises(ValueError, wald, mean, bad_scale * 3)
+
+ def test_triangular(self):
+ left = [1]
+ right = [3]
+ mode = [2]
+ bad_left_one = [3]
+ bad_mode_one = [4]
+ bad_left_two, bad_mode_two = right * 2
+ triangular = np.random.triangular
+ desired = np.array([2.03339048710429,
+ 2.0347400359389356,
+ 2.0095991069536208])
+
+ self.setSeed()
+ actual = triangular(left * 3, mode, right)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, triangular, bad_left_one * 3, mode, right)
+ assert_raises(ValueError, triangular, left * 3, bad_mode_one, right)
+ assert_raises(ValueError, triangular, bad_left_two * 3, bad_mode_two, right)
+
+ self.setSeed()
+ actual = triangular(left, mode * 3, right)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, triangular, bad_left_one, mode * 3, right)
+ assert_raises(ValueError, triangular, left, bad_mode_one * 3, right)
+ assert_raises(ValueError, triangular, bad_left_two, bad_mode_two * 3, right)
+
+ self.setSeed()
+ actual = triangular(left, mode, right * 3)
+ assert_array_almost_equal(actual, desired, decimal=14)
+ assert_raises(ValueError, triangular, bad_left_one, mode, right * 3)
+ assert_raises(ValueError, triangular, left, bad_mode_one, right * 3)
+ assert_raises(ValueError, triangular, bad_left_two, bad_mode_two, right * 3)
+
+ def test_binomial(self):
+ n = [1]
+ p = [0.5]
+ bad_n = [-1]
+ bad_p_one = [-1]
+ bad_p_two = [1.5]
+ binom = np.random.binomial
+ desired = np.array([1, 1, 1])
+
+ self.setSeed()
+ actual = binom(n * 3, p)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, binom, bad_n * 3, p)
+ assert_raises(ValueError, binom, n * 3, bad_p_one)
+ assert_raises(ValueError, binom, n * 3, bad_p_two)
+
+ self.setSeed()
+ actual = binom(n, p * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, binom, bad_n, p * 3)
+ assert_raises(ValueError, binom, n, bad_p_one * 3)
+ assert_raises(ValueError, binom, n, bad_p_two * 3)
+
+ def test_negative_binomial(self):
+ n = [1]
+ p = [0.5]
+ bad_n = [-1]
+ bad_p_one = [-1]
+ bad_p_two = [1.5]
+ neg_binom = np.random.negative_binomial
+ desired = np.array([1, 0, 1])
+
+ self.setSeed()
+ actual = neg_binom(n * 3, p)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, neg_binom, bad_n * 3, p)
+ assert_raises(ValueError, neg_binom, n * 3, bad_p_one)
+ assert_raises(ValueError, neg_binom, n * 3, bad_p_two)
+
+ self.setSeed()
+ actual = neg_binom(n, p * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, neg_binom, bad_n, p * 3)
+ assert_raises(ValueError, neg_binom, n, bad_p_one * 3)
+ assert_raises(ValueError, neg_binom, n, bad_p_two * 3)
+
+ def test_poisson(self):
+ max_lam = np.random.RandomState().poisson_lam_max
+
+ lam = [1]
+ bad_lam_one = [-1]
+ bad_lam_two = [max_lam * 2]
+ poisson = np.random.poisson
+ desired = np.array([1, 1, 0])
+
+ self.setSeed()
+ actual = poisson(lam * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, poisson, bad_lam_one * 3)
+ assert_raises(ValueError, poisson, bad_lam_two * 3)
+
+ def test_zipf(self):
+ a = [2]
+ bad_a = [0]
+ zipf = np.random.zipf
+ desired = np.array([2, 2, 1])
+
+ self.setSeed()
+ actual = zipf(a * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, zipf, bad_a * 3)
+ with np.errstate(invalid='ignore'):
+ assert_raises(ValueError, zipf, np.nan)
+ assert_raises(ValueError, zipf, [0, 0, np.nan])
+
+ def test_geometric(self):
+ p = [0.5]
+ bad_p_one = [-1]
+ bad_p_two = [1.5]
+ geom = np.random.geometric
+ desired = np.array([2, 2, 2])
+
+ self.setSeed()
+ actual = geom(p * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, geom, bad_p_one * 3)
+ assert_raises(ValueError, geom, bad_p_two * 3)
+
+ def test_hypergeometric(self):
+ ngood = [1]
+ nbad = [2]
+ nsample = [2]
+ bad_ngood = [-1]
+ bad_nbad = [-2]
+ bad_nsample_one = [0]
+ bad_nsample_two = [4]
+ hypergeom = np.random.hypergeometric
+ desired = np.array([1, 1, 1])
+
+ self.setSeed()
+ actual = hypergeom(ngood * 3, nbad, nsample)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, hypergeom, bad_ngood * 3, nbad, nsample)
+ assert_raises(ValueError, hypergeom, ngood * 3, bad_nbad, nsample)
+ assert_raises(ValueError, hypergeom, ngood * 3, nbad, bad_nsample_one)
+ assert_raises(ValueError, hypergeom, ngood * 3, nbad, bad_nsample_two)
+
+ self.setSeed()
+ actual = hypergeom(ngood, nbad * 3, nsample)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, hypergeom, bad_ngood, nbad * 3, nsample)
+ assert_raises(ValueError, hypergeom, ngood, bad_nbad * 3, nsample)
+ assert_raises(ValueError, hypergeom, ngood, nbad * 3, bad_nsample_one)
+ assert_raises(ValueError, hypergeom, ngood, nbad * 3, bad_nsample_two)
+
+ self.setSeed()
+ actual = hypergeom(ngood, nbad, nsample * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, hypergeom, bad_ngood, nbad, nsample * 3)
+ assert_raises(ValueError, hypergeom, ngood, bad_nbad, nsample * 3)
+ assert_raises(ValueError, hypergeom, ngood, nbad, bad_nsample_one * 3)
+ assert_raises(ValueError, hypergeom, ngood, nbad, bad_nsample_two * 3)
+
+ def test_logseries(self):
+ p = [0.5]
+ bad_p_one = [2]
+ bad_p_two = [-1]
+ logseries = np.random.logseries
+ desired = np.array([1, 1, 1])
+
+ self.setSeed()
+ actual = logseries(p * 3)
+ assert_array_equal(actual, desired)
+ assert_raises(ValueError, logseries, bad_p_one * 3)
+ assert_raises(ValueError, logseries, bad_p_two * 3)
+
+class TestThread(object):
+ # make sure each state produces the same sequence even in threads
+ def setup(self):
+ self.seeds = range(4)
+
+ def check_function(self, function, sz):
+ from threading import Thread
+
+ out1 = np.empty((len(self.seeds),) + sz)
+ out2 = np.empty((len(self.seeds),) + sz)
+
+ # threaded generation
+ t = [Thread(target=function, args=(np.random.RandomState(s), o))
+ for s, o in zip(self.seeds, out1)]
+ [x.start() for x in t]
+ [x.join() for x in t]
+
+ # the same sequence, generated serially
+ for s, o in zip(self.seeds, out2):
+ function(np.random.RandomState(s), o)
+
+ # these platforms change x87 fpu precision mode in threads
+ if np.intp().dtype.itemsize == 4 and sys.platform == "win32":
+ assert_array_almost_equal(out1, out2)
+ else:
+ assert_array_equal(out1, out2)
+
+ def test_normal(self):
+ def gen_random(state, out):
+ out[...] = state.normal(size=10000)
+ self.check_function(gen_random, sz=(10000,))
+
+ def test_exp(self):
+ def gen_random(state, out):
+ out[...] = state.exponential(scale=np.ones((100, 1000)))
+ self.check_function(gen_random, sz=(100, 1000))
+
+ def test_multinomial(self):
+ def gen_random(state, out):
+ out[...] = state.multinomial(10, [1/6.]*6, size=10000)
+ self.check_function(gen_random, sz=(10000, 6))
+
+# See Issue #4263
+class TestSingleEltArrayInput(object):
+ def setup(self):
+ self.argOne = np.array([2])
+ self.argTwo = np.array([3])
+ self.argThree = np.array([4])
+ self.tgtShape = (1,)
+
+ def test_one_arg_funcs(self):
+ funcs = (np.random.exponential, np.random.standard_gamma,
+ np.random.chisquare, np.random.standard_t,
+ np.random.pareto, np.random.weibull,
+ np.random.power, np.random.rayleigh,
+ np.random.poisson, np.random.zipf,
+ np.random.geometric, np.random.logseries)
+
+ probfuncs = (np.random.geometric, np.random.logseries)
+
+ for func in funcs:
+ if func in probfuncs: # p < 1.0
+ out = func(np.array([0.5]))
+
+ else:
+ out = func(self.argOne)
+
+ assert_equal(out.shape, self.tgtShape)
+
+ def test_two_arg_funcs(self):
+ funcs = (np.random.uniform, np.random.normal,
+ np.random.beta, np.random.gamma,
+ np.random.f, np.random.noncentral_chisquare,
+ np.random.vonmises, np.random.laplace,
+ np.random.gumbel, np.random.logistic,
+ np.random.lognormal, np.random.wald,
+ np.random.binomial, np.random.negative_binomial)
+
+ probfuncs = (np.random.binomial, np.random.negative_binomial)
+
+ for func in funcs:
+ if func in probfuncs: # p <= 1
+ argTwo = np.array([0.5])
+
+ else:
+ argTwo = self.argTwo
+
+ out = func(self.argOne, argTwo)
+ assert_equal(out.shape, self.tgtShape)
+
+ out = func(self.argOne[0], argTwo)
+ assert_equal(out.shape, self.tgtShape)
+
+ out = func(self.argOne, argTwo[0])
+ assert_equal(out.shape, self.tgtShape)
+
+# TODO: Uncomment once randint can broadcast arguments
+# def test_randint(self):
+# itype = [bool, np.int8, np.uint8, np.int16, np.uint16,
+# np.int32, np.uint32, np.int64, np.uint64]
+# func = np.random.randint
+# high = np.array([1])
+# low = np.array([0])
+#
+# for dt in itype:
+# out = func(low, high, dtype=dt)
+# self.assert_equal(out.shape, self.tgtShape)
+#
+# out = func(low[0], high, dtype=dt)
+# self.assert_equal(out.shape, self.tgtShape)
+#
+# out = func(low, high[0], dtype=dt)
+# self.assert_equal(out.shape, self.tgtShape)
+
+ def test_three_arg_funcs(self):
+ funcs = [np.random.noncentral_f, np.random.triangular,
+ np.random.hypergeometric]
+
+ for func in funcs:
+ out = func(self.argOne, self.argTwo, self.argThree)
+ assert_equal(out.shape, self.tgtShape)
+
+ out = func(self.argOne[0], self.argTwo, self.argThree)
+ assert_equal(out.shape, self.tgtShape)
+
+ out = func(self.argOne, self.argTwo[0], self.argThree)
+ assert_equal(out.shape, self.tgtShape)
diff --git a/contrib/python/numpy/py2/numpy/random/tests/test_regression.py b/contrib/python/numpy/py2/numpy/random/tests/test_regression.py
new file mode 100644
index 00000000000..ca9bbbc719d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/random/tests/test_regression.py
@@ -0,0 +1,157 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+from numpy.testing import (
+ assert_, assert_array_equal, assert_raises,
+ )
+from numpy import random
+from numpy.compat import long
+import numpy as np
+
+
+class TestRegression(object):
+
+ def test_VonMises_range(self):
+ # Make sure generated random variables are in [-pi, pi].
+ # Regression test for ticket #986.
+ for mu in np.linspace(-7., 7., 5):
+ r = random.mtrand.vonmises(mu, 1, 50)
+ assert_(np.all(r > -np.pi) and np.all(r <= np.pi))
+
+ def test_hypergeometric_range(self):
+ # Test for ticket #921
+ assert_(np.all(np.random.hypergeometric(3, 18, 11, size=10) < 4))
+ assert_(np.all(np.random.hypergeometric(18, 3, 11, size=10) > 0))
+
+ # Test for ticket #5623
+ args = [
+ (2**20 - 2, 2**20 - 2, 2**20 - 2), # Check for 32-bit systems
+ ]
+ is_64bits = sys.maxsize > 2**32
+ if is_64bits and sys.platform != 'win32':
+ args.append((2**40 - 2, 2**40 - 2, 2**40 - 2)) # Check for 64-bit systems
+ for arg in args:
+ assert_(np.random.hypergeometric(*arg) > 0)
+
+ def test_logseries_convergence(self):
+ # Test for ticket #923
+ N = 1000
+ np.random.seed(0)
+ rvsn = np.random.logseries(0.8, size=N)
+ # these two frequency counts should be close to theoretical
+ # numbers with this large sample
+ # theoretical large N result is 0.49706795
+ freq = np.sum(rvsn == 1) / float(N)
+ msg = "Frequency was %f, should be > 0.45" % freq
+ assert_(freq > 0.45, msg)
+ # theoretical large N result is 0.19882718
+ freq = np.sum(rvsn == 2) / float(N)
+ msg = "Frequency was %f, should be < 0.23" % freq
+ assert_(freq < 0.23, msg)
+
+ def test_permutation_longs(self):
+ np.random.seed(1234)
+ a = np.random.permutation(12)
+ np.random.seed(1234)
+ b = np.random.permutation(long(12))
+ assert_array_equal(a, b)
+
+ def test_shuffle_mixed_dimension(self):
+ # Test for trac ticket #2074
+ for t in [[1, 2, 3, None],
+ [(1, 1), (2, 2), (3, 3), None],
+ [1, (2, 2), (3, 3), None],
+ [(1, 1), 2, 3, None]]:
+ np.random.seed(12345)
+ shuffled = list(t)
+ random.shuffle(shuffled)
+ assert_array_equal(shuffled, [t[0], t[3], t[1], t[2]])
+
+ def test_call_within_randomstate(self):
+ # Check that custom RandomState does not call into global state
+ m = np.random.RandomState()
+ res = np.array([0, 8, 7, 2, 1, 9, 4, 7, 0, 3])
+ for i in range(3):
+ np.random.seed(i)
+ m.seed(4321)
+ # If m.state is not honored, the result will change
+ assert_array_equal(m.choice(10, size=10, p=np.ones(10)/10.), res)
+
+ def test_multivariate_normal_size_types(self):
+ # Test for multivariate_normal issue with 'size' argument.
+ # Check that the multivariate_normal size argument can be a
+ # numpy integer.
+ np.random.multivariate_normal([0], [[0]], size=1)
+ np.random.multivariate_normal([0], [[0]], size=np.int_(1))
+ np.random.multivariate_normal([0], [[0]], size=np.int64(1))
+
+ def test_beta_small_parameters(self):
+ # Test that beta with small a and b parameters does not produce
+ # NaNs due to roundoff errors causing 0 / 0, gh-5851
+ np.random.seed(1234567890)
+ x = np.random.beta(0.0001, 0.0001, size=100)
+ assert_(not np.any(np.isnan(x)), 'Nans in np.random.beta')
+
+ def test_choice_sum_of_probs_tolerance(self):
+ # The sum of probs should be 1.0 with some tolerance.
+ # For low precision dtypes the tolerance was too tight.
+ # See numpy github issue 6123.
+ np.random.seed(1234)
+ a = [1, 2, 3]
+ counts = [4, 4, 2]
+ for dt in np.float16, np.float32, np.float64:
+ probs = np.array(counts, dtype=dt) / sum(counts)
+ c = np.random.choice(a, p=probs)
+ assert_(c in a)
+ assert_raises(ValueError, np.random.choice, a, p=probs*0.9)
+
+ def test_shuffle_of_array_of_different_length_strings(self):
+ # Test that permuting an array of different length strings
+ # will not cause a segfault on garbage collection
+ # Tests gh-7710
+ np.random.seed(1234)
+
+ a = np.array(['a', 'a' * 1000])
+
+ for _ in range(100):
+ np.random.shuffle(a)
+
+ # Force Garbage Collection - should not segfault.
+ import gc
+ gc.collect()
+
+ def test_shuffle_of_array_of_objects(self):
+ # Test that permuting an array of objects will not cause
+ # a segfault on garbage collection.
+ # See gh-7719
+ np.random.seed(1234)
+ a = np.array([np.arange(1), np.arange(4)])
+
+ for _ in range(1000):
+ np.random.shuffle(a)
+
+ # Force Garbage Collection - should not segfault.
+ import gc
+ gc.collect()
+
+ def test_permutation_subclass(self):
+ class N(np.ndarray):
+ pass
+
+ np.random.seed(1)
+ orig = np.arange(3).view(N)
+ perm = np.random.permutation(orig)
+ assert_array_equal(perm, np.array([0, 2, 1]))
+ assert_array_equal(orig, np.arange(3).view(N))
+
+ class M(object):
+ a = np.arange(5)
+
+ def __array__(self):
+ return self.a
+
+ np.random.seed(1)
+ m = M()
+ perm = np.random.permutation(m)
+ assert_array_equal(perm, np.array([2, 1, 4, 0, 3]))
+ assert_array_equal(m.__array__(), np.arange(5))
diff --git a/contrib/python/numpy/py2/numpy/setup.py b/contrib/python/numpy/py2/numpy/setup.py
new file mode 100644
index 00000000000..4ccdaeea5e9
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/setup.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+from __future__ import division, print_function
+
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('numpy', parent_package, top_path)
+
+ config.add_subpackage('compat')
+ config.add_subpackage('core')
+ config.add_subpackage('distutils')
+ config.add_subpackage('doc')
+ config.add_subpackage('f2py')
+ config.add_subpackage('fft')
+ config.add_subpackage('lib')
+ config.add_subpackage('linalg')
+ config.add_subpackage('ma')
+ config.add_subpackage('matrixlib')
+ config.add_subpackage('polynomial')
+ config.add_subpackage('random')
+ config.add_subpackage('testing')
+ config.add_data_dir('doc')
+ config.add_data_dir('tests')
+ config.make_config_py() # installs __config__.py
+ return config
+
+if __name__ == '__main__':
+ print('This is the wrong setup.py file to run')
diff --git a/contrib/python/numpy/py2/numpy/testing/setup.py b/contrib/python/numpy/py2/numpy/testing/setup.py
new file mode 100755
index 00000000000..7c3f2fbdfd6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/testing/setup.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+from __future__ import division, print_function
+
+
+def configuration(parent_package='',top_path=None):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration('testing', parent_package, top_path)
+
+ config.add_subpackage('_private')
+ config.add_data_dir('tests')
+ return config
+
+if __name__ == '__main__':
+ from numpy.distutils.core import setup
+ setup(maintainer="NumPy Developers",
+ maintainer_email="[email protected]",
+ description="NumPy test module",
+ url="https://www.numpy.org",
+ license="NumPy License (BSD Style)",
+ configuration=configuration,
+ )
diff --git a/contrib/python/numpy/py2/numpy/testing/tests/__init__.py b/contrib/python/numpy/py2/numpy/testing/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/testing/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/testing/tests/test_decorators.py b/contrib/python/numpy/py2/numpy/testing/tests/test_decorators.py
new file mode 100644
index 00000000000..c029bf90c11
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/testing/tests/test_decorators.py
@@ -0,0 +1,216 @@
+"""
+Test the decorators from ``testing.decorators``.
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import warnings
+import pytest
+
+from numpy.testing import (
+ assert_, assert_raises, dec, SkipTest, KnownFailureException,
+ )
+
+
+try:
+ with warnings.catch_warnings():
+ warnings.simplefilter("always")
+ import nose # noqa: F401
+except ImportError:
+ HAVE_NOSE = False
+else:
+ HAVE_NOSE = True
+
+
+@pytest.mark.skipif(not HAVE_NOSE, reason="Needs nose")
+class TestNoseDecorators(object):
+ # These tests are run in a class for simplicity while still
+ # getting a report on each, skipped or success.
+
+ class DidntSkipException(Exception):
+ pass
+
+ def test_slow(self):
+ @dec.slow
+ def slow_func(x, y, z):
+ pass
+
+ assert_(slow_func.slow)
+
+ def test_setastest(self):
+ @dec.setastest()
+ def f_default(a):
+ pass
+
+ @dec.setastest(True)
+ def f_istest(a):
+ pass
+
+ @dec.setastest(False)
+ def f_isnottest(a):
+ pass
+
+ assert_(f_default.__test__)
+ assert_(f_istest.__test__)
+ assert_(not f_isnottest.__test__)
+
+ def test_skip_functions_hardcoded(self):
+ @dec.skipif(True)
+ def f1(x):
+ raise self.DidntSkipException
+
+ try:
+ f1('a')
+ except self.DidntSkipException:
+ raise Exception('Failed to skip')
+ except SkipTest().__class__:
+ pass
+
+ @dec.skipif(False)
+ def f2(x):
+ raise self.DidntSkipException
+
+ try:
+ f2('a')
+ except self.DidntSkipException:
+ pass
+ except SkipTest().__class__:
+ raise Exception('Skipped when not expected to')
+
+ def test_skip_functions_callable(self):
+ def skip_tester():
+ return skip_flag == 'skip me!'
+
+ @dec.skipif(skip_tester)
+ def f1(x):
+ raise self.DidntSkipException
+
+ try:
+ skip_flag = 'skip me!'
+ f1('a')
+ except self.DidntSkipException:
+ raise Exception('Failed to skip')
+ except SkipTest().__class__:
+ pass
+
+ @dec.skipif(skip_tester)
+ def f2(x):
+ raise self.DidntSkipException
+
+ try:
+ skip_flag = 'five is right out!'
+ f2('a')
+ except self.DidntSkipException:
+ pass
+ except SkipTest().__class__:
+ raise Exception('Skipped when not expected to')
+
+ def test_skip_generators_hardcoded(self):
+ @dec.knownfailureif(True, "This test is known to fail")
+ def g1(x):
+ for i in range(x):
+ yield i
+
+ try:
+ for j in g1(10):
+ pass
+ except KnownFailureException().__class__:
+ pass
+ else:
+ raise Exception('Failed to mark as known failure')
+
+ @dec.knownfailureif(False, "This test is NOT known to fail")
+ def g2(x):
+ for i in range(x):
+ yield i
+ raise self.DidntSkipException('FAIL')
+
+ try:
+ for j in g2(10):
+ pass
+ except KnownFailureException().__class__:
+ raise Exception('Marked incorrectly as known failure')
+ except self.DidntSkipException:
+ pass
+
+ def test_skip_generators_callable(self):
+ def skip_tester():
+ return skip_flag == 'skip me!'
+
+ @dec.knownfailureif(skip_tester, "This test is known to fail")
+ def g1(x):
+ for i in range(x):
+ yield i
+
+ try:
+ skip_flag = 'skip me!'
+ for j in g1(10):
+ pass
+ except KnownFailureException().__class__:
+ pass
+ else:
+ raise Exception('Failed to mark as known failure')
+
+ @dec.knownfailureif(skip_tester, "This test is NOT known to fail")
+ def g2(x):
+ for i in range(x):
+ yield i
+ raise self.DidntSkipException('FAIL')
+
+ try:
+ skip_flag = 'do not skip'
+ for j in g2(10):
+ pass
+ except KnownFailureException().__class__:
+ raise Exception('Marked incorrectly as known failure')
+ except self.DidntSkipException:
+ pass
+
+ def test_deprecated(self):
+ @dec.deprecated(True)
+ def non_deprecated_func():
+ pass
+
+ @dec.deprecated()
+ def deprecated_func():
+ import warnings
+ warnings.warn("TEST: deprecated func", DeprecationWarning)
+
+ @dec.deprecated()
+ def deprecated_func2():
+ import warnings
+ warnings.warn("AHHHH")
+ raise ValueError
+
+ @dec.deprecated()
+ def deprecated_func3():
+ import warnings
+ warnings.warn("AHHHH")
+
+ # marked as deprecated, but does not raise DeprecationWarning
+ assert_raises(AssertionError, non_deprecated_func)
+ # should be silent
+ deprecated_func()
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("always") # do not propagate unrelated warnings
+ # fails if deprecated decorator just disables test. See #1453.
+ assert_raises(ValueError, deprecated_func2)
+ # warning is not a DeprecationWarning
+ assert_raises(AssertionError, deprecated_func3)
+
+ def test_parametrize(self):
+ # dec.parametrize assumes that it is being run by nose. Because
+ # we are running under pytest, we need to explicitly check the
+ # results.
+ @dec.parametrize('base, power, expected',
+ [(1, 1, 1),
+ (2, 1, 2),
+ (2, 2, 4)])
+ def check_parametrize(base, power, expected):
+ assert_(base**power == expected)
+
+ count = 0
+ for test in check_parametrize():
+ test[0](*test[1:])
+ count += 1
+ assert_(count == 3)
diff --git a/contrib/python/numpy/py2/numpy/testing/tests/test_doctesting.py b/contrib/python/numpy/py2/numpy/testing/tests/test_doctesting.py
new file mode 100644
index 00000000000..b77cd93e0b5
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/testing/tests/test_doctesting.py
@@ -0,0 +1,59 @@
+""" Doctests for NumPy-specific nose/doctest modifications
+
+"""
+from __future__ import division, absolute_import, print_function
+
+#FIXME: None of these tests is run, because 'check' is not a recognized
+# testing prefix.
+
+# try the #random directive on the output line
+def check_random_directive():
+ '''
+ >>> 2+2
+ <BadExample object at 0x084D05AC> #random: may vary on your system
+ '''
+
+# check the implicit "import numpy as np"
+def check_implicit_np():
+ '''
+ >>> np.array([1,2,3])
+ array([1, 2, 3])
+ '''
+
+# there's some extraneous whitespace around the correct responses
+def check_whitespace_enabled():
+ '''
+ # whitespace after the 3
+ >>> 1+2
+ 3
+
+ # whitespace before the 7
+ >>> 3+4
+ 7
+ '''
+
+def check_empty_output():
+ """ Check that producing no output does not cause an error.
+
+ This is related to nose bug 445; the numpy plugin changed the
+ doctest-result-variable default and therefore hit this bug:
+ http://code.google.com/p/python-nose/issues/detail?id=445
+
+ >>> a = 10
+ """
+
+def check_skip():
+ """ Check skip directive
+
+ The test below should not run
+
+ >>> 1/0 #doctest: +SKIP
+ """
+
+
+if __name__ == '__main__':
+ # Run tests outside numpy test rig
+ import nose
+ from numpy.testing.noseclasses import NumpyDoctest
+ argv = ['', __file__, '--with-numpydoctest']
+ nose.core.TestProgram(argv=argv, addplugins=[NumpyDoctest()])
diff --git a/contrib/python/numpy/py2/numpy/testing/tests/test_utils.py b/contrib/python/numpy/py2/numpy/testing/tests/test_utils.py
new file mode 100644
index 00000000000..7f6cbb8fe76
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/testing/tests/test_utils.py
@@ -0,0 +1,1597 @@
+from __future__ import division, absolute_import, print_function
+
+import warnings
+import sys
+import os
+import itertools
+import textwrap
+import pytest
+import weakref
+
+import numpy as np
+from numpy.testing import (
+ assert_equal, assert_array_equal, assert_almost_equal,
+ assert_array_almost_equal, assert_array_less, build_err_msg, raises,
+ assert_raises, assert_warns, assert_no_warnings, assert_allclose,
+ assert_approx_equal, assert_array_almost_equal_nulp, assert_array_max_ulp,
+ clear_and_catch_warnings, suppress_warnings, assert_string_equal, assert_,
+ tempdir, temppath, assert_no_gc_cycles, HAS_REFCOUNT
+ )
+from numpy.core.overrides import ENABLE_ARRAY_FUNCTION
+
+
+class _GenericTest(object):
+
+ def _test_equal(self, a, b):
+ self._assert_func(a, b)
+
+ def _test_not_equal(self, a, b):
+ with assert_raises(AssertionError):
+ self._assert_func(a, b)
+
+ def test_array_rank1_eq(self):
+ """Test that two equal rank-1 arrays are found equal."""
+ a = np.array([1, 2])
+ b = np.array([1, 2])
+
+ self._test_equal(a, b)
+
+ def test_array_rank1_noteq(self):
+ """Test that two different rank-1 arrays are found not equal."""
+ a = np.array([1, 2])
+ b = np.array([2, 2])
+
+ self._test_not_equal(a, b)
+
+ def test_array_rank2_eq(self):
+ """Test that two equal rank-2 arrays are found equal."""
+ a = np.array([[1, 2], [3, 4]])
+ b = np.array([[1, 2], [3, 4]])
+
+ self._test_equal(a, b)
+
+ def test_array_diffshape(self):
+ """Test two arrays with different shapes are found not equal."""
+ a = np.array([1, 2])
+ b = np.array([[1, 2], [1, 2]])
+
+ self._test_not_equal(a, b)
+
+ def test_objarray(self):
+ """Test object arrays."""
+ a = np.array([1, 1], dtype=object)
+ self._test_equal(a, 1)
+
+ def test_array_likes(self):
+ self._test_equal([1, 2, 3], (1, 2, 3))
+
+
+class TestArrayEqual(_GenericTest):
+
+ def setup(self):
+ self._assert_func = assert_array_equal
+
+ def test_generic_rank1(self):
+ """Test rank 1 array for all dtypes."""
+ def foo(t):
+ a = np.empty(2, t)
+ a.fill(1)
+ b = a.copy()
+ c = a.copy()
+ c.fill(0)
+ self._test_equal(a, b)
+ self._test_not_equal(c, b)
+
+ # Test numeric types and object
+ for t in '?bhilqpBHILQPfdgFDG':
+ foo(t)
+
+ # Test strings
+ for t in ['S1', 'U1']:
+ foo(t)
+
+ def test_0_ndim_array(self):
+ x = np.array(473963742225900817127911193656584771)
+ y = np.array(18535119325151578301457182298393896)
+ assert_raises(AssertionError, self._assert_func, x, y)
+
+ y = x
+ self._assert_func(x, y)
+
+ x = np.array(43)
+ y = np.array(10)
+ assert_raises(AssertionError, self._assert_func, x, y)
+
+ y = x
+ self._assert_func(x, y)
+
+ def test_generic_rank3(self):
+ """Test rank 3 array for all dtypes."""
+ def foo(t):
+ a = np.empty((4, 2, 3), t)
+ a.fill(1)
+ b = a.copy()
+ c = a.copy()
+ c.fill(0)
+ self._test_equal(a, b)
+ self._test_not_equal(c, b)
+
+ # Test numeric types and object
+ for t in '?bhilqpBHILQPfdgFDG':
+ foo(t)
+
+ # Test strings
+ for t in ['S1', 'U1']:
+ foo(t)
+
+ def test_nan_array(self):
+ """Test arrays with nan values in them."""
+ a = np.array([1, 2, np.nan])
+ b = np.array([1, 2, np.nan])
+
+ self._test_equal(a, b)
+
+ c = np.array([1, 2, 3])
+ self._test_not_equal(c, b)
+
+ def test_string_arrays(self):
+ """Test that string arrays compare equal and unequal correctly."""
+ a = np.array(['floupi', 'floupa'])
+ b = np.array(['floupi', 'floupa'])
+
+ self._test_equal(a, b)
+
+ c = np.array(['floupipi', 'floupa'])
+
+ self._test_not_equal(c, b)
+
+ def test_recarrays(self):
+ """Test record arrays."""
+ a = np.empty(2, [('floupi', float), ('floupa', float)])
+ a['floupi'] = [1, 2]
+ a['floupa'] = [1, 2]
+ b = a.copy()
+
+ self._test_equal(a, b)
+
+ c = np.empty(2, [('floupipi', float), ('floupa', float)])
+ c['floupipi'] = a['floupi'].copy()
+ c['floupa'] = a['floupa'].copy()
+
+ with suppress_warnings() as sup:
+ l = sup.record(FutureWarning, message="elementwise == ")
+ self._test_not_equal(c, b)
+ assert_equal(len(l), 1)
+
+ def test_masked_nan_inf(self):
+ # Regression test for gh-11121
+ a = np.ma.MaskedArray([3., 4., 6.5], mask=[False, True, False])
+ b = np.array([3., np.nan, 6.5])
+ self._test_equal(a, b)
+ self._test_equal(b, a)
+ a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, False, False])
+ b = np.array([np.inf, 4., 6.5])
+ self._test_equal(a, b)
+ self._test_equal(b, a)
+
+ def test_subclass_that_overrides_eq(self):
+ # While we cannot guarantee testing functions will always work for
+ # subclasses, the tests should ideally rely only on subclasses having
+ # comparison operators, not on them being able to store booleans
+ # (which, e.g., astropy Quantity cannot usefully do). See gh-8452.
+ class MyArray(np.ndarray):
+ def __eq__(self, other):
+ return bool(np.equal(self, other).all())
+
+ def __ne__(self, other):
+ return not self == other
+
+ a = np.array([1., 2.]).view(MyArray)
+ b = np.array([2., 3.]).view(MyArray)
+ assert_(type(a == a), bool)
+ assert_(a == a)
+ assert_(a != b)
+ self._test_equal(a, a)
+ self._test_not_equal(a, b)
+ self._test_not_equal(b, a)
+
+ @pytest.mark.skipif(
+ not ENABLE_ARRAY_FUNCTION, reason='requires __array_function__')
+ def test_subclass_that_does_not_implement_npall(self):
+ class MyArray(np.ndarray):
+ def __array_function__(self, *args, **kwargs):
+ return NotImplemented
+
+ a = np.array([1., 2.]).view(MyArray)
+ b = np.array([2., 3.]).view(MyArray)
+ with assert_raises(TypeError):
+ np.all(a)
+ self._test_equal(a, a)
+ self._test_not_equal(a, b)
+ self._test_not_equal(b, a)
+
+
+class TestBuildErrorMessage(object):
+
+ def test_build_err_msg_defaults(self):
+ x = np.array([1.00001, 2.00002, 3.00003])
+ y = np.array([1.00002, 2.00003, 3.00004])
+ err_msg = 'There is a mismatch'
+
+ a = build_err_msg([x, y], err_msg)
+ b = ('\nItems are not equal: There is a mismatch\n ACTUAL: array(['
+ '1.00001, 2.00002, 3.00003])\n DESIRED: array([1.00002, '
+ '2.00003, 3.00004])')
+ assert_equal(a, b)
+
+ def test_build_err_msg_no_verbose(self):
+ x = np.array([1.00001, 2.00002, 3.00003])
+ y = np.array([1.00002, 2.00003, 3.00004])
+ err_msg = 'There is a mismatch'
+
+ a = build_err_msg([x, y], err_msg, verbose=False)
+ b = '\nItems are not equal: There is a mismatch'
+ assert_equal(a, b)
+
+ def test_build_err_msg_custom_names(self):
+ x = np.array([1.00001, 2.00002, 3.00003])
+ y = np.array([1.00002, 2.00003, 3.00004])
+ err_msg = 'There is a mismatch'
+
+ a = build_err_msg([x, y], err_msg, names=('FOO', 'BAR'))
+ b = ('\nItems are not equal: There is a mismatch\n FOO: array(['
+ '1.00001, 2.00002, 3.00003])\n BAR: array([1.00002, 2.00003, '
+ '3.00004])')
+ assert_equal(a, b)
+
+ def test_build_err_msg_custom_precision(self):
+ x = np.array([1.000000001, 2.00002, 3.00003])
+ y = np.array([1.000000002, 2.00003, 3.00004])
+ err_msg = 'There is a mismatch'
+
+ a = build_err_msg([x, y], err_msg, precision=10)
+ b = ('\nItems are not equal: There is a mismatch\n ACTUAL: array(['
+ '1.000000001, 2.00002 , 3.00003 ])\n DESIRED: array(['
+ '1.000000002, 2.00003 , 3.00004 ])')
+ assert_equal(a, b)
+
+
+class TestEqual(TestArrayEqual):
+
+ def setup(self):
+ self._assert_func = assert_equal
+
+ def test_nan_items(self):
+ self._assert_func(np.nan, np.nan)
+ self._assert_func([np.nan], [np.nan])
+ self._test_not_equal(np.nan, [np.nan])
+ self._test_not_equal(np.nan, 1)
+
+ def test_inf_items(self):
+ self._assert_func(np.inf, np.inf)
+ self._assert_func([np.inf], [np.inf])
+ self._test_not_equal(np.inf, [np.inf])
+
+ def test_datetime(self):
+ self._test_equal(
+ np.datetime64("2017-01-01", "s"),
+ np.datetime64("2017-01-01", "s")
+ )
+ self._test_equal(
+ np.datetime64("2017-01-01", "s"),
+ np.datetime64("2017-01-01", "m")
+ )
+
+ # gh-10081
+ self._test_not_equal(
+ np.datetime64("2017-01-01", "s"),
+ np.datetime64("2017-01-02", "s")
+ )
+ self._test_not_equal(
+ np.datetime64("2017-01-01", "s"),
+ np.datetime64("2017-01-02", "m")
+ )
+
+ def test_nat_items(self):
+ # not a datetime
+ nadt_no_unit = np.datetime64("NaT")
+ nadt_s = np.datetime64("NaT", "s")
+ nadt_d = np.datetime64("NaT", "ns")
+ # not a timedelta
+ natd_no_unit = np.timedelta64("NaT")
+ natd_s = np.timedelta64("NaT", "s")
+ natd_d = np.timedelta64("NaT", "ns")
+
+ dts = [nadt_no_unit, nadt_s, nadt_d]
+ tds = [natd_no_unit, natd_s, natd_d]
+ for a, b in itertools.product(dts, dts):
+ self._assert_func(a, b)
+ self._assert_func([a], [b])
+ self._test_not_equal([a], b)
+
+ for a, b in itertools.product(tds, tds):
+ self._assert_func(a, b)
+ self._assert_func([a], [b])
+ self._test_not_equal([a], b)
+
+ for a, b in itertools.product(tds, dts):
+ self._test_not_equal(a, b)
+ self._test_not_equal(a, [b])
+ self._test_not_equal([a], [b])
+ self._test_not_equal([a], np.datetime64("2017-01-01", "s"))
+ self._test_not_equal([b], np.datetime64("2017-01-01", "s"))
+ self._test_not_equal([a], np.timedelta64(123, "s"))
+ self._test_not_equal([b], np.timedelta64(123, "s"))
+
+ def test_non_numeric(self):
+ self._assert_func('ab', 'ab')
+ self._test_not_equal('ab', 'abb')
+
+ def test_complex_item(self):
+ self._assert_func(complex(1, 2), complex(1, 2))
+ self._assert_func(complex(1, np.nan), complex(1, np.nan))
+ self._test_not_equal(complex(1, np.nan), complex(1, 2))
+ self._test_not_equal(complex(np.nan, 1), complex(1, np.nan))
+ self._test_not_equal(complex(np.nan, np.inf), complex(np.nan, 2))
+
+ def test_negative_zero(self):
+ self._test_not_equal(np.PZERO, np.NZERO)
+
+ def test_complex(self):
+ x = np.array([complex(1, 2), complex(1, np.nan)])
+ y = np.array([complex(1, 2), complex(1, 2)])
+ self._assert_func(x, x)
+ self._test_not_equal(x, y)
+
+ def test_error_message(self):
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(np.array([1, 2]), np.array([[1, 2]]))
+ msg = str(exc_info.value)
+ msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
+ msg_reference = textwrap.dedent("""\
+
+ Arrays are not equal
+
+ (shapes (2,), (1, 2) mismatch)
+ x: array([1, 2])
+ y: array([[1, 2]])""")
+
+ try:
+ assert_equal(msg, msg_reference)
+ except AssertionError:
+ assert_equal(msg2, msg_reference)
+
+ def test_object(self):
+ #gh-12942
+ import datetime
+ a = np.array([datetime.datetime(2000, 1, 1),
+ datetime.datetime(2000, 1, 2)])
+ self._test_not_equal(a, a[::-1])
+
+
+class TestArrayAlmostEqual(_GenericTest):
+
+ def setup(self):
+ self._assert_func = assert_array_almost_equal
+
+ def test_closeness(self):
+ # Note that in the course of time we ended up with
+ # `abs(x - y) < 1.5 * 10**(-decimal)`
+ # instead of the previously documented
+ # `abs(x - y) < 0.5 * 10**(-decimal)`
+ # so this check serves to preserve the wrongness.
+
+ # test scalars
+ self._assert_func(1.499999, 0.0, decimal=0)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(1.5, 0.0, decimal=0))
+
+ # test arrays
+ self._assert_func([1.499999], [0.0], decimal=0)
+ assert_raises(AssertionError,
+ lambda: self._assert_func([1.5], [0.0], decimal=0))
+
+ def test_simple(self):
+ x = np.array([1234.2222])
+ y = np.array([1234.2223])
+
+ self._assert_func(x, y, decimal=3)
+ self._assert_func(x, y, decimal=4)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(x, y, decimal=5))
+
+ def test_nan(self):
+ anan = np.array([np.nan])
+ aone = np.array([1])
+ ainf = np.array([np.inf])
+ self._assert_func(anan, anan)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(anan, aone))
+ assert_raises(AssertionError,
+ lambda: self._assert_func(anan, ainf))
+ assert_raises(AssertionError,
+ lambda: self._assert_func(ainf, anan))
+
+ def test_inf(self):
+ a = np.array([[1., 2.], [3., 4.]])
+ b = a.copy()
+ a[0, 0] = np.inf
+ assert_raises(AssertionError,
+ lambda: self._assert_func(a, b))
+ b[0, 0] = -np.inf
+ assert_raises(AssertionError,
+ lambda: self._assert_func(a, b))
+
+ def test_subclass(self):
+ a = np.array([[1., 2.], [3., 4.]])
+ b = np.ma.masked_array([[1., 2.], [0., 4.]],
+ [[False, False], [True, False]])
+ self._assert_func(a, b)
+ self._assert_func(b, a)
+ self._assert_func(b, b)
+
+ # Test fully masked as well (see gh-11123).
+ a = np.ma.MaskedArray(3.5, mask=True)
+ b = np.array([3., 4., 6.5])
+ self._test_equal(a, b)
+ self._test_equal(b, a)
+ a = np.ma.masked
+ b = np.array([3., 4., 6.5])
+ self._test_equal(a, b)
+ self._test_equal(b, a)
+ a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, True, True])
+ b = np.array([1., 2., 3.])
+ self._test_equal(a, b)
+ self._test_equal(b, a)
+ a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, True, True])
+ b = np.array(1.)
+ self._test_equal(a, b)
+ self._test_equal(b, a)
+
+ def test_subclass_that_cannot_be_bool(self):
+ # While we cannot guarantee testing functions will always work for
+ # subclasses, the tests should ideally rely only on subclasses having
+ # comparison operators, not on them being able to store booleans
+ # (which, e.g., astropy Quantity cannot usefully do). See gh-8452.
+ class MyArray(np.ndarray):
+ def __eq__(self, other):
+ return super(MyArray, self).__eq__(other).view(np.ndarray)
+
+ def __lt__(self, other):
+ return super(MyArray, self).__lt__(other).view(np.ndarray)
+
+ def all(self, *args, **kwargs):
+ raise NotImplementedError
+
+ a = np.array([1., 2.]).view(MyArray)
+ self._assert_func(a, a)
+
+
+class TestAlmostEqual(_GenericTest):
+
+ def setup(self):
+ self._assert_func = assert_almost_equal
+
+ def test_closeness(self):
+ # Note that in the course of time we ended up with
+ # `abs(x - y) < 1.5 * 10**(-decimal)`
+ # instead of the previously documented
+ # `abs(x - y) < 0.5 * 10**(-decimal)`
+ # so this check serves to preserve the wrongness.
+
+ # test scalars
+ self._assert_func(1.499999, 0.0, decimal=0)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(1.5, 0.0, decimal=0))
+
+ # test arrays
+ self._assert_func([1.499999], [0.0], decimal=0)
+ assert_raises(AssertionError,
+ lambda: self._assert_func([1.5], [0.0], decimal=0))
+
+ def test_nan_item(self):
+ self._assert_func(np.nan, np.nan)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(np.nan, 1))
+ assert_raises(AssertionError,
+ lambda: self._assert_func(np.nan, np.inf))
+ assert_raises(AssertionError,
+ lambda: self._assert_func(np.inf, np.nan))
+
+ def test_inf_item(self):
+ self._assert_func(np.inf, np.inf)
+ self._assert_func(-np.inf, -np.inf)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(np.inf, 1))
+ assert_raises(AssertionError,
+ lambda: self._assert_func(-np.inf, np.inf))
+
+ def test_simple_item(self):
+ self._test_not_equal(1, 2)
+
+ def test_complex_item(self):
+ self._assert_func(complex(1, 2), complex(1, 2))
+ self._assert_func(complex(1, np.nan), complex(1, np.nan))
+ self._assert_func(complex(np.inf, np.nan), complex(np.inf, np.nan))
+ self._test_not_equal(complex(1, np.nan), complex(1, 2))
+ self._test_not_equal(complex(np.nan, 1), complex(1, np.nan))
+ self._test_not_equal(complex(np.nan, np.inf), complex(np.nan, 2))
+
+ def test_complex(self):
+ x = np.array([complex(1, 2), complex(1, np.nan)])
+ z = np.array([complex(1, 2), complex(np.nan, 1)])
+ y = np.array([complex(1, 2), complex(1, 2)])
+ self._assert_func(x, x)
+ self._test_not_equal(x, y)
+ self._test_not_equal(x, z)
+
+ def test_error_message(self):
+        """Check the message is formatted correctly for the decimal value.
+        Also check the message when input includes inf or nan (gh-12200)."""
+ x = np.array([1.00000000001, 2.00000000002, 3.00003])
+ y = np.array([1.00000000002, 2.00000000003, 3.00004])
+
+ # Test with a different amount of decimal digits
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(x, y, decimal=12)
+ msgs = str(exc_info.value).split('\n')
+ assert_equal(msgs[3], 'Mismatched elements: 3 / 3 (100%)')
+ assert_equal(msgs[4], 'Max absolute difference: 1.e-05')
+ assert_equal(msgs[5], 'Max relative difference: 3.33328889e-06')
+ assert_equal(
+ msgs[6],
+ ' x: array([1.00000000001, 2.00000000002, 3.00003 ])')
+ assert_equal(
+ msgs[7],
+ ' y: array([1.00000000002, 2.00000000003, 3.00004 ])')
+
+ # With the default value of decimal digits, only the 3rd element
+ # differs. Note that we only check for the formatting of the arrays
+ # themselves.
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(x, y)
+ msgs = str(exc_info.value).split('\n')
+ assert_equal(msgs[3], 'Mismatched elements: 1 / 3 (33.3%)')
+ assert_equal(msgs[4], 'Max absolute difference: 1.e-05')
+ assert_equal(msgs[5], 'Max relative difference: 3.33328889e-06')
+ assert_equal(msgs[6], ' x: array([1. , 2. , 3.00003])')
+ assert_equal(msgs[7], ' y: array([1. , 2. , 3.00004])')
+
+ # Check the error message when input includes inf
+ x = np.array([np.inf, 0])
+ y = np.array([np.inf, 1])
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(x, y)
+ msgs = str(exc_info.value).split('\n')
+ assert_equal(msgs[3], 'Mismatched elements: 1 / 2 (50%)')
+ assert_equal(msgs[4], 'Max absolute difference: 1.')
+ assert_equal(msgs[5], 'Max relative difference: 1.')
+ assert_equal(msgs[6], ' x: array([inf, 0.])')
+ assert_equal(msgs[7], ' y: array([inf, 1.])')
+
+ # Check the error message when dividing by zero
+ x = np.array([1, 2])
+ y = np.array([0, 0])
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(x, y)
+ msgs = str(exc_info.value).split('\n')
+ assert_equal(msgs[3], 'Mismatched elements: 2 / 2 (100%)')
+ assert_equal(msgs[4], 'Max absolute difference: 2')
+ assert_equal(msgs[5], 'Max relative difference: inf')
+
+ def test_error_message_2(self):
+ """Check the message is formatted correctly when either x or y is a scalar."""
+ x = 2
+ y = np.ones(20)
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(x, y)
+ msgs = str(exc_info.value).split('\n')
+ assert_equal(msgs[3], 'Mismatched elements: 20 / 20 (100%)')
+ assert_equal(msgs[4], 'Max absolute difference: 1.')
+ assert_equal(msgs[5], 'Max relative difference: 1.')
+
+ y = 2
+ x = np.ones(20)
+ with pytest.raises(AssertionError) as exc_info:
+ self._assert_func(x, y)
+ msgs = str(exc_info.value).split('\n')
+ assert_equal(msgs[3], 'Mismatched elements: 20 / 20 (100%)')
+ assert_equal(msgs[4], 'Max absolute difference: 1.')
+ assert_equal(msgs[5], 'Max relative difference: 0.5')
+
+ def test_subclass_that_cannot_be_bool(self):
+ # While we cannot guarantee testing functions will always work for
+ # subclasses, the tests should ideally rely only on subclasses having
+ # comparison operators, not on them being able to store booleans
+ # (which, e.g., astropy Quantity cannot usefully do). See gh-8452.
+ class MyArray(np.ndarray):
+ def __eq__(self, other):
+ return super(MyArray, self).__eq__(other).view(np.ndarray)
+
+ def __lt__(self, other):
+ return super(MyArray, self).__lt__(other).view(np.ndarray)
+
+ def all(self, *args, **kwargs):
+ raise NotImplementedError
+
+ a = np.array([1., 2.]).view(MyArray)
+ self._assert_func(a, a)
+
+
+class TestApproxEqual(object):
+
+ def setup(self):
+ self._assert_func = assert_approx_equal
+
+ def test_simple_0d_arrays(self):
+ x = np.array(1234.22)
+ y = np.array(1234.23)
+
+ self._assert_func(x, y, significant=5)
+ self._assert_func(x, y, significant=6)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(x, y, significant=7))
+
+ def test_simple_items(self):
+ x = 1234.22
+ y = 1234.23
+
+ self._assert_func(x, y, significant=4)
+ self._assert_func(x, y, significant=5)
+ self._assert_func(x, y, significant=6)
+ assert_raises(AssertionError,
+ lambda: self._assert_func(x, y, significant=7))
+
+ def test_nan_array(self):
+ anan = np.array(np.nan)
+ aone = np.array(1)
+ ainf = np.array(np.inf)
+ self._assert_func(anan, anan)
+ assert_raises(AssertionError, lambda: self._assert_func(anan, aone))
+ assert_raises(AssertionError, lambda: self._assert_func(anan, ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, anan))
+
+ def test_nan_items(self):
+ anan = np.array(np.nan)
+ aone = np.array(1)
+ ainf = np.array(np.inf)
+ self._assert_func(anan, anan)
+ assert_raises(AssertionError, lambda: self._assert_func(anan, aone))
+ assert_raises(AssertionError, lambda: self._assert_func(anan, ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, anan))
+
+
+class TestArrayAssertLess(object):
+
+ def setup(self):
+ self._assert_func = assert_array_less
+
+ def test_simple_arrays(self):
+ x = np.array([1.1, 2.2])
+ y = np.array([1.2, 2.3])
+
+ self._assert_func(x, y)
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ y = np.array([1.0, 2.3])
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, y))
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ def test_rank2(self):
+ x = np.array([[1.1, 2.2], [3.3, 4.4]])
+ y = np.array([[1.2, 2.3], [3.4, 4.5]])
+
+ self._assert_func(x, y)
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ y = np.array([[1.0, 2.3], [3.4, 4.5]])
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, y))
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ def test_rank3(self):
+ x = np.ones(shape=(2, 2, 2))
+ y = np.ones(shape=(2, 2, 2))+1
+
+ self._assert_func(x, y)
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ y[0, 0, 0] = 0
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, y))
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ def test_simple_items(self):
+ x = 1.1
+ y = 2.2
+
+ self._assert_func(x, y)
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ y = np.array([2.2, 3.3])
+
+ self._assert_func(x, y)
+ assert_raises(AssertionError, lambda: self._assert_func(y, x))
+
+ y = np.array([1.0, 3.3])
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, y))
+
+ def test_nan_noncompare(self):
+ anan = np.array(np.nan)
+ aone = np.array(1)
+ ainf = np.array(np.inf)
+ self._assert_func(anan, anan)
+ assert_raises(AssertionError, lambda: self._assert_func(aone, anan))
+ assert_raises(AssertionError, lambda: self._assert_func(anan, aone))
+ assert_raises(AssertionError, lambda: self._assert_func(anan, ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, anan))
+
+ def test_nan_noncompare_array(self):
+ x = np.array([1.1, 2.2, 3.3])
+ anan = np.array(np.nan)
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, anan))
+ assert_raises(AssertionError, lambda: self._assert_func(anan, x))
+
+ x = np.array([1.1, 2.2, np.nan])
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, anan))
+ assert_raises(AssertionError, lambda: self._assert_func(anan, x))
+
+ y = np.array([1.0, 2.0, np.nan])
+
+ self._assert_func(y, x)
+ assert_raises(AssertionError, lambda: self._assert_func(x, y))
+
+ def test_inf_compare(self):
+ aone = np.array(1)
+ ainf = np.array(np.inf)
+
+ self._assert_func(aone, ainf)
+ self._assert_func(-ainf, aone)
+ self._assert_func(-ainf, ainf)
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, aone))
+ assert_raises(AssertionError, lambda: self._assert_func(aone, -ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, -ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(-ainf, -ainf))
+
+ def test_inf_compare_array(self):
+ x = np.array([1.1, 2.2, np.inf])
+ ainf = np.array(np.inf)
+
+ assert_raises(AssertionError, lambda: self._assert_func(x, ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(ainf, x))
+ assert_raises(AssertionError, lambda: self._assert_func(x, -ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(-x, -ainf))
+ assert_raises(AssertionError, lambda: self._assert_func(-ainf, -x))
+ self._assert_func(-ainf, x)
+
+
[email protected](reason="The raises decorator depends on Nose")
+class TestRaises(object):
+
+ def setup(self):
+ class MyException(Exception):
+ pass
+
+ self.e = MyException
+
+ def raises_exception(self, e):
+ raise e
+
+ def does_not_raise_exception(self):
+ pass
+
+ def test_correct_catch(self):
+ raises(self.e)(self.raises_exception)(self.e) # raises?
+
+ def test_wrong_exception(self):
+ try:
+ raises(self.e)(self.raises_exception)(RuntimeError) # raises?
+ except RuntimeError:
+ return
+ else:
+ raise AssertionError("should have caught RuntimeError")
+
+ def test_catch_no_raise(self):
+ try:
+ raises(self.e)(self.does_not_raise_exception)() # raises?
+ except AssertionError:
+ return
+ else:
+ raise AssertionError("should have raised an AssertionError")
+
+
+class TestWarns(object):
+
+ def test_warn(self):
+ def f():
+ warnings.warn("yo")
+ return 3
+
+ before_filters = sys.modules['warnings'].filters[:]
+ assert_equal(assert_warns(UserWarning, f), 3)
+ after_filters = sys.modules['warnings'].filters
+
+ assert_raises(AssertionError, assert_no_warnings, f)
+ assert_equal(assert_no_warnings(lambda x: x, 1), 1)
+
+ # Check that the warnings state is unchanged
+ assert_equal(before_filters, after_filters,
+                 "assert_warns does not preserve warnings state")
+
+ def test_context_manager(self):
+
+ before_filters = sys.modules['warnings'].filters[:]
+ with assert_warns(UserWarning):
+ warnings.warn("yo")
+ after_filters = sys.modules['warnings'].filters
+
+ def no_warnings():
+ with assert_no_warnings():
+ warnings.warn("yo")
+
+ assert_raises(AssertionError, no_warnings)
+ assert_equal(before_filters, after_filters,
+                 "assert_warns does not preserve warnings state")
+
+ def test_warn_wrong_warning(self):
+ def f():
+ warnings.warn("yo", DeprecationWarning)
+
+ failed = False
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", DeprecationWarning)
+ try:
+ # Should raise a DeprecationWarning
+ assert_warns(UserWarning, f)
+ failed = True
+ except DeprecationWarning:
+ pass
+
+ if failed:
+            raise AssertionError("wrong warning caught by assert_warns")
+
+
+class TestAssertAllclose(object):
+
+ def test_simple(self):
+ x = 1e-3
+ y = 1e-9
+
+ assert_allclose(x, y, atol=1)
+ assert_raises(AssertionError, assert_allclose, x, y)
+
+ a = np.array([x, y, x, y])
+ b = np.array([x, y, x, x])
+
+ assert_allclose(a, b, atol=1)
+ assert_raises(AssertionError, assert_allclose, a, b)
+
+ b[-1] = y * (1 + 1e-8)
+ assert_allclose(a, b)
+ assert_raises(AssertionError, assert_allclose, a, b, rtol=1e-9)
+
+ assert_allclose(6, 10, rtol=0.5)
+ assert_raises(AssertionError, assert_allclose, 10, 6, rtol=0.5)
+
+ def test_min_int(self):
+ a = np.array([np.iinfo(np.int_).min], dtype=np.int_)
+ # Should not raise:
+ assert_allclose(a, a)
+
+ def test_report_fail_percentage(self):
+ a = np.array([1, 1, 1, 1])
+ b = np.array([1, 1, 1, 2])
+
+ with pytest.raises(AssertionError) as exc_info:
+ assert_allclose(a, b)
+ msg = str(exc_info.value)
+ assert_('Mismatched elements: 1 / 4 (25%)\n'
+ 'Max absolute difference: 1\n'
+ 'Max relative difference: 0.5' in msg)
+
+ def test_equal_nan(self):
+ a = np.array([np.nan])
+ b = np.array([np.nan])
+ # Should not raise:
+ assert_allclose(a, b, equal_nan=True)
+
+ def test_not_equal_nan(self):
+ a = np.array([np.nan])
+ b = np.array([np.nan])
+ assert_raises(AssertionError, assert_allclose, a, b, equal_nan=False)
+
+ def test_equal_nan_default(self):
+ # Make sure equal_nan default behavior remains unchanged. (All
+ # of these functions use assert_array_compare under the hood.)
+ # None of these should raise.
+ a = np.array([np.nan])
+ b = np.array([np.nan])
+ assert_array_equal(a, b)
+ assert_array_almost_equal(a, b)
+ assert_array_less(a, b)
+ assert_allclose(a, b)
+
+ def test_report_max_relative_error(self):
+ a = np.array([0, 1])
+ b = np.array([0, 2])
+
+ with pytest.raises(AssertionError) as exc_info:
+ assert_allclose(a, b)
+ msg = str(exc_info.value)
+ assert_('Max relative difference: 0.5' in msg)
+
+
+class TestArrayAlmostEqualNulp(object):
+
+ def test_float64_pass(self):
+ # The number of units of least precision
+        # In this case, use a few places above the lowest level (i.e. nulp=1)
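+        # For positive x one ulp is roughly x*eps, so x + x*eps*nulp/2 lands
+        # about nulp/2 ulps away from x and should pass a check with nulp=5.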
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float64)
+ x = 10**x
+ x = np.r_[-x, x]
+
+ # Addition
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp/2.
+ assert_array_almost_equal_nulp(x, y, nulp)
+
+ # Subtraction
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp/2.
+ assert_array_almost_equal_nulp(x, y, nulp)
+
+ def test_float64_fail(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float64)
+ x = 10**x
+ x = np.r_[-x, x]
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ x, y, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ x, y, nulp)
+
+ def test_float32_pass(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float32)
+ x = 10**x
+ x = np.r_[-x, x]
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp/2.
+ assert_array_almost_equal_nulp(x, y, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp/2.
+ assert_array_almost_equal_nulp(x, y, nulp)
+
+ def test_float32_fail(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float32)
+ x = 10**x
+ x = np.r_[-x, x]
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ x, y, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ x, y, nulp)
+
+ def test_float16_pass(self):
+ nulp = 5
+ x = np.linspace(-4, 4, 10, dtype=np.float16)
+ x = 10**x
+ x = np.r_[-x, x]
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp/2.
+ assert_array_almost_equal_nulp(x, y, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp/2.
+ assert_array_almost_equal_nulp(x, y, nulp)
+
+ def test_float16_fail(self):
+ nulp = 5
+ x = np.linspace(-4, 4, 10, dtype=np.float16)
+ x = 10**x
+ x = np.r_[-x, x]
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ x, y, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ x, y, nulp)
+
+ def test_complex128_pass(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float64)
+ x = 10**x
+ x = np.r_[-x, x]
+ xi = x + x*1j
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp/2.
+ assert_array_almost_equal_nulp(xi, x + y*1j, nulp)
+ assert_array_almost_equal_nulp(xi, y + x*1j, nulp)
+ # The test condition needs to be at least a factor of sqrt(2) smaller
+ # because the real and imaginary parts both change
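+        # (a shift of d in both parts gives a complex difference of magnitude
+        # d*sqrt(2), so nulp/4 keeps the combined deviation safely below nulp)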
+ y = x + x*eps*nulp/4.
+ assert_array_almost_equal_nulp(xi, y + y*1j, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp/2.
+ assert_array_almost_equal_nulp(xi, x + y*1j, nulp)
+ assert_array_almost_equal_nulp(xi, y + x*1j, nulp)
+ y = x - x*epsneg*nulp/4.
+ assert_array_almost_equal_nulp(xi, y + y*1j, nulp)
+
+ def test_complex128_fail(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float64)
+ x = 10**x
+ x = np.r_[-x, x]
+ xi = x + x*1j
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, x + y*1j, nulp)
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + x*1j, nulp)
+ # The test condition needs to be at least a factor of sqrt(2) smaller
+ # because the real and imaginary parts both change
+ y = x + x*eps*nulp
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + y*1j, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, x + y*1j, nulp)
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + x*1j, nulp)
+ y = x - x*epsneg*nulp
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + y*1j, nulp)
+
+ def test_complex64_pass(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float32)
+ x = 10**x
+ x = np.r_[-x, x]
+ xi = x + x*1j
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp/2.
+ assert_array_almost_equal_nulp(xi, x + y*1j, nulp)
+ assert_array_almost_equal_nulp(xi, y + x*1j, nulp)
+ y = x + x*eps*nulp/4.
+ assert_array_almost_equal_nulp(xi, y + y*1j, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp/2.
+ assert_array_almost_equal_nulp(xi, x + y*1j, nulp)
+ assert_array_almost_equal_nulp(xi, y + x*1j, nulp)
+ y = x - x*epsneg*nulp/4.
+ assert_array_almost_equal_nulp(xi, y + y*1j, nulp)
+
+ def test_complex64_fail(self):
+ nulp = 5
+ x = np.linspace(-20, 20, 50, dtype=np.float32)
+ x = 10**x
+ x = np.r_[-x, x]
+ xi = x + x*1j
+
+ eps = np.finfo(x.dtype).eps
+ y = x + x*eps*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, x + y*1j, nulp)
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + x*1j, nulp)
+ y = x + x*eps*nulp
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + y*1j, nulp)
+
+ epsneg = np.finfo(x.dtype).epsneg
+ y = x - x*epsneg*nulp*2.
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, x + y*1j, nulp)
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + x*1j, nulp)
+ y = x - x*epsneg*nulp
+ assert_raises(AssertionError, assert_array_almost_equal_nulp,
+ xi, y + y*1j, nulp)
+
+
+class TestULP(object):
+
+ def test_equal(self):
+ x = np.random.randn(10)
+ assert_array_max_ulp(x, x, maxulp=0)
+
+ def test_single(self):
+        # Generate 1 + small deviation, check that adding eps gives a few ULP
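+        # Near 1.0 one float32 eps is only an ulp or two, so maxulp=20 leaves
+        # ample headroom.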
+ x = np.ones(10).astype(np.float32)
+ x += 0.01 * np.random.randn(10).astype(np.float32)
+ eps = np.finfo(np.float32).eps
+ assert_array_max_ulp(x, x+eps, maxulp=20)
+
+ def test_double(self):
+        # Generate 1 + small deviation, check that adding eps gives a few ULP
+ x = np.ones(10).astype(np.float64)
+ x += 0.01 * np.random.randn(10).astype(np.float64)
+ eps = np.finfo(np.float64).eps
+ assert_array_max_ulp(x, x+eps, maxulp=200)
+
+ def test_inf(self):
+ for dt in [np.float32, np.float64]:
+ inf = np.array([np.inf]).astype(dt)
+ big = np.array([np.finfo(dt).max])
+ assert_array_max_ulp(inf, big, maxulp=200)
+
+ def test_nan(self):
+ # Test that nan is 'far' from small, tiny, inf, max and min
+ for dt in [np.float32, np.float64]:
+ if dt == np.float32:
+ maxulp = 1e6
+ else:
+ maxulp = 1e12
+ inf = np.array([np.inf]).astype(dt)
+ nan = np.array([np.nan]).astype(dt)
+ big = np.array([np.finfo(dt).max])
+ tiny = np.array([np.finfo(dt).tiny])
+ zero = np.array([np.PZERO]).astype(dt)
+ nzero = np.array([np.NZERO]).astype(dt)
+ assert_raises(AssertionError,
+ lambda: assert_array_max_ulp(nan, inf,
+ maxulp=maxulp))
+ assert_raises(AssertionError,
+ lambda: assert_array_max_ulp(nan, big,
+ maxulp=maxulp))
+ assert_raises(AssertionError,
+ lambda: assert_array_max_ulp(nan, tiny,
+ maxulp=maxulp))
+ assert_raises(AssertionError,
+ lambda: assert_array_max_ulp(nan, zero,
+ maxulp=maxulp))
+ assert_raises(AssertionError,
+ lambda: assert_array_max_ulp(nan, nzero,
+ maxulp=maxulp))
+
+
+class TestStringEqual(object):
+ def test_simple(self):
+ assert_string_equal("hello", "hello")
+ assert_string_equal("hello\nmultiline", "hello\nmultiline")
+
+ with pytest.raises(AssertionError) as exc_info:
+ assert_string_equal("foo\nbar", "hello\nbar")
+ msg = str(exc_info.value)
+ assert_equal(msg, "Differences in strings:\n- foo\n+ hello")
+
+ assert_raises(AssertionError,
+ lambda: assert_string_equal("foo", "hello"))
+
+ def test_regex(self):
+ assert_string_equal("a+*b", "a+*b")
+
+ assert_raises(AssertionError,
+ lambda: assert_string_equal("aaa", "a+b"))
+
+
+def assert_warn_len_equal(mod, n_in_context, py34=None, py37=None):
+ try:
+ mod_warns = mod.__warningregistry__
+ except AttributeError:
+ # the lack of a __warningregistry__
+ # attribute means that no warning has
+ # occurred; this can be triggered in
+ # a parallel test scenario, while in
+ # a serial test scenario an initial
+ # warning (and therefore the attribute)
+ # are always created first
+ mod_warns = {}
+
+ num_warns = len(mod_warns)
+ # Python 3.4 appears to clear any pre-existing warnings of the same type,
+ # when raising warnings inside a catch_warnings block. So, there is a
+ # warning generated by the tests within the context manager, but no
+ # previous warnings.
+ if 'version' in mod_warns:
+ # Python 3 adds a 'version' entry to the registry,
+ # do not count it.
+ num_warns -= 1
+
+ # Behavior of warnings is Python version dependent. Adjust the
+ # expected result to compensate. In particular, Python 3.7 does
+ # not make an entry for ignored warnings.
+ if sys.version_info[:2] >= (3, 7):
+ if py37 is not None:
+ n_in_context = py37
+ elif sys.version_info[:2] >= (3, 4):
+ if py34 is not None:
+ n_in_context = py34
+ assert_equal(num_warns, n_in_context)
+
+def test_warn_len_equal_call_scenarios():
+ # assert_warn_len_equal is called under
+ # varying circumstances depending on serial
+ # vs. parallel test scenarios; this test
+ # simply aims to probe both code paths and
+ # check that no assertion is uncaught
+
+ # parallel scenario -- no warning issued yet
+ class mod(object):
+ pass
+
+ mod_inst = mod()
+
+ assert_warn_len_equal(mod=mod_inst,
+ n_in_context=0)
+
+ # serial test scenario -- the __warningregistry__
+ # attribute should be present
+ class mod(object):
+ def __init__(self):
+            self.__warningregistry__ = {'warning1': 1,
+                                        'warning2': 2}
+
+ mod_inst = mod()
+ assert_warn_len_equal(mod=mod_inst,
+ n_in_context=2)
+
+
+def _get_fresh_mod():
+ # Get this module, with warning registry empty
+ my_mod = sys.modules[__name__]
+ try:
+ my_mod.__warningregistry__.clear()
+ except AttributeError:
+ # will not have a __warningregistry__ unless warning has been
+ # raised in the module at some point
+ pass
+ return my_mod
+
+
+def test_clear_and_catch_warnings():
+ # Initial state of module, no warnings
+ my_mod = _get_fresh_mod()
+ assert_equal(getattr(my_mod, '__warningregistry__', {}), {})
+ with clear_and_catch_warnings(modules=[my_mod]):
+ warnings.simplefilter('ignore')
+ warnings.warn('Some warning')
+ assert_equal(my_mod.__warningregistry__, {})
+ # Without specified modules, don't clear warnings during context
+ # Python 3.7 catch_warnings doesn't make an entry for 'ignore'.
+ with clear_and_catch_warnings():
+ warnings.simplefilter('ignore')
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 1, py37=0)
+ # Confirm that specifying module keeps old warning, does not add new
+ with clear_and_catch_warnings(modules=[my_mod]):
+ warnings.simplefilter('ignore')
+ warnings.warn('Another warning')
+ assert_warn_len_equal(my_mod, 1, py37=0)
+ # Another warning, no module spec does add to warnings dict, except on
+ # Python 3.4 (see comments in `assert_warn_len_equal`)
+ # Python 3.7 catch_warnings doesn't make an entry for 'ignore'.
+ with clear_and_catch_warnings():
+ warnings.simplefilter('ignore')
+ warnings.warn('Another warning')
+ assert_warn_len_equal(my_mod, 2, py34=1, py37=0)
+
+
+def test_suppress_warnings_module():
+ # Initial state of module, no warnings
+ my_mod = _get_fresh_mod()
+ assert_equal(getattr(my_mod, '__warningregistry__', {}), {})
+
+ def warn_other_module():
+ # Apply along axis is implemented in python; stacklevel=2 means
+ # we end up inside its module, not ours.
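+        # The warning is therefore attributed to numpy.lib.shape_base, where
+        # apply_along_axis lives, rather than to this test module.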
+ def warn(arr):
+ warnings.warn("Some warning 2", stacklevel=2)
+ return arr
+ np.apply_along_axis(warn, 0, [0])
+
+ # Test module based warning suppression:
+ assert_warn_len_equal(my_mod, 0)
+ with suppress_warnings() as sup:
+ sup.record(UserWarning)
+        # Suppress the warning from the other module (the filename may have a
+        # .pyc ending); this will need updating if apply_along_axis is moved.
+ sup.filter(module=np.lib.shape_base)
+ warnings.warn("Some warning")
+ warn_other_module()
+ # Check that the suppression did test the file correctly (this module
+ # got filtered)
+ assert_equal(len(sup.log), 1)
+ assert_equal(sup.log[0].message.args[0], "Some warning")
+ assert_warn_len_equal(my_mod, 0, py37=0)
+ sup = suppress_warnings()
+    # Filter on this test module directly:
+ sup.filter(module=my_mod)
+ with sup:
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 0)
+ # And test repeat works:
+ sup.filter(module=my_mod)
+ with sup:
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 0)
+
+ # Without specified modules, don't clear warnings during context
+ # Python 3.7 does not add ignored warnings.
+ with suppress_warnings():
+ warnings.simplefilter('ignore')
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 1, py37=0)
+
+def test_suppress_warnings_type():
+ # Initial state of module, no warnings
+ my_mod = _get_fresh_mod()
+ assert_equal(getattr(my_mod, '__warningregistry__', {}), {})
+
+ # Test module based warning suppression:
+ with suppress_warnings() as sup:
+ sup.filter(UserWarning)
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 0)
+ sup = suppress_warnings()
+ sup.filter(UserWarning)
+ with sup:
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 0)
+ # And test repeat works:
+ sup.filter(module=my_mod)
+ with sup:
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 0)
+
+ # Without specified modules, don't clear warnings during context
+ # Python 3.7 does not add ignored warnings.
+ with suppress_warnings():
+ warnings.simplefilter('ignore')
+ warnings.warn('Some warning')
+ assert_warn_len_equal(my_mod, 1, py37=0)
+
+
+def test_suppress_warnings_decorate_no_record():
+ sup = suppress_warnings()
+ sup.filter(UserWarning)
+
+ @sup
+ def warn(category):
+ warnings.warn('Some warning', category)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+        warn(UserWarning)  # should be suppressed
+ warn(RuntimeWarning)
+ assert_equal(len(w), 1)
+
+
+def test_suppress_warnings_record():
+ sup = suppress_warnings()
+ log1 = sup.record()
+
+ with sup:
+ log2 = sup.record(message='Some other warning 2')
+ sup.filter(message='Some warning')
+ warnings.warn('Some warning')
+ warnings.warn('Some other warning')
+ warnings.warn('Some other warning 2')
+
+ assert_equal(len(sup.log), 2)
+ assert_equal(len(log1), 1)
+    assert_equal(len(log2), 1)
+ assert_equal(log2[0].message.args[0], 'Some other warning 2')
+
+ # Do it again, with the same context to see if some warnings survived:
+ with sup:
+ log2 = sup.record(message='Some other warning 2')
+ sup.filter(message='Some warning')
+ warnings.warn('Some warning')
+ warnings.warn('Some other warning')
+ warnings.warn('Some other warning 2')
+
+ assert_equal(len(sup.log), 2)
+ assert_equal(len(log1), 1)
+ assert_equal(len(log2), 1)
+ assert_equal(log2[0].message.args[0], 'Some other warning 2')
+
+ # Test nested:
+ with suppress_warnings() as sup:
+ sup.record()
+ with suppress_warnings() as sup2:
+ sup2.record(message='Some warning')
+ warnings.warn('Some warning')
+ warnings.warn('Some other warning')
+ assert_equal(len(sup2.log), 1)
+ assert_equal(len(sup.log), 1)
+
+
+def test_suppress_warnings_forwarding():
+ def warn_other_module():
+ # Apply along axis is implemented in python; stacklevel=2 means
+ # we end up inside its module, not ours.
+ def warn(arr):
+ warnings.warn("Some warning", stacklevel=2)
+ return arr
+ np.apply_along_axis(warn, 0, [0])
+
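+    # The argument to suppress_warnings is the forwarding rule: it controls
+    # which unsuppressed, unrecorded warnings are re-shown by the outer
+    # context -- "always" forwards all of them, "location" once per code
+    # location, "module" once per module, and "once" once per distinct message.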
+ with suppress_warnings() as sup:
+ sup.record()
+ with suppress_warnings("always"):
+ for i in range(2):
+ warnings.warn("Some warning")
+
+ assert_equal(len(sup.log), 2)
+
+ with suppress_warnings() as sup:
+ sup.record()
+ with suppress_warnings("location"):
+ for i in range(2):
+ warnings.warn("Some warning")
+ warnings.warn("Some warning")
+
+ assert_equal(len(sup.log), 2)
+
+ with suppress_warnings() as sup:
+ sup.record()
+ with suppress_warnings("module"):
+ for i in range(2):
+ warnings.warn("Some warning")
+ warnings.warn("Some warning")
+ warn_other_module()
+
+ assert_equal(len(sup.log), 2)
+
+ with suppress_warnings() as sup:
+ sup.record()
+ with suppress_warnings("once"):
+ for i in range(2):
+ warnings.warn("Some warning")
+ warnings.warn("Some other warning")
+ warn_other_module()
+
+ assert_equal(len(sup.log), 2)
+
+
+def test_tempdir():
+ with tempdir() as tdir:
+ fpath = os.path.join(tdir, 'tmp')
+ with open(fpath, 'w'):
+ pass
+ assert_(not os.path.isdir(tdir))
+
+ raised = False
+ try:
+ with tempdir() as tdir:
+ raise ValueError()
+ except ValueError:
+ raised = True
+ assert_(raised)
+ assert_(not os.path.isdir(tdir))
+
+
+def test_temppath():
+ with temppath() as fpath:
+ with open(fpath, 'w'):
+ pass
+ assert_(not os.path.isfile(fpath))
+
+ raised = False
+ try:
+ with temppath() as fpath:
+ raise ValueError()
+ except ValueError:
+ raised = True
+ assert_(raised)
+ assert_(not os.path.isfile(fpath))
+
+
+class my_cacw(clear_and_catch_warnings):
+
+ class_modules = (sys.modules[__name__],)
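+    # Modules listed in class_modules get their __warningregistry__ cleared in
+    # addition to any modules passed to the constructor.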
+
+
+def test_clear_and_catch_warnings_inherit():
+ # Test can subclass and add default modules
+ my_mod = _get_fresh_mod()
+ with my_cacw():
+ warnings.simplefilter('ignore')
+ warnings.warn('Some warning')
+ assert_equal(my_mod.__warningregistry__, {})
+
+
[email protected](not HAS_REFCOUNT, reason="Python lacks refcounts")
+class TestAssertNoGcCycles(object):
+ """ Test assert_no_gc_cycles """
+ def test_passes(self):
+ def no_cycle():
+ b = []
+ b.append([])
+ return b
+
+ with assert_no_gc_cycles():
+ no_cycle()
+
+ assert_no_gc_cycles(no_cycle)
+
+ def test_asserts(self):
+ def make_cycle():
+ a = []
+ a.append(a)
+ a.append(a)
+ return a
+
+ with assert_raises(AssertionError):
+ with assert_no_gc_cycles():
+ make_cycle()
+
+ with assert_raises(AssertionError):
+ assert_no_gc_cycles(make_cycle)
+
+ @pytest.mark.slow
+ def test_fails(self):
+ """
+ Test that in cases where the garbage cannot be collected, we raise an
+ error, instead of hanging forever trying to clear it.
+ """
+
+ class ReferenceCycleInDel(object):
+ """
+ An object that not only contains a reference cycle, but creates new
+ cycles whenever it's garbage-collected and its __del__ runs
+ """
+ make_cycle = True
+
+ def __init__(self):
+ self.cycle = self
+
+ def __del__(self):
+ # break the current cycle so that `self` can be freed
+ self.cycle = None
+
+ if ReferenceCycleInDel.make_cycle:
+ # but create a new one so that the garbage collector has more
+ # work to do.
+ ReferenceCycleInDel()
+
+ try:
+ w = weakref.ref(ReferenceCycleInDel())
+ try:
+ with assert_raises(RuntimeError):
+ # this will be unable to get a baseline empty garbage
+ assert_no_gc_cycles(lambda: None)
+ except AssertionError:
+ # the above test is only necessary if the GC actually tried to free
+ # our object anyway, which python 2.7 does not.
+ if w() is not None:
+ pytest.skip("GC does not call __del__ on cyclic objects")
+ raise
+
+ finally:
+ # make sure that we stop creating reference cycles
+ ReferenceCycleInDel.make_cycle = False
diff --git a/contrib/python/numpy/py2/numpy/tests/__init__.py b/contrib/python/numpy/py2/numpy/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/__init__.py
diff --git a/contrib/python/numpy/py2/numpy/tests/test_ctypeslib.py b/contrib/python/numpy/py2/numpy/tests/test_ctypeslib.py
new file mode 100644
index 00000000000..521208c36d3
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_ctypeslib.py
@@ -0,0 +1,367 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import pytest
+import weakref
+
+import numpy as np
+from numpy.ctypeslib import ndpointer, load_library, as_array
+from numpy.distutils.misc_util import get_shared_lib_extension
+from numpy.testing import assert_, assert_array_equal, assert_raises, assert_equal
+
+try:
+ import ctypes
+except ImportError:
+ ctypes = None
+else:
+ cdll = None
+ test_cdll = None
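+    # sys.gettotalrefcount exists only on debug builds of CPython, where the
+    # extension modules may carry a '_d' suffix, so try those names first.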
+ if hasattr(sys, 'gettotalrefcount'):
+ try:
+ cdll = load_library('_multiarray_umath_d', np.core._multiarray_umath.__file__)
+ except OSError:
+ pass
+ try:
+ test_cdll = load_library('_multiarray_tests', np.core._multiarray_tests.__file__)
+ except OSError:
+ pass
+ if cdll is None:
+ cdll = load_library('_multiarray_umath', np.core._multiarray_umath.__file__)
+ if test_cdll is None:
+ test_cdll = load_library('_multiarray_tests', np.core._multiarray_tests.__file__)
+
+ c_forward_pointer = test_cdll.forward_pointer
+
+
[email protected](ctypes is None,
+ reason="ctypes not available in this python")
[email protected](sys.platform == 'cygwin',
+ reason="Known to fail on cygwin")
+class TestLoadLibrary(object):
+ def test_basic(self):
+ try:
+ # Should succeed
+ load_library('_multiarray_umath', np.core._multiarray_umath.__file__)
+ except ImportError as e:
+ msg = ("ctypes is not available on this python: skipping the test"
+ " (import error was: %s)" % str(e))
+ print(msg)
+
+ def test_basic2(self):
+ # Regression for #801: load_library with a full library name
+ # (including extension) does not work.
+ try:
+ try:
+ so = get_shared_lib_extension(is_python_ext=True)
+ # Should succeed
+ load_library('_multiarray_umath%s' % so, np.core._multiarray_umath.__file__)
+ except ImportError:
+ print("No distutils available, skipping test.")
+ except ImportError as e:
+ msg = ("ctypes is not available on this python: skipping the test"
+ " (import error was: %s)" % str(e))
+ print(msg)
+
+
+class TestNdpointer(object):
+ def test_dtype(self):
+ dt = np.intc
+ p = ndpointer(dtype=dt)
+ assert_(p.from_param(np.array([1], dt)))
+ dt = '<i4'
+ p = ndpointer(dtype=dt)
+ assert_(p.from_param(np.array([1], dt)))
+ dt = np.dtype('>i4')
+ p = ndpointer(dtype=dt)
+ p.from_param(np.array([1], dt))
+ assert_raises(TypeError, p.from_param,
+ np.array([1], dt.newbyteorder('swap')))
+ dtnames = ['x', 'y']
+ dtformats = [np.intc, np.float64]
+ dtdescr = {'names': dtnames, 'formats': dtformats}
+ dt = np.dtype(dtdescr)
+ p = ndpointer(dtype=dt)
+ assert_(p.from_param(np.zeros((10,), dt)))
+ samedt = np.dtype(dtdescr)
+ p = ndpointer(dtype=samedt)
+ assert_(p.from_param(np.zeros((10,), dt)))
+ dt2 = np.dtype(dtdescr, align=True)
+ if dt.itemsize != dt2.itemsize:
+ assert_raises(TypeError, p.from_param, np.zeros((10,), dt2))
+ else:
+ assert_(p.from_param(np.zeros((10,), dt2)))
+
+ def test_ndim(self):
+ p = ndpointer(ndim=0)
+ assert_(p.from_param(np.array(1)))
+ assert_raises(TypeError, p.from_param, np.array([1]))
+ p = ndpointer(ndim=1)
+ assert_raises(TypeError, p.from_param, np.array(1))
+ assert_(p.from_param(np.array([1])))
+ p = ndpointer(ndim=2)
+ assert_(p.from_param(np.array([[1]])))
+
+ def test_shape(self):
+ p = ndpointer(shape=(1, 2))
+ assert_(p.from_param(np.array([[1, 2]])))
+ assert_raises(TypeError, p.from_param, np.array([[1], [2]]))
+ p = ndpointer(shape=())
+ assert_(p.from_param(np.array(1)))
+
+ def test_flags(self):
+ x = np.array([[1, 2], [3, 4]], order='F')
+ p = ndpointer(flags='FORTRAN')
+ assert_(p.from_param(x))
+ p = ndpointer(flags='CONTIGUOUS')
+ assert_raises(TypeError, p.from_param, x)
+ p = ndpointer(flags=x.flags.num)
+ assert_(p.from_param(x))
+ assert_raises(TypeError, p.from_param, np.array([[1, 2], [3, 4]]))
+
+ def test_cache(self):
+ assert_(ndpointer(dtype=np.float64) is ndpointer(dtype=np.float64))
+
+ # shapes are normalized
+ assert_(ndpointer(shape=2) is ndpointer(shape=(2,)))
+
+ # 1.12 <= v < 1.16 had a bug that made these fail
+ assert_(ndpointer(shape=2) is not ndpointer(ndim=2))
+ assert_(ndpointer(ndim=2) is not ndpointer(shape=2))
+
[email protected](ctypes is None,
+ reason="ctypes not available on this python installation")
+class TestNdpointerCFunc(object):
+ def test_arguments(self):
+ """ Test that arguments are coerced from arrays """
+ c_forward_pointer.restype = ctypes.c_void_p
+ c_forward_pointer.argtypes = (ndpointer(ndim=2),)
+
+ c_forward_pointer(np.zeros((2, 3)))
+ # too many dimensions
+ assert_raises(
+ ctypes.ArgumentError, c_forward_pointer, np.zeros((2, 3, 4)))
+
+ @pytest.mark.parametrize(
+ 'dt', [
+ float,
+ np.dtype(dict(
+ formats=['<i4', '<i4'],
+ names=['a', 'b'],
+ offsets=[0, 2],
+ itemsize=6
+ ))
+ ], ids=[
+ 'float',
+ 'overlapping-fields'
+ ]
+ )
+ def test_return(self, dt):
+ """ Test that return values are coerced to arrays """
+ arr = np.zeros((2, 3), dt)
+ ptr_type = ndpointer(shape=arr.shape, dtype=arr.dtype)
+
+ c_forward_pointer.restype = ptr_type
+ c_forward_pointer.argtypes = (ptr_type,)
+
+ # check that the arrays are equivalent views on the same data
+ arr2 = c_forward_pointer(arr)
+ assert_equal(arr2.dtype, arr.dtype)
+ assert_equal(arr2.shape, arr.shape)
+ assert_equal(
+ arr2.__array_interface__['data'],
+ arr.__array_interface__['data']
+ )
+
+ def test_vague_return_value(self):
+ """ Test that vague ndpointer return values do not promote to arrays """
+ arr = np.zeros((2, 3))
+ ptr_type = ndpointer(dtype=arr.dtype)
+
+ c_forward_pointer.restype = ptr_type
+ c_forward_pointer.argtypes = (ptr_type,)
+
+ ret = c_forward_pointer(arr)
+ assert_(isinstance(ret, ptr_type))
+
+
[email protected](ctypes is None,
+ reason="ctypes not available on this python installation")
+class TestAsArray(object):
+ def test_array(self):
+ from ctypes import c_int
+
+ pair_t = c_int * 2
+ a = as_array(pair_t(1, 2))
+ assert_equal(a.shape, (2,))
+ assert_array_equal(a, np.array([1, 2]))
+ a = as_array((pair_t * 3)(pair_t(1, 2), pair_t(3, 4), pair_t(5, 6)))
+ assert_equal(a.shape, (3, 2))
+ assert_array_equal(a, np.array([[1, 2], [3, 4], [5, 6]]))
+
+ def test_pointer(self):
+ from ctypes import c_int, cast, POINTER
+
+ p = cast((c_int * 10)(*range(10)), POINTER(c_int))
+
+ a = as_array(p, shape=(10,))
+ assert_equal(a.shape, (10,))
+ assert_array_equal(a, np.arange(10))
+
+ a = as_array(p, shape=(2, 5))
+ assert_equal(a.shape, (2, 5))
+ assert_array_equal(a, np.arange(10).reshape((2, 5)))
+
+ # shape argument is required
+ assert_raises(TypeError, as_array, p)
+
+ def test_struct_array_pointer(self):
+ from ctypes import c_int16, Structure, pointer
+
+ class Struct(Structure):
+ _fields_ = [('a', c_int16)]
+
+ Struct3 = 3 * Struct
+
+ c_array = (2 * Struct3)(
+ Struct3(Struct(a=1), Struct(a=2), Struct(a=3)),
+ Struct3(Struct(a=4), Struct(a=5), Struct(a=6))
+ )
+
+ expected = np.array([
+ [(1,), (2,), (3,)],
+ [(4,), (5,), (6,)],
+ ], dtype=[('a', np.int16)])
+
+ def check(x):
+ assert_equal(x.dtype, expected.dtype)
+ assert_equal(x, expected)
+
+ # all of these should be equivalent
+ check(as_array(c_array))
+ check(as_array(pointer(c_array), shape=()))
+ check(as_array(pointer(c_array[0]), shape=(2,)))
+ check(as_array(pointer(c_array[0][0]), shape=(2, 3)))
+
+ def test_reference_cycles(self):
+ # related to gh-6511
+ import ctypes
+
+ # create array to work with
+ # don't use int/long to avoid running into bpo-10746
+ N = 100
+ a = np.arange(N, dtype=np.short)
+
+ # get pointer to array
+ pnt = np.ctypeslib.as_ctypes(a)
+
+ with np.testing.assert_no_gc_cycles():
+ # decay the array above to a pointer to its first element
+ newpnt = ctypes.cast(pnt, ctypes.POINTER(ctypes.c_short))
+ # and construct an array using this data
+ b = np.ctypeslib.as_array(newpnt, (N,))
+ # now delete both, which should cleanup both objects
+ del newpnt, b
+
+ def test_segmentation_fault(self):
+ arr = np.zeros((224, 224, 3))
+ c_arr = np.ctypeslib.as_ctypes(arr)
+ arr_ref = weakref.ref(arr)
+ del arr
+
+ # check the reference wasn't cleaned up
+ assert_(arr_ref() is not None)
+
+ # check we avoid the segfault
+ c_arr[0][0][0]
+
+
[email protected](ctypes is None,
+ reason="ctypes not available on this python installation")
+class TestAsCtypesType(object):
+ """ Test conversion from dtypes to ctypes types """
+ def test_scalar(self):
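+        # ctypes simple types expose fixed-endian variants via __ctype_le__ and
+        # __ctype_be__; '<u2' and '>u2' should map onto those.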
+ dt = np.dtype('<u2')
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_equal(ct, ctypes.c_uint16.__ctype_le__)
+
+ dt = np.dtype('>u2')
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_equal(ct, ctypes.c_uint16.__ctype_be__)
+
+ dt = np.dtype('u2')
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_equal(ct, ctypes.c_uint16)
+
+ def test_subarray(self):
+ dt = np.dtype((np.int32, (2, 3)))
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_equal(ct, 2 * (3 * ctypes.c_int32))
+
+ def test_structure(self):
+ dt = np.dtype([
+ ('a', np.uint16),
+ ('b', np.uint32),
+ ])
+
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_(issubclass(ct, ctypes.Structure))
+ assert_equal(ctypes.sizeof(ct), dt.itemsize)
+ assert_equal(ct._fields_, [
+ ('a', ctypes.c_uint16),
+ ('b', ctypes.c_uint32),
+ ])
+
+ def test_structure_aligned(self):
+ dt = np.dtype([
+ ('a', np.uint16),
+ ('b', np.uint32),
+ ], align=True)
+
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_(issubclass(ct, ctypes.Structure))
+ assert_equal(ctypes.sizeof(ct), dt.itemsize)
+ assert_equal(ct._fields_, [
+ ('a', ctypes.c_uint16),
+ ('', ctypes.c_char * 2), # padding
+ ('b', ctypes.c_uint32),
+ ])
+
+ def test_union(self):
+ dt = np.dtype(dict(
+ names=['a', 'b'],
+ offsets=[0, 0],
+ formats=[np.uint16, np.uint32]
+ ))
+
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_(issubclass(ct, ctypes.Union))
+ assert_equal(ctypes.sizeof(ct), dt.itemsize)
+ assert_equal(ct._fields_, [
+ ('a', ctypes.c_uint16),
+ ('b', ctypes.c_uint32),
+ ])
+
+ def test_padded_union(self):
+ dt = np.dtype(dict(
+ names=['a', 'b'],
+ offsets=[0, 0],
+ formats=[np.uint16, np.uint32],
+ itemsize=5,
+ ))
+
+ ct = np.ctypeslib.as_ctypes_type(dt)
+ assert_(issubclass(ct, ctypes.Union))
+ assert_equal(ctypes.sizeof(ct), dt.itemsize)
+ assert_equal(ct._fields_, [
+ ('a', ctypes.c_uint16),
+ ('b', ctypes.c_uint32),
+ ('', ctypes.c_char * 5), # padding
+ ])
+
+ def test_overlapping(self):
+ dt = np.dtype(dict(
+ names=['a', 'b'],
+ offsets=[0, 2],
+ formats=[np.uint32, np.uint32]
+ ))
+ assert_raises(NotImplementedError, np.ctypeslib.as_ctypes_type, dt)
diff --git a/contrib/python/numpy/py2/numpy/tests/test_matlib.py b/contrib/python/numpy/py2/numpy/tests/test_matlib.py
new file mode 100644
index 00000000000..38a7e39dfb6
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_matlib.py
@@ -0,0 +1,68 @@
+from __future__ import division, absolute_import, print_function
+
+# As we are testing matrices, we ignore their PendingDeprecationWarnings
+try:
+ import pytest
+ pytestmark = pytest.mark.filterwarnings(
+ 'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+ pass
+
+import numpy as np
+import numpy.matlib
+from numpy.testing import assert_array_equal, assert_
+
+def test_empty():
+ x = numpy.matlib.empty((2,))
+ assert_(isinstance(x, np.matrix))
+    assert_(x.shape == (1, 2))
+
+def test_ones():
+ assert_array_equal(numpy.matlib.ones((2, 3)),
+ np.matrix([[ 1., 1., 1.],
+ [ 1., 1., 1.]]))
+
+ assert_array_equal(numpy.matlib.ones(2), np.matrix([[ 1., 1.]]))
+
+def test_zeros():
+ assert_array_equal(numpy.matlib.zeros((2, 3)),
+ np.matrix([[ 0., 0., 0.],
+ [ 0., 0., 0.]]))
+
+ assert_array_equal(numpy.matlib.zeros(2), np.matrix([[ 0., 0.]]))
+
+def test_identity():
+ x = numpy.matlib.identity(2, dtype=int)
+ assert_array_equal(x, np.matrix([[1, 0], [0, 1]]))
+
+def test_eye():
+ xc = numpy.matlib.eye(3, k=1, dtype=int)
+ assert_array_equal(xc, np.matrix([[ 0, 1, 0],
+ [ 0, 0, 1],
+ [ 0, 0, 0]]))
+ assert xc.flags.c_contiguous
+ assert not xc.flags.f_contiguous
+
+ xf = numpy.matlib.eye(3, 4, dtype=int, order='F')
+ assert_array_equal(xf, np.matrix([[ 1, 0, 0, 0],
+ [ 0, 1, 0, 0],
+ [ 0, 0, 1, 0]]))
+ assert not xf.flags.c_contiguous
+ assert xf.flags.f_contiguous
+
+def test_rand():
+ x = numpy.matlib.rand(3)
+ # check matrix type, array would have shape (3,)
+ assert_(x.ndim == 2)
+
+def test_randn():
+ x = np.matlib.randn(3)
+ # check matrix type, array would have shape (3,)
+ assert_(x.ndim == 2)
+
+def test_repmat():
+ a1 = np.arange(4)
+ x = numpy.matlib.repmat(a1, 2, 2)
+ y = np.array([[0, 1, 2, 3, 0, 1, 2, 3],
+ [0, 1, 2, 3, 0, 1, 2, 3]])
+ assert_array_equal(x, y)
diff --git a/contrib/python/numpy/py2/numpy/tests/test_numpy_version.py b/contrib/python/numpy/py2/numpy/tests/test_numpy_version.py
new file mode 100644
index 00000000000..7fac8fd22ea
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_numpy_version.py
@@ -0,0 +1,19 @@
+from __future__ import division, absolute_import, print_function
+
+import re
+
+import numpy as np
+from numpy.testing import assert_
+
+
+def test_valid_numpy_version():
+ # Verify that the numpy version is a valid one (no .post suffix or other
+ # nonsense). See gh-6431 for an issue caused by an invalid version.
+ version_pattern = r"^[0-9]+\.[0-9]+\.[0-9]+(|a[0-9]|b[0-9]|rc[0-9])"
+ dev_suffix = r"(\.dev0\+([0-9a-f]{7}|Unknown))"
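+    # e.g. '1.16.6' or '1.17.0rc1' for releases, '1.17.0.dev0+1a2b3c4' (or
+    # '...dev0+Unknown') for development builds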
+ if np.version.release:
+ res = re.match(version_pattern, np.__version__)
+ else:
+ res = re.match(version_pattern + dev_suffix, np.__version__)
+
+ assert_(res is not None, np.__version__)
diff --git a/contrib/python/numpy/py2/numpy/tests/test_public_api.py b/contrib/python/numpy/py2/numpy/tests/test_public_api.py
new file mode 100644
index 00000000000..194f8ecbb8a
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_public_api.py
@@ -0,0 +1,89 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+import numpy as np
+import pytest
+try:
+ import ctypes
+except ImportError:
+ ctypes = None
+
+def check_dir(module, module_name=None):
+ """Returns a mapping of all objects with the wrong __module__ attribute."""
+ if module_name is None:
+ module_name = module.__name__
+ results = {}
+ for name in dir(module):
+ item = getattr(module, name)
+ if (hasattr(item, '__module__') and hasattr(item, '__name__')
+ and item.__module__ != module_name):
+ results[name] = item.__module__ + '.' + item.__name__
+ return results
+
+
[email protected](
+    sys.version_info[0] < 3,
+ reason="NumPy exposes slightly different functions on Python 2")
+def test_numpy_namespace():
+ # None of these objects are publicly documented.
+ undocumented = {
+ 'Tester': 'numpy.testing._private.nosetester.NoseTester',
+ '_add_newdoc_ufunc': 'numpy.core._multiarray_umath._add_newdoc_ufunc',
+ 'add_docstring': 'numpy.core._multiarray_umath.add_docstring',
+ 'add_newdoc': 'numpy.core.function_base.add_newdoc',
+ 'add_newdoc_ufunc': 'numpy.core._multiarray_umath._add_newdoc_ufunc',
+ 'byte_bounds': 'numpy.lib.utils.byte_bounds',
+ 'compare_chararrays': 'numpy.core._multiarray_umath.compare_chararrays',
+ 'deprecate': 'numpy.lib.utils.deprecate',
+ 'deprecate_with_doc': 'numpy.lib.utils.<lambda>',
+ 'disp': 'numpy.lib.function_base.disp',
+ 'fastCopyAndTranspose': 'numpy.core._multiarray_umath._fastCopyAndTranspose',
+ 'get_array_wrap': 'numpy.lib.shape_base.get_array_wrap',
+ 'get_include': 'numpy.lib.utils.get_include',
+ 'int_asbuffer': 'numpy.core._multiarray_umath.int_asbuffer',
+ 'mafromtxt': 'numpy.lib.npyio.mafromtxt',
+ 'maximum_sctype': 'numpy.core.numerictypes.maximum_sctype',
+ 'ndfromtxt': 'numpy.lib.npyio.ndfromtxt',
+ 'recfromcsv': 'numpy.lib.npyio.recfromcsv',
+ 'recfromtxt': 'numpy.lib.npyio.recfromtxt',
+ 'safe_eval': 'numpy.lib.utils.safe_eval',
+ 'set_string_function': 'numpy.core.arrayprint.set_string_function',
+ 'show_config': 'numpy.__config__.show',
+ 'who': 'numpy.lib.utils.who',
+ }
+ # These built-in types are re-exported by numpy.
+ builtins = {
+ 'bool': 'builtins.bool',
+ 'complex': 'builtins.complex',
+ 'float': 'builtins.float',
+ 'int': 'builtins.int',
+ 'long': 'builtins.int',
+ 'object': 'builtins.object',
+ 'str': 'builtins.str',
+ 'unicode': 'builtins.str',
+ }
+ whitelist = dict(undocumented, **builtins)
+ bad_results = check_dir(np)
+ # pytest gives better error messages with the builtin assert than with
+ # assert_equal
+ assert bad_results == whitelist
+
+
+def test_numpy_linalg():
+ bad_results = check_dir(np.linalg)
+ assert bad_results == {}
+
+
+def test_numpy_fft():
+ bad_results = check_dir(np.fft)
+ assert bad_results == {}
+
[email protected](ctypes is None,
+ reason="ctypes not available in this python")
+def test_NPY_NO_EXPORT():
+ cdll = ctypes.CDLL(np.core._multiarray_tests.__file__)
+ # Make sure an arbitrary NPY_NO_EXPORT function is actually hidden
+ f = getattr(cdll, 'test_not_exported', None)
+ assert f is None, ("'test_not_exported' is mistakenly exported, "
+ "NPY_NO_EXPORT does not work")
diff --git a/contrib/python/numpy/py2/numpy/tests/test_reloading.py b/contrib/python/numpy/py2/numpy/tests/test_reloading.py
new file mode 100644
index 00000000000..a073d691f71
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_reloading.py
@@ -0,0 +1,38 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+
+from numpy.testing import assert_raises, assert_, assert_equal
+from numpy.core.numeric import pickle
+
+if sys.version_info[:2] >= (3, 4):
+ from importlib import reload
+else:
+ from imp import reload
+
+def test_numpy_reloading():
+ # gh-7844. Also check that relevant globals retain their identity.
+ import numpy as np
+ import numpy._globals
+
+ _NoValue = np._NoValue
+ VisibleDeprecationWarning = np.VisibleDeprecationWarning
+ ModuleDeprecationWarning = np.ModuleDeprecationWarning
+
+ reload(np)
+ assert_(_NoValue is np._NoValue)
+ assert_(ModuleDeprecationWarning is np.ModuleDeprecationWarning)
+ assert_(VisibleDeprecationWarning is np.VisibleDeprecationWarning)
+
+ assert_raises(RuntimeError, reload, numpy._globals)
+ reload(np)
+ assert_(_NoValue is np._NoValue)
+ assert_(ModuleDeprecationWarning is np.ModuleDeprecationWarning)
+ assert_(VisibleDeprecationWarning is np.VisibleDeprecationWarning)
+
+def test_novalue():
+ import numpy as np
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+ assert_equal(repr(np._NoValue), '<no value>')
+ assert_(pickle.loads(pickle.dumps(np._NoValue,
+ protocol=proto)) is np._NoValue)
diff --git a/contrib/python/numpy/py2/numpy/tests/test_scripts.py b/contrib/python/numpy/py2/numpy/tests/test_scripts.py
new file mode 100644
index 00000000000..e42dc25f98e
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_scripts.py
@@ -0,0 +1,49 @@
+""" Test scripts
+
+Test that we can run executable scripts that have been installed with numpy.
+"""
+from __future__ import division, print_function, absolute_import
+
+import sys
+import os
+import pytest
+from os.path import join as pathjoin, isfile, dirname
+import subprocess
+
+import numpy as np
+from numpy.compat.py3k import basestring
+from numpy.testing import assert_, assert_equal
+
+is_inplace = isfile(pathjoin(dirname(np.__file__), '..', 'setup.py'))
+
+
+def find_f2py_commands():
+ if sys.platform == 'win32':
+ exe_dir = dirname(sys.executable)
+ if exe_dir.endswith('Scripts'): # virtualenv
+ return [os.path.join(exe_dir, 'f2py')]
+ else:
+ return [os.path.join(exe_dir, "Scripts", 'f2py')]
+ else:
+ # Three scripts are installed in Unix-like systems:
+ # 'f2py', 'f2py{major}', and 'f2py{major.minor}'. For example,
+ # if installed with python3.7 the scripts would be named
+ # 'f2py', 'f2py3', and 'f2py3.7'.
+ version = sys.version_info
+ major = str(version.major)
+ minor = str(version.minor)
+ return ['f2py', 'f2py' + major, 'f2py' + major + '.' + minor]
+
+
[email protected](is_inplace, reason="Cannot test f2py command inplace")
[email protected](reason="Test is unreliable")
[email protected]('f2py_cmd', find_f2py_commands())
+def test_f2py(f2py_cmd):
+ # test that we can run f2py script
+ stdout = subprocess.check_output([f2py_cmd, '-v'])
+ assert_equal(stdout.strip(), b'2')
+
+
+def test_pep338():
+ stdout = subprocess.check_output([sys.executable, '-mnumpy.f2py', '-v'])
+ assert_equal(stdout.strip(), b'2')
diff --git a/contrib/python/numpy/py2/numpy/tests/test_warnings.py b/contrib/python/numpy/py2/numpy/tests/test_warnings.py
new file mode 100644
index 00000000000..aa6f69f7eea
--- /dev/null
+++ b/contrib/python/numpy/py2/numpy/tests/test_warnings.py
@@ -0,0 +1,78 @@
+"""
+Tests which scan for certain occurrences in the code; they may not find
+all of these occurrences but should catch almost all.
+"""
+from __future__ import division, absolute_import, print_function
+
+import sys
+import pytest
+
+if sys.version_info >= (3, 4):
+ from pathlib import Path
+ import ast
+ import tokenize
+ import numpy
+
+ class ParseCall(ast.NodeVisitor):
+ def __init__(self):
+ self.ls = []
+
+ def visit_Attribute(self, node):
+ ast.NodeVisitor.generic_visit(self, node)
+ self.ls.append(node.attr)
+
+ def visit_Name(self, node):
+ self.ls.append(node.id)
+
+
+ class FindFuncs(ast.NodeVisitor):
+ def __init__(self, filename):
+ super().__init__()
+ self.__filename = filename
+
+ def visit_Call(self, node):
+ p = ParseCall()
+ p.visit(node.func)
+ ast.NodeVisitor.generic_visit(self, node)
+
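+            # For a call such as warnings.warn(...), ParseCall leaves
+            # p.ls == ['warnings', 'warn'], so the dotted name can be checked
+            # from the right.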
+ if p.ls[-1] == 'simplefilter' or p.ls[-1] == 'filterwarnings':
+ if node.args[0].s == "ignore":
+ raise AssertionError(
+ "ignore filter should not be used; found in "
+ "{} on line {}".format(self.__filename, node.lineno))
+
+ if p.ls[-1] == 'warn' and (
+ len(p.ls) == 1 or p.ls[-2] == 'warnings'):
+
+                if "testing/tests/test_warnings.py" == self.__filename:
+ # This file
+ return
+
+ # See if stacklevel exists:
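+                # (warnings.warn(message, category, stacklevel) -- a third
+                # positional argument is the stacklevel)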
+ if len(node.args) == 3:
+ return
+ args = {kw.arg for kw in node.keywords}
+ if "stacklevel" in args:
+ return
+ raise AssertionError(
+ "warnings should have an appropriate stacklevel; found in "
+ "{} on line {}".format(self.__filename, node.lineno))
+
+
+ @pytest.mark.slow
+ def test_warning_calls():
+ # combined "ignore" and stacklevel error
+ base = Path(numpy.__file__).parent
+
+ for path in base.rglob("*.py"):
+ if base / "testing" in path.parents:
+ continue
+ if path == base / "__init__.py":
+ continue
+ if path == base / "random" / "__init__.py":
+ continue
+ # use tokenize to auto-detect encoding on systems where no
+ # default encoding is defined (e.g. LANG='C')
+ with tokenize.open(str(path)) as file:
+ tree = ast.parse(file.read())
+ FindFuncs(path).visit(tree)